diff --git a/easybuild/easyconfigs/a/accelerate/accelerate-1.10.0-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/a/accelerate/accelerate-1.10.0-foss-2023b-CUDA-12.4.0.eb
new file mode 100644
index 00000000000..ca32d9c6984
--- /dev/null
+++ b/easybuild/easyconfigs/a/accelerate/accelerate-1.10.0-foss-2023b-CUDA-12.4.0.eb
@@ -0,0 +1,36 @@
+easyblock = 'PythonBundle'
+
+name = 'accelerate'
+version = '1.10.0'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://github.com/huggingface/accelerate'
+description = """A simple way to launch, train, and use PyTorch models on almost any device and
+distributed configuration, automatic mixed precision (including fp8),
+and easy-to-configure FSDP and DeepSpeed support."""
+
+toolchain = {'name': 'foss', 'version': '2023b'}
+
+dependencies = [
+    ('Python', '3.11.5'),
+    ('Python-bundle-PyPI', '2023.10'),
+    ('SciPy-bundle', '2023.11'),
+    ('CUDA', '12.4.0', '', SYSTEM),
+    ('PyTorch-bundle', '2.3.0', versionsuffix),
+    ('PyYAML', '6.0.1'),
+    ('Safetensors', '0.4.4'),
+]
+
+exts_list = [
+    ('huggingface-hub', '0.30.2', {
+        'sources': ['huggingface_hub-%(version)s.tar.gz'],
+        'checksums': ['9a7897c5b6fd9dad3168a794a8998d6378210f5b9688d0dfc180b1a228dc2466'],
+    }),
+    (name, version, {
+        'checksums': ['8270568fda9036b5cccdc09703fef47872abccd56eb5f6d53b54ea5fb7581496'],
+    }),
+]
+
+sanity_check_commands = ['accelerate test']
+
+moduleclass = 'ai'
diff --git a/easybuild/easyconfigs/c/CUTLASS/CUTLASS-4.1.0-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-4.1.0-foss-2023b-CUDA-12.4.0.eb
new file mode 100644
index 00000000000..ee1c2c569e9
--- /dev/null
+++ b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-4.1.0-foss-2023b-CUDA-12.4.0.eb
@@ -0,0 +1,58 @@
+easyblock = 'CMakeMake'
+
+name = 'CUTLASS'
+version = '4.1.0'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://github.com/NVIDIA/cutlass'
+description = """CUTLASS is a collection of CUDA C++ template
+abstractions for implementing high-performance matrix-matrix
+multiplication (GEMM) and related computations at all levels and scales
+within CUDA. It incorporates strategies for hierarchical decomposition
+and data movement similar to those used to implement cuBLAS and cuDNN.
+CUTLASS decomposes these "moving parts" into reusable, modular software
+components abstracted by C++ template classes. Primitives for different
+levels of a conceptual parallelization hierarchy can be specialized and
+tuned via custom tiling sizes, data types, and other algorithmic policy.
+The resulting flexibility simplifies their use as building blocks within
+custom kernels and applications."""
+
+toolchain = {'name': 'foss', 'version': '2023b'}
+
+github_account = 'NVIDIA'
+source_urls = [GITHUB_LOWER_SOURCE]
+sources = [{'download_filename': V_VERSION_TAR_GZ, 'filename': SOURCE_TAR_GZ}]
+patches = [
+    'CUTLASS-4.1.0_fix-version.patch',
+    'CUTLASS-4.1.0_add-arch-guards-to-tests.patch',
+]
+checksums = [
+    {'CUTLASS-4.1.0.tar.gz': '8d4675b11e9e5207e3940eaac0f46db934ada371cbb3627c9fda642d912b6230'},
+    {'CUTLASS-4.1.0_fix-version.patch': 'e2c7f66e6fd298b3af5339e17c0c75ded7d726cdf6cde003f60263e27ae46495'},
+    {'CUTLASS-4.1.0_add-arch-guards-to-tests.patch':
+     '81cd18d83bdedf3ed1f7add68bbff1635cf9d76bb9e184efbc62cd95caee4275'},
+]
+
+builddependencies = [
+    ('CMake', '3.27.6'),
+    ('Python', '3.11.5'),
+]
+
+dependencies = [
+    ('CUDA', '12.4.0', '', SYSTEM),
+    ('cuDNN', '9.1.1.17', versionsuffix, SYSTEM),
+]
+
+_copts = [
+    '-DCUTLASS_NVCC_ARCHS="%(cuda_cc_cmake)s"',
+    '-DCUTLASS_ENABLE_CUBLAS=1',
+    '-DCUTLASS_ENABLE_CUDNN=1',
+]
+configopts = ' '.join(_copts)
+
+sanity_check_paths = {
+    'files': ['include/cutlass/cutlass.h', 'lib/libcutlass.%s' % SHLIB_EXT],
+    'dirs': ['lib/cmake'],
+}
+
+moduleclass = 'lib'
diff --git a/easybuild/easyconfigs/c/CuPy/CuPy-13.6.0-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/c/CuPy/CuPy-13.6.0-foss-2023b-CUDA-12.4.0.eb
new file mode 100644
index 00000000000..fe488db1d3e
--- /dev/null
+++ b/easybuild/easyconfigs/c/CuPy/CuPy-13.6.0-foss-2023b-CUDA-12.4.0.eb
@@ -0,0 +1,77 @@
+easyblock = 'PythonBundle'
+
+name = 'CuPy'
+version = '13.6.0'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://cupy.dev'
+description = "CuPy is an open-source array library accelerated with NVIDIA CUDA."
+
+toolchain = {'name': 'foss', 'version': '2023b'}
+
+builddependencies = [
+    ('hypothesis', '6.90.0'),
+    ('Cython', '3.0.10'),
+    ('setuptools', '80.9.0'),
+]
+
+dependencies = [
+    ('Python', '3.11.5'),
+    ('SciPy-bundle', '2023.11'),
+    ('CUDA', '12.4.0', '', SYSTEM),
+    ('NCCL', '2.20.5', versionsuffix),
+    ('cuTENSOR', '2.0.2.5', versionsuffix, SYSTEM),
+    ('cuSPARSELt', '0.8.0.4', versionsuffix, SYSTEM),  # docs say 0.7.0 or 0.7.1
+]
+
+# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
+cuda_compute_capabilities = ['5.0', '6.0', '7.0', '7.5', '8.0', '8.6', '9.0']
+
+exts_default_options = {'source_urls': [PYPI_LOWER_SOURCE]}
+
+_skip_tests = [
+    '--ignore tests/example_tests',  # examples are not included
+    '--deselect tests/cupyx_tests/scipy_tests/signal_tests/test_ltisys.py::Test_bode::test_from_state_space',
+    '--deselect tests/cupyx_tests/scipy_tests/signal_tests/test_fir_filter_design.py::TestFirls::test_firls',
+    '--deselect tests/cupyx_tests/scipy_tests/signal_tests/test_ltisys.py::TestPlacePoles::test_real_2',
+    # New failures in 13.6.0, they all seem to be related to on-the-fly compilation failing
+    '--deselect tests/cupy_tests/core_tests/test_raw.py::TestRawPicklable',   # ::test_raw_picklable
+    '--deselect tests/cupy_tests/fft_tests/test_callback.py::Test1dCallbacks',
+]
+
+# For testing with new versions of CuPy, please enable the slow testing setting below,
+# but switch to the much lighter fast testing before submitting the .eb file, so users
+# can install on GPUs with moderate RAM.
+
+# _parallel_tests, _test_type = 4, 'not slow'
+_parallel_tests, _test_type = 1, 'fast'
+
+exts_list = [
+    ('fastrlock', '0.8.3', {
+        'checksums': ['4af6734d92eaa3ab4373e6c9a1dd0d5ad1304e172b1521733c6c3b3d73c8fa5d'],
+    }),
+    ('cupy', version, {
+        'patches': [
+            'cupy-13.0.0_cusparselt_0.6.0.patch',
+            'cupy-13.0.0_eb_ccc.patch',
+            'CuPy-13.6.0-Disable_TestRaw_with_nvcc_backend.patch',
+        ],
+        'preinstallopts': 'CUPY_NUM_BUILD_JOBS=%(parallel)s EB_CCC="%(cuda_cc_cmake)s" ',
+        'runtest': 'export CUPY_TEST_GPU_LIMIT=1 CUPY_CACHE_DIR="%%(builddir)s" && '
+                   'pytest -n %s tests -k "%s" ' % (_parallel_tests, _test_type) + ' '.join(_skip_tests),
+        'testinstall': True,
+        'checksums': [
+            {'cupy-13.6.0.tar.gz': '3cba30ae3dd32b5d5c6536e710cb98015227cd4ba83c46b3f1825a7ae55b6667'},
+            {'cupy-13.0.0_cusparselt_0.6.0.patch': '09cb12d26e78079c50b06f17002bf54c66e5e4743b917c5a218d3fe90124d499'},
+            {'cupy-13.0.0_eb_ccc.patch': 'bfe8b46344759f58491f55418bd9c856d6f72d681ee5fef12820009f808d2db1'},
+            {'CuPy-13.6.0-Disable_TestRaw_with_nvcc_backend.patch':
+             '958d80059b085017ed8c8de55ed82a0d52fdf964482e8ccc13d401515979d4b7'},
+        ],
+    }),
+]
+
+sanity_check_commands = [
+    "python -c 'import cupy'",
+]
+
+moduleclass = 'lib'
diff --git a/easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.8.0.4-CUDA-12.4.0.eb b/easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.8.0.4-CUDA-12.4.0.eb
new file mode 100644
index 00000000000..e59c12b3bf4
--- /dev/null
+++ b/easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.8.0.4-CUDA-12.4.0.eb
@@ -0,0 +1,35 @@
+easyblock = 'Tarball'
+
+name = 'cuSPARSELt'
+version = '0.8.0.4'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://docs.nvidia.com/cuda/cusparselt/index.html'
+description = """NVIDIA cuSPARSELt is a high-performance CUDA library dedicated to general matrix-matrix operations in
+which at least one operand is a sparse matrix"""
+
+toolchain = SYSTEM
+
+local_cudamajver = '12'
+
+local_arch = {'arm64': 'sbsa', 'aarch64': 'sbsa'}.get(ARCH, ARCH)
+source_urls = ['https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-%s/' % local_arch]
+sources = ['libcusparse_lt-linux-%s-%%(version)s_cuda%s-archive.tar.xz' %
+           (local_arch, local_cudamajver)]
+checksums = [{
+    'libcusparse_lt-linux-x86_64-%%(version)s_cuda%s-archive.tar.xz' % local_cudamajver:
+        '483954591766bade877becef126d53908d5fef5d7468b503736af37388669c08',
+    'libcusparse_lt-linux-sbsa-%%(version)s_cuda%s-archive.tar.xz' % local_cudamajver:
+        'b59e2f8ffd154b156b2d74ccd7cad7775385693bec8cb9562596060072c515f2',
+}]
+
+dependencies = [('CUDA', '12.4.0')]
+
+sanity_check_paths = {
+    'files': ['include/cusparseLt.h',
+              'lib/libcusparseLt.%s' % SHLIB_EXT,
+              'lib/libcusparseLt_static.a'],
+    'dirs': [],
+}
+
+moduleclass = 'lib'
diff --git a/easybuild/easyconfigs/c/cuTENSOR/cuTENSOR-2.0.2.5-CUDA-12.4.0.eb b/easybuild/easyconfigs/c/cuTENSOR/cuTENSOR-2.0.2.5-CUDA-12.4.0.eb
new file mode 100644
index 00000000000..dd4624817a7
--- /dev/null
+++ b/easybuild/easyconfigs/c/cuTENSOR/cuTENSOR-2.0.2.5-CUDA-12.4.0.eb
@@ -0,0 +1,40 @@
+easyblock = 'Tarball'
+
+name = 'cuTENSOR'
+version = '2.0.2.5'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://developer.nvidia.com/cutensor'
+description = """The cuTENSOR Library is a GPU-accelerated tensor linear algebra library providing tensor contraction,
+ reduction and elementwise operations."""
+
+toolchain = SYSTEM
+
+# NVIDIA's ARM redist archives are named 'sbsa' (see checksums), so map aarch64/arm64 machine names onto that
+local_arch = {'arm64': 'sbsa', 'aarch64': 'sbsa'}.get(ARCH, ARCH)
+source_urls = ['https://developer.download.nvidia.com/compute/cutensor/redist/libcutensor/linux-%s/' % local_arch]
+sources = ['libcutensor-linux-%s-%%(version)s-archive.tar.xz' % local_arch]
+
+checksums = [{
+    'libcutensor-linux-sbsa-%(version)s-archive.tar.xz':
+        '5163dd40f11f328e469a6d9b0056c8346f5d59ed538c18d6b954e4ae657c69cc',
+    'libcutensor-linux-x86_64-%(version)s-archive.tar.xz':
+        '0e957ae7b352f599de34b6fa1ba999b0617887f885d7436ac5737d71a6b83baa',
+}]
+
+local_cudamajver = '12'
+dependencies = [('CUDA', '12.4.0')]
+
+sanity_check_paths = {
+    'files': ['include/cutensor.h', 'include/cutensor/types.h',
+              'lib/%s/libcutensor.%s' % (local_cudamajver, SHLIB_EXT),
+              'lib/%s/libcutensor_static.a' % local_cudamajver],
+    'dirs': [],
+}
+
+modextrapaths = {
+    'LD_LIBRARY_PATH': ['lib/%s' % local_cudamajver],
+    'LIBRARY_PATH': ['lib/%s' % local_cudamajver],
+}
+
+moduleclass = 'lib'
diff --git a/easybuild/easyconfigs/d/DLPack/DLPack-1.2-GCCcore-13.2.0.eb b/easybuild/easyconfigs/d/DLPack/DLPack-1.2-GCCcore-13.2.0.eb
new file mode 100644
index 00000000000..afb81cf369b
--- /dev/null
+++ b/easybuild/easyconfigs/d/DLPack/DLPack-1.2-GCCcore-13.2.0.eb
@@ -0,0 +1,27 @@
+easyblock = 'CMakeMake'
+
+name = 'DLPack'
+version = '1.2'
+
+homepage = 'https://dmlc.github.io/dlpack/latest/'
+description = """DLPack is a stable in-memory data structure for an ndarray
+system to interact with a variety of frameworks."""
+
+toolchain = {'name': 'GCCcore', 'version': '13.2.0'}
+
+github_account = 'dmlc'
+source_urls = [GITHUB_LOWER_SOURCE]
+sources = ['v%(version)s.tar.gz']
+checksums = ['58284a3b004a48450c958a23b30274527ebaf35a061124bbd4193fffa45efbd6']
+
+builddependencies = [
+    ('binutils', '2.40'),
+    ('CMake', '3.27.6'),
+]
+
+sanity_check_paths = {
+    'files': ['include/dlpack/dlpack.h', 'lib/cmake/dlpack/dlpackConfig.cmake'],
+    'dirs': [],
+}
+
+moduleclass = 'lib'
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
new file mode 100644
index 00000000000..c4869f07bd0
--- /dev/null
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
@@ -0,0 +1,133 @@
+easyblock = 'PythonBundle'
+
+name = 'DeepSpeed'
+version = '0.14.5'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'http://www.deepspeed.ai/'
+description = """
+DeepSpeed is a deep learning optimization library that makes distributed training easy, efficient, and effective.
+"""
+
+toolchain = {'name': 'foss', 'version': '2023b'}
+
+builddependencies = [
+    ('Ninja', '1.11.1'),
+    ('Transformers', '4.44.0'),
+]
+local_pytorch_version = '2.3.0'
+dependencies = [
+    ('Python', '3.11.5'),
+    ('CUDA', '12.4.0', '', SYSTEM),
+    ('NCCL', '2.20.5', '-CUDA-%(cudaver)s'),
+    ('CuPy', '13.6.0', '-CUDA-%(cudaver)s'),
+    ('Triton', '2.3.1', '-CUDA-%(cudaver)s'),
+    ('accelerate', '1.10.0', '-CUDA-%(cudaver)s'),
+    ('PyTorch', local_pytorch_version, '-CUDA-%(cudaver)s'),
+    ('PyTorch-bundle', local_pytorch_version, '-CUDA-%(cudaver)s'),
+    ('mpi4py', '3.1.5'),
+    ('DLPack', '1.2'),
+    ('py-cpuinfo', '9.0.0'),
+    ('pydantic', '2.7.4'),
+    ('tqdm', '4.66.2'),
+    ('pdsh', '2.36'),
+    ('Seaborn', '0.13.2'),  # dependency for mup
+    ('libaio', '0.3.113'),  # for async_io (builddep only?)
+]
+
+local_excluded_ds_tests = (
+    'TestTensorBoard',
+    'TestWandb',
+    'TestCometMonitor',
+    'TestQuantizedInt',  # Downloads model from internet
+    'test_fp_quant[256-qbits8-bf16]',  # Error of 0.00909423828125 > 0.004
+    'test_DS4Sci_EvoformerAttention[tensor_shape1-dtype1]',  # Error of 0.05859375 > 0.05
+)
+
+components = [
+    ('CUTLASS', '4.1.0', {
+        'easyblock': 'Tarball',
+        'source_urls': ['https://github.com/NVIDIA/cutlass/archive/refs/tags'],
+        'sources': [{'download_filename': V_VERSION_TAR_GZ, 'filename': SOURCE_TAR_GZ}],
+        'checksums': ['8d4675b11e9e5207e3940eaac0f46db934ada371cbb3627c9fda642d912b6230'],
+        'start_dir': '%(namelower)s-%(version)s',
+        'target_dir': 'extra/cutlass',
+    }),
+]
+
+local_cutlass_path = '%(installdir)s/extra/cutlass'
+local_cutlass_opt = f"export CUTLASS_PATH='{local_cutlass_path}' && "
+
+github_account = 'microsoft'
+exts_list = [
+    ('hjson', '3.1.0', {
+        'checksums': ['55af475a27cf83a7969c808399d7bccdec8fb836a07ddbd574587593b9cdcf75'],
+    }),
+    ('nvidia-ml-py', '12.535.161', {
+        'checksums': ['2bcc31ff7a0ea291ed8d7fc39b149391a42c2fb1cb4256c935e692de488b4d17'],
+        'modulename': 'pynvml',
+    }),
+    ('mup', '1.0.0', {
+        'checksums': ['9639e3d19f90e754f985ed444542ed2f8a049f3c0488fcb6efe150f30922cf74'],
+    }),
+    ('qtorch', '0.3.0', {
+        'checksums': ['3fc2e9b27d58d18304ac46511ea03a3eb20f852944f6a5b6ef71b974c2da20bf'],
+        'preinstallopts': "TORCH_CUDA_ARCH_LIST='%(cuda_cc_semicolon_sep)s' ",
+    }),
+    ('DeepSpeed', '0.14.5', {
+        'source_urls': [GITHUB_SOURCE],
+        # Test suite not available on pypi
+        'sources': [{'download_filename': V_VERSION_TAR_GZ, 'filename': SOURCE_TAR_GZ}],
+        'patches': [
+            'DeepSpeed-0.14.5_avoid-access-to-home.patch',
+            'DeepSpeed-0.14.5_fix-test-parameterize.patch',
+            'DeepSpeed-0.14.5_no-ninja-dep.patch',
+            'DeepSpeed-0.14.5_pdsh-env-vars.patch',
+            'DeepSpeed-0.14.5_pic-compile.patch',
+            'DeepSpeed-0.14.5_test-nvme-offload.patch',
+        ],
+        'checksums': [
+            {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'},
+            {'DeepSpeed-0.14.5_avoid-access-to-home.patch':
+             'edb39720a27b74170c87c8c51ecb8be6fd6fe2fa346f2a10b343a73884c5c412'},
+            {'DeepSpeed-0.14.5_fix-test-parameterize.patch':
+             '1df9c7ceeca0b37aff85390b7bd25e266ddf88c5b1380980e6c13a064840d1d8'},
+            {'DeepSpeed-0.14.5_no-ninja-dep.patch':
+             'e974a928b03a180da4e67da2f347c25968cb41f9c6037a9796ab776a9a4b0547'},
+            {'DeepSpeed-0.14.5_pdsh-env-vars.patch':
+             '02556620ac643d273a2fa9c019d437cd874a6c19759fa59baaa0e9a41d0a5240'},
+            {'DeepSpeed-0.14.5_pic-compile.patch': '1b9c070b77cf24351bff29bab7d23baacde31c7ea211a4bc75732ac38a99d6b0'},
+            {'DeepSpeed-0.14.5_test-nvme-offload.patch':
+             '1592097867c5d4594a434cca727df134fcaa0e3ea8c595eb5951856a501cf422'},
+        ],
+        'jit_only_ops': [
+            'SPARSE_ATTN', 'FP_QUANTIZER', 'CUTLASS_OPS', 'RAGGED_DEVICE_OPS',
+            # Cannot be prebuilt in several scenarios,
+            # e.g. multiple GPU archs (cuda-compute-capabilities), no GPU present:
+            # See https://github.com/deepspeedai/DeepSpeed/pull/7760
+            'EVOFORMER_ATTN',
+        ],
+        'preinstallopts': ' && '.join((
+            # Use this version and no suffix
+            'echo "%(version)s" > version.txt',
+            'echo "" > build.txt',
+            # Disable use of git during build
+            'sed -i "s/command_exists(\'git\')/False/" setup.py',
+            local_cutlass_opt,
+        )),
+        'pretestopts': local_cutlass_opt,
+        'testinstall': True,
+        'runtest': ' && '.join((
+            'ln -s $PWD/tests/ ../tests',
+            'cd ..',
+            'export DS_UNITTEST_TIMEOUT=1200',
+            f"pytest tests/unit/ -k 'not {' and not '.join(local_excluded_ds_tests)}' --durations=0",
+        )),
+    }),
+]
+
+modextravars = {
+    'CUTLASS_PATH': local_cutlass_path,
+}
+
+moduleclass = 'ai'
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch
new file mode 100644
index 00000000000..35704b40e88
--- /dev/null
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch
@@ -0,0 +1,60 @@
+From 9d17116fcdb44b81eb00d3bce91431dc35cd69b1 Mon Sep 17 00:00:00 2001
+From: "Joshua C. Randall" <jcrandall@alum.mit.edu>
+Date: Wed, 4 Sep 2024 19:22:07 +0100
+Subject: [PATCH] print warning if actual triton cache dir is on NFS, not just
+ for default (#6487)
+
+move the logic that prints a warning when triton cache dir is on NFS to
+act on the actual calculated cache_dir rather than on the default.
+
+this means that:
+- when the default directory (in the user's home directory) is on NFS
+but `TRITON_CACHE_DIR` is set to a non-NFS directory, no warning will be
+printed whereas prior to this change a spurious and confusing warning
+was printed
+- when the user's home directory is not on NFS but `TRITON_CACHE_DIR` is
+set to an NFS directory, a warning will be printed whereas prior to this
+change no warning would be printed
+
+fixes #6486
+
+Adapted to use $TRITON_HOME by Alexander Grund (TU Dresden)
+
+diff --git a/deepspeed/ops/transformer/inference/triton/matmul_ext.py b/deepspeed/ops/transformer/inference/triton/matmul_ext.py
+index c77d8a8e11c0..412c8740a216 100644
+--- a/deepspeed/ops/transformer/inference/triton/matmul_ext.py
++++ b/deepspeed/ops/transformer/inference/triton/matmul_ext.py
+@@ -40,13 +40,18 @@ class TritonCacheDir:
+     _warning_printed = False
+ 
+     @staticmethod
+-    def default_cache_dir():
+-        tmp_path = os.path.join(Path.home(), ".triton", "autotune")
+-        if is_nfs_path(tmp_path) and not TritonCacheDir._warning_printed:
++    def warn_if_nfs(cache_dir):
++        if is_nfs_path(cache_dir) and not TritonCacheDir._warning_printed:
+             print(
+-                f"Warning: The default cache directory for DeepSpeed Triton autotune, {tmp_path}, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path."
++                f"Warning: The cache directory for DeepSpeed Triton autotune, {cache_dir}, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path."
+             )
+             TritonCacheDir._warning_printed = True
++        return
++
++    @staticmethod
++    def default_cache_dir():
++        tt_home = os.environ.get('TRITON_HOME') or os.path.join(Path.home(), ".triton")
++        tmp_path = os.path.join(tt_home, "autotune")
+         return tmp_path
+ 
+ 
+@@ -80,9 +84,9 @@ def __init__(self, key):
+         self.lock_path = None
+         # if caching is enabled, get the lock and bin path
+         self.cache_dir = os.environ.get('TRITON_CACHE_DIR', TritonCacheDir.default_cache_dir())
++        TritonCacheDir.warn_if_nfs(self.cache_dir)
+         if self.cache_dir:
+             os.makedirs(self.cache_dir, exist_ok=True)
+-        if self.cache_dir:
+             self.file_path = os.path.join(self.cache_dir, self.key + ".pickle")
+             self.lock_path = self.file_path + ".lock"
+ 
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_fix-test-parameterize.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_fix-test-parameterize.patch
new file mode 100644
index 00000000000..8776d6e3d61
--- /dev/null
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_fix-test-parameterize.patch
@@ -0,0 +1,82 @@
+Avoid this failure during tests:
+>  assert not 'unit/runtime/half_precision/onebit/test_onebit.py::TestOneBitLambFP16Pipeline::test[topo_config0]'
+
+Reason is that pytest-xdist doesn't seem to work well with having a dict in `pytest.mark.parameterize`
+See https://github.com/pytest-dev/pytest-xdist/issues/922
+
+It is not required at all here
+diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py
+--- a/tests/unit/runtime/half_precision/onebit/test_onebit.py
++++ b/tests/unit/runtime/half_precision/onebit/test_onebit.py
+@@ -37,6 +37,8 @@ if get_accelerator().device_name() == 'hpu':
+     pytest.skip("1-bit compression is not supported by HPU.", allow_module_level=True)
+ 
+ 
++topo_config = {"num_pp": 2, "num_dp": 2}
++
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"])
+ class TestOneBitAdamBasic(DistributedTest):
+     world_size = 2
+@@ -342,19 +344,10 @@ class TestOneBitAdamCheckpointing(DistributedTest):
+             model.save_checkpoint(save_folder, tag=None)
+ 
+ 
+-@pytest.mark.parametrize(
+-    "topo_config",
+-    [
+-        {
+-            "num_pp": 2,
+-            "num_dp": 2
+-        },
+-    ],
+-)
+ class TestOneBitAdamFP16Pipeline(DistributedTest):
+     world_size = 4
+ 
+-    def test(self, topo_config):
++    def test(self):
+         if not get_accelerator().is_fp16_supported():
+             pytest.skip("fp16 is not supported")
+         config_dict = {
+@@ -709,19 +702,10 @@ class TestZeroOneAdamCheckpointing(DistributedTest):
+             model.save_checkpoint(save_folder, tag=None)
+ 
+ 
+-@pytest.mark.parametrize(
+-    "topo_config",
+-    [
+-        {
+-            "num_pp": 2,
+-            "num_dp": 2
+-        },
+-    ],
+-)
+ class TestZeroOneAdamFP16Pipeline(DistributedTest):
+     world_size = 4
+ 
+-    def test(self, topo_config):
++    def test(self):
+         if not get_accelerator().is_fp16_supported():
+             pytest.skip("fp16 is not supported")
+         config_dict = {
+@@ -1105,19 +1089,10 @@ class TestOneBitLambCheckpointing(DistributedTest):
+             model.save_checkpoint(save_folder, tag=None)
+ 
+ 
+-@pytest.mark.parametrize(
+-    "topo_config",
+-    [
+-        {
+-            "num_pp": 2,
+-            "num_dp": 2
+-        },
+-    ],
+-)
+ class TestOneBitLambFP16Pipeline(DistributedTest):
+     world_size = 4
+ 
+-    def test(self, topo_config):
++    def test(self):
+         if not get_accelerator().is_fp16_supported():
+             pytest.skip("fp16 is not supported")
+         config_dict = {
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_no-ninja-dep.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_no-ninja-dep.patch
new file mode 100644
index 00000000000..d4e2a0d6699
--- /dev/null
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_no-ninja-dep.patch
@@ -0,0 +1,67 @@
+Patch away dependency on ninja python package by checking returncode of `ninja --version`.
+
+Author: Viktor Rehnberg (Chalmers University of Technology)
+Adapted by Alexander Grund (TU Dresden)
+
+diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py
+--- a/deepspeed/env_report.py
++++ b/deepspeed/env_report.py
+@@ -59,11 +59,7 @@ def op_report(verbose=True):
+ 
+ 
+ def ninja_installed():
+-    try:
+-        import ninja  # noqa: F401 # type: ignore
+-    except ImportError:
+-        return False
+-    return True
++    return subprocess.run(["ninja", "--version"], check=False).returncode == 0
+ 
+ 
+ def nvcc_version():
+diff --git a/op_builder/builder.py b/op_builder/builder.py
+--- a/op_builder/builder.py
++++ b/op_builder/builder.py
+@@ -533,9 +533,7 @@ class OpBuilder(ABC):
+             raise RuntimeError(
+                 f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. {self.error_log}"
+             )
+-        try:
+-            import ninja  # noqa: F401 # type: ignore
+-        except ImportError:
++        if subprocess.run(["ninja", "--version"], check=False).returncode != 0:
+             raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
+ 
+         if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch():
+diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py
+--- a/op_builder/xpu/builder.py
++++ b/op_builder/xpu/builder.py
+@@ -6,6 +6,7 @@
+ import os
+ import time
+ import importlib
++import subprocess
+ 
+ try:
+     # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
+@@ -86,9 +87,7 @@ class SYCLOpBuilder(OpBuilder):
+             raise RuntimeError(
+                 f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. {self.error_log}"
+             )
+-        try:
+-            import ninja  # noqa: F401
+-        except ImportError:
++        if subprocess.run(["ninja", "--version"], check=False).returncode != 0:
+             raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
+ 
+         self.jit_mode = True
+diff --git a/requirements/requirements.txt b/requirements/requirements.txt
+index 80c9f9b3..eed77fa3 100755
+--- a/requirements/requirements.txt
++++ b/requirements/requirements.txt
+@@ -1,5 +1,4 @@
+ hjson
+-ninja
+ numpy
+ nvidia-ml-py
+ packaging>=20.0
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch
new file mode 100644
index 00000000000..be3e25a5a71
--- /dev/null
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch
@@ -0,0 +1,28 @@
+Add software relevant environment variables
+
+The multinode runner launches processes with pdsh, if LD_LIBRARY_PATH is
+not included in these exports then the python .so file may not be found.
+Also including what seemed important and was added from loading DeepSpeed.
+(Couldn't add everything, then argument list becomes too long).
+
+See https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2373540098
+
+Note: Those are prefixes of variables to be included.
+
+Author: Viktor Rehnberg (Chalmers University of Technology)
+
+diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py
+--- a/deepspeed/launcher/runner.py
++++ b/deepspeed/launcher/runner.py
+@@ -32,6 +32,11 @@ from deepspeed.accelerator import get_accelerator
+ 
+ DLTS_HOSTFILE = "/job/hostfile"
+ EXPORT_ENVS = ['MLFLOW', 'PYTHON', 'MV2', 'UCX']
++EXPORT_ENVS += [ # Extra based on what's added by module load DeepSpeed
++    'LD_LIBRARY_PATH', 'PATH', 'EB', 'TRITON', 'CUDA',  # important
++    'ACLOCAL', 'CMAKE', 'CPATH', 'LIBRARY_PATH', 'MPL', 'NCCL',
++    'PKG_CONFIG_PATH', 'XDG_DATA_DIRS',
++]
+ EXPORT_ENVS += NEBULA_EXPORT_ENVS
+ DEEPSPEED_ENVIRONMENT_NAME = os.getenv("DS_ENV_FILE", ".deepspeed_env")
+ DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.']
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch
new file mode 100644
index 00000000000..707bc826e88
--- /dev/null
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch
@@ -0,0 +1,141 @@
+From 90afd671dadf9fd6a7a221428f2c04c16d637494 Mon Sep 17 00:00:00 2001
+From: Viktor Rehnberg <viktor.rehnberg@gmail.com>
+Date: Thu, 23 May 2024 07:09:53 +0000
+Subject: [PATCH] Compile with PIC
+
+---
+ op_builder/builder.py     | 15 ++++++++++-----
+ op_builder/cpu/builder.py |  3 ++-
+ op_builder/fused_adam.py  |  4 +++-
+ op_builder/fused_lamb.py  |  4 +++-
+ op_builder/fused_lion.py  |  4 +++-
+ op_builder/xpu/builder.py |  3 ++-
+ 6 files changed, 23 insertions(+), 10 deletions(-)
+
+diff --git a/op_builder/builder.py b/op_builder/builder.py
+index ec7566aa..f08e1799 100644
+--- a/op_builder/builder.py
++++ b/op_builder/builder.py
+@@ -288,13 +288,13 @@ class OpBuilder(ABC):
+         '''
+         Returns optional list of compiler flags to forward to nvcc when building CUDA sources
+         '''
+-        return []
++        return ['-Xcompiler', '-fPIC']
+ 
+     def cxx_args(self):
+         '''
+         Returns optional list of compiler flags to forward to the build
+         '''
+-        return []
++        return ['-fPIC']
+ 
+     def is_compatible(self, verbose=True):
+         '''
+@@ -746,15 +746,18 @@ class CUDAOpBuilder(OpBuilder):
+             )
+ 
+     def cxx_args(self):
++        args = super().cxx_args()
+         if sys.platform == "win32":
+-            return ['-O2']
++            args += ['-O2']
+         else:
+-            return ['-O3', '-std=c++17', '-g', '-Wno-reorder']
++            args += ['-O3', '-std=c++17', '-g', '-Wno-reorder']
++        return args
+ 
+     def nvcc_args(self):
+         if self.build_for_cpu:
+             return []
+-        args = ['-O3']
++        args = super().nvcc_args()
++        args += ['-O3']
+         if self.is_rocm_pytorch():
+             ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
+             args += [
+@@ -835,6 +838,8 @@ class TorchCPUOpBuilder(CUDAOpBuilder):
+                 '-lcublas',
+                 '-g',
+             ]
++        else:
++            args += super(CUDAOpBuilder, self).cxx_args()
+ 
+         CPU_ARCH = self.cpu_arch()
+         SIMD_WIDTH = self.simd_width()
+diff --git a/op_builder/cpu/builder.py b/op_builder/cpu/builder.py
+index d881842a..dfc5a31d 100644
+--- a/op_builder/cpu/builder.py
++++ b/op_builder/cpu/builder.py
+@@ -30,7 +30,8 @@ class CPUOpBuilder(OpBuilder):
+         return cpp_ext
+ 
+     def cxx_args(self):
+-        args = ['-O3', '-g', '-Wno-reorder']
++        args = super().cxx_args()
++        args += ['-O3', '-g', '-Wno-reorder']
+         CPU_ARCH = self.cpu_arch()
+         SIMD_WIDTH = self.simd_width()
+         args += [CPU_ARCH, '-fopenmp', SIMD_WIDTH]
+diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py
+index ac6e4eea..0c723572 100644
+--- a/op_builder/fused_adam.py
++++ b/op_builder/fused_adam.py
+@@ -29,7 +29,9 @@ class FusedAdamBuilder(CUDAOpBuilder):
+         return args + self.version_dependent_macros()
+ 
+     def nvcc_args(self):
+-        nvcc_flags = ['-O3'] + self.version_dependent_macros()
++        nvcc_flags = super(CUDAOpBuilder, self).nvcc_args()
++        nvcc_flags += ['-O3']
++        nvcc_flags += self.version_dependent_macros()
+         if not self.is_rocm_pytorch():
+             nvcc_flags.extend(
+                 ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] +
+diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py
+index f0cb5577..a59b97d4 100644
+--- a/op_builder/fused_lamb.py
++++ b/op_builder/fused_lamb.py
+@@ -29,7 +29,9 @@ class FusedLambBuilder(CUDAOpBuilder):
+         return args + self.version_dependent_macros()
+ 
+     def nvcc_args(self):
+-        nvcc_flags = ['-O3'] + self.version_dependent_macros()
++        nvcc_flags = super(CUDAOpBuilder, self).nvcc_args()
++        nvcc_flags += ['-O3']
++        nvcc_flags += self.version_dependent_macros()
+         if self.is_rocm_pytorch():
+             ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
+             nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR]
+diff --git a/op_builder/fused_lion.py b/op_builder/fused_lion.py
+index b900a8f2..119232b5 100644
+--- a/op_builder/fused_lion.py
++++ b/op_builder/fused_lion.py
+@@ -29,7 +29,9 @@ class FusedLionBuilder(CUDAOpBuilder):
+         return args + self.version_dependent_macros()
+ 
+     def nvcc_args(self):
+-        nvcc_flags = ['-O3'] + self.version_dependent_macros()
++        nvcc_flags = super(CUDAOpBuilder, self).nvcc_args()
++        nvcc_flags += ['-O3']
++        nvcc_flags += self.version_dependent_macros()
+         if not self.is_rocm_pytorch():
+             nvcc_flags.extend(
+                 ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] +
+diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py
+index f430b7b6..5a1a2219 100644
+--- a/op_builder/xpu/builder.py
++++ b/op_builder/xpu/builder.py
+@@ -52,7 +52,8 @@ class SYCLOpBuilder(OpBuilder):
+         return version_ge_1_1 + version_ge_1_3 + version_ge_1_5
+ 
+     def cxx_args(self):
+-        cxx_flags = [
++        cxx_flags = super().cxx_args()
++        cxx_flags += [
+             '-fsycl', '-fsycl-targets=spir64_gen', '-g', '-gdwarf-4', '-O3', '-std=c++17', '-fPIC', '-DMKL_ILP64',
+             '-fno-strict-aliasing'
+         ]
+-- 
+2.39.3
+
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch
new file mode 100644
index 00000000000..dcff709f2ce
--- /dev/null
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch
@@ -0,0 +1,135 @@
+From ddbf7ab23ce2e83747ff6a1482ac512e06da82ca Mon Sep 17 00:00:00 2001
+From: Viktor Rehnberg <viktor.rehnberg@gmail.com>
+Date: Mon, 4 Nov 2024 15:31:55 +0100
+Subject: [PATCH] Fix quantization tests
+
+NVMe tests didn't always run because the hard-coded nvme_path wasn't
+always writable. This patch switches them to the tmp_path fixture instead.
+Note: the hunks below only change nvme_path; no distributed test is disabled.
+---
+ .../quantization/test_intX_quantization.py    | 41 ++++++++++---------
+ 1 file changed, 21 insertions(+), 20 deletions(-)
+
+diff --git a/tests/unit/inference/quantization/test_intX_quantization.py b/tests/unit/inference/quantization/test_intX_quantization.py
+index 77b51fcd..9e0d7ac0 100644
+--- a/tests/unit/inference/quantization/test_intX_quantization.py
++++ b/tests/unit/inference/quantization/test_intX_quantization.py
+@@ -17,6 +17,7 @@ from transformers import AutoConfig, OPTConfig, AutoModel
+ import pytest
+ from collections import OrderedDict
+ from typing import Dict
++from pathlib import Path
+ 
+ device = get_accelerator().device_name() if get_accelerator().is_available() else 'cpu'
+ 
+@@ -53,11 +54,11 @@ def quantization_test_helper(pre_quant_type: torch.dtype, num_bits: int):
+     assert mean_diff < 0.15 and max_diff < 0.5, f'Numeric error exceed threshold, mean diff {mean_diff} (threshold 0.15), max diff {max_diff} (threshold 0.5)'
+ 
+ 
+-def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int):
++def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path):
+     import deepspeed
+     from transformers.integrations.deepspeed import HfDeepSpeedConfig
+ 
+-    def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int) -> Dict:
++    def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path) -> Dict:
+         GB = 1 << 30
+ 
+         ds_config = {
+@@ -127,7 +128,7 @@ def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bo
+             ds_config["zero_optimization"]["offload_param"] = dict(
+                 device="nvme",
+                 pin_memory=True,
+-                nvme_path='~/tmp_offload_dir',
++                nvme_path=str(tmp_path / "tmp_offload_dir"),
+                 buffer_count=5,
+                 buffer_size=1 * GB,
+             )
+@@ -142,7 +143,7 @@ def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bo
+         return ds_config
+ 
+     hf_config = AutoConfig.from_pretrained('facebook/opt-125m')
+-    ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits)
++    ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits, tmp_path=tmp_path)
+ 
+     input_ids = torch.ones(1, 16, dtype=torch.int32, device=device)
+     attention_mask = torch.ones(1, 16, dtype=torch.float32, device=device)
+@@ -170,11 +171,11 @@ def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bo
+     assert mean_diff < 0.4, f'Numeric error exceed threshold, relative error {mean_diff} (threshold 0.4)'
+ 
+ 
+-def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int):
++def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path):
+     import deepspeed
+     from transformers.integrations.deepspeed import HfDeepSpeedConfig
+ 
+-    def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int) -> Dict:
++    def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path) -> Dict:
+         GB = 1 << 30
+ 
+         ds_config = {
+@@ -206,7 +207,7 @@ def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload:
+             ds_config["zero_optimization"]["offload_param"] = dict(
+                 device="nvme",
+                 pin_memory=True,
+-                nvme_path='~/tmp_offload_dir',
++                nvme_path=str(tmp_path / "tmp_offload_dir"),
+                 buffer_count=5,
+                 buffer_size=1 * GB,
+             )
+@@ -221,7 +222,7 @@ def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload:
+         return ds_config
+ 
+     hf_config = AutoConfig.from_pretrained('facebook/opt-125m')
+-    ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits)
++    ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits, tmp_path=tmp_path)
+ 
+     input_ids = torch.ones(1, 16, dtype=torch.int32, device=device)
+     attention_mask = torch.ones(1, 16, dtype=torch.float32, device=device)
+@@ -376,31 +377,31 @@ class TestQuantizedInt(DistributedTest):
+         quantization_test_helper(torch.float16, 8)
+ 
+     @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM')
+-    def test_zero3_int4_post_init_quant(self, quantization_bits):
++    def test_zero3_int4_post_init_quant(self, quantization_bits, tmp_path):
+         reset_random()
+-        zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits)
++        zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path)
+ 
+     @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM')
+-    def test_zero3_int4_post_init_quant_cpu_offload(self, quantization_bits):
++    def test_zero3_int4_post_init_quant_cpu_offload(self, quantization_bits, tmp_path):
+         reset_random()
+-        zero3_post_init_quantization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits)
++        zero3_post_init_quantization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path)
+ 
+     @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM')
+-    def test_zero3_int4_post_init_quant_nvme_offload(self):
++    def test_zero3_int4_post_init_quant_nvme_offload(self, tmp_path):
+         reset_random()
+-        zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=True, bits=4)
++        zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=True, bits=4, tmp_path=tmp_path)
+ 
+     @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM')
+-    def test_zero3_int4_quantized_initialization(self, quantization_bits):
++    def test_zero3_int4_quantized_initialization(self, quantization_bits, tmp_path):
+         reset_random()
+-        zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits)
++        zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path)
+ 
+     @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM')
+-    def test_zero3_int4_quantized_initialization_cpu_offload(self, quantization_bits):
++    def test_zero3_int4_quantized_initialization_cpu_offload(self, quantization_bits, tmp_path):
+         reset_random()
+-        zero3_quantized_initialization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits)
++        zero3_quantized_initialization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path)
+ 
+     @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM')
+-    def test_zero3_int4_quantized_initialization_nvme_offload(self):
++    def test_zero3_int4_quantized_initialization_nvme_offload(self, tmp_path):
+         reset_random()
+-        zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=True, bits=4)
++        zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=True, bits=4, tmp_path=tmp_path)
+-- 
+2.39.3
+
diff --git a/easybuild/easyconfigs/p/pytest-forked/pytest-forked-1.6.0-GCCcore-13.2.0.eb b/easybuild/easyconfigs/p/pytest-forked/pytest-forked-1.6.0-GCCcore-13.2.0.eb
new file mode 100644
index 00000000000..56cbb558a23
--- /dev/null
+++ b/easybuild/easyconfigs/p/pytest-forked/pytest-forked-1.6.0-GCCcore-13.2.0.eb
@@ -0,0 +1,22 @@
+easyblock = 'PythonPackage'
+
+name = 'pytest-forked'
+version = '1.6.0'
+
+homepage = 'https://github.com/pytest-dev/pytest-forked'
+description = "Run tests in isolated forked subprocesses."
+
+toolchain = {'name': 'GCCcore', 'version': '13.2.0'}
+
+sources = [SOURCE_TAR_GZ]  # EasyBuild template constant: '%(name)s-%(version)s.tar.gz'
+checksums = ['4dafd46a9a600f65d822b8f605133ecf5b3e1941ebb3588e943b4e3eb71a5a3f']  # SHA256
+
+builddependencies = [
+    ('binutils', '2.40'),  # needed at build time only
+]
+dependencies = [
+    ('Python', '3.11.5'),
+    ('Python-bundle-PyPI', '2023.10'),  # presumably supplies pytest/py for pytest-forked - TODO confirm
+]
+
+moduleclass = 'tools'