From ded25fef7815a68b0965bcf21c40f1bc1d715ee1 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Thu, 31 Jul 2025 18:19:40 +0200
Subject: [PATCH 1/5] adding easyconfigs:
 PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb and patches:
 PyTorch-1.7.0_disable-dev-shm-test.patch,
 PyTorch-1.12.1_add-hypothesis-suppression.patch,
 PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch,
 PyTorch-1.12.1_fix-TestTorch.test_to.patch,
 PyTorch-1.12.1_skip-test_round_robin.patch,
 PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch,
 PyTorch-1.13.1_fix-protobuf-dependency.patch,
 PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch,
 PyTorch-1.13.1_skip-failing-singular-grad-test.patch,
 PyTorch-1.13.1_skip-tests-without-fbgemm.patch,
 PyTorch-2.0.1_avoid-test_quantization-failures.patch,
 PyTorch-2.0.1_fix-skip-decorators.patch, PyTorch-2.0.1_fix-vsx-loadu.patch,
 PyTorch-2.0.1_skip-failing-gradtest.patch,
 PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch,
 PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch,
 PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch,
 PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch,
 PyTorch-2.1.0_remove-test-requiring-online-access.patch,
 PyTorch-2.1.0_skip-diff-test-on-ppc.patch,
 PyTorch-2.1.0_skip-dynamo-test_predispatch.patch,
 PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch,
 PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch,
 PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch,
 PyTorch-2.3.0_disable_test_linear_package_if_no_half_types_are_available.patch,
 PyTorch-2.3.0_disable_DataType_dependent_test_if_tensorboard_is_not_available.patch,
 PyTorch-2.3.0_fix-cpuinfo-bug-with-smt.patch,
 PyTorch-2.3.0_increase-tolerance-test_jit-test_freeze_conv_relu_fusion.patch,
 PyTorch-2.3.0_skip-test_init_from_local_shards.patch,
 PyTorch-2.3.0_no-cuda-stubs-rpath.patch,
 PyTorch-2.3.0_disable-gcc12-warning.patch,
 PyTorch-2.3.0_fix-test_extension_backend-without-vectorization.patch,
 PyTorch-2.3.0_fix-test_fine_tuning.patch,
 PyTorch-2.3.0_disable_tests_which_need_network_download.patch,
 PyTorch-2.3.0_avoid_caffe2_test_cpp_jit.patch,
 PyTorch-2.3.0_fix_missing_masked_load_for_int_type.patch,
 PyTorch-2.3.0_relax-test_unbacked_reduction.patch,
 PyTorch-2.3.0_remove-fsspec-test.patch,
 PyTorch-2.3.0_skip_test_var_mean_differentiable.patch,
 PyTorch-2.3.0_skip_test_sdpa_nn_functional_scaled_dot_product_attention_cpu.patch,
 PyTorch-2.3.0_fix-mkldnn-avx512-f32-bias.patch,
 PyTorch-2.3.0_fix-unboxing-template-CUDA-12.4.patch,
 PyTorch-2.6.0_show-test-duration.patch, PyTorch-2.7.1_suport-64bit-BARs.patch

---
 .../PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb   | 223 ++++++++++++++++++
 .../PyTorch-2.3.0_fix-test_fine_tuning.patch  |  45 ++++
 ....3.0_fix-unboxing-template-CUDA-12.4.patch |  77 ++++++
 ...est_jit-test_freeze_conv_relu_fusion.patch |  19 ++
 ...-2.3.0_relax-test_unbacked_reduction.patch |  19 ++
 .../PyTorch-2.3.0_remove-fsspec-test.patch    |  67 ++++++
 .../PyTorch-2.7.1_suport-64bit-BARs.patch     |  27 +++
 7 files changed, 477 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_fix-test_fine_tuning.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_fix-unboxing-template-CUDA-12.4.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_increase-tolerance-test_jit-test_freeze_conv_relu_fusion.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_relax-test_unbacked_reduction.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_remove-fsspec-test.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb
new file mode 100644
index 000000000000..720b2e04b548
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb
@@ -0,0 +1,223 @@
+name = 'PyTorch'
+version = '2.3.0'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://pytorch.org/'
+description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
+PyTorch is a deep learning framework that puts Python first."""
+
+toolchain = {'name': 'foss', 'version': '2023b'}
+
+source_urls = [GITHUB_RELEASE]
+sources = ['%(namelower)s-v%(version)s.tar.gz']
+patches = [
+    'PyTorch-1.7.0_disable-dev-shm-test.patch',
+    'PyTorch-1.12.1_add-hypothesis-suppression.patch',
+    'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
+    'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
+    'PyTorch-1.12.1_skip-test_round_robin.patch',
+    'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch',
+    'PyTorch-1.13.1_fix-protobuf-dependency.patch',
+    'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch',
+    'PyTorch-1.13.1_skip-failing-singular-grad-test.patch',
+    'PyTorch-1.13.1_skip-tests-without-fbgemm.patch',
+    'PyTorch-2.0.1_avoid-test_quantization-failures.patch',
+    'PyTorch-2.0.1_fix-skip-decorators.patch',
+    'PyTorch-2.0.1_fix-vsx-loadu.patch',
+    'PyTorch-2.0.1_skip-failing-gradtest.patch',
+    'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch',
+    'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch',
+    'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch',
+    'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch',
+    'PyTorch-2.1.0_remove-test-requiring-online-access.patch',
+    'PyTorch-2.1.0_skip-diff-test-on-ppc.patch',
+    'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch',
+    'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch',
+    'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch',
+    'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch',
+    'PyTorch-2.3.0_disable_test_linear_package_if_no_half_types_are_available.patch',
+    'PyTorch-2.3.0_disable_DataType_dependent_test_if_tensorboard_is_not_available.patch',
+    'PyTorch-2.3.0_fix-cpuinfo-bug-with-smt.patch',
+    'PyTorch-2.3.0_increase-tolerance-test_jit-test_freeze_conv_relu_fusion.patch',
+    'PyTorch-2.3.0_skip-test_init_from_local_shards.patch',
+    'PyTorch-2.3.0_no-cuda-stubs-rpath.patch',
+    'PyTorch-2.3.0_disable-gcc12-warning.patch',
+    'PyTorch-2.3.0_fix-test_extension_backend-without-vectorization.patch',
+    'PyTorch-2.3.0_fix-test_fine_tuning.patch',
+    'PyTorch-2.3.0_disable_tests_which_need_network_download.patch',
+    'PyTorch-2.3.0_avoid_caffe2_test_cpp_jit.patch',
+    'PyTorch-2.3.0_fix_missing_masked_load_for_int_type.patch',
+    'PyTorch-2.3.0_relax-test_unbacked_reduction.patch',
+    'PyTorch-2.3.0_remove-fsspec-test.patch',
+    'PyTorch-2.3.0_skip_test_var_mean_differentiable.patch',
+    'PyTorch-2.3.0_skip_test_sdpa_nn_functional_scaled_dot_product_attention_cpu.patch',
+    'PyTorch-2.3.0_fix-mkldnn-avx512-f32-bias.patch',
+    'PyTorch-2.3.0_fix-unboxing-template-CUDA-12.4.patch',
+    'PyTorch-2.6.0_show-test-duration.patch',
+    'PyTorch-2.7.1_suport-64bit-BARs.patch',
+]
+checksums = [
+    {'pytorch-v2.3.0.tar.gz': '69579513b26261bbab32e13b7efc99ad287fcf3103087f2d4fdf1adacd25316f'},
+    {'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
+    {'PyTorch-1.12.1_add-hypothesis-suppression.patch':
+     'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
+    {'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch':
+     '1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'},
+    {'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'},
+    {'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
+    {'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch':
+     '5c7be91a6096083a0b1315efe0001537499c600f1f569953c6a2c7f4cc1d0910'},
+    {'PyTorch-1.13.1_fix-protobuf-dependency.patch':
+     '8bd755a0cab7233a243bc65ca57c9630dfccdc9bf8c9792f0de4e07a644fcb00'},
+    {'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch':
+     'bdde0f2105215c95a54de64ec4b1a4520528510663174fef6d5b900eb1db3937'},
+    {'PyTorch-1.13.1_skip-failing-singular-grad-test.patch':
+     '72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'},
+    {'PyTorch-1.13.1_skip-tests-without-fbgemm.patch':
+     '481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'},
+    {'PyTorch-2.0.1_avoid-test_quantization-failures.patch':
+     '02e3f47e4ed1d7d6077e26f1ae50073dc2b20426269930b505f4aefe5d2f33cd'},
+    {'PyTorch-2.0.1_fix-skip-decorators.patch': '2039012cef45446065e1a2097839fe20bb29fe3c1dcc926c3695ebf29832e920'},
+    {'PyTorch-2.0.1_fix-vsx-loadu.patch': 'a0ffa61da2d47c6acd09aaf6d4791e527d8919a6f4f1aa7ed38454cdcadb1f72'},
+    {'PyTorch-2.0.1_skip-failing-gradtest.patch': '8030bdec6ba49b057ab232d19a7f1a5e542e47e2ec340653a246ec9ed59f8bc1'},
+    {'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch':
+     '7047862abc1abaff62954da59700f36d4f39fcf83167a638183b1b7f8fec78ae'},
+    {'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch':
+     '166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'},
+    {'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch':
+     '3793b4b878be1abe7791efcbd534774b87862cfe7dc4774ca8729b6cabb39e7e'},
+    {'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch':
+     'aef38adf1210d0c5455e91d7c7a9d9e5caad3ae568301e0ba9fc204309438e7b'},
+    {'PyTorch-2.1.0_remove-test-requiring-online-access.patch':
+     '35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'},
+    {'PyTorch-2.1.0_skip-diff-test-on-ppc.patch': '394157dbe565ffcbc1821cd63d05930957412156cc01e949ef3d3524176a1dda'},
+    {'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch':
+     '6298daf9ddaa8542850eee9ea005f28594ab65b1f87af43d8aeca1579a8c4354'},
+    {'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch':
+     '5229ca88a71db7667a90ddc0b809b2c817698bd6e9c5aaabd73d3173cf9b99fe'},
+    {'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch':
+     '7ace835af60c58d9e0754a34c19d4b9a0c3a531f19e5d0eba8e2e49206eaa7eb'},
+    {'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch':
+     'fb96eefabf394617bbb3fbd3a7a7c1aa5991b3836edc2e5d2a30e708bfe49ba1'},
+    {'PyTorch-2.3.0_disable_test_linear_package_if_no_half_types_are_available.patch':
+     '23416f2d9d5226695ec3fbea0671e3650c655c19deefd3f0f8ddab5afa50f485'},
+    {'PyTorch-2.3.0_disable_DataType_dependent_test_if_tensorboard_is_not_available.patch':
+     '0dcbdfde6752c3ff54c5376f521b4a742167669feb7f0f1d4e1d4d55f72b664f'},
+    {'PyTorch-2.3.0_fix-cpuinfo-bug-with-smt.patch':
+     '29fb95d1dba070133b513de050febd328ed36905a73f1ca135dc633f16beafa4'},
+    {'PyTorch-2.3.0_increase-tolerance-test_jit-test_freeze_conv_relu_fusion.patch':
+     '6f8eba5b546129ea975cda1a8a7098ca3245ad2b040a31a98807ee6d69cad0d4'},
+    {'PyTorch-2.3.0_skip-test_init_from_local_shards.patch':
+     '90ed9c2870f57ee6dc032d00873a37e2217a2b92a13035ded1c25ad5306455f2'},
+    {'PyTorch-2.3.0_no-cuda-stubs-rpath.patch': '7ba26824b5def7379cff02ae821a080698e6affea0da45bc846e9ecb89939cb1'},
+    {'PyTorch-2.3.0_disable-gcc12-warning.patch': 'a8a624e1a2a5f4c82610173e50bd0f853e49bd5621b432f5aac689f9f6eb1514'},
+    {'PyTorch-2.3.0_fix-test_extension_backend-without-vectorization.patch':
+     '36aa2d5ba175be17f4e996f4fb2d544fe477d4a0bd0644cd59a85063779afc8e'},
+    {'PyTorch-2.3.0_fix-test_fine_tuning.patch': 'daa24801f3b2b5f76b639a14fba9a6ad84fe99ebed53401e217d02f94cfe48bf'},
+    {'PyTorch-2.3.0_disable_tests_which_need_network_download.patch':
+     'b7fd1a5135dfd4098cdc054182f7bf84a23ac98462a00477712182b5442da855'},
+    {'PyTorch-2.3.0_avoid_caffe2_test_cpp_jit.patch':
+     '041adcd91d994b8c2ab57d227f081cd57e572c157117b37171e1eb8eb576f8fc'},
+    {'PyTorch-2.3.0_fix_missing_masked_load_for_int_type.patch':
+     'aa6ff764f3f7bf84372a8a257fe1b4ae6dc4b9744ad35f0f9015f2696c62a41e'},
+    {'PyTorch-2.3.0_relax-test_unbacked_reduction.patch':
+     'c822f084bd97b6c76bea692e3a4664e227b3aea57c80e576a841943877085b77'},
+    {'PyTorch-2.3.0_remove-fsspec-test.patch': '09be192401013cd8cd66add9d6565ac3e879e004d77e61145f826b768267ff61'},
+    {'PyTorch-2.3.0_skip_test_var_mean_differentiable.patch':
+     '9703fd0f1fca8916f6d79d83e9a7efe8e3f717362a5fdaa8f5d9da90d0c75018'},
+    {'PyTorch-2.3.0_skip_test_sdpa_nn_functional_scaled_dot_product_attention_cpu.patch':
+     '7955f2655db3da18606574fdcbc5990be24098f49ad1db5e86ea756ea1cc506f'},
+    {'PyTorch-2.3.0_fix-mkldnn-avx512-f32-bias.patch':
+     'ee07d21c3ac7aeb0bd0e39507b18a417b9125284a529102929c4b5c6727c2976'},
+    {'PyTorch-2.3.0_fix-unboxing-template-CUDA-12.4.patch':
+     '6205d8249e7edcce5756e073ab0b11a0496da34eec1a55e3d24437a530d2886b'},
+    {'PyTorch-2.6.0_show-test-duration.patch': '5508f2f9619204d9f3c356dbd4000a00d58f452ab2d64ae920eb8bc8b5484d75'},
+    {'PyTorch-2.7.1_suport-64bit-BARs.patch': '317c3d220aa87426d86e137a6c1a8f910adf9580ca0848371e0f6800c05dbde1'},
+]
+
+osdependencies = [OS_PKG_IBVERBS_DEV]
+
+builddependencies = [
+    ('CMake', '3.27.6'),
+    ('hypothesis', '6.90.0'),
+    # For tests
+    ('pytest-flakefinder', '1.1.0'),
+    ('pytest-rerunfailures', '14.0'),
+    ('pytest-shard', '0.1.2'),
+    ('tlparse', '0.3.5'),
+    ('optree', '0.13.0'),
+    ('unittest-xml-reporting', '3.1.0'),
+]
+
+dependencies = [
+    ('CUDA', '12.4.0', '', SYSTEM),
+    ('cuDNN', '9.0.0.312', versionsuffix, SYSTEM),
+    ('magma', '2.7.2', versionsuffix),
+    ('NCCL', '2.20.5', versionsuffix),
+    # Version from .ci/docker/triton_version.txt
+    ('Triton', '2.3.1', versionsuffix),
+    ('Ninja', '1.11.1'),  # Required for JIT compilation of C++ extensions
+    ('Python', '3.11.5'),
+    ('Python-bundle-PyPI', '2023.10'),
+    ('protobuf', '25.3'),
+    ('protobuf-python', '4.25.3'),
+    ('pybind11', '2.11.1'),
+    ('SciPy-bundle', '2023.11'),
+    ('PyYAML', '6.0.1'),
+    ('MPFR', '4.2.1'),
+    ('GMP', '6.3.0'),
+    ('numactl', '2.0.16'),
+    ('FFmpeg', '6.0'),
+    ('Pillow', '10.2.0'),
+    ('expecttest', '0.2.1'),
+    ('networkx', '3.2.1'),
+    ('sympy', '1.12'),
+    ('Z3', '4.13.0'),
+]
+
+buildcmd = '%(python)s setup.py build'  # Run the (long) build in the build step
+
+excluded_tests = {
+    '': [
+        # This test seems to take too long on NVIDIA Ampere at least.
+        'distributed/test_distributed_spawn',
+        # Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375
+        'distributions/test_constraints',
+        # no xdoctest
+        'doctests',
+        # failing on broadwell
+        # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
+        'test_native_mha',
+        # intermittent failures on various systems
+        # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
+        'distributed/rpc/test_tensorpipe_agent',
+        # This test is expected to fail when run in their CI, but won't in our case.
+        # It just checks for a "CI" env variable
+        'test_ci_sanity_check_fail',
+        # This fails consistently and is disabled upstream
+        # See https://github.com/pytorch/pytorch/issues/100152 and
+        # https://github.com/pytorch/pytorch/pull/124712
+        'test_cpp_extensions_open_device_registration',
+        # Test broken until 2.4: https://github.com/pytorch/pytorch/pull/124786
+        'distributed/checkpoint/test_save_load_api',
+        # Test broken until 2.4: https://github.com/pytorch/pytorch/issues/122184
+        'distributed/tensor/parallel/test_tp_random_state',
+        # Doesn't find "dist.all_reduce(" in generated code. Known failures, e.g.
+        # https://github.com/pytorch/pytorch/issues/121195
+        'test/distributed/test_compute_comm_reordering',
+        # Long test (~65min), tested successfully once during creation of EC
+        'inductor/test_aot_inductor',
+    ]
+}
+
+local_test_opts = '--continue-through-error --pipe-logs --verbose %(excluded_tests)s'
+runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py ' + local_test_opts
+
+# test_quantization in particular has a few corner cases that are triggered by random input values
+# and cannot easily be avoided, see https://github.com/pytorch/pytorch/issues/107030
+# So allow a small number of tests to fail, as the tests "usually" succeed
+max_failed_tests = 16
+
+tests = ['PyTorch-check-cpp-extension.py']
+
+moduleclass = 'ai'
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_fix-test_fine_tuning.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_fix-test_fine_tuning.patch
new file mode 100644
index 000000000000..62dbfe21134d
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_fix-test_fine_tuning.patch
@@ -0,0 +1,45 @@
+Fixes
+> TypeError: get_state_dict() missing 1 required positional argument: 'optimizers'
+
+From 61d30b6e8acbd3cfb087761defa74f19f9be96bb Mon Sep 17 00:00:00 2001
+From: cdzhan <zhancdi@163.com>
+Date: Mon, 24 Jun 2024 20:02:08 +0800
+Subject: [PATCH] [easy][DCP] Fix test_fine_tuning.py for get/set_state_dict
+ API changes
+
+---
+ test/distributed/checkpoint/e2e/test_fine_tuning.py | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/test/distributed/checkpoint/e2e/test_fine_tuning.py b/test/distributed/checkpoint/e2e/test_fine_tuning.py
+index a93f242187709c..fd21524882c839 100644
+--- a/test/distributed/checkpoint/e2e/test_fine_tuning.py
++++ b/test/distributed/checkpoint/e2e/test_fine_tuning.py
+@@ -9,7 +9,9 @@
+ import torch.nn as nn
+ from torch.distributed._tensor import init_device_mesh
+ from torch.distributed.checkpoint.state_dict import (
++    get_model_state_dict,
+     get_state_dict,
++    set_model_state_dict,
+     set_state_dict,
+     StateDictOptions,
+ )
+@@ -120,7 +122,7 @@ def finetune(self, pretrain_dir: str, finetune_dir: str) -> None:
+         # Simulate that the fine tuning restart after 3 iterations
+         for i in range(2):
+             # Load pretrain submodules checkpoint
+-            pretrain_state_dict, _ = get_state_dict(
++            pretrain_state_dict = get_model_state_dict(
+                 model,
+                 submodules={model.pretrain},
+                 options=StateDictOptions(keep_submodule_prefixes=False),
+@@ -129,7 +131,7 @@ def finetune(self, pretrain_dir: str, finetune_dir: str) -> None:
+                 {"model": pretrain_state_dict},
+                 storage_reader=dist_cp.FileSystemReader(pretrain_dir),
+             )
+-            set_state_dict(
++            set_model_state_dict(
+                 model,
+                 model_state_dict={model.pretrain: pretrain_state_dict},
+                 options=StateDictOptions(strict=False),
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_fix-unboxing-template-CUDA-12.4.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_fix-unboxing-template-CUDA-12.4.patch
new file mode 100644
index 000000000000..5c2608ae0144
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_fix-unboxing-template-CUDA-12.4.patch
@@ -0,0 +1,77 @@
+From 2ac499392f5f5f2fa518ef74f0e3ab0921b87a2f Mon Sep 17 00:00:00 2001
+From: His-Wardship <elliot.ward.mlf2019@said.oxford.edu>
+Date: Thu, 4 Apr 2024 18:25:24 +0100
+Subject: [PATCH 1/3] Refactor SFINAE logic in boxing with intermediate helper
+ struct, fixes compilation for CUDA 12.4.
+
+---
+ aten/src/ATen/core/boxing/impl/boxing.h | 11 ++++++++++-
+ 1 file changed, 10 insertions(+), 1 deletion(-)
+
+diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h
+index f8055d95b8824..0749e6cd59fe1 100644
+--- a/aten/src/ATen/core/boxing/impl/boxing.h
++++ b/aten/src/ATen/core/boxing/impl/boxing.h
+@@ -39,7 +39,16 @@ template <class T, class Enable = void>
+ struct has_ivalue_to : std::false_type {};
+ 
+ template <class T>
+-struct has_ivalue_to<T, std::void_t<decltype(std::declval<IValue>().to<T>())>>
++struct ivalue_to_helper
++{
++    using type = decltype(std::declval<IValue>().template to<T>());
++};
++template <class T>
++using ivalue_to_helper_t = typename ivalue_to_helper<T>::type;
++
++template <class T>
++struct has_ivalue_to<T, guts::void_t<ivalue_to_helper_t<T>>>
++
+ : std::true_type
+ {};
+ 
+
+From 1af7507f410337221131142210d23504c98e38b4 Mon Sep 17 00:00:00 2001
+From: His-Wardship <139779341+His-Wardship@users.noreply.github.com>
+Date: Thu, 4 Apr 2024 20:19:10 +0100
+Subject: [PATCH 2/3] remove superfluous blank line boxing.h
+
+---
+ aten/src/ATen/core/boxing/impl/boxing.h | 1 -
+ 1 file changed, 1 deletion(-)
+
+diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h
+index 0749e6cd59fe1..8a88f2e656786 100644
+--- a/aten/src/ATen/core/boxing/impl/boxing.h
++++ b/aten/src/ATen/core/boxing/impl/boxing.h
+@@ -48,7 +48,6 @@ using ivalue_to_helper_t = typename ivalue_to_helper<T>::type;
+ 
+ template <class T>
+ struct has_ivalue_to<T, guts::void_t<ivalue_to_helper_t<T>>>
+-
+ : std::true_type
+ {};
+ 
+
+From 25f691941ccb7fad35a8d832738ae9b2c0f88b0f Mon Sep 17 00:00:00 2001
+From: His-Wardship <139779341+His-Wardship@users.noreply.github.com>
+Date: Sun, 7 Apr 2024 12:33:47 +0100
+Subject: [PATCH 3/3] update void_t to use std namespace
+
+---
+ aten/src/ATen/core/boxing/impl/boxing.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h
+index 8a88f2e656786..82fdd824ea65b 100644
+--- a/aten/src/ATen/core/boxing/impl/boxing.h
++++ b/aten/src/ATen/core/boxing/impl/boxing.h
+@@ -47,7 +47,7 @@ template <class T>
+ using ivalue_to_helper_t = typename ivalue_to_helper<T>::type;
+ 
+ template <class T>
+-struct has_ivalue_to<T, guts::void_t<ivalue_to_helper_t<T>>>
++struct has_ivalue_to<T, std::void_t<ivalue_to_helper_t<T>>>
+ : std::true_type
+ {};
+ 
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_increase-tolerance-test_jit-test_freeze_conv_relu_fusion.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_increase-tolerance-test_jit-test_freeze_conv_relu_fusion.patch
new file mode 100644
index 000000000000..99e43d05aa47
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_increase-tolerance-test_jit-test_freeze_conv_relu_fusion.patch
@@ -0,0 +1,19 @@
+test_jit, test_jit_legacy, test_jit_profiling fail in test_freeze_conv_relu_fusion with:
+> Greatest absolute difference: 3.053247928619385e-05 at index (1, 1, 0, 0, 0) (up to 1e-05 allowed)
+> Greatest relative difference: 0.0004548609140329063 at index (3, 1, 0, 0, 0) (up to 1.3e-06 allowed)
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py
+index f13c2b113b4..454df769d9c 100644
+--- a/test/jit/test_freezing.py
++++ b/test/jit/test_freezing.py
+@@ -2795,7 +2795,7 @@ class TestFrozenOptimizations(JitTestCase):
+                     else:
+                         FileCheck().check("aten::cudnn_convolution_relu").run(frozen_mod.graph)
+ 
+-                self.assertEqual(mod_eager(inp), frozen_mod(inp))
++                self.assertEqual(mod_eager(inp), frozen_mod(inp), atol=4e-5, rtol=5e-4)
+ 
+     @unittest.skipIf(not (TEST_CUDNN or TEST_WITH_ROCM), "requires CUDNN")
+     def test_freeze_conv_relu_fusion_not_forward(self):
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_relax-test_unbacked_reduction.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_relax-test_unbacked_reduction.patch
new file mode 100644
index 000000000000..b084fdf5ecf2
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_relax-test_unbacked_reduction.patch
@@ -0,0 +1,19 @@
+With our Triton version the expected failure doesn't happen anymore.
+See also https://github.com/pytorch/pytorch/issues/154217
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/inductor/test_torchinductor_dynamic_shapes.py b/test/inductor/test_torchinductor_dynamic_shapes.py
+index 11de2175fc4..68d33a9bb48 100644
+--- a/test/inductor/test_torchinductor_dynamic_shapes.py
++++ b/test/inductor/test_torchinductor_dynamic_shapes.py
+@@ -367,9 +367,6 @@ class TestInductorDynamic(TestCase):
+         except Exception:
+             if not expect_fail:
+                 raise
+-        else:
+-            if expect_fail:
+-                self.fail("expected to fail, but actually passed")
+ 
+     @torch._dynamo.config.patch(
+         capture_scalar_outputs=True, capture_dynamic_output_shape_ops=True
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_remove-fsspec-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_remove-fsspec-test.patch
new file mode 100644
index 000000000000..b207270fd1ab
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_remove-fsspec-test.patch
@@ -0,0 +1,67 @@
+Use part of a revert commit to avoid failures caused by:
+>     storage_reader = storage_reader or DCP.FileSystemReader()
+>                                        ^^^^^^^^^^^^^^^^^^^^^^
+> TypeError: FileSystemReader.__init__() missing 1 required positional argument: 'path'
+
+Author: Alexander Grund (TU Dresden)
+
+--- a/test/distributed/checkpoint/e2e/test_e2e_save_and_load.py
++++ b/test/distributed/checkpoint/e2e/test_e2e_save_and_load.py
+@@ -171,28 +171,7 @@ class TestE2ESaveAndLoad(DTensorTestBase, VerifyStateDictMixin):
+     def test_e2e_async(self):
+         self._run_e2e_test(compile=False, model_type=ModelType.FSDP, async_op=True)
+ 
+-    @with_comms
+-    @skip_if_lt_x_gpu(4)
+-    @with_temp_dir
+-    def test_fsspec(self):
+-        self._run_e2e_test(
+-            compile=False,
+-            model_type=ModelType.FSDP,
+-            storage_reader=DCP.FsspecReader(),
+-            storage_writer=DCP.FsspecWriter(),
+-        )
+-
+-    def _run_e2e_test(
+-        self,
+-        compile,
+-        model_type,
+-        async_op=False,
+-        storage_reader=None,
+-        storage_writer=None,
+-    ):
+-        storage_reader = storage_reader or DCP.FileSystemReader()
+-        storage_writer = storage_writer or DCP.FileSystemWriter()
+-
++    def _run_e2e_test(self, compile, model_type, async_op=False):
+         model, optim = self._create_model(compile, ModelType.NONE)
+         _train(model, optim, train_steps=2)
+ 
+@@ -207,9 +186,7 @@ class TestE2ESaveAndLoad(DTensorTestBase, VerifyStateDictMixin):
+         }
+ 
+         if async_op:
+-            f = saver.async_save(
+-                sd, checkpoint_id=self.temp_dir, storage_writer=storage_writer
+-            )
++            f = saver.async_save(sd, checkpoint_id=self.temp_dir)
+             t = time.monotonic()
+             while not f.done():
+                 time.sleep(1)
+@@ -217,7 +194,7 @@ class TestE2ESaveAndLoad(DTensorTestBase, VerifyStateDictMixin):
+ 
+             f.result()
+         else:
+-            DCP.save(sd, checkpoint_id=self.temp_dir, storage_writer=storage_writer)
++            DCP.save(sd, checkpoint_id=self.temp_dir)
+ 
+         loaded_stateful_obj = TestStatefulObj()
+         dist_model, dist_optim = self._create_model(compile, model_type)
+@@ -232,7 +209,6 @@ class TestE2ESaveAndLoad(DTensorTestBase, VerifyStateDictMixin):
+                 "s": loaded_stateful_obj,
+             },
+             checkpoint_id=self.temp_dir,
+-            storage_reader=storage_reader,
+         )
+ 
+         self.assertEqual(original_stateful_obj, loaded_stateful_obj)
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch
new file mode 100644
index 000000000000..6e8cdfb2d36a
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch
@@ -0,0 +1,27 @@
+When the GPUs use 64bit BARs the RPC module fails during the initialization with:
+> E           RuntimeError: In getBar1SizeOfGpu at tensorpipe/channel/cuda_gdr/context_impl.cc:242 "": No such file or directory
+
+This causes KeyboardInterrupt errors in distributed/rpc/test_share_memory
+
+See https://github.com/pytorch/pytorch/issues/159354
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc b/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc
+index 182a04a..b26751e 100644
+--- a/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc
++++ b/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc
+@@ -239,6 +239,13 @@ size_t getBar1SizeOfGpu(int gpuIdx) {
+ 
+   struct stat bar1Stats;
+   int rv = ::stat(pciPath.c_str(), &bar1Stats);
++  if (rv < 0 && errno == ENOENT) {
++    // Some GPUs use 64 bit BARs using 2 slots each,
++    // so the BAR 0 spans slots 0 & 1 and BAR 1 is at slots 2 & 3
++    TP_VLOG(5) << "GPU #" << gpuIdx << " might has 64 bit BARs";
++    pciPath[pciPath.size() - 1] = '2';
++    rv = ::stat(pciPath.c_str(), &bar1Stats);
++  }
+   TP_THROW_SYSTEM_IF(rv < 0, errno);
+ 
+   return bar1Stats.st_size;

From 99383619d811a096af55ee224a62f35463942c90 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Thu, 31 Jul 2025 18:25:05 +0200
Subject: [PATCH 2/5] Skip 2 more long tests

---
 .../p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb       | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb
index 720b2e04b548..710011c3888b 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb
@@ -205,8 +205,10 @@ excluded_tests = {
         # Doesn't find "dist.all_reduce(" in generated code. Known failures, e.g.
         # https://github.com/pytorch/pytorch/issues/121195
         'test/distributed/test_compute_comm_reordering',
-        # Long test (~65min), tested successfully once during creation of EC
-        'inductor/test_aot_inductor',
+        # Long tests, tested successfully once during creation of EC
+        'inductor/test_aot_inductor',  # ~65min
+        'distributed/fsdp/test_fsdp_state_dict',  # ~202min
+        'distributed/fsdp/test_fsdp_core',  # ~88min
     ]
 }
 

From c5c98672e5665e071755e4788105e2600f36533a Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Tue, 5 Aug 2025 11:14:57 +0200
Subject: [PATCH 3/5] Fix test name

---
 .../p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb
index 710011c3888b..bb1f766f4872 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb
@@ -204,7 +204,7 @@ excluded_tests = {
         'distributed/tensor/parallel/test_tp_random_state',
         # Doesn't find "dist.all_reduce(" in generated code. Known failures, e.g.
         # https://github.com/pytorch/pytorch/issues/121195
-        'test/distributed/test_compute_comm_reordering',
+        'distributed/test_compute_comm_reordering',
         # Long tests, tested successfully once during creation of EC
         'inductor/test_aot_inductor',  # ~65min
         'distributed/fsdp/test_fsdp_state_dict',  # ~202min

From 2370962a0d944cb966e83d43a289de5547f8297f Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Tue, 5 Aug 2025 11:16:41 +0200
Subject: [PATCH 4/5] Add dependencies

---
 .../c/cuDNN/cuDNN-9.0.0.312-CUDA-12.4.0.eb    |  46 +++++++
 .../Triton-2.3.1-foss-2023b-CUDA-12.4.0.eb    | 112 ++++++++++++++++++
 2 files changed, 158 insertions(+)
 create mode 100644 easybuild/easyconfigs/c/cuDNN/cuDNN-9.0.0.312-CUDA-12.4.0.eb
 create mode 100644 easybuild/easyconfigs/t/Triton/Triton-2.3.1-foss-2023b-CUDA-12.4.0.eb

diff --git a/easybuild/easyconfigs/c/cuDNN/cuDNN-9.0.0.312-CUDA-12.4.0.eb b/easybuild/easyconfigs/c/cuDNN/cuDNN-9.0.0.312-CUDA-12.4.0.eb
new file mode 100644
index 000000000000..cce5b8396b98
--- /dev/null
+++ b/easybuild/easyconfigs/c/cuDNN/cuDNN-9.0.0.312-CUDA-12.4.0.eb
@@ -0,0 +1,46 @@
+name = 'cuDNN'
+version = '9.0.0.312'
+versionsuffix = '-CUDA-%(cudaver)s'
+homepage = 'https://developer.nvidia.com/cudnn'
+description = """The NVIDIA CUDA Deep Neural Network library (cuDNN) is
+a GPU-accelerated library of primitives for deep neural networks."""
+
+toolchain = SYSTEM
+
+# note: cuDNN is tied to specific CUDA versions,
+# see also https://docs.nvidia.com/deeplearning/cudnn/support-matrix/index.html#cudnn-cuda-hardware-versions
+local_cuda_major = '12'
+
+source_urls = [
+    'https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-%(cudnnarch)s/'
+]
+sources = ['%%(namelower)s-linux-%%(cudnnarch)s-%%(version)s_cuda%s-archive.tar.xz' % local_cuda_major]
+checksums = [{
+    '%%(namelower)s-linux-ppc64le-%%(version)s_cuda%s-archive.tar.xz' % local_cuda_major:
+        'b8ef6f249128e1985893a8787a21de35cb83ec47c6dc6fd1809061dd9a3ffb20',
+    '%%(namelower)s-linux-sbsa-%%(version)s_cuda%s-archive.tar.xz' % local_cuda_major:
+        '430fbf5b513c69e989b3a3a5a572369778ce0c214ce1259af6b935f9cab7dd54',
+    '%%(namelower)s-linux-x86_64-%%(version)s_cuda%s-archive.tar.xz' % local_cuda_major:
+        'd3890e609d6530ee5b88ff95b60c8e6b1c1ec7fa966ec533925f20f896fcc630',
+}]
+
+dependencies = [('CUDA', '12.4.0')]
+
+local_static_libs = [
+    'libcudnn_adv_static_v9.a',
+    'libcudnn_cnn_static_v9.a',
+    'libcudnn_engines_precompiled_static_v9.a',
+    'libcudnn_engines_runtime_compiled_static_v9.a',
+    'libcudnn_graph_static_v9.a',
+    'libcudnn_heuristic_static_v9.a',
+    'libcudnn_ops_static_v9.a',
+]
+sanity_check_paths = {
+    'files': [
+        'include/cudnn.h',
+        'lib64/libcudnn.%s' % SHLIB_EXT
+    ] + [ 'lib64/' + i for i in local_static_libs],
+    'dirs': ['include', 'lib64'],
+}
+
+moduleclass = 'numlib'
diff --git a/easybuild/easyconfigs/t/Triton/Triton-2.3.1-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/t/Triton/Triton-2.3.1-foss-2023b-CUDA-12.4.0.eb
new file mode 100644
index 000000000000..27418217576b
--- /dev/null
+++ b/easybuild/easyconfigs/t/Triton/Triton-2.3.1-foss-2023b-CUDA-12.4.0.eb
@@ -0,0 +1,112 @@
+easyblock = 'PythonBundle'
+
+name = 'Triton'
+version = '2.3.1'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://triton-lang.org/'
+
+description = """Triton is a language and compiler for parallel programming. It aims to provide a
+Python-based programming environment for productively writing custom DNN compute
+kernels capable of running at maximal throughput on modern GPU hardware."""
+
+toolchain = {'name': 'foss', 'version': '2023b'}
+
+builddependencies = [
+    ('CMake', '3.27.6'),
+    ('Ninja', '1.11.1'),
+    # LLVM dependencies
+    ('git', '2.42.0'),
+    ('libxml2', '2.11.5'),
+    ('ncurses', '6.4'),
+]
+
+dependencies = [
+    ('CUDA', '12.4.0', '', SYSTEM),
+    ('Python', '3.11.5'),
+    ('Python-bundle-PyPI', '2023.10'),
+    ('SciPy-bundle', '2023.11'),
+    ('lit', '18.1.7'),
+    ('pybind11', '2.11.1'),
+    ('Z3', '4.13.0'),
+    # LLVM dependencies
+    ('zlib', '1.2.13'),
+]
+
+components = [
+    # LLVM is built from a fixed commit; the version is the hash from cmake/llvm-hash.txt
+    ('LLVM', '5e5a22caf88ac1ccfa8dc5720295fdeba0ad9372', {
+        'easyblock': 'CMakeNinja',
+        'source_urls': ['https://github.com/llvm/llvm-project/archive/'],
+        'sources': [{
+            'download_filename': '%(version)s.tar.gz',
+            'filename': 'llvm-project-%(version)s.tar.gz',
+        }],
+        'checksums': [
+            {'llvm-project-5e5a22caf88ac1ccfa8dc5720295fdeba0ad9372.tar.gz':
+             '9d9ae8ae30f6262ca0823493893398ea2ab6fbd49027e338e06ac7c25bb8caf4'},
+        ],
+        'start_dir': 'llvm-project-%(version)s',
+        'separate_build_dir': 'llvm_build_dir',  # passed to the Triton build via LLVM_SYSPATH
+        'configopts': ' '.join([
+            # Reduce dependencies
+            '-DLLVM_ENABLE_BINDINGS=OFF',  # Requires OCAML
+            '-DLLVM_ENABLE_LIBEDIT=OFF',
+            # See https://github.com/triton-lang/triton/blob/develop/scripts/build-llvm-project.sh
+            '-DCMAKE_BUILD_TYPE="Release"',
+            '-DLLVM_CCACHE_BUILD=OFF',
+            '-DLLVM_ENABLE_ASSERTIONS=ON',
+            '-DLLVM_OPTIMIZED_TABLEGEN=ON',
+            '-DLLVM_TARGETS_TO_BUILD="Native;NVPTX;AMDGPU"',
+            '-DLLVM_ENABLE_PROJECTS="mlir;llvm"',
+        ]),
+        'srcdir': 'llvm',
+        'skipsteps': ['install'],  # LLVM is not installed, it is used from its build directory
+    })
+]
+
+local_preinstallopts = ' '.join([
+    'TRITON_BUILD_WITH_CLANG_LLD=0',
+    "TRITON_HOME='%(builddir)s/triton_home'",
+    'TRITON_PTXAS_PATH="$CUDA_HOME/bin/ptxas"',  # CUDA tools: when set, setup.py does not download them
+    'TRITON_CUOBJDUMP_PATH="$CUDA_HOME/bin/cuobjdump"',
+    'TRITON_NVDISASM_PATH="$CUDA_HOME/bin/nvdisasm"',
+    'LLVM_SYSPATH="%(builddir)s/llvm_build_dir"',  # build directory of the LLVM component
+    'JSON_SYSPATH="$EBROOTNLOHMANN_JSON"',
+    'PYBIND11_SYSPATH="$EBROOTPYBIND11"',
+    'MAX_JOBS=%(parallel)s',
+    # Build type
+    'DEBUG=0 REL_WITH_DEB_INFO=0 TRITON_REL_BUILD_WITH_ASSERTS=1',
+    "TRITON_APPEND_CMAKE_ARGS='-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON'",
+]) + ' '  # trailing space separates the environment settings from the install command
+
+exts_list = [
+    (name, version, {
+        # make pip print output of cmake
+        'installopts': "-v ",
+        # ensure that libdevice.10.bc from $EBROOTCUDA/nvvm/libdevice is used:
+        'postinstallcmds': [
+            'rm -rf %(installdir)s/lib/python%(pyshortver)s/site-packages/triton/backends/nvidia/lib/libdevice.10.bc'
+        ],
+        'preinstallopts': local_preinstallopts,
+        'source_urls': ['https://github.com/triton-lang/triton/archive/'],
+        'sources': [{
+            'filename': SOURCE_TAR_GZ,
+            'download_filename': '958fccea74da58e7e0595ab88ae6cd3f6795a173.tar.gz',
+        }],
+        'patches': ['Triton-2.3.1_disable-dependency-download.patch'],
+        'checksums': [
+            {'Triton-2.3.1.tar.gz': '07d67f6e00de85cb4cace1fe716a22e6eaf623712137d21276445392d0475c44'},
+            {'Triton-2.3.1_disable-dependency-download.patch':
+             '8a768a27b378521967f91536cd13ff315efb966d2dc6a9710c8818c60fb688fe'},
+        ],
+        'start_dir': 'python',
+    }),
+]
+
+modextravars = {
+    'TRITON_PTXAS_PATH': '$CUDA_HOME/bin/ptxas',
+    # ensure that libdevice.10.bc from $EBROOTCUDA/nvvm/libdevice is used:
+    'TRITON_LIBDEVICE_PATH': '$CUDA_HOME/nvvm/libdevice/libdevice.10.bc'
+}
+moduleclass = 'devel'

From a5052e6114a13494f194ceddc456179a2a5444c2 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Tue, 5 Aug 2025 12:51:38 +0200
Subject: [PATCH 5/5] Add dependencies

---
 .../c/cuDNN/cuDNN-9.0.0.312-CUDA-12.4.0.eb    |  2 +-
 ...on-2.3.1_disable-dependency-download.patch | 33 +++++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 easybuild/easyconfigs/t/Triton/Triton-2.3.1_disable-dependency-download.patch

diff --git a/easybuild/easyconfigs/c/cuDNN/cuDNN-9.0.0.312-CUDA-12.4.0.eb b/easybuild/easyconfigs/c/cuDNN/cuDNN-9.0.0.312-CUDA-12.4.0.eb
index cce5b8396b98..c1f3b9f096de 100644
--- a/easybuild/easyconfigs/c/cuDNN/cuDNN-9.0.0.312-CUDA-12.4.0.eb
+++ b/easybuild/easyconfigs/c/cuDNN/cuDNN-9.0.0.312-CUDA-12.4.0.eb
@@ -39,7 +39,7 @@ sanity_check_paths = {
     'files': [
         'include/cudnn.h',
         'lib64/libcudnn.%s' % SHLIB_EXT
-    ] + [ 'lib64/' + i for i in local_static_libs],
+    ] + ['lib64/' + i for i in local_static_libs],
     'dirs': ['include', 'lib64'],
 }
 
diff --git a/easybuild/easyconfigs/t/Triton/Triton-2.3.1_disable-dependency-download.patch b/easybuild/easyconfigs/t/Triton/Triton-2.3.1_disable-dependency-download.patch
new file mode 100644
index 000000000000..26a0a53b1f15
--- /dev/null
+++ b/easybuild/easyconfigs/t/Triton/Triton-2.3.1_disable-dependency-download.patch
@@ -0,0 +1,33 @@
+Error when downloads would happen in setup.py
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/python/setup.py b/python/setup.py
+index cef967241..47976b1be 100644
+--- a/python/setup.py
++++ b/python/setup.py
+@@ -115,6 +115,7 @@ def get_thirdparty_packages(triton_cache_path):
+         version_file_path = os.path.join(package_dir, "version.txt")
+         if p.syspath_var_name not in os.environ and\
+            (not os.path.exists(version_file_path) or Path(version_file_path).read_text() != p.url):
++            raise RuntimeError('Forbidden download')
+             try:
+                 shutil.rmtree(package_root_dir)
+             except Exception:
+@@ -139,6 +140,7 @@ def get_thirdparty_packages(triton_cache_path):
+ def download_and_copy(src_path, variable, version, url_func):
+     if variable in os.environ:
+         return
++    raise RuntimeError('Forbidden download')
+     base_dir = os.path.dirname(__file__)
+     arch = platform.machine()
+     if arch == "x86_64":
+@@ -280,7 +282,7 @@ class CMakeBuild(build_ext):
+             build_args += ["--", "/m"]
+         else:
+             cmake_args += ["-DCMAKE_BUILD_TYPE=" + cfg]
+-            max_jobs = os.getenv("MAX_JOBS", str(2 * os.cpu_count()))
++            max_jobs = os.getenv("MAX_JOBS", str(len(os.sched_getaffinity(0))))
+             build_args += ['-j' + max_jobs]
+ 
+         if check_env_flag("TRITON_BUILD_WITH_CLANG_LLD"):