arbor-sim · jeffdaily · Jun 8, 2026 · Jun 9, 2026 · thorstenhater · Jun 9, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -191,7 +191,7 @@ elseif(ARB_GPU STREQUAL "hip")
     set(ARB_WITH_HIP_CLANG TRUE)
     # Specify AMD architecture using a (user provided) list.
     # Note: CMake native HIP architectures are introduced with version 3.21.
-    set(ARB_HIP_ARCHITECTURES gfx906 gfx900 CACHE STRING "AMD offload architectures (semicolon separated)")
+    set(ARB_HIP_ARCHITECTURES gfx90a CACHE STRING "AMD offload architectures (semicolon separated)")
 endif()
 
 if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG)
@@ -591,6 +591,12 @@ if(ARB_WITH_GPU)
         set(clang_options_ -DARB_HIP -xhip ${HIP_ARCH_STR})
         target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
         target_compile_options(arborenv-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
+        # On Windows the clang driver does not auto-add the HIP runtime library when linking
+        # shared libraries; link it explicitly so HIP launch symbols resolve.
+        if(WIN32)
+            find_library(AMDHIP64_LIB amdhip64 REQUIRED HINTS "${CMAKE_PREFIX_PATH}/lib")
+            target_link_libraries(arbor-private-deps INTERFACE "${AMDHIP64_LIB}")
+        endif()
     endif()
 endif()
 

diff --git a/arbor/backends/rand_impl.hpp b/arbor/backends/rand_impl.hpp
@@ -1,10 +1,16 @@
 #pragma once
 
 // This file is intended to be included in the source files directly in order
-// to be compiled by either the host or the device compiler 
+// to be compiled by either the host or the device compiler
 
 #include <type_traits>
 
+// On HIP, sincosf/sincos are device-only; use the sin+cos fallback in Random123.
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIP__)
+#  ifndef R123_NO_SINCOS
+#    define R123_NO_SINCOS 1
+#  endif
+#endif
 #include <Random123/boxmuller.hpp>
 #include <Random123/threefry.h>
 

diff --git a/arbor/include/CMakeLists.txt b/arbor/include/CMakeLists.txt
@@ -22,7 +22,7 @@ target_include_directories(arbor-public-headers INTERFACE
 
 add_custom_command(
     OUTPUT _always_rebuild
-    COMMAND true
+    COMMAND ${CMAKE_COMMAND} -E echo "" > ${CMAKE_CURRENT_BINARY_DIR}/_always_rebuild
 )
 
 set(arb_features)
@@ -59,10 +59,15 @@ endif()
 
 string(TOUPPER "${CMAKE_BUILD_TYPE}" arb_config_str)
 
+if(WIN32)
+    find_program(BASH_EXECUTABLE bash HINTS "C:/Program Files/Git/bin" REQUIRED)
+else()
+    set(BASH_EXECUTABLE bash)
+endif()
 add_custom_command(
     OUTPUT version.hpp-test
     DEPENDS _always_rebuild
-    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/git-source-id ${FULL_VERSION_STRING} ${ARB_ARCH} ${arb_config_str} ${arb_features} > version.hpp-test
+    COMMAND ${BASH_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/git-source-id ${FULL_VERSION_STRING} ${ARB_ARCH} ${arb_config_str} ${arb_features} > version.hpp-test
 )
 
 set(version_hpp_path arbor/version.hpp)

diff --git a/arbor/include/arbor/gpu/cuda_api.hpp b/arbor/include/arbor/gpu/cuda_api.hpp
@@ -95,6 +95,9 @@ inline api_error_type device_mem_get_info(ARGS &&... args) {
     return cudaMemGetInfo(std::forward<ARGS>(args)...);
 }
 
+// Lane mask type: 32-bit on CUDA (warp size always 32).
+using lane_mask_type = unsigned;
+
 #ifdef __CUDACC__
 /// Atomics
 
@@ -143,6 +146,16 @@ template<typename T>
 __device__ __inline__ T shfl_down(unsigned mask, T var, unsigned lane_id, unsigned shift) {
     return __shfl_down_sync(mask, var, shift);
 }
+
+// Count leading zeros / find first set on a lane mask. lane_mask_type is 32-bit
+// on CUDA (warp size is always 32), so the 32-bit __clz/__ffs variants are used.
+__device__ __inline__ unsigned count_leading_zeros(lane_mask_type mask) {
+    return __clz(mask);
+}
+
+__device__ __inline__ unsigned find_first_set(lane_mask_type mask) {
+    return __ffs(mask);
+}
 #endif
 
 } // namespace gpu

diff --git a/arbor/include/arbor/gpu/gpu_common.hpp b/arbor/include/arbor/gpu/gpu_common.hpp
@@ -12,14 +12,20 @@ namespace arb {
 namespace gpu {
 
 namespace impl {
-// Number of threads per warp
-// This has always been 32, however it may change in future NVIDIA gpus
+// Number of threads per warp (wavefront on AMD).
+// NVIDIA: always 32.
+// AMD CDNA (gfx9xx): 64-wide wavefront.
+// AMD RDNA (gfx10xx, gfx11xx): 32-wide wavefront.
 HOST_DEVICE_IF_GPU
 constexpr inline unsigned threads_per_warp() {
-#ifdef ARB_HIP
-    return 64u;
+#if defined(ARB_HIP)
+  #if defined(__GFX9__)
+    return 64u;   // CDNA: gfx90a, gfx94x (wave64)
+  #else
+    return 32u;   // RDNA: gfx10xx, gfx11xx (wave32)
+  #endif
 #else
-    return 32u;
+    return 32u;   // CUDA
 #endif
 }
 

diff --git a/arbor/include/arbor/gpu/hip_api.hpp b/arbor/include/arbor/gpu/hip_api.hpp
@@ -128,35 +128,60 @@ shfl(T x, int lane) {
 
 __device__ __inline__ double shfl(double x, int lane)
 {
-    auto tmp = static_cast<uint64_t>(x);
+    // Use bit-preserving conversion to shuffle a double as two 32-bit halves.
+    auto tmp = __double_as_longlong(x);
     auto lo = static_cast<unsigned>(tmp);
     auto hi = static_cast<unsigned>(tmp >> 32);
     hi = __shfl(static_cast<int>(hi), lane, warpSize);
     lo = __shfl(static_cast<int>(lo), lane, warpSize);
-    return static_cast<double>(static_cast<uint64_t>(hi) << 32 |
-                               static_cast<uint64_t>(lo));
+    return __longlong_as_double(static_cast<long long>(hi) << 32 |
+                                static_cast<unsigned long long>(lo));
 }
 
-__device__ __inline__ unsigned ballot(unsigned mask, unsigned is_root) {
+// Lane mask type: 64-bit on HIP (wave64 on CDNA, wave32 on RDNA).
+// Always use 64-bit to support wave64 devices.
+using lane_mask_type = unsigned long long;
+
+// On AMD, ballot/activemask return 64-bit on wave64 devices.
+// Use unsigned long long to match HIP's native __ballot return type.
+__device__ __inline__ unsigned long long ballot(unsigned long long mask, unsigned is_root) {
     return __ballot(is_root);
 }
 
-__device__ __inline__ unsigned active_mask() {
+__device__ __inline__ unsigned long long active_mask() {
     return __activemask();
 }
 
-__device__ __inline__ unsigned any(unsigned mask, unsigned width) {
-    return __any(width);
+__device__ __inline__ unsigned any(unsigned long long mask, unsigned width) {
+    // Emulate CUDA's __any_sync: only consider threads in the mask.
+    return (__ballot(width) & mask) != 0;
 }
 
 template<typename T>
-__device__ __inline__ T shfl_up(unsigned mask, T var, unsigned lane_id, unsigned shift) {
-    return shfl(var, (int)lane_id - shift);
+__device__ __inline__ T shfl_up(unsigned long long mask, T var, unsigned lane_id, unsigned shift) {
+    // Use HIP's native __shfl_up which handles boundary conditions:
+    // if (lane_id % width) < shift, returns var unchanged.
+    // The width is the wavefront size (warpSize) for the current arch.
+    return __shfl_up(var, shift, warpSize);
 }
 
 template<typename T>
-__device__ __inline__ T shfl_down(unsigned mask, T var, unsigned lane_id, unsigned shift) {
-    return shfl(var, (int)lane_id + shift);
+__device__ __inline__ T shfl_down(unsigned long long mask, T var, unsigned lane_id, unsigned shift) {
+    // Use HIP's native __shfl_down which handles boundary conditions:
+    // if (lane_id % width) + shift >= width, returns var unchanged.
+    // The width is the wavefront size (warpSize) for the current arch.
+    return __shfl_down(var, shift, warpSize);
+}
+
+// Count leading zeros / find first set on a lane mask. lane_mask_type is 64-bit
+// on HIP (ballot returns a wave64-wide mask on CDNA, wave32 on RDNA), so the
+// 64-bit __clzll/__ffsll variants are used to cover the full mask width.
+__device__ __inline__ unsigned count_leading_zeros(lane_mask_type mask) {
+    return __clzll(mask);
+}
+
+__device__ __inline__ unsigned find_first_set(lane_mask_type mask) {
+    return __ffsll(mask);
 }
 
 } // namespace gpu

diff --git a/arbor/include/arbor/gpu/reduce_by_key.hpp b/arbor/include/arbor/gpu/reduce_by_key.hpp
@@ -19,40 +19,46 @@ namespace gpu {
 // As currently implemented, this could be performed inline inside the reduce_by_key
 // function, however it is in a separate data type as a first step towards reusing
 // the same key set information for multipe reduction kernel calls.
+
 struct key_set_pos {
     unsigned width;         // distance to one past the end of this run
     unsigned lane_id;       // id of this warp lane
-    unsigned key_mask;      // warp mask of threads participating in reduction
+    lane_mask_type key_mask; // warp mask of threads participating in reduction
     unsigned is_root;       // if this lane is the first in the run
 
     // The constructor takes the index of each thread and a mask of which threads
     // in the warp are participating in the reduction, and the threads work cooperatively
     // using warp intrinsics and bit twiddling, so that each thread will have unique
     // information that describes its position in the key set.
     __device__
-    key_set_pos(int idx, unsigned mask) {
+    key_set_pos(int idx, lane_mask_type mask) {
         key_mask = mask;
         lane_id = threadIdx.x%impl::threads_per_warp();
-        unsigned num_lanes = impl::threads_per_warp()-__clz(key_mask);
+        // count_leading_zeros counts from the top bit of lane_mask_type, so the
+        // lane count is measured against its full bit width: 64 on HIP (ballot is
+        // wave64-wide on CDNA, wave32 on RDNA) and 32 on CUDA. On a wave32 HIP
+        // mask the upper 32 bits are zero, so 64 - count_leading_zeros still
+        // yields the correct count.
+        unsigned num_lanes = 8*sizeof(lane_mask_type) - count_leading_zeros(key_mask);
 
         // Determine if this thread is the root (i.e. first thread with this key).
         int left_idx  = shfl_up(key_mask, idx, lane_id, lane_id? 1: 0);
 
         is_root = lane_id? left_idx!=idx: 1;
 
         // Determine the range this thread contributes to.
-        unsigned roots = ballot(key_mask, is_root);
+        lane_mask_type roots = ballot(key_mask, is_root);
 
         // Find the distance to the lane id one past the end of the run.
         // Take care if this is the last run in the warp.
-        width = __ffs(roots>>(lane_id+1));
+        width = find_first_set(roots>>(lane_id+1));
         if (!width) width = num_lanes-lane_id;
     }
 };
 
 template <typename T, typename I>
 __device__ __inline__
-void reduce_by_key(T contribution, T* target, I i, unsigned mask) {
+void reduce_by_key(T contribution, T* target, I i, lane_mask_type mask) {
     key_set_pos run(i, mask);
     unsigned shift = 1;
     const unsigned width = run.width;

diff --git a/arbor/include/arbor/serdes.hpp b/arbor/include/arbor/serdes.hpp
@@ -179,6 +179,11 @@ ARB_ARBOR_API void serialize(::arb::serializer& ser, const K& k, const std::shar
     serialize(ser, k, *p);
 }
 
+template<typename K>
+ARB_ARBOR_API void serialize(::arb::serializer& ser, const K& k, long long v) {
+    ser.write(arb::to_serdes_key(k), v);
+}
+
 template<typename K>
 ARB_ARBOR_API void serialize(::arb::serializer& ser, const K& k, long v) {
     ser.write(arb::to_serdes_key(k), static_cast<long long>(v));
@@ -279,6 +284,11 @@ ARB_ARBOR_API void deserialize(::arb::serializer& ser, const K& k, std::shared_p
     deserialize(ser, k, *p);
 }
 
+template<typename K>
+ARB_ARBOR_API void deserialize(::arb::serializer& ser, const K& k, long long& v) {
+    ser.read(arb::to_serdes_key(k), v);
+}
+
 template<typename K>
 ARB_ARBOR_API void deserialize(::arb::serializer& ser, const K& k, long& v) {
     long long tmp;

diff --git a/arbor/memory/allocator.hpp b/arbor/memory/allocator.hpp
@@ -2,6 +2,12 @@
 
 #include <limits>
 
+#ifdef _WIN32
+#  include <malloc.h>
+#  define posix_memalign(p, a, s) ((*p = _aligned_malloc((s), (a))), ((*p) ? 0 : ENOMEM))
+#  include <errno.h>
+#endif
+
 #include "gpu_wrappers.hpp"
 #include "definitions.hpp"
 #include "util.hpp"
@@ -78,7 +84,11 @@ namespace impl {
         }
 
         void free_policy(void *ptr) {
+#ifdef _WIN32
+            _aligned_free(ptr);
+#else
             free(ptr);
+#endif
         }
 
         static constexpr size_type alignment() {

diff --git a/arbor/network.cpp b/arbor/network.cpp
@@ -2,6 +2,12 @@
 #include <arbor/network.hpp>
 #include <arbor/util/hash_def.hpp>
 
+// On HIP, sincosf/sincos are device-only; use the sin+cos fallback in Random123.
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIP__)
+#  ifndef R123_NO_SINCOS
+#    define R123_NO_SINCOS 1
+#  endif
+#endif
 #include <Random123/threefry.h>
 #include <Random123/boxmuller.hpp>
 #include <Random123/uniform.hpp>

diff --git a/arbor/util/dylib.cpp b/arbor/util/dylib.cpp
@@ -2,23 +2,34 @@
 #include <string>
 #include <filesystem>
 
-#include <dlfcn.h>
-
 #include <arbor/arbexcept.hpp>
 
 #include "util/dylib.hpp"
 #include "util/strprintf.hpp"
 
+#ifdef _WIN32
+#  define WIN32_LEAN_AND_MEAN
+#  include <windows.h>
+#else
+#  include <dlfcn.h>
+#endif
+
 namespace arb {
 namespace util {
 
 void* dl_open(const std::filesystem::path& fn) {
     try {
         std::ifstream fd{fn.c_str()};
-        if(!fd.good()) throw file_not_found_error{fn};
+        if(!fd.good()) throw file_not_found_error{fn.string()};
     } catch(...) {
-        throw file_not_found_error{fn};
+        throw file_not_found_error{fn.string()};
     }
+#ifdef _WIN32
+    auto result = reinterpret_cast<void*>(LoadLibraryW(fn.c_str()));
+    if (nullptr == result) {
+        throw dl_error{util::pprintf("[WIN32] dl_open failed with error: {}", GetLastError())};
+    }
+#else
     // Call once to clear errors not caused by us
     dlerror();
     auto result = dlopen(fn.c_str(), RTLD_LAZY);
@@ -27,22 +38,28 @@ void* dl_open(const std::filesystem::path& fn) {
         auto error = dlerror();
         throw dl_error{util::pprintf("[POSIX] dl_open failed with: {}", error)};
     }
+#endif
     return result;
 }
 
 namespace impl{
 void* dl_get_symbol(const std::filesystem::path& fn, const std::string& symbol) {
+    auto handle = dl_open(fn);
+#ifdef _WIN32
+    auto result = reinterpret_cast<void*>(GetProcAddress(reinterpret_cast<HMODULE>(handle), symbol.c_str()));
+    if (!result) {
+        throw dl_error{util::pprintf("[WIN32] dl_get_symbol failed with error: {}", GetLastError())};
+    }
+#else
     // Call once to clear errors not caused by us
     dlerror();
-
-    auto handle = dl_open(fn);
-
     // Get symbol from shared object, may return NULL if that is what symbol refers to
     auto result = dlsym(handle, symbol.c_str());
     // dlsym mayb return NULL even if succeeding
     if (auto error = dlerror()) {
         throw dl_error{util::pprintf("[POSIX] dl_get_symbol failed with: {}", error)};
     }
+#endif
     return result;
 }
 } // namespace impl