Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ elseif(ARB_GPU STREQUAL "hip")
set(ARB_WITH_HIP_CLANG TRUE)
# Specify AMD architecture using a (user provided) list.
# Note: CMake native HIP architectures are introduced with version 3.21.
set(ARB_HIP_ARCHITECTURES gfx906 gfx900 CACHE STRING "AMD offload architectures (semicolon separated)")
set(ARB_HIP_ARCHITECTURES gfx90a CACHE STRING "AMD offload architectures (semicolon separated)")
endif()

if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG)
Expand Down Expand Up @@ -591,6 +591,12 @@ if(ARB_WITH_GPU)
set(clang_options_ -DARB_HIP -xhip ${HIP_ARCH_STR})
target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
target_compile_options(arborenv-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
# On Windows the clang driver does not auto-add the HIP runtime library when linking
# shared libraries; link it explicitly so HIP launch symbols resolve.
if(WIN32)
find_library(AMDHIP64_LIB amdhip64 REQUIRED HINTS "${CMAKE_PREFIX_PATH}/lib")
target_link_libraries(arbor-private-deps INTERFACE "${AMDHIP64_LIB}")
endif()
endif()
endif()

Expand Down
8 changes: 7 additions & 1 deletion arbor/backends/rand_impl.hpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
#pragma once

// This file is intended to be included in the source files directly in order
// to be compiled by either the host or the device compiler
// to be compiled by either the host or the device compiler

#include <type_traits>

// On HIP, sincosf/sincos are device-only; use the sin+cos fallback in Random123.
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIP__)
# ifndef R123_NO_SINCOS
# define R123_NO_SINCOS 1
# endif
#endif
#include <Random123/boxmuller.hpp>
#include <Random123/threefry.h>

Expand Down
9 changes: 7 additions & 2 deletions arbor/include/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ target_include_directories(arbor-public-headers INTERFACE

add_custom_command(
OUTPUT _always_rebuild
COMMAND true
COMMAND ${CMAKE_COMMAND} -E echo "" > ${CMAKE_CURRENT_BINARY_DIR}/_always_rebuild
)

set(arb_features)
Expand Down Expand Up @@ -59,10 +59,15 @@ endif()

string(TOUPPER "${CMAKE_BUILD_TYPE}" arb_config_str)

if(WIN32)
find_program(BASH_EXECUTABLE bash HINTS "C:/Program Files/Git/bin" REQUIRED)
else()
set(BASH_EXECUTABLE bash)
endif()
add_custom_command(
OUTPUT version.hpp-test
DEPENDS _always_rebuild
COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/git-source-id ${FULL_VERSION_STRING} ${ARB_ARCH} ${arb_config_str} ${arb_features} > version.hpp-test
COMMAND ${BASH_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/git-source-id ${FULL_VERSION_STRING} ${ARB_ARCH} ${arb_config_str} ${arb_features} > version.hpp-test
)

set(version_hpp_path arbor/version.hpp)
Expand Down
13 changes: 13 additions & 0 deletions arbor/include/arbor/gpu/cuda_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ inline api_error_type device_mem_get_info(ARGS &&... args) {
return cudaMemGetInfo(std::forward<ARGS>(args)...);
}

// Lane mask type: 32-bit on CUDA (warp size always 32).
using lane_mask_type = unsigned;

#ifdef __CUDACC__
/// Atomics

Expand Down Expand Up @@ -143,6 +146,16 @@ template<typename T>
__device__ __inline__ T shfl_down(unsigned mask, T var, unsigned lane_id, unsigned shift) {
return __shfl_down_sync(mask, var, shift);
}

// Count leading zeros / find first set on a lane mask. lane_mask_type is 32-bit
// on CUDA (warp size is always 32), so the 32-bit __clz/__ffs variants are used.
__device__ __inline__ unsigned count_leading_zeros(lane_mask_type mask) {
return __clz(mask);
}

__device__ __inline__ unsigned find_first_set(lane_mask_type mask) {
return __ffs(mask);
}
#endif

} // namespace gpu
Expand Down
16 changes: 11 additions & 5 deletions arbor/include/arbor/gpu/gpu_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,20 @@ namespace arb {
namespace gpu {

namespace impl {
// Number of threads per warp
// This has always been 32, however it may change in future NVIDIA gpus
// Number of threads per warp (wavefront on AMD).
// NVIDIA: always 32.
// AMD CDNA (gfx9xx): 64-wide wavefront.
// AMD RDNA (gfx10xx, gfx11xx): 32-wide wavefront.
HOST_DEVICE_IF_GPU
constexpr inline unsigned threads_per_warp() {
#ifdef ARB_HIP
return 64u;
#if defined(ARB_HIP)
#if defined(__GFX9__)
return 64u; // CDNA: gfx90a, gfx94x (wave64)
#else
return 32u; // RDNA: gfx10xx, gfx11xx (wave32)
#endif
#else
return 32u;
return 32u; // CUDA
#endif
}

Expand Down
47 changes: 36 additions & 11 deletions arbor/include/arbor/gpu/hip_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,35 +128,60 @@ shfl(T x, int lane) {

__device__ __inline__ double shfl(double x, int lane)
{
auto tmp = static_cast<uint64_t>(x);
// Use bit-preserving conversion to shuffle a double as two 32-bit halves.
auto tmp = __double_as_longlong(x);
auto lo = static_cast<unsigned>(tmp);
auto hi = static_cast<unsigned>(tmp >> 32);
hi = __shfl(static_cast<int>(hi), lane, warpSize);
lo = __shfl(static_cast<int>(lo), lane, warpSize);
return static_cast<double>(static_cast<uint64_t>(hi) << 32 |
static_cast<uint64_t>(lo));
return __longlong_as_double(static_cast<long long>(hi) << 32 |
static_cast<unsigned long long>(lo));
}

__device__ __inline__ unsigned ballot(unsigned mask, unsigned is_root) {
// Lane mask type: 64-bit on HIP (wave64 on CDNA, wave32 on RDNA).
// Always use 64-bit to support wave64 devices.
using lane_mask_type = unsigned long long;

// On AMD, ballot/activemask return 64-bit on wave64 devices.
// Use unsigned long long to match HIP's native __ballot return type.
__device__ __inline__ unsigned long long ballot(unsigned long long mask, unsigned is_root) {
return __ballot(is_root);
}

__device__ __inline__ unsigned active_mask() {
__device__ __inline__ unsigned long long active_mask() {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: this changes type depending on the backend HIP/CUDA. Need to test that there no issues arising.

return __activemask();
}

__device__ __inline__ unsigned any(unsigned mask, unsigned width) {
return __any(width);
__device__ __inline__ unsigned any(unsigned long long mask, unsigned width) {
// Emulate CUDA's __any_sync: only consider threads in the mask.
return (__ballot(width) & mask) != 0;
}

template<typename T>
__device__ __inline__ T shfl_up(unsigned mask, T var, unsigned lane_id, unsigned shift) {
return shfl(var, (int)lane_id - shift);
__device__ __inline__ T shfl_up(unsigned long long mask, T var, unsigned lane_id, unsigned shift) {
// Use HIP's native __shfl_up which handles boundary conditions:
// if (lane_id % width) < shift, returns var unchanged.
// The width is the wavefront size (warpSize) for the current arch.
return __shfl_up(var, shift, warpSize);
}

template<typename T>
__device__ __inline__ T shfl_down(unsigned mask, T var, unsigned lane_id, unsigned shift) {
return shfl(var, (int)lane_id + shift);
__device__ __inline__ T shfl_down(unsigned long long mask, T var, unsigned lane_id, unsigned shift) {
// Use HIP's native __shfl_down which handles boundary conditions:
// if (lane_id % width) + shift >= width, returns var unchanged.
// The width is the wavefront size (warpSize) for the current arch.
return __shfl_down(var, shift, warpSize);
}

// Count leading zeros / find first set on a lane mask. lane_mask_type is 64-bit
// on HIP (ballot returns a wave64-wide mask on CDNA, wave32 on RDNA), so the
// 64-bit __clzll/__ffsll variants are used to cover the full mask width.
__device__ __inline__ unsigned count_leading_zeros(lane_mask_type mask) {
return __clzll(mask);
}

__device__ __inline__ unsigned find_first_set(lane_mask_type mask) {
return __ffsll(mask);
}

} // namespace gpu
Expand Down
18 changes: 12 additions & 6 deletions arbor/include/arbor/gpu/reduce_by_key.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,40 +19,46 @@ namespace gpu {
// As currently implemented, this could be performed inline inside the reduce_by_key
// function, however it is in a separate data type as a first step towards reusing
// the same key set information for multipe reduction kernel calls.

struct key_set_pos {
unsigned width; // distance to one past the end of this run
unsigned lane_id; // id of this warp lane
unsigned key_mask; // warp mask of threads participating in reduction
lane_mask_type key_mask; // warp mask of threads participating in reduction
unsigned is_root; // if this lane is the first in the run

// The constructor takes the index of each thread and a mask of which threads
// in the warp are participating in the reduction, and the threads work cooperatively
// using warp intrinsics and bit twiddling, so that each thread will have unique
// information that describes its position in the key set.
__device__
key_set_pos(int idx, unsigned mask) {
key_set_pos(int idx, lane_mask_type mask) {
key_mask = mask;
lane_id = threadIdx.x%impl::threads_per_warp();
unsigned num_lanes = impl::threads_per_warp()-__clz(key_mask);
// count_leading_zeros counts from the top bit of lane_mask_type, so the
// lane count is measured against its full bit width: 64 on HIP (ballot is
// wave64-wide on CDNA, wave32 on RDNA) and 32 on CUDA. On a wave32 HIP
// mask the upper 32 bits are zero, so 64 - count_leading_zeros still
// yields the correct count.
unsigned num_lanes = 8*sizeof(lane_mask_type) - count_leading_zeros(key_mask);

// Determine if this thread is the root (i.e. first thread with this key).
int left_idx = shfl_up(key_mask, idx, lane_id, lane_id? 1: 0);

is_root = lane_id? left_idx!=idx: 1;

// Determine the range this thread contributes to.
unsigned roots = ballot(key_mask, is_root);
lane_mask_type roots = ballot(key_mask, is_root);

// Find the distance to the lane id one past the end of the run.
// Take care if this is the last run in the warp.
width = __ffs(roots>>(lane_id+1));
width = find_first_set(roots>>(lane_id+1));
if (!width) width = num_lanes-lane_id;
}
};

template <typename T, typename I>
__device__ __inline__
void reduce_by_key(T contribution, T* target, I i, unsigned mask) {
void reduce_by_key(T contribution, T* target, I i, lane_mask_type mask) {
key_set_pos run(i, mask);
unsigned shift = 1;
const unsigned width = run.width;
Expand Down
10 changes: 10 additions & 0 deletions arbor/include/arbor/serdes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,11 @@ ARB_ARBOR_API void serialize(::arb::serializer& ser, const K& k, const std::shar
serialize(ser, k, *p);
}

template<typename K>
ARB_ARBOR_API void serialize(::arb::serializer& ser, const K& k, long long v) {
ser.write(arb::to_serdes_key(k), v);
}

template<typename K>
ARB_ARBOR_API void serialize(::arb::serializer& ser, const K& k, long v) {
ser.write(arb::to_serdes_key(k), static_cast<long long>(v));
Expand Down Expand Up @@ -279,6 +284,11 @@ ARB_ARBOR_API void deserialize(::arb::serializer& ser, const K& k, std::shared_p
deserialize(ser, k, *p);
}

template<typename K>
ARB_ARBOR_API void deserialize(::arb::serializer& ser, const K& k, long long& v) {
ser.read(arb::to_serdes_key(k), v);
}

template<typename K>
ARB_ARBOR_API void deserialize(::arb::serializer& ser, const K& k, long& v) {
long long tmp;
Expand Down
10 changes: 10 additions & 0 deletions arbor/memory/allocator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

#include <limits>

#ifdef _WIN32
# include <malloc.h>
# define posix_memalign(p, a, s) ((*p = _aligned_malloc((s), (a))), ((*p) ? 0 : ENOMEM))
# include <errno.h>
#endif

#include "gpu_wrappers.hpp"
#include "definitions.hpp"
#include "util.hpp"
Expand Down Expand Up @@ -78,7 +84,11 @@ namespace impl {
}

void free_policy(void *ptr) {
#ifdef _WIN32
_aligned_free(ptr);
#else
free(ptr);
#endif
}

static constexpr size_type alignment() {
Expand Down
6 changes: 6 additions & 0 deletions arbor/network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@
#include <arbor/network.hpp>
#include <arbor/util/hash_def.hpp>

// On HIP, sincosf/sincos are device-only; use the sin+cos fallback in Random123.
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIP__)
# ifndef R123_NO_SINCOS
# define R123_NO_SINCOS 1
# endif
#endif
#include <Random123/threefry.h>
#include <Random123/boxmuller.hpp>
#include <Random123/uniform.hpp>
Expand Down
31 changes: 24 additions & 7 deletions arbor/util/dylib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,34 @@
#include <string>
#include <filesystem>

#include <dlfcn.h>

#include <arbor/arbexcept.hpp>

#include "util/dylib.hpp"
#include "util/strprintf.hpp"

#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

:D

# include <windows.h>
#else
# include <dlfcn.h>
#endif

namespace arb {
namespace util {

void* dl_open(const std::filesystem::path& fn) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am amazed that this works.

try {
std::ifstream fd{fn.c_str()};
if(!fd.good()) throw file_not_found_error{fn};
if(!fd.good()) throw file_not_found_error{fn.string()};
} catch(...) {
throw file_not_found_error{fn};
throw file_not_found_error{fn.string()};
}
#ifdef _WIN32
auto result = reinterpret_cast<void*>(LoadLibraryW(fn.c_str()));
if (nullptr == result) {
throw dl_error{util::pprintf("[WIN32] dl_open failed with error: {}", GetLastError())};
}
#else
// Call once to clear errors not caused by us
dlerror();
auto result = dlopen(fn.c_str(), RTLD_LAZY);
Expand All @@ -27,22 +38,28 @@ void* dl_open(const std::filesystem::path& fn) {
auto error = dlerror();
throw dl_error{util::pprintf("[POSIX] dl_open failed with: {}", error)};
}
#endif
return result;
}

namespace impl{
void* dl_get_symbol(const std::filesystem::path& fn, const std::string& symbol) {
auto handle = dl_open(fn);
#ifdef _WIN32
auto result = reinterpret_cast<void*>(GetProcAddress(reinterpret_cast<HMODULE>(handle), symbol.c_str()));
if (!result) {
throw dl_error{util::pprintf("[WIN32] dl_get_symbol failed with error: {}", GetLastError())};
}
#else
// Call once to clear errors not caused by us
dlerror();

auto handle = dl_open(fn);

// Get symbol from shared object, may return NULL if that is what symbol refers to
auto result = dlsym(handle, symbol.c_str());
// dlsym mayb return NULL even if succeeding
if (auto error = dlerror()) {
throw dl_error{util::pprintf("[POSIX] dl_get_symbol failed with: {}", error)};
}
#endif
return result;
}
} // namespace impl
Expand Down
Loading
Loading