diff --git a/CMakeLists.txt b/CMakeLists.txt index 9165fa202..0461505d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -449,6 +449,46 @@ if (NOT HAVE_LARGE_FILES) add_definitions("-D_FILE_OFFSET_BITS=64") endif() +# librspreload: probe libc for fcntl64/sendfile64; CMake sets RDMA_PRELOAD_WRAP_LFS64 when +# preload should implement the fcntl64/sendfile64 wrapper (see RDMA_PRELOAD_WRAP_LFS64 below). +set(CMAKE_REQUIRED_QUIET 1) +set(SAFE_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}") +set(CMAKE_REQUIRED_DEFINITIONS "-D_GNU_SOURCE") +CHECK_C_SOURCE_COMPILES(" +#include +#include +int main(void) { + (void)&fcntl64; + (void)&sendfile64; + return 0; +} +" RDMA_PRELOAD_LIBC_HAS_FCNTL64_SENDFILE64) +set(CMAKE_REQUIRED_DEFINITIONS "${SAFE_CMAKE_REQUIRED_DEFINITIONS}") +set(CMAKE_REQUIRED_QUIET 0) +if(RDMA_PRELOAD_LIBC_HAS_FCNTL64_SENDFILE64) + set(RDMA_PRELOAD_HAVE_LFS_WRAPPER_SYMS 1) +else() + set(RDMA_PRELOAD_HAVE_LFS_WRAPPER_SYMS 0) +endif() + +# Single gate for preload.c fcntl64/sendfile64 wrappers: need libc symbols and must +# not compile preload with _FILE_OFFSET_BITS=64 (CMake adds that when HAVE_LARGE_FILES is false). +if(RDMA_PRELOAD_HAVE_LFS_WRAPPER_SYMS AND HAVE_LARGE_FILES) + set(RDMA_PRELOAD_WRAP_LFS64 1) +else() + set(RDMA_PRELOAD_WRAP_LFS64 0) +endif() + +# If the first test found fcntl64/sendfile64 with _GNU_SOURCE, the headers +# must declare them -- no separate check needed. +if(RDMA_PRELOAD_HAVE_LFS_WRAPPER_SYMS) + set(RDMA_PRELOAD_FCNTL64_IN_HEADER 1) + set(RDMA_PRELOAD_SENDFILE64_IN_HEADER 1) +else() + set(RDMA_PRELOAD_FCNTL64_IN_HEADER 0) + set(RDMA_PRELOAD_SENDFILE64_IN_HEADER 0) +endif() + # Provide a shim if C11 stdatomic.h is not supported. if (NOT HAVE_SPARSE) CHECK_INCLUDE_FILE("stdatomic.h" HAVE_STDATOMIC) diff --git a/buildlib/rdma_functions.cmake b/buildlib/rdma_functions.cmake index ef77d95d0..6e0c7d2e8 100644 --- a/buildlib/rdma_functions.cmake +++ b/buildlib/rdma_functions.cmake @@ -120,6 +120,19 @@ function(rdma_library DEST VERSION_SCRIPT SOVERSION VERSION) install(TARGETS ${DEST} DESTINATION "${CMAKE_INSTALL_LIBDIR}") endfunction() +# rsocket LD_PRELOAD module. Requires RDMA_PRELOAD_* variables from top-level +# CMakeLists.txt (RDMA_PRELOAD_WRAP_LFS64 and fcntl64/sendfile64 header probe). +function(rdma_rspreload_module DEST VERSION_SCRIPT) + add_library(${DEST} MODULE ${ARGN}) + target_compile_definitions(${DEST} PRIVATE + RDMA_PRELOAD_WRAP_LFS64=${RDMA_PRELOAD_WRAP_LFS64} + RDMA_PRELOAD_FCNTL64_IN_HEADER=${RDMA_PRELOAD_FCNTL64_IN_HEADER} + RDMA_PRELOAD_SENDFILE64_IN_HEADER=${RDMA_PRELOAD_SENDFILE64_IN_HEADER}) + set_target_properties(${DEST} PROPERTIES LINK_FLAGS ${CMAKE_SHARED_LINKER_FLAGS}) + set_target_properties(${DEST} PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${BUILD_LIB}") + rdma_set_library_map(${DEST} ${VERSION_SCRIPT}) +endfunction() + # Create a special provider with exported symbols in it The shared provider # exists as a normal system library with the normal shared library SONAME and # other convections. The system library is symlinked into the diff --git a/infiniband-diags/dump_fts.c b/infiniband-diags/dump_fts.c index c7f2df167..2e74d8b78 100644 --- a/infiniband-diags/dump_fts.c +++ b/infiniband-diags/dump_fts.c @@ -296,7 +296,7 @@ static void dump_unicast_tables(ibnd_node_t *node, int startl, int endl, ibnd_fabric_t *fabric) { ib_portid_t * portid = &node->path_portid; - char lft[IB_SMP_DATA_SIZE] = { 0 }; + uint8_t lft[IB_SMP_DATA_SIZE] = { 0 }; char str[200]; uint64_t nodeguid; int block, i, e, top; diff --git a/infiniband-diags/ibstat.c b/infiniband-diags/ibstat.c index 5e918aa57..6ce15fd11 100644 --- a/infiniband-diags/ibstat.c +++ b/infiniband-diags/ibstat.c @@ -43,12 +43,17 @@ #include #include #include +#include #include /* __be64 */ #include #include +/* IB spec: Port supports Connection Manager (cap_mask bit 16) */ +#define IB_PORT_CAP_CM (0x10000) +#define SYS_PORT_GID "gids/0" + static const char * const node_type_str[] = { "???", "CA", @@ -193,13 +198,55 @@ static int port_dump(umad_port_t * port, int alone) return 0; } +static bool port_has_gid(const char *ca_name, int portnum) +{ + char path[256]; + FILE *f; + + snprintf(path, sizeof(path), + SYS_INFINIBAND "/%s/ports/%d/" SYS_PORT_GID, ca_name, portnum); + + f = fopen(path, "r"); + if (!f) + return false; + + fclose(f); + return true; +} + +static int port_has_cm_cap(const char *ca_name, int portnum) +{ + char path[256], buf[32]; + uint32_t capmask; + + snprintf(path, sizeof(path), + SYS_INFINIBAND "/%s/ports/%d", ca_name, portnum); + + if (sys_read_string(path, SYS_PORT_CAPMASK, buf, sizeof(buf)) < 0) + return 0; + + capmask = strtoul(buf, NULL, 0); + return (capmask & IB_PORT_CAP_CM); +} + static int ca_stat(const char *ca_name, int portnum, int no_ports) { umad_ca_t ca; - int r; - - if ((r = umad_get_ca(ca_name, &ca)) < 0) + int r, check_port; + + r = umad_get_ca(ca_name, &ca); + if (r < 0) { + if (portnum == -1) + check_port = 1; + else + check_port = portnum; + + if ((!port_has_gid(ca_name, check_port)) && (!port_has_cm_cap(ca_name, check_port))) { + DEBUG("The device %s is not RDMA endpoint device", ca_name); + return 0; + } return r; + } if (!ca.node_type) return 0; diff --git a/kernel-headers/rdma/bnxt_re-abi.h b/kernel-headers/rdma/bnxt_re-abi.h index f24edf1c7..40955eaba 100644 --- a/kernel-headers/rdma/bnxt_re-abi.h +++ b/kernel-headers/rdma/bnxt_re-abi.h @@ -102,12 +102,17 @@ struct bnxt_re_pd_resp { struct bnxt_re_cq_req { __aligned_u64 cq_va; __aligned_u64 cq_handle; + __aligned_u64 comp_mask; }; -enum bnxt_re_cq_mask { +enum bnxt_re_resp_cq_mask { BNXT_RE_CQ_TOGGLE_PAGE_SUPPORT = 0x1, }; +enum bnxt_re_req_cq_mask { + BNXT_RE_CQ_FIXED_NUM_CQE_ENABLE = 0x1, +}; + struct bnxt_re_cq_resp { __u32 cqid; __u32 tail; @@ -163,6 +168,8 @@ enum bnxt_re_objects { BNXT_RE_OBJECT_ALLOC_PAGE = (1U << UVERBS_ID_NS_SHIFT), BNXT_RE_OBJECT_NOTIFY_DRV, BNXT_RE_OBJECT_GET_TOGGLE_MEM, + BNXT_RE_OBJECT_DBR, + BNXT_RE_OBJECT_DEFAULT_DBR, }; enum bnxt_re_alloc_page_type { @@ -231,4 +238,31 @@ struct bnxt_re_packet_pacing_caps { struct bnxt_re_query_device_ex_resp { struct bnxt_re_packet_pacing_caps packet_pacing_caps; }; + +struct bnxt_re_db_region { + __u32 dpi; + __u32 reserved; + __aligned_u64 umdbr; +}; + +enum bnxt_re_obj_dbr_alloc_attrs { + BNXT_RE_ALLOC_DBR_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + BNXT_RE_ALLOC_DBR_ATTR, + BNXT_RE_ALLOC_DBR_OFFSET, +}; + +enum bnxt_re_obj_dbr_free_attrs { + BNXT_RE_FREE_DBR_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum bnxt_re_obj_default_dbr_attrs { + BNXT_RE_DEFAULT_DBR_ATTR = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum bnxt_re_obj_dpi_methods { + BNXT_RE_METHOD_DBR_ALLOC = (1U << UVERBS_ID_NS_SHIFT), + BNXT_RE_METHOD_DBR_FREE, + BNXT_RE_METHOD_GET_DEFAULT_DBR, +}; + #endif /* __BNXT_RE_UVERBS_ABI_H__*/ diff --git a/kernel-headers/rdma/efa-abi.h b/kernel-headers/rdma/efa-abi.h index 13225b038..d5c18f8de 100644 --- a/kernel-headers/rdma/efa-abi.h +++ b/kernel-headers/rdma/efa-abi.h @@ -22,12 +22,12 @@ */ enum { - EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH = 1 << 0, - EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR = 1 << 1, + EFA_ALLOC_UCONTEXT_CMD_SUPP_CAPS_TX_BATCH = 1 << 0, + EFA_ALLOC_UCONTEXT_CMD_SUPP_CAPS_MIN_SQ_WR = 1 << 1, }; struct efa_ibv_alloc_ucontext_cmd { - __u32 comp_mask; + __u32 supported_caps; __u8 reserved_20[4]; }; diff --git a/kernel-headers/rdma/ib_user_ioctl_verbs.h b/kernel-headers/rdma/ib_user_ioctl_verbs.h index 89e6a3f13..90c5cd8e7 100644 --- a/kernel-headers/rdma/ib_user_ioctl_verbs.h +++ b/kernel-headers/rdma/ib_user_ioctl_verbs.h @@ -46,6 +46,7 @@ enum ib_uverbs_core_support { IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS = 1 << 0, + IB_UVERBS_CORE_SUPPORT_ROBUST_UDATA = 1 << 1, }; enum ib_uverbs_access_flags { diff --git a/librdmacm/CMakeLists.txt b/librdmacm/CMakeLists.txt index ea1d1550f..cba925a02 100644 --- a/librdmacm/CMakeLists.txt +++ b/librdmacm/CMakeLists.txt @@ -27,14 +27,10 @@ target_link_libraries(rdmacm LINK_PRIVATE # The preload library is a bit special, it needs to be open coded # Since it is a LD_PRELOAD it has no soname, and is installed in sub dir -add_library(rspreload MODULE +rdma_rspreload_module(rspreload librspreload.map preload.c indexer.c ) -# Even though this is a module we still want to use Wl,--no-undefined -set_target_properties(rspreload PROPERTIES LINK_FLAGS ${CMAKE_SHARED_LINKER_FLAGS}) -set_target_properties(rspreload PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${BUILD_LIB}") -rdma_set_library_map(rspreload librspreload.map) target_link_libraries(rspreload LINK_PRIVATE rdmacm ${CMAKE_THREAD_LIBS_INIT} diff --git a/librdmacm/librspreload.map b/librdmacm/librspreload.map index 67ecf33b8..f453db69e 100644 --- a/librdmacm/librspreload.map +++ b/librdmacm/librspreload.map @@ -4,11 +4,14 @@ the signature this will go sideways.. */ global: accept; + accept4; bind; close; connect; + dup; dup2; fcntl; + fcntl64; getpeername; getsockname; getsockopt; @@ -22,6 +25,7 @@ select; send; sendfile; + sendfile64; sendmsg; sendto; setsockopt; diff --git a/librdmacm/preload.c b/librdmacm/preload.c index b3175dd5d..c89c9b7d7 100644 --- a/librdmacm/preload.c +++ b/librdmacm/preload.c @@ -59,11 +59,21 @@ #include "cma.h" #include "indexer.h" +#if RDMA_PRELOAD_WRAP_LFS64 +#if !RDMA_PRELOAD_FCNTL64_IN_HEADER +int fcntl64(int socket, int cmd, ... /* arg */); +#endif +#if !RDMA_PRELOAD_SENDFILE64_IN_HEADER +ssize_t sendfile64(int out_fd, int in_fd, off64_t *offset64, size_t count); +#endif +#endif + struct socket_calls { int (*socket)(int domain, int type, int protocol); int (*bind)(int socket, const struct sockaddr *addr, socklen_t addrlen); int (*listen)(int socket, int backlog); int (*accept)(int socket, struct sockaddr *addr, socklen_t *addrlen); + int (*accept4)(int socket, struct sockaddr *addr, socklen_t *addrlen, int flags); int (*connect)(int socket, const struct sockaddr *addr, socklen_t addrlen); ssize_t (*recv)(int socket, void *buf, size_t len, int flags); ssize_t (*recvfrom)(int socket, void *buf, size_t len, int flags, @@ -87,8 +97,15 @@ struct socket_calls { int (*getsockopt)(int socket, int level, int optname, void *optval, socklen_t *optlen); int (*fcntl)(int socket, int cmd, ... /* arg */); +#if RDMA_PRELOAD_WRAP_LFS64 + int (*fcntl64)(int socket, int cmd, ... /* arg */); +#endif + int (*dup)(int oldfd); int (*dup2)(int oldfd, int newfd); ssize_t (*sendfile)(int out_fd, int in_fd, off_t *offset, size_t count); +#if RDMA_PRELOAD_WRAP_LFS64 + ssize_t (*sendfile64)(int out_fd, int in_fd, off64_t *offset64, size_t count); +#endif int (*fxstat)(int ver, int fd, struct stat *buf); }; @@ -389,6 +406,7 @@ static void init_preload(void) real.bind = dlsym(RTLD_NEXT, "bind"); real.listen = dlsym(RTLD_NEXT, "listen"); real.accept = dlsym(RTLD_NEXT, "accept"); + real.accept4 = dlsym(RTLD_NEXT, "accept4"); real.connect = dlsym(RTLD_NEXT, "connect"); real.recv = dlsym(RTLD_NEXT, "recv"); real.recvfrom = dlsym(RTLD_NEXT, "recvfrom"); @@ -408,8 +426,15 @@ static void init_preload(void) real.setsockopt = dlsym(RTLD_NEXT, "setsockopt"); real.getsockopt = dlsym(RTLD_NEXT, "getsockopt"); real.fcntl = dlsym(RTLD_NEXT, "fcntl"); +#if RDMA_PRELOAD_WRAP_LFS64 + real.fcntl64 = dlsym(RTLD_NEXT, "fcntl64"); +#endif + real.dup = dlsym(RTLD_NEXT, "dup"); real.dup2 = dlsym(RTLD_NEXT, "dup2"); real.sendfile = dlsym(RTLD_NEXT, "sendfile"); +#if RDMA_PRELOAD_WRAP_LFS64 + real.sendfile64 = dlsym(RTLD_NEXT, "sendfile64"); +#endif real.fxstat = dlsym(RTLD_NEXT, "__fxstat"); rs.socket = dlsym(RTLD_DEFAULT, "rsocket"); @@ -620,6 +645,38 @@ int accept(int socket, struct sockaddr *addr, socklen_t *addrlen) } } +int accept4(int socket, struct sockaddr *addr, socklen_t *addrlen, int flags) +{ + int cur_flags = 0; + int fd; + + fd = accept(socket, addr, addrlen); + if (fd < 0) + return fd; + if (flags & SOCK_NONBLOCK) { + cur_flags = fcntl(fd, F_GETFL); + + if (cur_flags != -1) + cur_flags = fcntl(fd, F_SETFL, flags | O_NONBLOCK); + + if (cur_flags == -1) + goto close; + } + + if (flags & SOCK_CLOEXEC) { + cur_flags = fcntl(fd, F_GETFD); + if (cur_flags != -1) + cur_flags = fcntl(fd, F_SETFD, flags | FD_CLOEXEC); + if (cur_flags == -1) + goto close; + } + return fd; +close: + close(fd); + return -1; +} + + /* * We can't fork RDMA connections and pass them from the parent to the child * process. Instead, we need to establish the RDMA connection after calling @@ -889,20 +946,30 @@ static struct pollfd *fds_alloc(nfds_t nfds) return rfds; } +static int *fds_r_alloc(nfds_t nfds) +{ + static __thread int *rfds_r; + static __thread nfds_t rnfds; + + if (nfds > rnfds) { + if (rfds_r) + free(rfds_r); + + rfds_r = malloc(sizeof(*rfds_r) * nfds); + rnfds = rfds_r ? nfds : 0; + } + + return rfds_r; +} + int poll(struct pollfd *fds, nfds_t nfds, int timeout) { struct pollfd *rfds; int i, ret; + int has_rsocket = 0; init_preload(); - for (i = 0; i < nfds; i++) { - if (fd_gett(fds[i].fd) == fd_rsocket) - goto use_rpoll; - } - return real.poll(fds, nfds, timeout); - -use_rpoll: rfds = fds_alloc(nfds); if (!rfds) return ERR(ENOMEM); @@ -911,9 +978,15 @@ int poll(struct pollfd *fds, nfds_t nfds, int timeout) rfds[i].fd = fd_getd(fds[i].fd); rfds[i].events = fds[i].events; rfds[i].revents = 0; + + if (fd_gett(fds[i].fd) == fd_rsocket) + has_rsocket = 1; } - ret = rpoll(rfds, nfds, timeout); + if (!has_rsocket) + ret = real.poll(rfds, nfds, timeout); + else + ret = rpoll(rfds, nfds, timeout); for (i = 0; i < nfds; i++) fds[i].revents = rfds[i].revents; @@ -922,7 +995,7 @@ int poll(struct pollfd *fds, nfds_t nfds, int timeout) } static void select_to_rpoll(struct pollfd *fds, int *nfds, - fd_set *readfds, fd_set *writefds, fd_set *exceptfds) + fd_set *readfds, fd_set *writefds, fd_set *exceptfds, int *user_fds) { int fd, events, i = 0; @@ -933,7 +1006,8 @@ static void select_to_rpoll(struct pollfd *fds, int *nfds, if (events || (exceptfds && FD_ISSET(fd, exceptfds))) { fds[i].fd = fd_getd(fd); - fds[i++].events = events; + fds[i].events = events; + user_fds[i++] = fd; } } @@ -941,30 +1015,27 @@ static void select_to_rpoll(struct pollfd *fds, int *nfds, } static int rpoll_to_select(struct pollfd *fds, int nfds, - fd_set *readfds, fd_set *writefds, fd_set *exceptfds) + fd_set *readfds, fd_set *writefds, fd_set *exceptfds, int *user_fds) { - int fd, rfd, i, cnt = 0; + int i, cnt = 0; - for (i = 0, fd = 0; i < nfds; fd++) { - rfd = fd_getd(fd); - if (rfd != fds[i].fd) - continue; + for (i = 0; i < nfds; i++) { + assert(fds[i].fd == fd_getd(user_fds[i])); if (readfds && (fds[i].revents & POLLIN)) { - FD_SET(fd, readfds); + FD_SET(user_fds[i], readfds); cnt++; } if (writefds && (fds[i].revents & POLLOUT)) { - FD_SET(fd, writefds); + FD_SET(user_fds[i], writefds); cnt++; } if (exceptfds && (fds[i].revents & ~(POLLIN | POLLOUT))) { - FD_SET(fd, exceptfds); + FD_SET(user_fds[i], exceptfds); cnt++; } - i++; } return cnt; @@ -980,12 +1051,17 @@ int select(int nfds, fd_set *readfds, fd_set *writefds, { struct pollfd *fds; int ret; + int *user_fds; fds = fds_alloc(nfds); if (!fds) return ERR(ENOMEM); - select_to_rpoll(fds, &nfds, readfds, writefds, exceptfds); + user_fds = fds_r_alloc(nfds); + if (!user_fds) + return ERR(ENOMEM); + + select_to_rpoll(fds, &nfds, readfds, writefds, exceptfds, user_fds); ret = rpoll(fds, nfds, rs_convert_timeout(timeout)); if (readfds) @@ -996,7 +1072,7 @@ int select(int nfds, fd_set *readfds, fd_set *writefds, FD_ZERO(exceptfds); if (ret > 0) - ret = rpoll_to_select(fds, nfds, readfds, writefds, exceptfds); + ret = rpoll_to_select(fds, nfds, readfds, writefds, exceptfds, user_fds); return ret; } @@ -1109,6 +1185,59 @@ int fcntl(int socket, int cmd, ... /* arg */) return ret; } +#if RDMA_PRELOAD_WRAP_LFS64 +int fcntl64(int socket, int cmd, ... /* arg */) +{ + va_list args; + long lparam; + void *pparam; + int fd, ret; + + init_preload(); + va_start(args, cmd); + switch (cmd) { + case F_GETFD: + case F_GETFL: + case F_GETOWN: + case F_GETSIG: + case F_GETLEASE: + ret = (fd_get(socket, &fd) == fd_rsocket) ? + rfcntl(fd, cmd) : real.fcntl64(fd, cmd); + break; + case F_DUPFD: + /*case F_DUPFD_CLOEXEC:*/ + case F_SETFD: + case F_SETFL: + case F_SETOWN: + case F_SETSIG: + case F_SETLEASE: + case F_NOTIFY: + lparam = va_arg(args, long); + ret = (fd_get(socket, &fd) == fd_rsocket) ? + rfcntl(fd, cmd, lparam) : real.fcntl64(fd, cmd, lparam); + break; + default: + pparam = va_arg(args, void *); + ret = (fd_get(socket, &fd) == fd_rsocket) ? + rfcntl(fd, cmd, pparam) : real.fcntl64(fd, cmd, pparam); + break; + } + va_end(args); + return ret; +} +#endif + +int dup(int oldfd) +{ + int new_fd; + + new_fd = fcntl(oldfd, F_DUPFD, 0); + if (new_fd < 0) + return new_fd; + + return dup2(oldfd, new_fd); +} + /* * dup2 is not thread safe */ @@ -1181,6 +1310,28 @@ ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count) return ret; } +#if RDMA_PRELOAD_WRAP_LFS64 +ssize_t sendfile64(int out_fd, int in_fd, off64_t *offset64, size_t count) +{ + void *file_addr; + int fd; + size_t ret; + + if (fd_get(out_fd, &fd) != fd_rsocket) + return real.sendfile64(fd, in_fd, offset64, count); + + file_addr = mmap(NULL, count, PROT_READ, 0, in_fd, offset64 ? *offset64 : 0); + if (file_addr == (void *) -1) + return -1; + + ret = rwrite(fd, file_addr, count); + if ((ret > 0) && offset64) + lseek(in_fd, ret, SEEK_CUR); + munmap(file_addr, count); + return ret; +} +#endif + int __fxstat(int ver, int socket, struct stat *buf) { int fd, ret; diff --git a/librdmacm/rsocket.c b/librdmacm/rsocket.c index 005bd0be8..808d12773 100644 --- a/librdmacm/rsocket.c +++ b/librdmacm/rsocket.c @@ -135,7 +135,7 @@ static uint16_t def_rqsize = 384; static uint32_t def_mem = (1 << 17); static uint32_t def_wmem = (1 << 17); static uint32_t polling_time = 10; -static int wake_up_interval = 5000; +static int wake_up_interval = 500; /* * Immediate data format is determined by the upper bits @@ -317,7 +317,7 @@ struct ds_qp { }; struct rsocket { - int type; + int type; /* SOCK_STREAM or SOCK_DGRAM only; flags in fd_flags/fs_flags */ int index; fastlock_t slock; fastlock_t rlock; @@ -377,6 +377,8 @@ struct rsocket { int opts; int fd_flags; + int fs_flags; + int ipv4_opts; uint64_t so_opts; uint64_t ipv6_opts; void *optval; @@ -652,6 +654,7 @@ static struct rsocket *rs_alloc(struct rsocket *inherited_rs, int type) return NULL; rs->type = type; + rs->index = -1; if (type == SOCK_DGRAM) { rs->udp_sock = -1; @@ -846,7 +849,7 @@ static int rs_create_cq(struct rsocket *rs, struct rdma_cm_id *cm_id) if (!cm_id->recv_cq) goto err1; - if (rs->fd_flags & O_NONBLOCK) { + if (rs->fs_flags & O_NONBLOCK) { if (set_fd_nonblock(cm_id->recv_cq_channel->fd, true)) goto err2; } @@ -1197,19 +1200,23 @@ int rsocket(int domain, int type, int protocol) { struct rsocket *rs; int index, ret; + int socket_type = type & ~(SOCK_CLOEXEC | SOCK_NONBLOCK); if ((domain != AF_INET && domain != AF_INET6 && domain != AF_IB) || - ((type != SOCK_STREAM) && (type != SOCK_DGRAM)) || - (type == SOCK_STREAM && protocol && protocol != IPPROTO_TCP) || - (type == SOCK_DGRAM && protocol && protocol != IPPROTO_UDP)) + (socket_type != SOCK_STREAM && socket_type != SOCK_DGRAM) || + ((socket_type == SOCK_STREAM) && protocol && protocol != IPPROTO_TCP) || + ((socket_type == SOCK_DGRAM) && protocol && protocol != IPPROTO_UDP)) return ERR(ENOTSUP); rs_configure(); - rs = rs_alloc(NULL, type); + rs = rs_alloc(NULL, socket_type); if (!rs) return ERR(ENOMEM); - if (type == SOCK_STREAM) { + rs->fd_flags = (type & SOCK_CLOEXEC) ? FD_CLOEXEC : 0; + rs->fs_flags = (type & SOCK_NONBLOCK) ? O_NONBLOCK : 0; + + if (socket_type == SOCK_STREAM) { ret = rdma_create_id(NULL, &rs->cm_id, rs, RDMA_PS_TCP); if (ret) goto err; @@ -1220,7 +1227,6 @@ int rsocket(int domain, int type, int protocol) ret = ds_init(rs, domain); if (ret) goto err; - index = rs->udp_sock; } @@ -1278,7 +1284,7 @@ int rlisten(int socket, int backlog) if (ret) return ret; - if (rs->fd_flags & O_NONBLOCK) { + if (rs->fs_flags & O_NONBLOCK) { ret = set_fd_nonblock(rs->accept_queue[0], true); if (ret) return ret; @@ -1313,6 +1319,8 @@ static void rs_accept(struct rsocket *rs) if (!new_rs) goto err; new_rs->cm_id = cm_id; + new_rs->fd_flags = rs->fd_flags; + new_rs->fs_flags = rs->fs_flags; ret = rs_insert(new_rs, new_rs->cm_id->channel->fd); if (ret < 0) @@ -1470,7 +1478,7 @@ static int rs_do_connect(struct rsocket *rs) rs->state = rs_connect_rdwr; break; case rs_accepting: - if (!(rs->fd_flags & O_NONBLOCK)) + if (!(rs->fs_flags & O_NONBLOCK)) set_fd_nonblock(rs->cm_id->channel->fd, true); ret = ucma_complete(rs->cm_id); @@ -1725,9 +1733,11 @@ int rconnect(int socket, const struct sockaddr *addr, socklen_t addrlen) if (rs->type == SOCK_STREAM) { memcpy(&rs->cm_id->route.addr.dst_addr, addr, addrlen); ret = rs_do_connect(rs); - if (ret == -1 && errno == EINPROGRESS) { + if (ret == 0 || (ret == -1 && errno == EINPROGRESS)) { save_errno = errno; - /* The app can still drive the CM state on failure */ + /* Add rsocket to internal thread that drives CM progress + * so the app can drive state and respond to disconnect requests. + */ rs_notify_svc(&connect_svc, rs, RS_SVC_ADD_CM); errno = save_errno; } @@ -2345,7 +2355,7 @@ static int ds_get_comp(struct rsocket *rs, int nonblock, int (*test)(struct rsoc static int rs_nonblocking(struct rsocket *rs, int flags) { - return (rs->fd_flags & O_NONBLOCK) || (flags & MSG_DONTWAIT); + return (rs->fs_flags & O_NONBLOCK) || (flags & MSG_DONTWAIT); } static int rs_is_cq_armed(struct rsocket *rs) @@ -3367,8 +3377,10 @@ int rpoll(struct pollfd *fds, nfds_t nfds, int timeout) if (timeout >= 0) { timeout -= (int) ((rs_time_us() - start_time) / 1000); - if (timeout <= 0) + if (timeout <= 0) { + rs_poll_exit(); return 0; + } pollsleep = min(timeout, wake_up_interval); } else { pollsleep = wake_up_interval; @@ -3492,7 +3504,7 @@ int rshutdown(int socket, int how) if (rs->opts & RS_OPT_KEEPALIVE) rs_notify_svc(&tcp_svc, rs, RS_SVC_REM_KEEPALIVE); - if (rs->fd_flags & O_NONBLOCK) + if (rs->fs_flags & O_NONBLOCK) rs_set_nonblocking(rs, 0); if (rs->state & rs_connected) { @@ -3525,8 +3537,8 @@ int rshutdown(int socket, int how) rs_process_cq(rs, 0, rs_conn_all_sends_done); out: - if ((rs->fd_flags & O_NONBLOCK) && (rs->state & rs_connected)) - rs_set_nonblocking(rs, rs->fd_flags); + if ((rs->fs_flags & O_NONBLOCK) && (rs->state & rs_connected)) + rs_set_nonblocking(rs, rs->fs_flags); if (rs->state & rs_disconnected) { /* Generate event by flushing receives to unblock rpoll */ @@ -3542,14 +3554,14 @@ static void ds_shutdown(struct rsocket *rs) if (rs->opts & RS_OPT_UDP_SVC) rs_notify_svc(&udp_svc, rs, RS_SVC_REM_DGRAM); - if (rs->fd_flags & O_NONBLOCK) + if (rs->fs_flags & O_NONBLOCK) rs_set_nonblocking(rs, 0); rs->state &= ~(rs_readable | rs_writable); ds_process_cqs(rs, 0, ds_all_sends_done); - if (rs->fd_flags & O_NONBLOCK) - rs_set_nonblocking(rs, rs->fd_flags); + if (rs->fs_flags & O_NONBLOCK) + rs_set_nonblocking(rs, rs->fs_flags); } int rclose(int socket) @@ -3658,7 +3670,7 @@ int rsetsockopt(int socket, int level, int optname, rs = idm_lookup(&idm, socket); if (!rs) return ERR(EBADF); - if (rs->type == SOCK_DGRAM && level != SOL_RDMA) { + if ((rs->type == SOCK_DGRAM) && level != SOL_RDMA) { ret = setsockopt(rs->udp_sock, level, optname, optval, optlen); if (ret) return ret; @@ -3681,8 +3693,8 @@ int rsetsockopt(int socket, int level, int optname, opt_on = *(int *) optval; break; case SO_RCVBUF: - if ((rs->type == SOCK_STREAM && !rs->rbuf) || - (rs->type == SOCK_DGRAM && !rs->qp_list)) + if (((rs->type == SOCK_STREAM) && !rs->rbuf) || + ((rs->type == SOCK_DGRAM) && !rs->qp_list)) rs->rbuf_size = (*(uint32_t *) optval) << 1; ret = 0; break; @@ -3710,6 +3722,18 @@ int rsetsockopt(int socket, int level, int optname, break; } break; + case IPPROTO_IP: + switch (optname) { + case IP_TOS: + rs->ipv4_opts = *(int *)optval; + ret = rdma_set_option(rs->cm_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_TOS, + (void *) optval, optlen); + break; + default: + break; + } + break; case IPPROTO_TCP: opts = &rs->tcp_opts; switch (optname) { @@ -3731,6 +3755,7 @@ int rsetsockopt(int socket, int level, int optname, ret = 0; break; case TCP_MAXSEG: + case TCP_CONGESTION: ret = 0; break; default: @@ -3834,6 +3859,7 @@ int rgetsockopt(int socket, int level, int optname, struct rsocket *rs; void *opt; struct ibv_sa_path_rec *path_rec; + struct tcp_info *info; struct ibv_path_data path_data; socklen_t len; int ret = 0; @@ -3871,6 +3897,21 @@ int rgetsockopt(int socket, int level, int optname, *optlen = sizeof(int); rs->err = 0; break; + case SO_BROADCAST: + ret = 0; + break; + default: + ret = ENOTSUP; + break; + } + break; + case IPPROTO_IP: + switch (optname) { + case IP_TOS: + *((int *) optval) = rs->ipv4_opts; + *optlen = sizeof(int); + break; + default: ret = ENOTSUP; break; @@ -3878,6 +3919,7 @@ int rgetsockopt(int socket, int level, int optname, break; case IPPROTO_TCP: switch (optname) { + case TCP_CONGESTION: case TCP_KEEPCNT: case TCP_KEEPINTVL: *((int *) optval) = 1; /* N/A */ @@ -3896,6 +3938,17 @@ int rgetsockopt(int socket, int level, int optname, 2048; *optlen = sizeof(int); break; + case TCP_INFO: + //TODO: support other tcp_info fields. + info = (struct tcp_info *) optval; + memset(info, 0, sizeof(struct tcp_info)); + info->tcpi_state = (rs->state == rs_connected) ? + TCP_ESTABLISHED : TCP_CLOSE; + info->tcpi_snd_cwnd = rs->sq_size; + + *optlen = sizeof(struct tcp_info); + break; + default: ret = ENOTSUP; break; @@ -3986,15 +4039,22 @@ int rfcntl(int socket, int cmd, ... /* arg */ ) va_start(args, cmd); switch (cmd) { case F_GETFL: - ret = rs->fd_flags; + ret = rs->fs_flags; break; case F_SETFL: param = va_arg(args, int); - if ((rs->fd_flags & O_NONBLOCK) != (param & O_NONBLOCK)) + if ((rs->fs_flags & O_NONBLOCK) != (param & O_NONBLOCK)) ret = rs_set_nonblocking(rs, param & O_NONBLOCK); if (!ret) - rs->fd_flags = param; + rs->fs_flags = param; + break; + case F_GETFD: + ret = rs->fd_flags; + break; + case F_SETFD: + param = va_arg(args, int); + rs->fd_flags = param; break; default: ret = ERR(ENOTSUP); diff --git a/providers/efa/efa.c b/providers/efa/efa.c index 94a4126ba..ac684b8d9 100644 --- a/providers/efa/efa.c +++ b/providers/efa/efa.c @@ -64,8 +64,8 @@ static struct verbs_context *efa_alloc_context(struct ibv_device *vdev, struct efa_alloc_ucontext cmd = {}; struct efa_context *ctx; - cmd.comp_mask |= EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH; - cmd.comp_mask |= EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR; + cmd.supported_caps |= EFA_ALLOC_UCONTEXT_CMD_SUPP_CAPS_TX_BATCH; + cmd.supported_caps |= EFA_ALLOC_UCONTEXT_CMD_SUPP_CAPS_MIN_SQ_WR; ctx = verbs_init_and_alloc_context(vdev, cmd_fd, ctx, ibvctx, RDMA_DRIVER_EFA); diff --git a/pyverbs/device.pyx b/pyverbs/device.pyx index 054f8b4d4..137eb0c48 100644 --- a/pyverbs/device.pyx +++ b/pyverbs/device.pyx @@ -266,6 +266,18 @@ cdef class Context(PyverbsCM): format(p=port_num), rc) return port_attrs + def query_port_speed(self, unsigned int port_num): + """ + Query port speed in 100 Mb/s granularity. + :param port_num: Port number to query + :return: Port speed + """ + cdef uint64_t port_speed + rc = v.ibv_query_port_speed(self.context, port_num, &port_speed) + if rc != 0: + raise PyverbsRDMAError(f'Failed to query port speed for port {port_num}', rc) + return port_speed + def query_gid_table(self, size_t max_entries, uint32_t flags=0): """ Queries the GID tables of the device for at most entries @@ -854,6 +866,16 @@ cdef class DM(PyverbsCM): free(data) return res + def export_dmabuf_fd(self): + """ + Export a dmabuf FD for this DM object. + :return: A file descriptor (int) for the dmabuf FD + """ + fd = v.ibv_dm_export_dmabuf_fd(self.dm) + if fd < 0: + raise PyverbsRDMAErrno('Failed to export dmabuf FD for DM') + return fd + @property def handle(self): return self.dm.handle @@ -1266,7 +1288,8 @@ def translate_event_type(event_type): e.IBV_EVENT_QP_LAST_WQE_REACHED: '.IBV_EVENT_QP_LAST_WQE_REACHED', e.IBV_EVENT_CLIENT_REREGISTER: 'IBV_EVENT_CLIENT_REREGISTER', e.IBV_EVENT_GID_CHANGE: 'IBV_EVENT_GID_CHANGE', - e.IBV_EVENT_WQ_FATAL: 'IBV_EVENT_WQ_FATAL' + e.IBV_EVENT_WQ_FATAL: 'IBV_EVENT_WQ_FATAL', + e.IBV_EVENT_DEVICE_SPEED_CHANGE: 'IBV_EVENT_DEVICE_SPEED_CHANGE' } try: return types[event_type] diff --git a/pyverbs/libibverbs.pxd b/pyverbs/libibverbs.pxd index 18889f561..96b4b098d 100644 --- a/pyverbs/libibverbs.pxd +++ b/pyverbs/libibverbs.pxd @@ -703,8 +703,11 @@ cdef extern from 'infiniband/verbs.h': size_t length) int ibv_memcpy_from_dm(void *host_addr, ibv_dm *dm, unsigned long dm_offset, size_t length) + int ibv_dm_export_dmabuf_fd(ibv_dm *dm) int ibv_query_port(ibv_context *context, uint8_t port_num, ibv_port_attr *port_attr) + int ibv_query_port_speed(ibv_context *context, uint8_t port_num, + uint64_t *port_speed) ibv_comp_channel *ibv_create_comp_channel(ibv_context *context) int ibv_destroy_comp_channel(ibv_comp_channel *channel) int ibv_get_cq_event(ibv_comp_channel *channel, ibv_cq **cq, diff --git a/pyverbs/libibverbs_enums.pxd b/pyverbs/libibverbs_enums.pxd index aae55d1c3..e7caf854b 100644 --- a/pyverbs/libibverbs_enums.pxd +++ b/pyverbs/libibverbs_enums.pxd @@ -104,6 +104,7 @@ cdef extern from '': IBV_EVENT_CLIENT_REREGISTER IBV_EVENT_GID_CHANGE IBV_EVENT_WQ_FATAL + IBV_EVENT_DEVICE_SPEED_CHANGE cpdef enum ibv_access_flags: IBV_ACCESS_LOCAL_WRITE diff --git a/pyverbs/providers/mlx5/libmlx5.pxd b/pyverbs/providers/mlx5/libmlx5.pxd index ac415de6a..0faa439ed 100644 --- a/pyverbs/providers/mlx5/libmlx5.pxd +++ b/pyverbs/providers/mlx5/libmlx5.pxd @@ -568,6 +568,7 @@ cdef extern from 'infiniband/mlx5dv.h': # DevX APIs mlx5dv_devx_uar *mlx5dv_devx_alloc_uar(v.ibv_context *context, uint32_t flags) void mlx5dv_devx_free_uar(mlx5dv_devx_uar *devx_uar) + int mlx5dv_devx_uar_export_dmabuf_fd(mlx5dv_devx_uar *devx_uar) int mlx5dv_devx_general_cmd(v.ibv_context *context, const void *in_, size_t inlen, void *out, size_t outlen) mlx5dv_devx_umem *mlx5dv_devx_umem_reg(v.ibv_context *ctx, void *addr, diff --git a/pyverbs/providers/mlx5/mlx5dv.pyx b/pyverbs/providers/mlx5/mlx5dv.pyx index a4635827b..f14118821 100644 --- a/pyverbs/providers/mlx5/mlx5dv.pyx +++ b/pyverbs/providers/mlx5/mlx5dv.pyx @@ -1455,6 +1455,16 @@ cdef class Mlx5UAR(PyverbsObject): def uar(self): return self.uar + def export_dmabuf_fd(self): + """ + Export a dmabuf FD for this UAR object. + :return: A file descriptor (int) for the dmabuf FD + """ + fd = dv.mlx5dv_devx_uar_export_dmabuf_fd(self.uar) + if fd < 0: + raise PyverbsRDMAErrno('Failed to export dmabuf FD for UAR') + return fd + cdef class Mlx5DmOpAddr(PyverbsCM): def __init__(self, DM dm not None, op=0): diff --git a/tests/test_device.py b/tests/test_device.py index 411a49529..a56fa8614 100644 --- a/tests/test_device.py +++ b/tests/test_device.py @@ -284,6 +284,22 @@ def test_query_port_bad_flow(self): 'Successfully queried non-existing port {p}'. \ format(p=port)) + def test_query_port_speed(self): + """ + Test ibv_query_port_speed + """ + for dev in self.get_device_list(): + with d.Context(name=dev.name.decode()) as ctx: + try: + port_speed = ctx.query_port_speed(self.ib_port) + except PyverbsRDMAError as ex: + if ex.error_code in [errno.EOPNOTSUPP, errno.EPROTONOSUPPORT]: + raise unittest.SkipTest('ibv_query_port_speed is not' + ' supported on this device') + raise ex + self.assertGreaterEqual(port_speed, 0, 'Port speed should be positive, got' + f' {port_speed} Mbps') + class DMTest(PyverbsAPITestCase): """ diff --git a/tests/test_mlx5_uar.py b/tests/test_mlx5_uar.py index 44a626727..d6f3ce87d 100644 --- a/tests/test_mlx5_uar.py +++ b/tests/test_mlx5_uar.py @@ -36,3 +36,17 @@ def test_alloc_uar(self): finally: for uar in self.uar_res.uars: uar.close() + + def test_uar_export_dmabuf_fd(self): + """Test exporting a UAR as a dmabuf FD""" + try: + self.uar_res.uars.append(Mlx5UAR(self.uar_res.ctx, _MLX5DV_UAR_ALLOC_TYPE_NC)) + dmabuf_fd = self.uar_res.uars[-1].export_dmabuf_fd() + self.assertGreater(dmabuf_fd, 0, 'Expected a valid dmabuf FD greater than 0') + except PyverbsRDMAError as ex: + if ex.error_code in [errno.EPROTONOSUPPORT, errno.EOPNOTSUPP]: + raise unittest.SkipTest('UAR dmabuf export is not supported') + raise ex + finally: + for uar in self.uar_res.uars: + uar.close() diff --git a/tests/test_mr.py b/tests/test_mr.py index fdde078e9..ef75f4b24 100644 --- a/tests/test_mr.py +++ b/tests/test_mr.py @@ -484,6 +484,18 @@ def test_dm_bad_registration(self): with self.assertRaisesRegex(PyverbsRDMAError, 'Failed to register a device MR'): DMMR(PD(self.ctx), dm_size + 4, dm_access, dm, 0) + def test_dm_export_dmabuf_fd(self): + """Test exporting a DM as a dmabuf FD""" + dm_size = 100 + try: + with d.DM(self.ctx, d.AllocDmAttr(length=dm_size)) as dm: + dmabuf_fd = dm.export_dmabuf_fd() + self.assertGreater(dmabuf_fd, 0, 'Expected a valid dmabuf FD greater than 0') + except PyverbsRDMAError as ex: + if ex.error_code in [errno.EOPNOTSUPP, errno.EPROTONOSUPPORT]: + raise unittest.SkipTest('DM dmabuf export is not supported') + raise ex + def check_dmabuf_support(gpu=0): """