Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/mpi/errhan/errnames.txt
Original file line number Diff line number Diff line change
Expand Up @@ -956,6 +956,7 @@ is too big (> MPIU_SHMW_GHND_SZ)
**xpmem_release: xpmem_release failed
**xpmem_remove: xpmem_remove failed
**xpmem_segtree_init: xpmem_segtree_init failed
**xpmem_segtree_finalize: xpmem_segtree_finalize failed

## GPU related error messages
**gpu_query_ptr: gpu_query_pointer_attr failed
Expand Down
6 changes: 4 additions & 2 deletions src/mpid/ch4/shm/ipc/src/ipc_p2p.h
Original file line number Diff line number Diff line change
Expand Up @@ -240,12 +240,14 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_IPCI_handle_lmt_recv(MPIDI_IPC_hdr * ipc_hdr,
{
void *src_buf = NULL;
/* map */
mpi_errno = MPIDI_XPMEM_ipc_handle_map(ipc_hdr->ipc_handle.xpmem, &src_buf);
mpi_errno = MPIDI_XPMEM_ipc_handle_map(&ipc_hdr->ipc_handle.xpmem, &src_buf);
MPIR_ERR_CHECK(mpi_errno);
/* copy */
mpi_errno = MPIDI_IPCI_copy_data(ipc_hdr, rreq, src_buf, src_data_sz);
MPIR_ERR_CHECK(mpi_errno);
/* skip unmap */
/* unmap */
mpi_errno = MPIDI_XPMEM_ipc_handle_unmap(&ipc_hdr->ipc_handle.xpmem);
MPIR_ERR_CHECK(mpi_errno);
}
break;
#endif
Expand Down
2 changes: 1 addition & 1 deletion src/mpid/ch4/shm/ipc/src/ipc_win.c
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ int MPIDI_IPC_mpi_win_create_hook(MPIR_Win * win)
#ifdef MPIDI_CH4_SHM_ENABLE_XPMEM
case MPIDI_IPCI_TYPE__XPMEM:
mpi_errno =
MPIDI_XPMEM_ipc_handle_map(ipc_shared_table[i].ipc_handle.xpmem,
MPIDI_XPMEM_ipc_handle_map(&ipc_shared_table[i].ipc_handle.xpmem,
&shared_table[i].shm_base_addr);
MPIR_ERR_CHECK(mpi_errno);
shared_table[i].mapped_type = 2;
Expand Down
6 changes: 2 additions & 4 deletions src/mpid/ch4/shm/ipc/xpmem/Makefile.mk
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,9 @@ noinst_HEADERS += src/mpid/ch4/shm/ipc/xpmem/xpmem_pre.h \
src/mpid/ch4/shm/ipc/xpmem/xpmem_post.h

if BUILD_SHM_IPC_XPMEM
noinst_HEADERS += src/mpid/ch4/shm/ipc/xpmem/xpmem_seg.h \
src/mpid/ch4/shm/ipc/xpmem/xpmem_types.h
noinst_HEADERS += src/mpid/ch4/shm/ipc/xpmem/xpmem_types.h

mpi_core_sources += src/mpid/ch4/shm/ipc/xpmem/globals.c \
src/mpid/ch4/shm/ipc/xpmem/xpmem_init.c \
src/mpid/ch4/shm/ipc/xpmem/xpmem_mem.c \
src/mpid/ch4/shm/ipc/xpmem/xpmem_seg.c
src/mpid/ch4/shm/ipc/xpmem/xpmem_mem.c
endif
29 changes: 22 additions & 7 deletions src/mpid/ch4/shm/ipc/xpmem/xpmem_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,24 @@
#include "mpidimpl.h"
#include "xpmem_post.h"
#include "mpidu_init_shm.h"
#include "xpmem_seg.h"

static int xpmem_initialized = 0;
/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
- name : MPIR_CVAR_CH4_XPMEM_ENABLE
category : CH4
type : int
default : 1
class : none
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
To manually disable XPMEM set to 0. The environment variable is valid only when the XPMEM
submodule is enabled.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

int MPIDI_XPMEM_init_local(void)
{
Expand Down Expand Up @@ -75,7 +90,7 @@ int MPIDI_XPMEM_init_world(void)
}
MPIDU_Init_shm_barrier();

xpmem_initialized = 1;
MPIDI_XPMEMI_global.initialized = true;

fn_exit:
MPIR_FUNC_EXIT;
Expand All @@ -89,7 +104,7 @@ int MPIDI_XPMEM_init_world(void)
* kernel module to be loaded at runtime. If XPMEM is not available, disable its use via the
* special CVAR value. */
XPMEM_TRACE("init: xpmem_make failed. Disabling XPMEM support");
MPIR_CVAR_CH4_XPMEM_ENABLE = 0;
MPIDI_XPMEMI_global.initialized = false;

MPIR_CHKPMEM_REAP();
goto fn_exit;
Expand All @@ -101,15 +116,15 @@ int MPIDI_XPMEM_mpi_finalize_hook(void)
int i, ret = 0;
MPIR_FUNC_ENTER;

if (MPIDI_XPMEMI_global.segid == -1 || !xpmem_initialized) {
if (MPIDI_XPMEMI_global.segid == -1 || !MPIDI_XPMEMI_global.initialized) {
/* if XPMEM was disabled at runtime, return */
goto fn_exit;
}

for (i = 0; i < MPIR_Process.local_size; i++) {
/* should be called before xpmem_release
* MPIDI_XPMEMI_segtree_delete_all will call xpmem_detach */
MPL_gavl_tree_destroy(MPIDI_XPMEMI_global.segmaps[i].segcache_ubuf);
MPIDI_XPMEMI_segtree_finalize(MPIDI_XPMEMI_global.segmaps[i].segcache_ubuf);
if (MPIDI_XPMEMI_global.segmaps[i].apid != -1) {
XPMEM_TRACE("finalize: release apid: node_rank %d, 0x%lx\n",
i, (uint64_t) MPIDI_XPMEMI_global.segmaps[i].apid);
Expand All @@ -126,7 +141,7 @@ int MPIDI_XPMEM_mpi_finalize_hook(void)
/* success(0) or failure(-1) */
MPIR_ERR_CHKANDJUMP(ret == -1, mpi_errno, MPI_ERR_OTHER, "**xpmem_remove");

xpmem_initialized = 0;
MPIDI_XPMEMI_global.initialized = false;

fn_exit:
MPIR_FUNC_EXIT;
Expand Down
197 changes: 187 additions & 10 deletions src/mpid/ch4/shm/ipc/xpmem/xpmem_mem.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,206 @@
* Copyright (C) by Argonne National Laboratory
* See COPYRIGHT in top-level directory
*/
#include "xpmem_seg.h"

#include "mpidimpl.h"
#include "xpmem_post.h"

int MPIDI_XPMEM_ipc_handle_map(MPIDI_XPMEM_ipc_handle_t handle, void **vaddr)
/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
- name : MPIR_CVAR_CH4_XPMEM_SEG_CACHE_ENABLE
category : CH4
type : boolean
default : true
class : none
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
Enable mapped segment cache on receiver side to avoid mapping overhead
per operation.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

static MPIDI_XPMEMI_seg_t *seg_search(MPL_gavl_tree_t segcache, void *addr, uintptr_t size);
static void seg_insert(MPL_gavl_tree_t segcache, uintptr_t seg_low, uintptr_t seg_size,
void *att_vaddr);
static void seg_free(void *seg);

/* Maps the region described by the handle into the local process memory.
* Checks for and reuses a cached segment if available; otherwise attaches
* the region and inserts a new entry into the cache.
* It internally rounds down the low address and rounds up the size to
* ensure the cached segment is page aligned. The cache tree is selected
* internally: the user buffer tree (segcache_ubuf) of the handle's source
* local rank is used, as distinct from other cache trees (e.g. an XPMEM
* cooperative counter tree used to cache counter objects).
*
* Input parameters:
* - handle: handle for the region to be mapped
* Output parameters:
* - vaddr: corresponding start address of the remote buffer in the local
* virtual address space. */
int MPIDI_XPMEM_ipc_handle_map(MPIDI_XPMEM_ipc_handle_t * handle, void **vaddr)
{
/* NOTE(review): this listing appears to interleave pre-change statements
* (by-value `handle.` accesses, the MPIDI_XPMEMI_seg_regist call, a second
* MPIR_FUNC_ENTER) with their pointer-based replacements -- confirm which
* lines are live before building. */
int mpi_errno = MPI_SUCCESS;
MPIR_FUNC_ENTER;

/* map the true data range, assuming no data outside true_lb/true_ub */
void *addr = MPIR_get_contig_ptr(handle.addr, handle.true_lb);
void *addr = MPIR_get_contig_ptr(handle->addr, handle->true_lb);
void *addr_out;
mpi_errno =
MPIDI_XPMEMI_seg_regist(handle.src_lrank, handle.range, addr, &addr_out,
MPIDI_XPMEMI_global.segmaps[handle.src_lrank].segcache_ubuf);
int node_rank = handle->src_lrank;
uintptr_t size = handle->range;
void *remote_vaddr = MPIR_get_contig_ptr(handle->addr, handle->true_lb);
MPIDI_XPMEMI_segmap_t *segmap = &MPIDI_XPMEMI_global.segmaps[node_rank];
MPL_gavl_tree_t segcache = segmap->segcache_ubuf;
MPIDI_XPMEMI_seg_t *seg = NULL;
uintptr_t seg_low;
uintptr_t seg_size;
void *att_vaddr;

MPIR_FUNC_ENTER;

/* Get apid if it is the first time registered on the local process. */
if (segmap->apid == -1) {
segmap->apid = xpmem_get(segmap->remote_segid, XPMEM_RDWR, XPMEM_PERMIT_MODE,
MPIDI_XPMEMI_PERMIT_VALUE);
/* 64-bit access permit ID or failure(-1) */
MPIR_ERR_CHKANDJUMP(segmap->apid == -1, mpi_errno, MPI_ERR_OTHER, "**xpmem_get");
XPMEM_TRACE("seg: register apid 0x%lx for node_rank %d, segid 0x%lx\n",
(uint64_t) segmap->apid, node_rank, (uint64_t) segmap->remote_segid);
}

/* Search a cached segment or create a new one. Both low and size must be page aligned. */
seg_low = MPL_ROUND_DOWN_ALIGN((uint64_t) remote_vaddr,
(uint64_t) MPIDI_XPMEMI_global.sys_page_sz);
seg_size =
MPL_ROUND_UP_ALIGN(size + ((uintptr_t) remote_vaddr - seg_low),
MPIDI_XPMEMI_global.sys_page_sz);

/* seg_search() returns NULL when MPIR_CVAR_CH4_XPMEM_SEG_CACHE_ENABLE is
* off, so in that mode every call takes the attach branch below. */
seg = seg_search(segcache, remote_vaddr, size);
if (seg == NULL) {
struct xpmem_addr xpmem_addr;
xpmem_addr.apid = segmap->apid;
xpmem_addr.offset = seg_low;
att_vaddr = xpmem_attach(xpmem_addr, seg_size, NULL);
MPIR_ERR_CHKANDJUMP2(att_vaddr == (void *) -1, mpi_errno, MPI_ERR_OTHER, "**xpmem_attach",
"**xpmem_attach %p %d", remote_vaddr, (int) size);
/* seg_insert() does nothing when the segment cache is disabled. */
seg_insert(segcache, seg_low, seg_size, att_vaddr);
} else {
/* Cache hit: reuse the stored attachment and make seg_low refer to the
* cached segment's aligned remote base, so the offset computed below is
* relative to that attachment. */
seg_low = seg->remote_align_addr;
att_vaddr = (void *) seg->att_vaddr;
}
/* Record the attach base in the handle; MPIDI_XPMEM_ipc_handle_unmap
* passes this value to xpmem_detach when the cache is disabled. */
handle->att_vaddr = att_vaddr;

if (handle.is_contig) {
/* return mapped vaddr without round down */
/* NOTE(review): the expression below adds an integer offset to a void *
* (att_vaddr); arithmetic on void * is a compiler extension rather than
* ISO C -- consider casting att_vaddr to char * or uintptr_t. */
addr_out = (void *) ((uintptr_t) remote_vaddr - seg_low + att_vaddr);
XPMEM_TRACE("seg: mappped segment for node_rank %d, apid 0x%lx, "
"size 0x%lx->0x%lx, seg->low %p->0x%lx, attached_vaddr %p, vaddr %p\n",
node_rank, (uint64_t) segmap->apid, size, seg_size,
remote_vaddr, seg_low, (void *) att_vaddr, addr_out);

if (handle->is_contig) {
/* We'll do MPIR_Typerep_unpack */
*vaddr = addr_out;
} else {
/* We'll do MPIR_Localcopy */
*vaddr = MPIR_get_contig_ptr(addr_out, -handle.true_lb);
*vaddr = MPIR_get_contig_ptr(addr_out, -handle->true_lb);
}

/* Error jumps land here; the success path also falls through this label,
* so *vaddr is only written when no earlier check failed. */
fn_fail:
MPIR_FUNC_EXIT;
return mpi_errno;
}

/* Undo a mapping made for the given handle.
 *
 * When the segment cache is enabled (MPIR_CVAR_CH4_XPMEM_SEG_CACHE_ENABLE)
 * nothing is detached here and MPI_SUCCESS is returned. Otherwise the
 * address stored in handle->att_vaddr is passed to xpmem_detach.
 *
 * Input parameters:
 * - handle: handle whose att_vaddr field holds the address to detach
 * Return value:
 * - MPI_SUCCESS, or an MPI_ERR_OTHER-class error ("**xpmem_detach") when
 *   xpmem_detach returns non-zero. */
int MPIDI_XPMEM_ipc_handle_unmap(MPIDI_XPMEM_ipc_handle_t * handle)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_ENTER;

    if (!MPIR_CVAR_CH4_XPMEM_SEG_CACHE_ENABLE) {
        /* no cache holds this attachment, so release it right away */
        int detach_rc = xpmem_detach((void *) handle->att_vaddr);
        MPIR_ERR_CHKANDJUMP(detach_rc != 0, mpi_errno, MPI_ERR_OTHER, "**xpmem_detach");
    }

  fn_exit:
    MPIR_FUNC_EXIT;
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/*** segment cache routines ***/

/* Initialize an empty tree for segment cache.
* It should be called only once for an AVL tree at MPI init. */
int MPIDI_XPMEMI_segtree_init(MPL_gavl_tree_t * tree)
{
    int mpi_errno = MPI_SUCCESS;
    int create_rc;

    MPIR_FUNC_ENTER;

    /* With the segment cache disabled no tree is created and *tree is left
     * untouched. */
    if (!MPIR_CVAR_CH4_XPMEM_SEG_CACHE_ENABLE) {
        goto fn_exit;
    }

    /* seg_free is registered as the callback that releases each cached
     * segment record. */
    create_rc = MPL_gavl_tree_create(seg_free, tree);
    MPIR_ERR_CHKANDJUMP(create_rc != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                        "**xpmem_segtree_init");

  fn_exit:
    MPIR_FUNC_EXIT;
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* Destroy a segment cache tree.
 *
 * Does nothing when MPIR_CVAR_CH4_XPMEM_SEG_CACHE_ENABLE is off (no tree
 * is created by MPIDI_XPMEMI_segtree_init in that mode). Otherwise the tree
 * is handed to MPL_gavl_tree_destroy.
 *
 * Return value:
 * - MPI_SUCCESS, or an MPI_ERR_OTHER-class error
 *   ("**xpmem_segtree_finalize") when MPL_gavl_tree_destroy does not
 *   return MPL_SUCCESS. */
int MPIDI_XPMEMI_segtree_finalize(MPL_gavl_tree_t tree)
{
    int mpi_errno = MPI_SUCCESS;
    int destroy_rc;

    MPIR_FUNC_ENTER;

    if (!MPIR_CVAR_CH4_XPMEM_SEG_CACHE_ENABLE) {
        goto fn_exit;
    }

    destroy_rc = MPL_gavl_tree_destroy(tree);
    MPIR_ERR_CHKANDJUMP(destroy_rc != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                        "**xpmem_segtree_finalize");

  fn_exit:
    MPIR_FUNC_EXIT;
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* Look up a cached segment for the given address and size.
 *
 * Returns NULL without consulting the tree when the segment cache is
 * disabled; otherwise forwards the lookup to MPL_gavl_tree_search and
 * returns its result. */
static MPIDI_XPMEMI_seg_t *seg_search(MPL_gavl_tree_t segcache, void *addr, uintptr_t size)
{
    if (!MPIR_CVAR_CH4_XPMEM_SEG_CACHE_ENABLE) {
        return NULL;
    }

    return MPL_gavl_tree_search(segcache, addr, size);
}

/* Record a newly attached segment in the cache tree.
 *
 * A no-op when the segment cache is disabled. Otherwise a segment record
 * holding the aligned remote base (seg_low) and the local attach address
 * (att_vaddr) is allocated and inserted into segcache under the key
 * (seg_low, seg_size). Allocation failure trips MPIR_Assert; the result of
 * MPL_gavl_tree_insert is not checked. */
static void seg_insert(MPL_gavl_tree_t segcache, uintptr_t seg_low, uintptr_t seg_size,
                       void *att_vaddr)
{
    MPIDI_XPMEMI_seg_t *entry;

    if (!MPIR_CVAR_CH4_XPMEM_SEG_CACHE_ENABLE) {
        return;
    }

    entry = MPL_malloc(sizeof *entry, MPL_MEM_OTHER);
    MPIR_Assert(entry != NULL);

    entry->remote_align_addr = seg_low;
    entry->att_vaddr = (uintptr_t) att_vaddr;

    MPL_gavl_tree_insert(segcache, (void *) seg_low, seg_size, (void *) entry);
}

/* Release one cached segment record.
 *
 * Registered with MPL_gavl_tree_create in MPIDI_XPMEMI_segtree_init.
 * Detaches the local attachment stored in the record and then frees the
 * record itself. The return value of xpmem_detach is not checked. */
static void seg_free(void *seg)
{
    MPIDI_XPMEMI_seg_t *entry = seg;

    MPIR_FUNC_ENTER;

    xpmem_detach((void *) entry->att_vaddr);
    MPL_free(seg);

    MPIR_FUNC_EXIT;
}
19 changes: 5 additions & 14 deletions src/mpid/ch4/shm/ipc/xpmem/xpmem_post.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,15 @@
#ifndef XPMEM_POST_H_INCLUDED
#define XPMEM_POST_H_INCLUDED

#ifdef MPIDI_CH4_SHM_ENABLE_XPMEM
#include "ch4_impl.h"
#include "ipc_types.h"
#include "xpmem_types.h"

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
- name : MPIR_CVAR_CH4_XPMEM_ENABLE
category : CH4
type : int
default : 1
class : none
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
To manually disable XPMEM set to 0. The environment variable is valid only when the XPMEM
submodule is enabled.

- name : MPIR_CVAR_CH4_IPC_XPMEM_P2P_THRESHOLD
category : CH4
type : int
Expand All @@ -38,7 +29,6 @@
=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

#ifdef MPIDI_CH4_SHM_ENABLE_XPMEM
MPL_STATIC_INLINE_PREFIX int MPIDI_XPMEM_get_ipc_attr(const void *buf, MPI_Aint count,
MPI_Datatype datatype,
MPIDI_IPCI_ipc_attr_t * ipc_attr)
Expand All @@ -52,7 +42,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_XPMEM_get_ipc_attr(const void *buf, MPI_Aint
int dt_contig;
MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, true_lb);

if (!MPIR_CVAR_CH4_XPMEM_ENABLE || buf == MPI_BOTTOM ||
if (!MPIDI_XPMEMI_global.initialized || buf == MPI_BOTTOM ||
data_sz < MPIR_CVAR_CH4_IPC_XPMEM_P2P_THRESHOLD) {
goto fn_exit;
} else {
Expand Down Expand Up @@ -99,7 +89,8 @@ MPL_STATIC_INLINE_PREFIX void MPIDI_XPMEM_fill_ipc_handle(MPIDI_IPCI_ipc_attr_t
int MPIDI_XPMEM_init_local(void);
int MPIDI_XPMEM_init_world(void);
int MPIDI_XPMEM_mpi_finalize_hook(void);
int MPIDI_XPMEM_ipc_handle_map(MPIDI_XPMEM_ipc_handle_t mem_handle, void **vaddr);
int MPIDI_XPMEM_ipc_handle_map(MPIDI_XPMEM_ipc_handle_t * mem_handle, void **vaddr);
int MPIDI_XPMEM_ipc_handle_unmap(MPIDI_XPMEM_ipc_handle_t * handle);
#endif

#endif /* XPMEM_POST_H_INCLUDED */
1 change: 1 addition & 0 deletions src/mpid/ch4/shm/ipc/xpmem/xpmem_pre.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ typedef struct {
int is_contig;
const void *addr;
MPI_Aint true_lb, range;
const void *att_vaddr;
} MPIDI_XPMEM_ipc_handle_t;

/* local struct used for query and preparing memory handle.
Expand Down
Loading