Skip to content

Commit

Permalink
Merge pull request gpudirect#21 from gpudirect/cleaups
Browse files Browse the repository at this point in the history
bug fix and major refactor of code
  • Loading branch information
drossetti committed Jun 1, 2017
2 parents ac23645 + a7e902a commit 513efc1
Show file tree
Hide file tree
Showing 18 changed files with 893 additions and 721 deletions.
4 changes: 2 additions & 2 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,10 @@ tests_gds_kernel_loopback_latency_LDADD = $(top_builddir)/src/libgdsync.la $(LIB
SUFFIXES= .cu

.cu.o:
$(NVCC) $(NVCCFLAGS) -c -o $@ $<
$(NVCC) $(CPPFLAGS) $(NVCCFLAGS) -c -o $@ $<


.cu.lo:
$(LIBTOOL) --tag=CC --mode=compile $(NVCC) -o $@ -c $< $(NVCCFLAGS)
$(LIBTOOL) --tag=CC --mode=compile $(NVCC) -o $@ -c $< $(CPPFLAGS) $(NVCCFLAGS)

endif
63 changes: 0 additions & 63 deletions include/gdsync.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,67 +36,6 @@
#include <infiniband/verbs_exp.h>
#include <infiniband/peer_ops.h>

#define ibv_peer_commit ibv_exp_peer_commit
#define ibv_peer_commit_qp ibv_exp_peer_commit_qp

#define ibv_create_qp_ex ibv_exp_create_qp
#define ibv_qp_init_attr_ex ibv_exp_qp_init_attr
#define ibv_create_cq_attr_ex ibv_exp_cq_init_attr

#define IBV_QP_INIT_ATTR_PD IBV_EXP_QP_INIT_ATTR_PD
#define IBV_QP_INIT_ATTR_PEER_DIRECT IBV_EXP_QP_INIT_ATTR_PEER_DIRECT

#define IBV_CREATE_CQ_ATTR_PEER_DIRECT IBV_EXP_CQ_INIT_ATTR_PEER_DIRECT

#define IBV_PEER_OP_FENCE IBV_EXP_PEER_OP_FENCE
#define IBV_PEER_OP_STORE_DWORD IBV_EXP_PEER_OP_STORE_DWORD
#define IBV_PEER_OP_STORE_QWORD IBV_EXP_PEER_OP_STORE_QWORD
#define IBV_PEER_OP_POLL_AND_DWORD IBV_EXP_PEER_OP_POLL_AND_DWORD
#define IBV_PEER_OP_POLL_NOR_DWORD IBV_EXP_PEER_OP_POLL_NOR_DWORD
#define IBV_PEER_OP_POLL_GEQ_DWORD IBV_EXP_PEER_OP_POLL_GEQ_DWORD
#define IBV_PEER_OP_COPY_BLOCK IBV_EXP_PEER_OP_COPY_BLOCK

#define IBV_PEER_OP_FENCE_CAP IBV_EXP_PEER_OP_FENCE_CAP
#define IBV_PEER_OP_STORE_DWORD_CAP IBV_EXP_PEER_OP_STORE_DWORD_CAP
#define IBV_PEER_OP_STORE_QWORD_CAP IBV_EXP_PEER_OP_STORE_QWORD_CAP
#define IBV_PEER_OP_COPY_BLOCK_CAP IBV_EXP_PEER_OP_COPY_BLOCK_CAP
#define IBV_PEER_OP_POLL_AND_DWORD_CAP IBV_EXP_PEER_OP_POLL_AND_DWORD_CAP
#define IBV_PEER_OP_POLL_NOR_DWORD_CAP IBV_EXP_PEER_OP_POLL_NOR_DWORD_CAP

#define IBV_PEER_FENCE_OP_READ IBV_EXP_PEER_FENCE_OP_READ
#define IBV_PEER_FENCE_OP_WRITE IBV_EXP_PEER_FENCE_OP_WRITE
#define IBV_PEER_FENCE_FROM_CPU IBV_EXP_PEER_FENCE_FROM_CPU
#define IBV_PEER_FENCE_FROM_HCA IBV_EXP_PEER_FENCE_FROM_HCA
#define IBV_PEER_FENCE_MEM_SYS IBV_EXP_PEER_FENCE_MEM_SYS
#define IBV_PEER_FENCE_MEM_PEER IBV_EXP_PEER_FENCE_MEM_PEER

#define ibv_peer_direct_attr ibv_exp_peer_direct_attr
#define ibv_peer_direction ibv_exp_peer_direction
#define ibv_peer_op ibv_exp_peer_op

#define IBV_ROLLBACK_ABORT_UNCOMMITED IBV_EXP_ROLLBACK_ABORT_UNCOMMITED
#define IBV_ROLLBACK_ABORT_LATE IBV_EXP_ROLLBACK_ABORT_LATE

#define ibv_rollback_ctx ibv_exp_rollback_ctx
#define ibv_rollback_qp ibv_exp_rollback_qp
#define ibv_peer_peek ibv_exp_peer_peek
#define ibv_peer_peek_cq ibv_exp_peer_peek_cq
#define ibv_peer_abort_peek ibv_exp_peer_abort_peek
#define ibv_peer_abort_peek_cq ibv_exp_peer_abort_peek_cq

#define IBV_PEER_DIRECTION_FROM_CPU IBV_EXP_PEER_DIRECTION_FROM_CPU
#define IBV_PEER_DIRECTION_FROM_HCA IBV_EXP_PEER_DIRECTION_FROM_HCA
#define IBV_PEER_DIRECTION_FROM_PEER IBV_EXP_PEER_DIRECTION_FROM_PEER
#define IBV_PEER_DIRECTION_TO_CPU IBV_EXP_PEER_DIRECTION_TO_CPU
#define IBV_PEER_DIRECTION_TO_HCA IBV_EXP_PEER_DIRECTION_TO_HCA
#define IBV_PEER_DIRECTION_TO_PEER IBV_EXP_PEER_DIRECTION_TO_PEER

#define ibv_peer_buf ibv_exp_peer_buf
#define ibv_peer_buf_alloc_attr ibv_exp_peer_buf_alloc_attr

#define ibv_create_cq_ex_(ctx, attr, n, ch) \
ibv_exp_create_cq(ctx, n, NULL, ch, 0, attr)

#include <cuda.h>
#include <gdrapi.h>

Expand All @@ -111,7 +50,5 @@
GDS_BEGIN_DECLS

#include <gdsync/core.h>
#include <gdsync/tools.h>
#include <gdsync/mlx5.h>

GDS_END_DECLS
185 changes: 126 additions & 59 deletions include/gdsync/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
#error "don't include directly this header, use gdsync.h always"
#endif

#define GDS_API_MAJOR_VERSION 1U
#define GDS_API_MAJOR_VERSION 2U
#define GDS_API_MINOR_VERSION 1U
#define GDS_API_VERSION ((GDS_API_MAJOR_VERSION << 16) | GDS_API_MINOR_VERSION)
#define GDS_API_VERSION_COMPATIBLE(v) \
Expand All @@ -51,12 +51,11 @@ enum gds_create_qp_flags {
GDS_CREATE_QP_WQ_ON_GPU = 1<<0,
GDS_CREATE_QP_TX_CQ_ON_GPU = 1<<1,
GDS_CREATE_QP_RX_CQ_ON_GPU = 1<<2,
GDS_CREATE_QP_GPU_INVALIDATE_TX_CQ = 1<<3,
GDS_CREATE_QP_GPU_INVALIDATE_RX_CQ = 1<<4,
GDS_CREATE_QP_WQ_DBREC_ON_GPU = 1<<5,
};

typedef struct ibv_qp_init_attr_ex gds_qp_init_attr_t;
typedef struct ibv_exp_qp_init_attr gds_qp_init_attr_t;
typedef struct ibv_exp_send_wr gds_send_wr;

struct gds_cq {
struct ibv_cq *cq;
Expand All @@ -76,17 +75,13 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context,
int gpu_id, int flags);
int gds_destroy_qp(struct gds_qp *qp);

//struct ibv_exp_peer_direct_attr;
//int gds_register_peer(struct ibv_context *context, unsigned gpu_id, struct ibv_exp_peer_direct_attr **p_attr);
int gds_register_peer(struct ibv_context *context, unsigned gpu_id);

/* \brief: CPU-synchronous post send for peer QPs
*
* Notes:
* - this API might have higher overhead than ibv_post_send.
* - It is provided for convenience only.
*/
int gds_post_send(struct gds_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr);
int gds_post_send(struct gds_qp *qp, gds_send_wr *wr, gds_send_wr **bad_wr);

/* \brief: CPU-synchronous post recv for peer QPs
*
Expand All @@ -95,93 +90,165 @@ int gds_post_send(struct gds_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr
*/
int gds_post_recv(struct gds_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr);

// forward decls
enum gds_wait_cq_flags {
GDS_WAIT_CQ_CONSUME_CQE = 1 // GPU will invalidate CQE after polling on that completes.
// In this case, CPU must avoid calling ibv_poll_cq() on that CQ so
// to avoid races
};

int gds_stream_wait_cq(CUstream stream, struct gds_cq *cq, int flags);
// same as above, plus writing a trailing flag word efficiently
int gds_stream_wait_cq_ex(CUstream stream, struct gds_cq *cq, int flags, uint32_t *dw, uint32_t val);

/* \brief: GPU stream-synchronous send for peer QPs
*
* Notes:
* - execution of the send operation happens in CUDA stream order
*/
int gds_stream_queue_send(CUstream stream, struct gds_qp *qp, struct ibv_exp_send_wr *p_ewr, struct ibv_exp_send_wr **bad_ewr);
// same as above, plus writing a trailing flag word efficiently
int gds_stream_queue_send_ex(CUstream stream, struct gds_qp *qp, struct ibv_exp_send_wr *p_ewr, struct ibv_exp_send_wr **bad_ewr, uint32_t *dw, uint32_t val);

/* \brief GPU stream-synchronous post recv
* Notes:
* - this is not implemented and returns an error
* see notes for gds_post_recv
*/
int gds_stream_queue_recv(CUstream stream, struct gds_qp *qp, struct ibv_recv_wr *p_ewr, struct ibv_recv_wr **bad_ewr);
int gds_stream_queue_send(CUstream stream, struct gds_qp *qp, gds_send_wr *p_ewr, gds_send_wr **bad_ewr);


// batched submission APIs

typedef enum gds_memory_type {
GDS_MEMORY_GPU = 1,
GDS_MEMORY_HOST = 2,
GDS_MEMORY_IO = 4,
GDS_MEMORY_MASK = 0x7
} gds_memory_type_t;

typedef enum gds_wait_flags {
GDS_WAIT_POST_FLUSH = 1<<3,
} gds_wait_flags_t;

typedef enum gds_write_flags {
GDS_WRITE_PRE_BARRIER = 1<<4,
} gds_write_flags_t;

typedef enum gds_immcopy_flags {
GDS_IMMCOPY_POST_TAIL_FLUSH = 1<<4,
} gds_immcopy_flags_t;

typedef enum gds_membar_flags {
GDS_MEMBAR_FLUSH_REMOTE = 1<<4,
GDS_MEMBAR_DEFAULT = 1<<5,
GDS_MEMBAR_SYS = 1<<6,
} gds_membar_flags_t;

enum {
GDS_SEND_INFO_MAX_OPS = 32,
GDS_WAIT_INFO_MAX_OPS = 32
};

/**
* Represents a posted send operation on a particular QP
*/

typedef struct gds_send_request {
struct ibv_exp_peer_commit commit;
struct peer_op_wr wr[GDS_SEND_INFO_MAX_OPS];
} gds_send_request_t;

int gds_prepare_send(struct gds_qp *qp, gds_send_wr *p_ewr, gds_send_wr **bad_ewr, gds_send_request_t *request);
int gds_stream_post_send(CUstream stream, gds_send_request_t *request);
int gds_stream_post_send_all(CUstream stream, int count, gds_send_request_t *request);


/**
* Represents a wait operation on a particular CQ
*/

typedef struct gds_wait_request {
struct ibv_exp_peer_peek peek;
struct peer_op_wr wr[GDS_WAIT_INFO_MAX_OPS];
} gds_wait_request_t;

typedef struct gds_value32_descriptor {
/**
* Initializes a wait request out of the next heading CQE, which is kept in
* cq->curr_offset.
*
* flags: must be 0
*/
int gds_prepare_wait_cq(struct gds_cq *cq, gds_wait_request_t *request, int flags);

/**
* Issues the descriptors contained in request on the CUDA stream
*
*/
int gds_stream_post_wait_cq(CUstream stream, gds_wait_request_t *request);

/**
* Issues the descriptors contained in the array of requests on the CUDA stream.
* This has potentially less overhead than submitting each request individually.
*
*/
int gds_stream_post_wait_cq_all(CUstream stream, int count, gds_wait_request_t *request);

/**
* \brief CPU-synchronously enable polling on request
*
* Unblock calls to ibv_poll_cq. CPU will do what is necessary to make the corresponding
* CQE poll-able.
*
*/
int gds_post_wait_cq(struct gds_cq *cq, gds_wait_request_t *request, int flags);



/**
* Represents the condition operation for wait operations on memory words
*/

typedef enum gds_wait_cond_flag {
GDS_WAIT_COND_GEQ = 0, // must match verbs_exp enum
GDS_WAIT_COND_EQ,
GDS_WAIT_COND_AND,
GDS_WAIT_COND_NOR
} gds_wait_cond_flag_t;

/**
* Represents a wait operation on a 32-bits memory word
*/

typedef struct gds_wait_value32 {
uint32_t *ptr;
uint32_t value;
gds_wait_cond_flag_t cond_flags;
int flags; // takes gds_memory_type_t | gds_wait_flags_t
} gds_wait_value32_t;

/**
* flags: gds_memory_type_t | gds_wait_flags_t
*/
int gds_prepare_wait_value32(gds_wait_value32_t *desc, uint32_t *ptr, uint32_t value, gds_wait_cond_flag_t cond_flags, int flags);



/**
* Represents a write operation on a 32-bits memory word
*/

typedef struct gds_write_value32 {
uint32_t *ptr;
uint32_t value;
int cond_flags; // takes gds_poll_cond_flag_t, don't care field for GDS_TAG_WRITE_VALUE32
int flags; // takes gds_poll_memory_type_t | gds_poll_flags_t | gds_poke_flags_t
} gds_value32_descriptor_t;
int flags; // takes gds_memory_type_t | gds_write_flags_t
} gds_write_value32_t;

/**
* flags: gds_memory_type_t | gds_write_flags_t
*/
int gds_prepare_write_value32(gds_write_value32_t *desc, uint32_t *ptr, uint32_t value, int flags);



typedef enum gds_tag { GDS_TAG_SEND, GDS_TAG_WAIT, GDS_TAG_WAIT_VALUE32, GDS_TAG_WRITE_VALUE32 } gds_tag_t;

typedef struct gds_descriptor {
gds_tag_t tag;
gds_tag_t tag; /**< selector for union below */
union {
gds_send_request_t *send;
gds_wait_request_t *wait;
gds_value32_descriptor_t value32;
gds_send_request_t *send;
gds_wait_request_t *wait;
gds_wait_value32_t wait32;
gds_write_value32_t write32;
};
} gds_descriptor_t;

int gds_prepare_wait_value32(uint32_t *ptr, uint32_t value, int cond_flags, int flags, gds_value32_descriptor_t *desc);
int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_t *descs);

int gds_prepare_send(struct gds_qp *qp, struct ibv_exp_send_wr *p_ewr, struct ibv_exp_send_wr **bad_ewr, gds_send_request_t *request);
int gds_stream_post_send(CUstream stream, gds_send_request_t *request);
int gds_stream_post_send_ex(CUstream stream, gds_send_request_t *request, uint32_t *dw, uint32_t val);
int gds_stream_post_send_all(CUstream stream, int count, gds_send_request_t *request);
//int gds_stream_post_send_all_ex(CUstream stream, int count, gds_send_request_t request, uint32_t *dw, uint32_t val);

int gds_prepare_wait_cq(struct gds_cq *cq, gds_wait_request_t *request, int flags);
int gds_stream_post_wait_cq(CUstream stream, gds_wait_request_t *request);
int gds_stream_post_wait_cq_ex(CUstream stream, gds_wait_request_t *request, uint32_t *dw, uint32_t val);
int gds_stream_post_wait_cq_all(CUstream stream, int count, gds_wait_request_t *request);
//int gds_stream_post_wait_cq_all_ex(CUstream stream, int count, gds_wait_request_t request, uint32_t *dw, uint32_t val);
int gds_append_wait_cq(gds_wait_request_t *request, uint32_t *dw, uint32_t val);


/* \brief CPU-synchronously enable polling on request
*
* Unblock calls to ibv_poll_cq. CPU will do what is necessary to make the corresponding
* CQE poll-able.
*
/**
* flags: must be 0
*/
int gds_post_wait_cq(struct gds_cq *cq, gds_wait_request_t *request, int flags);
int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_t *descs, int flags);

/*
* Local variables:
Expand Down
14 changes: 9 additions & 5 deletions include/gdsync/device.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@

#include <gdsync.h> // for gds_poll_cond_flag_t

#ifdef __cplusplus

namespace gdsync {

static const clock_t large_timeout = 1ULL<<32;
Expand All @@ -38,7 +40,7 @@ namespace gdsync {
};

//typedef enum wait_cond { WAIT_GEQ, WAIT_EQ, WAIT_AND, WAIT_NOR } wait_cond_t;
typedef gds_poll_cond_flag_t wait_cond_t;
typedef gds_wait_cond_flag_t wait_cond_t;

struct sem32 {
typedef uint32_t T;
Expand Down Expand Up @@ -90,10 +92,10 @@ namespace gdsync {
template<typename S> __device__ inline int wait(S &sem, wait_cond_t cond) {
int ret = 0;
switch(cond) {
case GDS_POLL_COND_EQ: ret = wait_eq(sem); break;
case GDS_POLL_COND_GEQ: ret = wait_geq(sem); break;
case GDS_POLL_COND_AND: ret = wait_and(sem); break;
case GDS_POLL_COND_NOR: ret = wait_nor(sem); break;
case GDS_WAIT_COND_EQ: ret = wait_eq(sem); break;
case GDS_WAIT_COND_GEQ: ret = wait_geq(sem); break;
case GDS_WAIT_COND_AND: ret = wait_and(sem); break;
case GDS_WAIT_COND_NOR: ret = wait_nor(sem); break;
default: ret = ERROR_INVALID; break;
}
return ret;
Expand Down Expand Up @@ -157,3 +159,5 @@ namespace gdsync {
#endif

} // namespace gdsync

#endif // __cplusplus
Loading

0 comments on commit 513efc1

Please sign in to comment.