Merge pull request gpudirect#21 from gpudirect/cleaups

bug fix and major refactor of code
Deb25 · Jun 1, 2017 · 513efc1 · 513efc1
2 parents ac23645 + a7e902a
commit 513efc1
Show file tree

Hide file tree

Showing 18 changed files with 893 additions and 721 deletions.
diff --git a/Makefile.am b/Makefile.am
@@ -51,10 +51,10 @@ tests_gds_kernel_loopback_latency_LDADD = $(top_builddir)/src/libgdsync.la $(LIB
 SUFFIXES= .cu
 
 .cu.o:
-	$(NVCC) $(NVCCFLAGS)  -c -o $@ $<
+	$(NVCC) $(CPPFLAGS) $(NVCCFLAGS)  -c -o $@ $<
 
 
 .cu.lo:
-	$(LIBTOOL) --tag=CC --mode=compile $(NVCC) -o $@ -c $< $(NVCCFLAGS)
+	$(LIBTOOL) --tag=CC --mode=compile $(NVCC) -o $@ -c $< $(CPPFLAGS) $(NVCCFLAGS)
 
 endif
diff --git a/include/gdsync.h b/include/gdsync.h
@@ -36,67 +36,6 @@
 #include <infiniband/verbs_exp.h>
 #include <infiniband/peer_ops.h>
 
-#define ibv_peer_commit			 ibv_exp_peer_commit
-#define ibv_peer_commit_qp		 ibv_exp_peer_commit_qp
-
-#define ibv_create_qp_ex		 ibv_exp_create_qp
-#define ibv_qp_init_attr_ex		 ibv_exp_qp_init_attr
-#define ibv_create_cq_attr_ex		 ibv_exp_cq_init_attr
-
-#define IBV_QP_INIT_ATTR_PD		 IBV_EXP_QP_INIT_ATTR_PD
-#define IBV_QP_INIT_ATTR_PEER_DIRECT	 IBV_EXP_QP_INIT_ATTR_PEER_DIRECT
-
-#define IBV_CREATE_CQ_ATTR_PEER_DIRECT	 IBV_EXP_CQ_INIT_ATTR_PEER_DIRECT
-
-#define IBV_PEER_OP_FENCE		 IBV_EXP_PEER_OP_FENCE
-#define IBV_PEER_OP_STORE_DWORD		 IBV_EXP_PEER_OP_STORE_DWORD
-#define IBV_PEER_OP_STORE_QWORD		 IBV_EXP_PEER_OP_STORE_QWORD
-#define IBV_PEER_OP_POLL_AND_DWORD	 IBV_EXP_PEER_OP_POLL_AND_DWORD
-#define IBV_PEER_OP_POLL_NOR_DWORD	 IBV_EXP_PEER_OP_POLL_NOR_DWORD
-#define IBV_PEER_OP_POLL_GEQ_DWORD	 IBV_EXP_PEER_OP_POLL_GEQ_DWORD
-#define IBV_PEER_OP_COPY_BLOCK           IBV_EXP_PEER_OP_COPY_BLOCK
-
-#define IBV_PEER_OP_FENCE_CAP		 IBV_EXP_PEER_OP_FENCE_CAP
-#define IBV_PEER_OP_STORE_DWORD_CAP	 IBV_EXP_PEER_OP_STORE_DWORD_CAP
-#define IBV_PEER_OP_STORE_QWORD_CAP	 IBV_EXP_PEER_OP_STORE_QWORD_CAP
-#define IBV_PEER_OP_COPY_BLOCK_CAP       IBV_EXP_PEER_OP_COPY_BLOCK_CAP
-#define IBV_PEER_OP_POLL_AND_DWORD_CAP	 IBV_EXP_PEER_OP_POLL_AND_DWORD_CAP
-#define IBV_PEER_OP_POLL_NOR_DWORD_CAP	 IBV_EXP_PEER_OP_POLL_NOR_DWORD_CAP
-
-#define IBV_PEER_FENCE_OP_READ           IBV_EXP_PEER_FENCE_OP_READ      
-#define IBV_PEER_FENCE_OP_WRITE          IBV_EXP_PEER_FENCE_OP_WRITE
-#define IBV_PEER_FENCE_FROM_CPU          IBV_EXP_PEER_FENCE_FROM_CPU
-#define IBV_PEER_FENCE_FROM_HCA          IBV_EXP_PEER_FENCE_FROM_HCA
-#define IBV_PEER_FENCE_MEM_SYS           IBV_EXP_PEER_FENCE_MEM_SYS      
-#define IBV_PEER_FENCE_MEM_PEER          IBV_EXP_PEER_FENCE_MEM_PEER
-
-#define ibv_peer_direct_attr		 ibv_exp_peer_direct_attr
-#define ibv_peer_direction		 ibv_exp_peer_direction
-#define ibv_peer_op			 ibv_exp_peer_op
-
-#define IBV_ROLLBACK_ABORT_UNCOMMITED    IBV_EXP_ROLLBACK_ABORT_UNCOMMITED
-#define IBV_ROLLBACK_ABORT_LATE		 IBV_EXP_ROLLBACK_ABORT_LATE
-
-#define ibv_rollback_ctx		 ibv_exp_rollback_ctx
-#define ibv_rollback_qp			 ibv_exp_rollback_qp
-#define ibv_peer_peek			 ibv_exp_peer_peek
-#define ibv_peer_peek_cq		 ibv_exp_peer_peek_cq
-#define ibv_peer_abort_peek		 ibv_exp_peer_abort_peek
-#define ibv_peer_abort_peek_cq		 ibv_exp_peer_abort_peek_cq
-
-#define IBV_PEER_DIRECTION_FROM_CPU	 IBV_EXP_PEER_DIRECTION_FROM_CPU
-#define IBV_PEER_DIRECTION_FROM_HCA	 IBV_EXP_PEER_DIRECTION_FROM_HCA
-#define IBV_PEER_DIRECTION_FROM_PEER     IBV_EXP_PEER_DIRECTION_FROM_PEER
-#define IBV_PEER_DIRECTION_TO_CPU        IBV_EXP_PEER_DIRECTION_TO_CPU 
-#define IBV_PEER_DIRECTION_TO_HCA        IBV_EXP_PEER_DIRECTION_TO_HCA 
-#define IBV_PEER_DIRECTION_TO_PEER       IBV_EXP_PEER_DIRECTION_TO_PEER
-
-#define ibv_peer_buf			 ibv_exp_peer_buf
-#define ibv_peer_buf_alloc_attr		 ibv_exp_peer_buf_alloc_attr
-
-#define ibv_create_cq_ex_(ctx, attr, n, ch) \
-		ibv_exp_create_cq(ctx, n, NULL, ch, 0, attr)
-
 #include <cuda.h>
 #include <gdrapi.h>
 
@@ -111,7 +50,5 @@
 GDS_BEGIN_DECLS
 
 #include <gdsync/core.h>
-#include <gdsync/tools.h>
-#include <gdsync/mlx5.h>
 
 GDS_END_DECLS
diff --git a/include/gdsync/core.h b/include/gdsync/core.h
@@ -32,7 +32,7 @@
 #error "don't include directly this header, use gdsync.h always"
 #endif
 
-#define GDS_API_MAJOR_VERSION    1U
+#define GDS_API_MAJOR_VERSION    2U
 #define GDS_API_MINOR_VERSION    1U
 #define GDS_API_VERSION          ((GDS_API_MAJOR_VERSION << 16) | GDS_API_MINOR_VERSION)
 #define GDS_API_VERSION_COMPATIBLE(v) \
@@ -51,12 +51,11 @@ enum gds_create_qp_flags {
     GDS_CREATE_QP_WQ_ON_GPU    = 1<<0,
     GDS_CREATE_QP_TX_CQ_ON_GPU = 1<<1,
     GDS_CREATE_QP_RX_CQ_ON_GPU = 1<<2,
-    GDS_CREATE_QP_GPU_INVALIDATE_TX_CQ = 1<<3,
-    GDS_CREATE_QP_GPU_INVALIDATE_RX_CQ = 1<<4,
     GDS_CREATE_QP_WQ_DBREC_ON_GPU = 1<<5,
 };
 
-typedef struct ibv_qp_init_attr_ex gds_qp_init_attr_t;
+typedef struct ibv_exp_qp_init_attr gds_qp_init_attr_t;
+typedef struct ibv_exp_send_wr gds_send_wr;
 
 struct gds_cq {
         struct ibv_cq *cq;
@@ -76,17 +75,13 @@ struct gds_qp *gds_create_qp(struct ibv_pd *pd, struct ibv_context *context,
                              int gpu_id, int flags);
 int gds_destroy_qp(struct gds_qp *qp);
 
-//struct ibv_exp_peer_direct_attr;
-//int gds_register_peer(struct ibv_context *context, unsigned gpu_id, struct ibv_exp_peer_direct_attr **p_attr);
-int gds_register_peer(struct ibv_context *context, unsigned gpu_id);
-
 /* \brief: CPU-synchronous post send for peer QPs
  *
  * Notes:
  * - this API might have higher overhead than ibv_post_send. 
  * - It is provided for convenience only.
  */
-int gds_post_send(struct gds_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr);
+int gds_post_send(struct gds_qp *qp, gds_send_wr *wr, gds_send_wr **bad_wr);
 
 /* \brief: CPU-synchronous post recv for peer QPs
  *
@@ -95,93 +90,165 @@ int gds_post_send(struct gds_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr
  */
 int gds_post_recv(struct gds_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr);
 
-// forward decls
-enum gds_wait_cq_flags {
-    GDS_WAIT_CQ_CONSUME_CQE = 1 // GPU will invalidate CQE after polling on that completes.
-                                // In this case, CPU must avoid calling ibv_poll_cq() on that CQ so
-                                // to avoid races
-};
-
 int gds_stream_wait_cq(CUstream stream, struct gds_cq *cq, int flags);
-// same as above, plus writing a trailing flag word efficiently
-int gds_stream_wait_cq_ex(CUstream stream, struct gds_cq *cq, int flags, uint32_t *dw, uint32_t val);
 
 /* \brief: GPU stream-synchronous send for peer QPs
  *
  * Notes:
  * - execution of the send operation happens in CUDA stream order
  */
-int gds_stream_queue_send(CUstream stream, struct gds_qp *qp, struct ibv_exp_send_wr *p_ewr, struct ibv_exp_send_wr **bad_ewr);
-// same as above, plus writing a trailing flag word efficiently
-int gds_stream_queue_send_ex(CUstream stream, struct gds_qp *qp, struct ibv_exp_send_wr *p_ewr, struct ibv_exp_send_wr **bad_ewr, uint32_t *dw, uint32_t val);
-
-/* \brief GPU stream-synchronous post recv
- * Notes:
- * - this is not implemented and returns an error
- *   see notes for gds_post_recv
- */
-int gds_stream_queue_recv(CUstream stream, struct gds_qp *qp, struct ibv_recv_wr *p_ewr, struct ibv_recv_wr **bad_ewr);
+int gds_stream_queue_send(CUstream stream, struct gds_qp *qp, gds_send_wr *p_ewr, gds_send_wr **bad_ewr);
 
 
 // batched submission APIs
 
+typedef enum gds_memory_type {
+        GDS_MEMORY_GPU  = 1,
+        GDS_MEMORY_HOST = 2,
+        GDS_MEMORY_IO   = 4,
+	GDS_MEMORY_MASK = 0x7
+} gds_memory_type_t;
+
+typedef enum gds_wait_flags {
+	GDS_WAIT_POST_FLUSH = 1<<3,
+} gds_wait_flags_t;
+
+typedef enum gds_write_flags {
+	GDS_WRITE_PRE_BARRIER = 1<<4,
+} gds_write_flags_t;
+
+typedef enum gds_immcopy_flags {
+	GDS_IMMCOPY_POST_TAIL_FLUSH = 1<<4,
+} gds_immcopy_flags_t;
+
+typedef enum gds_membar_flags {
+	GDS_MEMBAR_FLUSH_REMOTE = 1<<4,
+	GDS_MEMBAR_DEFAULT      = 1<<5,
+	GDS_MEMBAR_SYS          = 1<<6,
+} gds_membar_flags_t;
+
 enum {
         GDS_SEND_INFO_MAX_OPS = 32,
         GDS_WAIT_INFO_MAX_OPS = 32
 };
 
+/**
+ * Represents a posted send operation on a particular QP
+ */
+
 typedef struct gds_send_request {
         struct ibv_exp_peer_commit commit;
         struct peer_op_wr wr[GDS_SEND_INFO_MAX_OPS];
 } gds_send_request_t;
 
+int gds_prepare_send(struct gds_qp *qp, gds_send_wr *p_ewr, gds_send_wr **bad_ewr, gds_send_request_t *request);
+int gds_stream_post_send(CUstream stream, gds_send_request_t *request);
+int gds_stream_post_send_all(CUstream stream, int count, gds_send_request_t *request);
+
+
+/**
+ * Represents a wait operation on a particular CQ
+ */
+
 typedef struct gds_wait_request {
         struct ibv_exp_peer_peek peek;
         struct peer_op_wr wr[GDS_WAIT_INFO_MAX_OPS];
 } gds_wait_request_t;
 
-typedef struct gds_value32_descriptor { 
+/**
+ * Initializes a wait request out of the next heading CQE, which is kept in
+ * cq->curr_offset.
+ *
+ * flags: must be 0
+ */
+int gds_prepare_wait_cq(struct gds_cq *cq, gds_wait_request_t *request, int flags);
+
+/**
+ * Issues the descriptors contained in request on the CUDA stream
+ *
+ */
+int gds_stream_post_wait_cq(CUstream stream, gds_wait_request_t *request);
+
+/**
+ * Issues the descriptors contained in the array of requests on the CUDA stream.
+ * This has potentially less overhead than submitting each request individually.
+ *
+ */
+int gds_stream_post_wait_cq_all(CUstream stream, int count, gds_wait_request_t *request);
+
+/**
+ * \brief CPU-synchronously enable polling on request
+ *
+ * Unblock calls to ibv_poll_cq. CPU will do what is necessary to make the corresponding
+ * CQE poll-able.
+ *
+ */
+int gds_post_wait_cq(struct gds_cq *cq, gds_wait_request_t *request, int flags);
+
+
+
+/**
+ * Represents the condition operation for wait operations on memory words
+ */
+
+typedef enum gds_wait_cond_flag {
+        GDS_WAIT_COND_GEQ = 0, // must match verbs_exp enum
+        GDS_WAIT_COND_EQ,
+        GDS_WAIT_COND_AND,
+        GDS_WAIT_COND_NOR
+} gds_wait_cond_flag_t;
+
+/**
+ * Represents a wait operation on a 32-bits memory word
+ */
+
+typedef struct gds_wait_value32 { 
+        uint32_t  *ptr;
+        uint32_t   value;
+        gds_wait_cond_flag_t cond_flags;
+        int        flags; // takes gds_memory_type_t | gds_wait_flags_t
+} gds_wait_value32_t;
+
+/**
+ * flags: gds_memory_type_t | gds_wait_flags_t
+ */
+int gds_prepare_wait_value32(gds_wait_value32_t *desc, uint32_t *ptr, uint32_t value, gds_wait_cond_flag_t cond_flags, int flags);
+
+
+
+/**
+ * Represents a write operation on a 32-bits memory word
+ */
+
+typedef struct gds_write_value32 { 
         uint32_t  *ptr;
         uint32_t   value;
-        int        cond_flags; // takes gds_poll_cond_flag_t, don't care field for GDS_TAG_WRITE_VALUE32
-        int        flags; // takes gds_poll_memory_type_t | gds_poll_flags_t | gds_poke_flags_t
-} gds_value32_descriptor_t;
+        int        flags; // takes gds_memory_type_t | gds_write_flags_t
+} gds_write_value32_t;
+
+/**
+ * flags:  gds_memory_type_t | gds_write_flags_t
+ */
+int gds_prepare_write_value32(gds_write_value32_t *desc, uint32_t *ptr, uint32_t value, int flags);
+
+
 
 typedef enum gds_tag { GDS_TAG_SEND, GDS_TAG_WAIT, GDS_TAG_WAIT_VALUE32, GDS_TAG_WRITE_VALUE32 } gds_tag_t;
 
 typedef struct gds_descriptor {
-        gds_tag_t tag;
+        gds_tag_t tag; /**< selector for union below */
         union {
-                gds_send_request_t      *send;
-                gds_wait_request_t      *wait;
-                gds_value32_descriptor_t value32;
+                gds_send_request_t  *send;
+                gds_wait_request_t  *wait;
+                gds_wait_value32_t   wait32;
+                gds_write_value32_t  write32;
         };
 } gds_descriptor_t;
 
-int gds_prepare_wait_value32(uint32_t *ptr, uint32_t value, int cond_flags, int flags, gds_value32_descriptor_t *desc);
-int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_t *descs);
-
-int gds_prepare_send(struct gds_qp *qp, struct ibv_exp_send_wr *p_ewr, struct ibv_exp_send_wr **bad_ewr, gds_send_request_t *request);
-int gds_stream_post_send(CUstream stream, gds_send_request_t *request);
-int gds_stream_post_send_ex(CUstream stream, gds_send_request_t *request, uint32_t *dw, uint32_t val);
-int gds_stream_post_send_all(CUstream stream, int count, gds_send_request_t *request);
-//int gds_stream_post_send_all_ex(CUstream stream, int count, gds_send_request_t request, uint32_t *dw, uint32_t val);
-
-int gds_prepare_wait_cq(struct gds_cq *cq, gds_wait_request_t *request, int flags);
-int gds_stream_post_wait_cq(CUstream stream, gds_wait_request_t *request);
-int gds_stream_post_wait_cq_ex(CUstream stream, gds_wait_request_t *request, uint32_t *dw, uint32_t val);
-int gds_stream_post_wait_cq_all(CUstream stream, int count, gds_wait_request_t *request);
-//int gds_stream_post_wait_cq_all_ex(CUstream stream, int count, gds_wait_request_t request, uint32_t *dw, uint32_t val);
-int gds_append_wait_cq(gds_wait_request_t *request, uint32_t *dw, uint32_t val);
-
-
-/* \brief CPU-synchronously enable polling on request
- *
- * Unblock calls to ibv_poll_cq. CPU will do what is necessary to make the corresponding
- * CQE poll-able.
- *
+/**
+ * flags: must be 0
  */
-int gds_post_wait_cq(struct gds_cq *cq, gds_wait_request_t *request, int flags);
+int gds_stream_post_descriptors(CUstream stream, size_t n_descs, gds_descriptor_t *descs, int flags);
 
 /*
  * Local variables:

diff --git a/include/gdsync/device.cuh b/include/gdsync/device.cuh
@@ -29,6 +29,8 @@
 
 #include <gdsync.h> // for gds_poll_cond_flag_t
 
+#ifdef  __cplusplus
+
 namespace gdsync {
 
     static const clock_t large_timeout = 1ULL<<32;
@@ -38,7 +40,7 @@ namespace gdsync {
     };
 
     //typedef enum wait_cond { WAIT_GEQ, WAIT_EQ, WAIT_AND, WAIT_NOR } wait_cond_t;
-    typedef gds_poll_cond_flag_t wait_cond_t;
+    typedef gds_wait_cond_flag_t wait_cond_t;
 
     struct sem32 {
         typedef uint32_t T;
@@ -90,10 +92,10 @@ namespace gdsync {
         template<typename S> __device__ inline int wait(S &sem, wait_cond_t cond) {
             int ret = 0;
             switch(cond) {
-            case GDS_POLL_COND_EQ:  ret = wait_eq(sem);  break;
-            case GDS_POLL_COND_GEQ: ret = wait_geq(sem); break;
-            case GDS_POLL_COND_AND: ret = wait_and(sem); break;
-            case GDS_POLL_COND_NOR: ret = wait_nor(sem); break;
+            case GDS_WAIT_COND_EQ:  ret = wait_eq(sem);  break;
+            case GDS_WAIT_COND_GEQ: ret = wait_geq(sem); break;
+            case GDS_WAIT_COND_AND: ret = wait_and(sem); break;
+            case GDS_WAIT_COND_NOR: ret = wait_nor(sem); break;
             default: ret = ERROR_INVALID; break;
             }
             return ret;
@@ -157,3 +159,5 @@ namespace gdsync {
 #endif
 
 } // namespace gdsync
+
+#endif // __cplusplus