Bug fixes for MNMG coarsen_graph, renumber_edgelist, relabel (rapidsai#1364)

Bug fixes for MNMG coarsen_graph, renumber_edgelist, relabel

Authors:
  - Seunghwa Kang (@seunghwak)

Approvers:
  - Andrei Schaffer (@aschaffer)
  - Rick Ratzel (@rlratzel)
  - Chuck Hastings (@ChuckHastings)
  - Alex Fender (@afender)

URL: rapidsai#1364
seunghwak committed Feb 2, 2021
1 parent 44b0679 commit 5813559
Showing 5 changed files with 51 additions and 24 deletions.
54 changes: 37 additions & 17 deletions cpp/include/utilities/shuffle_comm.cuh
@@ -50,15 +50,24 @@ rmm::device_uvector<size_t> sort_and_count(raft::comms::comms_t const &comm,
auto gpu_id_first = thrust::make_transform_iterator(
tx_value_first,
[value_to_gpu_id_op] __device__(auto value) { return value_to_gpu_id_op(value); });
rmm::device_uvector<int> d_tx_dst_ranks(comm_size, stream);
rmm::device_uvector<size_t> d_tx_value_counts(comm_size, stream);
thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream),
gpu_id_first,
gpu_id_first + thrust::distance(tx_value_first, tx_value_last),
thrust::make_constant_iterator(size_t{1}),
thrust::make_discard_iterator(),
d_tx_value_counts.begin());
std::vector<size_t> tx_value_counts(comm_size);
raft::update_host(tx_value_counts.data(), d_tx_value_counts.data(), comm_size, stream);
auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream),
gpu_id_first,
gpu_id_first + thrust::distance(tx_value_first, tx_value_last),
thrust::make_constant_iterator(size_t{1}),
d_tx_dst_ranks.begin(),
d_tx_value_counts.begin());
if (thrust::distance(d_tx_value_counts.begin(), thrust::get<1>(last)) < comm_size) {
rmm::device_uvector<size_t> d_counts(comm_size, stream);
thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0});
thrust::scatter(rmm::exec_policy(stream)->on(stream),
d_tx_value_counts.begin(),
thrust::get<1>(last),
d_tx_dst_ranks.begin(),
d_counts.begin());
d_tx_value_counts = std::move(d_counts);
}

return std::move(d_tx_value_counts);
}
@@ -83,13 +92,24 @@ rmm::device_uvector<size_t> sort_and_count(raft::comms::comms_t const &comm,

auto gpu_id_first = thrust::make_transform_iterator(
tx_key_first, [key_to_gpu_id_op] __device__(auto key) { return key_to_gpu_id_op(key); });
rmm::device_uvector<int> d_tx_dst_ranks(comm_size, stream);
rmm::device_uvector<size_t> d_tx_value_counts(comm_size, stream);
thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream),
gpu_id_first,
gpu_id_first + thrust::distance(tx_key_first, tx_key_last),
thrust::make_constant_iterator(size_t{1}),
thrust::make_discard_iterator(),
d_tx_value_counts.begin());
auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream),
gpu_id_first,
gpu_id_first + thrust::distance(tx_key_first, tx_key_last),
thrust::make_constant_iterator(size_t{1}),
d_tx_dst_ranks.begin(),
d_tx_value_counts.begin());
if (thrust::distance(d_tx_value_counts.begin(), thrust::get<1>(last)) < comm_size) {
rmm::device_uvector<size_t> d_counts(comm_size, stream);
thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0});
thrust::scatter(rmm::exec_policy(stream)->on(stream),
d_tx_value_counts.begin(),
thrust::get<1>(last),
d_tx_dst_ranks.begin(),
d_counts.begin());
d_tx_value_counts = std::move(d_counts);
}

return std::move(d_tx_value_counts);
}
@@ -191,7 +211,7 @@ auto shuffle_values(raft::comms::comms_t const &comm,

auto rx_value_buffer =
allocate_dataframe_buffer<typename std::iterator_traits<TxValueIterator>::value_type>(
rx_offsets.back(), stream);
rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream);

// FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released
// (if num_tx_dst_ranks == num_rx_src_ranks == comm_size).
@@ -234,7 +254,7 @@ auto sort_and_shuffle_values(raft::comms::comms_t const &comm,

auto rx_value_buffer =
allocate_dataframe_buffer<typename std::iterator_traits<ValueIterator>::value_type>(
rx_offsets.back() + rx_counts.back(), stream);
rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream);

// FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released
// (if num_tx_dst_ranks == num_rx_src_ranks == comm_size).
@@ -275,7 +295,7 @@ auto sort_and_shuffle_kv_pairs(raft::comms::comms_t const &comm,
detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream);

rmm::device_uvector<typename std::iterator_traits<VertexIterator>::value_type> rx_keys(
rx_offsets.back() + rx_counts.back(), stream);
rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream);
auto rx_value_buffer =
allocate_dataframe_buffer<typename std::iterator_traits<ValueIterator>::value_type>(
rx_keys.size(), stream);
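Note on the sort_and_count changes above: thrust::reduce_by_key emits one count per destination rank that actually appears, so when some ranks receive zero values the compacted counts no longer line up with rank indices. The fix keeps the distinct ranks in d_tx_dst_ranks and, when fewer than comm_size segments come back, scatters the compacted counts into a zero-filled vector of length comm_size. The snippet below is a minimal standalone sketch of that pattern with toy data and hypothetical names, not the cuGraph code itself:

// Sketch of the fixed counting pattern: reduce_by_key compacts away ranks that
// receive zero values, so the counts are scattered back into a zero-filled,
// comm_size-long vector indexed by destination rank.
#include <thrust/device_vector.h>
#include <thrust/distance.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/reduce.h>
#include <thrust/scatter.h>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  int const comm_size = 4;
  std::vector<int> h_dst_ranks{0, 0, 1, 3, 3, 3};  // sorted by destination rank; rank 2 gets nothing
  thrust::device_vector<int> dst_ranks(h_dst_ranks.begin(), h_dst_ranks.end());

  thrust::device_vector<int> unique_ranks(comm_size);
  thrust::device_vector<size_t> compact_counts(comm_size);
  auto last = thrust::reduce_by_key(dst_ranks.begin(),
                                    dst_ranks.end(),
                                    thrust::make_constant_iterator(size_t{1}),
                                    unique_ranks.begin(),
                                    compact_counts.begin());

  thrust::device_vector<size_t> counts(comm_size, size_t{0});
  if (thrust::distance(compact_counts.begin(), last.second) < comm_size) {
    // some ranks are missing: place each compacted count at its rank's slot
    thrust::scatter(compact_counts.begin(), last.second, unique_ranks.begin(), counts.begin());
  } else {
    counts = compact_counts;
  }

  for (int i = 0; i < comm_size; ++i) {
    std::printf("rank %d: %zu values\n", i, static_cast<size_t>(counts[i]));  // 2, 1, 0, 3
  }
  return 0;
}

The same zero-count concern motivates the other changes in this file: the receive buffers are now sized with rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, so an empty rx_offsets vector is never dereferenced via back().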
8 changes: 6 additions & 2 deletions cpp/src/experimental/coarsen_graph.cu
@@ -394,8 +394,12 @@ coarsen_graph(
// 4. renumber

rmm::device_uvector<vertex_t> renumber_map_labels(0, handle.get_stream());
partition_t<vertex_t> partition(
std::vector<vertex_t>{}, graph_view.is_hypergraph_partitioned(), 0, 0, 0, 0);
partition_t<vertex_t> partition(std::vector<vertex_t>(comm_size + 1, 0),
graph_view.is_hypergraph_partitioned(),
row_comm_size,
col_comm_size,
row_comm_rank,
col_comm_rank);
vertex_t number_of_vertices{};
edge_t number_of_edges{};
std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) =
6 changes: 3 additions & 3 deletions cpp/src/experimental/graph.cu
@@ -238,9 +238,9 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
}
number_of_local_edges_sum =
host_scalar_allreduce(comm, number_of_local_edges_sum, default_stream);
CUGRAPH_EXPECTS(number_of_local_edges_sum == this->get_number_of_edges(),
"Invalid input argument: the sum of local edges doe counts not match with "
"number_of_local_edges.");
CUGRAPH_EXPECTS(
number_of_local_edges_sum == this->get_number_of_edges(),
"Invalid input argument: the sum of local edge counts does not match with number_of_edges.");

CUGRAPH_EXPECTS(
partition.get_vertex_partition_last(comm_size - 1) == number_of_vertices,
3 changes: 3 additions & 0 deletions cpp/src/experimental/relabel.cu
@@ -116,6 +116,9 @@ void relabel(raft::handle_t const& handle,

// update intermediate relabel map

CUDA_TRY(cudaStreamSynchronize(
handle.get_stream())); // cuco::static_map currently does not take stream

cuco::static_map<vertex_t, vertex_t> relabel_map{
static_cast<size_t>(static_cast<double>(rx_label_pair_old_labels.size()) / load_factor),
invalid_vertex_id<vertex_t>::value,
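The synchronization added above is needed because the relabel pairs are produced on handle.get_stream(), while, as the new comment notes, cuco::static_map does not take a stream and runs on the default stream. A generic sketch of this sync-before-handoff pattern, using a plain Thrust reduction as a stand-in for the map construction (toy example, not the cuGraph code):

// Sketch: results produced asynchronously on a side stream must be made visible
// before a stream-unaware consumer (here, a reduce on the default stream) runs.
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>
#include <thrust/system/cuda/execution_policy.h>
#include <cstdio>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  thrust::device_vector<int> labels(1000);
  thrust::sequence(thrust::cuda::par.on(stream), labels.begin(), labels.end());  // producer on `stream`

  cudaStreamSynchronize(stream);  // analogous to the CUDA_TRY(cudaStreamSynchronize(...)) added above

  // consumer runs on the default stream (stand-in for the static_map construction)
  int sum = thrust::reduce(labels.begin(), labels.end());
  std::printf("sum = %d\n", sum);  // 499500

  cudaStreamDestroy(stream);
  return 0;
}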
4 changes: 2 additions & 2 deletions cpp/src/experimental/renumber_edgelist.cu
@@ -281,7 +281,7 @@ void expensive_check_edgelist(
comm_size,
row_comm_size,
col_comm_size}] __device__(auto edge) {
return key_func(thrust::get<0>(edge), thrust::get<1>(edge)) == comm_rank;
return key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != comm_rank;
}) == 0,
"Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be "
"pre-shuffled.");
@@ -447,7 +447,7 @@ renumber_edgelist(raft::handle_t const& handle,
handle.get_stream());
std::vector<size_t> recvcounts(row_comm_size);
for (int i = 0; i < row_comm_size; ++i) {
recvcounts[i] = partition.get_vertex_partition_size(row_comm_rank * row_comm_size + i);
recvcounts[i] = partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i);
}
std::vector<size_t> displacements(row_comm_size, 0);
std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1);
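In the second hunk above, the fix indexes the vertex partitions with col_comm_rank * row_comm_size + i instead of row_comm_rank * row_comm_size + i when building recvcounts. The displacements computed right after are an exclusive prefix sum of recvcounts; a small host-side illustration of that arithmetic with made-up counts:

// Toy illustration of the recvcounts/displacements computation used above
// (made-up counts; only the std::partial_sum pattern mirrors the diff).
#include <cstddef>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  std::vector<std::size_t> recvcounts{3, 5, 2, 4};  // per-rank vertex partition sizes
  std::vector<std::size_t> displacements(recvcounts.size(), 0);
  // exclusive prefix sum: rank i's partition starts right after partitions 0..i-1
  std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1);
  for (std::size_t i = 0; i < recvcounts.size(); ++i) {
    std::printf("rank %zu: count=%zu displacement=%zu\n", i, recvcounts[i], displacements[i]);
  }
  // displacements = 0, 3, 8, 10
  return 0;
}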
