Bug fixes for MNMG coarsen_graph, renumber_edgelist, relabel (rapidsai#1364)

Bug fixes for MNMG coarsen_graph, renumber_edgelist, relabel

Authors:
  - Seunghwa Kang (@seunghwak)

Approvers:
  - Andrei Schaffer (@aschaffer)
  - Rick Ratzel (@rlratzel)
  - Chuck Hastings (@ChuckHastings)
  - Alex Fender (@afender)

URL: rapidsai#1364
seunghwak committed Feb 2, 2021
1 parent 44b0679 commit 5813559
Showing 5 changed files with 51 additions and 24 deletions.
54 changes: 37 additions & 17 deletions cpp/include/utilities/shuffle_comm.cuh
@@ -50,15 +50,24 @@ rmm::device_uvector<size_t> sort_and_count(raft::comms::comms_t const &comm,
auto gpu_id_first = thrust::make_transform_iterator(
tx_value_first,
[value_to_gpu_id_op] __device__(auto value) { return value_to_gpu_id_op(value); });
rmm::device_uvector<int> d_tx_dst_ranks(comm_size, stream);
rmm::device_uvector<size_t> d_tx_value_counts(comm_size, stream);
thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream),
gpu_id_first,
gpu_id_first + thrust::distance(tx_value_first, tx_value_last),
thrust::make_constant_iterator(size_t{1}),
thrust::make_discard_iterator(),
d_tx_value_counts.begin());
std::vector<size_t> tx_value_counts(comm_size);
raft::update_host(tx_value_counts.data(), d_tx_value_counts.data(), comm_size, stream);
auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream),
gpu_id_first,
gpu_id_first + thrust::distance(tx_value_first, tx_value_last),
thrust::make_constant_iterator(size_t{1}),
d_tx_dst_ranks.begin(),
d_tx_value_counts.begin());
if (thrust::distance(d_tx_value_counts.begin(), thrust::get<1>(last)) < comm_size) {
rmm::device_uvector<size_t> d_counts(comm_size, stream);
thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0});
thrust::scatter(rmm::exec_policy(stream)->on(stream),
d_tx_value_counts.begin(),
thrust::get<1>(last),
d_tx_dst_ranks.begin(),
d_counts.begin());
d_tx_value_counts = std::move(d_counts);
}

return std::move(d_tx_value_counts);
}
@@ -83,13 +92,24 @@ rmm::device_uvector<size_t> sort_and_count(raft::comms::comms_t const &comm,

auto gpu_id_first = thrust::make_transform_iterator(
tx_key_first, [key_to_gpu_id_op] __device__(auto key) { return key_to_gpu_id_op(key); });
rmm::device_uvector<int> d_tx_dst_ranks(comm_size, stream);
rmm::device_uvector<size_t> d_tx_value_counts(comm_size, stream);
thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream),
gpu_id_first,
gpu_id_first + thrust::distance(tx_key_first, tx_key_last),
thrust::make_constant_iterator(size_t{1}),
thrust::make_discard_iterator(),
d_tx_value_counts.begin());
auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream),
gpu_id_first,
gpu_id_first + thrust::distance(tx_key_first, tx_key_last),
thrust::make_constant_iterator(size_t{1}),
d_tx_dst_ranks.begin(),
d_tx_value_counts.begin());
if (thrust::distance(d_tx_value_counts.begin(), thrust::get<1>(last)) < comm_size) {
rmm::device_uvector<size_t> d_counts(comm_size, stream);
thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0});
thrust::scatter(rmm::exec_policy(stream)->on(stream),
d_tx_value_counts.begin(),
thrust::get<1>(last),
d_tx_dst_ranks.begin(),
d_counts.begin());
d_tx_value_counts = std::move(d_counts);
}

return std::move(d_tx_value_counts);
}
@@ -191,7 +211,7 @@ auto shuffle_values(raft::comms::comms_t const &comm,

auto rx_value_buffer =
allocate_dataframe_buffer<typename std::iterator_traits<TxValueIterator>::value_type>(
rx_offsets.back(), stream);
rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream);

// FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released
// (if num_tx_dst_ranks == num_rx_src_ranks == comm_size).
@@ -234,7 +254,7 @@ auto sort_and_shuffle_values(raft::comms::comms_t const &comm,

auto rx_value_buffer =
allocate_dataframe_buffer<typename std::iterator_traits<ValueIterator>::value_type>(
rx_offsets.back() + rx_counts.back(), stream);
rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream);

// FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released
// (if num_tx_dst_ranks == num_rx_src_ranks == comm_size).
@@ -275,7 +295,7 @@ auto sort_and_shuffle_kv_pairs(raft::comms::comms_t const &comm,
detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream);

rmm::device_uvector<typename std::iterator_traits<VertexIterator>::value_type> rx_keys(
rx_offsets.back() + rx_counts.back(), stream);
rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream);
auto rx_value_buffer =
allocate_dataframe_buffer<typename std::iterator_traits<ValueIterator>::value_type>(
rx_keys.size(), stream);
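Note on the sort_and_count changes above: thrust::reduce_by_key emits one count per destination rank that actually appears, so when some ranks receive zero values the compacted counts no longer line up with rank indices. The fix keeps the distinct ranks in d_tx_dst_ranks and, when fewer than comm_size segments come back, scatters the compacted counts into a zero-filled vector of length comm_size. The snippet below is a minimal standalone sketch of that pattern with toy data and hypothetical names, not the cuGraph code itself:

// Sketch of the fixed counting pattern: reduce_by_key compacts away ranks that
// receive zero values, so the counts are scattered back into a zero-filled,
// comm_size-long vector indexed by destination rank.
#include <thrust/device_vector.h>
#include <thrust/distance.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/reduce.h>
#include <thrust/scatter.h>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  int const comm_size = 4;
  std::vector<int> h_dst_ranks{0, 0, 1, 3, 3, 3};  // sorted by destination rank; rank 2 gets nothing
  thrust::device_vector<int> dst_ranks(h_dst_ranks.begin(), h_dst_ranks.end());

  thrust::device_vector<int> unique_ranks(comm_size);
  thrust::device_vector<size_t> compact_counts(comm_size);
  auto last = thrust::reduce_by_key(dst_ranks.begin(),
                                    dst_ranks.end(),
                                    thrust::make_constant_iterator(size_t{1}),
                                    unique_ranks.begin(),
                                    compact_counts.begin());

  thrust::device_vector<size_t> counts(comm_size, size_t{0});
  if (thrust::distance(compact_counts.begin(), last.second) < comm_size) {
    // some ranks are missing: place each compacted count at its rank's slot
    thrust::scatter(compact_counts.begin(), last.second, unique_ranks.begin(), counts.begin());
  } else {
    counts = compact_counts;
  }

  for (int i = 0; i < comm_size; ++i) {
    std::printf("rank %d: %zu values\n", i, static_cast<size_t>(counts[i]));  // 2, 1, 0, 3
  }
  return 0;
}

The same zero-count concern motivates the other changes in this file: the receive buffers are now sized with rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, so an empty rx_offsets vector is never dereferenced via back().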
8 changes: 6 additions & 2 deletions cpp/src/experimental/coarsen_graph.cu
@@ -394,8 +394,12 @@ coarsen_graph(
// 4. renumber

rmm::device_uvector<vertex_t> renumber_map_labels(0, handle.get_stream());
partition_t<vertex_t> partition(
std::vector<vertex_t>{}, graph_view.is_hypergraph_partitioned(), 0, 0, 0, 0);
partition_t<vertex_t> partition(std::vector<vertex_t>(comm_size + 1, 0),
graph_view.is_hypergraph_partitioned(),
row_comm_size,
col_comm_size,
row_comm_rank,
col_comm_rank);
vertex_t number_of_vertices{};
edge_t number_of_edges{};
std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) =
6 changes: 3 additions & 3 deletions cpp/src/experimental/graph.cu
@@ -238,9 +238,9 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
}
number_of_local_edges_sum =
host_scalar_allreduce(comm, number_of_local_edges_sum, default_stream);
CUGRAPH_EXPECTS(number_of_local_edges_sum == this->get_number_of_edges(),
"Invalid input argument: the sum of local edges doe counts not match with "
"number_of_local_edges.");
CUGRAPH_EXPECTS(
number_of_local_edges_sum == this->get_number_of_edges(),
"Invalid input argument: the sum of local edge counts does not match with number_of_edges.");

CUGRAPH_EXPECTS(
partition.get_vertex_partition_last(comm_size - 1) == number_of_vertices,
3 changes: 3 additions & 0 deletions cpp/src/experimental/relabel.cu
@@ -116,6 +116,9 @@ void relabel(raft::handle_t const& handle,

// update intermediate relabel map

CUDA_TRY(cudaStreamSynchronize(
handle.get_stream())); // cuco::static_map currently does not take stream

cuco::static_map<vertex_t, vertex_t> relabel_map{
static_cast<size_t>(static_cast<double>(rx_label_pair_old_labels.size()) / load_factor),
invalid_vertex_id<vertex_t>::value,
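The synchronization added above is needed because the relabel pairs are produced on handle.get_stream(), while, as the new comment notes, cuco::static_map does not take a stream and runs on the default stream. A generic sketch of this sync-before-handoff pattern, using a plain Thrust reduction as a stand-in for the map construction (toy example, not the cuGraph code):

// Sketch: results produced asynchronously on a side stream must be made visible
// before a stream-unaware consumer (here, a reduce on the default stream) runs.
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>
#include <thrust/system/cuda/execution_policy.h>
#include <cstdio>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  thrust::device_vector<int> labels(1000);
  thrust::sequence(thrust::cuda::par.on(stream), labels.begin(), labels.end());  // producer on `stream`

  cudaStreamSynchronize(stream);  // analogous to the CUDA_TRY(cudaStreamSynchronize(...)) added above

  // consumer runs on the default stream (stand-in for the static_map construction)
  int sum = thrust::reduce(labels.begin(), labels.end());
  std::printf("sum = %d\n", sum);  // 499500

  cudaStreamDestroy(stream);
  return 0;
}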
4 changes: 2 additions & 2 deletions cpp/src/experimental/renumber_edgelist.cu
@@ -281,7 +281,7 @@ void expensive_check_edgelist(
comm_size,
row_comm_size,
col_comm_size}] __device__(auto edge) {
return key_func(thrust::get<0>(edge), thrust::get<1>(edge)) == comm_rank;
return key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != comm_rank;
}) == 0,
"Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be "
"pre-shuffled.");
@@ -447,7 +447,7 @@ renumber_edgelist(raft::handle_t const& handle,
handle.get_stream());
std::vector<size_t> recvcounts(row_comm_size);
for (int i = 0; i < row_comm_size; ++i) {
recvcounts[i] = partition.get_vertex_partition_size(row_comm_rank * row_comm_size + i);
recvcounts[i] = partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i);
}
std::vector<size_t> displacements(row_comm_size, 0);
std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1);
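In the second hunk above, the fix indexes the vertex partitions with col_comm_rank * row_comm_size + i instead of row_comm_rank * row_comm_size + i when building recvcounts. The displacements computed right after are an exclusive prefix sum of recvcounts; a small host-side illustration of that arithmetic with made-up counts:

// Toy illustration of the recvcounts/displacements computation used above
// (made-up counts; only the std::partial_sum pattern mirrors the diff).
#include <cstddef>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  std::vector<std::size_t> recvcounts{3, 5, 2, 4};  // per-rank vertex partition sizes
  std::vector<std::size_t> displacements(recvcounts.size(), 0);
  // exclusive prefix sum: rank i's partition starts right after partitions 0..i-1
  std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1);
  for (std::size_t i = 0; i < recvcounts.size(); ++i) {
    std::printf("rank %zu: count=%zu displacement=%zu\n", i, recvcounts[i], displacements[i]);
  }
  // displacements = 0, 3, 8, 10
  return 0;
}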
