Merge pull request rapidsai#1354 from seunghwak/fea_induced_subgraph
Implement induced subgraph extraction (SG C++)
BradReesWork committed Jan 26, 2021
2 parents 4f35bcb + a46f863 commit 9820990
Showing 23 changed files with 964 additions and 135 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -19,6 +19,7 @@
- PR #1279 Add self loop check variable in graph
- PR #1277 SciPy sparse matrix input support for WCC, SCC, SSSP, and BFS
- PR #1278 Add support for shortest_path_length and fix graph vertex checks
- PR #1280 Add Multi(Di)Graph support

## Improvements
- PR #1227 Pin cmake policies to cmake 3.17 version
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -370,6 +370,7 @@ add_library(cugraph SHARED
src/experimental/coarsen_graph.cu
src/experimental/renumber_edgelist.cu
src/experimental/relabel.cu
src/experimental/induced_subgraph.cu
src/experimental/bfs.cu
src/experimental/sssp.cu
src/experimental/pagerank.cu
44 changes: 44 additions & 0 deletions cpp/include/experimental/graph_functions.hpp
@@ -243,5 +243,49 @@ void relabel(raft::handle_t const& handle,
vertex_t num_labels,
bool do_expensive_check = false);

/**
* @brief extract induced subgraph(s).
*
* @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
* @tparam edge_t Type of edge identifiers. Needs to be an integral type.
* @tparam weight_t Type of edge weights.
* @tparam store_transposed Flag indicating whether to store the graph adjacency matrix as is or as
* transposed.
* @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
* or multi-GPU (true).
* @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
* handles to various CUDA libraries) to run graph algorithms.
* @param graph_view Graph view object, we extract induced subgraphs from @p graph_view.
* @param subgraph_offsets Pointer to subgraph vertex offsets (size == @p num_subgraphs + 1).
* @param subgraph_vertices Pointer to subgraph vertices (size == @p subgraph_offsets[@p
* num_subgraphs]). The elements of @p subgraph_vertices for each subgraph should be sorted in
* ascending order and unique.
* @param num_subgraphs Number of induced subgraphs to extract.
* @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
* @return std::tuple&lt;rmm::device_uvector&lt;vertex_t&gt;, rmm::device_uvector&lt;vertex_t&gt;,
* rmm::device_uvector&lt;weight_t&gt;, rmm::device_uvector&lt;size_t&gt;&gt; Quadruplet of edge major (destination
* if @p store_transposed is true, source otherwise) vertices, edge minor (source if @p
* store_transposed is true, destination otherwise) vertices, edge weights, and edge offsets for
* each induced subgraph (size == num_subgraphs + 1). The sizes of the edge major & minor vertices
* are edge_offsets[num_subgraphs]. The size of the edge weights is either
* edge_offsets[num_subgraphs] (if @p graph_view is weighted) or 0 (if @p graph_view is unweighted).
*/
template <typename vertex_t,
typename edge_t,
typename weight_t,
bool store_transposed,
bool multi_gpu>
std::tuple<rmm::device_uvector<vertex_t>,
rmm::device_uvector<vertex_t>,
rmm::device_uvector<weight_t>,
rmm::device_uvector<size_t>>
extract_induced_subgraphs(
raft::handle_t const& handle,
graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu> const& graph_view,
size_t const* subgraph_offsets /* size == num_subgraphs + 1 */,
vertex_t const* subgraph_vertices /* size == subgraph_offsets[num_subgraphs] */,
size_t num_subgraphs,
bool do_expensive_check = false);

} // namespace experimental
} // namespace cugraph
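
For context, a minimal sketch of how the new API might be called (single-GPU, non-transposed). The demo function, include paths, and host-to-device staging below are illustrative assumptions, not part of this diff:

#include <experimental/graph_functions.hpp>
#include <raft/cudart_utils.h>
#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>
#include <vector>

void demo(raft::handle_t const& handle,
          cugraph::experimental::graph_view_t<int32_t, int32_t, float, false, false> const&
            graph_view)
{
  // Two subgraphs packed CSR-style: subgraph 0 = {0, 2, 5}, subgraph 1 = {1, 3}.
  // Vertices within each subgraph must be sorted in ascending order and unique.
  std::vector<size_t> h_offsets{0, 3, 5};
  std::vector<int32_t> h_vertices{0, 2, 5, 1, 3};

  rmm::device_uvector<size_t> d_offsets(h_offsets.size(), handle.get_stream());
  rmm::device_uvector<int32_t> d_vertices(h_vertices.size(), handle.get_stream());
  raft::update_device(d_offsets.data(), h_offsets.data(), h_offsets.size(), handle.get_stream());
  raft::update_device(
    d_vertices.data(), h_vertices.data(), h_vertices.size(), handle.get_stream());

  auto result = cugraph::experimental::extract_induced_subgraphs(
    handle, graph_view, d_offsets.data(), d_vertices.data(), size_t{2}, true);
  // std::get<3>(result) holds the per-subgraph edge offsets: the edges of
  // subgraph i occupy [offsets[i], offsets[i+1]) in the returned major vertex,
  // minor vertex, and (if weighted) weight arrays.
}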
20 changes: 0 additions & 20 deletions cpp/include/patterns/copy_to_adj_matrix_row_col.cuh
@@ -175,12 +175,6 @@ void copy_to_matrix_major(raft::handle_t const& handle,
map_first,
matrix_major_value_output_first);
}

CUDA_TRY(cudaStreamSynchronize(
handle.get_stream())); // this is as necessary rx_tmp_buffer will become out-of-scope
// once control flow exits this block (FIXME: we can reduce stream
// synchronization if we compute the maximum rx_counts and
// allocate rx_tmp_buffer outside the loop)
}
}
} else {
@@ -370,10 +364,6 @@ void copy_to_matrix_minor(raft::handle_t const& handle,
rx_count,
comm_src_rank,
handle.get_stream());

CUDA_TRY(cudaStreamSynchronize(
handle.get_stream())); // this is as necessary src_tmp_buffer will become out-of-scope
// once control flow exits this block
}

// FIXME: now we can clear tx_tmp_buffer
@@ -424,17 +414,7 @@
map_first,
matrix_minor_value_output_first);
}

CUDA_TRY(cudaStreamSynchronize(
handle.get_stream())); // this is as necessary rx_tmp_buffer will become out-of-scope
// once control flow exits this block (FIXME: we can reduce stream
// synchronization if we compute the maximum rx_counts and
// allocate rx_tmp_buffer outside the loop)
}

CUDA_TRY(cudaStreamSynchronize(
handle.get_stream())); // this is as necessary dst_tmp_buffer will become out-of-scope once
// control flow exits this block
}
} else {
assert(graph_view.get_number_of_local_vertices() ==
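
A note on the deletions in this file (the same pattern recurs in several of the files below): each removed CUDA_TRY(cudaStreamSynchronize(...)) guarded a temporary device buffer that goes out of scope while asynchronous work using it may still be pending on the stream. A minimal sketch of that hazard, using hypothetical names and the RAPIDS-0.18-era rmm::exec_policy style seen elsewhere in this diff:

#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/thrust_rmm_allocator.h>  // rmm::exec_policy
#include <thrust/copy.h>
#include <cstddef>

void copy_through_tmp(raft::handle_t const& handle, int const* d_src, int* d_dst, std::size_t n)
{
  {
    rmm::device_uvector<int> tmp(n, handle.get_stream());
    // Both copies are merely enqueued on handle.get_stream(); the host does not wait.
    thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
                 d_src, d_src + n, tmp.begin());
    thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
                 tmp.begin(), tmp.end(), d_dst);
    // Before this commit a CUDA_TRY(cudaStreamSynchronize(handle.get_stream()))
    // sat here so tmp could not be freed while the copies were still in flight.
  }  // ~device_uvector() releases tmp's memory on handle.get_stream(); with a
     // stream-ordered memory resource the free is ordered after the copies above,
     // which is presumably what makes the explicit synchronizations removable.
}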
10 changes: 0 additions & 10 deletions cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh
@@ -525,12 +525,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
handle.get_stream());
}
}

CUDA_TRY(cudaStreamSynchronize(
handle.get_stream())); // this is as necessary major_tmp_buffer will become out-of-scope once
// control flow exits this block (FIXME: we can reduce stream
// synchronization if we compute the maximum major_tmp_buffer_size and
// allocate major_tmp_buffer outside the loop)
}

if (GraphViewType::is_multi_gpu && (in != GraphViewType::is_adj_matrix_transposed)) {
@@ -592,10 +586,6 @@
}
}
}

CUDA_TRY(cudaStreamSynchronize(
handle.get_stream())); // this is as necessary minor_tmp_buffer will become out-of-scope once
// control flow exits this block
}

} // namespace detail
cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
@@ -383,9 +383,6 @@ void copy_v_transform_reduce_key_aggregated_out_nbr(
tmp_major_vertices = std::move(rx_major_vertices);
tmp_minor_keys = std::move(rx_minor_keys);
tmp_key_aggregated_edge_weights = std::move(rx_key_aggregated_edge_weights);

CUDA_TRY(
cudaStreamSynchronize(handle.get_stream())); // tx_value_counts will become out-of-scope
}

auto tmp_e_op_result_buffer =
@@ -464,18 +461,9 @@
major_vertices = std::move(rx_major_vertices);
e_op_result_buffer = std::move(rx_tmp_e_op_result_buffer);
}

CUDA_TRY(cudaStreamSynchronize(
handle
.get_stream())); // tmp_minor_keys, tmp_key_aggregated_edge_weights, rx_major_vertices,
// and rx_tmp_e_op_result_buffer will become out-of-scope
} else {
major_vertices = std::move(tmp_major_vertices);
e_op_result_buffer = std::move(tmp_e_op_result_buffer);

CUDA_TRY(cudaStreamSynchronize(
handle.get_stream())); // tmp_minor_keys and tmp_key_aggregated_edge_weights will become
// out-of-scope
}
}

cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh
@@ -254,10 +254,6 @@ transform_reduce_by_adj_matrix_row_col_key_e(

tmp_keys = std::move(rx_unique_keys);
tmp_value_buffer = std::move(rx_value_for_unique_key_buffer);

CUDA_TRY(cudaStreamSynchronize(
handle
.get_stream())); // unique_keys & value_for_unique_key_buffer will become out-of-scope
}

auto cur_size = keys.size();
9 changes: 5 additions & 4 deletions cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh
@@ -156,7 +156,7 @@ size_t reduce_buffer_elements(raft::handle_t const& handle,
// FIXME: actually, we can find how many unique keys are here by now.
// FIXME: if GraphViewType::is_multi_gpu is true, this should be executed on the GPU holding the
// vertex unless reduce_op is a pure function.
rmm::device_vector<key_t> keys(num_buffer_elements);
rmm::device_uvector<key_t> keys(num_buffer_elements, handle.get_stream());
rmm::device_vector<payload_t> values(num_buffer_elements);
auto it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
buffer_key_output_first,
@@ -176,9 +176,10 @@
values.begin(),
values.begin() + num_reduced_buffer_elements,
buffer_payload_output_first);
CUDA_TRY(cudaStreamSynchronize(
handle.get_stream())); // this is necessary as kyes & values will become out-of-scope once
// this function returns
// FIXME: this is unnecessary if we use a tuple of rmm::device_uvector objects for values
CUDA_TRY(
cudaStreamSynchronize(handle.get_stream())); // this is necessary as values will become
// out-of-scope once this function returns
return num_reduced_buffer_elements;
}
}
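
The switch above from rmm::device_vector to rmm::device_uvector for keys is more than a rename: device_vector is Thrust-backed and default-initializes every element on construction, while device_uvector performs a stream-ordered allocation and leaves the memory uninitialized, which is all the subsequent reduce_by_key needs. The final synchronization stays because values is still a device_vector that is destroyed on return, as the new FIXME notes. A schematic contrast (n and stream are hypothetical):

rmm::device_vector<int> v(n);           // Thrust-backed: allocates and default-
                                        // initializes all n elements
rmm::device_uvector<int> u(n, stream);  // stream-ordered allocation only; element
                                        // memory is left uninitialized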
13 changes: 0 additions & 13 deletions cpp/src/experimental/coarsen_graph.cu
@@ -120,10 +120,6 @@ void sort_and_coarsen_edgelist(rmm::device_uvector<vertex_t> &edgelist_major_vertices,
tmp_edgelist_weights.begin());
number_of_edges = thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it));

CUDA_TRY(cudaStreamSynchronize(
stream)); // memory blocks owned by edgelist_(major_vertices,minor_vertices,weights) will be
// freed after the assignments below

edgelist_major_vertices = std::move(tmp_edgelist_major_vertices);
edgelist_minor_vertices = std::move(tmp_edgelist_minor_vertices);
edgelist_weights = std::move(tmp_edgelist_weights);
@@ -313,10 +309,6 @@ coarsen_graph(
src_edge_first + edgelist_major_vertices.size(),
dst_edge_first);
}

CUDA_TRY(cudaStreamSynchronize(
handle.get_stream())); // edgelist_(major_vertices,minor_vertices,weights)
// will become out-of-scope
}

sort_and_coarsen_edgelist(coarsened_edgelist_major_vertices,
@@ -355,11 +347,6 @@ coarsen_graph(
rx_edgelist_weights,
handle.get_stream());

CUDA_TRY(cudaStreamSynchronize(
handle.get_stream())); // memory blocks owned by
// coarsened_edgelist_(major_vertices,minor_vertices,weights)
// will be freed after the assignments below

coarsened_edgelist_major_vertices = std::move(rx_edgelist_major_vertices);
coarsened_edgelist_minor_vertices = std::move(rx_edgelist_minor_vertices);
coarsened_edgelist_weights = std::move(rx_edgelist_weights);
4 changes: 1 addition & 3 deletions cpp/src/experimental/graph.cu
@@ -470,9 +470,7 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
segment_offsets_.data(), segment_offsets.data(), segment_offsets.size(), default_stream);

CUDA_TRY(cudaStreamSynchronize(
default_stream)); // this is necessary as d_thresholds and segment_offsets will become
// out-of-scpe once control flow exits this block and segment_offsets_ can
// be used right after return.
default_stream)); // this is necessary as segment_offsets_ can be used right after return.
}

// optional expensive checks (part 3/3)