diff --git a/ci/gpu/notebook_list.py b/ci/gpu/notebook_list.py
new file mode 100644
index 00000000000..bb54913ac8d
--- /dev/null
+++ b/ci/gpu/notebook_list.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+import sys
+import glob
+
+from numba import cuda
+
+#
+# Not strictly true... however what we mean is
+# Pascal or earlier
+#
+pascal = False
+
+device = cuda.get_current_device()
+cc = getattr(device, 'COMPUTE_CAPABILITY')
+if (cc[0] < 7):
+    pascal = True
+
+for filename in glob.iglob('**/*.ipynb', recursive=True):
+    skip = False
+    for line in open(filename, 'r'):
+        if re.search('# Skip notebook test', line):
+            skip = True
+            print(f'SKIPPING {filename} (marked as skip)', file=sys.stderr)
+            break;
+        elif re.search('dask', line):
+            print(f'SKIPPING {filename} (suspected Dask usage, not currently automatable)', file=sys.stderr)
+            skip = True
+            break;
+        elif pascal and re.search('# Does not run on Pascal', line):
+            print(f'SKIPPING {filename} (does not run on Pascal)', file=sys.stderr)
+            skip = True
+            break;
+
+    if not skip:
+        print(filename)
diff --git a/ci/gpu/test-notebooks.sh b/ci/gpu/test-notebooks.sh
index 389d3be0bfd..f5f768d7f12 100755
--- a/ci/gpu/test-notebooks.sh
+++ b/ci/gpu/test-notebooks.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -20,11 +20,6 @@ LIBCUDF_KERNEL_CACHE_PATH=${WORKSPACE}/.jitcache
 
 cd ${NOTEBOOKS_DIR}
 TOPLEVEL_NB_FOLDERS=$(find . -name *.ipynb |cut -d'/' -f2|sort -u)
 
-# Add notebooks that should be skipped here
-# (space-separated list of filenames without paths)
-
-SKIPNBS="uvm.ipynb bfs_benchmark.ipynb louvain_benchmark.ipynb pagerank_benchmark.ipynb sssp_benchmark.ipynb release.ipynb nx_cugraph_bc_benchmarking.ipynb"
-
 ## Check env
 env
@@ -37,26 +32,14 @@ for folder in ${TOPLEVEL_NB_FOLDERS}; do
     echo "FOLDER: ${folder}"
    echo "========================================"
    cd ${NOTEBOOKS_DIR}/${folder}
-    for nb in $(find . -name "*.ipynb"); do
+    for nb in $(python ${WORKSPACE}/ci/gpu/notebook_list.py); do
         nbBasename=$(basename ${nb})
-        # Skip all NBs that use dask (in the code or even in their name)
-        if ((echo ${nb}|grep -qi dask) || \
-            (grep -q dask ${nb})); then
-            echo "--------------------------------------------------------------------------------"
-            echo "SKIPPING: ${nb} (suspected Dask usage, not currently automatable)"
-            echo "--------------------------------------------------------------------------------"
-        elif (echo " ${SKIPNBS} " | grep -q " ${nbBasename} "); then
-            echo "--------------------------------------------------------------------------------"
-            echo "SKIPPING: ${nb} (listed in skip list)"
-            echo "--------------------------------------------------------------------------------"
-        else
-            cd $(dirname ${nb})
-            nvidia-smi
-            ${NBTEST} ${nbBasename}
-            EXITCODE=$((EXITCODE | $?))
-            rm -rf ${LIBCUDF_KERNEL_CACHE_PATH}/*
-            cd ${NOTEBOOKS_DIR}/${folder}
-        fi
+        cd $(dirname ${nb})
+        nvidia-smi
+        ${NBTEST} ${nbBasename}
+        EXITCODE=$((EXITCODE | $?))
+        rm -rf ${LIBCUDF_KERNEL_CACHE_PATH}/*
+        cd ${NOTEBOOKS_DIR}/${folder}
     done
 done
diff --git a/ci/test.sh b/ci/test.sh
index db9390461c0..c173088862d 100755
--- a/ci/test.sh
+++ b/ci/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -64,7 +64,7 @@ else
     cd $WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build
 fi
 
-for gt in gtests/*; do
+for gt in tests/*_TEST; do
     test_name=$(basename $gt)
     echo "Running GoogleTest $test_name"
     ${gt} ${GTEST_FILTER} ${GTEST_ARGS}
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 19bfd24a591..c17bed0a902 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -351,7 +351,7 @@ add_library(cugraph SHARED
     src/community/louvain.cu
     src/community/leiden.cu
     src/community/ktruss.cu
-    src/community/ECG.cu
+    src/community/ecg.cu
     src/community/triangles_counting.cu
     src/community/extract_subgraph_by_vertex.cu
     src/cores/core_number.cu
diff --git a/cpp/src/community/dendrogram.cuh b/cpp/src/community/dendrogram.cuh
new file mode 100644
index 00000000000..414f5f3854d
--- /dev/null
+++ b/cpp/src/community/dendrogram.cuh
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#pragma once + +#include + +#include +#include + +namespace cugraph { + +template +class Dendrogram { + public: + void add_level(vertex_t num_verts, + cudaStream_t stream = 0, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + { + level_ptr_.push_back( + std::make_unique(num_verts * sizeof(vertex_t), stream, mr)); + level_size_.push_back(num_verts); + } + + size_t current_level() const { return level_size_.size() - 1; } + + size_t num_levels() const { return level_size_.size(); } + + vertex_t const *get_level_ptr_nocheck(size_t level) const + { + return static_cast(level_ptr_[level]->data()); + } + + vertex_t *get_level_ptr_nocheck(size_t level) + { + return static_cast(level_ptr_[level]->data()); + } + + vertex_t get_level_size_nocheck(size_t level) const { return level_size_[level]; } + + vertex_t const *current_level_begin() const { return get_level_ptr_nocheck(current_level()); } + + vertex_t const *current_level_end() const { return current_level_begin() + current_level_size(); } + + vertex_t *current_level_begin() { return get_level_ptr_nocheck(current_level()); } + + vertex_t *current_level_end() { return current_level_begin() + current_level_size(); } + + vertex_t current_level_size() const { return get_level_size_nocheck(current_level()); } + + private: + std::vector level_size_; + std::vector> level_ptr_; +}; + +} // namespace cugraph diff --git a/cpp/src/community/ECG.cu b/cpp/src/community/ecg.cu similarity index 72% rename from cpp/src/community/ECG.cu rename to cpp/src/community/ecg.cu index ea21f87ff7e..994204ecd32 100644 --- a/cpp/src/community/ECG.cu +++ b/cpp/src/community/ecg.cu @@ -15,13 +15,15 @@ */ #include +#include +#include +#include +#include #include #include -#include + #include -#include -#include "utilities/graph_utils.cuh" namespace { template @@ -41,26 +43,23 @@ binsearch_maxle(const IndexType *vec, const IndexType val, IndexType low, IndexT } } +// FIXME: This shouldn't need to be a custom kernel, this +// seems like it should just be a thrust::transform template -__global__ void match_check_kernel(IdxT size, - IdxT num_verts, - IdxT *offsets, - IdxT *indices, - IdxT *permutation, - IdxT *parts, - ValT *weights) +__global__ void match_check_kernel( + IdxT size, IdxT num_verts, IdxT *offsets, IdxT *indices, IdxT *parts, ValT *weights) { IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; while (tid < size) { IdxT source = binsearch_maxle(offsets, tid, (IdxT)0, num_verts); IdxT dest = indices[tid]; - if (parts[permutation[source]] == parts[permutation[dest]]) weights[tid] += 1; + if (parts[source] == parts[dest]) weights[tid] += 1; tid += gridDim.x * blockDim.x; } } struct prg { - __host__ __device__ float operator()(int n) + __device__ float operator()(int n) { thrust::default_random_engine rng; thrust::uniform_real_distribution dist(0.0, 1.0); @@ -93,7 +92,7 @@ struct update_functor { template void get_permutation_vector(T size, T seed, T *permutation, cudaStream_t stream) { - rmm::device_vector randoms_v(size); + rmm::device_uvector randoms_v(size, stream); thrust::counting_iterator index(seed); thrust::transform( @@ -103,6 +102,31 @@ void get_permutation_vector(T size, T seed, T *permutation, cudaStream_t stream) rmm::exec_policy(stream)->on(stream), randoms_v.begin(), randoms_v.end(), permutation); } +template +class EcgLouvain : public cugraph::Louvain { + public: + using graph_t = graph_type; + using vertex_t = typename graph_type::vertex_type; + using edge_t = typename graph_type::edge_type; + using weight_t = 
typename graph_type::weight_type; + + EcgLouvain(raft::handle_t const &handle, graph_type const &graph, vertex_t seed) + : cugraph::Louvain(handle, graph), seed_(seed) + { + } + + void initialize_dendrogram_level(vertex_t num_vertices) override + { + this->dendrogram_->add_level(num_vertices); + + get_permutation_vector( + num_vertices, seed_, this->dendrogram_->current_level_begin(), this->stream_); + } + + private: + vertex_t seed_; +}; + } // anonymous namespace namespace cugraph { @@ -114,37 +138,34 @@ void ecg(raft::handle_t const &handle, vertex_t ensemble_size, vertex_t *clustering) { + using graph_type = GraphCSRView; + CUGRAPH_EXPECTS(graph.edge_data != nullptr, - "Invalid input argument: louvain expects a weighted graph"); - CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is NULL"); + "Invalid input argument: ecg expects a weighted graph"); + CUGRAPH_EXPECTS(clustering != nullptr, + "Invalid input argument: clustering is NULL, should be a device pointer to " + "memory for storing the result"); cudaStream_t stream{0}; - rmm::device_vector ecg_weights_v(graph.edge_data, - graph.edge_data + graph.number_of_edges); + rmm::device_uvector ecg_weights_v(graph.number_of_edges, handle.get_stream()); + + thrust::copy(rmm::exec_policy(stream)->on(stream), + graph.edge_data, + graph.edge_data + graph.number_of_edges, + ecg_weights_v.data()); vertex_t size{graph.number_of_vertices}; - vertex_t seed{1}; - auto permuted_graph = std::make_unique>( - size, graph.number_of_edges, graph.has_data()); + // FIXME: This seed should be a parameter + vertex_t seed{1}; // Iterate over each member of the ensemble for (vertex_t i = 0; i < ensemble_size; i++) { - // Take random permutation of the graph - rmm::device_vector permutation_v(size); - vertex_t *d_permutation = permutation_v.data().get(); - - get_permutation_vector(size, seed, d_permutation, stream); + EcgLouvain runner(handle, graph, seed); seed += size; - detail::permute_graph(graph, d_permutation, permuted_graph->view()); - - // Run one level of Louvain clustering on the random permutation - rmm::device_vector parts_v(size); - vertex_t *d_parts = parts_v.data().get(); - - cugraph::louvain(handle, permuted_graph->view(), d_parts, size_t{1}); + weight_t wt = runner(size_t{1}, weight_t{1}); // For each edge in the graph determine whether the endpoints are in the same partition // Keep a sum for each edge of the total number of times its endpoints are in the same partition @@ -155,17 +176,16 @@ void ecg(raft::handle_t const &handle, graph.number_of_vertices, graph.offsets, graph.indices, - permutation_v.data().get(), - d_parts, - ecg_weights_v.data().get()); + runner.get_dendrogram().get_level_ptr_nocheck(0), + ecg_weights_v.data()); } // Set weights = min_weight + (1 - min-weight)*sum/ensemble_size update_functor uf(min_weight, ensemble_size); thrust::transform(rmm::exec_policy(stream)->on(stream), - ecg_weights_v.data().get(), - ecg_weights_v.data().get() + graph.number_of_edges, - ecg_weights_v.data().get(), + ecg_weights_v.begin(), + ecg_weights_v.end(), + ecg_weights_v.begin(), uf); // Run Louvain on the original graph using the computed weights @@ -173,7 +193,7 @@ void ecg(raft::handle_t const &handle, GraphCSRView louvain_graph; louvain_graph.indices = graph.indices; louvain_graph.offsets = graph.offsets; - louvain_graph.edge_data = ecg_weights_v.data().get(); + louvain_graph.edge_data = ecg_weights_v.data(); louvain_graph.number_of_vertices = graph.number_of_vertices; louvain_graph.number_of_edges = 
graph.number_of_edges; diff --git a/cpp/src/community/flatten_dendrogram.cuh b/cpp/src/community/flatten_dendrogram.cuh new file mode 100644 index 00000000000..892fe2d1c51 --- /dev/null +++ b/cpp/src/community/flatten_dendrogram.cuh @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include + +namespace cugraph { + +template +void partition_at_level(raft::handle_t const &handle, + Dendrogram const &dendrogram, + vertex_t const *d_vertex_ids, + vertex_t *d_partition, + size_t level) +{ + vertex_t local_num_verts = dendrogram.get_level_size_nocheck(0); + + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_vertex_ids, + d_vertex_ids + local_num_verts, + d_partition); + + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(level), + [&handle, &dendrogram, d_vertex_ids, &d_partition, local_num_verts](size_t l) { + cugraph::experimental::relabel( + handle, + std::tuple( + d_vertex_ids, dendrogram.get_level_ptr_nocheck(l)), + dendrogram.get_level_size_nocheck(l), + d_partition, + local_num_verts); + }); +} + +} // namespace cugraph diff --git a/cpp/src/community/leiden.cu b/cpp/src/community/leiden.cu index 9e5a847cdf0..427e62d3286 100644 --- a/cpp/src/community/leiden.cu +++ b/cpp/src/community/leiden.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,11 @@ * limitations under the License. 
*/ +#include #include +#include + namespace cugraph { template @@ -27,11 +30,29 @@ std::pair leiden(raft::handle_t const &handle, { CUGRAPH_EXPECTS(graph.edge_data != nullptr, "Invalid input argument: leiden expects a weighted graph"); - CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); + CUGRAPH_EXPECTS(clustering != nullptr, + "Invalid input argument: clustering is null, should be a device pointer to " + "memory for storing the result"); Leiden> runner(handle, graph); + weight_t wt = runner(max_level, resolution); + + rmm::device_uvector vertex_ids_v(graph.number_of_vertices, handle.get_stream()); + + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(0), // MNMG - base vertex id + thrust::make_counting_iterator( + graph.number_of_vertices), // MNMG - base vertex id + number_of_vertices + vertex_ids_v.begin()); + + partition_at_level(handle, + runner.get_dendrogram(), + vertex_ids_v.data(), + clustering, + runner.get_dendrogram().num_levels()); - return runner(clustering, max_level, resolution); + // FIXME: Consider returning the Dendrogram at some point + return std::make_pair(runner.get_dendrogram().num_levels(), wt); } // Explicit template instantations diff --git a/cpp/src/community/leiden.cuh b/cpp/src/community/leiden.cuh index f2f84433284..141f8beac40 100644 --- a/cpp/src/community/leiden.cuh +++ b/cpp/src/community/leiden.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #include +#include + namespace cugraph { template @@ -28,7 +30,8 @@ class Leiden : public Louvain { using weight_t = typename graph_type::weight_type; Leiden(raft::handle_t const &handle, graph_type const &graph) - : Louvain(handle, graph), constraint_v_(graph.number_of_vertices) + : Louvain(handle, graph), + constraint_v_(graph.number_of_vertices, handle.get_stream()) { } @@ -38,22 +41,28 @@ class Leiden : public Louvain { { this->timer_start("update_clustering_constrained"); - rmm::device_vector next_cluster_v(this->cluster_v_); - rmm::device_vector delta_Q_v(graph.number_of_edges); - rmm::device_vector cluster_hash_v(graph.number_of_edges); - rmm::device_vector old_cluster_sum_v(graph.number_of_vertices); + rmm::device_uvector next_cluster_v(this->dendrogram_->current_level_size(), + this->stream_); + rmm::device_uvector delta_Q_v(graph.number_of_edges, this->stream_); + rmm::device_uvector cluster_hash_v(graph.number_of_edges, this->stream_); + rmm::device_uvector old_cluster_sum_v(graph.number_of_vertices, this->stream_); - vertex_t const *d_src_indices = this->src_indices_v_.data().get(); + vertex_t const *d_src_indices = this->src_indices_v_.data(); vertex_t const *d_dst_indices = graph.indices; - vertex_t *d_cluster_hash = cluster_hash_v.data().get(); - vertex_t *d_cluster = this->cluster_v_.data().get(); - weight_t const *d_vertex_weights = this->vertex_weights_v_.data().get(); - weight_t *d_cluster_weights = this->cluster_weights_v_.data().get(); - weight_t *d_delta_Q = delta_Q_v.data().get(); - vertex_t *d_constraint = constraint_v_.data().get(); + vertex_t *d_cluster_hash = cluster_hash_v.data(); + vertex_t *d_cluster = this->dendrogram_->current_level_begin(); + weight_t const *d_vertex_weights = this->vertex_weights_v_.data(); + weight_t *d_cluster_weights = this->cluster_weights_v_.data(); 
+ weight_t *d_delta_Q = delta_Q_v.data(); + vertex_t *d_constraint = constraint_v_.data(); + + thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_), + this->dendrogram_->current_level_begin(), + this->dendrogram_->current_level_end(), + next_cluster_v.data()); - weight_t new_Q = - this->modularity(total_edge_weight, resolution, graph, this->cluster_v_.data().get()); + weight_t new_Q = this->modularity( + total_edge_weight, resolution, graph, this->dendrogram_->current_level_begin()); weight_t cur_Q = new_Q - 1; @@ -83,13 +92,13 @@ class Leiden : public Louvain { up_down = !up_down; - new_Q = this->modularity(total_edge_weight, resolution, graph, next_cluster_v.data().get()); + new_Q = this->modularity(total_edge_weight, resolution, graph, next_cluster_v.data()); if (new_Q > cur_Q) { thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_), next_cluster_v.begin(), next_cluster_v.end(), - this->cluster_v_.begin()); + this->dendrogram_->current_level_begin()); } } @@ -97,9 +106,7 @@ class Leiden : public Louvain { return cur_Q; } - std::pair operator()(vertex_t *d_cluster_vec, - size_t max_level, - weight_t resolution) + weight_t operator()(size_t max_level, weight_t resolution) override { size_t num_level{0}; @@ -109,57 +116,50 @@ class Leiden : public Louvain { weight_t best_modularity = weight_t{-1}; - // - // Initialize every cluster to reference each vertex to itself - // - thrust::sequence(rmm::exec_policy(this->stream_)->on(this->stream_), - this->cluster_v_.begin(), - this->cluster_v_.end()); - thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_), - this->cluster_v_.begin(), - this->cluster_v_.end(), - d_cluster_vec); - // // Our copy of the graph. Each iteration of the outer loop will // shrink this copy of the graph. 
// - GraphCSRView current_graph(this->offsets_v_.data().get(), - this->indices_v_.data().get(), - this->weights_v_.data().get(), + GraphCSRView current_graph(this->offsets_v_.data(), + this->indices_v_.data(), + this->weights_v_.data(), this->number_of_vertices_, this->number_of_edges_); - current_graph.get_source_indices(this->src_indices_v_.data().get()); + current_graph.get_source_indices(this->src_indices_v_.data()); while (num_level < max_level) { + // + // Initialize every cluster to reference each vertex to itself + // + this->dendrogram_->add_level(current_graph.number_of_vertices); + + thrust::sequence(rmm::exec_policy(this->stream_)->on(this->stream_), + this->dendrogram_->current_level_begin(), + this->dendrogram_->current_level_end()); + this->compute_vertex_and_cluster_weights(current_graph); weight_t new_Q = this->update_clustering(total_edge_weight, resolution, current_graph); - thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_), - this->cluster_v_.begin(), - this->cluster_v_.end(), - constraint_v_.begin()); - new_Q = update_clustering_constrained(total_edge_weight, resolution, current_graph); if (new_Q <= best_modularity) { break; } best_modularity = new_Q; - this->shrink_graph(current_graph, d_cluster_vec); + this->shrink_graph(current_graph); num_level++; } this->timer_display(std::cout); - return std::make_pair(num_level, best_modularity); + return best_modularity; } private: - rmm::device_vector constraint_v_; + rmm::device_uvector constraint_v_; }; } // namespace cugraph diff --git a/cpp/src/community/louvain.cu b/cpp/src/community/louvain.cu index 81a68a31663..aef6fcdafde 100644 --- a/cpp/src/community/louvain.cu +++ b/cpp/src/community/louvain.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,10 +14,13 @@ * limitations under the License. 
*/ +#include #include #include #include +#include + namespace cugraph { namespace detail { @@ -31,10 +34,29 @@ std::pair louvain(raft::handle_t const &handle, { CUGRAPH_EXPECTS(graph_view.edge_data != nullptr, "Invalid input argument: louvain expects a weighted graph"); - CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); + CUGRAPH_EXPECTS(clustering != nullptr, + "Invalid input argument: clustering is null, should be a device pointer to " + "memory for storing the result"); Louvain> runner(handle, graph_view); - return runner(clustering, max_level, resolution); + weight_t wt = runner(max_level, resolution); + + rmm::device_uvector vertex_ids_v(graph_view.number_of_vertices, handle.get_stream()); + + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(0), // MNMG - base vertex id + thrust::make_counting_iterator( + graph_view.number_of_vertices), // MNMG - base vertex id + number_of_vertices + vertex_ids_v.begin()); + + partition_at_level(handle, + runner.get_dendrogram(), + vertex_ids_v.data(), + clustering, + runner.get_dendrogram().num_levels()); + + // FIXME: Consider returning the Dendrogram at some point + return std::make_pair(runner.get_dendrogram().num_levels(), wt); } template @@ -45,7 +67,9 @@ std::pair louvain( size_t max_level, weight_t resolution) { - CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); + CUGRAPH_EXPECTS(clustering != nullptr, + "Invalid input argument: clustering is null, should be a device pointer to " + "memory for storing the result"); // "FIXME": remove this check and the guards below // @@ -61,7 +85,13 @@ std::pair louvain( } else { experimental::Louvain> runner(handle, graph_view); - return runner(clustering, max_level, resolution); + + weight_t wt = runner(max_level, resolution); + // TODO: implement this... + // runner.get_dendrogram().partition_at_level(clustering, runner.get_dendrogram().num_levels()); + + // FIXME: Consider returning the Dendrogram at some point + return std::make_pair(runner.get_dendrogram().num_levels(), wt); } } diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 7ca3638f42b..f13c64867cb 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,11 +17,13 @@ #include -#include - #include #include +#include + +#include + //#define TIMING #ifdef TIMING @@ -44,26 +46,42 @@ class Louvain { hr_timer_(), #endif handle_(handle), + dendrogram_(std::make_unique>()), // FIXME: Don't really need to copy here but would need // to change the logic to populate this properly // in generate_superverticies_graph. 
// - offsets_v_(graph.offsets, graph.offsets + graph.number_of_vertices + 1), - indices_v_(graph.indices, graph.indices + graph.number_of_edges), - weights_v_(graph.edge_data, graph.edge_data + graph.number_of_edges), - src_indices_v_(graph.number_of_edges), - vertex_weights_v_(graph.number_of_vertices), - cluster_weights_v_(graph.number_of_vertices), - cluster_v_(graph.number_of_vertices), - tmp_arr_v_(graph.number_of_vertices), - cluster_inverse_v_(graph.number_of_vertices), + offsets_v_(graph.number_of_vertices + 1, handle.get_stream()), + indices_v_(graph.number_of_edges, handle.get_stream()), + weights_v_(graph.number_of_edges, handle.get_stream()), + src_indices_v_(graph.number_of_edges, handle.get_stream()), + vertex_weights_v_(graph.number_of_vertices, handle.get_stream()), + cluster_weights_v_(graph.number_of_vertices, handle.get_stream()), + tmp_arr_v_(graph.number_of_vertices, handle.get_stream()), + cluster_inverse_v_(graph.number_of_vertices, handle.get_stream()), number_of_vertices_(graph.number_of_vertices), number_of_edges_(graph.number_of_edges), stream_(handle.get_stream()) { + thrust::copy(rmm::exec_policy(stream_)->on(stream_), + graph.offsets, + graph.offsets + graph.number_of_vertices + 1, + offsets_v_.begin()); + + thrust::copy(rmm::exec_policy(stream_)->on(stream_), + graph.indices, + graph.indices + graph.number_of_edges, + indices_v_.begin()); + + thrust::copy(rmm::exec_policy(stream_)->on(stream_), + graph.edge_data, + graph.edge_data + graph.number_of_edges, + weights_v_.begin()); } + virtual ~Louvain() {} + weight_t modularity(weight_t total_edge_weight, weight_t resolution, graph_t const &graph, @@ -71,43 +89,45 @@ class Louvain { { vertex_t n_verts = graph.number_of_vertices; - rmm::device_vector inc(n_verts, weight_t{0.0}); - rmm::device_vector deg(n_verts, weight_t{0.0}); + rmm::device_uvector inc(n_verts, stream_); + rmm::device_uvector deg(n_verts, stream_); - edge_t const *d_offsets = graph.offsets; - vertex_t const *d_indices = graph.indices; - weight_t const *d_weights = graph.edge_data; - weight_t *d_inc = inc.data().get(); - weight_t *d_deg = deg.data().get(); + thrust::fill(rmm::exec_policy(stream_)->on(stream_), inc.begin(), inc.end(), weight_t{0.0}); + thrust::fill(rmm::exec_policy(stream_)->on(stream_), deg.begin(), deg.end(), weight_t{0.0}); // FIXME: Already have weighted degree computed in main loop, // could pass that in rather than computing d_deg... 
which // would save an atomicAdd (synchronization) // - thrust::for_each( - rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(graph.number_of_vertices), - [d_inc, d_deg, d_offsets, d_indices, d_weights, d_cluster] __device__(vertex_t v) { - vertex_t community = d_cluster[v]; - weight_t increase{0.0}; - weight_t degree{0.0}; - - for (edge_t loc = d_offsets[v]; loc < d_offsets[v + 1]; ++loc) { - vertex_t neighbor = d_indices[loc]; - degree += d_weights[loc]; - if (d_cluster[neighbor] == community) { increase += d_weights[loc]; } - } + thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(graph.number_of_vertices), + [d_inc = inc.data(), + d_deg = deg.data(), + d_offsets = graph.offsets, + d_indices = graph.indices, + d_weights = graph.edge_data, + d_cluster] __device__(vertex_t v) { + vertex_t community = d_cluster[v]; + weight_t increase{0.0}; + weight_t degree{0.0}; + + for (edge_t loc = d_offsets[v]; loc < d_offsets[v + 1]; ++loc) { + vertex_t neighbor = d_indices[loc]; + degree += d_weights[loc]; + if (d_cluster[neighbor] == community) { increase += d_weights[loc]; } + } - if (degree > weight_t{0.0}) atomicAdd(d_deg + community, degree); - if (increase > weight_t{0.0}) atomicAdd(d_inc + community, increase); - }); + if (degree > weight_t{0.0}) atomicAdd(d_deg + community, degree); + if (increase > weight_t{0.0}) atomicAdd(d_inc + community, increase); + }); weight_t Q = thrust::transform_reduce( rmm::exec_policy(stream_)->on(stream_), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_vertices), - [d_deg, d_inc, total_edge_weight, resolution] __device__(vertex_t community) { + [d_deg = deg.data(), d_inc = inc.data(), total_edge_weight, resolution] __device__( + vertex_t community) { return ((d_inc[community] / total_edge_weight) - resolution * (d_deg[community] * d_deg[community]) / (total_edge_weight * total_edge_weight)); @@ -118,37 +138,35 @@ class Louvain { return Q; } - virtual std::pair operator()(vertex_t *d_cluster_vec, - size_t max_level, - weight_t resolution) - { - size_t num_level{0}; + Dendrogram &get_dendrogram() const { return *dendrogram_; } + + std::unique_ptr> move_dendrogram() { return dendrogram_; } + virtual weight_t operator()(size_t max_level, weight_t resolution) + { weight_t total_edge_weight = thrust::reduce(rmm::exec_policy(stream_)->on(stream_), weights_v_.begin(), weights_v_.end()); weight_t best_modularity = weight_t{-1}; - // - // Initialize every cluster to reference each vertex to itself - // - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), cluster_v_.begin(), cluster_v_.end()); - thrust::copy( - rmm::exec_policy(stream_)->on(stream_), cluster_v_.begin(), cluster_v_.end(), d_cluster_vec); - // // Our copy of the graph. Each iteration of the outer loop will // shrink this copy of the graph. 
// - GraphCSRView current_graph(offsets_v_.data().get(), - indices_v_.data().get(), - weights_v_.data().get(), + GraphCSRView current_graph(offsets_v_.data(), + indices_v_.data(), + weights_v_.data(), number_of_vertices_, number_of_edges_); - current_graph.get_source_indices(src_indices_v_.data().get()); + current_graph.get_source_indices(src_indices_v_.data()); + + while (dendrogram_->num_levels() < max_level) { + // + // Initialize every cluster to reference each vertex to itself + // + initialize_dendrogram_level(current_graph.number_of_vertices); - while (num_level < max_level) { compute_vertex_and_cluster_weights(current_graph); weight_t new_Q = update_clustering(total_edge_weight, resolution, current_graph); @@ -157,14 +175,17 @@ class Louvain { best_modularity = new_Q; - shrink_graph(current_graph, d_cluster_vec); + shrink_graph(current_graph); - num_level++; + // TODO: Note, somehow after shrink_graph - having converted to device_uvector - the + // modularity of the new graph is too small... + // Was that always true? Perhaps I need to discard the bottom of the dendrogram + // in the break statement above? } timer_display(std::cout); - return std::make_pair(num_level, best_modularity); + return best_modularity; } protected: @@ -190,6 +211,15 @@ class Louvain { #endif } + virtual void initialize_dendrogram_level(vertex_t num_vertices) + { + dendrogram_->add_level(num_vertices); + + thrust::sequence(rmm::exec_policy(stream_)->on(stream_), + dendrogram_->current_level_begin(), + dendrogram_->current_level_end()); + } + public: void compute_vertex_and_cluster_weights(graph_type const &graph) { @@ -198,8 +228,8 @@ class Louvain { edge_t const *d_offsets = graph.offsets; vertex_t const *d_indices = graph.indices; weight_t const *d_weights = graph.edge_data; - weight_t *d_vertex_weights = vertex_weights_v_.data().get(); - weight_t *d_cluster_weights = cluster_weights_v_.data().get(); + weight_t *d_vertex_weights = vertex_weights_v_.data(); + weight_t *d_cluster_weights = cluster_weights_v_.data(); // // MNMG: copy_v_transform_reduce_out_nbr, then copy @@ -229,18 +259,25 @@ class Louvain { // // MNMG: This is the hard one, see writeup // - rmm::device_vector next_cluster_v(cluster_v_); - rmm::device_vector delta_Q_v(graph.number_of_edges); - rmm::device_vector cluster_hash_v(graph.number_of_edges); - rmm::device_vector old_cluster_sum_v(graph.number_of_vertices); - vertex_t *d_cluster_hash = cluster_hash_v.data().get(); - vertex_t *d_cluster = cluster_v_.data().get(); - weight_t const *d_vertex_weights = vertex_weights_v_.data().get(); - weight_t *d_cluster_weights = cluster_weights_v_.data().get(); - weight_t *d_delta_Q = delta_Q_v.data().get(); + // TODO: will this work, or do I need to use the size and then copy? 
+ rmm::device_uvector next_cluster_v(dendrogram_->current_level_size(), stream_); + rmm::device_uvector delta_Q_v(graph.number_of_edges, stream_); + rmm::device_uvector cluster_hash_v(graph.number_of_edges, stream_); + rmm::device_uvector old_cluster_sum_v(graph.number_of_vertices, stream_); + + vertex_t *d_cluster = dendrogram_->current_level_begin(); + weight_t const *d_vertex_weights = vertex_weights_v_.data(); + weight_t *d_cluster_weights = cluster_weights_v_.data(); + weight_t *d_delta_Q = delta_Q_v.data(); + + thrust::copy(rmm::exec_policy(stream_)->on(stream_), + dendrogram_->current_level_begin(), + dendrogram_->current_level_end(), + next_cluster_v.data()); - weight_t new_Q = modularity(total_edge_weight, resolution, graph, cluster_v_.data().get()); + weight_t new_Q = + modularity(total_edge_weight, resolution, graph, dendrogram_->current_level_begin()); weight_t cur_Q = new_Q - 1; @@ -259,13 +296,13 @@ class Louvain { up_down = !up_down; - new_Q = modularity(total_edge_weight, resolution, graph, next_cluster_v.data().get()); + new_Q = modularity(total_edge_weight, resolution, graph, next_cluster_v.data()); if (new_Q > cur_Q) { thrust::copy(rmm::exec_policy(stream_)->on(stream_), next_cluster_v.begin(), next_cluster_v.end(), - cluster_v_.begin()); + dendrogram_->current_level_begin()); } } @@ -276,45 +313,37 @@ class Louvain { void compute_delta_modularity(weight_t total_edge_weight, weight_t resolution, graph_type const &graph, - rmm::device_vector &cluster_hash_v, - rmm::device_vector &old_cluster_sum_v, - rmm::device_vector &delta_Q_v) + rmm::device_uvector &cluster_hash_v, + rmm::device_uvector &old_cluster_sum_v, + rmm::device_uvector &delta_Q_v) { - vertex_t const *d_src_indices = src_indices_v_.data().get(); - vertex_t const *d_dst_indices = graph.indices; edge_t const *d_offsets = graph.offsets; weight_t const *d_weights = graph.edge_data; - vertex_t const *d_cluster = cluster_v_.data().get(); - weight_t const *d_vertex_weights = vertex_weights_v_.data().get(); - weight_t const *d_cluster_weights = cluster_weights_v_.data().get(); + vertex_t const *d_cluster = dendrogram_->current_level_begin(); + weight_t const *d_vertex_weights = vertex_weights_v_.data(); + weight_t const *d_cluster_weights = cluster_weights_v_.data(); - vertex_t *d_cluster_hash = cluster_hash_v.data().get(); - weight_t *d_delta_Q = delta_Q_v.data().get(); - weight_t *d_old_cluster_sum = old_cluster_sum_v.data().get(); + vertex_t *d_cluster_hash = cluster_hash_v.data(); + weight_t *d_delta_Q = delta_Q_v.data(); + weight_t *d_old_cluster_sum = old_cluster_sum_v.data(); weight_t *d_new_cluster_sum = d_delta_Q; - thrust::fill(cluster_hash_v.begin(), cluster_hash_v.end(), vertex_t{-1}); - thrust::fill(delta_Q_v.begin(), delta_Q_v.end(), weight_t{0.0}); - thrust::fill(old_cluster_sum_v.begin(), old_cluster_sum_v.end(), weight_t{0.0}); + thrust::fill(rmm::exec_policy(stream_)->on(stream_), + cluster_hash_v.begin(), + cluster_hash_v.end(), + vertex_t{-1}); + thrust::fill( + rmm::exec_policy(stream_)->on(stream_), delta_Q_v.begin(), delta_Q_v.end(), weight_t{0.0}); + thrust::fill(rmm::exec_policy(stream_)->on(stream_), + old_cluster_sum_v.begin(), + old_cluster_sum_v.end(), + weight_t{0.0}); - // MNMG: New technique using reduce_by_key. Would require a segmented sort - // or a pair of sorts on each node, so probably slower than what's here. - // This might still be faster even in MNMG... 
- // - // - // FIXME: Eventually this should use cuCollections concurrent map - // implementation, but that won't be available for a while. - // - // For each source vertex, we're going to build a hash - // table to the destination cluster ids. We can use - // the offsets ranges to define the bounds of the hash - // table. - // thrust::for_each(rmm::exec_policy(stream_)->on(stream_), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_edges), - [d_src_indices, - d_dst_indices, + [d_src_indices = src_indices_v_.data(), + d_dst_indices = graph.indices, d_cluster, d_offsets, d_cluster_hash, @@ -355,7 +384,7 @@ class Louvain { [total_edge_weight, resolution, d_cluster_hash, - d_src_indices, + d_src_indices = src_indices_v_.data(), d_cluster, d_vertex_weights, d_delta_Q, @@ -383,33 +412,37 @@ class Louvain { } void assign_nodes(graph_type const &graph, - rmm::device_vector &cluster_hash_v, - rmm::device_vector &next_cluster_v, - rmm::device_vector &delta_Q_v, + rmm::device_uvector &cluster_hash_v, + rmm::device_uvector &next_cluster_v, + rmm::device_uvector &delta_Q_v, bool up_down) { - rmm::device_vector temp_vertices_v(graph.number_of_vertices); - rmm::device_vector temp_cluster_v(graph.number_of_vertices, vertex_t{-1}); - rmm::device_vector temp_delta_Q_v(graph.number_of_vertices, weight_t{0.0}); + rmm::device_uvector temp_vertices_v(graph.number_of_vertices, stream_); + rmm::device_uvector temp_cluster_v(graph.number_of_vertices, stream_); + rmm::device_uvector temp_delta_Q_v(graph.number_of_vertices, stream_); + + thrust::fill(rmm::exec_policy(stream_)->on(stream_), + temp_cluster_v.begin(), + temp_cluster_v.end(), + vertex_t{-1}); - weight_t *d_delta_Q = delta_Q_v.data().get(); - vertex_t *d_next_cluster = next_cluster_v.data().get(); - vertex_t *d_cluster_hash = cluster_hash_v.data().get(); - weight_t const *d_vertex_weights = vertex_weights_v_.data().get(); - weight_t *d_cluster_weights = cluster_weights_v_.data().get(); + thrust::fill(rmm::exec_policy(stream_)->on(stream_), + temp_delta_Q_v.begin(), + temp_delta_Q_v.end(), + weight_t{0}); auto cluster_reduce_iterator = - thrust::make_zip_iterator(thrust::make_tuple(d_cluster_hash, d_delta_Q)); + thrust::make_zip_iterator(thrust::make_tuple(cluster_hash_v.begin(), delta_Q_v.begin())); - auto output_edge_iterator2 = thrust::make_zip_iterator( - thrust::make_tuple(temp_cluster_v.data().get(), temp_delta_Q_v.data().get())); + auto output_edge_iterator2 = + thrust::make_zip_iterator(thrust::make_tuple(temp_cluster_v.begin(), temp_delta_Q_v.begin())); auto cluster_reduce_end = thrust::reduce_by_key(rmm::exec_policy(stream_)->on(stream_), src_indices_v_.begin(), src_indices_v_.end(), cluster_reduce_iterator, - temp_vertices_v.data().get(), + temp_vertices_v.data(), output_edge_iterator2, thrust::equal_to(), [] __device__(auto pair1, auto pair2) { @@ -422,22 +455,18 @@ class Louvain { return pair2; }); - vertex_t final_size = thrust::distance(temp_vertices_v.data().get(), cluster_reduce_end.first); - - vertex_t *d_temp_vertices = temp_vertices_v.data().get(); - vertex_t *d_temp_clusters = temp_cluster_v.data().get(); - weight_t *d_temp_delta_Q = temp_delta_Q_v.data().get(); + vertex_t final_size = thrust::distance(temp_vertices_v.data(), cluster_reduce_end.first); thrust::for_each(rmm::exec_policy(stream_)->on(stream_), thrust::make_counting_iterator(0), thrust::make_counting_iterator(final_size), - [d_temp_delta_Q, - up_down, - d_next_cluster, - d_temp_vertices, - d_vertex_weights, - d_temp_clusters, - 
d_cluster_weights] __device__(vertex_t id) { + [up_down, + d_temp_delta_Q = temp_delta_Q_v.data(), + d_next_cluster = next_cluster_v.data(), + d_temp_vertices = temp_vertices_v.data(), + d_vertex_weights = vertex_weights_v_.data(), + d_temp_clusters = temp_cluster_v.data(), + d_cluster_weights = cluster_weights_v_.data()] __device__(vertex_t id) { if ((d_temp_clusters[id] >= 0) && (d_temp_delta_Q[id] > weight_t{0.0})) { vertex_t new_cluster = d_temp_clusters[id]; vertex_t old_cluster = d_next_cluster[d_temp_vertices[id]]; @@ -453,38 +482,38 @@ class Louvain { }); } - void shrink_graph(graph_t &graph, vertex_t *d_cluster_vec) + void shrink_graph(graph_t &graph) { timer_start("shrinking graph"); // renumber the clusters to the range 0..(num_clusters-1) - vertex_t num_clusters = renumber_clusters(d_cluster_vec); - cluster_weights_v_.resize(num_clusters); + vertex_t num_clusters = renumber_clusters(); + cluster_weights_v_.resize(num_clusters, stream_); // shrink our graph to represent the graph of supervertices generate_superverticies_graph(graph, num_clusters); - // assign each new vertex to its own cluster - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), cluster_v_.begin(), cluster_v_.end()); - timer_stop(stream_); } - vertex_t renumber_clusters(vertex_t *d_cluster_vec) + vertex_t renumber_clusters() { - vertex_t *d_tmp_array = tmp_arr_v_.data().get(); - vertex_t *d_cluster_inverse = cluster_inverse_v_.data().get(); - vertex_t *d_cluster = cluster_v_.data().get(); + vertex_t *d_tmp_array = tmp_arr_v_.data(); + vertex_t *d_cluster_inverse = cluster_inverse_v_.data(); + vertex_t *d_cluster = dendrogram_->current_level_begin(); - vertex_t old_num_clusters = cluster_v_.size(); + vertex_t old_num_clusters = dendrogram_->current_level_size(); // // New technique. 
Initialize cluster_inverse_v_ to 0 // - thrust::fill(cluster_inverse_v_.begin(), cluster_inverse_v_.end(), vertex_t{0}); + thrust::fill(rmm::exec_policy(stream_)->on(stream_), + cluster_inverse_v_.begin(), + cluster_inverse_v_.end(), + vertex_t{0}); // - // Iterate over every element c in cluster_v_ and set cluster_inverse_v to 1 + // Iterate over every element c in the current clustering and set cluster_inverse_v to 1 // auto first_1 = thrust::make_constant_iterator(1); auto last_1 = first_1 + old_num_clusters; @@ -492,7 +521,7 @@ class Louvain { thrust::scatter(rmm::exec_policy(stream_)->on(stream_), first_1, last_1, - cluster_v_.begin(), + dendrogram_->current_level_begin(), cluster_inverse_v_.begin()); // @@ -506,7 +535,7 @@ class Louvain { [d_cluster_inverse] __device__(const vertex_t idx) { return d_cluster_inverse[idx] == 1; }); vertex_t new_num_clusters = thrust::distance(tmp_arr_v_.begin(), copy_end); - tmp_arr_v_.resize(new_num_clusters); + tmp_arr_v_.resize(new_num_clusters, stream_); // // Now we can set each value in cluster_inverse of a cluster to its index @@ -525,32 +554,16 @@ class Louvain { d_cluster[i] = d_cluster_inverse[d_cluster[i]]; }); - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(number_of_vertices_), - [d_cluster_vec, d_cluster] __device__(vertex_t i) { - d_cluster_vec[i] = d_cluster[d_cluster_vec[i]]; - }); - - cluster_inverse_v_.resize(new_num_clusters); - cluster_v_.resize(new_num_clusters); + cluster_inverse_v_.resize(new_num_clusters, stream_); return new_num_clusters; } void generate_superverticies_graph(graph_t &graph, vertex_t num_clusters) { - rmm::device_vector new_src_v(graph.number_of_edges); - rmm::device_vector new_dst_v(graph.number_of_edges); - rmm::device_vector new_weight_v(graph.number_of_edges); - - vertex_t *d_old_src = src_indices_v_.data().get(); - vertex_t *d_old_dst = graph.indices; - weight_t *d_old_weight = graph.edge_data; - vertex_t *d_new_src = new_src_v.data().get(); - vertex_t *d_new_dst = new_dst_v.data().get(); - vertex_t *d_clusters = cluster_v_.data().get(); - weight_t *d_new_weight = new_weight_v.data().get(); + rmm::device_uvector new_src_v(graph.number_of_edges, stream_); + rmm::device_uvector new_dst_v(graph.number_of_edges, stream_); + rmm::device_uvector new_weight_v(graph.number_of_edges, stream_); // // Renumber the COO @@ -558,13 +571,13 @@ class Louvain { thrust::for_each(rmm::exec_policy(stream_)->on(stream_), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_edges), - [d_old_src, - d_old_dst, - d_old_weight, - d_new_src, - d_new_dst, - d_new_weight, - d_clusters] __device__(edge_t e) { + [d_old_src = src_indices_v_.data(), + d_old_dst = graph.indices, + d_old_weight = graph.edge_data, + d_new_src = new_src_v.data(), + d_new_dst = new_dst_v.data(), + d_new_weight = new_weight_v.data(), + d_clusters = dendrogram_->current_level_begin()] __device__(edge_t e) { d_new_src[e] = d_clusters[d_old_src[e]]; d_new_dst[e] = d_clusters[d_old_dst[e]]; d_new_weight[e] = d_old_weight[e]; @@ -572,39 +585,42 @@ class Louvain { thrust::stable_sort_by_key( rmm::exec_policy(stream_)->on(stream_), - d_new_dst, - d_new_dst + graph.number_of_edges, - thrust::make_zip_iterator(thrust::make_tuple(d_new_src, d_new_weight))); + new_dst_v.begin(), + new_dst_v.end(), + thrust::make_zip_iterator(thrust::make_tuple(new_src_v.begin(), new_weight_v.begin()))); thrust::stable_sort_by_key( rmm::exec_policy(stream_)->on(stream_), - 
d_new_src, - d_new_src + graph.number_of_edges, - thrust::make_zip_iterator(thrust::make_tuple(d_new_dst, d_new_weight))); + new_src_v.begin(), + new_src_v.end(), + thrust::make_zip_iterator(thrust::make_tuple(new_dst_v.begin(), new_weight_v.begin()))); // // Now we reduce by key to combine the weights of duplicate // edges. // - auto start = thrust::make_zip_iterator(thrust::make_tuple(d_new_src, d_new_dst)); - auto new_start = thrust::make_zip_iterator(thrust::make_tuple(d_old_src, d_old_dst)); - auto new_end = thrust::reduce_by_key(rmm::exec_policy(stream_)->on(stream_), + auto start = + thrust::make_zip_iterator(thrust::make_tuple(new_src_v.begin(), new_dst_v.begin())); + auto new_start = + thrust::make_zip_iterator(thrust::make_tuple(src_indices_v_.data(), graph.indices)); + auto new_end = thrust::reduce_by_key(rmm::exec_policy(stream_)->on(stream_), start, start + graph.number_of_edges, - d_new_weight, + new_weight_v.begin(), new_start, - d_old_weight, + graph.edge_data, thrust::equal_to>(), thrust::plus()); graph.number_of_edges = thrust::distance(new_start, new_end.first); graph.number_of_vertices = num_clusters; - detail::fill_offset(d_old_src, graph.offsets, num_clusters, graph.number_of_edges, stream_); + detail::fill_offset( + src_indices_v_.data(), graph.offsets, num_clusters, graph.number_of_edges, stream_); CHECK_CUDA(stream_); - src_indices_v_.resize(graph.number_of_edges); - indices_v_.resize(graph.number_of_edges); - weights_v_.resize(graph.number_of_edges); + src_indices_v_.resize(graph.number_of_edges, stream_); + indices_v_.resize(graph.number_of_edges, stream_); + weights_v_.resize(graph.number_of_edges, stream_); } protected: @@ -613,27 +629,28 @@ class Louvain { edge_t number_of_edges_; cudaStream_t stream_; + std::unique_ptr> dendrogram_; + // // Copy of graph // - rmm::device_vector offsets_v_; - rmm::device_vector indices_v_; - rmm::device_vector weights_v_; - rmm::device_vector src_indices_v_; + rmm::device_uvector offsets_v_; + rmm::device_uvector indices_v_; + rmm::device_uvector weights_v_; + rmm::device_uvector src_indices_v_; // // Weights and clustering across iterations of algorithm // - rmm::device_vector vertex_weights_v_; - rmm::device_vector cluster_weights_v_; - rmm::device_vector cluster_v_; + rmm::device_uvector vertex_weights_v_; + rmm::device_uvector cluster_weights_v_; // // Temporaries used within kernels. Each iteration uses less // of this memory // - rmm::device_vector tmp_arr_v_; - rmm::device_vector cluster_inverse_v_; + rmm::device_uvector tmp_arr_v_; + rmm::device_uvector cluster_inverse_v_; #ifdef TIMING HighResTimer hr_timer_; diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 4257953d390..cbd831a67bc 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -33,6 +33,8 @@ #include +#include + //#define TIMING #ifdef TIMING @@ -374,9 +376,9 @@ create_graph(raft::handle_t const &handle, // as above would allow us to eventually run the single GPU version of single level Louvain // on the contracted graphs - which should be more efficient. // -// FIXME: We should return the dendogram and let the python layer clean it up (or have a -// separate C++ function to flatten the dendogram). There are customers that might -// like the dendogram and the implementation would be a bit cleaner if we did the +// FIXME: We should return the dendrogram and let the python layer clean it up (or have a +// separate C++ function to flatten the dendrogram). 
There are customers that might +// like the dendrogram and the implementation would be a bit cleaner if we did the // collapsing as a separate step // template @@ -398,6 +400,7 @@ class Louvain { hr_timer_(), #endif handle_(handle), + dendrogram_(std::make_unique>()), current_graph_view_(graph_view), compute_partition_(graph_view), local_num_vertices_(graph_view.get_number_of_local_vertices()), @@ -406,7 +409,6 @@ class Louvain { local_num_edges_(graph_view.get_number_of_edges()), vertex_weights_v_(graph_view.get_number_of_local_vertices()), cluster_weights_v_(graph_view.get_number_of_local_vertices()), - cluster_v_(graph_view.get_number_of_local_vertices()), number_of_vertices_(graph_view.get_number_of_local_vertices()), stream_(handle.get_stream()) { @@ -440,11 +442,12 @@ class Louvain { } } - virtual std::pair operator()(vertex_t *d_cluster_vec, - size_t max_level, - weight_t resolution) + Dendrogram &get_dendrogram() const { return *dendrogram_; } + + std::unique_ptr> move_dendrogram() { return dendrogram_; } + + virtual weight_t operator()(size_t max_level, weight_t resolution) { - size_t num_level{0}; weight_t best_modularity = weight_t{-1}; #ifdef CUCO_STATIC_MAP_DEFINED @@ -457,17 +460,12 @@ class Louvain { [] __device__(auto, auto, weight_t wt, auto, auto) { return wt; }, weight_t{0}); - // - // Initialize every cluster to reference each vertex to itself - // - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), - cluster_v_.begin(), - cluster_v_.end(), - base_vertex_id_); - thrust::copy( - rmm::exec_policy(stream_)->on(stream_), cluster_v_.begin(), cluster_v_.end(), d_cluster_vec); + while (dendrogram_->num_levels() < max_level) { + // + // Initialize every cluster to reference each vertex to itself + // + initialize_dendrogram_level(current_graph_view_.get_number_of_local_vertices()); - while (num_level < max_level) { compute_vertex_and_cluster_weights(); weight_t new_Q = update_clustering(total_edge_weight, resolution); @@ -476,15 +474,13 @@ class Louvain { best_modularity = new_Q; - shrink_graph(d_cluster_vec); - - num_level++; + shrink_graph(); } timer_display(std::cout); #endif - return std::make_pair(num_level, best_modularity); + return best_modularity; } protected: @@ -512,6 +508,17 @@ class Louvain { #endif } + protected: + void initialize_dendrogram_level(vertex_t num_vertices) + { + dendrogram_->add_level(num_vertices); + + thrust::sequence(rmm::exec_policy(stream_)->on(stream_), + dendrogram_->current_level_begin(), + dendrogram_->current_level_end(), + base_vertex_id_); + } + public: weight_t modularity(weight_t total_edge_weight, weight_t resolution) { @@ -561,23 +568,16 @@ class Louvain { cluster_weights_v_.begin()); cache_vertex_properties( - vertex_weights_v_, src_vertex_weights_cache_v_, dst_vertex_weights_cache_v_); + vertex_weights_v_.begin(), src_vertex_weights_cache_v_, dst_vertex_weights_cache_v_); cache_vertex_properties( - cluster_weights_v_, src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); + cluster_weights_v_.begin(), src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); timer_stop(stream_); } - // - // FIXME: Consider returning d_src_cache and d_dst_cache - // (as a pair). This would be a nice optimization - // for single GPU, as we wouldn't need to make 3 copies - // of the data, could return a pair of device pointers to - // local_input_v. 
- // - template - void cache_vertex_properties(rmm::device_vector const &local_input_v, + template + void cache_vertex_properties(iterator_t const &local_input_iterator, rmm::device_vector &src_cache_v, rmm::device_vector &dst_cache_v, bool src = true, @@ -586,13 +586,13 @@ class Louvain { if (src) { src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows()); copy_to_adj_matrix_row( - handle_, current_graph_view_, local_input_v.begin(), src_cache_v.begin()); + handle_, current_graph_view_, local_input_iterator, src_cache_v.begin()); } if (dst) { dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols()); copy_to_adj_matrix_col( - handle_, current_graph_view_, local_input_v.begin(), dst_cache_v.begin()); + handle_, current_graph_view_, local_input_iterator, dst_cache_v.begin()); } } @@ -601,9 +601,10 @@ class Louvain { { timer_start("update_clustering"); - rmm::device_vector next_cluster_v(cluster_v_); + rmm::device_vector next_cluster_v(dendrogram_->current_level_begin(), + dendrogram_->current_level_end()); - cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_); + cache_vertex_properties(next_cluster_v.begin(), src_cluster_cache_v_, dst_cluster_cache_v_); weight_t new_Q = modularity(total_edge_weight, resolution); weight_t cur_Q = new_Q - 1; @@ -620,7 +621,7 @@ class Louvain { up_down = !up_down; - cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_); + cache_vertex_properties(next_cluster_v.begin(), src_cluster_cache_v_, dst_cluster_cache_v_); new_Q = modularity(total_edge_weight, resolution); @@ -628,12 +629,13 @@ class Louvain { thrust::copy(rmm::exec_policy(stream_)->on(stream_), next_cluster_v.begin(), next_cluster_v.end(), - cluster_v_.begin()); + dendrogram_->current_level_begin()); } } // cache the final clustering locally on each cpu - cache_vertex_properties(cluster_v_, src_cluster_cache_v_, dst_cluster_cache_v_); + cache_vertex_properties( + dendrogram_->current_level_begin(), src_cluster_cache_v_, dst_cluster_cache_v_); timer_stop(stream_); return cur_Q; @@ -662,7 +664,7 @@ class Louvain { old_cluster_sum_v.begin()); cache_vertex_properties( - old_cluster_sum_v, src_old_cluster_sum_cache_v, empty_cache_weight_v_, true, false); + old_cluster_sum_v.begin(), src_old_cluster_sum_cache_v, empty_cache_weight_v_, true, false); detail::src_cluster_equality_comparator_t compare( src_indices_v_.data().get(), @@ -1118,7 +1120,7 @@ class Louvain { }); cache_vertex_properties( - cluster_weights_v_, src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); + cluster_weights_v_.begin(), src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); } template @@ -1204,7 +1206,7 @@ class Louvain { return std::make_pair(relevant_edges_v, relevant_edge_weights_v); } - void shrink_graph(vertex_t *d_cluster_vec) + void shrink_graph() { timer_start("shrinking graph"); @@ -1216,18 +1218,12 @@ class Louvain { // renumber the clusters to the range 0..(num_clusters-1) vertex_t num_clusters = renumber_clusters(hash_map); - renumber_result(hash_map, d_cluster_vec, num_clusters); + // TODO: renumber result needs to be moved to the dendrogram + // renumber_result(hash_map, num_clusters); // shrink our graph to represent the graph of supervertices generate_supervertices_graph(hash_map, num_clusters); - // assign each new vertex to its own cluster - // MNMG: This can be done locally with no communication required - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), - 
cluster_v_.begin(), - cluster_v_.end(), - base_vertex_id_); - timer_stop(stream_); } @@ -1401,119 +1397,6 @@ class Louvain { } } - void renumber_result(cuco::static_map const &hash_map, - vertex_t *d_cluster_vec, - vertex_t num_clusters) - { - if (graph_view_t::is_multi_gpu) { - // - // FIXME: Perhaps there's a general purpose function hidden here... - // Given a set of vertex_t values, and a distributed set of - // vertex properties, go to the proper node and retrieve - // the vertex properties and return them to this gpu. - // - std::size_t capacity{static_cast((local_num_vertices_) / 0.7)}; - cuco::static_map result_hash_map( - capacity, std::numeric_limits::max(), std::numeric_limits::max()); - - auto cluster_iter = thrust::make_transform_iterator(d_cluster_vec, [] __device__(vertex_t c) { - return detail::create_cuco_pair_t()(c); - }); - - result_hash_map.insert(cluster_iter, cluster_iter + local_num_vertices_); - - rmm::device_vector used_cluster_ids_v(result_hash_map.get_size()); - - auto transform_iter = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [d_result_hash_map = result_hash_map.get_device_view()] __device__(std::size_t idx) { - return d_result_hash_map.begin_slot()[idx].first.load(); - }); - - used_cluster_ids_v = detail::remove_elements_from_vector( - used_cluster_ids_v, - transform_iter, - transform_iter + result_hash_map.get_capacity(), - [vmax = std::numeric_limits::max()] __device__(vertex_t cluster) { - return cluster != vmax; - }, - stream_); - - auto partition_cluster_ids_iter = thrust::make_transform_iterator( - used_cluster_ids_v.begin(), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - }); - - rmm::device_vector old_cluster_ids_v = - variable_shuffle(handle_, - used_cluster_ids_v.size(), - used_cluster_ids_v.begin(), - partition_cluster_ids_iter); - - rmm::device_vector original_gpus_v = - variable_shuffle( - handle_, - used_cluster_ids_v.size(), - thrust::make_constant_iterator(rank_), - partition_cluster_ids_iter); - - // Now each GPU has old cluster ids, let's compute new cluster ids - rmm::device_vector new_cluster_ids_v(old_cluster_ids_v.size()); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - old_cluster_ids_v.begin(), - old_cluster_ids_v.end(), - new_cluster_ids_v.begin(), - [base_vertex_id = base_vertex_id_, - d_cluster = cluster_v_.data().get(), - d_hash_map = hash_map.get_device_view()] __device__(vertex_t cluster_id) { - vertex_t c = d_cluster[cluster_id - base_vertex_id]; - auto pos = d_hash_map.find(c); - return pos->second.load(); - }); - - // Shuffle everything back - old_cluster_ids_v = variable_shuffle( - handle_, old_cluster_ids_v.size(), old_cluster_ids_v.begin(), original_gpus_v.begin()); - new_cluster_ids_v = variable_shuffle( - handle_, new_cluster_ids_v.size(), new_cluster_ids_v.begin(), original_gpus_v.begin()); - - // Update result_hash_map - thrust::for_each_n( - rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator( - thrust::make_tuple(old_cluster_ids_v.begin(), new_cluster_ids_v.begin())), - old_cluster_ids_v.size(), - [d_result_hash_map = result_hash_map.get_device_view()] __device__(auto pair) mutable { - auto pos = d_result_hash_map.find(thrust::get<0>(pair)); - pos->second.store(thrust::get<1>(pair)); - }); - - thrust::transform( - rmm::exec_policy(stream_)->on(stream_), - d_cluster_vec, - d_cluster_vec + number_of_vertices_, - d_cluster_vec, - [d_result_hash_map = 
result_hash_map.get_device_view()] __device__(vertex_t c) { - auto pos = d_result_hash_map.find(c); - return pos->second.load(); - }); - - } else { - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - d_cluster_vec, - d_cluster_vec + number_of_vertices_, - d_cluster_vec, - [d_hash_map = hash_map.get_device_view(), - d_dst_cluster = dst_cluster_cache_v_.data()] __device__(vertex_t v) { - vertex_t c = d_dst_cluster[v]; - auto pos = d_hash_map.find(c); - return pos->second.load(); - }); - } - } - void generate_supervertices_graph(cuco::static_map const &hash_map, vertex_t num_clusters) { @@ -1672,6 +1555,8 @@ class Louvain { raft::handle_t const &handle_; cudaStream_t stream_; + std::unique_ptr> dendrogram_; + vertex_t number_of_vertices_; vertex_t base_vertex_id_{0}; vertex_t base_src_vertex_id_{0}; @@ -1707,7 +1592,6 @@ class Louvain { rmm::device_vector src_cluster_weights_cache_v_{}; rmm::device_vector dst_cluster_weights_cache_v_{}; - rmm::device_vector cluster_v_; rmm::device_vector src_cluster_cache_v_{}; rmm::device_vector dst_cluster_cache_v_{}; diff --git a/cpp/tests/experimental/louvain_test.cu b/cpp/tests/experimental/louvain_test.cu index 4a47b1a1aca..35a26923df6 100644 --- a/cpp/tests/experimental/louvain_test.cu +++ b/cpp/tests/experimental/louvain_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/notebooks/community/ECG.ipynb b/notebooks/community/ECG.ipynb index d7595dadb26..4a9eedd3c3a 100644 --- a/notebooks/community/ECG.ipynb +++ b/notebooks/community/ECG.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# Ensemble Clustering for Graphs (ECG)\n", + "# Does not run on Pascal\n", "In this notebook, we will use cuGraph to identify the cluster in a test graph using the Ensemble Clustering for Graph approach. \n", "\n", "\n", diff --git a/notebooks/community/Louvain.ipynb b/notebooks/community/Louvain.ipynb index e5e5e6a04ed..bfb8e299f49 100755 --- a/notebooks/community/Louvain.ipynb +++ b/notebooks/community/Louvain.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# Louvain Community Detection\n", + "# Does not run on Pascal\n", "\n", "\n", "In this notebook, we will use cuGraph to identify the cluster in a test graph using the Louvain algorithm \n", diff --git a/notebooks/community/Subgraph-Extraction.ipynb b/notebooks/community/Subgraph-Extraction.ipynb index e068ef53aa5..cac52262d4d 100755 --- a/notebooks/community/Subgraph-Extraction.ipynb +++ b/notebooks/community/Subgraph-Extraction.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# Subgraph Extraction\n", + "# Does not run on Pascal\n", "\n", "In this notebook, we will use cuGraph to extract a subgraph from the test graph. \n", "\n", diff --git a/notebooks/community/Triangle-Counting.ipynb b/notebooks/community/Triangle-Counting.ipynb index 09d7906a526..19d3f838fc6 100755 --- a/notebooks/community/Triangle-Counting.ipynb +++ b/notebooks/community/Triangle-Counting.ipynb @@ -21,7 +21,7 @@ "\n", "\n", "## Introduction\n", - "Triancle Counting, as the name implies, finds the number of triangles in a graph. Triangles are important in computing the clustering Coefficient and can be used for clustering. \n", + "Triangle Counting, as the name implies, finds the number of triangles in a graph. Triangles are important in computing the clustering Coefficient and can be used for clustering. 
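The Louvain refactor earlier in this diff replaces the flat cluster_v_ vector with a dendrogram member (dendrogram_): each outer pass of the algorithm now records its own level of cluster assignments, read back through current_level_begin()/current_level_end(), instead of overwriting a single array, and cache_vertex_properties() is templated on an iterator so it can be fed either a temporary vector or the dendrogram's current level. A minimal, host-side Python sketch of the data structure this implies (illustrative names only; the real container is a device-memory C++ class):

import numpy as np


class Dendrogram:
    """Host-side stand-in for the device-memory dendrogram used by Louvain."""

    def __init__(self):
        self.levels = []  # levels[i][v] = cluster assigned to vertex v at level i

    def add_level(self, num_verts):
        # every vertex starts a new level in its own cluster, like the
        # thrust::sequence call this diff removes from shrink_graph()
        self.levels.append(np.arange(num_verts))

    def current_level(self):
        # analogous to current_level_begin()/current_level_end()
        return self.levels[-1]

    def flatten(self):
        # compose the per-level assignments to map every original vertex to
        # its final cluster (roughly what renumber_result() used to produce)
        result = self.levels[0].copy()
        for level in self.levels[1:]:
            result = level[result]
        return result

The "TODO: renumber result needs to be moved to the dendrogram" comment left in shrink_graph() points at the flatten step above: once the per-level renumbering is stored in the dendrogram, the final vertex-to-cluster mapping can be recovered by composing the levels.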
\n", "\n", "\n", "To compute the Pagerank scores for a graph in cuGraph we use:
\n", diff --git a/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb b/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb index 58eb94bf0ee..6ae695e206e 100644 --- a/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb +++ b/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# BFS Performance Benchmarking\n", + "# Skip notebook test\n", "\n", "This notebook benchmarks performance of running BFS within cuGraph against NetworkX. \n", "\n", diff --git a/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb b/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb index a12b7c4bcc2..00e99a28617 100644 --- a/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb +++ b/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# Louvain Performance Benchmarking\n", + "# Skip notebook test\n", "\n", "This notebook benchmarks performance improvement of running the Louvain clustering algorithm within cuGraph against NetworkX. The test is run over eight test networks (graphs) and then results plotted. \n", "

\n", diff --git a/notebooks/cugraph_benchmarks/nx_cugraph_bc_benchmarking.ipynb b/notebooks/cugraph_benchmarks/nx_cugraph_bc_benchmarking.ipynb index 6f76868f9a4..403c317ac0a 100644 --- a/notebooks/cugraph_benchmarks/nx_cugraph_bc_benchmarking.ipynb +++ b/notebooks/cugraph_benchmarks/nx_cugraph_bc_benchmarking.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# Benchmarking NetworkX compatibility\n", + "# Skip notebook test\n", "This notebook benchmark the use of a NetworkX Graph object as input into algorithms.

\n", "The intention of the feature is to be able to drop cuGraph into existing NetworkX code in spot where performance is not optimal.\n", "\n", diff --git a/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb b/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb index c2933a10c7d..d0416efdd87 100644 --- a/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb +++ b/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# PageRank Performance Benchmarking\n", + "# Skip notebook test\n", "\n", "This notebook benchmarks performance of running PageRank within cuGraph against NetworkX. NetworkX contains several implementations of PageRank. This benchmark will compare cuGraph versus the defaukt Nx implementation as well as the SciPy version\n", "\n", diff --git a/notebooks/cugraph_benchmarks/release.ipynb b/notebooks/cugraph_benchmarks/release.ipynb index d3110da3621..3c6da55abc0 100644 --- a/notebooks/cugraph_benchmarks/release.ipynb +++ b/notebooks/cugraph_benchmarks/release.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# Release Benchmarking\n", + "# Skip notebook test\n", "\n", "With every release, RAPIDS publishes a release slide deck that includes the current performance state of cuGraph. \n", "This notebook, starting with release 0.15, runs all the various algorithms to computes the performance gain. \n", diff --git a/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb b/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb index 2d040e0acaf..32b562e7a1e 100644 --- a/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb +++ b/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# SSSP Performance Benchmarking\n", + "# Skip notebook test\n", "\n", "This notebook benchmarks performance of running SSSP within cuGraph against NetworkX. \n", "\n", diff --git a/notebooks/demo/uvm.ipynb b/notebooks/demo/uvm.ipynb index d279be8ed54..8fa2b08b6d1 100644 --- a/notebooks/demo/uvm.ipynb +++ b/notebooks/demo/uvm.ipynb @@ -6,6 +6,7 @@ "source": [ "# Oversubscribing GPU memory in cuGraph\n", "#### Author : Alex Fender\n", + "# Skip notebook test\n", "\n", "In this notebook, we will show how to **scale to 4x larger graphs than before** without incurring a performance drop using managed memory features in cuGraph. We will compute the PageRank of each user in Twitter's dataset on a single GPU as an example. 
This technique applies to all features.\n", "\n", diff --git a/python/cugraph/tests/test_ecg.py b/python/cugraph/tests/test_ecg.py index 86f9ed343ce..3028c702721 100644 --- a/python/cugraph/tests/test_ecg.py +++ b/python/cugraph/tests/test_ecg.py @@ -16,32 +16,38 @@ import pytest import networkx as nx import cugraph + from cugraph.tests import utils +from cugraph.utilities.utils import is_device_version_less_than + from pathlib import PurePath def cugraph_call(G, min_weight, ensemble_size): df = cugraph.ecg(G, min_weight, ensemble_size) num_parts = df["partition"].max() + 1 - score = cugraph.analyzeClustering_modularity(G, num_parts, df, - 'vertex', 'partition') + score = cugraph.analyzeClustering_modularity( + G, num_parts, df, "vertex", "partition" + ) return score, num_parts def golden_call(graph_file): - if graph_file == PurePath(utils.RAPIDS_DATASET_ROOT_DIR)/"dolphins.csv": + if graph_file == PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / "dolphins.csv": return 0.4962422251701355 - if graph_file == PurePath(utils.RAPIDS_DATASET_ROOT_DIR)/"karate.csv": + if graph_file == PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / "karate.csv": return 0.38428664207458496 - if graph_file == PurePath(utils.RAPIDS_DATASET_ROOT_DIR)/"netscience.csv": + if ( + graph_file + == PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / "netscience.csv" + ): return 0.9279554486274719 -DATASETS = [PurePath(utils.RAPIDS_DATASET_ROOT_DIR)/f for f in [ - "karate.csv", - "dolphins.csv", - "netscience.csv"] +DATASETS = [ + PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / f + for f in ["karate.csv", "dolphins.csv", "netscience.csv"] ] MIN_WEIGHTS = [0.05, 0.10, 0.15] @@ -55,20 +61,32 @@ def golden_call(graph_file): def test_ecg_clustering(graph_file, min_weight, ensemble_size): gc.collect() - # Read in the graph and get a cugraph object - cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False) - G = cugraph.Graph() - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") + if is_device_version_less_than((7, 0)): + cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False) + G = cugraph.Graph() + G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") - # Get the modularity score for partitioning versus random assignment - cu_score, num_parts = cugraph_call(G, min_weight, ensemble_size) - golden_score = golden_call(graph_file) + # Get the modularity score for partitioning versus random assignment + with pytest.raises(RuntimeError): + cu_score, num_parts = cugraph_call(G, min_weight, ensemble_size) + else: + # Read in the graph and get a cugraph object + cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False) + G = cugraph.Graph() + G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") + + # Get the modularity score for partitioning versus random assignment + cu_score, num_parts = cugraph_call(G, min_weight, ensemble_size) + golden_score = golden_call(graph_file) - # Assert that the partitioning has better modularity than the random - # assignment - assert cu_score > (0.95 * golden_score) + # Assert that the partitioning has better modularity than the random + # assignment + assert cu_score > (0.95 * golden_score) +@pytest.mark.skipif( + is_device_version_less_than((7, 0)), reason="Not supported on Pascal" +) @pytest.mark.parametrize("graph_file", DATASETS) @pytest.mark.parametrize("min_weight", MIN_WEIGHTS) @pytest.mark.parametrize("ensemble_size", ENSEMBLE_SIZES) @@ -78,8 +96,7 @@ def test_ecg_clustering_nx(graph_file, min_weight, ensemble_size): # Read in the graph and get a 
NetworkX graph M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) G = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", - create_using=nx.Graph() + M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() ) # Get the modularity score for partitioning versus random assignment diff --git a/python/cugraph/tests/test_leiden.py b/python/cugraph/tests/test_leiden.py index d6a7f86b5c5..89203d5014c 100644 --- a/python/cugraph/tests/test_leiden.py +++ b/python/cugraph/tests/test_leiden.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -20,6 +20,8 @@ import cugraph from cugraph.tests import utils +from cugraph.utilities.utils import is_device_version_less_than + # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from # 'collections.abc' is deprecated, and in 3.8 it will stop working) for @@ -53,6 +55,9 @@ def cugraph_louvain(G, edgevals=False): return parts, mod +@pytest.mark.skipif( + is_device_version_less_than((7, 0)), reason="Not supported on Pascal" +) @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_leiden(graph_file): gc.collect() @@ -73,6 +78,9 @@ def test_leiden(graph_file): assert leiden_mod >= (0.99 * louvain_mod) +@pytest.mark.skipif( + is_device_version_less_than((7, 0)), reason="Not supported on Pascal" +) @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_leiden_nx(graph_file): gc.collect() @@ -81,18 +89,13 @@ def test_leiden_nx(graph_file): NM = utils.read_csv_for_nx(graph_file) if edgevals: - G = nx.from_pandas_edgelist(NM, - create_using=nx.Graph(), - source="0", - target="1" - ) + G = nx.from_pandas_edgelist( + NM, create_using=nx.Graph(), source="0", target="1" + ) else: - G = nx.from_pandas_edgelist(NM, - create_using=nx.Graph(), - source="0", - target="1", - edge_attr="2" - ) + G = nx.from_pandas_edgelist( + NM, create_using=nx.Graph(), source="0", target="1", edge_attr="2" + ) leiden_parts, leiden_mod = cugraph_leiden(G, edgevals=True) louvain_parts, louvain_mod = cugraph_louvain(G, edgevals=True) diff --git a/python/cugraph/tests/test_louvain.py b/python/cugraph/tests/test_louvain.py index d6b0030eb73..50e9ccaa4c5 100644 --- a/python/cugraph/tests/test_louvain.py +++ b/python/cugraph/tests/test_louvain.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -18,6 +18,7 @@ import cugraph from cugraph.tests import utils +from cugraph.utilities.utils import is_device_version_less_than # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -71,51 +72,63 @@ def networkx_call(M): def test_louvain_with_edgevals(graph_file): gc.collect() - M = utils.read_csv_for_nx(graph_file) - cu_M = utils.read_csv_file(graph_file) - cu_parts, cu_mod = cugraph_call(cu_M, edgevals=True) + if is_device_version_less_than((7, 0)): + cu_M = utils.read_csv_file(graph_file) + with pytest.raises(RuntimeError): + cu_parts, cu_mod = cugraph_call(cu_M) + else: + M = utils.read_csv_for_nx(graph_file) + cu_M = utils.read_csv_file(graph_file) + cu_parts, cu_mod = cugraph_call(cu_M, edgevals=True) - nx_parts = networkx_call(M) - # Calculating modularity scores for comparison - Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() - ) + nx_parts = networkx_call(M) + # Calculating modularity scores for comparison + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", + edge_attr="weight", create_using=nx.Graph() + ) - cu_parts = cu_parts.to_pandas() - cu_map = dict(zip(cu_parts['vertex'], cu_parts['partition'])) + cu_parts = cu_parts.to_pandas() + cu_map = dict(zip(cu_parts["vertex"], cu_parts["partition"])) - assert set(nx_parts.keys()) == set(cu_map.keys()) + assert set(nx_parts.keys()) == set(cu_map.keys()) - cu_mod_nx = community.modularity(cu_map, Gnx) - nx_mod = community.modularity(nx_parts, Gnx) + cu_mod_nx = community.modularity(cu_map, Gnx) + nx_mod = community.modularity(nx_parts, Gnx) - assert len(cu_parts) == len(nx_parts) - assert cu_mod > (0.82 * nx_mod) - assert abs(cu_mod - cu_mod_nx) < 0.0001 + assert len(cu_parts) == len(nx_parts) + assert cu_mod > (0.82 * nx_mod) + assert abs(cu_mod - cu_mod_nx) < 0.0001 @pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) def test_louvain(graph_file): gc.collect() - M = utils.read_csv_for_nx(graph_file) - cu_M = utils.read_csv_file(graph_file) - cu_parts, cu_mod = cugraph_call(cu_M) - nx_parts = networkx_call(M) + if is_device_version_less_than((7, 0)): + cu_M = utils.read_csv_file(graph_file) + with pytest.raises(RuntimeError): + cu_parts, cu_mod = cugraph_call(cu_M) + else: + M = utils.read_csv_for_nx(graph_file) + cu_M = utils.read_csv_file(graph_file) + cu_parts, cu_mod = cugraph_call(cu_M) + nx_parts = networkx_call(M) - # Calculating modularity scores for comparison - Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() - ) + # Calculating modularity scores for comparison + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", + edge_attr="weight", create_using=nx.Graph() + ) - cu_parts = cu_parts.to_pandas() - cu_map = dict(zip(cu_parts['vertex'], cu_parts['partition'])) + cu_parts = cu_parts.to_pandas() + cu_map = dict(zip(cu_parts["vertex"], cu_parts["partition"])) - assert set(nx_parts.keys()) == set(cu_map.keys()) + assert set(nx_parts.keys()) == set(cu_map.keys()) - cu_mod_nx = community.modularity(cu_map, Gnx) - nx_mod = community.modularity(nx_parts, Gnx) + cu_mod_nx = community.modularity(cu_map, Gnx) + nx_mod = community.modularity(nx_parts, Gnx) - assert len(cu_parts) == len(nx_parts) - assert cu_mod > (0.82 * nx_mod) - assert abs(cu_mod - cu_mod_nx) < 0.0001 + assert len(cu_parts) == len(nx_parts) + assert cu_mod > (0.82 * nx_mod) + assert abs(cu_mod - 
cu_mod_nx) < 0.0001