update v2.0-pre #953

Merged · 65 commits · Apr 14, 2022
Changes from 1 commit
9ac1e78
Update doc URL. (#821)
csukuangfj Sep 8, 2021
bbe0ded
Support indexing 2-axes RaggedTensor, Support slicing for RaggedTenso…
pkufool Sep 14, 2021
2c28070
Prune with max_arcs in IntersectDense (#820)
pkufool Sep 14, 2021
210175c
Release v1.8
pkufool Sep 14, 2021
33a212c
Create a ragged tensor from a regular tensor. (#827)
csukuangfj Sep 15, 2021
971af7d
Trigger GitHub actions manually. (#829)
csukuangfj Sep 16, 2021
646704e
Run GitHub actions on merging. (#830)
csukuangfj Sep 16, 2021
8030001
Support printing ragged tensors in a more compact way. (#831)
csukuangfj Sep 17, 2021
d73a5b5
Add levenshtein alignment (#828)
pkufool Sep 19, 2021
f2fd997
Release v1.9
pkufool Sep 19, 2021
601d663
Support a[b[i]] where both a and b are ragged tensors. (#833)
csukuangfj Sep 25, 2021
8694fee
Display import error solution message on MacOS (#837)
pzelasko Sep 30, 2021
86e5479
Fix installation doc. (#841)
csukuangfj Oct 8, 2021
b72589c
fix typos in the install instructions (#844)
jtrmal Oct 13, 2021
6ac9795
make cmake adhere to the modernized way of finding packages outside d…
jtrmal Oct 13, 2021
2537a3f
import torch first in the smoke tests to prevent SEGFAULT (#846)
jtrmal Oct 14, 2021
cae610a
Add doc about how to install a CPU version of k2. (#850)
csukuangfj Oct 23, 2021
d061bc6
Support PyTorch 1.10. (#851)
csukuangfj Oct 24, 2021
7178d67
Fix test cases for k2.union() (#853)
csukuangfj Oct 26, 2021
e6db5dc
Fix out-of-boundary access (read). (#859)
csukuangfj Nov 2, 2021
e8c589a
Update all the example codes in the docs (#861)
luomingshuang Nov 4, 2021
fd5565d
Fix compilation errors with CUB 1.15. (#865)
csukuangfj Nov 10, 2021
bdcaaf8
Update README. (#873)
csukuangfj Nov 12, 2021
31e1307
Fix ctc graph (make aux_labels of final arcs -1) (#877)
pkufool Nov 19, 2021
12f5915
Fix LICENSE location to k2 folder (#880)
lumaku Nov 24, 2021
a0d75c8
Release v1.11. (#881)
csukuangfj Nov 29, 2021
2cb3eea
Update documentation for hash.h (#887)
danpovey Dec 5, 2021
aab2dd7
Wrap MonotonicLowerBound (#883)
pkufool Dec 14, 2021
5517b3e
Remove extra commas after 'TOPSORTED' property and fix RaggedTensor …
drawfish Dec 25, 2021
5f4cc79
Fix small typos (#896)
danpovey Jan 6, 2022
e799928
Fix k2.ragged.create_ragged_shape2 (#901)
csukuangfj Jan 13, 2022
d6323d5
Add rnnt loss (#891)
pkufool Jan 17, 2022
d3fbb1b
Use more efficient way to fix boundaries (#906)
pkufool Jan 25, 2022
9a91ec6
Release v1.12 (#907)
pkufool Jan 25, 2022
3367c7f
Change the sign of the rnnt_loss and add reduction argument (#911)
pkufool Jan 29, 2022
779a9bd
Fix building doc. (#908)
csukuangfj Jan 29, 2022
47c4b75
Fix building doc (#912)
pkufool Jan 29, 2022
cf32e2d
Support torch 1.10.x (#914)
csukuangfj Feb 8, 2022
9e7b2a9
Update INSTALL.rst (#915)
alexei-v-ivanov Feb 8, 2022
43ed450
Fix torch/cuda/python versions in the doc. (#918)
csukuangfj Feb 10, 2022
f4fefe4
Fix building for CUDA 11.6 (#917)
csukuangfj Feb 10, 2022
56edc82
Implement Unstack (#920)
pkufool Feb 20, 2022
854b792
SubsetRagged & PruneRagged (#919)
pkufool Feb 20, 2022
3cc74f1
Add Hash64 (#895)
pkufool Feb 22, 2022
0feefc7
Modified rnnt (#902)
pkufool Feb 25, 2022
2239c39
Fix Stack (#925)
wgb14 Feb 25, 2022
5ee082e
Fix 'TypeError' of rnnt_loss_pruned function. (#924)
drawfish Feb 27, 2022
36e2b8d
Support torch 1.11.0 and CUDA 11.5 (#931)
csukuangfj Mar 15, 2022
f4b4247
Implement Rnnt decoding (#926)
pkufool Mar 16, 2022
9a0d72c
fix building docs (#933)
pkufool Mar 16, 2022
6833270
Release v1.14
pkufool Mar 16, 2022
613e03d
Remove unused DiscountedCumSum. (#936)
csukuangfj Mar 17, 2022
281378f
Fix compiler warnings. (#937)
csukuangfj Mar 17, 2022
10b9423
Minor fixes for RNN-T decoding. (#938)
csukuangfj Mar 19, 2022
846c39c
Removes arcs with label 0 from the TrivialGraph. (#939)
csukuangfj Mar 29, 2022
0f65420
Implement linear_fsa_with_self_loops. (#940)
csukuangfj Mar 29, 2022
a830c60
Fix the pruning with max-states (#941)
pkufool Mar 30, 2022
8c28c86
Rnnt allow different encoder/decoder dims (#945)
danpovey Apr 3, 2022
d977865
Supporting building k2 on Windows (#946)
csukuangfj Apr 6, 2022
a4d76d2
Fix nightly windows CPU build (#948)
csukuangfj Apr 7, 2022
4fb6b88
Check the versions of PyTorch and CUDA at the import time. (#949)
csukuangfj Apr 8, 2022
9ebd757
More straightforward message when CUDA support is missing (#950)
nshmyrev Apr 11, 2022
3b83183
Implement ArrayOfRagged (#927)
LvHang Apr 12, 2022
1b29f0a
Fix precision (#951)
pkufool Apr 13, 2022
93d528a
Merge branch 'master' into v2.0
pkufool Apr 14, 2022
Implement ArrayOfRagged (#927)
* Implement ArrayOfRagged

* Fix issues and pass tests

* fix style

* change a few statements of functions and move the definition of template Array1OfRagged to header file

* add offsets test code
LvHang authored Apr 12, 2022
commit 3b83183234d0f1d8391872630551c5af7c491ed2
104 changes: 89 additions & 15 deletions k2/csrc/array_of_ragged.cu
@@ -1,5 +1,7 @@
 /**
- * Copyright 2022 Xiaomi Corporation (authors: Wei Kang)
+ * Copyright 2022 Xiaomi Corporation (authors: Daniel Povey, Wei Kang)
+ *           2022 ASLP@NWPU (authors: Hang Lyu)
 *
 * See LICENSE for clarification regarding multiple authors
 *
@@ -20,35 +22,107 @@

 namespace k2 {

-Array1OfRaggedShape::Array1OfRaggedShape(RaggedShape *src, int32_t num_srcs)
-    : num_srcs_(num_srcs) {
-  K2_CHECK_GE(num_srcs, 1);
-  K2_CHECK(src);
-  num_axes_ = src[0].NumAxes();
-  c_ = src[0].Context();
+Array1OfRaggedShape::Array1OfRaggedShape(RaggedShape *srcs, int32_t num_srcs) :
+    num_srcs_(num_srcs) {
+  K2_CHECK_GT(num_srcs, 0);
+  K2_CHECK(srcs);
+
+  // Initialize context and num_axes_.
+  c_ = srcs[0].Context();
+  num_axes_ = srcs[0].NumAxes();
+
+  // Check that they have the same num-axes and a compatible context.
+  for (int32_t i = 1; i < num_srcs_; ++i) {
+    K2_CHECK_EQ(num_axes_, srcs[i].NumAxes());
+    K2_CHECK(c_->IsCompatible(*(srcs[i].Context())));
+  }

-  row_splits_ =
-      Array2<const int32_t *>(GetCpuContext(), num_axes_ - 1, num_srcs_);
+  // Initialize row_splits_, row_ids_ and tot_sizes_.
+  //
+  // Notice: since the Data() function is a __host__ function, it cannot be
+  // called on GPU. That limits us to working on CPU, so row_splits_ and
+  // row_ids_ are populated on CPU, although the operator() of Array2 is a
+  // __host__ and __device__ function. Bear in mind, we cannot access the
+  // GPU data on CPU.
+  row_splits_ = Array2<const int32_t *>(GetCpuContext(),
+                                        num_axes_ - 1, num_srcs_);
   row_ids_ = Array2<const int32_t *>(GetCpuContext(), num_axes_ - 1, num_srcs_);

+  // Notice: no matter whether the return value of TotSize() comes from
+  // 'cached_tot_size' or from the Back() function (i.e. operator[]) of
+  // Array1, it is a CPU value.
   tot_sizes_ = Array1<int32_t>(GetCpuContext(), num_axes_, 0);

   auto row_splits_acc = row_splits_.Accessor(),
       row_ids_acc = row_ids_.Accessor();
+  // Bear in mind, when axis == 0, the TotSize() is row_splits.Dim() - 1.
+  // When 0 < axis < NumAxes(), the TotSize() is row_splits.Back().
   int32_t *tot_sizes_data = tot_sizes_.Data();

   for (int32_t i = 0; i < num_srcs_; ++i) {
-    K2_CHECK_EQ(src[i].NumAxes(), num_axes_);
-    K2_CHECK(c_->IsCompatible(*(src[i].Context())));
     for (int32_t j = 1; j < num_axes_; ++j) {
-      row_splits_acc(j - 1, i) = src[i].RowSplits(j).Data();
-      row_ids_acc(j - 1, i) = src[i].RowIds(j).Data();
-      tot_sizes_data[j] += src[i].TotSize(j);
+      row_splits_acc(j - 1, i) = srcs[i].RowSplits(j).Data();
+      row_ids_acc(j - 1, i) = srcs[i].RowIds(j).Data();
+      tot_sizes_data[j] += srcs[i].TotSize(j);
     }
-    tot_sizes_data[0] += src[i].TotSize(0);
+    tot_sizes_data[0] += srcs[i].TotSize(0);
   }

   row_splits_ = row_splits_.To(c_);
   row_ids_ = row_ids_.To(c_);
   tot_sizes_ = tot_sizes_.To(c_);
+
+
+  // Initialize meta_row_splits_.
+  // We populate this on CPU and transfer to GPU.
+  meta_row_splits_ = Array2<int32_t>(GetCpuContext(), num_axes_, num_srcs_ + 1);
+  offsets_ = Array2<int32_t>(GetCpuContext(), num_axes_ + 1, num_srcs_ + 1);
+
+  auto meta_row_splits_acc = meta_row_splits_.Accessor(),
+      offsets_acc = offsets_.Accessor();
+
+  // Initialize the 1st row of offsets_, which contains 0,1,2,...
+  for (int32_t col = 0; col <= num_srcs_; ++col) {
+    offsets_acc(0, col) = col;
+  }
+  // Initialize the 1st col of meta_row_splits_ and offsets_.
+  for (int32_t row = 0; row < num_axes_; ++row) {
+    meta_row_splits_acc(row, 0) = 0;
+    offsets_acc(row + 1, 0) = 0;
+  }
+
+  // The meta_row_splits_ is the cumulative sum of the tot-sizes of the
+  // individual arrays.
+  for (int32_t i = 0; i < num_axes_; ++i) {
+    for (int32_t j = 1; j <= num_srcs_; ++j) {
+      meta_row_splits_acc(i, j) = meta_row_splits_acc(i, j - 1) +
+                                  srcs[j - 1].TotSize(i);
+      offsets_acc(i + 1, j) = meta_row_splits_acc(i, j);
+    }
+  }
+
+  // Initialize meta_row_ids_.
+  // Elements are in [0, NumSrcs() - 1].
+  meta_row_ids_.resize(num_axes_);
+
+  for (int32_t axis = 0; axis < num_axes_; ++axis) {
+    // The length equals TotSize(axis).
+    meta_row_ids_.at(axis) = Array1<int32_t>(
+        GetCpuContext(), meta_row_splits_acc(axis, num_srcs_));
+    int32_t *meta_row_ids_data = meta_row_ids_[axis].Data();
+
+    int32_t cur_row_start = meta_row_splits_acc(axis, 0);
+    for (int32_t src = 0; src < num_srcs_; ++src) {
+      int32_t next_row_start = meta_row_splits_acc(axis, src + 1);
+      for (; cur_row_start < next_row_start; ++cur_row_start) {
+        meta_row_ids_data[cur_row_start] = src;
+      }
+    }
+    meta_row_ids_[axis] = meta_row_ids_[axis].To(c_);
+  }
+
+  meta_row_splits_ = meta_row_splits_.To(c_);
+  offsets_ = offsets_.To(c_);
 }


}  // namespace k2
119 changes: 83 additions & 36 deletions k2/csrc/array_of_ragged.h
@@ -1,5 +1,6 @@
/**
* Copyright 2022 Xiaomi Corporation (authors: Daniel Povey, Wei Kang)
* 2022 ASLP@NWPU (authors: Hang Lyu)
*
* See LICENSE for clarification regarding multiple authors
*
@@ -24,31 +25,48 @@
 #include <vector>

 #include "k2/csrc/array.h"
+#include "k2/csrc/array_ops.h"
 #include "k2/csrc/context.h"
 #include "k2/csrc/log.h"
-#include "k2/csrc/ragged_ops.h"
+#include "k2/csrc/ragged.h"

 namespace k2 {

+/*
+  Array1OfRagged<T> is a 1-dimensional array of Ragged<T>.
+  It is intended for situations where you want to do some operations on
+  arrays of ragged arrays, without explicitly concatenating them (e.g. to
+  save time). This is a fairly low-level interface, intended to
+  be used mostly by CUDA/C++ implementation code. It is a convenience
+  wrapper that saves you the trouble of creating arrays of pointers.
+ */
+
+
 /*
   Array1OfRaggedShape is a convenience class that gives you easy access
   to pointers-of-pointers for an array of ragged shapes.
 */
 class Array1OfRaggedShape {
  public:
+  // Default constructor.
+  Array1OfRaggedShape() = default;
+
   /*
     Constructor.
     Args:
-       srcs: pointers to the source shapes, a CPU pointer
-       num_srcs: the number of source shapes. All shapes must have the
-                 same NumAxes() and must be on the same device.
+      srcs: pointers to the source shapes, a CPU pointer
+      num_srcs: the number of source shapes. All shapes must have the
+                same NumAxes() and must be on the same device.

     TODO: we'll likely, later, add optional args which dictate which of
     the MetaRowSplits() and MetaRowIds() are to be pre-populated; this should
     enable us to save kernels by combining certain operations across the
     axes.
   */
-  Array1OfRaggedShape(RaggedShape *srcs, int32_t num_srcs);
-  Array1OfRaggedShape() = default;
+  Array1OfRaggedShape(RaggedShape *srcs,
+                      int32_t num_srcs);


   int32_t NumSrcs() const { return num_srcs_; }
   int32_t NumAxes() const { return num_axes_; }
@@ -63,23 +81,35 @@ class Array1OfRaggedShape {
   // Returns device-accessible vector of row-splits for a particular
   // axis, indexed by 0 <= src < num_srcs.
   const int32_t **RowSplits(int32_t axis) {
-    return row_splits_.Row(axis - 1).Data();
+    K2_CHECK_LT(static_cast<uint32_t>(axis),
+                static_cast<uint32_t>(num_axes_));
+    return row_splits_.Row(axis - 1).Data();
   }

   // Returns device-accessible array of row-ids for the individual shapes
   // indexed [axis-1][src], with 0 <= src < num_srcs. The shape of this
   // Array2 is [NumAxes() - 1][NumSrcs()].
-  const Array2<const int32_t *> *RowIds() const { return &row_ids_; }
+  const Array2<const int32_t*> *RowIds() const { return &row_ids_; }


   // Returns device-accessible vector of row-ids for a particular
   // axis, indexed by 0 <= src < num_srcs.
-  const int32_t **RowIds(int32_t axis) { return row_ids_.Row(axis - 1).Data(); }
+  const int32_t **RowIds(int32_t axis) {
+    K2_CHECK_LT(static_cast<uint32_t>(axis),
+                static_cast<uint32_t>(num_axes_));
+    return row_ids_.Row(axis - 1).Data();
+  }


   /* Return the total size on this axis, which is the sum of the TotSize() of
      the individual shapes. Requires 0 <= axis < NumAxes() and
      for axis=0 the returned value is the same as Dim0().
   */
-  int32_t TotSize(int32_t axis) const { return tot_sizes_[axis]; }
+  int32_t TotSize(int32_t axis) const {
+    K2_CHECK_LT(static_cast<uint32_t>(axis),
+                static_cast<uint32_t>(num_axes_));
+    return tot_sizes_[axis];
+  }

   // equivalent to TotSize(0).
   int32_t Dim0() const { return TotSize(0); }
@@ -88,7 +118,7 @@ class Array1OfRaggedShape {
      along the src axis, of the tot-sizes of the individual arrays.
      This Array2 is of shape [NumAxes()][NumSrcs() + 1], indexed [axis][src];
      caution, the indexing is different from RowSplits(), there is no offset.
-     Also, the meta_row_splits0 is a thing, unlike with regular row-splits
+     Also, the meta_row_splits_ is a thing, unlike with regular row-splits
      which start from 1.

      Caution: the lengths of the arrays pointed to by the elements of this
@@ -99,38 +129,47 @@ class Array1OfRaggedShape {
      to GPU, this will be faster than invoking an extra kernel in normal cases
      when the NumSrcs() is small. [Also: see GetRowInfoMulti()].
   */
-  // TODO: implement it...
-  Array2<int32_t> MetaRowSplits();
+  const Array2<int32_t> &MetaRowSplits() const { return meta_row_splits_; }

   // could POSSIBLY add this so this code could be used in functions like
   // Stack(). would be like MetaRowSplits but with an extra 1st row containing
   // 0,1,2,... We could perhaps create it with 1 extra initial row so this is
   // always convenient to output.
-  // TODO: implement it...
-  Array2<int32_t> Offsets();
+  const Array2<int32_t> &Offsets() const { return offsets_; }

   /*
-    Returns the meta-row-splits for a particular axis, with 0 <= axis <
-    NumAxes(); this is the cumulative sum of the TotSize(axis) for all of the
-    sources, with MetaRowSplits(axis).Dim() == NumSrcs() + 1.
+    Returns the meta-row-splits for a particular axis, with
+    0 <= axis < NumAxes();
+    this is the cumulative sum of the TotSize(axis) for all of the sources,
+    with MetaRowSplits(axis).Dim() == NumSrcs() + 1.

     Note: in ragged_ops.cu we refer to this as composed_row_splits
   */
-  // TODO: implement it...
-  Array1<int32_t> MetaRowSplits(int32_t axis);
+  Array1<int32_t> MetaRowSplits(int32_t axis) {
+    K2_CHECK_LT(static_cast<uint32_t>(axis),
+                static_cast<uint32_t>(num_axes_));
+    return meta_row_splits_.Row(axis);
+  }

   /* Return the device-accessible meta-row-ids, which are the row-ids
      corresponding to MetaRowSplits(); this tells us, for indexes into the
-     appended/concatenated array, which source array they belong to, i.e.
-     elements are in [0,NumSrcs()-1].
+     appended/concatenated array, which source array they belong to,
+     i.e. elements are in [0,NumSrcs()-1].

      This cannot be an Array2 because unlike the MetaRowSplits(), all the
      row-ids arrays are of different lengths.

      Note: in ragged_ops.cu we refer to this as composed_row_ids.
   */
-  // TODO: implement it...
-  Array1<int32_t *> MetaRowIds();
+  Array1<const int32_t*> MetaRowIds() {
+    Array1<const int32_t*> ans(GetCpuContext(), num_axes_);
+    const int32_t **ans_data = ans.Data();
+    for (int32_t i = 0; i < num_axes_; ++i) {
+      ans_data[i] = meta_row_ids_[i].Data();
+    }
+    ans = ans.To(c_);
+    return ans;
+  }

   /*
      Returns the meta-row-ids for a particular axis, with 0 <= axis < NumAxes();
@@ -140,18 +179,28 @@ class Array1OfRaggedShape {
      would tell us which source an idx012 with value 100 into axis 2 of
      concatenated array would come from.
   */
-  // TODO: implement it...
-  Array1<int32_t> MetaRowIds(int32_t axis);
+  const Array1<int32_t> &MetaRowIds(int32_t axis) const {
+    K2_CHECK_LT(static_cast<uint32_t>(axis),
+                static_cast<uint32_t>(num_axes_));
+    return meta_row_ids_[axis];
+  }

  private:
   ContextPtr c_;
   int32_t num_srcs_;
   int32_t num_axes_;

   Array2<const int32_t *> row_splits_;  // shape [num_axes_ - 1][num_srcs_]
   Array2<const int32_t *> row_ids_;     // shape [num_axes_ - 1][num_srcs_]
-  Array1<int32_t> tot_sizes_;  // dim num_axes_, this is on CPU
+  Array1<int32_t> tot_sizes_;  // dim num_axes_
+
+  Array2<int32_t> meta_row_splits_;  // shape [num_axes_][num_srcs_ + 1]
+  Array2<int32_t> offsets_;  // shape [num_axes_ + 1][num_srcs_ + 1]
+  std::vector<Array1<int32_t> > meta_row_ids_;  // dim num_axes_
 };



 /*
   Array1OfRagged<T> is a 1-dimensional array of Ragged<T>.
   It is intended for situations where you want to do some operations on
@@ -171,17 +220,14 @@ struct Array1OfRagged {
   int32_t NumSrcs() const { return values.Dim(); }
   ContextPtr &Context() { return shape.Context(); }

+  // Default constructor will not leave this a valid Array1OfRagged object,
+  // you shouldn't do anything with it. Both members will be initialized with
+  // default constructors.
   Array1OfRagged() = default;

-  /*
-    Constructor.
-    Args:
-      srcs: pointers to the source ragged tensors, a CPU pointer
-      num_srcs: the number of source ragged tensors. All ragged tensors must
-                have the same NumAxes() and must be on the same device.
-  */
+  // The 'srcs' should have the same number of axes.
   Array1OfRagged(Ragged<T> *srcs, int32_t num_srcs) {
-    K2_CHECK_GE(num_srcs, 1);
+    K2_CHECK_GT(num_srcs, 0);
     K2_CHECK(srcs);
     values = Array1<T *>(GetCpuContext(), num_srcs);
     T **values_data = values.Data();
@@ -195,6 +241,7 @@ struct Array1OfRagged {
   }
 };

+
 }  // namespace k2

 #endif  // K2_CSRC_ARRAY_OF_RAGGED_H_
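
What `MetaRowSplits()`/`MetaRowIds()` buy the caller is the map from an index into the notional concatenated array back to (source, index-within-source). Below is a minimal host-side sketch of that lookup — a hypothetical helper, not part of the k2 API — using binary search over the meta row-splits where a device kernel would instead read the precomputed meta row-ids array:

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Given the meta-row-splits for one axis (size num_srcs + 1), map an index
// into the notional concatenated array back to
// (source, index-within-source).
std::pair<int32_t, int32_t> ToSourceIndex(
    const std::vector<int32_t> &meta_row_splits, int32_t i) {
  // The first element > i, minus one, is the source containing position i.
  auto it = std::upper_bound(meta_row_splits.begin(),
                             meta_row_splits.end(), i);
  int32_t src = static_cast<int32_t>(it - meta_row_splits.begin()) - 1;
  return {src, i - meta_row_splits[src]};
}
```

With meta-row-splits {0, 3, 5} (two sources of tot-size 3 and 2), position 4 of the concatenated axis maps to source 1, local index 1 — the same answer `meta_row_ids_` would give in O(1) per lookup, which is why the constructor precomputes it.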