Skip to content

Commit

Permalink
Support for raw vectors (#2952)
Browse files Browse the repository at this point in the history
* support for raw vectors in mutate and summarise. #1803

* raw support for arrange, joins (minimal raw x raw), group_by. #1803

* wrong place in NEWS.md

* astyle

* update Rcpp generated code

* drop Rank specialization for RAWSXP

* not naming parameter that is not used, identified by `-Werror=unused-parameter`

* typo s/defaut/default/

* handle raw in grouped and rowwise mutate #1803

* test `Collecter_Impl::can_promote`

* test that `slice` handles raw matrix columns.
test for `MatrixColumnSubsetVisitor<RAWSXP>::subset_int`

* using ConstColumn instead of a const_cast

* tests for joins on raw column

* use poor man's map_chr instead of `purrr`'s

* move `default_value` to its own header file and use it in the `Lead` and `Lag` constructors instead of overriding them for `RAWSXP`.

* rm the `VectorVisitorImpl<RAWSXP>` specialization, and specialize the lower level `comparisons` instead.

* pleasing `-Werror=unused-parameter` once again 🙊
  • Loading branch information
Romain François authored and krlmlr committed Dec 21, 2017
1 parent 85e31dd commit c04deb3
Show file tree
Hide file tree
Showing 31 changed files with 258 additions and 87 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@
* Move build-time vs. run-time checks out of `.onLoad()` and into `dr_dplyr()`.


* Support for raw vector columns in `mutate`, `summarise`, `arrange`, `group_by`
and joins (minimal `raw` x `raw` support initially) (#1803).

# dplyr 0.7.1

* Use new versions of bindrcpp and glue to avoid protection problems.
Expand Down
42 changes: 42 additions & 0 deletions inst/include/dplyr/Collecter.h
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,46 @@ class Collecter_Impl<INTSXP> : public Collecter {

};

template <>
class Collecter_Impl<RAWSXP> : public Collecter {
public:
Collecter_Impl(int n_): data(n_, (Rbyte)0) {}

void collect(const SlicingIndex& index, SEXP v, int offset = 0) {
warn_loss_attr(v);
RawVector source(v);
Rbyte* source_ptr = source.begin() + offset;
for (int i = 0; i < index.size(); i++) {
data[index[i]] = source_ptr[i];
}
}

inline SEXP get() {
return data;
}

inline bool compatible(SEXP x) {
return TYPEOF(x) == RAWSXP ;
}

bool can_promote(SEXP x) const {
return
(TYPEOF(x) == REALSXP && !Rf_inherits(x, "POSIXct") && !Rf_inherits(x, "Date")) ||
(TYPEOF(x) == INTSXP && !Rf_inherits(x, "factor"))
;
}

std::string describe() const {
return "raw";
}

protected:
RawVector data;

};



template <int RTYPE>
class TypedCollecter : public Collecter_Impl<RTYPE> {
public:
Expand Down Expand Up @@ -619,6 +659,8 @@ inline Collecter* collecter(SEXP model, int n) {
stop("Columns of class data.frame not supported");
}
return new Collecter_Impl<VECSXP>(n);
case RAWSXP:
return new Collecter_Impl<RAWSXP>(n);
default:
break;
}
Expand Down
2 changes: 2 additions & 0 deletions inst/include/dplyr/Gatherer.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,8 @@ inline Gatherer* constant_gatherer(SEXP x, int n, const SymbolString& name) {
return new ConstantGathererImpl<CPLXSXP>(x, n);
case VECSXP:
return new ConstantGathererImpl<VECSXP>(x, n);
case RAWSXP:
return new ConstantGathererImpl<RAWSXP>(x, n);
default:
break;
}
Expand Down
27 changes: 25 additions & 2 deletions inst/include/dplyr/MatrixColumnSubsetVectorVisitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ class MatrixColumnSubsetVisitor : public SubsetVectorVisitor {

typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
typedef typename Matrix<RTYPE>::Column Column;
typedef typename Matrix<RTYPE>::ConstColumn ConstColumn;

MatrixColumnSubsetVisitor(const Matrix<RTYPE>& data_) : data(data_) {}

Expand All @@ -31,7 +32,7 @@ class MatrixColumnSubsetVisitor : public SubsetVectorVisitor {
for (int h = 0; h < nc; h++) {
ChunkIndexMap::const_iterator it = index.begin();
Column column = res.column(h);
Column source_column = const_cast<Matrix<RTYPE>&>(data).column(h);
ConstColumn source_column = data.column(h);

for (int i = 0; i < n; i++, ++it) {
column[i] = source_column[ it->first ];
Expand Down Expand Up @@ -64,7 +65,7 @@ class MatrixColumnSubsetVisitor : public SubsetVectorVisitor {
Matrix<RTYPE> res(n, nc);
for (int h = 0; h < nc; h++) {
Column column = res.column(h);
Column source_column = const_cast<Matrix<RTYPE>&>(data).column(h);
ConstColumn source_column = data.column(h);
for (int k = 0; k < n; k++) {
int idx = index[k];
if (idx < 0) {
Expand All @@ -80,6 +81,28 @@ class MatrixColumnSubsetVisitor : public SubsetVectorVisitor {
Matrix<RTYPE> data;
};

// RAWSXP has no NA concept, hence this specialisation: rows selected with a
// negative index are filled with the zero byte.
// Returns a new raw matrix with one row per element of `index` and the same
// number of columns as `data`.
template <>
template <typename Container>
inline SEXP MatrixColumnSubsetVisitor<RAWSXP>::subset_int(const Container& index) const {
  const int nrows = index.size();
  const int ncols = data.ncol();
  Matrix<RAWSXP> out(nrows, ncols);
  for (int j = 0; j < ncols; j++) {
    Column target = out.column(j);
    ConstColumn source = data.column(j);
    for (int i = 0; i < nrows; i++) {
      const int pos = index[i];
      // negative position: nothing to copy, store 0 instead
      target[i] = (pos < 0) ? static_cast<Rbyte>(0) : source[pos];
    }
  }
  return out;
}


}

#endif
4 changes: 4 additions & 0 deletions inst/include/dplyr/OrderVisitorImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,8 @@ inline OrderVisitor* order_visitor_asc_matrix(SEXP vec) {
return new OrderVisitorMatrix<STRSXP, ascending>(vec);
case DPLYR_CPLXSXP:
return new OrderVisitorMatrix<CPLXSXP, ascending>(vec);
case DPLYR_RAWSXP:
return new OrderVisitorMatrix<RAWSXP, ascending>(vec);
case DPLYR_VECSXP:
stop("Matrix can't be a list");
}
Expand All @@ -262,6 +264,8 @@ inline OrderVisitor* order_visitor_asc_vector(SEXP vec) {
return new OrderCharacterVectorVisitorImpl<ascending>(vec);
case CPLXSXP:
return new OrderVectorVisitorImpl<CPLXSXP, ascending, Vector<CPLXSXP > >(vec);
case RAWSXP:
return new OrderVectorVisitorImpl<RAWSXP, ascending, Vector<RAWSXP > >(vec);
case VECSXP:
{
if (Rf_inherits(vec, "data.frame")) {
Expand Down
4 changes: 4 additions & 0 deletions inst/include/dplyr/Result/GroupedSubset.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ inline GroupedSubset* grouped_subset(SEXP x, int max_size) {
return new GroupedSubsetTemplate<VECSXP>(x, max_size);
case CPLXSXP:
return new GroupedSubsetTemplate<CPLXSXP>(x, max_size);
case RAWSXP:
return new GroupedSubsetTemplate<RAWSXP>(x, max_size);
default:
break;
}
Expand Down Expand Up @@ -126,6 +128,8 @@ inline GroupedSubset* summarised_subset(SummarisedVariable x) {
return new SummarisedSubsetTemplate<VECSXP>(x);
case CPLXSXP:
return new SummarisedSubsetTemplate<CPLXSXP>(x);
case RAWSXP:
return new SummarisedSubsetTemplate<RAWSXP>(x);
default:
break;
}
Expand Down
2 changes: 1 addition & 1 deletion inst/include/dplyr/Result/Lag.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class Lag : public Result {
Lag(SEXP data_, int n_, const RObject& def_, bool is_summary_) :
data(data_),
n(n_),
def(Vector<RTYPE>::get_na()),
def(default_value<RTYPE>()),
is_summary(is_summary_)
{
if (!Rf_isNull(def_)) {
Expand Down
4 changes: 2 additions & 2 deletions inst/include/dplyr/Result/Lead.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#include <tools/scalar_type.h>
#include <tools/utils.h>

#include <dplyr/default_value.h>
#include <dplyr/Result/Result.h>

namespace dplyr {
Expand All @@ -16,7 +16,7 @@ class Lead : public Result {
Lead(SEXP data_, int n_, const RObject& def_, bool is_summary_) :
data(data_),
n(n_),
def(Vector<RTYPE>::get_na()),
def(default_value<RTYPE>()),
is_summary(is_summary_)
{
if (!Rf_isNull(def_)) {
Expand Down
2 changes: 1 addition & 1 deletion inst/include/dplyr/Result/Rank.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ class RankEqual {
};

// powers both dense_rank and min_rank, see dplyr.cpp for how it is used
template <int RTYPE, typename Increment, bool ascending = true>
template <int RTYPE, typename Increment, bool ascending>
class Rank_Impl : public Result, public Increment {
public:
typedef typename Increment::OutputVector OutputVector;
Expand Down
2 changes: 2 additions & 0 deletions inst/include/dplyr/Result/RowwiseSubset.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ inline RowwiseSubset* rowwise_subset(SEXP x) {
return new RowwiseSubsetTemplate<CPLXSXP>(x);
case DPLYR_VECSXP:
return new RowwiseSubsetTemplate<VECSXP>(x);
case DPLYR_RAWSXP:
return new RowwiseSubsetTemplate<RAWSXP>(x);
}

stop("Unreachable");
Expand Down
11 changes: 11 additions & 0 deletions inst/include/dplyr/SubsetVectorVisitorImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,17 @@ class SubsetVectorVisitorImpl : public SubsetVectorVisitor {

};

// RAWSXP version of subset_int_index.
// Builds a raw vector of output_size(index) elements: a negative index gives
// the byte 0, any other index copies the matching element of `vec`.
template <>
template <typename Container>
SEXP SubsetVectorVisitorImpl<RAWSXP>::subset_int_index(const Container& index) const {
  const int size = output_size(index);
  RawVector result(size);
  for (int pos = 0; pos < size; pos++) {
    if (index[pos] < 0) {
      result[pos] = static_cast<Rbyte>(0);
    } else {
      result[pos] = vec[ index[pos] ];
    }
  }
  // hand the source vector's attributes over to the result
  // (copy_most_attributes is defined elsewhere)
  copy_most_attributes(result, vec);
  return result;
}

template <>
template <typename Container>
SEXP SubsetVectorVisitorImpl<VECSXP>::subset_int_index(const Container& index) const {
Expand Down
4 changes: 3 additions & 1 deletion inst/include/dplyr/VectorVisitorImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ template <> inline std::string VectorVisitorType<CPLXSXP>() {
// Type label for list (VECSXP) visitors.
template <> inline std::string VectorVisitorType<VECSXP>() {
  return std::string("list");
}
// Type label for raw (RAWSXP) visitors.
template <> inline std::string VectorVisitorType<RAWSXP>() {
  return std::string("raw");
}

/**
* Implementations
Expand Down Expand Up @@ -133,7 +136,6 @@ class FactorVisitor : public VectorVisitorImpl<INTSXP> {
SEXP* levels_ptr;
};


template <>
class VectorVisitorImpl<STRSXP> : public VectorVisitor {
public:
Expand Down
5 changes: 4 additions & 1 deletion inst/include/dplyr/checks.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ enum SupportedType {
DPLYR_REALSXP = REALSXP,
DPLYR_CPLXSXP = CPLXSXP,
DPLYR_STRSXP = STRSXP,
DPLYR_VECSXP = VECSXP
DPLYR_VECSXP = VECSXP,
DPLYR_RAWSXP = RAWSXP
};

inline std::string type_name(SEXP x) {
Expand Down Expand Up @@ -69,6 +70,8 @@ inline SupportedType check_supported_type(SEXP x, const SymbolString& name = Str
return DPLYR_STRSXP;
case VECSXP:
return DPLYR_VECSXP;
case RAWSXP:
return DPLYR_RAWSXP;
default:
if (name.is_empty()) {
Rcpp::stop("is of unsupported type %s", type_name(x));
Expand Down
23 changes: 23 additions & 0 deletions inst/include/dplyr/comparisons.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,29 @@ struct comparisons {

};

// Comparison policy for raw vectors.
// No raw byte is treated as NA here, so each predicate is a plain comparison
// of the two bytes.
template <>
struct comparisons<RAWSXP> {
  typedef Rbyte STORAGE;

  // true when lhs is strictly smaller than rhs
  static inline bool is_less(STORAGE lhs, STORAGE rhs) {
    return rhs > lhs;
  }

  // true when lhs is strictly larger than rhs
  static inline bool is_greater(STORAGE lhs, STORAGE rhs) {
    return rhs < lhs;
  }

  // with no NA to consider, this is byte equality
  static inline bool equal_or_both_na(STORAGE lhs, STORAGE rhs) {
    return !(lhs != rhs);
  }

  // always false for raw
  static inline bool is_na(STORAGE) {
    return false;
  }

};


template <>
struct comparisons<STRSXP> {
static inline bool is_less(SEXP lhs, SEXP rhs) {
Expand Down
22 changes: 22 additions & 0 deletions inst/include/dplyr/default_value.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#ifndef dplyr_dplyr_default_value_H
#define dplyr_dplyr_default_value_H

namespace dplyr {

// Default element for a vector of type RTYPE; used to initialise `def` in
// the Lag and Lead constructors.
// Generic case: the NA value reported by Rcpp::Vector<RTYPE>.
template <int RTYPE>
inline typename Rcpp::traits::storage_type<RTYPE>::type default_value() {
  typedef Rcpp::Vector<RTYPE> VectorType;
  return VectorType::get_na();
}

// Raw vectors: the zero byte.
template <>
inline Rbyte default_value<RAWSXP>() {
  return static_cast<Rbyte>(0);
}

// Lists: NULL.
template <>
inline SEXP default_value<VECSXP>() {
  return R_NilValue;
}

}
#endif
1 change: 1 addition & 0 deletions inst/include/dplyr/dplyr.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <dplyr/DataFrameJoinVisitors.h>
#include <dplyr/Order.h>
#include <dplyr/Hybrid.h>
#include <dplyr/default_value.h>
#include <dplyr/Result/all.h>
#include <dplyr/Gatherer.h>
#include <dplyr/Replicator.h>
Expand Down
4 changes: 4 additions & 0 deletions inst/include/dplyr/subset_visitor_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ inline SubsetVectorVisitor* subset_visitor_matrix(SEXP vec) {
return new MatrixColumnSubsetVisitor<REALSXP>(vec);
case LGLSXP:
return new MatrixColumnSubsetVisitor<LGLSXP>(vec);
case RAWSXP:
return new MatrixColumnSubsetVisitor<RAWSXP>(vec);
case STRSXP:
return new MatrixColumnSubsetVisitor<STRSXP>(vec);
case VECSXP:
Expand Down Expand Up @@ -63,6 +65,8 @@ inline SubsetVectorVisitor* subset_visitor_vector(SEXP vec) {
return new SubsetVectorVisitorImpl<REALSXP>(vec);
case LGLSXP:
return new SubsetVectorVisitorImpl<LGLSXP>(vec);
case RAWSXP:
return new SubsetVectorVisitorImpl<RAWSXP>(vec);
case STRSXP:
return new SubsetVectorVisitorImpl<STRSXP>(vec);

Expand Down
2 changes: 2 additions & 0 deletions inst/include/dplyr/visitor_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ inline VectorVisitor* visitor_vector(SEXP vec) {
return new VectorVisitorImpl<LGLSXP>(vec);
case STRSXP:
return new VectorVisitorImpl<STRSXP>(vec);
case RAWSXP:
return new VectorVisitorImpl<RAWSXP>(vec);

case VECSXP: {
if (Rf_inherits(vec, "data.frame")) {
Expand Down
2 changes: 2 additions & 0 deletions inst/include/dplyr/white_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ inline bool white_list(SEXP x) {
return true;
}
switch (TYPEOF(x)) {
case RAWSXP:
return true;
case INTSXP:
return true;
case REALSXP:
Expand Down
3 changes: 0 additions & 3 deletions src/arrange.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ List arrange_impl(DataFrame data, QuosureList quosures) {

check_valid_colnames(data);
assert_all_white_list(data);

List variables(nargs);
LogicalVector ascending(nargs);

Expand Down Expand Up @@ -63,10 +62,8 @@ List arrange_impl(DataFrame data, QuosureList quosures) {
ascending[i] = !is_desc;
}
variables.names() = quosures.names();

OrderVisitors o(variables, ascending, nargs);
IntegerVector index = o.apply();

DataFrameSubsetVisitors visitors(data, data.names());
List res = visitors.subset(index, get_class(data));

Expand Down
Loading

0 comments on commit c04deb3

Please sign in to comment.