Skip to content

Commit

Permalink
speedup <DataMask>$set(name, chunks) (tidyverse#5474)
Browse files Browse the repository at this point in the history
* speedup <DataMask>$set(name, chunks)

* Also internalise <DataMask>$add()

* convert to utf8 if needed before comparing strings
  • Loading branch information
romainfrancois committed Sep 2, 2020
1 parent d9c228a commit 745c9a6
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 19 deletions.
21 changes: 2 additions & 19 deletions R/data-mask.R
Original file line number Diff line number Diff line change
Expand Up @@ -88,28 +88,11 @@ DataMask <- R6Class("DataMask",
}
}

pos <- which(names(private$resolved) == name)
is_new_column <- length(pos) == 0L

if (is_new_column) {
pos <- length(private$resolved) + 1L
used <- FALSE
} else {
used <- private$used[[pos]]
}

if (!used) {
private$used[[pos]] <- TRUE
private$which_used <- c(private$which_used, pos)
}

private$resolved[[name]] <- chunks
.Call(`dplyr_mask_add`, private, name, chunks)
},

set = function(name, chunks) {
private$resolved[[name]] <- chunks
private$used <- !map_lgl(private$resolved, is.null)
private$which_used <- which(private$used)
.Call(`dplyr_mask_set`, private, name, chunks)
},

remove = function(name) {
Expand Down
10 changes: 10 additions & 0 deletions src/dplyr.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@
#include <Rinternals.h>
#include <R_ext/Rdynload.h>

#define UTF8_MASK (1<<3)
#define ASCII_MASK (1<<6)

#define IS_ASCII(x) (LEVELS(x) & ASCII_MASK)
#define IS_UTF8(x) (LEVELS(x) & UTF8_MASK)

namespace dplyr {

struct envs {
Expand All @@ -26,6 +32,7 @@ struct symbols {
static SEXP which_used;
static SEXP dot_drop;
static SEXP abort_glue;
static SEXP used;
};

struct vectors {
Expand Down Expand Up @@ -63,6 +70,9 @@ SEXP dplyr_summarise_recycle_chunks(SEXP chunks, SEXP rows, SEXP ptypes);
SEXP dplyr_group_indices(SEXP data, SEXP s_nr);
SEXP dplyr_group_keys(SEXP group_data);

SEXP dplyr_mask_set(SEXP env_private, SEXP s_name, SEXP chunks);
SEXP dplyr_mask_add(SEXP env_private, SEXP s_name, SEXP chunks);

#define DPLYR_MASK_INIT() \
SEXP rows = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::rows)); \
R_xlen_t ngroups = XLENGTH(rows); \
Expand Down
4 changes: 4 additions & 0 deletions src/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ SEXP symbols::bindings = Rf_install("bindings");
SEXP symbols::which_used = Rf_install("which_used");
SEXP symbols::dot_drop = Rf_install(".drop");
SEXP symbols::abort_glue = Rf_install("abort_glue");
SEXP symbols::used = Rf_install("used");

SEXP vectors::classes_vctrs_list_of = get_classes_vctrs_list_of();
SEXP vectors::empty_int_vector = get_empty_int_vector();
Expand Down Expand Up @@ -82,6 +83,9 @@ static const R_CallMethodDef CallEntries[] = {
{"dplyr_group_indices", (DL_FUNC)& dplyr_group_indices, 2},
{"dplyr_group_keys", (DL_FUNC)& dplyr_group_keys, 1},

{"dplyr_mask_set", (DL_FUNC)& dplyr_mask_set, 3},
{"dplyr_mask_add", (DL_FUNC)& dplyr_mask_add, 3},

{NULL, NULL, 0}
};

Expand Down
127 changes: 127 additions & 0 deletions src/mask.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#include "dplyr.h"

SEXP as_utf8(SEXP s) {
if (!IS_UTF8(s) && !IS_ASCII(s)) {
s = Rf_mkCharCE(Rf_translateCharUTF8(s), CE_UTF8);
}
return s;
}

R_xlen_t find_first(SEXP haystack, SEXP needle) {
needle = as_utf8(needle);
R_xlen_t n = XLENGTH(haystack);
R_xlen_t i_name = 0;
for (; i_name < n; i_name++) {
if (needle == as_utf8(STRING_ELT(haystack, i_name))) break;
}

return i_name;
}

SEXP integers_append(SEXP ints, int x) {
R_xlen_t n = XLENGTH(ints);
SEXP new_ints = PROTECT(Rf_allocVector(INTSXP, n + 1));
int* p_ints = INTEGER(ints);
int* p_new_ints = INTEGER(new_ints);
for (R_xlen_t i = 0; i < n; i++) {
p_new_ints[i] = p_ints[i];
}
p_new_ints[n] = x;
UNPROTECT(1);
return new_ints;
}

SEXP dplyr_mask_add(SEXP env_private, SEXP s_name, SEXP chunks) {
SEXP name = STRING_ELT(s_name, 0);

// we assume control over these
SEXP resolved = Rf_findVarInFrame(env_private, dplyr::symbols::resolved);
SEXP names_resolved = PROTECT(Rf_getAttrib(resolved, R_NamesSymbol));
SEXP used = Rf_findVarInFrame(env_private, dplyr::symbols::used);
SEXP which_used = Rf_findVarInFrame(env_private, dplyr::symbols::which_used);

// search for position of name
R_xlen_t n = XLENGTH(names_resolved);
R_xlen_t i_name = find_first(names_resolved, name);

int* p_used = LOGICAL(used);
bool is_new_column = i_name == n;
if (is_new_column) {
SEXP new_used = PROTECT(Rf_allocVector(LGLSXP, n + 1));
SEXP new_resolved = PROTECT(Rf_allocVector(VECSXP, n + 1));
SEXP new_names_resolved = PROTECT(Rf_allocVector(STRSXP, n + 1));
int* p_new_used = LOGICAL(new_used);

for (R_xlen_t i = 0; i < n; i++) {
SET_VECTOR_ELT(new_resolved, i, VECTOR_ELT(resolved, i));
SET_STRING_ELT(new_names_resolved, i, STRING_ELT(names_resolved, i));
p_new_used[i] = p_used[i];
}
SET_VECTOR_ELT(new_resolved, n, chunks);
SET_STRING_ELT(new_names_resolved, n, name);
p_new_used[n] = TRUE;

SEXP new_which_used = PROTECT(integers_append(which_used, n + 1));

Rf_namesgets(new_resolved, new_names_resolved);
Rf_defineVar(dplyr::symbols::resolved, new_resolved, env_private);
Rf_defineVar(dplyr::symbols::used, new_used, env_private);
Rf_defineVar(dplyr::symbols::which_used, new_which_used, env_private);

UNPROTECT(4);
} else {
SET_VECTOR_ELT(resolved, i_name, chunks);
p_used[i_name] = TRUE;

SEXP new_which_used = PROTECT(integers_append(which_used, i_name + 1));
Rf_defineVar(dplyr::symbols::which_used, new_which_used, env_private);
UNPROTECT(1);
}
UNPROTECT(1); // names_resolved
return R_NilValue;
}

SEXP dplyr_mask_set(SEXP env_private, SEXP s_name, SEXP chunks) {
SEXP name = STRING_ELT(s_name, 0);

// we assume control over these
SEXP resolved = Rf_findVarInFrame(env_private, dplyr::symbols::resolved);
SEXP names_resolved = PROTECT(Rf_getAttrib(resolved, R_NamesSymbol));
SEXP used = Rf_findVarInFrame(env_private, dplyr::symbols::used);

// search for position of name
R_xlen_t n = XLENGTH(names_resolved);
R_xlen_t i_name = find_first(names_resolved, name);
UNPROTECT(1); // names_resolved

if (i_name == n && chunks == R_NilValue) {
// early return, as this is removing a resolved that wasn't
// so it does nothing
return R_NilValue;
}

// update used
LOGICAL(used)[i_name] = chunks != R_NilValue;
SET_VECTOR_ELT(resolved, i_name, chunks);

// count how many are used
int* p_used = LOGICAL(used);
R_xlen_t n_used = 0;
for (R_xlen_t i = 0; i < n; i++, ++p_used) {
n_used += *p_used;
}

// update which_used
SEXP which_used = PROTECT(Rf_allocVector(INTSXP, n_used));
int* p_which_used = INTEGER(which_used);
p_used = LOGICAL(used);
for (R_xlen_t i = 0, j = 0; i < n; i++) {
if (p_used[i]) {
p_which_used[j++] = i + 1;
}
}
Rf_defineVar(dplyr::symbols::which_used, which_used, env_private);

UNPROTECT(1); // which_used
return R_NilValue;
}

0 comments on commit 745c9a6

Please sign in to comment.