Skip to content

Commit

Permalink
Refactor evaluation with lazy vec_chop() and dynamic mask (with activ…
Browse files Browse the repository at this point in the history
…e bindings) (tidyverse#5520)

* + lazy_vec_chop()

* + dplyr_data_masks()

* stack imbalmance

* + env_resolved() to query which promise has been resolved

* install .indices for top mask and individual masks

* DataMask holds chops and masks

* use individual data masks instead of meta data mask that gets updated all the time

* filter() internals also not using metamask

* No longer using DataMask$mask

* abandon DataMask$forget()

* remove DataMask$bindings

* Abandon DataMask$which_used

* Remove DataMask$used, as it can be recalculated based on $chops and $resolved

* Abandon DataMask$resolved, instead just keep track of DataMask$all_vars and rely on chops for resolvedness

* rm  lazy_vec_chop.R, as this is only ever used inside DataMask. Make the data mask and pronouns internally

* register symbols

* use callables for as_data_pronoun() and new_data_mask() instead of R callbacks.

* top can be NULL

* back to only one mask, with active bindings instead of promises

* comment why the [] is needed

* comments about the parent envirinments of $chops and $mask in DataMask

* use VECTOR_PTR_RO() and STRING_PTR_RO()

* DPLYR_MASK_EVAL() takes single rgument

* reset all_vars on mutate(=NULL)

* NEWS bullet

* update comment

* remove warning from installChar() (tidyverse#5609)

* remove warning from installChar()
  • Loading branch information
romainfrancois committed Nov 19, 2020
1 parent 2b23229 commit 5b29134
Show file tree
Hide file tree
Showing 17 changed files with 395 additions and 228 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# dplyr (development version)

* Improved performance with many columns, with a dynamic data mask using active
bindings and lazy chops (#5017).

* `mutate()` and friends preserves row names in data frames once more (#5418).

* `group_by()` uses the ungrouped data for the implicit mutate step (#5598).
Expand Down
5 changes: 4 additions & 1 deletion R/context.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,10 @@ cur_group <- function() {
#' @rdname context
#' @export
cur_group_id <- function() {
peek_mask("cur_group_id()")$get_current_group()
# [] to get a copy because the current group is dealt with internally
# if we don't get a copy, code like this won't give correct result:
# summarise(id = cur_group_id())
peek_mask("cur_group_id()")$get_current_group()[]
}

#' @rdname context
Expand Down
159 changes: 62 additions & 97 deletions R/data-mask.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,71 +11,21 @@ DataMask <- R6Class("DataMask",
frame <- caller_env(n = 2)
local_mask(self, frame)

private$data <- data
private$caller <- caller
private$bindings <- env(empty_env())
private$keys <- group_keys(data)
private$group_vars <- group_vars(data)

# A function that returns all the chunks for a column
resolve_chunks <- if (inherits(data, "rowwise_df")) {
function(index) {
col <- .subset2(data, index)
res <- vec_chop(col, rows)
if (vec_is_list(col)) {
res <- map(res, `[[`, 1L)
}
res
}
} else if (is_grouped_df(data)) {
function(index) vec_chop(.subset2(data, index), rows)
} else {
# for ungrouped data frames, there is only one chunk that
# is made of the full column
function(index) list(.subset2(data, index))
}

private$used <- rep(FALSE, ncol(data))

names_bindings <- chr_unserialise_unicode(names2(data))
if (anyDuplicated(names_bindings)) {
abort("Can't transform a data frame with duplicate names.")
}
names(data) <- names_bindings
private$all_vars <- names_bindings
private$data <- data
private$caller <- caller

private$resolved <- set_names(vector(mode = "list", length = ncol(data)), names_bindings)

promise_fn <- function(index, chunks = resolve_chunks(index), name = names_bindings[index]) {
# resolve the chunks and hold the slice for current group
res <- .subset2(chunks, self$get_current_group())

# track - not safe to directly use `index`
self$set(name, chunks)

# return result for current slice
res
}

promises <- map(seq_len(ncol(data)), function(.x) expr(promise_fn(!!.x)))
env_bind_lazy(private$bindings, !!!set_names(promises, names_bindings))

private$mask <- new_data_mask(private$bindings)
private$mask$.data <- as_data_pronoun(private$mask)
},

forget = function(fn) {
names_bindings <- self$current_vars()
private$chops <- .Call(dplyr_lazy_vec_chop_impl, data, rows)
private$mask <- .Call(dplyr_data_masks_setup, private$chops, data, rows)

osbolete_promise_fn <- function(name) {
abort(c(
"Obsolete data mask.",
x = glue("Too late to resolve `{name}` after the end of `dplyr::{fn}()`."),
i = glue("Did you save an object that uses `{name}` lazily in a column in the `dplyr::{fn}()` expression ?")
))
}
private$keys <- group_keys(data)
private$group_vars <- group_vars(data)

promises <- map(names_bindings, function(.x) expr(osbolete_promise_fn(!!.x)))
env_unbind(private$bindings, names_bindings)
env_bind_lazy(private$bindings, !!!set_names(promises, names_bindings))
},

add = function(name, chunks) {
Expand All @@ -91,33 +41,12 @@ DataMask <- R6Class("DataMask",
.Call(`dplyr_mask_add`, private, name, chunks)
},

set = function(name, chunks) {
.Call(`dplyr_mask_set`, private, name, chunks)
},

remove = function(name) {
self$set(name, NULL)
env_unbind(private$bindings, name)
.Call(`dplyr_mask_remove`, private, name)
},

resolve = function(name) {
chunks <- self$get_resolved(name)

if (is.null(chunks)) {
column <- private$data[[name]]
chunks <- vec_chop(column, private$rows)
if (inherits(private$data, "rowwise_df") && vec_is_list(column)) {
chunks <- map(chunks, `[[`, 1)
}
self$set(name, chunks)
}

chunks
},


get_resolved = function(name) {
private$resolved[[name]]
private$chops[[name]]
},

eval_all = function(quo) {
Expand All @@ -143,7 +72,7 @@ DataMask <- R6Class("DataMask",
},

current_cols = function(vars) {
env_get_list(private$bindings, vars)
env_get_list(parent.env(private$mask), vars)
},

current_rows = function() {
Expand All @@ -155,31 +84,27 @@ DataMask <- R6Class("DataMask",
},

current_vars = function() {
names(private$resolved)
private$all_vars
},

current_non_group_vars = function() {
current_vars <- self$current_vars()
setdiff(current_vars, private$group_vars)
setdiff(self$current_vars(), private$group_vars)
},

get_current_group = function() {
# The [] is so that we get a copy, which is important for how
# current_group is dealt with internally, to avoid defining it at each
# iteration of the dplyr_mask_eval_*() loops.
private$current_group[]
parent.env(private$chops)$.current_group
},

set_current_group = function(group) {
private$current_group <- group
parent.env(private$chops)$.current_group <- group
},

full_data = function() {
private$data
},

get_used = function() {
private$used
.Call(env_resolved, private$chops, private$all_vars)
},

unused_vars = function() {
Expand Down Expand Up @@ -233,23 +158,63 @@ DataMask <- R6Class("DataMask",

across_cache_reset = function() {
private$across_cache <- list()
},

forget = function(fn) {
names_bindings <- self$current_vars()

osbolete_promise_fn <- function(name) {
abort(c(
"Obsolete data mask.",
x = glue("Too late to resolve `{name}` after the end of `dplyr::{fn}()`."),
i = glue("Did you save an object that uses `{name}` lazily in a column in the `dplyr::{fn}()` expression ?")
))
}

promises <- map(names_bindings, function(.x) expr(osbolete_promise_fn(!!.x)))
bindings <- parent.env(private$mask)
suppressWarnings({
rm(list = names_bindings, envir = bindings)
env_bind_lazy(bindings, !!!set_names(promises, names_bindings))
})
}

),

private = list(
# the input data
data = NULL,

# environment that contains lazy vec_chop()s for each input column
# and list of result chunks as they get added.
#
# The parent environment of chops has:
# - .indices: the list of indices
# - .current_group: scalar integer that identifies the current group
chops = NULL,

# dynamic data mask, with active bindings for each column
# this is an rlang data mask, as such the bindings are actually
# in the parent environment of `mask`
mask = NULL,
old_vars = character(),

# names of all the variables, this initially is names(data)
# grows (and sometimes shrinks) as new columns are added/removed
all_vars = character(),

# names of the grouping variables
group_vars = character(),
used = logical(),
resolved = list(),
which_used = integer(),

# list of indices, one integer vector per group
rows = NULL,

# data frame of keys, one row per group
keys = NULL,
bindings = NULL,
current_group = 0L,

# caller environment of the verb (summarise(), ...)
caller = NULL,

# cache for across
across_cache = list()
)
)
3 changes: 2 additions & 1 deletion R/mutate.R
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ transmute.data.frame <- function(.data, ...) {
mutate_cols <- function(.data, ...) {
mask <- DataMask$new(.data, caller_env())
on.exit(mask$forget("mutate"), add = TRUE)

rows <- mask$get_rows()

dots <- enquos(...)
Expand Down Expand Up @@ -254,7 +255,7 @@ mutate_cols <- function(.data, ...) {
if (name %in% names(new_columns)) {
# already have result and chunks
result <- new_columns[[name]]
chunks <- mask$get_resolved(name)
chunks <- mask$resolve(name)
} else if (name %in% names(.data)) {
# column from the original data
result <- .data[[name]]
Expand Down
1 change: 1 addition & 0 deletions R/slice.R
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ slice_rows <- function(.data, ...) {

mask <- DataMask$new(.data, caller_env())
on.exit(mask$forget("slice"), add = TRUE)

rows <- mask$get_rows()

quo <- quo(c(!!!dots))
Expand Down
2 changes: 1 addition & 1 deletion R/zzz.r
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
toset <- !(names(op.dplyr) %in% names(op))
if (any(toset)) options(op.dplyr[toset])

.Call(dplyr_init_library, ns_env("dplyr"))
.Call(dplyr_init_library, ns_env("dplyr"), ns_env("vctrs"), ns_env("rlang"))

invisible()
}
Expand Down
Loading

0 comments on commit 5b29134

Please sign in to comment.