Experiments around a vctrs powered group_by() #4504

Merged
merged 30 commits on Jul 31, 2019
Changes from all commits
30 commits
b4be19c
experiment with vctrs
romainfrancois Jul 22, 2019
d61303c
initial impl of expand_groups()
romainfrancois Jul 22, 2019
bd681f8
move expand_groups C++ function to its own file
romainfrancois Jul 23, 2019
731527b
no longer need ListExpander
romainfrancois Jul 23, 2019
bc3aaa8
implementation of VectorExpander
romainfrancois Jul 23, 2019
377eeec
collecting new rows recursively in *Expander
romainfrancois Jul 23, 2019
24b231a
simplify impl of expand_groups, i.e. no use of boost::shared_ptr
romainfrancois Jul 23, 2019
dce8df6
support for add= in bunch_by()
romainfrancois Jul 25, 2019
0aba2c0
dealing with implicit NA in factors in bunch_by()
romainfrancois Jul 26, 2019
5988379
bunch_by() deals with empty factors
romainfrancois Jul 26, 2019
b2c43ff
bunch_by() warning about implicit NA in factors
romainfrancois Jul 26, 2019
2bb1611
bunch_by() returns ungrouped data when no grouping variable selected
romainfrancois Jul 26, 2019
3aac179
moving warning about implicit NA to the R side
romainfrancois Jul 26, 2019
78f5fad
bunch_by() handles list as grouping variables
romainfrancois Jul 27, 2019
e2d9c30
Reinject bunch_by() in existing function grouped_df()
romainfrancois Jul 29, 2019
66a17f9
use grouped_df() instead of grouped_df_impl()
romainfrancois Jul 29, 2019
a032b15
skipping some tests until some tibble fixes
romainfrancois Jul 29, 2019
568a3aa
- grouped_df_impl() c++ function
romainfrancois Jul 29, 2019
21124d2
using version 0.8.99.9000 in case we need to release a 0.8.4 before t…
romainfrancois Jul 30, 2019
1281049
R implementation of regroup()
romainfrancois Jul 30, 2019
edd459a
Trim old Slicer code that is no longer used because group_by() hashes…
romainfrancois Jul 30, 2019
8321217
Declare global variables (bc of %<-%).
romainfrancois Jul 30, 2019
39e63dd
using dev tibble
romainfrancois Jul 30, 2019
4c3bfcc
adapt to https://github.com/r-lib/vctrs/pull/515
romainfrancois Jul 31, 2019
5fbde9f
reverse order of remotes
romainfrancois Jul 31, 2019
3c56f69
no longer need ::: for vec_split_id()
romainfrancois Jul 31, 2019
e56febe
skip a test for now :shrug:
romainfrancois Jul 31, 2019
92682da
NEWS [ci skip]
romainfrancois Jul 31, 2019
e49db84
Using master vctrs
romainfrancois Jul 31, 2019
9399d13
Merge branch 'dev_0_9_0' into vctrs_group_by
romainfrancois Jul 31, 2019
11 changes: 8 additions & 3 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Type: Package
Package: dplyr
Title: A Grammar of Data Manipulation
Version: 0.8.3.9000
Version: 0.8.99.9000
Authors@R: c(
person("Hadley", "Wickham", , "[email protected]", c("aut", "cre"), comment = c(ORCID = "0000-0003-4757-117X")),
person("Romain", "Fran\u00e7ois", role = "aut", comment = c(ORCID = "0000-0002-2444-4226")),
@@ -25,9 +25,11 @@ Imports:
R6,
Rcpp (>= 1.0.1),
rlang (>= 0.4.0),
tibble (>= 2.0.0),
tibble (>= 2.1.3.9000),
tidyselect (>= 0.2.5),
utils
utils,
vctrs (>= 0.2.0.9000),
zeallot
Suggests:
bit64,
callr,
@@ -63,3 +65,6 @@ Encoding: UTF-8
LazyData: yes
Roxygen: list(markdown = TRUE, roclets = c("rd", "namespace", "collate"))
RoxygenNote: 6.1.1
Remotes:
tidyverse/tibble,
r-lib/vctrs
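
To try this branch locally, the development versions listed under Remotes: have to come from GitHub. A minimal sketch with the remotes package; the use of the `owner/repo#pr` shorthand for this pull request is an assumption about how a reader might install it, not part of the diff.

# install.packages("remotes")
remotes::install_github("tidyverse/tibble")      # tibble (>= 2.1.3.9000)
remotes::install_github("r-lib/vctrs")           # vctrs (>= 0.2.0.9000)
remotes::install_github("tidyverse/dplyr#4504")  # this pull request's branch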
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -455,13 +455,15 @@ export(vars)
export(with_order)
export(wrap_dbplyr_obj)
import(rlang)
import(vctrs)
importFrom(R6,R6Class)
importFrom(Rcpp,Rcpp.plugin.maker)
importFrom(Rcpp,cppFunction)
importFrom(assertthat,"on_failure<-")
importFrom(assertthat,assert_that)
importFrom(assertthat,is.flag)
importFrom(glue,glue)
importFrom(glue,glue_collapse)
importFrom(magrittr,"%>%")
importFrom(methods,is)
importFrom(pkgconfig,get_config)
@@ -505,4 +507,5 @@ importFrom(tidyselect,one_of)
importFrom(tidyselect,starts_with)
importFrom(utils,head)
importFrom(utils,tail)
importFrom(zeallot,"%<-%")
useDynLib(dplyr, .registration = TRUE)
6 changes: 5 additions & 1 deletion NEWS.md
@@ -1,4 +1,8 @@
# dplyr (development version)
# dplyr 0.9.0 (in development)

* `group_by()` uses hashing from the `vctrs` package.

# dplyr 0.8.4 (development version)

* Better performance for extracting slices of factors and ordered factors (#4501).

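A hedged sketch of the user-visible behaviour this branch aims for (not output captured from it): an implicit NA in a factor grouping column triggers a warning pointing at forcats::fct_explicit_na(), unused factor levels are kept as empty groups when .drop = FALSE, and group_by() with no grouping variables returns the data ungrouped.

library(dplyr)

df <- tibble::tibble(
  f = factor(c("a", "a", NA), levels = c("a", "b")),
  x = 1:3
)

# Implicit NA in the factor grouping column warns, suggesting
# forcats::fct_explicit_na(); .drop = FALSE keeps the unused
# level "b" as an empty group.
gdf <- group_by(df, f, .drop = FALSE)
group_data(gdf)
# expected: groups "a" (rows 1:2), "b" (no rows), NA (row 3)

# No grouping variables selected: the data comes back ungrouped.
group_by(df)
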
28 changes: 12 additions & 16 deletions R/RcppExports.R
@@ -84,6 +84,10 @@ n_distinct_multi <- function(variables, na_rm = FALSE) {
.Call(`_dplyr_n_distinct_multi`, variables, na_rm)
}

expand_groups <- function(old_groups, positions, nr) {
.Call(`_dplyr_expand_groups`, old_groups, positions, nr)
}

filter_impl <- function(df, quo) {
.Call(`_dplyr_filter_impl`, df, quo)
}
@@ -92,22 +96,6 @@ slice_impl <- function(df, quosure) {
.Call(`_dplyr_slice_impl`, df, quosure)
}

grouped_indices_grouped_df_impl <- function(gdf) {
.Call(`_dplyr_grouped_indices_grouped_df_impl`, gdf)
}

group_size_grouped_cpp <- function(gdf) {
.Call(`_dplyr_group_size_grouped_cpp`, gdf)
}

regroup <- function(grouping_data, frame) {
.Call(`_dplyr_regroup`, grouping_data, frame)
}

grouped_df_impl <- function(data, symbols, drop) {
.Call(`_dplyr_grouped_df_impl`, data, symbols, drop)
}

group_data_grouped_df <- function(data) {
.Call(`_dplyr_group_data_grouped_df`, data)
}
@@ -120,6 +108,14 @@ group_split_impl <- function(gdf, keep, frame) {
.Call(`_dplyr_group_split_impl`, gdf, keep, frame)
}

grouped_indices_grouped_df_impl <- function(gdf) {
.Call(`_dplyr_grouped_indices_grouped_df_impl`, gdf)
}

group_size_grouped_cpp <- function(gdf) {
.Call(`_dplyr_group_size_grouped_cpp`, gdf)
}

hybrids <- function() {
.Call(`_dplyr_hybrids`)
}
2 changes: 1 addition & 1 deletion R/dataframe.R
@@ -199,7 +199,7 @@ setequal.data.frame <- function(x, y, ...) {

reconstruct_set <- function(out, x) {
if (is_grouped_df(x)) {
out <- grouped_df_impl(out, group_vars(x), group_by_drop_default(x))
out <- grouped_df(out, group_vars(x), group_by_drop_default(x))
}

out
2 changes: 1 addition & 1 deletion R/dplyr.r
@@ -37,7 +37,7 @@
#' @useDynLib dplyr, .registration = TRUE
#' @import rlang
#' @importFrom assertthat assert_that is.flag on_failure<-
#' @importFrom glue glue
#' @importFrom glue glue glue_collapse
#' @importFrom Rcpp cppFunction Rcpp.plugin.maker
#' @importFrom stats setNames update
#' @importFrom utils head tail
99 changes: 91 additions & 8 deletions R/grouped-df.r
@@ -1,3 +1,81 @@
utils::globalVariables(c("old_rows", ".rows", "new_indices", "new_rows"))

make_grouped_df_groups_attribute <- function(data, vars, drop = FALSE) {
data <- as_tibble(data)

assert_that(
(is.list(vars) && all(sapply(vars, is.name))) || is.character(vars)
)
if (is.list(vars)) {
vars <- deparse_names(vars)
}

unknown <- setdiff(vars, tbl_vars(data))
if (n_unknown <- length(unknown)) {
if (n_unknown == 1) {
abort(glue("Column `{unknown}` is unknown"))
} else {
abort(glue("Columns `{unknown}` are unknown", unknown = glue_collapse(unknown, sep = ", ")))
}
}

# Only train the dictionary based on selected columns
grouping_variables <- select(ungroup(data), one_of(vars))
c(old_keys, old_rows) %<-% vec_split_id(grouping_variables)

# Keys and associated rows, in order
orders <- vec_order(old_keys)
old_keys <- vec_slice(old_keys, orders)
old_rows <- old_rows[orders]

map2(old_keys, names(old_keys), function(x, n) {
if (is.factor(x) && anyNA(x)) {
warn(glue("Factor `{n}` contains implicit NA, consider using `forcats::fct_explicit_na`"))
}
})

groups <- tibble(!!!old_keys, .rows := old_rows)

if (!isTRUE(drop) && any(map_lgl(old_keys, is.factor))) {
# Extra work is needed to auto expand empty groups

uniques <- map(old_keys, function(.) {
if (is.factor(.)) . else vec_unique(.)
})

# Internally we only work with integers
#
# so for any grouping column that is not a factor
# we need to match the values to the unique values
positions <- map2(old_keys, uniques, function(.x, .y) {
if (is.factor(.x)) .x else vec_match(.x, .y)
})

# Expand groups internally adds empty groups recursively
# we get back:
# - indices: a list of how to vec_slice the current keys
# to get the new keys
#
# - rows: the new list of rows (i.e. the same as old rows,
# but with some extra empty integer(0) added for empty groups)
c(new_indices, new_rows) %<-% expand_groups(groups, positions, vec_size(old_keys))

# Make the new keys from the old keys and the new_indices
new_keys <- pmap(list(old_keys, new_indices, uniques), function(key, index, unique) {
if (is.factor(key)) {
new_factor(index, levels = levels(key))
} else {
vec_slice(unique, index)
}
})
names(new_keys) <- names(grouping_variables)

groups <- tibble(!!!new_keys, .rows := new_rows)
}

structure(groups, .drop = drop)
}

#' A grouped data frame.
#'
#' The easiest way to create a grouped data frame is to call the `group_by()`
@@ -8,16 +86,21 @@
#' @param data a tbl or data frame.
#' @param vars a character vector or a list of [name()]
#' @param drop When `.drop = TRUE`, empty groups are dropped.
#'
#' @import vctrs
#' @importFrom zeallot %<-%
#'
#' @export
grouped_df <- function(data, vars, drop = FALSE) {
assert_that(
is.data.frame(data),
(is.list(vars) && all(sapply(vars, is.name))) || is.character(vars)
)
if (is.list(vars)) {
vars <- deparse_names(vars)
if (!length(vars)) {
return(as_tibble(data))
}
grouped_df_impl(data, unname(vars), drop)

# structure the grouped data
new_grouped_df(
data,
groups = make_grouped_df_groups_attribute(data, vars, drop = drop)
)
}

#' Low-level construction and validation for the grouped_df class
@@ -357,7 +440,7 @@ distinct.grouped_df <- function(.data, ..., .keep_all = FALSE) {
)
vars <- match_vars(dist$vars, dist$data)
keep <- match_vars(dist$keep, dist$data)
out <- distinct_impl(dist$data, vars, keep, environment())
out <- as_tibble(distinct_impl(dist$data, vars, keep, environment()))
grouped_df(out, groups(.data), group_by_drop_default(.data))
}
#' @export
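The core of make_grouped_df_groups_attribute() can be reproduced with vctrs directly. A minimal sketch, using the released vctrs::vec_group_loc() as a stand-in for the dev-only vec_split_id() called above (both return, for each distinct key, the locations of its rows); the example data is made up.

library(vctrs)

grouping_variables <- data.frame(g = c("b", "a", "b"), stringsAsFactors = FALSE)

# One entry per distinct key, with the row locations for each key
# (vec_group_loc() stands in for vec_split_id() here)
split <- vec_group_loc(grouping_variables)
old_keys <- split$key
old_rows <- split$loc

# Keys and associated rows, in order
o <- vec_order(old_keys)
old_keys <- vec_slice(old_keys, o)
old_rows <- old_rows[o]

old_keys  # one-column data frame of sorted keys: "a", "b"
old_rows  # list(2L, c(1L, 3L))
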
34 changes: 29 additions & 5 deletions R/tbl-df.r
@@ -44,6 +44,30 @@ arrange_.tbl_df <- function(.data, ..., .dots = list(), .by_group = FALSE) {
arrange_impl(.data, dots, environment())
}

regroup <- function(data) {
# only keep the non empty groups
non_empty <- map_lgl(group_rows(data), function(.x) length(.x) > 0)
gdata <- filter(group_data(data), non_empty)

# then group the grouping data to get expansion if needed
gdata <- grouped_df(gdata, head(names(gdata), -1L), isTRUE(attr(group_data(data), ".drop")))
new_groups <- group_data(gdata)
old_rows <- gdata$.rows

new_rows <- map(new_groups$.rows, function(.x) {
if (length(.x) == 1L) {
old_rows[[.x]]
} else {
integer()
}
})
new_groups$.rows <- new_rows

attr(data, "groups") <- new_groups
data
}


#' @export
filter.tbl_df <- function(.data, ..., .preserve = FALSE) {
dots <- enquos(...)
@@ -57,7 +81,7 @@ filter.tbl_df <- function(.data, ..., .preserve = FALSE) {
quo <- all_exprs(!!!dots, .vectorised = TRUE)
out <- filter_impl(.data, quo)
if (!.preserve && is_grouped_df(.data)) {
attr(out, "groups") <- regroup(attr(out, "groups"), environment())
out <- regroup(out)
}
out
}
@@ -77,7 +101,7 @@ slice.tbl_df <- function(.data, ..., .preserve = FALSE) {
quo <- quo(c(!!!dots))
out <- slice_impl(.data, quo)
if (!.preserve && is_grouped_df(.data)) {
attr(out, "groups") <- regroup(attr(out, "groups"), environment())
out <- regroup(out)
}
out
}
@@ -287,7 +311,7 @@ semi_join.tbl_df <- function(x, y, by = NULL, copy = FALSE, ...,
y <- auto_copy(x, y, copy = copy)
out <- semi_join_impl(x, y, by$x, by$y, check_na_matches(na_matches), environment())
if (is_grouped_df(x)) {
out <- grouped_df_impl(out, group_vars(x), group_by_drop_default(x))
out <- grouped_df(out, group_vars(x), group_by_drop_default(x))
}
out
}
@@ -303,7 +327,7 @@ anti_join.tbl_df <- function(x, y, by = NULL, copy = FALSE, ...,
y <- auto_copy(x, y, copy = copy)
out <- anti_join_impl(x, y, by$x, by$y, check_na_matches(na_matches), environment())
if (is_grouped_df(x)) {
out <- grouped_df_impl(out, group_vars(x), group_by_drop_default(x))
out <- grouped_df(out, group_vars(x), group_by_drop_default(x))
}
out
}
@@ -312,7 +336,7 @@ reconstruct_join <- function(out, x, vars) {
if (is_grouped_df(x)) {
groups_in_old <- match(group_vars(x), tbl_vars(x))
groups_in_alias <- match(groups_in_old, vars$x)
out <- grouped_df_impl(out, vars$alias[groups_in_alias], group_by_drop_default(x))
out <- grouped_df(out, vars$alias[groups_in_alias], group_by_drop_default(x))
}
out
}
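A short sketch of what the new R-level regroup() is for: with .preserve = FALSE (the default), filter() and slice() recompute the grouping structure of the result, while .preserve = TRUE keeps the original groups, possibly now empty. Indicative only, not output from this branch.

library(dplyr)

gdf <- tibble::tibble(g = c("a", "a", "b"), x = 1:3) %>% group_by(g)

# Default: grouping is recomputed, so group "b" (all of its rows
# were filtered out) disappears from the grouping metadata.
filter(gdf, x < 3) %>% group_data()

# .preserve = TRUE keeps the original structure: group "b" remains,
# with an empty .rows entry.
filter(gdf, x < 3, .preserve = TRUE) %>% group_data()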