Experiments around a vctrs powered group_by() #4504

Merged
merged 30 commits on Jul 31, 2019
Changes from all commits
30 commits
b4be19c
experiment with vctrs
romainfrancois Jul 22, 2019
d61303c
initial impl of expand_groups()
romainfrancois Jul 22, 2019
bd681f8
move expand_groups C++ function to its own file
romainfrancois Jul 23, 2019
731527b
no longer need ListExpander
romainfrancois Jul 23, 2019
bc3aaa8
implementation of VectorExpander
romainfrancois Jul 23, 2019
377eeec
collecting new rows recursively in *Expander
romainfrancois Jul 23, 2019
24b231a
simplify impl of expand_groups, i.e. no use of boost::shared_ptr
romainfrancois Jul 23, 2019
dce8df6
support for add= in bunch_by()
romainfrancois Jul 25, 2019
0aba2c0
dealing with implicit NA in factors in bunch_by()
romainfrancois Jul 26, 2019
5988379
bunch_by() deals with empty factors
romainfrancois Jul 26, 2019
b2c43ff
bunch_by() warning about implicit NA in factors
romainfrancois Jul 26, 2019
2bb1611
bunch_by() returns ungrouped data when no grouping variable selected
romainfrancois Jul 26, 2019
3aac179
moving warning about implicit NA to the R side
romainfrancois Jul 26, 2019
78f5fad
bunch_by() handles list as grouping variables
romainfrancois Jul 27, 2019
e2d9c30
Reinject bunch_by() in existing function grouped_df()
romainfrancois Jul 29, 2019
66a17f9
use grouped_df() instead of grouped_df_impl()
romainfrancois Jul 29, 2019
a032b15
skipping some tests until some tibble fixes
romainfrancois Jul 29, 2019
568a3aa
- grouped_df_impl() c++ function
romainfrancois Jul 29, 2019
21124d2
using version 0.8.99.9000 in case we need to release a 0.8.4 before t…
romainfrancois Jul 30, 2019
1281049
R implementation of regroup()
romainfrancois Jul 30, 2019
edd459a
Trim old Slicer code that is no longer used because group_by() hashes…
romainfrancois Jul 30, 2019
8321217
Declare global variables (bc of %<-%).
romainfrancois Jul 30, 2019
39e63dd
using dev tibble
romainfrancois Jul 30, 2019
4c3bfcc
adapt to https://github.com/r-lib/vctrs/pull/515
romainfrancois Jul 31, 2019
5fbde9f
reverse order of remotes
romainfrancois Jul 31, 2019
3c56f69
no longer need ::: for vec_split_id()
romainfrancois Jul 31, 2019
e56febe
skip a test for now :shrug:
romainfrancois Jul 31, 2019
92682da
NEWS [ci skip]
romainfrancois Jul 31, 2019
e49db84
Using master vctrs
romainfrancois Jul 31, 2019
9399d13
Merge branch 'dev_0_9_0' into vctrs_group_by
romainfrancois Jul 31, 2019
11 changes: 8 additions & 3 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Type: Package
Package: dplyr
Title: A Grammar of Data Manipulation
Version: 0.8.3.9000
Version: 0.8.99.9000
Authors@R: c(
person("Hadley", "Wickham", , "[email protected]", c("aut", "cre"), comment = c(ORCID = "0000-0003-4757-117X")),
person("Romain", "Fran\u00e7ois", role = "aut", comment = c(ORCID = "0000-0002-2444-4226")),
@@ -25,9 +25,11 @@ Imports:
R6,
Rcpp (>= 1.0.1),
rlang (>= 0.4.0),
tibble (>= 2.0.0),
tibble (>= 2.1.3.9000),
tidyselect (>= 0.2.5),
utils
utils,
vctrs (>= 0.2.0.9000),
zeallot
Suggests:
bit64,
callr,
@@ -63,3 +65,6 @@ Encoding: UTF-8
LazyData: yes
Roxygen: list(markdown = TRUE, roclets = c("rd", "namespace", "collate"))
RoxygenNote: 6.1.1
Remotes:
tidyverse/tibble,
r-lib/vctrs
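
To try this branch locally, the development versions listed under Remotes: have to come from GitHub. A minimal sketch with the remotes package; the use of the `owner/repo#pr` shorthand for this pull request is an assumption about how a reader might install it, not part of the diff.

# install.packages("remotes")
remotes::install_github("tidyverse/tibble")      # tibble (>= 2.1.3.9000)
remotes::install_github("r-lib/vctrs")           # vctrs (>= 0.2.0.9000)
remotes::install_github("tidyverse/dplyr#4504")  # this pull request's branch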
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -455,13 +455,15 @@ export(vars)
export(with_order)
export(wrap_dbplyr_obj)
import(rlang)
import(vctrs)
importFrom(R6,R6Class)
importFrom(Rcpp,Rcpp.plugin.maker)
importFrom(Rcpp,cppFunction)
importFrom(assertthat,"on_failure<-")
importFrom(assertthat,assert_that)
importFrom(assertthat,is.flag)
importFrom(glue,glue)
importFrom(glue,glue_collapse)
importFrom(magrittr,"%>%")
importFrom(methods,is)
importFrom(pkgconfig,get_config)
@@ -505,4 +507,5 @@ importFrom(tidyselect,one_of)
importFrom(tidyselect,starts_with)
importFrom(utils,head)
importFrom(utils,tail)
importFrom(zeallot,"%<-%")
useDynLib(dplyr, .registration = TRUE)
6 changes: 5 additions & 1 deletion NEWS.md
@@ -1,4 +1,8 @@
# dplyr (development version)
# dplyr 0.9.0 (in development)

* `group_by()` uses hashing from the `vctrs` package.

# dplyr 0.8.4 (development version)

* Better performance for extracting slices of factors and ordered factors (#4501).

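A hedged sketch of the user-visible behaviour this branch aims for (not output captured from it): an implicit NA in a factor grouping column triggers a warning pointing at forcats::fct_explicit_na(), unused factor levels are kept as empty groups when .drop = FALSE, and group_by() with no grouping variables returns the data ungrouped.

library(dplyr)

df <- tibble::tibble(
  f = factor(c("a", "a", NA), levels = c("a", "b")),
  x = 1:3
)

# Implicit NA in the factor grouping column warns, suggesting
# forcats::fct_explicit_na(); .drop = FALSE keeps the unused
# level "b" as an empty group.
gdf <- group_by(df, f, .drop = FALSE)
group_data(gdf)
# expected: groups "a" (rows 1:2), "b" (no rows), NA (row 3)

# No grouping variables selected: the data comes back ungrouped.
group_by(df)
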
28 changes: 12 additions & 16 deletions R/RcppExports.R
@@ -84,6 +84,10 @@ n_distinct_multi <- function(variables, na_rm = FALSE) {
.Call(`_dplyr_n_distinct_multi`, variables, na_rm)
}

expand_groups <- function(old_groups, positions, nr) {
.Call(`_dplyr_expand_groups`, old_groups, positions, nr)
}

filter_impl <- function(df, quo) {
.Call(`_dplyr_filter_impl`, df, quo)
}
@@ -92,22 +96,6 @@ slice_impl <- function(df, quosure) {
.Call(`_dplyr_slice_impl`, df, quosure)
}

grouped_indices_grouped_df_impl <- function(gdf) {
.Call(`_dplyr_grouped_indices_grouped_df_impl`, gdf)
}

group_size_grouped_cpp <- function(gdf) {
.Call(`_dplyr_group_size_grouped_cpp`, gdf)
}

regroup <- function(grouping_data, frame) {
.Call(`_dplyr_regroup`, grouping_data, frame)
}

grouped_df_impl <- function(data, symbols, drop) {
.Call(`_dplyr_grouped_df_impl`, data, symbols, drop)
}

group_data_grouped_df <- function(data) {
.Call(`_dplyr_group_data_grouped_df`, data)
}
@@ -120,6 +108,14 @@ group_split_impl <- function(gdf, keep, frame) {
.Call(`_dplyr_group_split_impl`, gdf, keep, frame)
}

grouped_indices_grouped_df_impl <- function(gdf) {
.Call(`_dplyr_grouped_indices_grouped_df_impl`, gdf)
}

group_size_grouped_cpp <- function(gdf) {
.Call(`_dplyr_group_size_grouped_cpp`, gdf)
}

hybrids <- function() {
.Call(`_dplyr_hybrids`)
}
2 changes: 1 addition & 1 deletion R/dataframe.R
@@ -199,7 +199,7 @@ setequal.data.frame <- function(x, y, ...) {

reconstruct_set <- function(out, x) {
if (is_grouped_df(x)) {
out <- grouped_df_impl(out, group_vars(x), group_by_drop_default(x))
out <- grouped_df(out, group_vars(x), group_by_drop_default(x))
}

out
2 changes: 1 addition & 1 deletion R/dplyr.r
@@ -37,7 +37,7 @@
#' @useDynLib dplyr, .registration = TRUE
#' @import rlang
#' @importFrom assertthat assert_that is.flag on_failure<-
#' @importFrom glue glue
#' @importFrom glue glue glue_collapse
#' @importFrom Rcpp cppFunction Rcpp.plugin.maker
#' @importFrom stats setNames update
#' @importFrom utils head tail
99 changes: 91 additions & 8 deletions R/grouped-df.r
@@ -1,3 +1,81 @@
utils::globalVariables(c("old_rows", ".rows", "new_indices", "new_rows"))

make_grouped_df_groups_attribute <- function(data, vars, drop = FALSE) {
data <- as_tibble(data)

assert_that(
(is.list(vars) && all(sapply(vars, is.name))) || is.character(vars)
)
if (is.list(vars)) {
vars <- deparse_names(vars)
}

unknown <- setdiff(vars, tbl_vars(data))
if (n_unknown <- length(unknown)) {
if (n_unknown == 1) {
abort(glue("Column `{unknown}` is unknown"))
} else {
abort(glue("Columns `{unknown}` are unknown", unknown = glue_collapse(unknown, sep = ", ")))
}
}

# Only train the dictionary based on selected columns
grouping_variables <- select(ungroup(data), one_of(vars))
c(old_keys, old_rows) %<-% vec_split_id(grouping_variables)

# Keys and associated rows, in order
orders <- vec_order(old_keys)
old_keys <- vec_slice(old_keys, orders)
old_rows <- old_rows[orders]

map2(old_keys, names(old_keys), function(x, n) {
if (is.factor(x) && anyNA(x)) {
warn(glue("Factor `{n}` contains implicit NA, consider using `forcats::fct_explicit_na`"))
}
})

groups <- tibble(!!!old_keys, .rows := old_rows)

if (!isTRUE(drop) && any(map_lgl(old_keys, is.factor))) {
# Extra work is needed to auto expand empty groups

uniques <- map(old_keys, function(.) {
if (is.factor(.)) . else vec_unique(.)
})

# Internally we only work with integers
#
# so for any grouping column that is not a factor
# we need to match the values to the unique values
positions <- map2(old_keys, uniques, function(.x, .y) {
if (is.factor(.x)) .x else vec_match(.x, .y)
})

# Expand groups internally adds empty groups recursively
# we get back:
# - indices: a list of how to vec_slice the current keys
# to get the new keys
#
# - rows: the new list of rows (i.e. the same as old rows,
# but with some extra empty integer(0) added for empty groups)
c(new_indices, new_rows) %<-% expand_groups(groups, positions, vec_size(old_keys))

# Make the new keys from the old keys and the new_indices
new_keys <- pmap(list(old_keys, new_indices, uniques), function(key, index, unique) {
if (is.factor(key)) {
new_factor(index, levels = levels(key))
} else {
vec_slice(unique, index)
}
})
names(new_keys) <- names(grouping_variables)

groups <- tibble(!!!new_keys, .rows := new_rows)
}

structure(groups, .drop = drop)
}

#' A grouped data frame.
#'
#' The easiest way to create a grouped data frame is to call the `group_by()`
@@ -8,16 +86,21 @@
#' @param data a tbl or data frame.
#' @param vars a character vector or a list of [name()]
#' @param drop When `.drop = TRUE`, empty groups are dropped.
#'
#' @import vctrs
#' @importFrom zeallot %<-%
#'
#' @export
grouped_df <- function(data, vars, drop = FALSE) {
assert_that(
is.data.frame(data),
(is.list(vars) && all(sapply(vars, is.name))) || is.character(vars)
)
if (is.list(vars)) {
vars <- deparse_names(vars)
if (!length(vars)) {
return(as_tibble(data))
}
grouped_df_impl(data, unname(vars), drop)

# structure the grouped data
new_grouped_df(
data,
groups = make_grouped_df_groups_attribute(data, vars, drop = drop)
)
}

#' Low-level construction and validation for the grouped_df class
@@ -357,7 +440,7 @@ distinct.grouped_df <- function(.data, ..., .keep_all = FALSE) {
)
vars <- match_vars(dist$vars, dist$data)
keep <- match_vars(dist$keep, dist$data)
out <- distinct_impl(dist$data, vars, keep, environment())
out <- as_tibble(distinct_impl(dist$data, vars, keep, environment()))
grouped_df(out, groups(.data), group_by_drop_default(.data))
}
#' @export
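The core of make_grouped_df_groups_attribute() can be reproduced with vctrs directly. A minimal sketch, using the released vctrs::vec_group_loc() as a stand-in for the dev-only vec_split_id() called above (both return, for each distinct key, the locations of its rows); the example data is made up.

library(vctrs)

grouping_variables <- data.frame(g = c("b", "a", "b"), stringsAsFactors = FALSE)

# One entry per distinct key, with the row locations for each key
# (vec_group_loc() stands in for vec_split_id() here)
split <- vec_group_loc(grouping_variables)
old_keys <- split$key
old_rows <- split$loc

# Keys and associated rows, in order
o <- vec_order(old_keys)
old_keys <- vec_slice(old_keys, o)
old_rows <- old_rows[o]

old_keys  # one-column data frame of sorted keys: "a", "b"
old_rows  # list(2L, c(1L, 3L))
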
34 changes: 29 additions & 5 deletions R/tbl-df.r
@@ -44,6 +44,30 @@ arrange_.tbl_df <- function(.data, ..., .dots = list(), .by_group = FALSE) {
arrange_impl(.data, dots, environment())
}

regroup <- function(data) {
# only keep the non empty groups
non_empty <- map_lgl(group_rows(data), function(.x) length(.x) > 0)
gdata <- filter(group_data(data), non_empty)

# then group the grouping data to get expansion if needed
gdata <- grouped_df(gdata, head(names(gdata), -1L), isTRUE(attr(group_data(data), ".drop")))
new_groups <- group_data(gdata)
old_rows <- gdata$.rows

new_rows <- map(new_groups$.rows, function(.x) {
if (length(.x) == 1L) {
old_rows[[.x]]
} else {
integer()
}
})
new_groups$.rows <- new_rows

attr(data, "groups") <- new_groups
data
}


#' @export
filter.tbl_df <- function(.data, ..., .preserve = FALSE) {
dots <- enquos(...)
@@ -57,7 +81,7 @@ filter.tbl_df <- function(.data, ..., .preserve = FALSE) {
quo <- all_exprs(!!!dots, .vectorised = TRUE)
out <- filter_impl(.data, quo)
if (!.preserve && is_grouped_df(.data)) {
attr(out, "groups") <- regroup(attr(out, "groups"), environment())
out <- regroup(out)
}
out
}
@@ -77,7 +101,7 @@ slice.tbl_df <- function(.data, ..., .preserve = FALSE) {
quo <- quo(c(!!!dots))
out <- slice_impl(.data, quo)
if (!.preserve && is_grouped_df(.data)) {
attr(out, "groups") <- regroup(attr(out, "groups"), environment())
out <- regroup(out)
}
out
}
@@ -287,7 +311,7 @@ semi_join.tbl_df <- function(x, y, by = NULL, copy = FALSE, ...,
y <- auto_copy(x, y, copy = copy)
out <- semi_join_impl(x, y, by$x, by$y, check_na_matches(na_matches), environment())
if (is_grouped_df(x)) {
out <- grouped_df_impl(out, group_vars(x), group_by_drop_default(x))
out <- grouped_df(out, group_vars(x), group_by_drop_default(x))
}
out
}
@@ -303,7 +327,7 @@ anti_join.tbl_df <- function(x, y, by = NULL, copy = FALSE, ...,
y <- auto_copy(x, y, copy = copy)
out <- anti_join_impl(x, y, by$x, by$y, check_na_matches(na_matches), environment())
if (is_grouped_df(x)) {
out <- grouped_df_impl(out, group_vars(x), group_by_drop_default(x))
out <- grouped_df(out, group_vars(x), group_by_drop_default(x))
}
out
}
@@ -312,7 +336,7 @@ reconstruct_join <- function(out, x, vars) {
if (is_grouped_df(x)) {
groups_in_old <- match(group_vars(x), tbl_vars(x))
groups_in_alias <- match(groups_in_old, vars$x)
out <- grouped_df_impl(out, vars$alias[groups_in_alias], group_by_drop_default(x))
out <- grouped_df(out, vars$alias[groups_in_alias], group_by_drop_default(x))
}
out
}
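A short sketch of what the new R-level regroup() is for: with .preserve = FALSE (the default), filter() and slice() recompute the grouping structure of the result, while .preserve = TRUE keeps the original groups, possibly now empty. Indicative only, not output from this branch.

library(dplyr)

gdf <- tibble::tibble(g = c("a", "a", "b"), x = 1:3) %>% group_by(g)

# Default: grouping is recomputed, so group "b" (all of its rows
# were filtered out) disappears from the grouping metadata.
filter(gdf, x < 3) %>% group_data()

# .preserve = TRUE keeps the original structure: group "b" remains,
# with an empty .rows entry.
filter(gdf, x < 3, .preserve = TRUE) %>% group_data()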