Many new windowing functions

dracomas · Nov 22, 2013 · c89dc9f · c89dc9f
1 parent e991ef4
commit c89dc9f
Show file tree

Hide file tree

Showing 25 changed files with 715 additions and 27 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -59,6 +59,7 @@ Collate:
     'join-dt.r'
     'join-sql.r'
     'join.r'
+    'lead-lag.R'
     'manip-cpp.r'
     'manip-cube.r'
     'manip-df.r'
@@ -67,9 +68,13 @@ Collate:
     'manip-grouped-dt.r'
     'manip-sql.r'
     'manip.r'
+    'nth-value.R'
+    'order-by.R'
+    'over.R'
     'partial-eval.r'
     'query.r'
     'query-bq.r'
+    'rank.R'
     'rbind.r'
     'sets.r'
     'sql-escape.r'
@@ -97,5 +102,4 @@ Collate:
     'utils-format.r'
     'utils.r'
     'view.r'
-    'window.R'
     'zzz.r'
diff --git a/NAMESPACE b/NAMESPACE
@@ -269,6 +269,11 @@ export(collect)
 export(compare_tbls)
 export(compute)
 export(copy_to)
+export(cumall)
+export(cumany)
+export(cume_dist)
+export(cummean)
+export(dense_rank)
 export(desc)
 export(dim_desc)
 export(do)
@@ -278,6 +283,7 @@ export(explain_sql)
 export(explain_tbl)
 export(failwith)
 export(filter)
+export(first_value)
 export(group_by)
 export(group_size)
 export(grouped_dt)
@@ -305,13 +311,20 @@ export(lahman_mysql)
 export(lahman_postgres)
 export(lahman_sqlite)
 export(lahman_srcs)
+export(last_value)
 export(lead)
 export(left_join)
 export(make_tbl)
+export(min_rank)
 export(mutate)
 export(n)
+export(nth_value)
+export(ntile)
+export(order_by)
 export(partial_eval)
+export(percent_rank)
 export(query)
+export(row_number)
 export(same_src)
 export(select)
 export(select_eval)
@@ -354,6 +367,7 @@ export(ungroup)
 export(union)
 export(var_eval)
 export(var_index)
+export(with_order)
 exportClasses(Query)
 import(assertthat)
 importFrom(Rcpp,cppFunction)

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -93,3 +93,28 @@ rbind_all <- function(dots) {
     .Call('dplyr_rbind_all', PACKAGE = 'dplyr', dots)
 }
 
+#' Cumulativate versions of any, all, and mean
+#' 
+#' dplyr adds \code{cumall}, \code{cumany}, and \code{cummean} to complete
+#' R's set of cumulate functions to match the aggregation functions available
+#' in most databases
+#' 
+#' @param x For \code{cumall} & \code{cumany}, a logical vector; for
+#'   \code{cummean} an integer or numeric vector
+#' @export
+cumall <- function(x) {
+    .Call('dplyr_cumall', PACKAGE = 'dplyr', x)
+}
+
+#' @export
+#' @rdname cumall
+cumany <- function(x) {
+    .Call('dplyr_cumany', PACKAGE = 'dplyr', x)
+}
+
+#' @export
+#' @rdname cumall
+cummean <- function(x) {
+    .Call('dplyr_cummean', PACKAGE = 'dplyr', x)
+}
+
diff --git a/R/chain.r b/R/chain.r
@@ -53,7 +53,7 @@
 #'     arr = mean(ArrDelay, na.rm = TRUE), 
 #'     dep = mean(DepDelay, na.rm = TRUE)
 #'   ) %.%
-#'   filter(a3, arr > 30 | dep > 30)
+#'   filter(arr > 30 | dep > 30)
 #'
 #' chain(
 #'   hflights,
@@ -63,7 +63,7 @@
 #'     arr = mean(ArrDelay, na.rm = TRUE), 
 #'     dep = mean(DepDelay, na.rm = TRUE)
 #'   ),
-#'   filter(a3, arr > 30 | dep > 30)
+#'   filter(arr > 30 | dep > 30)
 #' )
 chain <- function(..., env = parent.frame()) {
   chain_q(dots(...), env = env)

diff --git a/R/window.R → R/lead-lag.R b/R/window.R → R/lead-lag.R
@@ -7,6 +7,7 @@
 #' @param n a postive integer of length 1, giving the number of positions to
 #'   lead or lag by
 #' @param default value used for non-existant rows. Defaults to \code{NA}.
+#' @param order_by override the default ordering to use another vector
 #' @examples
 #' lead(1:10, 1)
 #' lead(1:10, 2)
@@ -16,24 +17,42 @@
 #' 
 #' x <- runif(5)
 #' cbind(ahead = lead(x), x, behind = lag(x))
+#' 
+#' # Use order_by if data not already ordered
+#' df <- data.frame(year = 2000:2005, value = (0:5) ^ 2)
+#' scrambled <- df[sample(nrow(df)), ]
+#' 
+#' wrong <- mutate(scrambled, prev = lag(value))
+#' arrange(wrong, year)
+#' 
+#' right <- mutate(scrambled, prev = lag(value, order_by = year))
+#' arrange(right, year)
 #' @name lead-lag
 NULL
 
 #' @export
 #' @rdname lead-lag
-lead <- function(x, n = 1L, default = NA) {
+lead <- function(x, n = 1L, default = NA, order_by = NULL) {
+  if (!is.null(order_by)) {
+    return(with_order(order_by, lead, x, n = n, default = default))
+  }
+
   if (n == 0) return(x)
   if (n < 0 || length(n) > 1) stop("n must be a single positive integer")
 
   xlen <- length(x)
   n <- pmin(n, xlen)
 
-  c(x[-seq_len(n)], rep(default, n)) 
+  c(x[-seq_len(n)], rep(default, n))
 }
 
 #' @export
 #' @rdname lead-lag
-lag <- function(x, n = 1L, default = NA) {
+lag <- function(x, n = 1L, default = NA, order_by = NULL) {
+  if (!is.null(order_by)) {
+    return(with_order(order_by, lag, x, n = n, default = default))
+  }
+
   if (n == 0) return(x)
   if (n < 0 || length(n) > 1) stop("n must be a single positive integer")
 

diff --git a/R/nth-value.R b/R/nth-value.R
@@ -0,0 +1,36 @@
+#' Extract the first, last or nth value from a vector.
+#' 
+#' These are straightforward wrappers around \code{\link{[}}. The main 
+#' advantage is that you can provide an optional secondary vector that defines
+#' the ordering.
+#' 
+#' @param x A vector
+#' @param n For \code{nth_value}, a single integer specifying the position.
+#'   If larger than x, an \code{NA} of the same type as x will be returned.
+#' @param order_by An optional vector used to determine the order
+#' @export
+#' @examples
+#' x <- 1:10
+#' y <- 10:1
+#' 
+#' last_value(x)
+#' last_value(x, y)
+nth_value <- function(x, n, order_by = NULL) {
+  stopifnot(length(n) == 1, is.numeric(x))
+
+  # if n > length(x), x[n] will be NA of correct type
+  if (is.null(order_by)) x[n] else x[order(order_by)[n]]
+}
+
+#' @export
+#' @rdname nth_value
+first_value <- function(x, order_by = NULL) {
+  nth_value(x, 1, order_by)
+}
+
+#' @export
+#' @rdname nth_value
+last_value <- function(x, order_by = NULL) {
+  nth_value(x, length(x), order_by)
+}
+
diff --git a/R/order-by.R b/R/order-by.R
@@ -0,0 +1,46 @@
+#' A helper function for ordering window function output.
+#' 
+#' This is a useful function to control the order of window functions in 
+#' R that don't have a specific ordering parameter. When translated to SQL
+#' it will modify the order clause of the OVER function. 
+#' 
+#' This function works by changing the \code{call} to instead call
+#' \code{\link{with_order}} with the appropriate arguments.
+#' 
+#' @param order_by a vector to order_by
+#' @param call a function call to a window function, where the first argument
+#'   is the vector being operated on
+#' @export
+#' @examples
+#' df <- data.frame(year = 2000:2005, value = (0:5) ^ 2)
+#' scrambled <- df[sample(nrow(df)), ]
+#' 
+#' wrong <- mutate(scrambled, running = cumsum(value))
+#' arrange(wrong, year)
+#' 
+#' right <- mutate(scrambled, running = order_by(year, cumsum(value)))
+#' arrange(right, year)
+order_by <- function(order_by, call) {
+  call <- substitute(call)
+  stopifnot(is.call(call))
+
+  new_call <- as.call(c(quote(with_order), substitute(order_by), as.list(call)))
+  eval(new_call, parent.frame())
+}
+
+#' Run a function with one order, translating result back to original order
+#' 
+#' This is used to power the ordering parameters of dplyr's window functions
+#' 
+#' @param order_by vector to order by
+#' @param fun window function
+#' @param x,... arguments to \code{f}
+#' @keywords internal
+#' @export
+with_order <- function(order_by, fun, x, ...) {
+  ord <- order(order_by)
+  undo <- match(seq_along(order_by), ord)
+
+  out <- fun(x[ord], ...)
+  out[undo]
+}
diff --git a/R/over.R b/R/over.R
@@ -0,0 +1,39 @@
+# Purely for SQL generation: to hard to efficiently do in R.
+over <- function(expr, partition = NULL, order = NULL, frame = NULL) {
+  args <- !is.null(partition) + !is.null(order) + !is.null(frame)
+  if (args == 0) {
+    stop("Must supply at least one of partition, order, frame", call. = FALSE)
+  }
+
+  if (!is.null(partition)) {
+    partition <- paste0("PARTITION BY ", paste0(partition, collapse = ", "))
+  }
+  if (!is.null(order)) {
+    order <- paste0("ORDER BY ", paste0(order, collapse = ", "))
+  }
+  if (!is.null(rows)) {
+    if (is.numeric(frame)) frame <- rows(frame[1], frame[2])
+    rows <- paste0("ROWS ", frame)
+  }
+
+  over <- paste0(c(partition, order, rows), collapse = " ")
+
+  paste0(expr, " OVER (", over, ")")
+}
+
+rows <- function(from = -Inf, to = 0) {
+  if (from >= to) stop("from must be less than to", call. = FALSE)
+
+  dir <- function(x) if (x < 0) "PRECEDING" else "FOLLOWING"
+  val <- function(x) if (is.finite(x)) as.integer(abs(x)) else "UNBOUNDED"
+  bound <- function(x) {
+    if (x == 0) return("CURRENT ROW")
+    paste(val(x), dir(x))
+  }
+
+  if (to == 0) {
+    bound(from)
+  } else {
+    paste0("BETWEEN ", bound(from), " AND ", bound(to))
+  }
+}
diff --git a/R/rank.R b/R/rank.R
@@ -0,0 +1,52 @@
+#' Windowed rank functions
+#' 
+#' @name ranking
+#' @param x a vector of values to rank
+#' @examples
+#' x <- c(5, 1, 3, 2, 2)
+#' row_number(x)
+#' min_rank(x)
+#' dense_rank(x)
+#' percent_rank(x)
+#' cume_dist(x)
+#' 
+#' ntile(x, 2)
+#' ntile(runif(100), 10)
+NULL
+
+#' @export
+#' @rdname ranking
+row_number <- function(x) order(x)
+
+# Definition from 
+# http://blogs.msdn.com/b/craigfr/archive/2008/03/31/ranking-functions-rank-dense-rank-and-ntile.aspx
+#' @param n number of groups to split up into.
+#' @export
+#' @rdname ranking
+ntile <- function(x, n) {
+  floor((n * (rank(x, ties.method = "first") - 1) / length(x)) + 1)
+}
+
+#' @export
+#' @rdname ranking
+min_rank <- function(x) base::rank(x, ties.method = "min")
+
+#' @export
+#' @rdname ranking
+dense_rank <- function(x) {
+  r <- rank(x)
+  match(r, sort(unique(r)))
+}
+
+#' @export
+#' @rdname ranking
+percent_rank <- function(x) {
+  (min_rank(x) - 1) / (length(x) - 1)
+}
+
+#' @export
+#' @rdname ranking
+cume_dist <- function(x) {
+  rank(x, ties.method = "max") / length(x)
+}
+
diff --git a/R/tally.R b/R/tally.R
@@ -4,7 +4,7 @@
 #' \code{\link{n}} or \code{\link{sum}(n)} depending on whether you're tallying 
 #' for the first time, or re-tallying. 
 #' 
-#' @param a \code{\link{tbl}} to tally
+#' @param x a \code{\link{tbl}} to tally
 #' @param wt if not specified, will tally the number of rows. If specified,
 #'   will perform a "weighted" tally but summing over the specified variable.
 #' @export
@@ -15,7 +15,8 @@
 #'   
 #'   plays_by_year <- tally(group_by(batting_tbl, playerID, stint))
 #'   tally(plays_by_year)
-#'   tally(tally(plays_by_year))
+#'   # FIXME: https://github.com/hadley/dplyr/issues/129
+#'   # tally(tally(plays_by_year))
 #'   tally(group_by(plays_by_year, stint))
 #'   
 #'   # This looks a little nicer if you use the infix %.% operator

diff --git a/R/top-n.R b/R/top-n.R
@@ -5,7 +5,7 @@
 #' \code{\link{rank}} to select the top n entries in each group, ordered
 #' by \code{wt}.
 #' 
-#' @param a \code{\link{tbl}} to filter
+#' @param x a \code{\link{tbl}} to filter
 #' @param n number of rows to return. If \code{x} is grouped, this is
 #'   the number of rows per group. May include more than \code{n} if there
 #'   are ties.