Skip to content

Commit

Permalink
Many new windowing functions
Browse files Browse the repository at this point in the history
  • Loading branch information
hadley committed Nov 22, 2013
1 parent e991ef4 commit c89dc9f
Show file tree
Hide file tree
Showing 25 changed files with 715 additions and 27 deletions.
6 changes: 5 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ Collate:
'join-dt.r'
'join-sql.r'
'join.r'
'lead-lag.R'
'manip-cpp.r'
'manip-cube.r'
'manip-df.r'
Expand All @@ -67,9 +68,13 @@ Collate:
'manip-grouped-dt.r'
'manip-sql.r'
'manip.r'
'nth-value.R'
'order-by.R'
'over.R'
'partial-eval.r'
'query.r'
'query-bq.r'
'rank.R'
'rbind.r'
'sets.r'
'sql-escape.r'
Expand Down Expand Up @@ -97,5 +102,4 @@ Collate:
'utils-format.r'
'utils.r'
'view.r'
'window.R'
'zzz.r'
14 changes: 14 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,11 @@ export(collect)
export(compare_tbls)
export(compute)
export(copy_to)
export(cumall)
export(cumany)
export(cume_dist)
export(cummean)
export(dense_rank)
export(desc)
export(dim_desc)
export(do)
Expand All @@ -278,6 +283,7 @@ export(explain_sql)
export(explain_tbl)
export(failwith)
export(filter)
export(first_value)
export(group_by)
export(group_size)
export(grouped_dt)
Expand Down Expand Up @@ -305,13 +311,20 @@ export(lahman_mysql)
export(lahman_postgres)
export(lahman_sqlite)
export(lahman_srcs)
export(last_value)
export(lead)
export(left_join)
export(make_tbl)
export(min_rank)
export(mutate)
export(n)
export(nth_value)
export(ntile)
export(order_by)
export(partial_eval)
export(percent_rank)
export(query)
export(row_number)
export(same_src)
export(select)
export(select_eval)
Expand Down Expand Up @@ -354,6 +367,7 @@ export(ungroup)
export(union)
export(var_eval)
export(var_index)
export(with_order)
exportClasses(Query)
import(assertthat)
importFrom(Rcpp,cppFunction)
Expand Down
25 changes: 25 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,28 @@ rbind_all <- function(dots) {
.Call('dplyr_rbind_all', PACKAGE = 'dplyr', dots)
}

#' Cumulativate versions of any, all, and mean
#'
#' dplyr adds \code{cumall}, \code{cumany}, and \code{cummean} to complete
#' R's set of cumulate functions to match the aggregation functions available
#' in most databases
#'
#' @param x For \code{cumall} & \code{cumany}, a logical vector; for
#' \code{cummean} an integer or numeric vector
#' @export
cumall <- function(x) {
.Call('dplyr_cumall', PACKAGE = 'dplyr', x)
}

#' @export
#' @rdname cumall
cumany <- function(x) {
.Call('dplyr_cumany', PACKAGE = 'dplyr', x)
}

#' @export
#' @rdname cumall
cummean <- function(x) {
.Call('dplyr_cummean', PACKAGE = 'dplyr', x)
}

4 changes: 2 additions & 2 deletions R/chain.r
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
#' arr = mean(ArrDelay, na.rm = TRUE),
#' dep = mean(DepDelay, na.rm = TRUE)
#' ) %.%
#' filter(a3, arr > 30 | dep > 30)
#' filter(arr > 30 | dep > 30)
#'
#' chain(
#' hflights,
Expand All @@ -63,7 +63,7 @@
#' arr = mean(ArrDelay, na.rm = TRUE),
#' dep = mean(DepDelay, na.rm = TRUE)
#' ),
#' filter(a3, arr > 30 | dep > 30)
#' filter(arr > 30 | dep > 30)
#' )
chain <- function(..., env = parent.frame()) {
chain_q(dots(...), env = env)
Expand Down
25 changes: 22 additions & 3 deletions R/window.R → R/lead-lag.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#' @param n a postive integer of length 1, giving the number of positions to
#' lead or lag by
#' @param default value used for non-existant rows. Defaults to \code{NA}.
#' @param order_by override the default ordering to use another vector
#' @examples
#' lead(1:10, 1)
#' lead(1:10, 2)
Expand All @@ -16,24 +17,42 @@
#'
#' x <- runif(5)
#' cbind(ahead = lead(x), x, behind = lag(x))
#'
#' # Use order_by if data not already ordered
#' df <- data.frame(year = 2000:2005, value = (0:5) ^ 2)
#' scrambled <- df[sample(nrow(df)), ]
#'
#' wrong <- mutate(scrambled, prev = lag(value))
#' arrange(wrong, year)
#'
#' right <- mutate(scrambled, prev = lag(value, order_by = year))
#' arrange(right, year)
#' @name lead-lag
NULL

#' @export
#' @rdname lead-lag
lead <- function(x, n = 1L, default = NA) {
lead <- function(x, n = 1L, default = NA, order_by = NULL) {
if (!is.null(order_by)) {
return(with_order(order_by, lead, x, n = n, default = default))
}

if (n == 0) return(x)
if (n < 0 || length(n) > 1) stop("n must be a single positive integer")

xlen <- length(x)
n <- pmin(n, xlen)

c(x[-seq_len(n)], rep(default, n))
c(x[-seq_len(n)], rep(default, n))
}

#' @export
#' @rdname lead-lag
lag <- function(x, n = 1L, default = NA) {
lag <- function(x, n = 1L, default = NA, order_by = NULL) {
if (!is.null(order_by)) {
return(with_order(order_by, lag, x, n = n, default = default))
}

if (n == 0) return(x)
if (n < 0 || length(n) > 1) stop("n must be a single positive integer")

Expand Down
36 changes: 36 additions & 0 deletions R/nth-value.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#' Extract the first, last or nth value from a vector.
#'
#' These are straightforward wrappers around \code{\link{[}}. The main
#' advantage is that you can provide an optional secondary vector that defines
#' the ordering.
#'
#' @param x A vector
#' @param n For \code{nth_value}, a single integer specifying the position.
#' If larger than x, an \code{NA} of the same type as x will be returned.
#' @param order_by An optional vector used to determine the order
#' @export
#' @examples
#' x <- 1:10
#' y <- 10:1
#'
#' last_value(x)
#' last_value(x, y)
nth_value <- function(x, n, order_by = NULL) {
stopifnot(length(n) == 1, is.numeric(x))

# if n > length(x), x[n] will be NA of correct type
if (is.null(order_by)) x[n] else x[order(order_by)[n]]
}

#' @export
#' @rdname nth_value
first_value <- function(x, order_by = NULL) {
nth_value(x, 1, order_by)
}

#' @export
#' @rdname nth_value
last_value <- function(x, order_by = NULL) {
nth_value(x, length(x), order_by)
}

46 changes: 46 additions & 0 deletions R/order-by.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#' A helper function for ordering window function output.
#'
#' This is a useful function to control the order of window functions in
#' R that don't have a specific ordering parameter. When translated to SQL
#' it will modify the order clause of the OVER function.
#'
#' This function works by changing the \code{call} to instead call
#' \code{\link{with_order}} with the appropriate arguments.
#'
#' @param order_by a vector to order_by
#' @param call a function call to a window function, where the first argument
#' is the vector being operated on
#' @export
#' @examples
#' df <- data.frame(year = 2000:2005, value = (0:5) ^ 2)
#' scrambled <- df[sample(nrow(df)), ]
#'
#' wrong <- mutate(scrambled, running = cumsum(value))
#' arrange(wrong, year)
#'
#' right <- mutate(scrambled, running = order_by(year, cumsum(value)))
#' arrange(right, year)
order_by <- function(order_by, call) {
call <- substitute(call)
stopifnot(is.call(call))

new_call <- as.call(c(quote(with_order), substitute(order_by), as.list(call)))
eval(new_call, parent.frame())
}

#' Run a function with one order, translating result back to original order
#'
#' This is used to power the ordering parameters of dplyr's window functions
#'
#' @param order_by vector to order by
#' @param fun window function
#' @param x,... arguments to \code{f}
#' @keywords internal
#' @export
with_order <- function(order_by, fun, x, ...) {
ord <- order(order_by)
undo <- match(seq_along(order_by), ord)

out <- fun(x[ord], ...)
out[undo]
}
39 changes: 39 additions & 0 deletions R/over.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Purely for SQL generation: to hard to efficiently do in R.
over <- function(expr, partition = NULL, order = NULL, frame = NULL) {
args <- !is.null(partition) + !is.null(order) + !is.null(frame)
if (args == 0) {
stop("Must supply at least one of partition, order, frame", call. = FALSE)
}

if (!is.null(partition)) {
partition <- paste0("PARTITION BY ", paste0(partition, collapse = ", "))
}
if (!is.null(order)) {
order <- paste0("ORDER BY ", paste0(order, collapse = ", "))
}
if (!is.null(rows)) {
if (is.numeric(frame)) frame <- rows(frame[1], frame[2])
rows <- paste0("ROWS ", frame)
}

over <- paste0(c(partition, order, rows), collapse = " ")

paste0(expr, " OVER (", over, ")")
}

rows <- function(from = -Inf, to = 0) {
if (from >= to) stop("from must be less than to", call. = FALSE)

dir <- function(x) if (x < 0) "PRECEDING" else "FOLLOWING"
val <- function(x) if (is.finite(x)) as.integer(abs(x)) else "UNBOUNDED"
bound <- function(x) {
if (x == 0) return("CURRENT ROW")
paste(val(x), dir(x))
}

if (to == 0) {
bound(from)
} else {
paste0("BETWEEN ", bound(from), " AND ", bound(to))
}
}
52 changes: 52 additions & 0 deletions R/rank.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#' Windowed rank functions
#'
#' @name ranking
#' @param x a vector of values to rank
#' @examples
#' x <- c(5, 1, 3, 2, 2)
#' row_number(x)
#' min_rank(x)
#' dense_rank(x)
#' percent_rank(x)
#' cume_dist(x)
#'
#' ntile(x, 2)
#' ntile(runif(100), 10)
NULL

#' @export
#' @rdname ranking
row_number <- function(x) order(x)

# Definition from
# http://blogs.msdn.com/b/craigfr/archive/2008/03/31/ranking-functions-rank-dense-rank-and-ntile.aspx
#' @param n number of groups to split up into.
#' @export
#' @rdname ranking
ntile <- function(x, n) {
floor((n * (rank(x, ties.method = "first") - 1) / length(x)) + 1)
}

#' @export
#' @rdname ranking
min_rank <- function(x) base::rank(x, ties.method = "min")

#' @export
#' @rdname ranking
dense_rank <- function(x) {
r <- rank(x)
match(r, sort(unique(r)))
}

#' @export
#' @rdname ranking
percent_rank <- function(x) {
(min_rank(x) - 1) / (length(x) - 1)
}

#' @export
#' @rdname ranking
cume_dist <- function(x) {
rank(x, ties.method = "max") / length(x)
}

5 changes: 3 additions & 2 deletions R/tally.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#' \code{\link{n}} or \code{\link{sum}(n)} depending on whether you're tallying
#' for the first time, or re-tallying.
#'
#' @param a \code{\link{tbl}} to tally
#' @param x a \code{\link{tbl}} to tally
#' @param wt if not specified, will tally the number of rows. If specified,
#' will perform a "weighted" tally but summing over the specified variable.
#' @export
Expand All @@ -15,7 +15,8 @@
#'
#' plays_by_year <- tally(group_by(batting_tbl, playerID, stint))
#' tally(plays_by_year)
#' tally(tally(plays_by_year))
#' # FIXME: https://github.com/hadley/dplyr/issues/129
#' # tally(tally(plays_by_year))
#' tally(group_by(plays_by_year, stint))
#'
#' # This looks a little nicer if you use the infix %.% operator
Expand Down
2 changes: 1 addition & 1 deletion R/top-n.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#' \code{\link{rank}} to select the top n entries in each group, ordered
#' by \code{wt}.
#'
#' @param a \code{\link{tbl}} to filter
#' @param x a \code{\link{tbl}} to filter
#' @param n number of rows to return. If \code{x} is grouped, this is
#' the number of rows per group. May include more than \code{n} if there
#' are ties.
Expand Down
Loading

0 comments on commit c89dc9f

Please sign in to comment.