forked from tidyverse/dplyr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bench-merge.R
103 lines (81 loc) · 2.66 KB
/
bench-merge.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Compare base, data table, dplyr and pandas
#
# To install pandas on OS X:
# * brew update && brew install python
# * pip install --upgrade setuptools
# * pip install --upgrade pip
# * pip install pandas
library(dplyr)
library(data.table)
library(microbenchmark)
library(reshape2)
set.seed(1014)
# Generate sample data ---------------------------------------------------------
random_strings <- function(n, m) {
mat <- matrix(sample(letters, m * n, rep = TRUE), ncol = m)
apply(mat, 1, paste, collapse = "")
}
N <- 10000
indices <- random_strings(N, 10)
indices2 <- random_strings(N, 10)
left <- data.frame(
key = rep(indices[1:8000], 10),
key2 = rep(indices2[1:8000], 10),
value = rnorm(80000)
)
right <- data.frame(
key = indices[2001:10000],
key2 = indices2[2001:10000],
value2 = rnorm(8000)
)
write.csv(left, "pandas/left.csv", row.names = FALSE)
write.csv(right, "pandas/right.csv", row.names = FALSE)
# Equivalent functions for each technique --------------------------------------
base <- list(
setup = function(x, y) list(x = x, y = y),
left = function(x, y) base::merge(x, y, all.x = TRUE),
right = function(x, y) base::merge(x, y, all.y = TRUE),
inner = function(x, y) base::merge(x, y)
)
data.table <- list(
setup = function(x, y) {
list(
x = data.table(x, key = c("key", "key2")),
y = data.table(y, key = c("key", "key2"))
)
},
left = function(x, y) x[y],
right = function(x, y) y[x],
inner = function(x, y) merge(x, y, all = FALSE)
)
dplyr <- list(
setup = function(x, y) list(x = x, y = y),
left = function(x, y) left_join(x, y, by = c("key", "key2")),
right = function(x, y) NULL,
inner = function(x, y) inner_join(x, y, by = c("key", "key2"))
)
techniques <- list(base = base, data.table = data.table, dplyr = dplyr)
# Aggregate results ------------------------------------------------------------
niter <- 10
r <- lapply(names(techniques), function(nm) {
tech <- techniques[[nm]]
df <- tech$setup(left, right)
m <- microbenchmark(
left = tech$left(df$x, df$y),
right = tech$right(df$x, df$y),
inner = tech$inner(df$x, df$y),
times = niter
)
means <- tapply(m$time, m$expr, FUN = mean) / 1e9
data.frame(type = names(means), mean = means, tech = nm,
row.names = NULL, stringsAsFactors = FALSE)
})
system("cd pandas && python bench_merge.py")
pandas_raw <- read.csv("pandas/out.csv")
pandas <- data.frame(type = pandas_raw$X, mean = pandas_raw$dont_sort, tech = "pandas")
options(digits = 3)
all <- c(r, list(pandas))
all <- do.call("rbind", all)
all$mean <- all$mean * 1000 # millisec
all$mean[all$mean < .1] <- NA # NA out dplyr right
dcast(all, tech ~ type, value.var = "mean")