forked from tidyverse/dplyr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sets.r
182 lines (157 loc) · 4.46 KB
/
sets.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#' Set operations
#'
#' These functions override the set functions provided in base to make them
#' generic so that efficient versions for data frames and other tables can be
#' provided. The default methods call the base versions. Beware that
#' `intersect()`, `union()` and `setdiff()` remove duplicates.
#'
#' @param x,y objects to perform set function on (ignoring order)
#' @param ... other arguments passed on to methods
#' @name setops
#' @examples
#' mtcars$model <- rownames(mtcars)
#' first <- mtcars[1:20, ]
#' second <- mtcars[10:32, ]
#'
#' intersect(first, second)
#' union(first, second)
#' setdiff(first, second)
#' setdiff(second, first)
#'
#' union_all(first, second)
#' setequal(mtcars, mtcars[32:1, ])
#'
#' # Handling of duplicates:
#' a <- data.frame(column = c(1:10, 10))
#' b <- data.frame(column = c(1:5, 5))
#'
#' # intersection is 1 to 5, duplicates removed (5)
#' intersect(a, b)
#'
#' # union is 1 to 10, duplicates removed (5 and 10)
#' union(a, b)
#'
#' # set difference, duplicates removed (10)
#' setdiff(a, b)
#'
#' # union all does not remove duplicates
#' union_all(a, b)
NULL
#' @rdname setops
#' @export
union_all <- function(x, y, ...) UseMethod("union_all")
#' @export
union_all.default <- function(x, y, ...) vec_c(x, y, ...)
#' @importFrom generics intersect
#' @export
generics::intersect
#' @importFrom generics union
#' @export
generics::union
#' @importFrom generics setdiff
#' @export
generics::setdiff
#' @importFrom generics setequal
#' @export
generics::setequal
#' @export
intersect.data.frame <- function(x, y, ...) {
check_compatible(x, y)
cast <- vec_cast_common(x, y)
new_x <- cast[[1L]]
new_y <- cast[[2L]]
out <- vec_unique(vec_slice(new_x, vec_in(new_x, new_y)))
reconstruct_set(out, x)
}
#' @export
union.data.frame <- function(x, y, ...) {
check_compatible(x, y)
out <- vec_unique(vec_rbind(!!!vec_cast_common(x, y)))
reconstruct_set(out, x)
}
#' @export
union_all.data.frame <- function(x, y, ...) {
out <- bind_rows(x, y)
reconstruct_set(out, x)
}
#' @export
setdiff.data.frame <- function(x, y, ...) {
check_compatible(x, y)
cast <- vec_cast_common(x, y)
new_x <- cast[[1L]]
new_y <- cast[[2L]]
out <- vec_unique(vec_slice(new_x, !vec_in(new_x, new_y)))
reconstruct_set(out, x)
}
#' @export
setequal.data.frame <- function(x, y, ...) {
isTRUE(equal_data_frame(x, y))
}
reconstruct_set <- function(out, x) {
if (is_grouped_df(x)) {
out <- grouped_df(out, group_vars(x), group_by_drop_default(x))
}
out
}
# Helpers -----------------------------------------------------------------
is_compatible_data_frame <- function(x, y, ignore_col_order = TRUE, convert = TRUE) {
nc <- ncol(x)
if (nc != ncol(y)) {
return(glue("- different number of columns: {nc} vs {ncol(y)}"))
}
names_x <- names(x)
names_y <- names(y)
names_y_not_in_x <- setdiff(names_y, names_x)
names_x_not_in_y <- setdiff(names_x, names_y)
if (length(names_y_not_in_x) == 0L && length(names_x_not_in_y) == 0L) {
# check if same order
if (!isTRUE(ignore_col_order)) {
if (!identical(names_x, names_y)) {
return("- Same column names, but different order")
}
}
} else {
# names are not the same, explain why
msg <- "not compatible: \n"
if (length(names_y_not_in_x)) {
msg <- paste0(msg, "- Cols in y but not x: ", glue_collapse(glue('`{names_y_not_in_x}`'), sep = ", "), ".\n")
}
if (length(names_x_not_in_y)) {
msg <- paste0(msg, "- Cols in x but not y: ", glue_collapse(glue('`{names_x_not_in_y}`'), sep = ", "), ".\n")
}
return(msg)
}
msg <- ""
for (name in names_x) {
x_i <- x[[name]]
y_i <- y[[name]]
if (convert) {
tryCatch(
vec_ptype2(x_i, y_i),
error = function(e) {
msg <<- paste0(msg,
glue("- Incompatible types for column `{name}`: {vec_ptype_full(x_i)} vs {vec_ptype_full(y_i)}"),
"\n"
)
}
)
} else {
if (!identical(vec_ptype(x_i), vec_ptype(y_i))) {
msg <- paste0(msg,
glue("- Different types for column `{name}`: {vec_ptype_full(x_i)} vs {vec_ptype_full(y_i)}"),
"\n"
)
}
}
}
if (msg != "") {
return(msg)
}
TRUE
}
check_compatible <- function(x, y, ignore_col_order = TRUE, convert = TRUE) {
compat <- is_compatible_data_frame(x, y, ignore_col_order = ignore_col_order, convert = convert)
if (is.character(compat)) {
abort(paste0("not compatible: \n", glue_collapse(compat, sep = "\n")))
}
}