1#' Join two data frames together.
2#'
3#' Join, like merge, is designed for the types of problems
4#' where you would use a sql join.
5#'
6#' The four join types return:
7#'
8#' \itemize{
9#'  \item \code{inner}:  only rows with matching keys in both x and y
10#'  \item \code{left}:   all rows in x, adding matching columns from y
11#'  \item \code{right}:  all rows in y, adding matching columns from x
12#'  \item \code{full}:   all rows in x with matching columns in y, then the
13#'    rows of y that don't match x.
14#' }
15#'
16#' Note that from plyr 1.5, \code{join} will (by default) return all matches,
17#' not just the first match, as it did previously.
18#'
19#' Unlike merge, preserves the order of x no matter what join type is used.
20#' If needed, rows from y will be added to the bottom.  Join is often faster
21#' than merge, although it is somewhat less featureful - it currently offers
22#' no way to rename output or merge on different variables in the x and y
23#' data frames.
24#'
25#' @param x data frame
26#' @param y data frame
27#' @param by character vector of variable names to join by. If omitted, will
28#'   match on all common variables.
29#' @param type type of join: left (default), right, inner or full.  See
30#'   details for more information.
31#' @param match how should duplicate ids be matched? Either match just the
32#'   \code{"first"} matching row, or match \code{"all"} matching rows. Defaults
33#'   to \code{"all"} for compatibility with merge, but \code{"first"} is
34#'   significantly faster.
35#' @keywords manip
36#' @export
37#' @examples
38#' first <- ddply(baseball, "id", summarise, first = min(year))
39#' system.time(b2 <- merge(baseball, first, by = "id", all.x = TRUE))
40#' system.time(b3 <- join(baseball, first, by = "id"))
41#'
42#' b2 <- arrange(b2, id, year, stint)
43#' b3 <- arrange(b3, id, year, stint)
44#' stopifnot(all.equal(b2, b3))
45join <- function(x, y, by = NULL, type = "left", match = "all") {
46  type <- match.arg(type, c("left", "right", "inner", "full"))
47  match <- match.arg(match, c("first", "all"))
48
49  if (is.null(by)) {
50    by <- intersect(names(x), names(y))
51    message("Joining by: ", paste(by, collapse = ", "))
52  }
53
54  switch(match,
55    "first" = .join_first(x, y, by, type),
56    "all" = .join_all(x, y, by, type))
57}
58
59.join_first <- function(x, y, by, type) {
60  keys <- join.keys(x, y, by = by)
61
62  x.cols <- setdiff(names(x), by)
63  y.cols <- setdiff(names(y), by)
64
65  if (type == "inner") {
66    x.match <- match(keys$y, keys$x, 0)
67    y.match <- match(keys$x, keys$y, 0)
68
69    cbind(
70      x[x.match, by, drop = FALSE],
71      x[x.match, x.cols, drop = FALSE],
72      y[y.match, y.cols, drop = FALSE]
73    )
74  } else if (type == "left") {
75    y.match <- match(keys$x, keys$y)
76    y.matched <- unrowname(y[y.match, y.cols, drop = FALSE])
77
78    cbind(x[by], x[x.cols], y.matched)
79  } else if (type == "right") {
80    if (any(duplicated(keys$y))) {
81      stop("Duplicated key in y", call. = FALSE)
82    }
83
84    x.match <- match(keys$y, keys$x)
85    x.matched <- unrowname(x[x.match, x.cols, drop = FALSE])
86
87    cbind(y[by], x.matched, y[y.cols])
88  } else if (type == "full") {
89    # x with matching y's then any unmatched ys
90
91    y.match <- match(keys$x, keys$y)
92    y.matched <- unrowname(y[y.match, y.cols, drop = FALSE])
93
94    y.unmatch <- is.na(match(keys$y, keys$x))
95
96    rbind.fill(cbind(x[c(by, x.cols)], y.matched), y[y.unmatch, , drop = FALSE])
97  }
98}
99
100# Basic idea to perform a full cartesian product of the two data frames
101# and then evaluate which rows meet the merging criteria. But that is
102# horrendously inefficient, so we do various types of hashing, implemented
103# in R as split_indices
104.join_all <- function(x, y, by, type) {
105  x.cols <- setdiff(names(x), by)
106  y.cols <- setdiff(names(y), by)
107
108  if (type == "inner") {
109    ids <- join_ids(x, y, by)
110    out <- cbind(x[ids$x, , drop = FALSE], y[ids$y, y.cols, drop = FALSE])
111  } else if (type == "left") {
112    ids <- join_ids(x, y, by, all = TRUE)
113    out <- cbind(x[ids$x, , drop = FALSE], y[ids$y, y.cols, drop = FALSE])
114  } else if (type == "right") {
115    # Flip x and y, but make sure to put new columns in the right place
116    ids <- join_ids(y, x, by, all = TRUE)
117    out <- cbind(
118      y[ids$x, by, drop = FALSE],
119      x[ids$y, x.cols, drop = FALSE],
120      y[ids$x, y.cols, drop = FALSE]
121    )
122  } else if (type == "full") {
123    # x's with all matching y's, then non-matching y's - just the same as
124    # join.first
125    ids <- join_ids(x, y, by, all = TRUE)
126
127    matched <- cbind(x[ids$x, , drop = FALSE],
128                     y[ids$y, y.cols, drop = FALSE])
129    unmatched <- y[setdiff(seq_len(nrow(y)), ids$y), , drop = FALSE]
130    out <- rbind.fill(matched, unmatched)
131  }
132
133  unrowname(out)
134}
135
136join_ids <- function(x, y, by, all = FALSE) {
137  keys <- join.keys(x, y, by = by)
138
139  ys <- split_indices(keys$y, keys$n)
140  length(ys) <- keys$n
141
142  if (all) {
143    # replace NULL with NA to preserve those x's without matching y's
144    nulls <- vapply(ys, function(x) length(x) == 0, logical(1))
145    ys[nulls] <- list(NA_real_)
146  }
147
148  ys <- ys[keys$x]
149  xs <- rep(seq_along(keys$x), vapply(ys, length, numeric(1)))
150
151  list(x = xs, y = unlist(ys))
152}
153
154#' Join keys.
155#' Given two data frames, create a unique key for each row.
156#'
157#' @param x data frame
158#' @param y data frame
159#' @param by character vector of variable names to join by
160#' @keywords internal
161#' @export
162join.keys <- function(x, y, by) {
163  joint <- rbind.fill(x[by], y[by])
164  keys <- id(joint, drop = TRUE)
165
166  n_x <- nrow(x)
167  n_y <- nrow(y)
168
169  list(
170    x = keys[seq_len(n_x)],
171    y = keys[n_x + seq_len(n_y)],
172    n = attr(keys, "n")
173  )
174}
175