1#' Join two data frames together. 2#' 3#' Join, like merge, is designed for the types of problems 4#' where you would use a sql join. 5#' 6#' The four join types return: 7#' 8#' \itemize{ 9#' \item \code{inner}: only rows with matching keys in both x and y 10#' \item \code{left}: all rows in x, adding matching columns from y 11#' \item \code{right}: all rows in y, adding matching columns from x 12#' \item \code{full}: all rows in x with matching columns in y, then the 13#' rows of y that don't match x. 14#' } 15#' 16#' Note that from plyr 1.5, \code{join} will (by default) return all matches, 17#' not just the first match, as it did previously. 18#' 19#' Unlike merge, preserves the order of x no matter what join type is used. 20#' If needed, rows from y will be added to the bottom. Join is often faster 21#' than merge, although it is somewhat less featureful - it currently offers 22#' no way to rename output or merge on different variables in the x and y 23#' data frames. 24#' 25#' @param x data frame 26#' @param y data frame 27#' @param by character vector of variable names to join by. If omitted, will 28#' match on all common variables. 29#' @param type type of join: left (default), right, inner or full. See 30#' details for more information. 31#' @param match how should duplicate ids be matched? Either match just the 32#' \code{"first"} matching row, or match \code{"all"} matching rows. Defaults 33#' to \code{"all"} for compatibility with merge, but \code{"first"} is 34#' significantly faster. 35#' @keywords manip 36#' @export 37#' @examples 38#' first <- ddply(baseball, "id", summarise, first = min(year)) 39#' system.time(b2 <- merge(baseball, first, by = "id", all.x = TRUE)) 40#' system.time(b3 <- join(baseball, first, by = "id")) 41#' 42#' b2 <- arrange(b2, id, year, stint) 43#' b3 <- arrange(b3, id, year, stint) 44#' stopifnot(all.equal(b2, b3)) 45join <- function(x, y, by = NULL, type = "left", match = "all") { 46 type <- match.arg(type, c("left", "right", "inner", "full")) 47 match <- match.arg(match, c("first", "all")) 48 49 if (is.null(by)) { 50 by <- intersect(names(x), names(y)) 51 message("Joining by: ", paste(by, collapse = ", ")) 52 } 53 54 switch(match, 55 "first" = .join_first(x, y, by, type), 56 "all" = .join_all(x, y, by, type)) 57} 58 59.join_first <- function(x, y, by, type) { 60 keys <- join.keys(x, y, by = by) 61 62 x.cols <- setdiff(names(x), by) 63 y.cols <- setdiff(names(y), by) 64 65 if (type == "inner") { 66 x.match <- match(keys$y, keys$x, 0) 67 y.match <- match(keys$x, keys$y, 0) 68 69 cbind( 70 x[x.match, by, drop = FALSE], 71 x[x.match, x.cols, drop = FALSE], 72 y[y.match, y.cols, drop = FALSE] 73 ) 74 } else if (type == "left") { 75 y.match <- match(keys$x, keys$y) 76 y.matched <- unrowname(y[y.match, y.cols, drop = FALSE]) 77 78 cbind(x[by], x[x.cols], y.matched) 79 } else if (type == "right") { 80 if (any(duplicated(keys$y))) { 81 stop("Duplicated key in y", call. = FALSE) 82 } 83 84 x.match <- match(keys$y, keys$x) 85 x.matched <- unrowname(x[x.match, x.cols, drop = FALSE]) 86 87 cbind(y[by], x.matched, y[y.cols]) 88 } else if (type == "full") { 89 # x with matching y's then any unmatched ys 90 91 y.match <- match(keys$x, keys$y) 92 y.matched <- unrowname(y[y.match, y.cols, drop = FALSE]) 93 94 y.unmatch <- is.na(match(keys$y, keys$x)) 95 96 rbind.fill(cbind(x[c(by, x.cols)], y.matched), y[y.unmatch, , drop = FALSE]) 97 } 98} 99 100# Basic idea to perform a full cartesian product of the two data frames 101# and then evaluate which rows meet the merging criteria. But that is 102# horrendously inefficient, so we do various types of hashing, implemented 103# in R as split_indices 104.join_all <- function(x, y, by, type) { 105 x.cols <- setdiff(names(x), by) 106 y.cols <- setdiff(names(y), by) 107 108 if (type == "inner") { 109 ids <- join_ids(x, y, by) 110 out <- cbind(x[ids$x, , drop = FALSE], y[ids$y, y.cols, drop = FALSE]) 111 } else if (type == "left") { 112 ids <- join_ids(x, y, by, all = TRUE) 113 out <- cbind(x[ids$x, , drop = FALSE], y[ids$y, y.cols, drop = FALSE]) 114 } else if (type == "right") { 115 # Flip x and y, but make sure to put new columns in the right place 116 ids <- join_ids(y, x, by, all = TRUE) 117 out <- cbind( 118 y[ids$x, by, drop = FALSE], 119 x[ids$y, x.cols, drop = FALSE], 120 y[ids$x, y.cols, drop = FALSE] 121 ) 122 } else if (type == "full") { 123 # x's with all matching y's, then non-matching y's - just the same as 124 # join.first 125 ids <- join_ids(x, y, by, all = TRUE) 126 127 matched <- cbind(x[ids$x, , drop = FALSE], 128 y[ids$y, y.cols, drop = FALSE]) 129 unmatched <- y[setdiff(seq_len(nrow(y)), ids$y), , drop = FALSE] 130 out <- rbind.fill(matched, unmatched) 131 } 132 133 unrowname(out) 134} 135 136join_ids <- function(x, y, by, all = FALSE) { 137 keys <- join.keys(x, y, by = by) 138 139 ys <- split_indices(keys$y, keys$n) 140 length(ys) <- keys$n 141 142 if (all) { 143 # replace NULL with NA to preserve those x's without matching y's 144 nulls <- vapply(ys, function(x) length(x) == 0, logical(1)) 145 ys[nulls] <- list(NA_real_) 146 } 147 148 ys <- ys[keys$x] 149 xs <- rep(seq_along(keys$x), vapply(ys, length, numeric(1))) 150 151 list(x = xs, y = unlist(ys)) 152} 153 154#' Join keys. 155#' Given two data frames, create a unique key for each row. 156#' 157#' @param x data frame 158#' @param y data frame 159#' @param by character vector of variable names to join by 160#' @keywords internal 161#' @export 162join.keys <- function(x, y, by) { 163 joint <- rbind.fill(x[by], y[by]) 164 keys <- id(joint, drop = TRUE) 165 166 n_x <- nrow(x) 167 n_y <- nrow(y) 168 169 list( 170 x = keys[seq_len(n_x)], 171 y = keys[n_x + seq_len(n_y)], 172 n = attr(keys, "n") 173 ) 174} 175