1join_cols <- function(x_names, y_names, by = NULL, suffix = c(".x", ".y"), keep = FALSE) { 2 check_duplicate_vars(x_names, "x") 3 check_duplicate_vars(y_names, "y") 4 5 by <- standardise_join_by(by, x_names = x_names, y_names = y_names) 6 suffix <- standardise_join_suffix(suffix) 7 8 x_by <- set_names(match(by$x, x_names), by$x) 9 y_by <- set_names(match(by$y, y_names), by$x) 10 11 x_loc <- seq_along(x_names) 12 names(x_loc) <- x_names 13 if (!keep) { 14 # in x_out, key variables need to keep the same name, and aux 15 # variables need suffixes for duplicates that appear in y_out 16 y_aux <- setdiff(y_names, c(by$x, if (!keep) by$y)) 17 x_is_aux <- !x_names %in% by$x 18 names(x_loc)[x_is_aux] <- add_suffixes(x_names[x_is_aux], c(by$x, y_aux), suffix$x) 19 } else { 20 # in x_out, key variables and aux variables need suffixes 21 # for duplicates that appear in y_out 22 names(x_loc) <- add_suffixes(x_names, y_names, suffix$x) 23 } 24 25 y_loc <- seq_along(y_names) 26 names(y_loc) <- add_suffixes(y_names, x_names, suffix$y) 27 if (!keep) { 28 y_loc <- y_loc[!y_names %in% by$y] 29 } 30 31 # key = named location to use for matching 32 # out = named locations to use in output 33 list( 34 x = list(key = x_by, out = x_loc), 35 y = list(key = y_by, out = y_loc) 36 ) 37} 38 39standardise_join_by <- function(by, x_names, y_names) { 40 if (is.null(by)) { 41 by <- intersect(x_names, y_names) 42 if (length(by) == 0) { 43 abort(c( 44 "`by` must be supplied when `x` and `y` have no common variables.", 45 i = "use by = character()` to perform a cross-join." 46 )) 47 } 48 by_quoted <- encodeString(by, quote = '"') 49 if (length(by_quoted) == 1L) { 50 by_code <- by_quoted 51 } else { 52 by_code <- paste0("c(", paste(by_quoted, collapse = ", "), ")") 53 } 54 inform(paste0("Joining, by = ", by_code)) 55 56 by <- list(x = by, y = by) 57 } else if (is.character(by)) { 58 by_x <- names(by) %||% by 59 by_y <- unname(by) 60 61 # If x partially named, assume unnamed are the same in both tables 62 by_x[by_x == ""] <- by_y[by_x == ""] 63 64 by <- list(x = by_x, y = by_y) 65 } else if (is.list(by)) { 66 # TODO: check lengths 67 by <- by[c("x", "y")] 68 } else { 69 bad_args("by", "must be a (named) character vector, list, or NULL, not {friendly_type_of(by)}.") 70 } 71 72 check_join_vars(by$x, x_names) 73 check_join_vars(by$y, y_names) 74 75 by 76} 77 78check_join_vars <- function(vars, names) { 79 if (!is.character(vars)) { 80 abort("join columns must be character vectors.") 81 } 82 83 na <- is.na(vars) 84 if (any(na)) { 85 abort(c( 86 "Join columns must be not NA.", 87 x = glue("Problem at position {err_vars(na)}.") 88 )) 89 } 90 91 dup <- duplicated(vars) 92 if (any(dup)) { 93 abort(c( 94 "Join columns must be unique.", 95 x = glue("Problem at position {err_vars(dup)}.") 96 )) 97 } 98 99 missing <- setdiff(vars, names) 100 if (length(missing) > 0) { 101 abort(c( 102 "Join columns must be present in data.", 103 x = glue("Problem with {err_vars(missing)}.") 104 )) 105 } 106} 107 108check_duplicate_vars <- function(vars, input) { 109 dup <- duplicated(vars) 110 if (any(dup)) { 111 abort(c( 112 glue("Input columns in `{input}` must be unique."), 113 x = glue("Problem with {err_vars(vars[dup])}.") 114 )) 115 } 116} 117 118standardise_join_suffix <- function(x) { 119 if (!is.character(x) || length(x) != 2) { 120 abort(c( 121 "`suffix` must be a character vector of length 2.", 122 i = glue("suffix is {friendly_type_of(x)} of length {length(x)}.") 123 )) 124 } 125 126 if (any(is.na(x))) { 127 bad_args("suffix", "can't be NA.") 128 } 129 130 list(x = x[[1]], y = x[[2]]) 131} 132 133add_suffixes <- function(x, y, suffix) { 134 if (identical(suffix, "")) { 135 return(x) 136 } 137 138 out <- rep_along(x, na_chr) 139 for (i in seq_along(x)) { 140 nm <- x[[i]] 141 while (nm %in% y || nm %in% out[seq_len(i - 1)]) { 142 nm <- paste0(nm, suffix) 143 } 144 145 out[[i]] <- nm 146 } 147 out 148} 149