1join_cols <- function(x_names, y_names, by = NULL, suffix = c(".x", ".y"), keep = FALSE) {
2  check_duplicate_vars(x_names, "x")
3  check_duplicate_vars(y_names, "y")
4
5  by <- standardise_join_by(by, x_names = x_names, y_names = y_names)
6  suffix <- standardise_join_suffix(suffix)
7
8  x_by <- set_names(match(by$x, x_names), by$x)
9  y_by <- set_names(match(by$y, y_names), by$x)
10
11  x_loc <- seq_along(x_names)
12  names(x_loc) <- x_names
13  if (!keep) {
14    # in x_out, key variables need to keep the same name, and aux
15    # variables need suffixes for duplicates that appear in y_out
16    y_aux <- setdiff(y_names, c(by$x, if (!keep) by$y))
17    x_is_aux <- !x_names %in% by$x
18    names(x_loc)[x_is_aux] <- add_suffixes(x_names[x_is_aux], c(by$x, y_aux), suffix$x)
19  } else {
20    # in x_out, key variables and aux variables need suffixes
21    # for duplicates that appear in y_out
22    names(x_loc) <- add_suffixes(x_names, y_names, suffix$x)
23  }
24
25  y_loc <- seq_along(y_names)
26  names(y_loc) <- add_suffixes(y_names, x_names, suffix$y)
27  if (!keep) {
28    y_loc <- y_loc[!y_names %in% by$y]
29  }
30
31  # key = named location to use for matching
32  # out = named locations to use in output
33  list(
34    x = list(key = x_by, out = x_loc),
35    y = list(key = y_by, out = y_loc)
36  )
37}
38
39standardise_join_by <- function(by, x_names, y_names) {
40  if (is.null(by)) {
41    by <- intersect(x_names, y_names)
42    if (length(by) == 0) {
43      abort(c(
44        "`by` must be supplied when `x` and `y` have no common variables.",
45        i = "use by = character()` to perform a cross-join."
46      ))
47    }
48    by_quoted <- encodeString(by, quote = '"')
49    if (length(by_quoted) == 1L) {
50      by_code <- by_quoted
51    } else {
52      by_code <- paste0("c(", paste(by_quoted, collapse = ", "), ")")
53    }
54    inform(paste0("Joining, by = ", by_code))
55
56    by <- list(x = by, y = by)
57  } else if (is.character(by)) {
58    by_x <- names(by) %||% by
59    by_y <- unname(by)
60
61    # If x partially named, assume unnamed are the same in both tables
62    by_x[by_x == ""] <- by_y[by_x == ""]
63
64    by <- list(x = by_x, y = by_y)
65  } else if (is.list(by)) {
66    # TODO: check lengths
67    by <- by[c("x", "y")]
68  } else {
69    bad_args("by", "must be a (named) character vector, list, or NULL, not {friendly_type_of(by)}.")
70  }
71
72  check_join_vars(by$x, x_names)
73  check_join_vars(by$y, y_names)
74
75  by
76}
77
78check_join_vars <- function(vars, names) {
79  if (!is.character(vars)) {
80    abort("join columns must be character vectors.")
81  }
82
83  na <- is.na(vars)
84  if (any(na)) {
85    abort(c(
86      "Join columns must be not NA.",
87      x = glue("Problem at position {err_vars(na)}.")
88    ))
89  }
90
91  dup <- duplicated(vars)
92  if (any(dup)) {
93    abort(c(
94      "Join columns must be unique.",
95      x = glue("Problem at position {err_vars(dup)}.")
96    ))
97  }
98
99  missing <- setdiff(vars, names)
100  if (length(missing) > 0) {
101    abort(c(
102      "Join columns must be present in data.",
103      x = glue("Problem with {err_vars(missing)}.")
104    ))
105  }
106}
107
108check_duplicate_vars <- function(vars, input) {
109  dup <- duplicated(vars)
110  if (any(dup)) {
111    abort(c(
112      glue("Input columns in `{input}` must be unique."),
113      x = glue("Problem with {err_vars(vars[dup])}.")
114    ))
115  }
116}
117
118standardise_join_suffix <- function(x) {
119  if (!is.character(x) || length(x) != 2) {
120    abort(c(
121      "`suffix` must be a character vector of length 2.",
122      i = glue("suffix is {friendly_type_of(x)} of length {length(x)}.")
123    ))
124  }
125
126  if (any(is.na(x))) {
127    bad_args("suffix", "can't be NA.")
128  }
129
130  list(x = x[[1]], y = x[[2]])
131}
132
133add_suffixes <- function(x, y, suffix) {
134  if (identical(suffix, "")) {
135    return(x)
136  }
137
138  out <- rep_along(x, na_chr)
139  for (i in seq_along(x)) {
140    nm <- x[[i]]
141    while (nm %in% y || nm %in% out[seq_len(i - 1)]) {
142      nm <- paste0(nm, suffix)
143    }
144
145    out[[i]] <- nm
146  }
147  out
148}
149