1#' Pivot data from wide to long
2#'
3#' @description
4#' `pivot_longer()` "lengthens" data, increasing the number of rows and
5#' decreasing the number of columns. The inverse transformation is
6#' [pivot_wider()]
7#'
8#' Learn more in `vignette("pivot")`.
9#'
10#' @details
11#' `pivot_longer()` is an updated approach to [gather()], designed to be both
12#' simpler to use and to handle more use cases. We recommend you use
13#' `pivot_longer()` for new code; `gather()` isn't going away but is no longer
14#' under active development.
15#'
16#' @param data A data frame to pivot.
17#' @param cols <[`tidy-select`][tidyr_tidy_select]> Columns to pivot into
18#'   longer format.
19#' @param names_to A string specifying the name of the column to create
20#'   from the data stored in the column names of `data`.
21#'
22#'   Can be a character vector, creating multiple columns, if `names_sep`
23#'   or `names_pattern` is provided. In this case, there are two special
24#'   values you can take advantage of:
25#'
26#'   * `NA` will discard that component of the name.
27#'   * `.value` indicates that component of the name defines the name of the
28#'     column containing the cell values, overriding `values_to`.
29#' @param names_prefix A regular expression used to remove matching text
30#'   from the start of each variable name.
31#' @param names_sep,names_pattern If `names_to` contains multiple values,
32#'   these arguments control how the column name is broken up.
33#'
34#'   `names_sep` takes the same specification as [separate()], and can either
35#'   be a numeric vector (specifying positions to break on), or a single string
36#'   (specifying a regular expression to split on).
37#'
38#'   `names_pattern` takes the same specification as [extract()], a regular
39#'   expression containing matching groups (`()`).
40#'
41#'   If these arguments do not give you enough control, use
42#'   `pivot_longer_spec()` to create a spec object and process manually as
43#'   needed.
44#' @param names_repair What happens if the output has invalid column names?
45#'   The default, `"check_unique"` is to error if the columns are duplicated.
46#'   Use `"minimal"` to allow duplicates in the output, or `"unique"` to
47#'   de-duplicated by adding numeric suffixes. See [vctrs::vec_as_names()]
48#'   for more options.
49#' @param values_to A string specifying the name of the column to create
50#'   from the data stored in cell values. If `names_to` is a character
51#'   containing the special `.value` sentinel, this value will be ignored,
52#'   and the name of the value column will be derived from part of the
53#'   existing column names.
54#' @param values_drop_na If `TRUE`, will drop rows that contain only `NA`s
55#'   in the `value_to` column. This effectively converts explicit missing values
56#'   to implicit missing values, and should generally be used only when missing
57#'   values in `data` were created by its structure.
58#' @param names_transform,values_transform A list of column name-function pairs.
59#'   Use these arguments if you need to change the types of specific columns.
60#'   For example, `names_transform = list(week = as.integer)` would convert
61#'   a character variable called `week` to an integer.
62#'
63#'   If not specified, the type of the columns generated from `names_to` will
64#'   be character, and the type of the variables generated from `values_to`
65#'   will be the common type of the input columns used to generate them.
66#' @param names_ptypes,values_ptypes A list of column name-prototype pairs.
67#'   A prototype (or ptype for short) is a zero-length vector (like `integer()`
68#'   or `numeric()`) that defines the type, class, and attributes of a vector.
69#'   Use these arguments if you want to confirm that the created columns are
70#'   the types that you expect. Note that if you want to change (instead of confirm)
71#'   the types of specific columns, you should use `names_transform` or
72#'   `values_transform` instead.
73#' @param ... Additional arguments passed on to methods.
74#' @export
75#' @examples
76#' # See vignette("pivot") for examples and explanation
77#'
78#' # Simplest case where column names are character data
79#' relig_income
80#' relig_income %>%
81#'   pivot_longer(!religion, names_to = "income", values_to = "count")
82#'
83#' # Slightly more complex case where columns have common prefix,
84#' # and missing missings are structural so should be dropped.
85#' billboard
86#' billboard %>%
87#'  pivot_longer(
88#'    cols = starts_with("wk"),
89#'    names_to = "week",
90#'    names_prefix = "wk",
91#'    values_to = "rank",
92#'    values_drop_na = TRUE
93#'  )
94#'
95#' # Multiple variables stored in column names
96#' who %>% pivot_longer(
97#'   cols = new_sp_m014:newrel_f65,
98#'   names_to = c("diagnosis", "gender", "age"),
99#'   names_pattern = "new_?(.*)_(.)(.*)",
100#'   values_to = "count"
101#' )
102#'
103#' # Multiple observations per row
104#' anscombe
105#' anscombe %>%
106#'  pivot_longer(everything(),
107#'    names_to = c(".value", "set"),
108#'    names_pattern = "(.)(.)"
109#'  )
110pivot_longer <- function(data,
111                         cols,
112                         names_to = "name",
113                         names_prefix = NULL,
114                         names_sep = NULL,
115                         names_pattern = NULL,
116                         names_ptypes = list(),
117                         names_transform = list(),
118                         names_repair = "check_unique",
119                         values_to = "value",
120                         values_drop_na = FALSE,
121                         values_ptypes = list(),
122                         values_transform = list(),
123                         ...
124                         ) {
125
126  ellipsis::check_dots_used()
127  UseMethod("pivot_longer")
128}
129
130#' @export
131pivot_longer.data.frame <- function(data,
132                                    cols,
133                                    names_to = "name",
134                                    names_prefix = NULL,
135                                    names_sep = NULL,
136                                    names_pattern = NULL,
137                                    names_ptypes = list(),
138                                    names_transform = list(),
139                                    names_repair = "check_unique",
140                                    values_to = "value",
141                                    values_drop_na = FALSE,
142                                    values_ptypes = list(),
143                                    values_transform = list(),
144                                    ...
145                                    ) {
146  cols <- enquo(cols)
147  spec <- build_longer_spec(data, !!cols,
148    names_to = names_to,
149    values_to = values_to,
150    names_prefix = names_prefix,
151    names_sep = names_sep,
152    names_pattern = names_pattern,
153    names_ptypes = names_ptypes,
154    names_transform = names_transform
155  )
156
157  pivot_longer_spec(data, spec,
158    names_repair = names_repair,
159    values_drop_na = values_drop_na,
160    values_ptypes = values_ptypes,
161    values_transform = values_transform
162  )
163}
164
165
166#' Pivot data from wide to long using a spec
167#'
168#' This is a low level interface to pivotting, inspired by the cdata package,
169#' that allows you to describe pivotting with a data frame.
170#'
171#' @keywords internal
172#' @export
173#' @inheritParams pivot_longer
174#' @param spec A specification data frame. This is useful for more complex
175#'  pivots because it gives you greater control on how metadata stored in the
176#'  column names turns into columns in the result.
177#'
178#'   Must be a data frame containing character `.name` and `.value` columns.
179#'   Additional columns in `spec` should be named to match columns in the
180#'   long format of the dataset and contain values corresponding to columns
181#'   pivoted from the wide format.
182#'   The special `.seq` variable is used to disambiguate rows internally;
183#'   it is automatically removed after pivotting.
184#'
185#' @examples
186#' # See vignette("pivot") for examples and explanation
187#'
188#' # Use `build_longer_spec()` to build `spec` using similar syntax to `pivot_longer()`
189#' # and run `pivot_longer_spec()` based on `spec`.
190#' spec <- relig_income %>% build_longer_spec(
191#'   cols = !religion,
192#'   names_to = "income",
193#'   values_to = "count"
194#' )
195#' spec
196#'
197#' pivot_longer_spec(relig_income, spec)
198#'
199#' # Is equivalent to:
200#' relig_income %>% pivot_longer(
201#'   cols = !religion,
202#'   names_to = "income",
203#'   values_to = "count")
204#'
205pivot_longer_spec <- function(data,
206                              spec,
207                              names_repair = "check_unique",
208                              values_drop_na = FALSE,
209                              values_ptypes = list(),
210                              values_transform = list()
211                              ) {
212  spec <- check_spec(spec)
213  spec <- deduplicate_spec(spec, data)
214
215  # Quick hack to ensure that split() preserves order
216  v_fct <- factor(spec$.value, levels = unique(spec$.value))
217  values <- split(spec$.name, v_fct)
218  value_keys <- split(spec[-(1:2)], v_fct)
219  keys <- vec_unique(spec[-(1:2)])
220
221  vals <- set_names(vec_init(list(), length(values)), names(values))
222  for (value in names(values)) {
223    cols <- values[[value]]
224    col_id <- vec_match(value_keys[[value]], keys)
225
226    val_cols <- vec_init(list(), nrow(keys))
227    val_cols[col_id] <- unname(as.list(data[cols]))
228    val_cols[-col_id] <- list(rep(NA, nrow(data)))
229
230    if (has_name(values_transform, value)) {
231      val_cols <- lapply(val_cols, values_transform[[value]])
232    }
233    val_type <- vec_ptype_common(!!!set_names(val_cols[col_id], cols), .ptype = values_ptypes[[value]])
234    out <- vec_c(!!!val_cols, .ptype = val_type)
235    # Interleave into correct order
236    # TODO somehow `t(matrix(x))` is _faster_ than `matrix(x, byrow = TRUE)`
237    # if this gets fixed in R this should use `byrow = TRUE` again
238    n_vals <- nrow(data) * length(val_cols)
239    idx <- t(matrix(seq_len(n_vals), ncol = length(val_cols)))
240    vals[[value]] <- vec_slice(out, as.integer(idx))
241  }
242  vals <- as_tibble(vals)
243
244  # Join together df, spec, and val to produce final tibble
245  df_out <- drop_cols(as_tibble(data, .name_repair = "minimal"), spec$.name)
246
247  out <- wrap_error_names(vec_cbind(
248    vec_rep_each(df_out, vec_size(keys)),
249    vec_rep(keys, vec_size(data)),
250    vals,
251    .name_repair = names_repair
252  ))
253
254  if (values_drop_na) {
255    out <- vec_slice(out, !vec_equal_na(vals))
256  }
257
258  out$.seq <- NULL
259
260  reconstruct_tibble(data, out)
261}
262
263#' @rdname pivot_longer_spec
264#' @export
265build_longer_spec <- function(data, cols,
266                              names_to = "name",
267                              values_to = "value",
268                              names_prefix = NULL,
269                              names_sep = NULL,
270                              names_pattern = NULL,
271                              names_ptypes = NULL,
272                              names_transform = NULL) {
273  cols <- tidyselect::eval_select(enquo(cols), data[unique(names(data))])
274
275  if (length(cols) == 0) {
276    abort(glue::glue("`cols` must select at least one column."))
277  }
278
279  if (is.null(names_prefix)) {
280    names <- names(cols)
281  } else {
282    names <- gsub(paste0("^", names_prefix), "", names(cols))
283  }
284
285  if (length(names_to) > 1) {
286    if (!xor(is.null(names_sep), is.null(names_pattern))) {
287      abort(glue::glue(
288        "If you supply multiple names in `names_to` you must also supply one",
289        " of `names_sep` or `names_pattern`."
290      ))
291    }
292
293    if (!is.null(names_sep)) {
294      names <- str_separate(names, names_to, sep = names_sep)
295    } else {
296      names <- str_extract(names, names_to, regex = names_pattern)
297    }
298  } else if (length(names_to) == 0) {
299    names <- tibble::new_tibble(x = list(), nrow = length(names))
300  } else {
301    if (!is.null(names_sep)) {
302      abort("`names_sep` can not be used with length-1 `names_to`")
303    }
304    if (!is.null(names_pattern)) {
305      names <- str_extract(names, names_to, regex = names_pattern)[[1]]
306    }
307
308    names <- tibble(!!names_to := names)
309  }
310
311  if (".value" %in% names_to) {
312    values_to <- NULL
313  } else {
314    vec_assert(values_to, ptype = character(), size = 1)
315  }
316
317  # optionally, cast variables generated from columns
318  cast_cols <- intersect(names(names), names(names_ptypes))
319  for (col in cast_cols) {
320    names[[col]] <- vec_cast(names[[col]], names_ptypes[[col]])
321  }
322
323  # transform cols
324  coerce_cols <- intersect(names(names), names(names_transform))
325  for (col in coerce_cols) {
326    f <- as_function(names_transform[[col]])
327    names[[col]] <- f(names[[col]])
328  }
329
330  out <- tibble(.name = names(cols))
331  out[[".value"]] <- values_to
332  out <- vec_cbind(out, names)
333  out
334}
335
336drop_cols <- function(df, cols) {
337  if (is.character(cols)) {
338    df[setdiff(names(df), cols)]
339  } else if (is.integer(cols)) {
340    df[-cols]
341  } else {
342    abort("Invalid input")
343  }
344}
345
346# Ensure that there's a one-to-one match from spec to data by adding
347# a special .seq variable which is automatically removed after pivotting.
348deduplicate_spec <- function(spec, df) {
349
350  # Ensure each .name has a unique output identifier
351  key <- spec[setdiff(names(spec), ".name")]
352  if (vec_duplicate_any(key)) {
353    pos <- vec_group_loc(key)$loc
354    seq <- vector("integer", length = nrow(spec))
355    for (i in seq_along(pos)) {
356      seq[pos[[i]]] <- seq_along(pos[[i]])
357    }
358    spec$.seq <- seq
359  }
360
361  # Match spec to data, handling duplicated column names
362  col_id <- vec_match(names(df), spec$.name)
363  has_match <- !is.na(col_id)
364
365  if (!vec_duplicate_any(col_id[has_match])) {
366    return(spec)
367  }
368
369  spec <- vec_slice(spec, col_id[has_match])
370  # Need to use numeric indices because names only match first
371  spec$.name <- seq_along(df)[has_match]
372
373  pieces <- vec_split(seq_len(nrow(spec)), col_id[has_match])
374  copy <- integer(nrow(spec))
375  for (i in seq_along(pieces$val)) {
376    idx <- pieces$val[[i]]
377    copy[idx] <- seq_along(idx)
378  }
379
380  spec$.seq <- copy
381  spec
382}
383