tidyr/R/pivot-long.R

#' Pivot data from wide to long
#'
#' @description
#' `pivot_longer()` "lengthens" data, increasing the number of rows and
#' decreasing the number of columns. The inverse transformation is
#' [pivot_wider()]
#'
#' Learn more in `vignette("pivot")`.
#'
#' @details
#' `pivot_longer()` is an updated approach to [gather()], designed to be both
#' simpler to use and to handle more use cases. We recommend you use
#' `pivot_longer()` for new code; `gather()` isn't going away but is no longer
#' under active development.
#'
#' @param data A data frame to pivot.
#' @param cols <[`tidy-select`][tidyr_tidy_select]> Columns to pivot into
#'   longer format.
#' @param names_to A string specifying the name of the column to create
#'   from the data stored in the column names of `data`.
#'
#'   Can be a character vector, creating multiple columns, if `names_sep`
#'   or `names_pattern` is provided. In this case, there are two special
#'   values you can take advantage of:
#'
#'   * `NA` will discard that component of the name.
#'   * `.value` indicates that component of the name defines the name of the
#'     column containing the cell values, overriding `values_to`.
#' @param names_prefix A regular expression used to remove matching text
#'   from the start of each variable name.
#' @param names_sep,names_pattern If `names_to` contains multiple values,
#'   these arguments control how the column name is broken up.
#'
#'   `names_sep` takes the same specification as [separate()], and can either
#'   be a numeric vector (specifying positions to break on), or a single string
#'   (specifying a regular expression to split on).
#'
#'   `names_pattern` takes the same specification as [extract()], a regular
#'   expression containing matching groups (`()`).
#'
#'   If these arguments do not give you enough control, use
#'   `pivot_longer_spec()` to create a spec object and process manually as
#'   needed.
#' @param names_repair What happens if the output has invalid column names?
#'   The default, `"check_unique"` is to error if the columns are duplicated.
#'   Use `"minimal"` to allow duplicates in the output, or `"unique"` to
#'   de-duplicated by adding numeric suffixes. See [vctrs::vec_as_names()]
#'   for more options.
#' @param values_to A string specifying the name of the column to create
#'   from the data stored in cell values. If `names_to` is a character
#'   containing the special `.value` sentinel, this value will be ignored,
#'   and the name of the value column will be derived from part of the
#'   existing column names.
#' @param values_drop_na If `TRUE`, will drop rows that contain only `NA`s
#'   in the `value_to` column. This effectively converts explicit missing values
#'   to implicit missing values, and should generally be used only when missing
#'   values in `data` were created by its structure.
#' @param names_transform,values_transform A list of column name-function pairs.
#'   Use these arguments if you need to change the types of specific columns.
#'   For example, `names_transform = list(week = as.integer)` would convert
#'   a character variable called `week` to an integer.
#'
#'   If not specified, the type of the columns generated from `names_to` will
#'   be character, and the type of the variables generated from `values_to`
#'   will be the common type of the input columns used to generate them.
#' @param names_ptypes,values_ptypes A list of column name-prototype pairs.
#'   A prototype (or ptype for short) is a zero-length vector (like `integer()`
#'   or `numeric()`) that defines the type, class, and attributes of a vector.
#'   Use these arguments if you want to confirm that the created columns are
#'   the types that you expect. Note that if you want to change (instead of confirm)
#'   the types of specific columns, you should use `names_transform` or
#'   `values_transform` instead.
#' @param ... Additional arguments passed on to methods.
#' @export
#' @examples
#' # See vignette("pivot") for examples and explanation
#'
#' # Simplest case where column names are character data
#' relig_income
#' relig_income %>%
#'   pivot_longer(!religion, names_to = "income", values_to = "count")
#'
#' # Slightly more complex case where columns have common prefix,
#' # and missing missings are structural so should be dropped.
#' billboard
#' billboard %>%
#'  pivot_longer(
#'    cols = starts_with("wk"),
#'    names_to = "week",
#'    names_prefix = "wk",
#'    values_to = "rank",
#'    values_drop_na = TRUE
#'  )
#'
#' # Multiple variables stored in column names
#' who %>% pivot_longer(
#'   cols = new_sp_m014:newrel_f65,
#'   names_to = c("diagnosis", "gender", "age"),
#'   names_pattern = "new_?(.*)_(.)(.*)",
#'   values_to = "count"
#' )
#'
#' # Multiple observations per row
#' anscombe
#' anscombe %>%
#'  pivot_longer(everything(),
#'    names_to = c(".value", "set"),
#'    names_pattern = "(.)(.)"
#'  )
pivot_longer <- function(data,
                         cols,
                         names_to = "name",
                         names_prefix = NULL,
                         names_sep = NULL,
                         names_pattern = NULL,
                         names_ptypes = list(),
                         names_transform = list(),
                         names_repair = "check_unique",
                         values_to = "value",
                         values_drop_na = FALSE,
                         values_ptypes = list(),
                         values_transform = list(),
                         ...
                         ) {

  ellipsis::check_dots_used()
  UseMethod("pivot_longer")
}

#' @export
pivot_longer.data.frame <- function(data,
                                    cols,
                                    names_to = "name",
                                    names_prefix = NULL,
                                    names_sep = NULL,
                                    names_pattern = NULL,
                                    names_ptypes = list(),
                                    names_transform = list(),
                                    names_repair = "check_unique",
                                    values_to = "value",
                                    values_drop_na = FALSE,
                                    values_ptypes = list(),
                                    values_transform = list(),
                                    ...
                                    ) {
  cols <- enquo(cols)
  spec <- build_longer_spec(data, !!cols,
    names_to = names_to,
    values_to = values_to,
    names_prefix = names_prefix,
    names_sep = names_sep,
    names_pattern = names_pattern,
    names_ptypes = names_ptypes,
    names_transform = names_transform
  )

  pivot_longer_spec(data, spec,
    names_repair = names_repair,
    values_drop_na = values_drop_na,
    values_ptypes = values_ptypes,
    values_transform = values_transform
  )
}


#' Pivot data from wide to long using a spec
#'
#' This is a low level interface to pivotting, inspired by the cdata package,
#' that allows you to describe pivotting with a data frame.
#'
#' @keywords internal
#' @export
#' @inheritParams pivot_longer
#' @param spec A specification data frame. This is useful for more complex
#'  pivots because it gives you greater control on how metadata stored in the
#'  column names turns into columns in the result.
#'
#'   Must be a data frame containing character `.name` and `.value` columns.
#'   Additional columns in `spec` should be named to match columns in the
#'   long format of the dataset and contain values corresponding to columns
#'   pivoted from the wide format.
#'   The special `.seq` variable is used to disambiguate rows internally;
#'   it is automatically removed after pivotting.
#'
#' @examples
#' # See vignette("pivot") for examples and explanation
#'
#' # Use `build_longer_spec()` to build `spec` using similar syntax to `pivot_longer()`
#' # and run `pivot_longer_spec()` based on `spec`.
#' spec <- relig_income %>% build_longer_spec(
#'   cols = !religion,
#'   names_to = "income",
#'   values_to = "count"
#' )
#' spec
#'
#' pivot_longer_spec(relig_income, spec)
#'
#' # Is equivalent to:
#' relig_income %>% pivot_longer(
#'   cols = !religion,
#'   names_to = "income",
#'   values_to = "count")
#'
pivot_longer_spec <- function(data,
                              spec,
                              names_repair = "check_unique",
                              values_drop_na = FALSE,
                              values_ptypes = list(),
                              values_transform = list()
                              ) {
  spec <- check_spec(spec)
  spec <- deduplicate_spec(spec, data)

  # Quick hack to ensure that split() preserves order
  v_fct <- factor(spec$.value, levels = unique(spec$.value))
  values <- split(spec$.name, v_fct)
  value_keys <- split(spec[-(1:2)], v_fct)
  keys <- vec_unique(spec[-(1:2)])

  vals <- set_names(vec_init(list(), length(values)), names(values))
  for (value in names(values)) {
    cols <- values[[value]]
    col_id <- vec_match(value_keys[[value]], keys)

    val_cols <- vec_init(list(), nrow(keys))
    val_cols[col_id] <- unname(as.list(data[cols]))
    val_cols[-col_id] <- list(rep(NA, nrow(data)))

    if (has_name(values_transform, value)) {
      val_cols <- lapply(val_cols, values_transform[[value]])
    }
    val_type <- vec_ptype_common(!!!set_names(val_cols[col_id], cols), .ptype = values_ptypes[[value]])
    out <- vec_c(!!!val_cols, .ptype = val_type)
    # Interleave into correct order
    # TODO somehow `t(matrix(x))` is _faster_ than `matrix(x, byrow = TRUE)`
    # if this gets fixed in R this should use `byrow = TRUE` again
    n_vals <- nrow(data) * length(val_cols)
    idx <- t(matrix(seq_len(n_vals), ncol = length(val_cols)))
    vals[[value]] <- vec_slice(out, as.integer(idx))
  }
  vals <- as_tibble(vals)

  # Join together df, spec, and val to produce final tibble
  df_out <- drop_cols(as_tibble(data, .name_repair = "minimal"), spec$.name)

  out <- wrap_error_names(vec_cbind(
    vec_rep_each(df_out, vec_size(keys)),
    vec_rep(keys, vec_size(data)),
    vals,
    .name_repair = names_repair
  ))

  if (values_drop_na) {
    out <- vec_slice(out, !vec_equal_na(vals))
  }

  out$.seq <- NULL

  reconstruct_tibble(data, out)
}

#' @rdname pivot_longer_spec
#' @export
build_longer_spec <- function(data, cols,
                              names_to = "name",
                              values_to = "value",
                              names_prefix = NULL,
                              names_sep = NULL,
                              names_pattern = NULL,
                              names_ptypes = NULL,
                              names_transform = NULL) {
  cols <- tidyselect::eval_select(enquo(cols), data[unique(names(data))])

  if (length(cols) == 0) {
    abort(glue::glue("`cols` must select at least one column."))
  }

  if (is.null(names_prefix)) {
    names <- names(cols)
  } else {
    names <- gsub(paste0("^", names_prefix), "", names(cols))
  }

  if (length(names_to) > 1) {
    if (!xor(is.null(names_sep), is.null(names_pattern))) {
      abort(glue::glue(
        "If you supply multiple names in `names_to` you must also supply one",
        " of `names_sep` or `names_pattern`."
      ))
    }

    if (!is.null(names_sep)) {
      names <- str_separate(names, names_to, sep = names_sep)
    } else {
      names <- str_extract(names, names_to, regex = names_pattern)
    }
  } else if (length(names_to) == 0) {
    names <- tibble::new_tibble(x = list(), nrow = length(names))
  } else {
    if (!is.null(names_sep)) {
      abort("`names_sep` can not be used with length-1 `names_to`")
    }
    if (!is.null(names_pattern)) {
      names <- str_extract(names, names_to, regex = names_pattern)[[1]]
    }

    names <- tibble(!!names_to := names)
  }

  if (".value" %in% names_to) {
    values_to <- NULL
  } else {
    vec_assert(values_to, ptype = character(), size = 1)
  }

  # optionally, cast variables generated from columns
  cast_cols <- intersect(names(names), names(names_ptypes))
  for (col in cast_cols) {
    names[[col]] <- vec_cast(names[[col]], names_ptypes[[col]])
  }

  # transform cols
  coerce_cols <- intersect(names(names), names(names_transform))
  for (col in coerce_cols) {
    f <- as_function(names_transform[[col]])
    names[[col]] <- f(names[[col]])
  }

  out <- tibble(.name = names(cols))
  out[[".value"]] <- values_to
  out <- vec_cbind(out, names)
  out
}

drop_cols <- function(df, cols) {
  if (is.character(cols)) {
    df[setdiff(names(df), cols)]
  } else if (is.integer(cols)) {
    df[-cols]
  } else {
    abort("Invalid input")
  }
}

# Ensure that there's a one-to-one match from spec to data by adding
# a special .seq variable which is automatically removed after pivotting.
deduplicate_spec <- function(spec, df) {

  # Ensure each .name has a unique output identifier
  key <- spec[setdiff(names(spec), ".name")]
  if (vec_duplicate_any(key)) {
    pos <- vec_group_loc(key)$loc
    seq <- vector("integer", length = nrow(spec))
    for (i in seq_along(pos)) {
      seq[pos[[i]]] <- seq_along(pos[[i]])
    }
    spec$.seq <- seq
  }

  # Match spec to data, handling duplicated column names
  col_id <- vec_match(names(df), spec$.name)
  has_match <- !is.na(col_id)

  if (!vec_duplicate_any(col_id[has_match])) {
    return(spec)
  }

  spec <- vec_slice(spec, col_id[has_match])
  # Need to use numeric indices because names only match first
  spec$.name <- seq_along(df)[has_match]

  pieces <- vec_split(seq_len(nrow(spec)), col_id[has_match])
  copy <- integer(nrow(spec))
  for (i in seq_along(pieces$val)) {
    idx <- pieces$val[[i]]
    copy[idx] <- seq_along(idx)
  }

  spec$.seq <- copy
  spec
}