1#' Pivot data from wide to long 2#' 3#' @description 4#' `pivot_longer()` "lengthens" data, increasing the number of rows and 5#' decreasing the number of columns. The inverse transformation is 6#' [pivot_wider()] 7#' 8#' Learn more in `vignette("pivot")`. 9#' 10#' @details 11#' `pivot_longer()` is an updated approach to [gather()], designed to be both 12#' simpler to use and to handle more use cases. We recommend you use 13#' `pivot_longer()` for new code; `gather()` isn't going away but is no longer 14#' under active development. 15#' 16#' @param data A data frame to pivot. 17#' @param cols <[`tidy-select`][tidyr_tidy_select]> Columns to pivot into 18#' longer format. 19#' @param names_to A string specifying the name of the column to create 20#' from the data stored in the column names of `data`. 21#' 22#' Can be a character vector, creating multiple columns, if `names_sep` 23#' or `names_pattern` is provided. In this case, there are two special 24#' values you can take advantage of: 25#' 26#' * `NA` will discard that component of the name. 27#' * `.value` indicates that component of the name defines the name of the 28#' column containing the cell values, overriding `values_to`. 29#' @param names_prefix A regular expression used to remove matching text 30#' from the start of each variable name. 31#' @param names_sep,names_pattern If `names_to` contains multiple values, 32#' these arguments control how the column name is broken up. 33#' 34#' `names_sep` takes the same specification as [separate()], and can either 35#' be a numeric vector (specifying positions to break on), or a single string 36#' (specifying a regular expression to split on). 37#' 38#' `names_pattern` takes the same specification as [extract()], a regular 39#' expression containing matching groups (`()`). 40#' 41#' If these arguments do not give you enough control, use 42#' `pivot_longer_spec()` to create a spec object and process manually as 43#' needed. 44#' @param names_repair What happens if the output has invalid column names? 45#' The default, `"check_unique"` is to error if the columns are duplicated. 46#' Use `"minimal"` to allow duplicates in the output, or `"unique"` to 47#' de-duplicated by adding numeric suffixes. See [vctrs::vec_as_names()] 48#' for more options. 49#' @param values_to A string specifying the name of the column to create 50#' from the data stored in cell values. If `names_to` is a character 51#' containing the special `.value` sentinel, this value will be ignored, 52#' and the name of the value column will be derived from part of the 53#' existing column names. 54#' @param values_drop_na If `TRUE`, will drop rows that contain only `NA`s 55#' in the `value_to` column. This effectively converts explicit missing values 56#' to implicit missing values, and should generally be used only when missing 57#' values in `data` were created by its structure. 58#' @param names_transform,values_transform A list of column name-function pairs. 59#' Use these arguments if you need to change the types of specific columns. 60#' For example, `names_transform = list(week = as.integer)` would convert 61#' a character variable called `week` to an integer. 62#' 63#' If not specified, the type of the columns generated from `names_to` will 64#' be character, and the type of the variables generated from `values_to` 65#' will be the common type of the input columns used to generate them. 66#' @param names_ptypes,values_ptypes A list of column name-prototype pairs. 67#' A prototype (or ptype for short) is a zero-length vector (like `integer()` 68#' or `numeric()`) that defines the type, class, and attributes of a vector. 69#' Use these arguments if you want to confirm that the created columns are 70#' the types that you expect. Note that if you want to change (instead of confirm) 71#' the types of specific columns, you should use `names_transform` or 72#' `values_transform` instead. 73#' @param ... Additional arguments passed on to methods. 74#' @export 75#' @examples 76#' # See vignette("pivot") for examples and explanation 77#' 78#' # Simplest case where column names are character data 79#' relig_income 80#' relig_income %>% 81#' pivot_longer(!religion, names_to = "income", values_to = "count") 82#' 83#' # Slightly more complex case where columns have common prefix, 84#' # and missing missings are structural so should be dropped. 85#' billboard 86#' billboard %>% 87#' pivot_longer( 88#' cols = starts_with("wk"), 89#' names_to = "week", 90#' names_prefix = "wk", 91#' values_to = "rank", 92#' values_drop_na = TRUE 93#' ) 94#' 95#' # Multiple variables stored in column names 96#' who %>% pivot_longer( 97#' cols = new_sp_m014:newrel_f65, 98#' names_to = c("diagnosis", "gender", "age"), 99#' names_pattern = "new_?(.*)_(.)(.*)", 100#' values_to = "count" 101#' ) 102#' 103#' # Multiple observations per row 104#' anscombe 105#' anscombe %>% 106#' pivot_longer(everything(), 107#' names_to = c(".value", "set"), 108#' names_pattern = "(.)(.)" 109#' ) 110pivot_longer <- function(data, 111 cols, 112 names_to = "name", 113 names_prefix = NULL, 114 names_sep = NULL, 115 names_pattern = NULL, 116 names_ptypes = list(), 117 names_transform = list(), 118 names_repair = "check_unique", 119 values_to = "value", 120 values_drop_na = FALSE, 121 values_ptypes = list(), 122 values_transform = list(), 123 ... 124 ) { 125 126 ellipsis::check_dots_used() 127 UseMethod("pivot_longer") 128} 129 130#' @export 131pivot_longer.data.frame <- function(data, 132 cols, 133 names_to = "name", 134 names_prefix = NULL, 135 names_sep = NULL, 136 names_pattern = NULL, 137 names_ptypes = list(), 138 names_transform = list(), 139 names_repair = "check_unique", 140 values_to = "value", 141 values_drop_na = FALSE, 142 values_ptypes = list(), 143 values_transform = list(), 144 ... 145 ) { 146 cols <- enquo(cols) 147 spec <- build_longer_spec(data, !!cols, 148 names_to = names_to, 149 values_to = values_to, 150 names_prefix = names_prefix, 151 names_sep = names_sep, 152 names_pattern = names_pattern, 153 names_ptypes = names_ptypes, 154 names_transform = names_transform 155 ) 156 157 pivot_longer_spec(data, spec, 158 names_repair = names_repair, 159 values_drop_na = values_drop_na, 160 values_ptypes = values_ptypes, 161 values_transform = values_transform 162 ) 163} 164 165 166#' Pivot data from wide to long using a spec 167#' 168#' This is a low level interface to pivotting, inspired by the cdata package, 169#' that allows you to describe pivotting with a data frame. 170#' 171#' @keywords internal 172#' @export 173#' @inheritParams pivot_longer 174#' @param spec A specification data frame. This is useful for more complex 175#' pivots because it gives you greater control on how metadata stored in the 176#' column names turns into columns in the result. 177#' 178#' Must be a data frame containing character `.name` and `.value` columns. 179#' Additional columns in `spec` should be named to match columns in the 180#' long format of the dataset and contain values corresponding to columns 181#' pivoted from the wide format. 182#' The special `.seq` variable is used to disambiguate rows internally; 183#' it is automatically removed after pivotting. 184#' 185#' @examples 186#' # See vignette("pivot") for examples and explanation 187#' 188#' # Use `build_longer_spec()` to build `spec` using similar syntax to `pivot_longer()` 189#' # and run `pivot_longer_spec()` based on `spec`. 190#' spec <- relig_income %>% build_longer_spec( 191#' cols = !religion, 192#' names_to = "income", 193#' values_to = "count" 194#' ) 195#' spec 196#' 197#' pivot_longer_spec(relig_income, spec) 198#' 199#' # Is equivalent to: 200#' relig_income %>% pivot_longer( 201#' cols = !religion, 202#' names_to = "income", 203#' values_to = "count") 204#' 205pivot_longer_spec <- function(data, 206 spec, 207 names_repair = "check_unique", 208 values_drop_na = FALSE, 209 values_ptypes = list(), 210 values_transform = list() 211 ) { 212 spec <- check_spec(spec) 213 spec <- deduplicate_spec(spec, data) 214 215 # Quick hack to ensure that split() preserves order 216 v_fct <- factor(spec$.value, levels = unique(spec$.value)) 217 values <- split(spec$.name, v_fct) 218 value_keys <- split(spec[-(1:2)], v_fct) 219 keys <- vec_unique(spec[-(1:2)]) 220 221 vals <- set_names(vec_init(list(), length(values)), names(values)) 222 for (value in names(values)) { 223 cols <- values[[value]] 224 col_id <- vec_match(value_keys[[value]], keys) 225 226 val_cols <- vec_init(list(), nrow(keys)) 227 val_cols[col_id] <- unname(as.list(data[cols])) 228 val_cols[-col_id] <- list(rep(NA, nrow(data))) 229 230 if (has_name(values_transform, value)) { 231 val_cols <- lapply(val_cols, values_transform[[value]]) 232 } 233 val_type <- vec_ptype_common(!!!set_names(val_cols[col_id], cols), .ptype = values_ptypes[[value]]) 234 out <- vec_c(!!!val_cols, .ptype = val_type) 235 # Interleave into correct order 236 # TODO somehow `t(matrix(x))` is _faster_ than `matrix(x, byrow = TRUE)` 237 # if this gets fixed in R this should use `byrow = TRUE` again 238 n_vals <- nrow(data) * length(val_cols) 239 idx <- t(matrix(seq_len(n_vals), ncol = length(val_cols))) 240 vals[[value]] <- vec_slice(out, as.integer(idx)) 241 } 242 vals <- as_tibble(vals) 243 244 # Join together df, spec, and val to produce final tibble 245 df_out <- drop_cols(as_tibble(data, .name_repair = "minimal"), spec$.name) 246 247 out <- wrap_error_names(vec_cbind( 248 vec_rep_each(df_out, vec_size(keys)), 249 vec_rep(keys, vec_size(data)), 250 vals, 251 .name_repair = names_repair 252 )) 253 254 if (values_drop_na) { 255 out <- vec_slice(out, !vec_equal_na(vals)) 256 } 257 258 out$.seq <- NULL 259 260 reconstruct_tibble(data, out) 261} 262 263#' @rdname pivot_longer_spec 264#' @export 265build_longer_spec <- function(data, cols, 266 names_to = "name", 267 values_to = "value", 268 names_prefix = NULL, 269 names_sep = NULL, 270 names_pattern = NULL, 271 names_ptypes = NULL, 272 names_transform = NULL) { 273 cols <- tidyselect::eval_select(enquo(cols), data[unique(names(data))]) 274 275 if (length(cols) == 0) { 276 abort(glue::glue("`cols` must select at least one column.")) 277 } 278 279 if (is.null(names_prefix)) { 280 names <- names(cols) 281 } else { 282 names <- gsub(paste0("^", names_prefix), "", names(cols)) 283 } 284 285 if (length(names_to) > 1) { 286 if (!xor(is.null(names_sep), is.null(names_pattern))) { 287 abort(glue::glue( 288 "If you supply multiple names in `names_to` you must also supply one", 289 " of `names_sep` or `names_pattern`." 290 )) 291 } 292 293 if (!is.null(names_sep)) { 294 names <- str_separate(names, names_to, sep = names_sep) 295 } else { 296 names <- str_extract(names, names_to, regex = names_pattern) 297 } 298 } else if (length(names_to) == 0) { 299 names <- tibble::new_tibble(x = list(), nrow = length(names)) 300 } else { 301 if (!is.null(names_sep)) { 302 abort("`names_sep` can not be used with length-1 `names_to`") 303 } 304 if (!is.null(names_pattern)) { 305 names <- str_extract(names, names_to, regex = names_pattern)[[1]] 306 } 307 308 names <- tibble(!!names_to := names) 309 } 310 311 if (".value" %in% names_to) { 312 values_to <- NULL 313 } else { 314 vec_assert(values_to, ptype = character(), size = 1) 315 } 316 317 # optionally, cast variables generated from columns 318 cast_cols <- intersect(names(names), names(names_ptypes)) 319 for (col in cast_cols) { 320 names[[col]] <- vec_cast(names[[col]], names_ptypes[[col]]) 321 } 322 323 # transform cols 324 coerce_cols <- intersect(names(names), names(names_transform)) 325 for (col in coerce_cols) { 326 f <- as_function(names_transform[[col]]) 327 names[[col]] <- f(names[[col]]) 328 } 329 330 out <- tibble(.name = names(cols)) 331 out[[".value"]] <- values_to 332 out <- vec_cbind(out, names) 333 out 334} 335 336drop_cols <- function(df, cols) { 337 if (is.character(cols)) { 338 df[setdiff(names(df), cols)] 339 } else if (is.integer(cols)) { 340 df[-cols] 341 } else { 342 abort("Invalid input") 343 } 344} 345 346# Ensure that there's a one-to-one match from spec to data by adding 347# a special .seq variable which is automatically removed after pivotting. 348deduplicate_spec <- function(spec, df) { 349 350 # Ensure each .name has a unique output identifier 351 key <- spec[setdiff(names(spec), ".name")] 352 if (vec_duplicate_any(key)) { 353 pos <- vec_group_loc(key)$loc 354 seq <- vector("integer", length = nrow(spec)) 355 for (i in seq_along(pos)) { 356 seq[pos[[i]]] <- seq_along(pos[[i]]) 357 } 358 spec$.seq <- seq 359 } 360 361 # Match spec to data, handling duplicated column names 362 col_id <- vec_match(names(df), spec$.name) 363 has_match <- !is.na(col_id) 364 365 if (!vec_duplicate_any(col_id[has_match])) { 366 return(spec) 367 } 368 369 spec <- vec_slice(spec, col_id[has_match]) 370 # Need to use numeric indices because names only match first 371 spec$.name <- seq_along(df)[has_match] 372 373 pieces <- vec_split(seq_len(nrow(spec)), col_id[has_match]) 374 copy <- integer(nrow(spec)) 375 for (i in seq_along(pieces$val)) { 376 idx <- pieces$val[[i]] 377 copy[idx] <- seq_along(idx) 378 } 379 380 spec$.seq <- copy 381 spec 382} 383