1#' Dummy Variables Creation 2#' 3#' `step_dummy()` creates a *specification* of a recipe 4#' step that will convert nominal data (e.g. character or factors) 5#' into one or more numeric binary model terms for the levels of 6#' the original data. 7#' 8#' @inheritParams step_pca 9#' @inheritParams step_center 10#' @param ... One or more selector functions to choose variables 11#' for this step. See [selections()] for more details. The selected 12#' variables _must_ be factors. 13#' @param one_hot A logical. For C levels, should C dummy variables be created 14#' rather than C-1? 15#' @param preserve Use `keep_original_cols` to specify whether the selected 16#' column(s) should be retained (in addition to the new dummy variables). 17#' @param naming A function that defines the naming convention for 18#' new dummy columns. See Details below. 19#' @param levels A list that contains the information needed to 20#' create dummy variables for each variable contained in 21#' `terms`. This is `NULL` until the step is trained by 22#' [prep.recipe()]. 23#' @template step-return 24#' @family dummy variable and encoding steps 25#' @seealso [dummy_names()] 26#' @export 27#' @details `step_dummy()` will create a set of binary dummy 28#' variables from a factor variable. For example, if an unordered 29#' factor column in the data set has levels of "red", "green", 30#' "blue", the dummy variable bake will create two additional 31#' columns of 0/1 data for two of those three values (and remove 32#' the original column). For ordered factors, polynomial contrasts 33#' are used to encode the numeric values. 34#' 35#' By default, the excluded dummy variable (i.e. the reference 36#' cell) will correspond to the first level of the unordered 37#' factor being converted. 38#' 39#' @template dummy-naming 40#' 41#' @details 42#' To change the type of contrast being used, change the global 43#' contrast option via `options`. 44#' 45#' When the factor being converted has a missing value, all of the 46#' corresponding dummy variables are also missing. See [step_unknown()] for 47#' a solution. 48#' 49#' When data to be processed contains novel levels (i.e., not 50#' contained in the training set), a missing value is assigned to 51#' the results. See [step_other()] for an alternative. 52#' 53#' If no columns are selected (perhaps due to an earlier `step_zv()`), 54#' `bake()` will return the data as-is (e.g. with no dummy variables). 55#' 56#' Note that, by default, the new dummy variable column names obey the naming 57#' rules for columns. If there are levels such as "0", [dummy_names()] will put 58#' a leading "X" in front of the level (since it uses [make.names()]). This can 59#' be changed by passing in a different function to the `naming` argument for 60#' this step. 61#' 62#' The [package vignette for dummy variables](https://recipes.tidymodels.org/articles/Dummies.html) 63#' and interactions has more information. 64#' 65#' When you [`tidy()`] this step, a tibble with columns `terms` (the 66#' selectors or original variables selected) and `columns` (the 67#' list of corresponding binary columns) is returned. 68#' 69#' @examples 70#' library(modeldata) 71#' data(okc) 72#' okc <- okc[complete.cases(okc),] 73#' 74#' # Original data: diet has 18 levels 75#' length(unique(okc$diet)) 76#' unique(okc$diet) %>% sort() 77#' 78#' rec <- recipe(~ diet + age + height, data = okc) 79#' 80#' # Default dummy coding: 17 dummy variables 81#' dummies <- rec %>% 82#' step_dummy(diet) %>% 83#' prep(training = okc) 84#' 85#' dummy_data <- bake(dummies, new_data = NULL) 86#' 87#' dummy_data %>% 88#' select(starts_with("diet")) %>% 89#' names() # level "anything" is the reference level 90#' 91#' # Obtain the full set of 18 dummy variables using `one_hot` option 92#' dummies_one_hot <- rec %>% 93#' step_dummy(diet, one_hot = TRUE) %>% 94#' prep(training = okc) 95#' 96#' dummy_data_one_hot <- bake(dummies_one_hot, new_data = NULL) 97#' 98#' dummy_data_one_hot %>% 99#' select(starts_with("diet")) %>% 100#' names() # no reference level 101#' 102#' 103#' tidy(dummies, number = 1) 104#' tidy(dummies_one_hot, number = 1) 105 106 107step_dummy <- 108 function(recipe, 109 ..., 110 role = "predictor", 111 trained = FALSE, 112 one_hot = FALSE, 113 preserve = deprecated(), 114 naming = dummy_names, 115 levels = NULL, 116 keep_original_cols = FALSE, 117 skip = FALSE, 118 id = rand_id("dummy")) { 119 120 if (lifecycle::is_present(preserve)) { 121 lifecycle::deprecate_warn( 122 "0.1.16", 123 "step_dummy(preserve = )", 124 "step_dummy(keep_original_cols = )" 125 ) 126 keep_original_cols <- preserve 127 } 128 129 add_step( 130 recipe, 131 step_dummy_new( 132 terms = ellipse_check(...), 133 role = role, 134 trained = trained, 135 one_hot = one_hot, 136 preserve = keep_original_cols, 137 naming = naming, 138 levels = levels, 139 keep_original_cols = keep_original_cols, 140 skip = skip, 141 id = id 142 ) 143 ) 144 } 145 146step_dummy_new <- 147 function(terms, role, trained, one_hot, preserve, naming, levels, 148 keep_original_cols, skip, id) { 149 step( 150 subclass = "dummy", 151 terms = terms, 152 role = role, 153 trained = trained, 154 one_hot = one_hot, 155 preserve = preserve, 156 naming = naming, 157 levels = levels, 158 keep_original_cols = keep_original_cols, 159 skip = skip, 160 id = id 161 ) 162 } 163 164passover <- function(cmd) { 165 # cat("`step_dummy()` was not able to select any columns. ", 166 # "No dummy variables will be created.\n") 167} # figure out how to return a warning() without exiting 168 169#' @export 170prep.step_dummy <- function(x, training, info = NULL, ...) { 171 col_names <- recipes_eval_select(x$terms, training, info) 172 173 if (length(col_names) > 0) { 174 fac_check <- vapply(training[, col_names], is.factor, logical(1)) 175 if (any(!fac_check)) 176 rlang::warn( 177 paste0( 178 "The following variables are not factor vectors and will be ignored: ", 179 paste0("`", names(fac_check)[!fac_check], "`", collapse = ", ") 180 ) 181 ) 182 col_names <- col_names[fac_check] 183 if (length(col_names) == 0) { 184 rlang::abort( 185 paste0( 186 "The `terms` argument in `step_dummy` did not select ", 187 "any factor columns." 188 ) 189 ) 190 } 191 192 193 ## I hate doing this but currently we are going to have 194 ## to save the terms object from the original (= training) 195 ## data 196 levels <- vector(mode = "list", length = length(col_names)) 197 names(levels) <- col_names 198 for (i in seq_along(col_names)) { 199 form_chr <- paste0("~", col_names[i]) 200 if (x$one_hot) { 201 form_chr <- paste0(form_chr, "-1") 202 } 203 form <- as.formula(form_chr) 204 terms <- model.frame(form, 205 data = training[1,], 206 xlev = x$levels[[i]], 207 na.action = na.pass) 208 levels[[i]] <- attr(terms, "terms") 209 210 ## About factor levels here: once dummy variables are made, 211 ## the `stringsAsFactors` info saved in the recipe (under 212 ## recipe$levels will remove the original record of the 213 ## factor levels at the end of `prep.recipe` since it is 214 ## not a factor anymore. We'll save them here and reset them 215 ## in `bake.step_dummy` just prior to calling `model.matrix` 216 attr(levels[[i]], "values") <- 217 levels(getElement(training, col_names[i])) 218 attr(levels[[i]], ".Environment") <- NULL 219 } 220 } else { 221 levels <- NULL 222 } 223 224 step_dummy_new( 225 terms = x$terms, 226 role = x$role, 227 trained = TRUE, 228 one_hot = x$one_hot, 229 preserve = x$preserve, 230 naming = x$naming, 231 levels = levels, 232 keep_original_cols = get_keep_original_cols(x), 233 skip = x$skip, 234 id = x$id 235 ) 236} 237 238warn_new_levels <- function(dat, lvl, details = NULL) { 239 ind <- which(!(dat %in% lvl)) 240 if (length(ind) > 0) { 241 lvl2 <- unique(dat[ind]) 242 rlang::warn( 243 paste0("There are new levels in a factor: ", 244 paste0(lvl2, collapse = ", "), 245 details 246 ) 247 ) 248 } 249 invisible(NULL) 250} 251 252#' @export 253bake.step_dummy <- function(object, new_data, ...) { 254 255 # If no terms were selected 256 if (length(object$levels) == 0) { 257 return(new_data) 258 } 259 260 col_names <- names(object$levels) 261 keep_original_cols <- get_keep_original_cols(object) 262 263 ## `na.action` cannot be passed to `model.matrix` but we 264 ## can change it globally for a bit 265 old_opt <- options()$na.action 266 options(na.action = "na.pass") 267 on.exit(options(na.action = old_opt)) 268 269 for (i in seq_along(object$levels)) { 270 # Make sure that the incoming data has levels consistent with 271 # the original (see the note above) 272 orig_var <- names(object$levels)[i] 273 fac_type <- attr(object$levels[[i]], "dataClasses") 274 275 if (!any(names(attributes(object$levels[[i]])) == "values")) 276 rlang::abort("Factor level values not recorded") 277 278 if (length(attr(object$levels[[i]], "values")) == 1) 279 rlang::abort( 280 paste0("Only one factor level in ", orig_var, ": ", 281 attr(object$levels[[i]], "values")) 282 ) 283 284 warn_new_levels( 285 new_data[[orig_var]], 286 attr(object$levels[[i]], "values") 287 ) 288 289 new_data[, orig_var] <- 290 factor(getElement(new_data, orig_var), 291 levels = attr(object$levels[[i]], "values"), 292 ordered = fac_type == "ordered") 293 294 indicators <- 295 model.frame( 296 as.formula(paste0("~", orig_var)), 297 data = new_data[, orig_var], 298 xlev = attr(object$levels[[i]], "values"), 299 na.action = na.pass 300 ) 301 302 indicators <- 303 model.matrix( 304 object = object$levels[[i]], 305 data = indicators 306 ) 307 indicators <- as_tibble(indicators) 308 309 options(na.action = old_opt) 310 on.exit(expr = NULL) 311 312 if (!object$one_hot) { 313 indicators <- indicators[, colnames(indicators) != "(Intercept)", drop = FALSE] 314 } 315 316 ## use backticks for nonstandard factor levels here 317 used_lvl <- gsub(paste0("^", col_names[i]), "", colnames(indicators)) 318 colnames(indicators) <- object$naming(col_names[i], used_lvl, fac_type == "ordered") 319 new_data <- bind_cols(new_data, as_tibble(indicators)) 320 if (any(!object$preserve, !keep_original_cols)) { 321 new_data[, col_names[i]] <- NULL 322 } 323 } 324 if (!is_tibble(new_data)) 325 new_data <- as_tibble(new_data) 326 new_data 327} 328 329print.step_dummy <- 330 function(x, width = max(20, options()$width - 20), ...) { 331 if (x$trained) { 332 if (length(x$levels) > 0) { 333 cat("Dummy variables from ") 334 cat(format_ch_vec(names(x$levels), width = width)) 335 } else { 336 cat("Dummy variables were *not* created since no columns were selected.") 337 } 338 } else { 339 cat("Dummy variables from ", sep = "") 340 cat(format_selectors(x$terms, width = width)) 341 } 342 if (x$trained) 343 cat(" [trained]\n") 344 else 345 cat("\n") 346 invisible(x) 347 } 348 349 350get_dummy_columns <- function(x, one_hot) { 351 x <- attr(x, "values") 352 if (!one_hot) x <- x[-1] 353 tibble(columns = x) 354} 355 356 357#' @rdname tidy.recipe 358#' @export 359tidy.step_dummy <- function(x, ...) { 360 if (is_trained(x)) { 361 if (length(x$levels) > 0) { 362 res <- purrr::map_dfr(x$levels, get_dummy_columns, x$one_hot, .id = "terms") 363 } else { 364 res <- tibble(terms = rlang::na_chr, columns = rlang::na_chr) 365 } 366 } else { 367 res <- tibble(terms = sel2char(x$terms), columns = rlang::na_chr) 368 } 369 res$id <- x$id 370 res 371} 372