1#' Dummy Variables Creation
2#'
3#' `step_dummy()` creates a *specification* of a recipe
4#'  step that will convert nominal data (e.g. character or factors)
5#'  into one or more numeric binary model terms for the levels of
6#'  the original data.
7#'
8#' @inheritParams step_pca
9#' @inheritParams step_center
10#' @param ... One or more selector functions to choose variables
11#'  for this step. See [selections()] for more details. The selected
12#'  variables _must_ be factors.
13#' @param one_hot A logical. For C levels, should C dummy variables be created
14#' rather than C-1?
15#' @param preserve Use `keep_original_cols` to specify whether the selected
16#'  column(s) should be retained (in addition to the new dummy variables).
17#' @param naming A function that defines the naming convention for
18#'  new dummy columns. See Details below.
19#' @param levels A list that contains the information needed to
20#'  create dummy variables for each variable contained in
21#'  `terms`. This is `NULL` until the step is trained by
22#'  [prep.recipe()].
23#' @template step-return
24#' @family dummy variable and encoding steps
25#' @seealso [dummy_names()]
26#' @export
27#' @details `step_dummy()` will create a set of binary dummy
28#'  variables from a factor variable. For example, if an unordered
29#'  factor column in the data set has levels of "red", "green",
30#'  "blue", the dummy variable bake will create two additional
31#'  columns of 0/1 data for two of those three values (and remove
32#'  the original column). For ordered factors, polynomial contrasts
33#'  are used to encode the numeric values.
34#'
35#' By default, the excluded dummy variable (i.e. the reference
36#'  cell) will correspond to the first level of the unordered
37#'  factor being converted.
38#'
39#' @template dummy-naming
40#'
41#' @details
42#' To change the type of contrast being used, change the global
43#' contrast option via `options`.
44#'
45#' When the factor being converted has a missing value, all of the
46#'  corresponding dummy variables are also missing. See [step_unknown()] for
47#'  a solution.
48#'
49#' When data to be processed contains novel levels (i.e., not
50#' contained in the training set), a missing value is assigned to
51#' the results. See [step_other()] for an alternative.
52#'
53#' If no columns are selected (perhaps due to an earlier `step_zv()`),
54#'  `bake()` will return the data as-is (e.g. with no dummy variables).
55#'
56#' Note that, by default, the new dummy variable column names obey the naming
57#' rules for columns. If there are levels such as "0", [dummy_names()] will put
58#' a leading "X" in front of the level (since it uses [make.names()]). This can
59#' be changed by passing in a different function to the `naming` argument for
60#' this step.
61#'
62#' The [package vignette for dummy variables](https://recipes.tidymodels.org/articles/Dummies.html)
63#' and interactions has more information.
64#'
65#'  When you [`tidy()`] this step, a tibble with columns `terms` (the
66#'  selectors or original variables selected) and `columns` (the
67#'  list of corresponding binary columns) is returned.
68#'
69#' @examples
70#' library(modeldata)
71#' data(okc)
72#' okc <- okc[complete.cases(okc),]
73#'
74#' # Original data: diet has 18 levels
75#' length(unique(okc$diet))
76#' unique(okc$diet) %>% sort()
77#'
78#' rec <- recipe(~ diet + age + height, data = okc)
79#'
80#' # Default dummy coding: 17 dummy variables
81#' dummies <- rec %>%
82#'     step_dummy(diet) %>%
83#'     prep(training = okc)
84#'
85#' dummy_data <- bake(dummies, new_data = NULL)
86#'
87#' dummy_data %>%
88#'     select(starts_with("diet")) %>%
89#'     names() # level "anything" is the reference level
90#'
91#' # Obtain the full set of 18 dummy variables using `one_hot` option
92#' dummies_one_hot <- rec %>%
93#'     step_dummy(diet, one_hot = TRUE) %>%
94#'     prep(training = okc)
95#'
96#' dummy_data_one_hot <- bake(dummies_one_hot, new_data = NULL)
97#'
98#' dummy_data_one_hot %>%
99#'     select(starts_with("diet")) %>%
100#'     names() # no reference level
101#'
102#'
103#' tidy(dummies, number = 1)
104#' tidy(dummies_one_hot, number = 1)
105
106
107step_dummy <-
108  function(recipe,
109           ...,
110           role = "predictor",
111           trained = FALSE,
112           one_hot = FALSE,
113           preserve = deprecated(),
114           naming = dummy_names,
115           levels = NULL,
116           keep_original_cols = FALSE,
117           skip = FALSE,
118           id = rand_id("dummy")) {
119
120    if (lifecycle::is_present(preserve)) {
121      lifecycle::deprecate_warn(
122        "0.1.16",
123        "step_dummy(preserve = )",
124        "step_dummy(keep_original_cols = )"
125      )
126      keep_original_cols <- preserve
127    }
128
129    add_step(
130      recipe,
131      step_dummy_new(
132        terms = ellipse_check(...),
133        role = role,
134        trained = trained,
135        one_hot = one_hot,
136        preserve = keep_original_cols,
137        naming = naming,
138        levels = levels,
139        keep_original_cols = keep_original_cols,
140        skip = skip,
141        id = id
142      )
143    )
144  }
145
146step_dummy_new <-
147  function(terms, role, trained, one_hot, preserve, naming, levels,
148           keep_original_cols, skip, id) {
149    step(
150      subclass = "dummy",
151      terms = terms,
152      role = role,
153      trained = trained,
154      one_hot = one_hot,
155      preserve = preserve,
156      naming = naming,
157      levels = levels,
158      keep_original_cols = keep_original_cols,
159      skip = skip,
160      id = id
161    )
162  }
163
164passover <- function(cmd) {
165  # cat("`step_dummy()` was not able to select any columns. ",
166  #     "No dummy variables will be created.\n")
167} # figure out how to return a warning() without exiting
168
169#' @export
170prep.step_dummy <- function(x, training, info = NULL, ...) {
171  col_names <- recipes_eval_select(x$terms, training, info)
172
173  if (length(col_names) > 0) {
174    fac_check <- vapply(training[, col_names], is.factor, logical(1))
175    if (any(!fac_check))
176      rlang::warn(
177        paste0(
178        "The following variables are not factor vectors and will be ignored: ",
179        paste0("`", names(fac_check)[!fac_check], "`", collapse = ", ")
180        )
181      )
182    col_names <- col_names[fac_check]
183    if (length(col_names) == 0) {
184      rlang::abort(
185        paste0(
186        "The `terms` argument in `step_dummy` did not select ",
187        "any factor columns."
188        )
189      )
190    }
191
192
193    ## I hate doing this but currently we are going to have
194    ## to save the terms object from the original (= training)
195    ## data
196    levels <- vector(mode = "list", length = length(col_names))
197    names(levels) <- col_names
198    for (i in seq_along(col_names)) {
199      form_chr <- paste0("~", col_names[i])
200      if (x$one_hot) {
201        form_chr <- paste0(form_chr, "-1")
202      }
203      form <- as.formula(form_chr)
204      terms <- model.frame(form,
205                           data = training[1,],
206                           xlev = x$levels[[i]],
207                           na.action = na.pass)
208      levels[[i]] <- attr(terms, "terms")
209
210      ## About factor levels here: once dummy variables are made,
211      ## the `stringsAsFactors` info saved in the recipe (under
212      ## recipe$levels will remove the original record of the
213      ## factor levels at the end of `prep.recipe` since it is
214      ## not a factor anymore. We'll save them here and reset them
215      ## in `bake.step_dummy` just prior to calling `model.matrix`
216      attr(levels[[i]], "values") <-
217        levels(getElement(training, col_names[i]))
218      attr(levels[[i]], ".Environment") <- NULL
219    }
220  } else {
221    levels <- NULL
222  }
223
224  step_dummy_new(
225    terms = x$terms,
226    role = x$role,
227    trained = TRUE,
228    one_hot = x$one_hot,
229    preserve = x$preserve,
230    naming = x$naming,
231    levels = levels,
232    keep_original_cols = get_keep_original_cols(x),
233    skip = x$skip,
234    id = x$id
235  )
236}
237
238warn_new_levels <- function(dat, lvl, details = NULL) {
239  ind <- which(!(dat %in% lvl))
240  if (length(ind) > 0) {
241    lvl2 <- unique(dat[ind])
242    rlang::warn(
243      paste0("There are new levels in a factor: ",
244            paste0(lvl2, collapse = ", "),
245            details
246            )
247      )
248  }
249  invisible(NULL)
250}
251
252#' @export
253bake.step_dummy <- function(object, new_data, ...) {
254
255  # If no terms were selected
256  if (length(object$levels) == 0) {
257    return(new_data)
258  }
259
260  col_names <- names(object$levels)
261  keep_original_cols <- get_keep_original_cols(object)
262
263  ## `na.action` cannot be passed to `model.matrix` but we
264  ## can change it globally for a bit
265  old_opt <- options()$na.action
266  options(na.action = "na.pass")
267  on.exit(options(na.action = old_opt))
268
269  for (i in seq_along(object$levels)) {
270    # Make sure that the incoming data has levels consistent with
271    # the original (see the note above)
272    orig_var <- names(object$levels)[i]
273    fac_type <- attr(object$levels[[i]], "dataClasses")
274
275    if (!any(names(attributes(object$levels[[i]])) == "values"))
276      rlang::abort("Factor level values not recorded")
277
278    if (length(attr(object$levels[[i]], "values")) == 1)
279      rlang::abort(
280        paste0("Only one factor level in ", orig_var, ": ",
281               attr(object$levels[[i]], "values"))
282        )
283
284    warn_new_levels(
285      new_data[[orig_var]],
286      attr(object$levels[[i]], "values")
287    )
288
289    new_data[, orig_var] <-
290      factor(getElement(new_data, orig_var),
291             levels = attr(object$levels[[i]], "values"),
292             ordered = fac_type == "ordered")
293
294    indicators <-
295      model.frame(
296        as.formula(paste0("~", orig_var)),
297        data = new_data[, orig_var],
298        xlev = attr(object$levels[[i]], "values"),
299        na.action = na.pass
300      )
301
302    indicators <-
303      model.matrix(
304        object = object$levels[[i]],
305        data = indicators
306      )
307    indicators <- as_tibble(indicators)
308
309    options(na.action = old_opt)
310    on.exit(expr = NULL)
311
312    if (!object$one_hot) {
313      indicators <- indicators[, colnames(indicators) != "(Intercept)", drop = FALSE]
314    }
315
316    ## use backticks for nonstandard factor levels here
317    used_lvl <- gsub(paste0("^", col_names[i]), "", colnames(indicators))
318    colnames(indicators) <- object$naming(col_names[i], used_lvl, fac_type == "ordered")
319    new_data <- bind_cols(new_data, as_tibble(indicators))
320    if (any(!object$preserve, !keep_original_cols)) {
321      new_data[, col_names[i]] <- NULL
322    }
323  }
324  if (!is_tibble(new_data))
325    new_data <- as_tibble(new_data)
326  new_data
327}
328
329print.step_dummy <-
330  function(x, width = max(20, options()$width - 20), ...) {
331    if (x$trained) {
332      if (length(x$levels) > 0) {
333        cat("Dummy variables from ")
334        cat(format_ch_vec(names(x$levels), width = width))
335      } else {
336        cat("Dummy variables were *not* created since no columns were selected.")
337      }
338    } else {
339      cat("Dummy variables from ", sep = "")
340      cat(format_selectors(x$terms, width = width))
341    }
342    if (x$trained)
343      cat(" [trained]\n")
344    else
345      cat("\n")
346    invisible(x)
347  }
348
349
350get_dummy_columns <- function(x, one_hot) {
351  x <- attr(x, "values")
352  if (!one_hot) x <- x[-1]
353  tibble(columns = x)
354}
355
356
357#' @rdname tidy.recipe
358#' @export
359tidy.step_dummy <- function(x, ...) {
360  if (is_trained(x)) {
361    if (length(x$levels) > 0) {
362      res <- purrr::map_dfr(x$levels, get_dummy_columns, x$one_hot, .id = "terms")
363    } else {
364      res <- tibble(terms = rlang::na_chr, columns = rlang::na_chr)
365    }
366  } else {
367    res <- tibble(terms = sel2char(x$terms), columns = rlang::na_chr)
368  }
369  res$id <- x$id
370  res
371}
372