1#' Impute nominal data using the most common value
2#'
3#'   `step_impute_mode` creates a *specification* of a
4#'  recipe step that will substitute missing values of nominal
5#'  variables by the training set mode of those variables.
6#'
7#' @inheritParams step_center
8#' @param modes A named character vector of modes. This is
9#'  `NULL` until computed by [prep.recipe()].
10#' @param ptype A data frame prototype to cast new data sets to. This is
11#'  commonly a 0-row slice of the training set.
12#' @template step-return
13#' @family imputation steps
14#' @export
15#' @details `step_impute_mode` estimates the variable modes
16#'  from the data used in the `training` argument of
17#'  `prep.recipe`. `bake.recipe` then applies the new
18#'  values to new data sets using these values. If the training set
19#'  data has more than one mode, one is selected at random.
20#'
21#' When you [`tidy()`] this step, a tibble with columns `terms` (the
22#'  selectors or variables selected) and `model` (the mode
23#'  value) is returned.
24#'
25#'  As of `recipes` 0.1.16, this function name changed from `step_modeimpute()`
26#'    to `step_impute_mode()`.
27#'
28#' @examples
29#' library(modeldata)
30#' data("credit_data")
31#'
32#' ## missing data per column
33#' vapply(credit_data, function(x) mean(is.na(x)), c(num = 0))
34#'
35#' set.seed(342)
36#' in_training <- sample(1:nrow(credit_data), 2000)
37#'
38#' credit_tr <- credit_data[ in_training, ]
39#' credit_te <- credit_data[-in_training, ]
40#' missing_examples <- c(14, 394, 565)
41#'
42#' rec <- recipe(Price ~ ., data = credit_tr)
43#'
44#' impute_rec <- rec %>%
45#'   step_impute_mode(Status, Home, Marital)
46#'
47#' imp_models <- prep(impute_rec, training = credit_tr)
48#'
49#' imputed_te <- bake(imp_models, new_data = credit_te, everything())
50#'
51#' table(credit_te$Home, imputed_te$Home, useNA = "always")
52#'
53#' tidy(impute_rec, number = 1)
54#' tidy(imp_models, number = 1)
55
56step_impute_mode <-
57  function(recipe,
58           ...,
59           role = NA,
60           trained = FALSE,
61           modes = NULL,
62           ptype = NULL,
63           skip = FALSE,
64           id = rand_id("impute_mode")) {
65    add_step(
66      recipe,
67      step_impute_mode_new(
68        terms = ellipse_check(...),
69        role = role,
70        trained = trained,
71        modes = modes,
72        ptype = ptype,
73        skip = skip,
74        id = id
75      )
76    )
77  }
78
79#' @rdname step_impute_mode
80#' @export
81step_modeimpute <-
82  function(recipe,
83           ...,
84           role = NA,
85           trained = FALSE,
86           modes = NULL,
87           ptype = NULL,
88           skip = FALSE,
89           id = rand_id("impute_mode")) {
90    lifecycle::deprecate_warn(
91      when = "0.1.16",
92      what = "recipes::step_modeimpute()",
93      with = "recipes::step_impute_mode()"
94    )
95    step_impute_mode(
96      recipe,
97      ...,
98      role = role,
99      trained = trained,
100      modes = modes,
101      ptype = ptype,
102      skip = skip,
103      id = id
104    )
105  }
106
107step_impute_mode_new <-
108  function(terms, role, trained, modes, ptype, skip, id) {
109    step(
110      subclass = "impute_mode",
111      terms = terms,
112      role = role,
113      trained = trained,
114      modes = modes,
115      ptype = ptype,
116      skip = skip,
117      id = id
118    )
119  }
120
121#' @export
122prep.step_impute_mode <- function(x, training, info = NULL, ...) {
123  col_names <- recipes_eval_select(x$terms, training, info)
124  modes <- vapply(training[, col_names], mode_est, c(mode = ""))
125  ptype <- vec_slice(training[, col_names], 0)
126  step_impute_mode_new(
127    terms = x$terms,
128    role = x$role,
129    trained = TRUE,
130    modes = modes,
131    ptype = ptype,
132    skip = x$skip,
133    id = x$id
134  )
135}
136
137#' @export
138#' @keywords internal
139prep.step_modeimpute <- prep.step_impute_mode
140
141#' @export
142bake.step_impute_mode <- function(object, new_data, ...) {
143
144  for (i in names(object$modes)) {
145    if (any(is.na(new_data[, i]))) {
146      if(is.null(object$ptype)) {
147        rlang::warn(
148          paste0(
149            "'ptype' was added to `step_impute_mode()` after this recipe was created.\n",
150            "Regenerate your recipe to avoid this warning."
151          )
152        )
153      } else {
154        new_data[[i]] <- vec_cast(new_data[[i]], object$ptype[[i]])
155      }
156      mode_val <- cast(object$modes[[i]], new_data[[i]])
157      new_data[is.na(new_data[[i]]), i] <- mode_val
158    }
159  }
160  as_tibble(new_data)
161}
162
163#' @export
164#' @keywords internal
165bake.step_modeimpute <- bake.step_impute_mode
166
167#' @export
168print.step_impute_mode <-
169  function(x, width = max(20, options()$width - 30), ...) {
170    cat("Mode Imputation for ", sep = "")
171    printer(names(x$modes), x$terms, x$trained, width = width)
172    invisible(x)
173  }
174
175#' @export
176#' @keywords internal
177print.step_modeimpute <- print.step_impute_mode
178
179mode_est <- function(x) {
180  if (!is.character(x) & !is.factor(x))
181    rlang::abort("The data should be character or factor to compute the mode.")
182  tab <- table(x)
183  modes <- names(tab)[tab == max(tab)]
184  sample(modes, size = 1)
185}
186
187#' @rdname tidy.recipe
188#' @export
189tidy.step_impute_mode <- function(x, ...) {
190  if (is_trained(x)) {
191    res <- tibble(terms = names(x$modes),
192                  model = x$modes)
193  } else {
194    term_names <- sel2char(x$terms)
195    res <- tibble(terms = term_names, model = na_chr)
196  }
197  res$id <- x$id
198  res
199}
200
201#' @export
202#' @keywords internal
203tidy.step_modeimpute <- tidy.step_impute_mode
204