1#' Impute nominal data using the most common value 2#' 3#' `step_impute_mode` creates a *specification* of a 4#' recipe step that will substitute missing values of nominal 5#' variables by the training set mode of those variables. 6#' 7#' @inheritParams step_center 8#' @param modes A named character vector of modes. This is 9#' `NULL` until computed by [prep.recipe()]. 10#' @param ptype A data frame prototype to cast new data sets to. This is 11#' commonly a 0-row slice of the training set. 12#' @template step-return 13#' @family imputation steps 14#' @export 15#' @details `step_impute_mode` estimates the variable modes 16#' from the data used in the `training` argument of 17#' `prep.recipe`. `bake.recipe` then applies the new 18#' values to new data sets using these values. If the training set 19#' data has more than one mode, one is selected at random. 20#' 21#' When you [`tidy()`] this step, a tibble with columns `terms` (the 22#' selectors or variables selected) and `model` (the mode 23#' value) is returned. 24#' 25#' As of `recipes` 0.1.16, this function name changed from `step_modeimpute()` 26#' to `step_impute_mode()`. 27#' 28#' @examples 29#' library(modeldata) 30#' data("credit_data") 31#' 32#' ## missing data per column 33#' vapply(credit_data, function(x) mean(is.na(x)), c(num = 0)) 34#' 35#' set.seed(342) 36#' in_training <- sample(1:nrow(credit_data), 2000) 37#' 38#' credit_tr <- credit_data[ in_training, ] 39#' credit_te <- credit_data[-in_training, ] 40#' missing_examples <- c(14, 394, 565) 41#' 42#' rec <- recipe(Price ~ ., data = credit_tr) 43#' 44#' impute_rec <- rec %>% 45#' step_impute_mode(Status, Home, Marital) 46#' 47#' imp_models <- prep(impute_rec, training = credit_tr) 48#' 49#' imputed_te <- bake(imp_models, new_data = credit_te, everything()) 50#' 51#' table(credit_te$Home, imputed_te$Home, useNA = "always") 52#' 53#' tidy(impute_rec, number = 1) 54#' tidy(imp_models, number = 1) 55 56step_impute_mode <- 57 function(recipe, 58 ..., 59 role = NA, 60 trained = FALSE, 61 modes = NULL, 62 ptype = NULL, 63 skip = FALSE, 64 id = rand_id("impute_mode")) { 65 add_step( 66 recipe, 67 step_impute_mode_new( 68 terms = ellipse_check(...), 69 role = role, 70 trained = trained, 71 modes = modes, 72 ptype = ptype, 73 skip = skip, 74 id = id 75 ) 76 ) 77 } 78 79#' @rdname step_impute_mode 80#' @export 81step_modeimpute <- 82 function(recipe, 83 ..., 84 role = NA, 85 trained = FALSE, 86 modes = NULL, 87 ptype = NULL, 88 skip = FALSE, 89 id = rand_id("impute_mode")) { 90 lifecycle::deprecate_warn( 91 when = "0.1.16", 92 what = "recipes::step_modeimpute()", 93 with = "recipes::step_impute_mode()" 94 ) 95 step_impute_mode( 96 recipe, 97 ..., 98 role = role, 99 trained = trained, 100 modes = modes, 101 ptype = ptype, 102 skip = skip, 103 id = id 104 ) 105 } 106 107step_impute_mode_new <- 108 function(terms, role, trained, modes, ptype, skip, id) { 109 step( 110 subclass = "impute_mode", 111 terms = terms, 112 role = role, 113 trained = trained, 114 modes = modes, 115 ptype = ptype, 116 skip = skip, 117 id = id 118 ) 119 } 120 121#' @export 122prep.step_impute_mode <- function(x, training, info = NULL, ...) { 123 col_names <- recipes_eval_select(x$terms, training, info) 124 modes <- vapply(training[, col_names], mode_est, c(mode = "")) 125 ptype <- vec_slice(training[, col_names], 0) 126 step_impute_mode_new( 127 terms = x$terms, 128 role = x$role, 129 trained = TRUE, 130 modes = modes, 131 ptype = ptype, 132 skip = x$skip, 133 id = x$id 134 ) 135} 136 137#' @export 138#' @keywords internal 139prep.step_modeimpute <- prep.step_impute_mode 140 141#' @export 142bake.step_impute_mode <- function(object, new_data, ...) { 143 144 for (i in names(object$modes)) { 145 if (any(is.na(new_data[, i]))) { 146 if(is.null(object$ptype)) { 147 rlang::warn( 148 paste0( 149 "'ptype' was added to `step_impute_mode()` after this recipe was created.\n", 150 "Regenerate your recipe to avoid this warning." 151 ) 152 ) 153 } else { 154 new_data[[i]] <- vec_cast(new_data[[i]], object$ptype[[i]]) 155 } 156 mode_val <- cast(object$modes[[i]], new_data[[i]]) 157 new_data[is.na(new_data[[i]]), i] <- mode_val 158 } 159 } 160 as_tibble(new_data) 161} 162 163#' @export 164#' @keywords internal 165bake.step_modeimpute <- bake.step_impute_mode 166 167#' @export 168print.step_impute_mode <- 169 function(x, width = max(20, options()$width - 30), ...) { 170 cat("Mode Imputation for ", sep = "") 171 printer(names(x$modes), x$terms, x$trained, width = width) 172 invisible(x) 173 } 174 175#' @export 176#' @keywords internal 177print.step_modeimpute <- print.step_impute_mode 178 179mode_est <- function(x) { 180 if (!is.character(x) & !is.factor(x)) 181 rlang::abort("The data should be character or factor to compute the mode.") 182 tab <- table(x) 183 modes <- names(tab)[tab == max(tab)] 184 sample(modes, size = 1) 185} 186 187#' @rdname tidy.recipe 188#' @export 189tidy.step_impute_mode <- function(x, ...) { 190 if (is_trained(x)) { 191 res <- tibble(terms = names(x$modes), 192 model = x$modes) 193 } else { 194 term_names <- sel2char(x$terms) 195 res <- tibble(terms = term_names, model = na_chr) 196 } 197 res$id <- x$id 198 res 199} 200 201#' @export 202#' @keywords internal 203tidy.step_modeimpute <- tidy.step_impute_mode 204