1#' Impute via bagged trees 2#' 3#' `step_impute_bag` creates a *specification* of a recipe step that will 4#' create bagged tree models to impute missing data. 5#' 6#' @inheritParams step_center 7#' @param ... One or more selector functions to choose variables to be imputed. 8#' When used with `imp_vars`, these dots indicate which variables are used to 9#' predict the missing data in each variable. See [selections()] for more 10#' details. 11#' @param impute_with A call to `imp_vars` to specify which variables are used 12#' to impute the variables that can include specific variable names separated 13#' by commas or different selectors (see [selections()]). If a column is 14#' included in both lists to be imputed and to be an imputation predictor, it 15#' will be removed from the latter and not used to impute itself. 16#' @param trees An integer for the number of bagged trees to use in each model. 17#' @param options A list of options to [ipred::ipredbagg()]. Defaults are set 18#' for the arguments `nbagg` and `keepX` but others can be passed in. **Note** 19#' that the arguments `X` and `y` should not be passed here. 20#' @param seed_val An integer used to create reproducible models. The same seed 21#' is used across all imputation models. 22#' @param models The [ipred::ipredbagg()] objects are stored here once this 23#' bagged trees have be trained by [prep.recipe()]. 24#' @template step-return 25#' @family imputation steps 26#' @export 27#' @details For each variable requiring imputation, a bagged tree is created 28#' where the outcome is the variable of interest and the predictors are any 29#' other variables listed in the `impute_with` formula. One advantage to the 30#' bagged tree is that is can accept predictors that have missing values 31#' themselves. This imputation method can be used when the variable of interest 32#' (and predictors) are numeric or categorical. Imputed categorical variables 33#' will remain categorical. Also, integers will be imputed to integer too. 34#' 35#' Note that if a variable that is to be imputed is also in `impute_with`, 36#' this variable will be ignored. 37#' 38#' It is possible that missing values will still occur after imputation if a 39#' large majority (or all) of the imputing variables are also missing. 40#' 41#' When you [`tidy()`] this step, a tibble with columns `terms` (the selectors 42#' or variables selected) and `model` (the bagged tree object) is returned. 43#' 44#' As of `recipes` 0.1.16, this function name changed from `step_bagimpute()` 45#' to `step_impute_bag()`. 46#' @references Kuhn, M. and Johnson, K. (2013). *Applied Predictive Modeling*. 47#' Springer Verlag. 48#' @examples 49#' library(modeldata) 50#' data("credit_data") 51#' 52#' ## missing data per column 53#' vapply(credit_data, function(x) mean(is.na(x)), c(num = 0)) 54#' 55#' set.seed(342) 56#' in_training <- sample(1:nrow(credit_data), 2000) 57#' 58#' credit_tr <- credit_data[ in_training, ] 59#' credit_te <- credit_data[-in_training, ] 60#' missing_examples <- c(14, 394, 565) 61#' 62#' rec <- recipe(Price ~ ., data = credit_tr) 63#' \dontrun{ 64#' impute_rec <- rec %>% 65#' step_impute_bag(Status, Home, Marital, Job, Income, Assets, Debt) 66#' 67#' imp_models <- prep(impute_rec, training = credit_tr) 68#' 69#' imputed_te <- bake(imp_models, new_data = credit_te, everything()) 70#' 71#' credit_te[missing_examples,] 72#' imputed_te[missing_examples, names(credit_te)] 73#' 74#' tidy(impute_rec, number = 1) 75#' tidy(imp_models, number = 1) 76#' 77#' ## Specifying which variables to imputate with 78#' 79#' impute_rec <- rec %>% 80#' step_impute_bag(Status, Home, Marital, Job, Income, Assets, Debt, 81#' impute_with = imp_vars(Time, Age, Expenses), 82#' # for quick execution, nbagg lowered 83#' options = list(nbagg = 5, keepX = FALSE)) 84#' 85#' imp_models <- prep(impute_rec, training = credit_tr) 86#' 87#' imputed_te <- bake(imp_models, new_data = credit_te, everything()) 88#' 89#' credit_te[missing_examples,] 90#' imputed_te[missing_examples, names(credit_te)] 91#' 92#' tidy(impute_rec, number = 1) 93#' tidy(imp_models, number = 1) 94#' } 95 96step_impute_bag <- 97 function(recipe, 98 ..., 99 role = NA, 100 trained = FALSE, 101 impute_with = imp_vars(all_predictors()), 102 trees = 25, 103 models = NULL, 104 options = list(keepX = FALSE), 105 seed_val = sample.int(10 ^ 4, 1), 106 skip = FALSE, 107 id = rand_id("impute_bag")) { 108 if (is.null(impute_with)) 109 rlang::abort("Please list some variables in `impute_with`") 110 add_step( 111 recipe, 112 step_impute_bag_new( 113 terms = ellipse_check(...), 114 role = role, 115 trained = trained, 116 impute_with = impute_with, 117 trees = trees, 118 models = models, 119 options = options, 120 seed_val = seed_val, 121 skip = skip, 122 id = id 123 ) 124 ) 125 } 126 127#' @rdname step_impute_bag 128#' @export 129step_bagimpute <- 130 function(recipe, 131 ..., 132 role = NA, 133 trained = FALSE, 134 impute_with = imp_vars(all_predictors()), 135 trees = 25, 136 models = NULL, 137 options = list(keepX = FALSE), 138 seed_val = sample.int(10 ^ 4, 1), 139 skip = FALSE, 140 id = rand_id("impute_bag")) { 141 lifecycle::deprecate_warn( 142 when = "0.1.16", 143 what = "recipes::step_bagimpute()", 144 with = "recipes::step_impute_bag()" 145 ) 146 step_impute_bag( 147 recipe, 148 ..., 149 role = role, 150 trained = trained, 151 impute_with = impute_with, 152 trees = trees, 153 models = models, 154 options = options, 155 seed_val = seed_val, 156 skip = skip, 157 id = id 158 ) 159 } 160 161step_impute_bag_new <- 162 function(terms, role, trained, models, options, impute_with, trees, 163 seed_val, skip, id) { 164 step( 165 subclass = "impute_bag", 166 terms = terms, 167 role = role, 168 trained = trained, 169 impute_with = impute_with, 170 trees = trees, 171 models = models, 172 options = options, 173 seed_val = seed_val, 174 skip = skip, 175 id = id 176 ) 177 } 178 179 180bag_wrap <- function(vars, dat, opt, seed_val) { 181 seed_val <- seed_val[1] 182 dat <- as.data.frame(dat[, c(vars$y, vars$x)]) 183 if (is.character(dat[[vars$y]])) { 184 dat[[vars$y]] <- factor(dat[[vars$y]]) 185 } 186 187 if (!is.null(seed_val) && !is.na(seed_val)) 188 set.seed(seed_val) 189 190 out <- do.call("ipredbagg", 191 c(list(y = dat[, vars$y], 192 X = dat[, vars$x, drop = FALSE]), 193 opt)) 194 out$..imp_vars <- vars$x 195 out 196} 197 198## This figures out which data should be used to predict each variable 199## scheduled for imputation 200impute_var_lists <- function(to_impute, impute_using, training, info) { 201 to_impute <- recipes_eval_select(to_impute, training, info) 202 impute_using <- recipes_eval_select(impute_using, training, info) 203 204 var_lists <- vector(mode = "list", length = length(to_impute)) 205 for (i in seq_along(var_lists)) { 206 var_lists[[i]] <- list(y = to_impute[i], 207 x = impute_using[!(impute_using %in% to_impute[i])]) 208 } 209 var_lists 210} 211 212#' @export 213prep.step_impute_bag <- function(x, training, info = NULL, ...) { 214 var_lists <- 215 impute_var_lists( 216 to_impute = x$terms, 217 impute_using = x$impute_with, 218 training = training, 219 info = info 220 ) 221 opt <- x$options 222 opt$nbagg <- x$trees 223 224 x$models <- lapply( 225 var_lists, 226 bag_wrap, 227 dat = training, 228 opt = opt, 229 seed_val = x$seed_val 230 ) 231 names(x$models) <- vapply(var_lists, function(x) x$y, c("")) 232 233 step_impute_bag_new( 234 terms = x$terms, 235 role = x$role, 236 trained = TRUE, 237 models = x$models, 238 options = x$options, 239 impute_with = x$impute_with, 240 trees = x$trees, 241 seed_val = x$seed_val, 242 skip = x$skip, 243 id = x$id 244 ) 245} 246 247#' @export 248#' @keywords internal 249prep.step_bagimpute <- prep.step_impute_bag 250 251#' @export 252bake.step_impute_bag <- function(object, new_data, ...) { 253 missing_rows <- !complete.cases(new_data) 254 if (!any(missing_rows)) 255 return(new_data) 256 257 old_data <- new_data 258 for (i in seq(along.with = object$models)) { 259 imp_var <- names(object$models)[i] 260 missing_rows <- !complete.cases(new_data[, imp_var]) 261 if (any(missing_rows)) { 262 preds <- object$models[[imp_var]]$..imp_vars 263 pred_data <- old_data[missing_rows, preds, drop = FALSE] 264 ## do a better job of checking this: 265 if (all(is.na(pred_data))) { 266 rlang::warn("All predictors are missing; cannot impute") 267 } else { 268 pred_vals <- predict(object$models[[imp_var]], pred_data) 269 # For an ipred bug reported on 2021-09-14: 270 pred_vals <- cast(pred_vals, object$models[[imp_var]]$y) 271 new_data[missing_rows, imp_var] <- pred_vals 272 } 273 } 274 } 275 ## changes character to factor! 276 as_tibble(new_data) 277} 278 279#' @export 280#' @keywords internal 281bake.step_bagimpute <- bake.step_impute_bag 282 283#' @export 284print.step_impute_bag <- 285 function(x, width = max(20, options()$width - 31), ...) { 286 cat("Bagged tree imputation for ", sep = "") 287 printer(names(x$models), x$terms, x$trained, width = width) 288 invisible(x) 289 } 290 291#' @export 292#' @keywords internal 293print.step_bagimpute <- print.step_impute_bag 294 295#' @export 296#' @rdname step_impute_bag 297imp_vars <- function(...) quos(...) 298 299#' @rdname tidy.recipe 300#' @export 301tidy.step_impute_bag <- function(x, ...) { 302 if (is_trained(x)) { 303 res <- tibble(terms = names(x$models), 304 model = x$models) 305 } else { 306 term_names <- sel2char(x$terms) 307 res <- tibble(terms = term_names, model = NA) 308 } 309 res$id <- x$id 310 res 311} 312 313#' @export 314#' @keywords internal 315tidy.step_bagimpute <- tidy.step_impute_bag 316 317# ------------------------------------------------------------------------------ 318 319#' @rdname tunable.recipe 320#' @export 321tunable.step_impute_bag <- function(x, ...) { 322 tibble::tibble( 323 name = "trees", 324 call_info = list(list(pkg = "dials", fun = "trees", range = c(5L, 25L))), 325 source = "recipe", 326 component = "step_impute_bag", 327 component_id = x$id 328 ) 329} 330 331tunable.step_bagimpute <- tunable.step_impute_bag 332