1#' Impute via bagged trees
2#'
3#' `step_impute_bag` creates a *specification* of a recipe step that will
4#'  create bagged tree models to impute missing data.
5#'
6#' @inheritParams step_center
7#' @param ... One or more selector functions to choose variables to be imputed.
8#'  When used with `imp_vars`, these dots indicate which variables are used to
9#'  predict the missing data in each variable. See [selections()] for more
10#'  details.
11#' @param impute_with A call to `imp_vars` to specify which variables are used
12#'  to impute the variables that can include specific variable names separated
13#'  by commas or different selectors (see [selections()]). If a column is
14#'  included in both lists to be imputed and to be an imputation predictor, it
15#'  will be removed from the latter and not used to impute itself.
16#' @param trees An integer for the number of bagged trees to use in each model.
17#' @param options A list of options to [ipred::ipredbagg()]. Defaults are set
18#'  for the arguments `nbagg` and `keepX` but others can be passed in. **Note**
19#'  that the arguments `X` and `y` should not be passed here.
20#' @param seed_val An integer used to create reproducible models. The same seed
21#'  is used across all imputation models.
22#' @param models The [ipred::ipredbagg()] objects are stored here once this
23#'  bagged trees have be trained by [prep.recipe()].
24#' @template step-return
25#' @family imputation steps
26#' @export
27#' @details For each variable requiring imputation, a bagged tree is created
28#'  where the outcome is the variable of interest and the predictors are any
29#'  other variables listed in the `impute_with` formula. One advantage to the
30#'  bagged tree is that is can accept predictors that have missing values
31#'  themselves. This imputation method can be used when the variable of interest
32#'  (and predictors) are numeric or categorical. Imputed categorical variables
33#'  will remain categorical. Also, integers will be imputed to integer too.
34#'
35#'   Note that if a variable that is to be imputed is also in `impute_with`,
36#'  this variable will be ignored.
37#'
38#'   It is possible that missing values will still occur after imputation if a
39#'  large majority (or all) of the imputing variables are also missing.
40#'
41#'  When you [`tidy()`] this step, a tibble with columns `terms` (the selectors
42#'  or variables selected) and `model` (the bagged tree object) is returned.
43#'
44#'  As of `recipes` 0.1.16, this function name changed from `step_bagimpute()`
45#'    to `step_impute_bag()`.
46#' @references Kuhn, M. and Johnson, K. (2013). *Applied Predictive Modeling*.
47#'  Springer Verlag.
48#' @examples
49#' library(modeldata)
50#' data("credit_data")
51#'
52#' ## missing data per column
53#' vapply(credit_data, function(x) mean(is.na(x)), c(num = 0))
54#'
55#' set.seed(342)
56#' in_training <- sample(1:nrow(credit_data), 2000)
57#'
58#' credit_tr <- credit_data[ in_training, ]
59#' credit_te <- credit_data[-in_training, ]
60#' missing_examples <- c(14, 394, 565)
61#'
62#' rec <- recipe(Price ~ ., data = credit_tr)
63#' \dontrun{
64#' impute_rec <- rec %>%
65#'   step_impute_bag(Status, Home, Marital, Job, Income, Assets, Debt)
66#'
67#' imp_models <- prep(impute_rec, training = credit_tr)
68#'
69#' imputed_te <- bake(imp_models, new_data = credit_te, everything())
70#'
71#' credit_te[missing_examples,]
72#' imputed_te[missing_examples, names(credit_te)]
73#'
74#' tidy(impute_rec, number = 1)
75#' tidy(imp_models, number = 1)
76#'
77#' ## Specifying which variables to imputate with
78#'
79#'  impute_rec <- rec %>%
80#'   step_impute_bag(Status, Home, Marital, Job, Income, Assets, Debt,
81#'                  impute_with = imp_vars(Time, Age, Expenses),
82#'                  # for quick execution, nbagg lowered
83#'                  options = list(nbagg = 5, keepX = FALSE))
84#'
85#' imp_models <- prep(impute_rec, training = credit_tr)
86#'
87#' imputed_te <- bake(imp_models, new_data = credit_te, everything())
88#'
89#' credit_te[missing_examples,]
90#' imputed_te[missing_examples, names(credit_te)]
91#'
92#' tidy(impute_rec, number = 1)
93#' tidy(imp_models, number = 1)
94#' }
95
96step_impute_bag <-
97  function(recipe,
98           ...,
99           role = NA,
100           trained = FALSE,
101           impute_with = imp_vars(all_predictors()),
102           trees = 25,
103           models = NULL,
104           options = list(keepX = FALSE),
105           seed_val = sample.int(10 ^ 4, 1),
106           skip = FALSE,
107           id = rand_id("impute_bag")) {
108    if (is.null(impute_with))
109      rlang::abort("Please list some variables in `impute_with`")
110    add_step(
111      recipe,
112      step_impute_bag_new(
113        terms = ellipse_check(...),
114        role = role,
115        trained = trained,
116        impute_with = impute_with,
117        trees = trees,
118        models = models,
119        options = options,
120        seed_val = seed_val,
121        skip = skip,
122        id = id
123      )
124    )
125  }
126
127#' @rdname step_impute_bag
128#' @export
129step_bagimpute <-
130  function(recipe,
131           ...,
132           role = NA,
133           trained = FALSE,
134           impute_with = imp_vars(all_predictors()),
135           trees = 25,
136           models = NULL,
137           options = list(keepX = FALSE),
138           seed_val = sample.int(10 ^ 4, 1),
139           skip = FALSE,
140           id = rand_id("impute_bag")) {
141    lifecycle::deprecate_warn(
142      when = "0.1.16",
143      what = "recipes::step_bagimpute()",
144      with = "recipes::step_impute_bag()"
145    )
146    step_impute_bag(
147      recipe,
148      ...,
149      role = role,
150      trained = trained,
151      impute_with = impute_with,
152      trees = trees,
153      models = models,
154      options = options,
155      seed_val = seed_val,
156      skip = skip,
157      id = id
158    )
159  }
160
161step_impute_bag_new <-
162  function(terms, role, trained, models, options, impute_with, trees,
163           seed_val, skip, id) {
164    step(
165      subclass = "impute_bag",
166      terms = terms,
167      role = role,
168      trained = trained,
169      impute_with = impute_with,
170      trees = trees,
171      models = models,
172      options = options,
173      seed_val = seed_val,
174      skip = skip,
175      id = id
176    )
177  }
178
179
180bag_wrap <- function(vars, dat, opt, seed_val) {
181  seed_val <- seed_val[1]
182  dat <- as.data.frame(dat[, c(vars$y, vars$x)])
183  if (is.character(dat[[vars$y]])) {
184    dat[[vars$y]] <- factor(dat[[vars$y]])
185  }
186
187  if (!is.null(seed_val) && !is.na(seed_val))
188    set.seed(seed_val)
189
190  out <- do.call("ipredbagg",
191                 c(list(y = dat[, vars$y],
192                        X = dat[, vars$x, drop = FALSE]),
193                   opt))
194  out$..imp_vars <- vars$x
195  out
196}
197
198## This figures out which data should be used to predict each variable
199## scheduled for imputation
200impute_var_lists <- function(to_impute, impute_using, training, info) {
201  to_impute <- recipes_eval_select(to_impute, training, info)
202  impute_using <- recipes_eval_select(impute_using, training, info)
203
204  var_lists <- vector(mode = "list", length = length(to_impute))
205  for (i in seq_along(var_lists)) {
206    var_lists[[i]] <- list(y = to_impute[i],
207                           x = impute_using[!(impute_using %in% to_impute[i])])
208  }
209  var_lists
210}
211
212#' @export
213prep.step_impute_bag <- function(x, training, info = NULL, ...) {
214  var_lists <-
215    impute_var_lists(
216      to_impute = x$terms,
217      impute_using = x$impute_with,
218      training = training,
219      info = info
220    )
221  opt <- x$options
222  opt$nbagg <- x$trees
223
224  x$models <- lapply(
225    var_lists,
226    bag_wrap,
227    dat = training,
228    opt = opt,
229    seed_val = x$seed_val
230  )
231  names(x$models) <- vapply(var_lists, function(x) x$y, c(""))
232
233  step_impute_bag_new(
234    terms = x$terms,
235    role = x$role,
236    trained = TRUE,
237    models = x$models,
238    options = x$options,
239    impute_with = x$impute_with,
240    trees = x$trees,
241    seed_val = x$seed_val,
242    skip = x$skip,
243    id = x$id
244  )
245}
246
247#' @export
248#' @keywords internal
249prep.step_bagimpute <- prep.step_impute_bag
250
251#' @export
252bake.step_impute_bag <- function(object, new_data, ...) {
253  missing_rows <- !complete.cases(new_data)
254  if (!any(missing_rows))
255    return(new_data)
256
257  old_data <- new_data
258  for (i in seq(along.with = object$models)) {
259    imp_var <- names(object$models)[i]
260    missing_rows <- !complete.cases(new_data[, imp_var])
261    if (any(missing_rows)) {
262      preds <- object$models[[imp_var]]$..imp_vars
263      pred_data <- old_data[missing_rows, preds, drop = FALSE]
264      ## do a better job of checking this:
265      if (all(is.na(pred_data))) {
266        rlang::warn("All predictors are missing; cannot impute")
267      } else {
268        pred_vals <- predict(object$models[[imp_var]], pred_data)
269        # For an ipred bug reported on 2021-09-14:
270        pred_vals <- cast(pred_vals, object$models[[imp_var]]$y)
271        new_data[missing_rows, imp_var] <- pred_vals
272      }
273    }
274  }
275  ## changes character to factor!
276  as_tibble(new_data)
277}
278
279#' @export
280#' @keywords internal
281bake.step_bagimpute <- bake.step_impute_bag
282
283#' @export
284print.step_impute_bag <-
285  function(x, width = max(20, options()$width - 31), ...) {
286    cat("Bagged tree imputation for ", sep = "")
287    printer(names(x$models), x$terms, x$trained, width = width)
288    invisible(x)
289  }
290
291#' @export
292#' @keywords internal
293print.step_bagimpute <- print.step_impute_bag
294
295#' @export
296#' @rdname step_impute_bag
297imp_vars <- function(...) quos(...)
298
299#' @rdname tidy.recipe
300#' @export
301tidy.step_impute_bag <- function(x, ...) {
302  if (is_trained(x)) {
303    res <- tibble(terms = names(x$models),
304                  model = x$models)
305  } else {
306    term_names <- sel2char(x$terms)
307    res <- tibble(terms = term_names, model = NA)
308  }
309  res$id <- x$id
310  res
311}
312
313#' @export
314#' @keywords internal
315tidy.step_bagimpute <- tidy.step_impute_bag
316
317# ------------------------------------------------------------------------------
318
319#' @rdname tunable.recipe
320#' @export
321tunable.step_impute_bag <- function(x, ...) {
322  tibble::tibble(
323    name = "trees",
324    call_info = list(list(pkg = "dials", fun = "trees", range = c(5L, 25L))),
325    source = "recipe",
326    component = "step_impute_bag",
327    component_id = x$id
328  )
329}
330
331tunable.step_bagimpute <- tunable.step_impute_bag
332