1#' Create Missing Data Column Indicators 2#' 3#' `step_indicate_na` creates a *specification* of a recipe step that will 4#' create and append additional binary columns to the dataset to indicate 5#' which observations are missing. 6#' 7#' @inheritParams step_pca 8#' @inheritParams step_center 9#' @param columns A character string of variable names that will 10#' be populated (eventually) by the terms argument. 11#' @param prefix A character string that will be the prefix to the 12#' resulting new variables. Defaults to "na_ind". 13#' @template step-return 14#' @details When you [`tidy()`] this step, a tibble with 15#' columns `terms` (the selectors or variables selected) and `model` (the 16#' median value) is returned. 17#' @family dummy variable and encoding steps 18#' @export 19#' @examples 20#' library(modeldata) 21#' data("credit_data") 22#' 23#' ## missing data per column 24#' purrr::map_dbl(credit_data, function(x) mean(is.na(x))) 25#' 26#' set.seed(342) 27#' in_training <- sample(1:nrow(credit_data), 2000) 28#' 29#' credit_tr <- credit_data[ in_training, ] 30#' credit_te <- credit_data[-in_training, ] 31#' 32#' rec <- recipe(Price ~ ., data = credit_tr) 33#' 34#' impute_rec <- rec %>% 35#' step_indicate_na(Income, Assets, Debt) 36#' 37#' imp_models <- prep(impute_rec, training = credit_tr) 38#' 39#' imputed_te <- bake(imp_models, new_data = credit_te, everything()) 40 41step_indicate_na <- 42 function(recipe, 43 ..., 44 role = "predictor", 45 trained = FALSE, 46 columns = NULL, 47 prefix = "na_ind", 48 skip = FALSE, 49 id = rand_id("indicate_na")) { 50 51 terms = ellipse_check(...) 52 53 add_step( 54 recipe, 55 step_indicate_na_new( 56 terms = terms, 57 role = role, 58 trained = trained, 59 columns = columns, 60 prefix = prefix, 61 skip = skip, 62 id = id 63 ) 64 ) 65 } 66 67step_indicate_na_new <- 68 function(terms, role, trained, columns, prefix, skip, id) { 69 step( 70 subclass = "indicate_na", 71 terms = terms, 72 role = role, 73 trained = trained, 74 columns = columns, 75 prefix = prefix, 76 skip = skip, 77 id = id 78 ) 79 } 80 81#' @export 82prep.step_indicate_na <- function(x, training, info = NULL, ...) { 83 col_names <- recipes_eval_select(x$terms, training, info) 84 85 step_indicate_na_new( 86 terms = x$terms, 87 role = x$role, 88 trained = TRUE, 89 columns = col_names, 90 prefix = x$prefix, 91 skip = x$skip, 92 id = x$id 93 ) 94} 95 96#' @export 97bake.step_indicate_na <- function(object, new_data, ...) { 98 col_names <- object$columns 99 100 df_ind_na <- purrr::map_dfc( 101 new_data[col_names], 102 ~ifelse(is.na(.x), 1L, 0L) 103 ) %>% 104 dplyr::rename_with(~paste0(object$prefix, "_", .x)) 105 new_data <- dplyr::bind_cols(new_data, df_ind_na) 106 107 tibble::as_tibble(new_data) 108} 109 110print.step_indicate_na <- 111 function(x, width = max(20, options()$width - 30), ...) { 112 cat("Creating missing data variable indicators for ", sep = "") 113 printer(x$columns, x$terms, x$trained, width = width) 114 invisible(x) 115 } 116 117#' @rdname tidy.recipe 118#' @export 119tidy.step_indicate_na <- function(x, ...) { 120 if (is_trained(x)) { 121 res <- tibble::tibble(terms = x$columns) 122 } else { 123 res <- tibble::tibble(terms = sel2char(x$terms)) 124 } 125 res$id <- x$id 126 res 127} 128 129