1#' Create Missing Data Column Indicators
2#'
3#' `step_indicate_na` creates a *specification* of a recipe step that will
4#'  create and append additional binary columns to the dataset to indicate
5#'  which observations are missing.
6#'
7#' @inheritParams step_pca
8#' @inheritParams step_center
9#' @param columns A character string of variable names that will
10#'  be populated (eventually) by the terms argument.
11#' @param prefix A character string that will be the prefix to the
12#'  resulting new variables. Defaults to "na_ind".
13#' @template step-return
14#' @details  When you [`tidy()`] this step, a tibble with
15#'  columns `terms` (the selectors or variables selected) and `model` (the
16#'  median value) is returned.
17#' @family dummy variable and encoding steps
18#' @export
19#' @examples
20#' library(modeldata)
21#' data("credit_data")
22#'
23#' ## missing data per column
24#' purrr::map_dbl(credit_data, function(x) mean(is.na(x)))
25#'
26#' set.seed(342)
27#' in_training <- sample(1:nrow(credit_data), 2000)
28#'
29#' credit_tr <- credit_data[ in_training, ]
30#' credit_te <- credit_data[-in_training, ]
31#'
32#' rec <- recipe(Price ~ ., data = credit_tr)
33#'
34#' impute_rec <- rec %>%
35#'   step_indicate_na(Income, Assets, Debt)
36#'
37#' imp_models <- prep(impute_rec, training = credit_tr)
38#'
39#' imputed_te <- bake(imp_models, new_data = credit_te, everything())
40
41step_indicate_na <-
42  function(recipe,
43           ...,
44           role = "predictor",
45           trained = FALSE,
46           columns = NULL,
47           prefix = "na_ind",
48           skip = FALSE,
49           id = rand_id("indicate_na")) {
50
51    terms = ellipse_check(...)
52
53    add_step(
54      recipe,
55      step_indicate_na_new(
56        terms = terms,
57        role = role,
58        trained = trained,
59        columns = columns,
60        prefix = prefix,
61        skip = skip,
62        id = id
63      )
64    )
65  }
66
67step_indicate_na_new <-
68  function(terms, role, trained, columns, prefix, skip, id) {
69    step(
70      subclass = "indicate_na",
71      terms = terms,
72      role = role,
73      trained = trained,
74      columns = columns,
75      prefix = prefix,
76      skip = skip,
77      id = id
78    )
79  }
80
81#' @export
82prep.step_indicate_na <- function(x, training, info = NULL, ...) {
83  col_names <- recipes_eval_select(x$terms, training, info)
84
85  step_indicate_na_new(
86    terms = x$terms,
87    role = x$role,
88    trained = TRUE,
89    columns = col_names,
90    prefix = x$prefix,
91    skip = x$skip,
92    id = x$id
93  )
94}
95
96#' @export
97bake.step_indicate_na <- function(object, new_data, ...) {
98  col_names <- object$columns
99
100  df_ind_na <- purrr::map_dfc(
101      new_data[col_names],
102      ~ifelse(is.na(.x), 1L, 0L)
103    ) %>%
104    dplyr::rename_with(~paste0(object$prefix, "_", .x))
105  new_data <- dplyr::bind_cols(new_data, df_ind_na)
106
107  tibble::as_tibble(new_data)
108}
109
110print.step_indicate_na <-
111  function(x, width = max(20, options()$width - 30), ...) {
112    cat("Creating missing data variable indicators for ", sep = "")
113    printer(x$columns, x$terms, x$trained, width = width)
114    invisible(x)
115  }
116
117#' @rdname tidy.recipe
118#' @export
119tidy.step_indicate_na <- function(x, ...) {
120  if (is_trained(x)) {
121    res <- tibble::tibble(terms = x$columns)
122  } else {
123    res <- tibble::tibble(terms = sel2char(x$terms))
124  }
125  res$id <- x$id
126  res
127}
128
129