1#' Create a recipe for preprocessing data
2#'
3#' A recipe is a description of the steps to be applied to a data set in
4#'   order to prepare it for data analysis.
5#'
6#' @aliases recipe recipe.default recipe.formula
7#' @export
8recipe <- function(x, ...)
9  UseMethod("recipe")
10
11#' @rdname recipe
12#' @export
13recipe.default <- function(x, ...)
14  rlang::abort("`x` should be a data frame, matrix, or tibble")
15
16#' @rdname recipe
17#' @param vars A character string of column names corresponding to variables
18#'   that will be used in any context (see below)
19#' @param roles A character string (the same length of `vars`) that
20#'   describes a single role that the variable will take. This value could be
21#'   anything but common roles are `"outcome"`, `"predictor"`,
22#'   `"case_weight"`, or `"ID"`
23#' @param ... Further arguments passed to or from other methods (not currently
24#'   used).
25#' @param formula A model formula. No in-line functions should be used here
26#'  (e.g. `log(x)`, `x:y`, etc.) and minus signs are not allowed. These types of
27#'  transformations should be enacted using `step` functions in this package.
28#'  Dots are allowed as are simple multivariate outcome terms (i.e. no need for
29#'  `cbind`; see Examples). A model formula may not be the best choice for
30#'  high-dimensional data with many columns, because of problems with memory.
31#' @param x,data A data frame or tibble of the *template* data set
32#'   (see below).
33#' @return An object of class `recipe` with sub-objects:
34#'   \item{var_info}{A tibble containing information about the original data
35#'   set columns}
36#'   \item{term_info}{A tibble that contains the current set of terms in the
37#'   data set. This initially defaults to the same data contained in
38#'   `var_info`.}
39#'   \item{steps}{A list of `step`  or `check` objects that define the sequence of
40#'   preprocessing operations that will be applied to data. The default value is
41#'   `NULL`}
42#'   \item{template}{A tibble of the data. This is initialized to be the same
43#'   as the data given in the `data` argument but can be different after
44#'   the recipe is trained.}
45#'
46#' @includeRmd man/rmd/recipes.Rmd details
47#'
48#' @export
49#' @examples
50#'
51#' # formula example with single outcome:
52#' library(modeldata)
53#' data(biomass)
54#'
55#' # split data
56#' biomass_tr <- biomass[biomass$dataset == "Training",]
57#' biomass_te <- biomass[biomass$dataset == "Testing",]
58#'
59#' # With only predictors and outcomes, use a formula
60#' rec <- recipe(HHV ~ carbon + hydrogen + oxygen + nitrogen + sulfur,
61#'               data = biomass_tr)
62#'
63#' # Now add preprocessing steps to the recipe
64#' sp_signed <- rec %>%
65#'   step_normalize(all_numeric_predictors()) %>%
66#'   step_spatialsign(all_numeric_predictors())
67#' sp_signed
68#'
69#' # ---------------------------------------------------------------------------
70#' # formula multivariate example:
71#' # no need for `cbind(carbon, hydrogen)` for left-hand side
72#'
73#' multi_y <- recipe(carbon + hydrogen ~ oxygen + nitrogen + sulfur,
74#'                   data = biomass_tr)
75#' multi_y <- multi_y %>%
76#'   step_center(all_numeric_predictors()) %>%
77#'   step_scale(all_numeric_predictors())
78#'
79#' # ---------------------------------------------------------------------------
80#' # example using `update_role` instead of formula:
81#' # best choice for high-dimensional data
82#'
83#' rec <- recipe(biomass_tr) %>%
84#'   update_role(carbon, hydrogen, oxygen, nitrogen, sulfur,
85#'            new_role = "predictor") %>%
86#'   update_role(HHV, new_role = "outcome") %>%
87#'   update_role(sample, new_role = "id variable") %>%
88#'   update_role(dataset, new_role = "splitting indicator")
89#' rec
90recipe.data.frame <-
91  function(x,
92           formula = NULL,
93           ...,
94           vars = NULL,
95           roles = NULL) {
96
97    if (!is.null(formula)) {
98      if (!is.null(vars))
99        rlang::abort(
100          paste0("This `vars` specification will be ignored ",
101             "when a formula is used"
102             )
103          )
104      if (!is.null(roles))
105        rlang::abort(
106          paste0("This `roles` specification will be ignored ",
107             "when a formula is used"
108             )
109          )
110
111      obj <- recipe.formula(formula, x, ...)
112      return(obj)
113    }
114
115    if (is.null(vars))
116      vars <- colnames(x)
117
118    if (!is_tibble(x))
119      x <- as_tibble(x)
120
121    if (any(table(vars) > 1))
122      rlang::abort("`vars` should have unique members")
123    if (any(!(vars %in% colnames(x))))
124      rlang::abort("1+ elements of `vars` are not in `x`")
125
126    x <- x[, vars]
127
128    var_info <- tibble(variable = vars)
129
130    ## Check and add roles when available
131    if (!is.null(roles)) {
132      if (length(roles) != length(vars))
133        rlang::abort(
134          paste0("The number of roles should be the same as the number of ",
135             "variables")
136        )
137      var_info$role <- roles
138    } else
139      var_info$role <- NA
140
141    ## Add types
142    var_info <- full_join(get_types(x), var_info, by = "variable")
143    var_info$source <- "original"
144
145    ## Return final object of class `recipe`
146    out <- list(
147      var_info = var_info,
148      term_info = var_info,
149      steps = NULL,
150      template = x,
151      levels = NULL,
152      retained = NA
153    )
154    class(out) <- "recipe"
155    out
156  }
157
158#' @rdname recipe
159#' @export
160recipe.formula <- function(formula, data, ...) {
161  # check for minus:
162  f_funcs <- fun_calls(formula)
163  if (any(f_funcs == "-")) {
164    rlang::abort("`-` is not allowed in a recipe formula. Use `step_rm()` instead.")
165  }
166
167  # Check for other in-line functions
168  args <- form2args(formula, data, ...)
169  obj <- recipe.data.frame(
170    x = args$x,
171    formula = NULL,
172    ...,
173    vars = args$vars,
174    roles = args$roles
175  )
176  obj
177}
178
179#' @rdname recipe
180#' @export
181recipe.matrix <- function(x, ...) {
182  x <- as.data.frame(x)
183  recipe.data.frame(x, ...)
184}
185
186form2args <- function(formula, data, ...) {
187  if (!is_formula(formula))
188    formula <- as.formula(formula)
189
190  ## check for in-line formulas
191  inline_check(formula)
192
193  if (!is_tibble(data))
194    data <- as_tibble(data)
195
196  ## use rlang to get both sides of the formula
197  outcomes <- get_lhs_vars(formula, data)
198  predictors <- get_rhs_vars(formula, data, no_lhs = TRUE)
199
200  ## if . was used on the rhs, subtract out the outcomes
201  predictors <- predictors[!(predictors %in% outcomes)]
202
203  ## get `vars` from lhs and rhs of formula
204  vars <- c(predictors, outcomes)
205
206  ## subset data columns
207  data <- data[, vars]
208
209  ## derive roles
210  roles <- rep("predictor", length(predictors))
211  if (length(outcomes) > 0)
212    roles <- c(roles, rep("outcome", length(outcomes)))
213
214  ## pass to recipe.default with vars and roles
215
216  list(x = data, vars = vars, roles = roles)
217}
218
219inline_check <- function(x) {
220  funs <- fun_calls(x)
221  funs <- funs[!(funs %in% c("~", "+", "-"))]
222
223  if (length(funs) > 0) {
224    rlang::abort(paste0(
225      "No in-line functions should be used here; ",
226      "use steps to define baking actions."
227    ))
228  }
229
230  invisible(x)
231}
232
233
234#' @aliases prep prep.recipe
235#' @param x an object
236#' @param ... further arguments passed to or from other methods (not currently
237#'   used).
238#' @export
239prep <- function(x, ...)
240  UseMethod("prep")
241
242#' Estimate a preprocessing recipe
243#'
244#' For a recipe with at least one preprocessing operation, estimate the required
245#'   parameters from a training set that can be later applied to other data
246#'   sets.
247#' @param training A data frame or tibble that will be used to estimate
248#'   parameters for preprocessing.
249#' @param fresh A logical indicating whether already trained operation should be
250#'   re-trained. If `TRUE`, you should pass in a data set to the argument
251#'   `training`.
252#' @param verbose A logical that controls whether progress is reported as operations
253#'   are executed.
254#' @param log_changes A logical for printing a summary for each step regarding
255#'  which (if any) columns were added or removed during training.
256#' @param retain A logical: should the *preprocessed* training set be saved
257#'   into the `template` slot of the recipe after training? This is a good
258#'     idea if you want to add more steps later but want to avoid re-training
259#'     the existing steps. Also, it is advisable to use `retain = TRUE`
260#'     if any steps use the option `skip = FALSE`. **Note** that this can make
261#'     the final recipe size large. When `verbose = TRUE`, a message is written
262#'     with the approximate object size in memory but may be an underestimate
263#'     since it does not take environments into account.
264#' @param strings_as_factors A logical: should character columns be converted to
265#'   factors? This affects the preprocessed training set (when
266#'   `retain = TRUE`) as well as the results of `bake.recipe`.
267#' @return A recipe whose step objects have been updated with the required
268#'   quantities (e.g. parameter estimates, model objects, etc). Also, the
269#'   `term_info` object is likely to be modified as the operations are
270#'   executed.
271#' @details
272#'
273#' Given a data set, this function estimates the required quantities and
274#' statistics needed by any operations. [prep()] returns an updated recipe
275#' with the estimates. If you are using a recipe as a preprocessor for modeling,
276#' we **highly recommend** that you use a `workflow()` instead of manually
277#' estimating a recipe (see the example in [recipe()]).
278#'
279#' Note that missing data is handled in the steps; there is no global
280#'   `na.rm` option at the recipe level or in [prep()].
281#'
282#' Also, if a recipe has been trained using [prep()] and then steps
283#'   are added, [prep()] will only update the new operations. If
284#'   `fresh = TRUE`, all of the operations will be (re)estimated.
285#'
286#' As the steps are executed, the `training` set is updated. For example,
287#'   if the first step is to center the data and the second is to scale the
288#'   data, the step for scaling is given the centered data.
289#'
290#'
291#' @examples
292#' data(ames, package = "modeldata")
293#'
294#' library(dplyr)
295#'
296#' ames <- mutate(ames, Sale_Price = log10(Sale_Price))
297#'
298#' ames_rec <-
299#'   recipe(
300#'     Sale_Price ~ Longitude + Latitude + Neighborhood + Year_Built + Central_Air,
301#'     data = ames
302#'   ) %>%
303#'   step_other(Neighborhood, threshold = 0.05) %>%
304#'   step_dummy(all_nominal()) %>%
305#'   step_interact(~ starts_with("Central_Air"):Year_Built) %>%
306#'   step_ns(Longitude, Latitude, deg_free = 5)
307#'
308#' prep(ames_rec, verbose = TRUE)
309#'
310#' prep(ames_rec, log_changes = TRUE)
311#' @rdname prep
312#' @export
313prep.recipe <-
314  function(x,
315           training = NULL,
316           fresh = FALSE,
317           verbose = FALSE,
318           retain = TRUE,
319           log_changes = FALSE,
320           strings_as_factors = TRUE,
321           ...) {
322
323    training <- check_training_set(training, x, fresh)
324
325    tr_data <- train_info(training)
326
327    # Record the original levels for later checking
328    orig_lvls <- lapply(training, get_levels)
329
330    if (strings_as_factors) {
331      lvls <- lapply(training, get_levels)
332      training <- strings2factors(training, lvls)
333    } else {
334      lvls <- NULL
335    }
336
337    # The only way to get the results for skipped steps is to
338    # use `retain = TRUE` so issue a warning if this is not the case
339    skippers <- map_lgl(x$steps, is_skipable)
340    if (any(skippers) & !retain)
341      rlang::warn(
342        paste0(
343          "Since some operations have `skip = TRUE`, using ",
344          "`retain = TRUE` will allow those steps results to ",
345          "be accessible."
346        )
347      )
348
349
350    running_info <- x$term_info %>% mutate(number = 0, skip = FALSE)
351    for (i in seq(along.with = x$steps)) {
352      needs_tuning <- map_lgl(x$steps[[i]], is_tune)
353      if (any(needs_tuning)) {
354        arg <- names(needs_tuning)[needs_tuning]
355        arg <- paste0("'", arg, "'", collapse = ", ")
356        msg <-
357          paste0(
358            "You cannot `prep()` a tuneable recipe. Argument(s) with `tune()`: ",
359            arg,
360            ". Do you want to use a tuning function such as `tune_grid()`?"
361          )
362        rlang::abort(msg)
363      }
364      note <- paste("oper",  i, gsub("_", " ", class(x$steps[[i]])[1]))
365      if (!x$steps[[i]]$trained | fresh) {
366
367        if (verbose) {
368          cat(note, "[training]", "\n")
369        }
370
371        before_nms <- names(training)
372
373        # Compute anything needed for the preprocessing steps
374        # then apply it to the current training set
375        x$steps[[i]] <-
376          prep(x$steps[[i]],
377               training = training,
378               info = x$term_info)
379        training <- bake(x$steps[[i]], new_data = training)
380        x$term_info <-
381          merge_term_info(get_types(training), x$term_info)
382
383        # Update the roles and the term source
384        if (!is.na(x$steps[[i]]$role)) {
385          new_vars <- setdiff(x$term_info$variable, running_info$variable)
386          pos_new_var <- x$term_info$variable %in% new_vars
387          pos_new_and_na_role <- pos_new_var & is.na(x$term_info$role)
388          pos_new_and_na_source <- pos_new_var  & is.na(x$term_info$source)
389
390          x$term_info$role[pos_new_and_na_role] <- x$steps[[i]]$role
391          x$term_info$source[pos_new_and_na_source] <- "derived"
392
393        }
394
395        changelog(log_changes, before_nms, names(training), x$steps[[i]])
396
397        running_info <- rbind(
398          running_info,
399          mutate(x$term_info, number = i, skip = x$steps[[i]]$skip)
400        )
401
402      }
403      else {
404        if (verbose) cat(note, "[pre-trained]\n")
405      }
406    }
407
408    ## The steps may have changed the data so reassess the levels
409    if (strings_as_factors) {
410      lvls <- lapply(training, get_levels)
411      check_lvls <- has_lvls(lvls)
412      if (!any(check_lvls)) lvls <- NULL
413    } else lvls <- NULL
414
415    if (retain) {
416      if (verbose)
417        cat("The retained training set is ~",
418            format(object.size(training), units = "Mb", digits = 2),
419            " in memory.\n\n")
420
421      x$template <- training
422    } else {
423      x$template <- training[0,]
424    }
425
426    x$tr_info <- tr_data
427    x$levels <- lvls
428    x$orig_lvls <- orig_lvls
429    x$retained <- retain
430    # In case a variable was removed, and that removal step used
431    # `skip = TRUE`, we need to retain its record so that
432    # selectors can be properly used with `bake`. This tibble
433    # captures every variable originally in the data or that was
434    # created along the way. `number` will be the last step where
435    # that variable was available.
436    x$last_term_info <-
437      running_info %>%
438      group_by(variable) %>%
439      arrange(desc(number)) %>%
440      summarise(
441        type = dplyr::first(type),
442        role = as.list(unique(unlist(role))),
443        source = dplyr::first(source),
444        number = dplyr::first(number),
445        skip = dplyr::first(skip),
446        .groups = "keep"
447      )
448    x
449  }
450
451#' @rdname bake
452#' @aliases bake bake.recipe
453#' @export
454bake <- function(object, ...)
455  UseMethod("bake")
456
457#' Apply a trained preprocessing recipe
458#'
459#' For a recipe with at least one preprocessing operation that has been trained by
460#'   [prep.recipe()], apply the computations to new data.
461#' @param object A trained object such as a [recipe()] with at least
462#'   one preprocessing operation.
463#' @param new_data A data frame or tibble for whom the preprocessing will be
464#'   applied. If `NULL` is given to `new_data`, the pre-processed _training
465#'   data_ will be returned (assuming that `prep(retain = TRUE)` was used).
466#' @param ... One or more selector functions to choose which variables will be
467#'   returned by the function. See [selections()] for more details.
468#'   If no selectors are given, the default is to use
469#'   [everything()].
470#' @param composition Either "tibble", "matrix", "data.frame", or
471#'  "dgCMatrix" for the format of the processed data set. Note that
472#'  all computations during the baking process are done in a
473#'  non-sparse format. Also, note that this argument should be
474#'  called **after** any selectors and the selectors should only
475#'  resolve to numeric columns (otherwise an error is thrown).
476#' @return A tibble, matrix, or sparse matrix that may have different
477#'  columns than the original columns in `new_data`.
478#' @details [bake()] takes a trained recipe and applies its operations to a
479#'  data set to create a design matrix. If you are using a recipe as a
480#'  preprocessor for modeling, we **highly recommend** that you use a `workflow()`
481#'  instead of manually applying a recipe (see the example in [recipe()]).
482#'
483#' If the data set is not too large, time can be saved by using the
484#'  `retain = TRUE` option of [prep()]. This stores the processed version of the
485#'  training set. With this option set, `bake(object, new_data = NULL)`
486#'  will return it for free.
487#'
488#' Also, any steps with `skip = TRUE` will not be applied to the
489#'   data when `bake()` is invoked with a data set in `new_data`.
490#'   `bake(object, new_data = NULL)` will always have all of the steps applied.
491#' @seealso [recipe()], [prep()]
492#' @rdname bake
493#' @examples
494#' data(ames, package = "modeldata")
495#'
496#' ames <- mutate(ames, Sale_Price = log10(Sale_Price))
497#'
498#' ames_rec <-
499#'   recipe(Sale_Price ~ ., data = ames[-(1:6), ]) %>%
500#'   step_other(Neighborhood, threshold = 0.05) %>%
501#'   step_dummy(all_nominal()) %>%
502#'   step_interact(~ starts_with("Central_Air"):Year_Built) %>%
503#'   step_ns(Longitude, Latitude, deg_free = 2) %>%
504#'   step_zv(all_predictors()) %>%
505#'   prep()
506#'
507#' # return the training set (already embedded in ames_rec)
508#' bake(ames_rec, new_data = NULL)
509#'
510#' # apply processing to other data:
511#' bake(ames_rec, new_data = head(ames))
512#'
513#' # only return selected variables:
514#' bake(ames_rec, new_data = head(ames), all_numeric_predictors())
515#' bake(ames_rec, new_data = head(ames), starts_with(c("Longitude", "Latitude")))
516#' @export
517bake.recipe <- function(object, new_data, ..., composition = "tibble") {
518  if (rlang::is_missing(new_data)) {
519    rlang::abort("'new_data' must be either a data frame or NULL. No value is not allowed.")
520  }
521  if (is.null(new_data)) {
522    return(juice(object, ..., composition = composition))
523  }
524
525  if (!fully_trained(object)) {
526    rlang::abort("At least one step has not been trained. Please run `prep`.")
527  }
528
529  if (!any(composition == formats)) {
530    rlang::abort(
531      paste0(
532      "`composition` should be one of: ",
533      paste0("'", formats, "'", collapse = ",")
534      )
535    )
536  }
537
538  terms <- quos(...)
539  if (is_empty(terms)) {
540    terms <- quos(everything())
541  }
542
543  # In case someone used the deprecated `newdata`:
544  if (is.null(new_data) || is.null(ncol(new_data))) {
545    if (any(names(terms) == "newdata")) {
546      rlang::abort("Please use `new_data` instead of `newdata` with `bake`.")
547    } else {
548      rlang::abort("Please pass a data set to `new_data`.")
549    }
550  }
551
552  if (!is_tibble(new_data)) {
553    new_data <- as_tibble(new_data)
554  }
555
556  check_nominal_type(new_data, object$orig_lvls)
557
558  # Drop completely new columns from `new_data` and reorder columns that do
559  # still exist to match the ordering used when training
560  original_names <- names(new_data)
561  original_training_names <- unique(object$var_info$variable)
562  bakeable_names <- intersect(original_training_names, original_names)
563  new_data <- new_data[, bakeable_names]
564
565  n_steps <- length(object$steps)
566
567  for (i in seq_len(n_steps)) {
568    step <- object$steps[[i]]
569
570    if (is_skipable(step)) {
571      next
572    }
573
574    new_data <- bake(step, new_data = new_data)
575
576    if (!is_tibble(new_data)) {
577      new_data <- as_tibble(new_data)
578    }
579  }
580
581  # Use `last_term_info`, which maintains info on all columns that got added
582  # and removed from the training data. This is important for skipped steps
583  # which might have resulted in columns not being added/removed in the test
584  # set.
585  info <- object$last_term_info
586
587  # Now reduce to only user selected columns
588  out_names <- recipes_eval_select(terms, new_data, info)
589  new_data <- new_data[, out_names]
590
591  ## The levels are not null when no nominal data are present or
592  ## if strings_as_factors = FALSE in `prep`
593  if (!is.null(object$levels)) {
594    var_levels <- object$levels
595    var_levels <- var_levels[out_names]
596    check_values <-
597      vapply(var_levels, function(x)
598        (!all(is.na(x))), c(all = TRUE))
599    var_levels <- var_levels[check_values]
600    if (length(var_levels) > 0)
601      new_data <- strings2factors(new_data, var_levels)
602  }
603
604  if (composition == "dgCMatrix") {
605    new_data <- convert_matrix(new_data, sparse = TRUE)
606  } else if (composition == "matrix") {
607    new_data <- convert_matrix(new_data, sparse = FALSE)
608  } else if (composition == "data.frame") {
609    new_data <- base::as.data.frame(new_data)
610  }
611
612  new_data
613}
614
615#' Print a Recipe
616#'
617#' @aliases print.recipe
618#' @param x A `recipe` object
619#' @param form_width The number of characters used to print the variables or
620#'   terms in a formula
621#' @param ... further arguments passed to or from other methods (not currently
622#'   used).
623#' @return The original object (invisibly)
624#'
625#' @export
626print.recipe <- function(x, form_width = 30, ...) {
627  cat("Recipe\n\n")
628  cat("Inputs:\n\n")
629  no_role <- is.na(x$var_info$role)
630  if (any(!no_role)) {
631    tab <- as.data.frame(table(x$var_info$role))
632    colnames(tab) <- c("role", "#variables")
633    print(tab, row.names = FALSE)
634    if (any(no_role)) {
635      cat("\n ", sum(no_role), "variables with undeclared roles\n")
636    }
637  } else {
638    cat(" ", nrow(x$var_info), "variables (no declared roles)\n")
639  }
640  if ("tr_info" %in% names(x)) {
641    nmiss <- x$tr_info$nrows - x$tr_info$ncomplete
642    cat("\nTraining data contained ",
643        x$tr_info$nrows,
644        " data points and ",
645        sep = "")
646    if (x$tr_info$nrows == x$tr_info$ncomplete)
647      cat("no missing data.\n")
648    else
649      cat(nmiss,
650          "incomplete",
651          ifelse(nmiss > 1, "rows.", "row."),
652          "\n")
653  }
654  if (!is.null(x$steps)) {
655    cat("\nOperations:\n\n")
656    for (i in seq_along(x$steps))
657      print(x$steps[[i]], form_width = form_width)
658  }
659  invisible(x)
660}
661
662#' Summarize a recipe
663#'
664#' This function prints the current set of variables/features and some of their
665#' characteristics.
666#' @aliases summary.recipe
667#' @param object A `recipe` object
668#' @param original A logical: show the current set of variables or the original
669#'   set when the recipe was defined.
670#' @param ... further arguments passed to or from other methods (not currently
671#'   used).
672#' @return A tibble with columns `variable`, `type`, `role`,
673#'   and `source`.
674#' @details
675#' Note that, until the recipe has been trained,
676#' the current and original variables are the same.
677#'
678#' It is possible for variables to have multiple roles by adding them with
679#' [add_role()]. If a variable has multiple roles, it will have more than one
680#' row in the summary tibble.
681#'
682#' @examples
683#' rec <- recipe( ~ ., data = USArrests)
684#' summary(rec)
685#' rec <- step_pca(rec, all_numeric(), num_comp = 3)
686#' summary(rec) # still the same since not yet trained
687#' rec <- prep(rec, training = USArrests)
688#' summary(rec)
689#' @export
690#' @seealso [recipe()] [prep.recipe()]
691summary.recipe <- function(object, original = FALSE, ...) {
692  if (original)
693    object$var_info
694  else
695    object$term_info
696}
697
698
699#' Extract transformed training set
700#'
701#' As of `recipes` version 0.1.14, **`juice()` is superseded** in favor of
702#' `bake(object, new_data = NULL)`.
703#'
704#' As steps are estimated by `prep`, these operations are
705#'  applied to the training set. Rather than running `bake()`
706#'  to duplicate this processing, this function will return
707#'  variables from the processed training set.
708#' @inheritParams bake.recipe
709#' @param object A `recipe` object that has been prepared
710#'   with the option `retain = TRUE`.
711#' @details When preparing a recipe, if the training data set is
712#'  retained using `retain = TRUE`, there is no need to `bake()` the
713#'  recipe to get the preprocessed training set.
714#'
715#'  `juice()` will return the results of a recipe where _all steps_
716#'  have been applied to the data, irrespective of the value of
717#'  the step's `skip` argument.
718#' @export
719#' @seealso [recipe()] [prep.recipe()] [bake.recipe()]
720juice <- function(object, ..., composition = "tibble") {
721  if (!fully_trained(object)) {
722    rlang::abort("At least one step has not been trained. Please run `prep()`.")
723  }
724
725  if (!isTRUE(object$retained)) {
726    rlang::abort(paste0(
727      "Use `retain = TRUE` in `prep()` to be able ",
728      "to extract the training set"
729    ))
730  }
731
732  if (!any(composition == formats)) {
733    rlang::abort(paste0(
734      "`composition` should be one of: ",
735      paste0("'", formats, "'", collapse = ",")
736    ))
737  }
738
739  terms <- quos(...)
740  if (is_empty(terms)) {
741    terms <- quos(everything())
742  }
743
744  # Get user requested columns
745  new_data <- object$template
746  out_names <- recipes_eval_select(terms, new_data, object$term_info)
747  new_data <- new_data[, out_names]
748
749  ## Since most models require factors, do the conversion from character
750  if (!is.null(object$levels)) {
751    var_levels <- object$levels
752    var_levels <- var_levels[out_names]
753    check_values <-
754      vapply(var_levels, function(x)
755        (!all(is.na(x))), c(all = TRUE))
756    var_levels <- var_levels[check_values]
757    if (length(var_levels) > 0)
758      new_data <- strings2factors(new_data, var_levels)
759  }
760
761  if (composition == "dgCMatrix") {
762    new_data <- convert_matrix(new_data, sparse = TRUE)
763  } else if (composition == "matrix") {
764    new_data <- convert_matrix(new_data, sparse = FALSE)
765  } else if (composition == "data.frame") {
766    new_data <- base::as.data.frame(new_data)
767  }
768
769  new_data
770}
771
772formats <- c("tibble", "dgCMatrix", "matrix", "data.frame")
773
774utils::globalVariables(c("number"))
775
776# ------------------------------------------------------------------------------
777
778#' S3 methods for tracking which additional packages are needed for steps.
779#'
780#' @param x A recipe or recipe step
781#' @param infra Should recipes itself be included in the result?
782#' @return A character vector
783#' @name required_pkgs.recipe
784#' @keywords internal
785#' @export
786required_pkgs.recipe <- function(x, infra = TRUE, ...) {
787  res <- purrr::map(x$steps, required_pkgs)
788  res <- unique(unlist(res))
789  if (infra) {
790    res <- c("recipes", res)
791  }
792  res <- unique(res)
793  res <- res[length(res) != 0]
794  res
795}
796
797#' @rdname required_pkgs.recipe
798#' @export
799required_pkgs.step <- function(x, ...) {
800  character(0)
801}
802
803#' @rdname required_pkgs.recipe
804#' @export
805required_pkgs.check <- function(x, ...) {
806  character(0)
807}
808