1#' Create a recipe for preprocessing data 2#' 3#' A recipe is a description of the steps to be applied to a data set in 4#' order to prepare it for data analysis. 5#' 6#' @aliases recipe recipe.default recipe.formula 7#' @export 8recipe <- function(x, ...) 9 UseMethod("recipe") 10 11#' @rdname recipe 12#' @export 13recipe.default <- function(x, ...) 14 rlang::abort("`x` should be a data frame, matrix, or tibble") 15 16#' @rdname recipe 17#' @param vars A character string of column names corresponding to variables 18#' that will be used in any context (see below) 19#' @param roles A character string (the same length of `vars`) that 20#' describes a single role that the variable will take. This value could be 21#' anything but common roles are `"outcome"`, `"predictor"`, 22#' `"case_weight"`, or `"ID"` 23#' @param ... Further arguments passed to or from other methods (not currently 24#' used). 25#' @param formula A model formula. No in-line functions should be used here 26#' (e.g. `log(x)`, `x:y`, etc.) and minus signs are not allowed. These types of 27#' transformations should be enacted using `step` functions in this package. 28#' Dots are allowed as are simple multivariate outcome terms (i.e. no need for 29#' `cbind`; see Examples). A model formula may not be the best choice for 30#' high-dimensional data with many columns, because of problems with memory. 31#' @param x,data A data frame or tibble of the *template* data set 32#' (see below). 33#' @return An object of class `recipe` with sub-objects: 34#' \item{var_info}{A tibble containing information about the original data 35#' set columns} 36#' \item{term_info}{A tibble that contains the current set of terms in the 37#' data set. This initially defaults to the same data contained in 38#' `var_info`.} 39#' \item{steps}{A list of `step` or `check` objects that define the sequence of 40#' preprocessing operations that will be applied to data. The default value is 41#' `NULL`} 42#' \item{template}{A tibble of the data. This is initialized to be the same 43#' as the data given in the `data` argument but can be different after 44#' the recipe is trained.} 45#' 46#' @includeRmd man/rmd/recipes.Rmd details 47#' 48#' @export 49#' @examples 50#' 51#' # formula example with single outcome: 52#' library(modeldata) 53#' data(biomass) 54#' 55#' # split data 56#' biomass_tr <- biomass[biomass$dataset == "Training",] 57#' biomass_te <- biomass[biomass$dataset == "Testing",] 58#' 59#' # With only predictors and outcomes, use a formula 60#' rec <- recipe(HHV ~ carbon + hydrogen + oxygen + nitrogen + sulfur, 61#' data = biomass_tr) 62#' 63#' # Now add preprocessing steps to the recipe 64#' sp_signed <- rec %>% 65#' step_normalize(all_numeric_predictors()) %>% 66#' step_spatialsign(all_numeric_predictors()) 67#' sp_signed 68#' 69#' # --------------------------------------------------------------------------- 70#' # formula multivariate example: 71#' # no need for `cbind(carbon, hydrogen)` for left-hand side 72#' 73#' multi_y <- recipe(carbon + hydrogen ~ oxygen + nitrogen + sulfur, 74#' data = biomass_tr) 75#' multi_y <- multi_y %>% 76#' step_center(all_numeric_predictors()) %>% 77#' step_scale(all_numeric_predictors()) 78#' 79#' # --------------------------------------------------------------------------- 80#' # example using `update_role` instead of formula: 81#' # best choice for high-dimensional data 82#' 83#' rec <- recipe(biomass_tr) %>% 84#' update_role(carbon, hydrogen, oxygen, nitrogen, sulfur, 85#' new_role = "predictor") %>% 86#' update_role(HHV, new_role = "outcome") %>% 87#' update_role(sample, new_role = "id variable") %>% 88#' update_role(dataset, new_role = "splitting indicator") 89#' rec 90recipe.data.frame <- 91 function(x, 92 formula = NULL, 93 ..., 94 vars = NULL, 95 roles = NULL) { 96 97 if (!is.null(formula)) { 98 if (!is.null(vars)) 99 rlang::abort( 100 paste0("This `vars` specification will be ignored ", 101 "when a formula is used" 102 ) 103 ) 104 if (!is.null(roles)) 105 rlang::abort( 106 paste0("This `roles` specification will be ignored ", 107 "when a formula is used" 108 ) 109 ) 110 111 obj <- recipe.formula(formula, x, ...) 112 return(obj) 113 } 114 115 if (is.null(vars)) 116 vars <- colnames(x) 117 118 if (!is_tibble(x)) 119 x <- as_tibble(x) 120 121 if (any(table(vars) > 1)) 122 rlang::abort("`vars` should have unique members") 123 if (any(!(vars %in% colnames(x)))) 124 rlang::abort("1+ elements of `vars` are not in `x`") 125 126 x <- x[, vars] 127 128 var_info <- tibble(variable = vars) 129 130 ## Check and add roles when available 131 if (!is.null(roles)) { 132 if (length(roles) != length(vars)) 133 rlang::abort( 134 paste0("The number of roles should be the same as the number of ", 135 "variables") 136 ) 137 var_info$role <- roles 138 } else 139 var_info$role <- NA 140 141 ## Add types 142 var_info <- full_join(get_types(x), var_info, by = "variable") 143 var_info$source <- "original" 144 145 ## Return final object of class `recipe` 146 out <- list( 147 var_info = var_info, 148 term_info = var_info, 149 steps = NULL, 150 template = x, 151 levels = NULL, 152 retained = NA 153 ) 154 class(out) <- "recipe" 155 out 156 } 157 158#' @rdname recipe 159#' @export 160recipe.formula <- function(formula, data, ...) { 161 # check for minus: 162 f_funcs <- fun_calls(formula) 163 if (any(f_funcs == "-")) { 164 rlang::abort("`-` is not allowed in a recipe formula. Use `step_rm()` instead.") 165 } 166 167 # Check for other in-line functions 168 args <- form2args(formula, data, ...) 169 obj <- recipe.data.frame( 170 x = args$x, 171 formula = NULL, 172 ..., 173 vars = args$vars, 174 roles = args$roles 175 ) 176 obj 177} 178 179#' @rdname recipe 180#' @export 181recipe.matrix <- function(x, ...) { 182 x <- as.data.frame(x) 183 recipe.data.frame(x, ...) 184} 185 186form2args <- function(formula, data, ...) { 187 if (!is_formula(formula)) 188 formula <- as.formula(formula) 189 190 ## check for in-line formulas 191 inline_check(formula) 192 193 if (!is_tibble(data)) 194 data <- as_tibble(data) 195 196 ## use rlang to get both sides of the formula 197 outcomes <- get_lhs_vars(formula, data) 198 predictors <- get_rhs_vars(formula, data, no_lhs = TRUE) 199 200 ## if . was used on the rhs, subtract out the outcomes 201 predictors <- predictors[!(predictors %in% outcomes)] 202 203 ## get `vars` from lhs and rhs of formula 204 vars <- c(predictors, outcomes) 205 206 ## subset data columns 207 data <- data[, vars] 208 209 ## derive roles 210 roles <- rep("predictor", length(predictors)) 211 if (length(outcomes) > 0) 212 roles <- c(roles, rep("outcome", length(outcomes))) 213 214 ## pass to recipe.default with vars and roles 215 216 list(x = data, vars = vars, roles = roles) 217} 218 219inline_check <- function(x) { 220 funs <- fun_calls(x) 221 funs <- funs[!(funs %in% c("~", "+", "-"))] 222 223 if (length(funs) > 0) { 224 rlang::abort(paste0( 225 "No in-line functions should be used here; ", 226 "use steps to define baking actions." 227 )) 228 } 229 230 invisible(x) 231} 232 233 234#' @aliases prep prep.recipe 235#' @param x an object 236#' @param ... further arguments passed to or from other methods (not currently 237#' used). 238#' @export 239prep <- function(x, ...) 240 UseMethod("prep") 241 242#' Estimate a preprocessing recipe 243#' 244#' For a recipe with at least one preprocessing operation, estimate the required 245#' parameters from a training set that can be later applied to other data 246#' sets. 247#' @param training A data frame or tibble that will be used to estimate 248#' parameters for preprocessing. 249#' @param fresh A logical indicating whether already trained operation should be 250#' re-trained. If `TRUE`, you should pass in a data set to the argument 251#' `training`. 252#' @param verbose A logical that controls whether progress is reported as operations 253#' are executed. 254#' @param log_changes A logical for printing a summary for each step regarding 255#' which (if any) columns were added or removed during training. 256#' @param retain A logical: should the *preprocessed* training set be saved 257#' into the `template` slot of the recipe after training? This is a good 258#' idea if you want to add more steps later but want to avoid re-training 259#' the existing steps. Also, it is advisable to use `retain = TRUE` 260#' if any steps use the option `skip = FALSE`. **Note** that this can make 261#' the final recipe size large. When `verbose = TRUE`, a message is written 262#' with the approximate object size in memory but may be an underestimate 263#' since it does not take environments into account. 264#' @param strings_as_factors A logical: should character columns be converted to 265#' factors? This affects the preprocessed training set (when 266#' `retain = TRUE`) as well as the results of `bake.recipe`. 267#' @return A recipe whose step objects have been updated with the required 268#' quantities (e.g. parameter estimates, model objects, etc). Also, the 269#' `term_info` object is likely to be modified as the operations are 270#' executed. 271#' @details 272#' 273#' Given a data set, this function estimates the required quantities and 274#' statistics needed by any operations. [prep()] returns an updated recipe 275#' with the estimates. If you are using a recipe as a preprocessor for modeling, 276#' we **highly recommend** that you use a `workflow()` instead of manually 277#' estimating a recipe (see the example in [recipe()]). 278#' 279#' Note that missing data is handled in the steps; there is no global 280#' `na.rm` option at the recipe level or in [prep()]. 281#' 282#' Also, if a recipe has been trained using [prep()] and then steps 283#' are added, [prep()] will only update the new operations. If 284#' `fresh = TRUE`, all of the operations will be (re)estimated. 285#' 286#' As the steps are executed, the `training` set is updated. For example, 287#' if the first step is to center the data and the second is to scale the 288#' data, the step for scaling is given the centered data. 289#' 290#' 291#' @examples 292#' data(ames, package = "modeldata") 293#' 294#' library(dplyr) 295#' 296#' ames <- mutate(ames, Sale_Price = log10(Sale_Price)) 297#' 298#' ames_rec <- 299#' recipe( 300#' Sale_Price ~ Longitude + Latitude + Neighborhood + Year_Built + Central_Air, 301#' data = ames 302#' ) %>% 303#' step_other(Neighborhood, threshold = 0.05) %>% 304#' step_dummy(all_nominal()) %>% 305#' step_interact(~ starts_with("Central_Air"):Year_Built) %>% 306#' step_ns(Longitude, Latitude, deg_free = 5) 307#' 308#' prep(ames_rec, verbose = TRUE) 309#' 310#' prep(ames_rec, log_changes = TRUE) 311#' @rdname prep 312#' @export 313prep.recipe <- 314 function(x, 315 training = NULL, 316 fresh = FALSE, 317 verbose = FALSE, 318 retain = TRUE, 319 log_changes = FALSE, 320 strings_as_factors = TRUE, 321 ...) { 322 323 training <- check_training_set(training, x, fresh) 324 325 tr_data <- train_info(training) 326 327 # Record the original levels for later checking 328 orig_lvls <- lapply(training, get_levels) 329 330 if (strings_as_factors) { 331 lvls <- lapply(training, get_levels) 332 training <- strings2factors(training, lvls) 333 } else { 334 lvls <- NULL 335 } 336 337 # The only way to get the results for skipped steps is to 338 # use `retain = TRUE` so issue a warning if this is not the case 339 skippers <- map_lgl(x$steps, is_skipable) 340 if (any(skippers) & !retain) 341 rlang::warn( 342 paste0( 343 "Since some operations have `skip = TRUE`, using ", 344 "`retain = TRUE` will allow those steps results to ", 345 "be accessible." 346 ) 347 ) 348 349 350 running_info <- x$term_info %>% mutate(number = 0, skip = FALSE) 351 for (i in seq(along.with = x$steps)) { 352 needs_tuning <- map_lgl(x$steps[[i]], is_tune) 353 if (any(needs_tuning)) { 354 arg <- names(needs_tuning)[needs_tuning] 355 arg <- paste0("'", arg, "'", collapse = ", ") 356 msg <- 357 paste0( 358 "You cannot `prep()` a tuneable recipe. Argument(s) with `tune()`: ", 359 arg, 360 ". Do you want to use a tuning function such as `tune_grid()`?" 361 ) 362 rlang::abort(msg) 363 } 364 note <- paste("oper", i, gsub("_", " ", class(x$steps[[i]])[1])) 365 if (!x$steps[[i]]$trained | fresh) { 366 367 if (verbose) { 368 cat(note, "[training]", "\n") 369 } 370 371 before_nms <- names(training) 372 373 # Compute anything needed for the preprocessing steps 374 # then apply it to the current training set 375 x$steps[[i]] <- 376 prep(x$steps[[i]], 377 training = training, 378 info = x$term_info) 379 training <- bake(x$steps[[i]], new_data = training) 380 x$term_info <- 381 merge_term_info(get_types(training), x$term_info) 382 383 # Update the roles and the term source 384 if (!is.na(x$steps[[i]]$role)) { 385 new_vars <- setdiff(x$term_info$variable, running_info$variable) 386 pos_new_var <- x$term_info$variable %in% new_vars 387 pos_new_and_na_role <- pos_new_var & is.na(x$term_info$role) 388 pos_new_and_na_source <- pos_new_var & is.na(x$term_info$source) 389 390 x$term_info$role[pos_new_and_na_role] <- x$steps[[i]]$role 391 x$term_info$source[pos_new_and_na_source] <- "derived" 392 393 } 394 395 changelog(log_changes, before_nms, names(training), x$steps[[i]]) 396 397 running_info <- rbind( 398 running_info, 399 mutate(x$term_info, number = i, skip = x$steps[[i]]$skip) 400 ) 401 402 } 403 else { 404 if (verbose) cat(note, "[pre-trained]\n") 405 } 406 } 407 408 ## The steps may have changed the data so reassess the levels 409 if (strings_as_factors) { 410 lvls <- lapply(training, get_levels) 411 check_lvls <- has_lvls(lvls) 412 if (!any(check_lvls)) lvls <- NULL 413 } else lvls <- NULL 414 415 if (retain) { 416 if (verbose) 417 cat("The retained training set is ~", 418 format(object.size(training), units = "Mb", digits = 2), 419 " in memory.\n\n") 420 421 x$template <- training 422 } else { 423 x$template <- training[0,] 424 } 425 426 x$tr_info <- tr_data 427 x$levels <- lvls 428 x$orig_lvls <- orig_lvls 429 x$retained <- retain 430 # In case a variable was removed, and that removal step used 431 # `skip = TRUE`, we need to retain its record so that 432 # selectors can be properly used with `bake`. This tibble 433 # captures every variable originally in the data or that was 434 # created along the way. `number` will be the last step where 435 # that variable was available. 436 x$last_term_info <- 437 running_info %>% 438 group_by(variable) %>% 439 arrange(desc(number)) %>% 440 summarise( 441 type = dplyr::first(type), 442 role = as.list(unique(unlist(role))), 443 source = dplyr::first(source), 444 number = dplyr::first(number), 445 skip = dplyr::first(skip), 446 .groups = "keep" 447 ) 448 x 449 } 450 451#' @rdname bake 452#' @aliases bake bake.recipe 453#' @export 454bake <- function(object, ...) 455 UseMethod("bake") 456 457#' Apply a trained preprocessing recipe 458#' 459#' For a recipe with at least one preprocessing operation that has been trained by 460#' [prep.recipe()], apply the computations to new data. 461#' @param object A trained object such as a [recipe()] with at least 462#' one preprocessing operation. 463#' @param new_data A data frame or tibble for whom the preprocessing will be 464#' applied. If `NULL` is given to `new_data`, the pre-processed _training 465#' data_ will be returned (assuming that `prep(retain = TRUE)` was used). 466#' @param ... One or more selector functions to choose which variables will be 467#' returned by the function. See [selections()] for more details. 468#' If no selectors are given, the default is to use 469#' [everything()]. 470#' @param composition Either "tibble", "matrix", "data.frame", or 471#' "dgCMatrix" for the format of the processed data set. Note that 472#' all computations during the baking process are done in a 473#' non-sparse format. Also, note that this argument should be 474#' called **after** any selectors and the selectors should only 475#' resolve to numeric columns (otherwise an error is thrown). 476#' @return A tibble, matrix, or sparse matrix that may have different 477#' columns than the original columns in `new_data`. 478#' @details [bake()] takes a trained recipe and applies its operations to a 479#' data set to create a design matrix. If you are using a recipe as a 480#' preprocessor for modeling, we **highly recommend** that you use a `workflow()` 481#' instead of manually applying a recipe (see the example in [recipe()]). 482#' 483#' If the data set is not too large, time can be saved by using the 484#' `retain = TRUE` option of [prep()]. This stores the processed version of the 485#' training set. With this option set, `bake(object, new_data = NULL)` 486#' will return it for free. 487#' 488#' Also, any steps with `skip = TRUE` will not be applied to the 489#' data when `bake()` is invoked with a data set in `new_data`. 490#' `bake(object, new_data = NULL)` will always have all of the steps applied. 491#' @seealso [recipe()], [prep()] 492#' @rdname bake 493#' @examples 494#' data(ames, package = "modeldata") 495#' 496#' ames <- mutate(ames, Sale_Price = log10(Sale_Price)) 497#' 498#' ames_rec <- 499#' recipe(Sale_Price ~ ., data = ames[-(1:6), ]) %>% 500#' step_other(Neighborhood, threshold = 0.05) %>% 501#' step_dummy(all_nominal()) %>% 502#' step_interact(~ starts_with("Central_Air"):Year_Built) %>% 503#' step_ns(Longitude, Latitude, deg_free = 2) %>% 504#' step_zv(all_predictors()) %>% 505#' prep() 506#' 507#' # return the training set (already embedded in ames_rec) 508#' bake(ames_rec, new_data = NULL) 509#' 510#' # apply processing to other data: 511#' bake(ames_rec, new_data = head(ames)) 512#' 513#' # only return selected variables: 514#' bake(ames_rec, new_data = head(ames), all_numeric_predictors()) 515#' bake(ames_rec, new_data = head(ames), starts_with(c("Longitude", "Latitude"))) 516#' @export 517bake.recipe <- function(object, new_data, ..., composition = "tibble") { 518 if (rlang::is_missing(new_data)) { 519 rlang::abort("'new_data' must be either a data frame or NULL. No value is not allowed.") 520 } 521 if (is.null(new_data)) { 522 return(juice(object, ..., composition = composition)) 523 } 524 525 if (!fully_trained(object)) { 526 rlang::abort("At least one step has not been trained. Please run `prep`.") 527 } 528 529 if (!any(composition == formats)) { 530 rlang::abort( 531 paste0( 532 "`composition` should be one of: ", 533 paste0("'", formats, "'", collapse = ",") 534 ) 535 ) 536 } 537 538 terms <- quos(...) 539 if (is_empty(terms)) { 540 terms <- quos(everything()) 541 } 542 543 # In case someone used the deprecated `newdata`: 544 if (is.null(new_data) || is.null(ncol(new_data))) { 545 if (any(names(terms) == "newdata")) { 546 rlang::abort("Please use `new_data` instead of `newdata` with `bake`.") 547 } else { 548 rlang::abort("Please pass a data set to `new_data`.") 549 } 550 } 551 552 if (!is_tibble(new_data)) { 553 new_data <- as_tibble(new_data) 554 } 555 556 check_nominal_type(new_data, object$orig_lvls) 557 558 # Drop completely new columns from `new_data` and reorder columns that do 559 # still exist to match the ordering used when training 560 original_names <- names(new_data) 561 original_training_names <- unique(object$var_info$variable) 562 bakeable_names <- intersect(original_training_names, original_names) 563 new_data <- new_data[, bakeable_names] 564 565 n_steps <- length(object$steps) 566 567 for (i in seq_len(n_steps)) { 568 step <- object$steps[[i]] 569 570 if (is_skipable(step)) { 571 next 572 } 573 574 new_data <- bake(step, new_data = new_data) 575 576 if (!is_tibble(new_data)) { 577 new_data <- as_tibble(new_data) 578 } 579 } 580 581 # Use `last_term_info`, which maintains info on all columns that got added 582 # and removed from the training data. This is important for skipped steps 583 # which might have resulted in columns not being added/removed in the test 584 # set. 585 info <- object$last_term_info 586 587 # Now reduce to only user selected columns 588 out_names <- recipes_eval_select(terms, new_data, info) 589 new_data <- new_data[, out_names] 590 591 ## The levels are not null when no nominal data are present or 592 ## if strings_as_factors = FALSE in `prep` 593 if (!is.null(object$levels)) { 594 var_levels <- object$levels 595 var_levels <- var_levels[out_names] 596 check_values <- 597 vapply(var_levels, function(x) 598 (!all(is.na(x))), c(all = TRUE)) 599 var_levels <- var_levels[check_values] 600 if (length(var_levels) > 0) 601 new_data <- strings2factors(new_data, var_levels) 602 } 603 604 if (composition == "dgCMatrix") { 605 new_data <- convert_matrix(new_data, sparse = TRUE) 606 } else if (composition == "matrix") { 607 new_data <- convert_matrix(new_data, sparse = FALSE) 608 } else if (composition == "data.frame") { 609 new_data <- base::as.data.frame(new_data) 610 } 611 612 new_data 613} 614 615#' Print a Recipe 616#' 617#' @aliases print.recipe 618#' @param x A `recipe` object 619#' @param form_width The number of characters used to print the variables or 620#' terms in a formula 621#' @param ... further arguments passed to or from other methods (not currently 622#' used). 623#' @return The original object (invisibly) 624#' 625#' @export 626print.recipe <- function(x, form_width = 30, ...) { 627 cat("Recipe\n\n") 628 cat("Inputs:\n\n") 629 no_role <- is.na(x$var_info$role) 630 if (any(!no_role)) { 631 tab <- as.data.frame(table(x$var_info$role)) 632 colnames(tab) <- c("role", "#variables") 633 print(tab, row.names = FALSE) 634 if (any(no_role)) { 635 cat("\n ", sum(no_role), "variables with undeclared roles\n") 636 } 637 } else { 638 cat(" ", nrow(x$var_info), "variables (no declared roles)\n") 639 } 640 if ("tr_info" %in% names(x)) { 641 nmiss <- x$tr_info$nrows - x$tr_info$ncomplete 642 cat("\nTraining data contained ", 643 x$tr_info$nrows, 644 " data points and ", 645 sep = "") 646 if (x$tr_info$nrows == x$tr_info$ncomplete) 647 cat("no missing data.\n") 648 else 649 cat(nmiss, 650 "incomplete", 651 ifelse(nmiss > 1, "rows.", "row."), 652 "\n") 653 } 654 if (!is.null(x$steps)) { 655 cat("\nOperations:\n\n") 656 for (i in seq_along(x$steps)) 657 print(x$steps[[i]], form_width = form_width) 658 } 659 invisible(x) 660} 661 662#' Summarize a recipe 663#' 664#' This function prints the current set of variables/features and some of their 665#' characteristics. 666#' @aliases summary.recipe 667#' @param object A `recipe` object 668#' @param original A logical: show the current set of variables or the original 669#' set when the recipe was defined. 670#' @param ... further arguments passed to or from other methods (not currently 671#' used). 672#' @return A tibble with columns `variable`, `type`, `role`, 673#' and `source`. 674#' @details 675#' Note that, until the recipe has been trained, 676#' the current and original variables are the same. 677#' 678#' It is possible for variables to have multiple roles by adding them with 679#' [add_role()]. If a variable has multiple roles, it will have more than one 680#' row in the summary tibble. 681#' 682#' @examples 683#' rec <- recipe( ~ ., data = USArrests) 684#' summary(rec) 685#' rec <- step_pca(rec, all_numeric(), num_comp = 3) 686#' summary(rec) # still the same since not yet trained 687#' rec <- prep(rec, training = USArrests) 688#' summary(rec) 689#' @export 690#' @seealso [recipe()] [prep.recipe()] 691summary.recipe <- function(object, original = FALSE, ...) { 692 if (original) 693 object$var_info 694 else 695 object$term_info 696} 697 698 699#' Extract transformed training set 700#' 701#' As of `recipes` version 0.1.14, **`juice()` is superseded** in favor of 702#' `bake(object, new_data = NULL)`. 703#' 704#' As steps are estimated by `prep`, these operations are 705#' applied to the training set. Rather than running `bake()` 706#' to duplicate this processing, this function will return 707#' variables from the processed training set. 708#' @inheritParams bake.recipe 709#' @param object A `recipe` object that has been prepared 710#' with the option `retain = TRUE`. 711#' @details When preparing a recipe, if the training data set is 712#' retained using `retain = TRUE`, there is no need to `bake()` the 713#' recipe to get the preprocessed training set. 714#' 715#' `juice()` will return the results of a recipe where _all steps_ 716#' have been applied to the data, irrespective of the value of 717#' the step's `skip` argument. 718#' @export 719#' @seealso [recipe()] [prep.recipe()] [bake.recipe()] 720juice <- function(object, ..., composition = "tibble") { 721 if (!fully_trained(object)) { 722 rlang::abort("At least one step has not been trained. Please run `prep()`.") 723 } 724 725 if (!isTRUE(object$retained)) { 726 rlang::abort(paste0( 727 "Use `retain = TRUE` in `prep()` to be able ", 728 "to extract the training set" 729 )) 730 } 731 732 if (!any(composition == formats)) { 733 rlang::abort(paste0( 734 "`composition` should be one of: ", 735 paste0("'", formats, "'", collapse = ",") 736 )) 737 } 738 739 terms <- quos(...) 740 if (is_empty(terms)) { 741 terms <- quos(everything()) 742 } 743 744 # Get user requested columns 745 new_data <- object$template 746 out_names <- recipes_eval_select(terms, new_data, object$term_info) 747 new_data <- new_data[, out_names] 748 749 ## Since most models require factors, do the conversion from character 750 if (!is.null(object$levels)) { 751 var_levels <- object$levels 752 var_levels <- var_levels[out_names] 753 check_values <- 754 vapply(var_levels, function(x) 755 (!all(is.na(x))), c(all = TRUE)) 756 var_levels <- var_levels[check_values] 757 if (length(var_levels) > 0) 758 new_data <- strings2factors(new_data, var_levels) 759 } 760 761 if (composition == "dgCMatrix") { 762 new_data <- convert_matrix(new_data, sparse = TRUE) 763 } else if (composition == "matrix") { 764 new_data <- convert_matrix(new_data, sparse = FALSE) 765 } else if (composition == "data.frame") { 766 new_data <- base::as.data.frame(new_data) 767 } 768 769 new_data 770} 771 772formats <- c("tibble", "dgCMatrix", "matrix", "data.frame") 773 774utils::globalVariables(c("number")) 775 776# ------------------------------------------------------------------------------ 777 778#' S3 methods for tracking which additional packages are needed for steps. 779#' 780#' @param x A recipe or recipe step 781#' @param infra Should recipes itself be included in the result? 782#' @return A character vector 783#' @name required_pkgs.recipe 784#' @keywords internal 785#' @export 786required_pkgs.recipe <- function(x, infra = TRUE, ...) { 787 res <- purrr::map(x$steps, required_pkgs) 788 res <- unique(unlist(res)) 789 if (infra) { 790 res <- c("recipes", res) 791 } 792 res <- unique(res) 793 res <- res[length(res) != 0] 794 res 795} 796 797#' @rdname required_pkgs.recipe 798#' @export 799required_pkgs.step <- function(x, ...) { 800 character(0) 801} 802 803#' @rdname required_pkgs.recipe 804#' @export 805required_pkgs.check <- function(x, ...) { 806 character(0) 807} 808