1#' Backwards Feature Selection
2#'
3#' A simple backwards selection, a.k.a. recursive feature elimination (RFE),
4#' algorithm
5#'
6#' More details on this function can be found at
7#' \url{http://topepo.github.io/caret/recursive-feature-elimination.html}.
8#'
9#' This function implements backwards selection of predictors based on
10#' predictor importance ranking. The predictors are ranked and the less
11#' important ones are sequentially eliminated prior to modeling. The goal is to
12#' find a subset of predictors that can be used to produce an accurate model.
13#' The web page \url{http://topepo.github.io/caret/recursive-feature-elimination.html#rfe}
14#' has more details and examples related to this function.
15#'
16#' \code{rfe} can be used with "explicit parallelism", where different
17#' resamples (e.g. cross-validation group) can be split up and run on multiple
18#' machines or processors. By default, \code{rfe} will use a single processor
19#' on the host machine. As of version 4.99 of this package, the framework used
20#' for parallel processing uses the \pkg{foreach} package. To run the resamples
21#' in parallel, the code for \code{rfe} does not change; prior to the call to
22#' \code{rfe}, a parallel backend is registered with \pkg{foreach} (see the
23#' examples below).
24#'
25#' \code{rfeIter} is the basic algorithm while \code{rfe} wraps these
26#' operations inside of resampling. To avoid selection bias, it is better to
27#' use the function \code{rfe} than \code{rfeIter}.
28#'
29#' When updating a model, if the entire set of resamples were not saved using
30#' \code{rfeControl(returnResamp = "final")}, the existing resamples are
31#' removed with a warning.
32#'
33#' @aliases rfe rfe.default rfeIter predict.rfe update.rfe
34#' @param x A matrix or data frame of predictors for model training. This
35#' object must have unique column names. For the recipes method, \code{x}
36#' is a recipe object.
37#' @param y a vector of training set outcomes (either numeric or factor)
38#' @param testX a matrix or data frame of test set predictors. This must have
39#' the same column names as \code{x}
40#' @param testY a vector of test set outcomes
41#' @param sizes a numeric vector of integers corresponding to the number of
42#' features that should be retained
43#' @param metric a string that specifies what summary metric will be used to
44#' select the optimal model. By default, possible values are "RMSE" and
45#' "Rsquared" for regression and "Accuracy" and "Kappa" for classification. If
46#' custom performance metrics are used (via the \code{functions} argument in
47#' \code{\link{rfeControl}}, the value of \code{metric} should match one of the
48#' arguments.
49#' @param maximize a logical: should the metric be maximized or minimized?
50#' @param rfeControl a list of options, including functions for fitting and
51#' prediction. The web page
52#' \url{http://topepo.github.io/caret/recursive-feature-elimination.html#rfe} has more
53#' details and examples related to this function.
54#' @param object an object of class \code{rfe}
55#' @param size a single integers corresponding to the number of features that
56#' should be retained in the updated model
57#' @param label an optional character string to be printed when in verbose
58#' mode.
59#' @param seeds an optional vector of integers for the size. The vector should
60#' have length of \code{length(sizes)+1}
61#' @param \dots options to pass to the model fitting function (ignored in
62#' \code{predict.rfe})
63#' @return A list with elements \item{finalVariables}{a list of size
64#' \code{length(sizes) + 1} containing the column names of the ``surviving''
65#' predictors at each stage of selection. The first element corresponds to all
66#' the predictors (i.e. \code{size = ncol(x)})} \item{pred }{a data frame with
67#' columns for the test set outcome, the predicted outcome and the subset
68#' size.}
69#' @note We using a recipe as an input, there may be some subset
70#'  sizes that are not well-replicated over resamples. `rfe` method
71#'  will only consider subset sizes where at least half of the
72#'  resamples have associated results in the search for an optimal
73#'  subset size.
74#' @author Max Kuhn
75#' @seealso \code{\link{rfeControl}}
76#' @keywords models
77#' @examples
78#'
79#' \dontrun{
80#' data(BloodBrain)
81#'
82#' x <- scale(bbbDescr[,-nearZeroVar(bbbDescr)])
83#' x <- x[, -findCorrelation(cor(x), .8)]
84#' x <- as.data.frame(x, stringsAsFactors = TRUE)
85#'
86#' set.seed(1)
87#' lmProfile <- rfe(x, logBBB,
88#'                  sizes = c(2:25, 30, 35, 40, 45, 50, 55, 60, 65),
89#'                  rfeControl = rfeControl(functions = lmFuncs,
90#'                                          number = 200))
91#' set.seed(1)
92#' lmProfile2 <- rfe(x, logBBB,
93#'                  sizes = c(2:25, 30, 35, 40, 45, 50, 55, 60, 65),
94#'                  rfeControl = rfeControl(functions = lmFuncs,
95#'                                          rerank = TRUE,
96#'                                          number = 200))
97#'
98#' xyplot(lmProfile$results$RMSE + lmProfile2$results$RMSE  ~
99#'        lmProfile$results$Variables,
100#'        type = c("g", "p", "l"),
101#'        auto.key = TRUE)
102#'
103#' rfProfile <- rfe(x, logBBB,
104#'                  sizes = c(2, 5, 10, 20),
105#'                  rfeControl = rfeControl(functions = rfFuncs))
106#'
107#' bagProfile <- rfe(x, logBBB,
108#'                   sizes = c(2, 5, 10, 20),
109#'                   rfeControl = rfeControl(functions = treebagFuncs))
110#'
111#' set.seed(1)
112#' svmProfile <- rfe(x, logBBB,
113#'                   sizes = c(2, 5, 10, 20),
114#'                   rfeControl = rfeControl(functions = caretFuncs,
115#'                                           number = 200),
116#'                   ## pass options to train()
117#'                   method = "svmRadial")
118#'
119#' ## classification
120#'
121#' data(mdrr)
122#' mdrrDescr <- mdrrDescr[,-nearZeroVar(mdrrDescr)]
123#' mdrrDescr <- mdrrDescr[, -findCorrelation(cor(mdrrDescr), .8)]
124#'
125#' set.seed(1)
126#' inTrain <- createDataPartition(mdrrClass, p = .75, list = FALSE)[,1]
127#'
128#' train <- mdrrDescr[ inTrain, ]
129#' test  <- mdrrDescr[-inTrain, ]
130#' trainClass <- mdrrClass[ inTrain]
131#' testClass  <- mdrrClass[-inTrain]
132#'
133#' set.seed(2)
134#' ldaProfile <- rfe(train, trainClass,
135#'                   sizes = c(1:10, 15, 30),
136#'                   rfeControl = rfeControl(functions = ldaFuncs, method = "cv"))
137#' plot(ldaProfile, type = c("o", "g"))
138#'
139#' postResample(predict(ldaProfile, test), testClass)
140#'
141#' }
142#'
143#' #######################################
144#' ## Parallel Processing Example via multicore
145#'
146#' \dontrun{
147#' library(doMC)
148#'
149#' ## Note: if the underlying model also uses foreach, the
150#' ## number of cores specified above will double (along with
151#' ## the memory requirements)
152#' registerDoMC(cores = 2)
153#'
154#' set.seed(1)
155#' lmProfile <- rfe(x, logBBB,
156#'                  sizes = c(2:25, 30, 35, 40, 45, 50, 55, 60, 65),
157#'                  rfeControl = rfeControl(functions = lmFuncs,
158#'                                          number = 200))
159#'
160#'
161#' }
162#'
163#'
164#' @export rfe
165rfe <- function (x, ...) UseMethod("rfe")
166
167#' @rdname rfe
168#' @method rfe default
169#' @importFrom stats predict runif
170#' @export
171"rfe.default" <-
172  function(x, y,
173           sizes = 2^(2:4),
174           metric = ifelse(is.factor(y), "Accuracy", "RMSE"),
175           maximize = ifelse(metric %in% c("RMSE", "MAE", "logLoss"), FALSE, TRUE),
176           rfeControl = rfeControl(), ...)
177  {
178    startTime <- proc.time()
179    funcCall <- match.call(expand.dots = TRUE)
180    if(!("caret" %in% loadedNamespaces())) loadNamespace("caret")
181
182    if(nrow(x) != length(y)) stop("there should be the same number of samples in x and y")
183    numFeat <- ncol(x)
184    classLevels <- levels(y)
185
186    if(is.null(rfeControl$index))
187      rfeControl$index <- switch(tolower(rfeControl$method),
188                                 cv = createFolds(y, rfeControl$number, returnTrain = TRUE),
189                                 repeatedcv = createMultiFolds(y, rfeControl$number, rfeControl$repeats),
190                                 loocv = createFolds(y, length(y), returnTrain = TRUE),
191                                 boot =, boot632 = createResample(y, rfeControl$number),
192                                 test = createDataPartition(y, 1, rfeControl$p),
193                                 lgocv = createDataPartition(y, rfeControl$number, rfeControl$p))
194
195    if(is.null(names(rfeControl$index))) names(rfeControl$index) <- prettySeq(rfeControl$index)
196    if(is.null(rfeControl$indexOut)){
197      rfeControl$indexOut <- lapply(rfeControl$index,
198                                    function(training, allSamples) allSamples[-unique(training)],
199                                    allSamples = seq(along = y))
200      names(rfeControl$indexOut) <- prettySeq(rfeControl$indexOut)
201    }
202
203    sizes <- sort(unique(sizes))
204    sizes <- sizes[sizes <= ncol(x)]
205
206    ## check summary function and metric
207    testOutput <- data.frame(pred = sample(y, min(10, length(y))),
208                             obs = sample(y, min(10, length(y))))
209
210    if(is.factor(y))
211    {
212      for(i in seq(along = classLevels)) testOutput[, classLevels[i]] <- runif(nrow(testOutput))
213    }
214
215    test <- rfeControl$functions$summary(testOutput, lev = classLevels)
216    perfNames <- names(test)
217
218    if(!(metric %in% perfNames))
219    {
220      warning(paste("Metric '", metric, "' is not created by the summary function; '",
221                    perfNames[1], "' will be used instead", sep = ""))
222      metric <- perfNames[1]
223    }
224
225    ## Set or check the seeds when needed
226    totalSize <- if(any(sizes == ncol(x))) length(sizes) else length(sizes) + 1
227    if(is.null(rfeControl$seeds))
228    {
229      seeds <- vector(mode = "list", length = length(rfeControl$index))
230      seeds <- lapply(seeds, function(x) sample.int(n = 1000000, size = totalSize))
231      seeds[[length(rfeControl$index) + 1]] <- sample.int(n = 1000000, size = 1)
232      rfeControl$seeds <- seeds
233    } else {
234      if(!(length(rfeControl$seeds) == 1 && is.na(rfeControl$seeds)))
235      {
236        ## check versus number of tasks
237        numSeeds <- unlist(lapply(rfeControl$seeds, length))
238        badSeed <- (length(rfeControl$seeds) < length(rfeControl$index) + 1) ||
239          (any(numSeeds[-length(numSeeds)] < totalSize))
240        if(badSeed) stop(paste("Bad seeds: the seed object should be a list of length",
241                               length(rfeControl$index) + 1, "with",
242                               length(rfeControl$index), "integer vectors of size",
243                               totalSize, "and the last list element having a",
244                               "single integer"))
245      }
246    }
247
248    if(rfeControl$method == "LOOCV")
249    {
250      tmp <- looRfeWorkflow(x, y, sizes, ppOpts = NULL, ctrl = rfeControl, lev = classLevels, ...)
251      selectedVars <- do.call("c", tmp$everything[names(tmp$everything) == "finalVariables"])
252      selectedVars <- do.call("rbind", selectedVars)
253      externPerf <- tmp$performance
254    } else {
255      tmp <- nominalRfeWorkflow(x, y, sizes, ppOpts = NULL, ctrl = rfeControl, lev = classLevels, ...)
256      selectedVars <- do.call("rbind", tmp$everything[names(tmp$everything) == "selectedVars"])
257      resamples <- do.call("rbind", tmp$everything[names(tmp$everything) == "resamples"])
258      rownames(resamples) <- NULL
259      externPerf <- tmp$performance
260    }
261    rownames(selectedVars) <- NULL
262
263    bestSubset <- rfeControl$functions$selectSize(x = externPerf,
264                                                  metric = metric,
265                                                  maximize = maximize)
266
267    bestVar <- rfeControl$functions$selectVar(selectedVars, bestSubset)
268
269    finalTime <- system.time(
270      fit <- rfeControl$functions$fit(x[, bestVar, drop = FALSE],
271                                      y,
272                                      first = FALSE,
273                                      last = TRUE,
274                                      ...))
275
276
277    if(is.factor(y) & any(names(tmp$performance) == ".cell1"))
278    {
279      keepers <- c("Resample", "Variables", grep("\\.cell", names(tmp$performance), value = TRUE))
280      resampledCM <- subset(tmp$performance, Variables == bestSubset)
281      tmp$performance <- tmp$performance[, -grep("\\.cell", names(tmp$performance))]
282    } else resampledCM <- NULL
283
284    if(!(rfeControl$method %in% c("LOOCV"))) {
285      resamples <- switch(rfeControl$returnResamp,
286                          none = NULL,
287                          all = resamples,
288                          final = subset(resamples, Variables == bestSubset))
289    } else resamples <- NULL
290
291    endTime <- proc.time()
292    times <- list(everything = endTime - startTime,
293                  final = finalTime)
294
295    #########################################################################
296    ## Now, based on probability or static ranking, figure out the best vars
297    ## and the best subset size and fit final model
298
299    out <- structure(
300      list(
301        pred = if(rfeControl$saveDetails) do.call("rbind", tmp$everything[names(tmp$everything) == "predictions"]) else NULL,
302        variables = selectedVars,
303        results = as.data.frame(externPerf, stringsAsFactors = TRUE),
304        bestSubset = bestSubset,
305        fit = fit,
306        optVariables = bestVar,
307        optsize = bestSubset,
308        call = funcCall,
309        control = rfeControl,
310        resample = resamples,
311        metric = metric,
312        maximize = maximize,
313        perfNames = perfNames,
314        times = times,
315        resampledCM = resampledCM,
316        obsLevels = classLevels,
317        dots = list(...)),
318      class = "rfe")
319    if(rfeControl$timingSamps > 0)
320    {
321      out$times$prediction <- system.time(predict(out, x[1:min(nrow(x), rfeControl$timingSamps),,drop = FALSE]))
322    } else  out$times$prediction <- rep(NA, 3)
323    out
324  }
325
326#' @method rfe formula
327#' @inheritParams train
328#' @importFrom stats .getXlevels contrasts model.matrix model.response
329#' @rdname rfe
330#' @export
331rfe.formula <- function (form, data, ..., subset, na.action, contrasts = NULL)
332{
333  m <- match.call(expand.dots = FALSE)
334  if (is.matrix(eval.parent(m$data))) m$data <- as.data.frame(data, stringsAsFactors = TRUE)
335  m$... <- m$contrasts <- NULL
336  m[[1]] <- as.name("model.frame")
337  m <- eval.parent(m)
338  Terms <- attr(m, "terms")
339  x <- model.matrix(Terms, m, contrasts)
340  cons <- attr(x, "contrast")
341  xint <- match("(Intercept)", colnames(x), nomatch = 0)
342  if (xint > 0)  x <- x[, -xint, drop = FALSE]
343  y <- model.response(m)
344  res <- rfe(as.data.frame(x, stringsAsFactors = TRUE), y, ...)
345  res$terms <- Terms
346  res$coefnames <- colnames(x)
347  res$call <- match.call()
348  res$na.action <- attr(m, "na.action")
349  res$contrasts <- cons
350  res$xlevels <- .getXlevels(Terms, m)
351  class(res) <- c("rfe", "rfe.formula")
352  res
353}
354
355######################################################################
356######################################################################
357#' @method print rfe
358#' @export
359print.rfe <- function(x, top = 5, digits = max(3, getOption("digits") - 3), ...)
360{
361
362  cat("\nRecursive feature selection\n\n")
363
364  resampleN <- unlist(lapply(x$control$index, length))
365  numResamp <- length(resampleN)
366
367  resampText <- resampName(x)
368  cat("Outer resampling method:", resampText, "\n")
369
370  cat("\nResampling performance over subset size:\n\n")
371  x$results$Selected <- ""
372  x$results$Selected[x$results$Variables == x$bestSubset] <- "*"
373  print(format(x$results, digits = digits), row.names = FALSE)
374  cat("\n")
375
376  cat("The top ",
377      min(top, x$bestSubset),
378      " variables (out of ",
379      x$bestSubset,
380      "):\n   ",
381      paste(x$optVariables[1:min(top, x$bestSubset)], collapse = ", "),
382      "\n\n",
383      sep = "")
384
385  invisible(x)
386}
387
388######################################################################
389######################################################################
390
391#' @rdname rfe
392#' @importFrom stats complete.cases
393#' @importFrom utils flush.console
394#' @export
395rfeIter <- function(x, y,
396                    testX, testY, sizes,
397                    rfeControl = rfeControl(),
398                    label = "",
399                    seeds = NA,
400                    ...)
401{
402  if(is.null(colnames(x))) stop("x must have column names")
403
404  if(is.null(testX) | is.null(testY)) stop("a test set must be specified")
405  if(is.null(sizes)) stop("please specify the number of features")
406
407  predictionMatrix <- matrix(NA, nrow = length(testY), ncol = length(sizes))
408  p <- ncol(x)
409
410  retained <- colnames(x)
411  sizeValues <- sort(unique(c(sizes, ncol(x))), decreasing = TRUE)
412  sizeText <- format(sizeValues)
413
414  finalVariables <- vector(length(sizeValues), mode = "list")
415  for(k in seq(along = sizeValues))
416  {
417    if(!any(is.na(seeds))) set.seed(seeds[k])
418    if(rfeControl$verbose)
419    {
420      cat("+(rfe) fit",
421          ifelse(label != "",
422                 label, ""),
423          "size:",  sizeText[k], "\n")
424    }
425    flush.console()
426    fitObject <- rfeControl$functions$fit(x[,retained,drop = FALSE], y,
427                                          first = p == ncol(x[,retained,drop = FALSE]),
428                                          last = FALSE,
429                                          ...)
430    if(rfeControl$verbose)
431    {
432      cat("-(rfe) fit",
433          ifelse(label != "",
434                 label, ""),
435          "size:",  sizeText[k], "\n")
436    }
437    modelPred <- rfeControl$functions$pred(fitObject, testX[,retained,drop = FALSE])
438    if(is.data.frame(modelPred) | is.matrix(modelPred))
439    {
440      if(is.matrix(modelPred)) {
441        modelPred <- as.data.frame(modelPred, stringsAsFactors = TRUE)
442        ## in the case where the function returns a matrix with a single column
443        ## make sure that it is named pred
444        if(ncol(modelPred) == 1) names(modelPred) <- "pred"
445      }
446      modelPred$obs <- testY
447      modelPred$Variables <- sizeValues[k]
448    } else modelPred <- data.frame(pred = modelPred, obs = testY, Variables = sizeValues[k])
449
450    ## save as a vector and rbind at end
451    rfePred <- if(k == 1) modelPred else rbind(rfePred, modelPred)
452
453
454    if(!exists("modImp")) ##todo: get away from this since it finds object in other spaces
455    {
456      if(rfeControl$verbose)
457      {
458        cat("+(rfe) imp",
459            ifelse(label != "",
460                   label, ""), "\n")
461      }
462      modImp <- rfeControl$functions$rank(fitObject, x[,retained,drop = FALSE], y)
463      if(rfeControl$verbose)
464      {
465        cat("-(rfe) imp",
466            ifelse(label != "",
467                   label, ""), "\n")
468      }
469    } else {
470      if(rfeControl$rerank)
471      {
472        if(rfeControl$verbose)
473        {
474          cat("+(rfe) imp",
475              ifelse(label != "",
476                     label, ""),
477              "size:",  sizeText[k], "\n")
478        }
479        modImp <- rfeControl$functions$rank(fitObject, x[,retained,drop = FALSE], y)
480        if(rfeControl$verbose)
481        {
482          cat("-(rfe) imp",
483              ifelse(label != "",
484                     label, ""),
485              "size:",  sizeText[k], "\n")
486        }
487      }
488    }
489
490    if(nrow(modImp) < sizeValues[k]) {
491      msg1 <- paste0("rfe is expecting ", sizeValues[k],
492                     " importance values but only has ", nrow(modImp), ". ",
493                     "This may be caused by having zero-variance predictors, ",
494                     "excessively-correlated predictors, factor predictors ",
495                     "that were expanded into dummy variables or you may have ",
496                     "failed to drop one of your dummy variables.")
497      warning(msg1, call. = FALSE)
498      modImp <- repair_rank(modImp, colnames(x))
499    }
500    if(any(!complete.cases(modImp))){
501      warning(paste("There were missing importance values.",
502                 "There may be linear dependencies in your predictor variables"),
503              call. = FALSE)
504    }
505    if (!any(names(modImp) == "var")) {
506      stop("The importance score data should include a column named `var`.")
507    }
508    finalVariables[[k]] <- subset(modImp, var %in% retained)
509    finalVariables[[k]]$Variables <- sizeValues[[k]]
510    if(k < length(sizeValues)) retained <- as.character(modImp$var)[1:sizeValues[k+1]]
511  }
512  list(finalVariables = finalVariables, pred = rfePred)
513
514}
515
516######################################################################
517######################################################################
518
519
520
521
522
523#' Plot RFE Performance Profiles
524#'
525#' These functions plot the resampling results for the candidate subset sizes
526#' evaluated during the recursive feature elimination (RFE) process
527#'
528#' These plots show the average performance versus the subset sizes.
529#'
530#' @aliases plot.rfe ggplot.rfe
531#' @param x an object of class \code{\link{rfe}}.
532#' @param metric What measure of performance to plot. Examples of possible
533#' values are "RMSE", "Rsquared", "Accuracy" or "Kappa". Other values can be
534#' used depending on what metrics have been calculated.
535#' @param \dots \code{plot} only: specifications to be passed to
536#' \code{\link[lattice]{xyplot}}. The function automatically sets some
537#' arguments (e.g. axis labels) but passing in values here will over-ride the
538#' defaults.
539#' @param data an object of class \code{\link{rfe}}.
540#' @param output either "data", "ggplot" or "layered". The first returns a data
541#' frame while the second returns a simple \code{ggplot} object with no layers.
542#' The third value returns a plot with a set of layers.
543#' @param mapping,environment unused arguments to make consistent with
544#' \pkg{ggplot2} generic method
545#' @return a lattice or ggplot object
546#' @note We using a recipe as an input, there may be some subset sizes that are
547#'  not well-replicated over resamples. The `ggplot` method will only show
548#'  subset sizes where at least half of the resamples have associated results.
549#' @author Max Kuhn
550#' @seealso \code{\link{rfe}}, \code{\link[lattice]{xyplot}},
551#' \code{\link[ggplot2]{ggplot}}
552#' @references Kuhn (2008), ``Building Predictive Models in R Using the caret''
553#' (\doi{10.18637/jss.v028.i05})
554#' @keywords hplot
555#' @method plot rfe
556#' @export
557#' @examples
558#'
559#' \dontrun{
560#' data(BloodBrain)
561#'
562#' x <- scale(bbbDescr[,-nearZeroVar(bbbDescr)])
563#' x <- x[, -findCorrelation(cor(x), .8)]
564#' x <- as.data.frame(x, stringsAsFactors = TRUE)
565#'
566#' set.seed(1)
567#' lmProfile <- rfe(x, logBBB,
568#'                  sizes = c(2:25, 30, 35, 40, 45, 50, 55, 60, 65),
569#'                  rfeControl = rfeControl(functions = lmFuncs,
570#'                                          number = 200))
571#' plot(lmProfile)
572#' plot(lmProfile, metric = "Rsquared")
573#' ggplot(lmProfile)
574#' }
575#' @export plot.rfe
576plot.rfe <- function (x,
577                      metric = x$metric,
578                      ...) {
579  x$results$Selected <- ""
580  x$results$Selected[x$results$Variables == x$bestSubset] <- "*"
581
582  results <- x$results[, colnames(x$results) %in% c("Variables", "Selected", metric)]
583  metric <- metric[which(metric %in% colnames(results))]
584
585  plotForm <- as.formula(paste(metric, "~ Variables"))
586  panel.profile <- function(x, y, groups, ...)
587  {
588    panel.xyplot(x, y, ...)
589    panel.xyplot(x[groups == "*"], y[groups == "*"], pch = 16)
590  }
591  resampText <- resampName(x, FALSE)
592  resampText <- paste(metric, resampText)
593  out <- xyplot(plotForm, data = results, groups = Selected, panel =  panel.profile,
594                ylab = resampText,
595                ...)
596
597  out
598}
599
600######################################################################
601######################################################################
602#' Controlling the Feature Selection Algorithms
603#'
604#' This function generates a control object that can be used to specify the
605#' details of the feature selection algorithms used in this package.
606#'
607#' More details on this function can be found at
608#' \url{http://topepo.github.io/caret/recursive-feature-elimination.html#rfe}.
609#'
610#' Backwards selection requires function to be specified for some operations.
611#'
612#' The \code{fit} function builds the model based on the current data set. The
613#' arguments for the function must be: \itemize{ \item\code{x} the current
614#' training set of predictor data with the appropriate subset of variables
615#' \item\code{y} the current outcome data (either a numeric or factor vector)
616#' \item\code{first} a single logical value for whether the current predictor
617#' set has all possible variables \item\code{last} similar to \code{first}, but
618#' \code{TRUE} when the last model is fit with the final subset size and
619#' predictors.  \item\code{...}optional arguments to pass to the fit function
620#' in the call to \code{rfe} } The function should return a model object that
621#' can be used to generate predictions.
622#'
623#' The \code{pred} function returns a vector of predictions (numeric or
624#' factors) from the current model. The arguments are: \itemize{
625#' \item\code{object} the model generated by the \code{fit} function
626#' \item\code{x} the current set of predictor set for the held-back samples }
627#'
628#' The \code{rank} function is used to return the predictors in the order of
629#' the most important to the least important. Inputs are: \itemize{
630#' \item\code{object} the model generated by the \code{fit} function
631#' \item\code{x} the current set of predictor set for the training samples
632#' \item\code{y} the current training outcomes } The function should return a
633#' data frame with a column called \code{var} that has the current variable
634#' names. The first row should be the most important predictor etc. Other
635#' columns can be included in the output and will be returned in the final
636#' \code{rfe} object.
637#'
638#' The \code{selectSize} function determines the optimal number of predictors
639#' based on the resampling output. Inputs for the function are: \itemize{
640#' \item\code{x}a matrix with columns for the performance metrics and the
641#' number of variables, called "\code{Variables}" \item\code{metric}a character
642#' string of the performance measure to optimize (e.g. "RMSE", "Rsquared",
643#' "Accuracy" or "Kappa") \item\code{maximize}a single logical for whether the
644#' metric should be maximized } This function should return an integer
645#' corresponding to the optimal subset size. \pkg{caret} comes with two
646#' examples functions for this purpose: \code{\link{pickSizeBest}} and
647#' \code{\link{pickSizeTolerance}}.
648#'
649#' After the optimal subset size is determined, the \code{selectVar} function
650#' will be used to calculate the best rankings for each variable across all the
651#' resampling iterations. Inputs for the function are: \itemize{ \item\code{y}
652#' a list of variables importance for each resampling iteration and each subset
653#' size (generated by the user-defined \code{rank} function). In the example,
654#' each each of the cross-validation groups the output of the \code{rank}
655#' function is saved for each of the subset sizes (including the original
656#' subset). If the rankings are not recomputed at each iteration, the values
657#' will be the same within each cross-validation iteration.  \item\code{size}
658#' the integer returned by the \code{selectSize} function } This function
659#' should return a character string of predictor names (of length \code{size})
660#' in the order of most important to least important
661#'
662#' Examples of these functions are included in the package:
663#' \code{\link{lmFuncs}}, \code{\link{rfFuncs}}, \code{\link{treebagFuncs}} and
664#' \code{\link{nbFuncs}}.
665#'
666#' Model details about these functions, including examples, are at
667#' \url{http://topepo.github.io/caret/recursive-feature-elimination.html}. .
668#'
669#' @param functions a list of functions for model fitting, prediction and
670#' variable importance (see Details below)
671#' @param rerank a logical: should variable importance be re-calculated each
672#' time features are removed?
673#' @param method The external resampling method: \code{boot}, \code{cv},
674#' \code{LOOCV} or \code{LGOCV} (for repeated training/test splits
675#' @param number Either the number of folds or number of resampling iterations
676#' @param repeats For repeated k-fold cross-validation only: the number of
677#' complete sets of folds to compute
678#' @param saveDetails a logical to save the predictions and variable
679#' importances from the selection process
680#' @param verbose a logical to print a log for each external resampling
681#' iteration
682#' @param returnResamp A character string indicating how much of the resampled
683#' summary metrics should be saved. Values can be ``final'', ``all'' or
684#' ``none''
685#' @param p For leave-group out cross-validation: the training percentage
686#' @param index a list with elements for each external resampling iteration.
687#' Each list element is the sample rows used for training at that iteration.
688#' @param indexOut a list (the same length as \code{index}) that dictates which
689#' sample are held-out for each resample. If \code{NULL}, then the unique set
690#' of samples not contained in \code{index} is used.
691#' @param timingSamps the number of training set samples that will be used to
692#' measure the time for predicting samples (zero indicates that the prediction
693#' time should not be estimated).
694#' @param seeds an optional set of integers that will be used to set the seed
695#' at each resampling iteration. This is useful when the models are run in
696#' parallel. A value of \code{NA} will stop the seed from being set within the
697#' worker processes while a value of \code{NULL} will set the seeds using a
698#' random set of integers. Alternatively, a list can be used. The list should
699#' have \code{B+1} elements where \code{B} is the number of resamples. The
700#' first \code{B} elements of the list should be vectors of integers of length
701#' \code{P} where \code{P} is the number of subsets being evaluated (including
702#' the full set). The last element of the list only needs to be a single
703#' integer (for the final model). See the Examples section below.
704#' @param allowParallel if a parallel backend is loaded and available, should
705#' the function use it?
706#' @return A list
707#' @author Max Kuhn
708#' @seealso \code{\link{rfe}}, \code{\link{lmFuncs}}, \code{\link{rfFuncs}},
709#' \code{\link{treebagFuncs}}, \code{\link{nbFuncs}},
710#' \code{\link{pickSizeBest}}, \code{\link{pickSizeTolerance}}
711#' @keywords utilities
712#' @examples
713#'
714#'   \dontrun{
715#' subsetSizes <- c(2, 4, 6, 8)
716#' set.seed(123)
717#' seeds <- vector(mode = "list", length = 51)
718#' for(i in 1:50) seeds[[i]] <- sample.int(1000, length(subsetSizes) + 1)
719#' seeds[[51]] <- sample.int(1000, 1)
720#'
721#' set.seed(1)
722#' rfMod <- rfe(bbbDescr, logBBB,
723#'              sizes = subsetSizes,
724#'              rfeControl = rfeControl(functions = rfFuncs,
725#'                                      seeds = seeds,
726#'                                      number = 50))
727#'   }
728#'
729#' @export rfeControl
730rfeControl <- function(functions = NULL,
731                       rerank = FALSE,
732                       method = "boot",
733                       saveDetails = FALSE,
734                       number = ifelse(method %in% c("cv", "repeatedcv"), 10, 25),
735                       repeats = ifelse(method %in% c("cv", "repeatedcv"), 1, number),
736                       verbose = FALSE,
737                       returnResamp = "final",
738                       p = .75,
739                       index = NULL,
740                       indexOut = NULL,
741                       timingSamps = 0,
742                       seeds = NA,
743                       allowParallel = TRUE)
744{
745  list(
746    functions = if(is.null(functions)) caretFuncs else functions,
747    rerank = rerank,
748    method = method,
749    saveDetails = saveDetails,
750    number = number,
751    repeats = repeats,
752    returnResamp = returnResamp,
753    verbose = verbose,
754    p = p,
755    index = index,
756    indexOut = indexOut,
757    timingSamps = timingSamps,
758    seeds = seeds,
759    allowParallel = allowParallel)
760}
761
762######################################################################
763######################################################################
764## some built-in functions for certain models
765
766#' @rdname caretFuncs
767#' @export
768pickSizeBest <- function(x, metric, maximize)
769{
770  best <- if(maximize) which.max(x[,metric]) else which.min(x[,metric])
771  min(x[best, "Variables"])
772}
773
774#' @rdname caretFuncs
775#' @export
776pickSizeTolerance <- function(x, metric, tol = 1.5, maximize)
777{
778  if(!maximize)
779  {
780    best <- min(x[,metric])
781    perf <- (x[,metric] - best)/best * 100
782    flag <- perf <= tol
783  } else {
784    best <- max(x[,metric])
785    perf <- (best - x[,metric])/best * 100
786    flag <- perf <= tol
787  }
788  min(x[flag, "Variables"])
789}
790
791#' @rdname caretFuncs
792#' @export
793pickVars <- function(y, size)
794{
795  finalImp <- ddply(y[, c("Overall", "var")],
796                    .(var),
797                    function(x) mean(x$Overall, na.rm = TRUE))
798  names(finalImp)[2] <- "Overall"
799  finalImp <- finalImp[order(finalImp$Overall, decreasing = TRUE),]
800  as.character(finalImp$var[1:size])
801}
802
803
804
805
806#' Backwards Feature Selection Helper Functions
807#'
808#' Ancillary functions for backwards selection
809#'
810#' This page describes the functions that are used in backwards selection (aka
811#' recursive feature elimination). The functions described here are passed to
812#' the algorithm via the \code{functions} argument of \code{\link{rfeControl}}.
813#'
814#' See \code{\link{rfeControl}} for details on how these functions should be
815#' defined.
816#'
817#' The 'pick' functions are used to find the appropriate subset size for
818#' different situations. \code{pickBest} will find the position associated with
819#' the numerically best value (see the \code{maximize} argument to help define
820#' this).
821#'
822#' \code{pickSizeTolerance} picks the lowest position (i.e. the smallest subset
823#' size) that has no more of an X percent loss in performances. When
824#' maximizing, it calculates (O-X)/O*100, where X is the set of performance
825#' values and O is max(X). This is the percent loss. When X is to be minimized,
826#' it uses (X-O)/O*100 (so that values greater than X have a positive "loss").
827#' The function finds the smallest subset size that has a percent loss less
828#' than \code{tol}.
829#'
830#' Both of the 'pick' functions assume that the data are sorted from smallest
831#' subset size to largest.
832#'
833#' @aliases caretFuncs lmFuncs rfFuncs gamFuncs treebagFuncs ldaFuncs nbFuncs
834#' lrFuncs pickSizeBest pickSizeTolerance pickVars
835#' @param x a matrix or data frame with the performance metric of interest
836#' @param metric a character string with the name of the performance metric
837#' that should be used to choose the appropriate number of variables
838#' @param maximize a logical; should the metric be maximized?
839#' @param tol a scalar to denote the acceptable difference in optimal
840#' performance (see Details below)
841#' @param y a list of data frames with variables \code{Overall} and \code{var}
842#' @param size an integer for the number of variables to retain
843#' @author Max Kuhn
844#' @seealso \code{\link{rfeControl}}, \code{\link{rfe}}
845#' @keywords models
846#' @examples
847#'
848#' ## For picking subset sizes:
849#' ## Minimize the RMSE
850#' example <- data.frame(RMSE = c(1.2, 1.1, 1.05, 1.01, 1.01, 1.03, 1.00),
851#'                       Variables = 1:7)
852#' ## Percent Loss in performance (positive)
853#' example$PctLoss <- (example$RMSE - min(example$RMSE))/min(example$RMSE)*100
854#'
855#' xyplot(RMSE ~ Variables, data= example)
856#' xyplot(PctLoss ~ Variables, data= example)
857#'
858#' absoluteBest <- pickSizeBest(example, metric = "RMSE", maximize = FALSE)
859#' within5Pct <- pickSizeTolerance(example, metric = "RMSE", maximize = FALSE)
860#'
861#' cat("numerically optimal:",
862#'     example$RMSE[absoluteBest],
863#'     "RMSE in position",
864#'     absoluteBest, "\n")
865#' cat("Accepting a 1.5 pct loss:",
866#'     example$RMSE[within5Pct],
867#'     "RMSE in position",
868#'     within5Pct, "\n")
869#'
870#' ## Example where we would like to maximize
871#' example2 <- data.frame(Rsquared = c(0.4, 0.6, 0.94, 0.95, 0.95, 0.95, 0.95),
872#'                       Variables = 1:7)
873#' ## Percent Loss in performance (positive)
874#' example2$PctLoss <- (max(example2$Rsquared) - example2$Rsquared)/max(example2$Rsquared)*100
875#'
876#' xyplot(Rsquared ~ Variables, data= example2)
877#' xyplot(PctLoss ~ Variables, data= example2)
878#'
879#' absoluteBest2 <- pickSizeBest(example2, metric = "Rsquared", maximize = TRUE)
880#' within5Pct2 <- pickSizeTolerance(example2, metric = "Rsquared", maximize = TRUE)
881#'
882#' cat("numerically optimal:",
883#'     example2$Rsquared[absoluteBest2],
884#'     "R^2 in position",
885#'     absoluteBest2, "\n")
886#' cat("Accepting a 1.5 pct loss:",
887#'     example2$Rsquared[within5Pct2],
888#'     "R^2 in position",
889#'     within5Pct2, "\n")
890#'
891#' @export caretFuncs
892caretFuncs <- list(summary = defaultSummary,
893                   fit = function(x, y, first, last, ...) train(x, y, ...),
894                   pred = function(object, x) {
895                     tmp <- predict(object, x)
896                     if(object$modelType == "Classification" & object$control$classProbs) {
897                       out <- cbind(data.frame(pred = tmp),
898                                    as.data.frame(predict(object, x, type = "prob"), stringsAsFactors = TRUE), stringsAsFactors = TRUE)
899                     } else out <- tmp
900                     out
901                   },
902                   rank = function(object, x, y) {
903                     vimp <- varImp(object, scale = FALSE)$importance
904                     if(!is.data.frame(vimp)) vimp <- as.data.frame(vimp, stringsAsFactors = TRUE)
905                     if(object$modelType == "Regression") {
906                       vimp <- vimp[order(vimp[,1], decreasing = TRUE),,drop = FALSE]
907                     } else {
908                       if(all(levels(y) %in% colnames(vimp)) & !("Overall" %in% colnames(vimp))) {
909                         avImp <- apply(vimp[, levels(y), drop = TRUE], 1, mean)
910                         vimp$Overall <- avImp
911                       }
912                     }
913                     vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE]
914                     vimp$var <- rownames(vimp)
915                     vimp
916                   },
917                   selectSize = pickSizeBest,
918                   selectVar = pickVars)
919
920## write a better imp sort function
921#' @rdname caretFuncs
922#' @importFrom stats predict
923#' @export
924ldaFuncs <- list(summary = defaultSummary,
925                 fit = function(x, y, first, last, ...)
926                 {
927                   loadNamespace("MASS")
928                   MASS::lda(x, y, ...)
929                 },
930                 pred = function(object, x)
931                 {
932                   tmp <- predict(object, x)
933                   out <- cbind(data.frame(pred = tmp$class),
934                                as.data.frame(tmp$posterior, stringsAsFactors = FALSE), stringsAsFactors = TRUE)
935                   out
936                 },
937                 rank = function(object, x, y)
938                 {
939                   vimp <- filterVarImp(x, y, TRUE)
940
941                   vimp$Overall <- apply(vimp, 1, mean)
942                   vimp <- vimp[order(vimp$Overall, decreasing = TRUE),]
943
944                   vimp <- as.data.frame(vimp, stringsAsFactors = TRUE)[, "Overall",drop = FALSE]
945                   vimp$var <- rownames(vimp)
946                   vimp
947
948                 },
949                 selectSize = pickSizeBest,
950                 selectVar = pickVars
951)
952
953#' @rdname caretFuncs
954#' @importFrom stats predict
955#' @export
956treebagFuncs <- list(summary = defaultSummary,
957                     fit = function(x, y, first, last, ...) {
958                       loadNamespace("ipred")
959                       ipred::ipredbagg(y, x, ...)
960                     },
961                     pred = function(object, x) {
962                       tmp <- predict(object, x)
963                       if(is.factor(object$y)) {
964                         out <- cbind(data.frame(pred = tmp),
965                                      as.data.frame(predict(object, x, type = "prob"), stringsAsFactors = TRUE), stringsAsFactors = TRUE)
966                       } else out <- tmp
967                       out
968                     },
969                     rank = function(object, x, y) {
970                       vimp <- varImp(object)
971                       vimp <- vimp[order(vimp$Overall, decreasing = TRUE),,drop = FALSE]
972                       vimp$var <- rownames(vimp)
973                       vimp
974                     },
975                     selectSize = pickSizeBest,
976                     selectVar = pickVars)
977
978#' @rdname caretFuncs
979#' @importFrom stats predict
980#' @export
981gamFuncs <- list(summary = defaultSummary,
982                 fit = function(x, y, first, last, ...){
983                   loaded <- search()
984                   gamLoaded <- any(loaded == "package:gam")
985                   if(gamLoaded) detach(package:gam)
986                   loadNamespace("mgcv")
987                   gam <- get("gam", asNamespace("mgcv"))
988                   dat <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
989                   dat$y <- y
990                   args <- list(formula = gamFormula(x, smoother = "s", y = "y"),
991                                data = dat,
992                                family = if(!is.factor(y)) gaussian else  binomial)
993                   do.call("gam", args)
994                 },
995                 pred = function(object, x) {
996                   if(!is.data.frame(x)) x <- as.data.frame(x, stringsAsFactors = TRUE)
997                   loaded <- search()
998                   gamLoaded <- any(loaded == "package:gam")
999                   if(gamLoaded) detach(package:gam)
1000                   loadNamespace("mgcv")
1001                   rsp <- predict(object, newdata = x, type = "response")
1002                   if(object$family$family == "binomial") {
1003                     lvl <- levels(object$model$y)
1004                     out <- data.frame(p1 = rsp,
1005                                       p2 = 1-rsp,
1006                                       pred = factor(ifelse(rsp > .5, lvl[2], lvl[1]),
1007                                                     levels = lvl))
1008                     colnames(out)[1:2] <- make.names(lvl)
1009                     out
1010                   } else out <- data.frame(pred = rsp)
1011                   out
1012
1013                 },
1014                 rank = function(object, x, y) {
1015
1016                   loaded <- search()
1017                   gamLoaded <- any(loaded == "package:gam")
1018                   if(gamLoaded) detach(package:gam)
1019                   loadNamespace("mgcv")
1020                   vimp <- varImp(object)
1021                   vimp$var <- rownames(vimp)
1022                   if(any(!(colnames(x) %in% rownames(vimp)))) {
1023                     missing <- colnames(x)[!(colnames(x) %in% rownames(vimp))]
1024                     tmpdf <- data.frame(var = missing,
1025                                         Overall = rep(0, length(missing)))
1026                     vimp <- rbind(vimp, tmpdf)
1027                   }
1028                   vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE]
1029                   vimp
1030                 },
1031                 selectSize = pickSizeBest,
1032                 selectVar = pickVars)
1033
1034#' @rdname caretFuncs
1035#' @importFrom stats predict
1036#' @export
1037rfFuncs <-  list(summary = defaultSummary,
1038                 fit = function(x, y, first, last, ...) {
1039                   loadNamespace("randomForest")
1040                   randomForest::randomForest(x, y, importance = TRUE, ...)
1041                 },
1042                 pred = function(object, x)  {
1043                   tmp <- predict(object, x)
1044                   if(is.factor(object$y)) {
1045                     out <- cbind(data.frame(pred = tmp),
1046                                  as.data.frame(predict(object, x, type = "prob"),
1047                                                stringsAsFactors = TRUE))
1048                   } else out <- tmp
1049                   out
1050                 },
1051                 rank = function(object, x, y) {
1052                   vimp <- varImp(object)
1053
1054                   if(is.factor(y)) {
1055                     if(all(levels(y) %in% colnames(vimp))) {
1056                       avImp <- apply(vimp[, levels(y), drop = TRUE], 1, mean)
1057                       vimp$Overall <- avImp
1058                     }
1059                   }
1060
1061                   vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE]
1062                   if (ncol(x) == 1) {
1063                     vimp$var <- colnames(x)
1064                   } else vimp$var <- rownames(vimp)
1065                   vimp
1066                 },
1067                 selectSize = pickSizeBest,
1068                 selectVar = pickVars)
1069
1070
1071#' @rdname caretFuncs
1072#' @importFrom stats predict lm
1073#' @export
1074lmFuncs <- list(summary = defaultSummary,
1075                fit = function(x, y, first, last, ...) {
1076                  tmp <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
1077                  tmp$y <- y
1078                  lm(y~., data = tmp)
1079                },
1080                pred = function(object, x) {
1081                  if(!is.data.frame(x)) x <- as.data.frame(x, stringsAsFactors = TRUE)
1082                  predict(object, x)
1083                },
1084                rank = function(object, x, y) {
1085                  coefs <- abs(coef(object))
1086                  coefs <- coefs[names(coefs) != "(Intercept)"]
1087                  coefs[is.na(coefs)] <- 0
1088                  vimp <- data.frame(Overall = unname(coefs),
1089                                     var = names(coefs))
1090                  rownames(vimp) <- names(coefs)
1091                  vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE]
1092                  vimp
1093                },
1094                selectSize = pickSizeBest,
1095                selectVar = pickVars)
1096
1097
1098#' @rdname caretFuncs
1099#' @importFrom stats predict
1100#' @export
1101nbFuncs <- list(summary = defaultSummary,
1102                fit = function(x, y, first, last, ...){
1103                  loadNamespace("klaR")
1104                  klaR::NaiveBayes(x, y, usekernel = TRUE, fL = 2, ...)
1105                },
1106                pred = function(object, x) {
1107                  tmp <- predict(object, x)
1108                  out <- cbind(data.frame(pred = tmp$class),
1109                               as.data.frame(tmp$posterior, stringsAsFactors = TRUE))
1110                  out
1111                },
1112                rank = function(object, x, y) {
1113                  vimp <- filterVarImp(x, y)
1114                  if(is.factor(y)) {
1115                    avImp <- apply(vimp, 1, mean)
1116                    vimp$Overall <- avImp
1117                  }
1118
1119                  vimp <- vimp[order(vimp$Overall,decreasing = TRUE),, drop = FALSE]
1120
1121                  vimp$var <- rownames(vimp)
1122                  vimp
1123                },
1124                selectSize = pickSizeBest,
1125                selectVar = pickVars)
1126
1127
1128#' @rdname caretFuncs
1129#' @importFrom stats predict glm
1130#' @export
1131lrFuncs <- ldaFuncs
1132lrFuncs$fit <- function (x, y, first, last, ...)  {
1133  tmp <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
1134  tmp$Class <- y
1135  glm(Class ~ ., data = tmp, family = "binomial")
1136}
1137lrFuncs$pred <- function (object, x) {
1138  if(!is.data.frame(x)) x <- as.data.frame(x, stringsAsFactors = TRUE)
1139  lvl <- levels(object$data$Class)
1140  tmp <- predict(object, x, type = "response")
1141  out <- data.frame(1-tmp, tmp)
1142  colnames(out) <- lvl
1143  out$pred <- factor(ifelse(tmp > .5, lvl[2], lvl[1]),
1144                     levels = lvl)
1145  out
1146}
1147
1148lrFuncs$rank <- function (object, x, y) {
1149  vimp <- varImp(object, scale = FALSE)
1150  vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE]
1151  vimp$var <- rownames(vimp)
1152  vimp
1153}
1154
1155######################################################################
1156######################################################################
1157## lattice functions
1158
1159#' Lattice functions for plotting resampling results of recursive feature
1160#' selection
1161#'
1162#' A set of lattice functions are provided to plot the resampled performance
1163#' estimates (e.g. classification accuracy, RMSE) over different subset sizes.
1164#'
1165#' By default, only the resampling results for the optimal model are saved in
1166#' the \code{rfe} object. The function \code{\link{rfeControl}} can be used to
1167#' save all the results using the \code{returnResamp} argument.
1168#'
1169#' If leave-one-out or out-of-bag resampling was specified, plots cannot be
1170#' produced (see the \code{method} argument of \code{\link{rfeControl}})
1171#'
1172#' @aliases xyplot.rfe stripplot.rfe densityplot.rfe histogram.rfe
1173#' @param x An object produced by \code{\link{rfe}}
1174#' @param data This argument is not used
1175#' @param metric A character string specifying the single performance metric
1176#' that will be plotted
1177#' @param \dots arguments to pass to either
1178#' \code{\link[lattice:histogram]{histogram}},
1179#' \code{\link[lattice:histogram]{densityplot}},
1180#' \code{\link[lattice:xyplot]{xyplot}} or
1181#' \code{\link[lattice:xyplot]{stripplot}}
1182#' @return A lattice plot object
1183#' @author Max Kuhn
1184#' @seealso \code{\link{rfe}}, \code{\link{rfeControl}},
1185#' \code{\link[lattice:histogram]{histogram}},
1186#' \code{\link[lattice:histogram]{densityplot}},
1187#' \code{\link[lattice:xyplot]{xyplot}},
1188#' \code{\link[lattice:xyplot]{stripplot}}
1189#' @keywords hplot
1190#' @examples
1191#'
1192#' \dontrun{
1193#' library(mlbench)
1194#' n <- 100
1195#' p <- 40
1196#' sigma <- 1
1197#' set.seed(1)
1198#' sim <- mlbench.friedman1(n, sd = sigma)
1199#' x <- cbind(sim$x,  matrix(rnorm(n * p), nrow = n))
1200#' y <- sim$y
1201#' colnames(x) <- paste("var", 1:ncol(x), sep = "")
1202#'
1203#' normalization <- preProcess(x)
1204#' x <- predict(normalization, x)
1205#' x <- as.data.frame(x, stringsAsFactors = TRUE)
1206#' subsets <- c(10, 15, 20, 25)
1207#'
1208#' ctrl <- rfeControl(
1209#'                    functions = lmFuncs,
1210#'                    method = "cv",
1211#'                    verbose = FALSE,
1212#'                    returnResamp = "all")
1213#'
1214#' lmProfile <- rfe(x, y,
1215#'                  sizes = subsets,
1216#'                  rfeControl = ctrl)
1217#' xyplot(lmProfile)
1218#' stripplot(lmProfile)
1219#'
1220#' histogram(lmProfile)
1221#' densityplot(lmProfile)
1222#' }
1223#'
1224#' @importFrom stats as.formula
1225#' @export
1226densityplot.rfe <- function(x,
1227                            data = NULL,
1228                            metric = x$metric,
1229                            ...)
1230{
1231  if (!is.null(match.call()$data))
1232    warning("explicit 'data' specification ignored")
1233
1234  if(x$control$method %in%  c("oob", "LOOCV"))
1235    stop("Resampling plots cannot be done with leave-out-out CV or out-of-bag resampling")
1236
1237  data <- as.data.frame(x$resample, stringsAsFactors = TRUE)
1238  data$Variable <- factor(data$Variable,
1239                          levels = paste(sort(unique(data$Variable))))
1240
1241  form <- as.formula(paste("~", metric, "|Variable"))
1242  densityplot(form, data = data, ...)
1243}
1244
1245#' @importFrom stats as.formula
1246#' @export
1247histogram.rfe <- function(x,
1248                          data = NULL,
1249                          metric = x$metric,
1250                          ...)
1251{
1252  if (!is.null(match.call()$data))
1253    warning("explicit 'data' specification ignored")
1254
1255  if(x$control$method %in%  c("oob", "LOOCV"))
1256    stop("Resampling plots cannot be done with leave-out-out CV or out-of-bag resampling")
1257
1258  data <- as.data.frame(x$resample, stringsAsFactors = TRUE)
1259  data$Variable <- factor(data$Variable,
1260                          levels = paste(sort(unique(data$Variable))))
1261
1262  form <- as.formula(paste("~", metric, "|Variable"))
1263  histogram(form, data = data, ...)
1264}
1265
1266#' @importFrom stats as.formula
1267#' @export
1268stripplot.rfe <- function(x,
1269                          data = NULL,
1270                          metric = x$metric,
1271                          ...)
1272{
1273  if (!is.null(match.call()$data))
1274    warning("explicit 'data' specification ignored")
1275
1276  if(x$control$method %in%  c("oob", "LOOCV"))
1277    stop("Resampling plots cannot be done with leave-out-out CV or out-of-bag resampling")
1278
1279  data <- as.data.frame(x$resample, stringsAsFactors = TRUE)
1280  data$Variable <- factor(data$Variable,
1281                          levels = paste(sort(unique(data$Variable))))
1282  theDots <- list(...)
1283  if(any(names(theDots) == "horizontal"))
1284  {
1285    formText <- if(theDots$horizontal) paste("Variable ~", metric) else paste(metric, "~ Variable")
1286  } else  formText <- paste("Variable ~", metric)
1287
1288  form <- as.formula(formText)
1289
1290  stripplot(form, data = data, ...)
1291
1292}
1293
1294#' @importFrom stats as.formula
1295#' @export
1296xyplot.rfe <- function(x,
1297                       data = NULL,
1298                       metric = x$metric,
1299                       ...)
1300{
1301  if (!is.null(match.call()$data))
1302    warning("explicit 'data' specification ignored")
1303
1304  if(x$control$method %in%  c("oob", "LOOCV"))
1305    stop("Resampling plots cannot be done with leave-out-out CV or out-of-bag resampling")
1306
1307  data <- as.data.frame(x$resample, stringsAsFactors = TRUE)
1308
1309  form <- as.formula(paste(metric, " ~ Variables"))
1310  xyplot(form, data = data, ...)
1311}
1312
1313######################################################################
1314######################################################################
1315## other functions
1316
1317#' @export
1318predictors.rfe <- function(x, ...) x$optVariables
1319
1320#' @export
1321varImp.rfe <- function(object, drop = FALSE, ...)
1322{
1323  imp <- subset(object$variables, Variables == object$optsize)
1324  imp <- ddply(imp[, c("Overall", "var")], .(var), function(x) mean(x$Overall, rm.na = TRUE))
1325  names(imp)[2] <- "Overall"
1326
1327  if(drop) imp <- subset(imp, var %in% object$optVar)
1328  rownames(imp) <- imp$var
1329  imp$var <- NULL
1330  imp[order(-imp$Overall),,drop = FALSE]
1331}
1332
1333#' @importFrom stats .checkMFClasses delete.response model.frame model.matrix na.omit
1334#' @export
1335predict.rfe <- function(object, newdata, ...) {
1336  if(length(list(...)) > 0)
1337    warning("... are ignored for predict.rfe")
1338
1339  if(inherits(object, "rfe.formula")) {
1340    newdata <- as.data.frame(newdata, stringsAsFactors = FALSE)
1341    rn <- row.names(newdata)
1342    Terms <- delete.response(object$terms)
1343    m <- model.frame(Terms, newdata, na.action = na.omit,
1344                     xlev = object$xlevels)
1345    if (!is.null(cl <- attr(Terms, "dataClasses")))
1346      .checkMFClasses(cl, m)
1347    keep <- match(row.names(m), rn)
1348    newdata <- model.matrix(Terms, m, contrasts = object$contrasts)
1349    xint <- match("(Intercept)", colnames(newdata), nomatch = 0)
1350    if (xint > 0)  newdata <- newdata[, -xint, drop = FALSE]
1351  } else {
1352    if (any(names(object) == "recipe")) {
1353      newdata <-
1354        bake(object$recipe, newdata, all_predictors(), composition = "data.frame")
1355    }
1356  }
1357  checkCols <- object$optVar %in% colnames(newdata)
1358  if(!all(checkCols))
1359    stop(paste("missing columns from newdata:",
1360               paste(object$optVar[!checkCols], collapse = ", ")))
1361
1362  newdata <- newdata[, object$optVar, drop = FALSE]
1363  object$control$functions$pred(object$fit, newdata)
1364}
1365
1366#' @rdname rfe
1367#' @method update rfe
1368#' @export
1369update.rfe <- function(object, x, y, size, ...) {
1370  size <- size[1]
1371  selectedVars <- object$variables
1372  bestVar <- object$control$functions$selectVar(selectedVars, size)
1373
1374  if (!is.null(object$recipe)) {
1375    if (is.null(object$recipe$template))
1376      stop("Recipe is missing data to be juiced.", call. = FALSE)
1377    args <-
1378      list(
1379        x = juice(object$recipe, all_predictors(), composition = "data.frame"),
1380        y = juice(object$recipe, all_outcomes(), composition = "data.frame"),
1381        first = FALSE, last = TRUE
1382      )
1383    args$y <- args$y[,1]
1384  } else {
1385    args <-
1386      list(x = x, y = y, first = FALSE, last = TRUE)
1387  }
1388  args$x <- args$x[, bestVar, drop = FALSE]
1389
1390  if (length(object$dots) > 0)
1391    args <- c(args, object$dots)
1392
1393  object$fit <- do.call(object$control$functions$fit, args)
1394
1395  object$bestSubset <- size
1396  object$bestVar <- bestVar
1397
1398  if (object$control$returnResamp == "final") {
1399    warning("The saved resamples are no longer appropriate and were removed")
1400    object$resampledCM <- object$resample <- NULL
1401  }
1402  object
1403}
1404
1405
1406repair_rank <- function(imp, nms, fill = -Inf) {
1407  no_val <- !(nms %in% imp$var)
1408  missing_rows <- imp[rep(1, sum(no_val)),]
1409  missing_rows$var <- nms[no_val]
1410  other_col <- colnames(imp)[colnames(imp) != "var"]
1411  for(i in other_col) missing_rows[, i] <- NA
1412  out <- rbind(imp, missing_rows)
1413  rownames(out) <- NULL
1414  out
1415}
1416
1417###################################################################
1418
1419rfe_rec <- function(x, y, test_x, test_y, perf_dat,
1420                    sizes, rfeControl = rfeControl(),
1421                    label = "", seeds = NA, ...) {
1422  p <- ncol(x)
1423
1424  if (length(sizes) > 0 && max(sizes) > p)
1425    sizes <- sizes[sizes <= p]
1426
1427  if (all(sizes < 2))
1428    stop(
1429      "After the recipe, there are less than two predictors remaining. `rfe` ",
1430      "requires at least two.",
1431      call. = FALSE
1432    )
1433
1434  if (length(sizes) == 0)
1435    stop(
1436      "After the recipe, there are only ",
1437      p,
1438      " predictors remaining. ",
1439      "The `sizes` values are inconsistent with this.",
1440      call. = FALSE
1441    )
1442
1443  predictionMatrix <-
1444    matrix(NA, nrow = length(test_y), ncol = length(sizes))
1445
1446  retained <- colnames(x)
1447  sizeValues <- sort(unique(c(sizes, p)), decreasing = TRUE)
1448  sizeText <- format(sizeValues)
1449
1450  finalVariables <- vector(length(sizeValues), mode = "list")
1451  for (k in seq(along = sizeValues)) {
1452    if (!any(is.na(seeds)))
1453      set.seed(seeds[k])
1454
1455    if (rfeControl$verbose) {
1456      cat("+(rfe) fit",
1457          ifelse(label != "",
1458                 label, ""),
1459          "size:",
1460          sizeText[k],
1461          "\n")
1462    }
1463    flush.console()
1464    fitObject <-
1465      rfeControl$functions$fit(
1466        x[, retained, drop = FALSE], y,
1467        first = p == ncol(x[, retained, drop = FALSE]),
1468        last = FALSE,
1469        ...
1470      )
1471    if (rfeControl$verbose) {
1472      cat("-(rfe) fit",
1473          ifelse(label != "",
1474                 label, ""),
1475          "size:",
1476          sizeText[k],
1477          "\n")
1478    }
1479    modelPred <-
1480      rfeControl$functions$pred(fitObject, test_x[, retained, drop = FALSE])
1481    if (is.data.frame(modelPred) | is.matrix(modelPred)) {
1482      if (is.matrix(modelPred)) {
1483        modelPred <- as.data.frame(modelPred, stringsAsFactors = TRUE)
1484        ## in the case where the function returns a matrix with a single column
1485        ## make sure that it is named pred
1486        if (ncol(modelPred) == 1)
1487          names(modelPred) <- "pred"
1488      }
1489      modelPred$obs <- test_y
1490      modelPred$Variables <- sizeValues[k]
1491    } else
1492      modelPred <-
1493      data.frame(pred = modelPred,
1494                 obs = test_y,
1495                 Variables = sizeValues[k])
1496    ## save as a vector and rbind at end
1497    rfePred <- if (k == 1)
1498      modelPred
1499    else
1500      rbind(rfePred, modelPred)
1501
1502
1503    if (!exists("modImp")) {
1504      ##todo: get away from this since it finds object in other spaces
1505
1506      if (rfeControl$verbose){
1507        cat("+(rfe) imp",
1508            ifelse(label != "",
1509                   label, ""), "\n")
1510      }
1511      modImp <-
1512        rfeControl$functions$rank(fitObject, x[, retained, drop = FALSE], y)
1513      if (rfeControl$verbose){
1514        cat("-(rfe) imp",
1515            ifelse(label != "",
1516                   label, ""), "\n")
1517      }
1518    } else {
1519      if (rfeControl$rerank){
1520        if (rfeControl$verbose){
1521          cat("+(rfe) imp",
1522              ifelse(label != "",
1523                     label, ""),
1524              "size:",
1525              sizeText[k],
1526              "\n")
1527        }
1528        modImp <-
1529          rfeControl$functions$rank(fitObject, x[, retained, drop = FALSE], y)
1530        if (rfeControl$verbose){
1531          cat("-(rfe) imp",
1532              ifelse(label != "",
1533                     label, ""),
1534              "size:",
1535              sizeText[k],
1536              "\n")
1537        }
1538      }
1539    }
1540
1541    if (nrow(modImp) < sizeValues[k]) {
1542      msg1 <- paste0(
1543        "rfe is expecting ",
1544        sizeValues[k],
1545        " importance values but only has ",
1546        nrow(modImp),
1547        ". ",
1548        "This may be caused by having zero-variance predictors, ",
1549        "excessively-correlated predictors, factor predictors ",
1550        "that were expanded into dummy variables or you may have ",
1551        "failed to drop one of your dummy variables."
1552      )
1553      warning(msg1, call. = FALSE)
1554      modImp <- repair_rank(modImp, colnames(x))
1555    }
1556    if (any(!complete.cases(modImp))) {
1557      warning(
1558        paste(
1559          "There were missing importance values.",
1560          "There may be linear dependencies in your predictor variables"
1561        ),
1562        call. = FALSE
1563      )
1564    }
1565    finalVariables[[k]] <- subset(modImp, var %in% retained)
1566    finalVariables[[k]]$Variables <- sizeValues[[k]]
1567    if (k < length(sizeValues))
1568      retained <- as.character(modImp$var)[1:sizeValues[k + 1]]
1569  }
1570  list(finalVariables = finalVariables, pred = rfePred)
1571}
1572
1573#' @method rfe recipe
1574#' @rdname rfe
1575#' @export
1576"rfe.recipe" <-
1577  function(x,
1578           data,
1579           sizes = 2 ^ (2:4),
1580           metric = NULL,
1581           maximize = NULL,
1582           rfeControl = rfeControl(),
1583           ...) {
1584    startTime <- proc.time()
1585    funcCall <- match.call(expand.dots = TRUE)
1586    if (!("caret" %in% loadedNamespaces()))
1587      loadNamespace("caret")
1588
1589    ###################################################################
1590
1591    if(rfeControl$verbose)
1592      cat("Preparing recipe\n")
1593
1594    trained_rec <- prep(x, training = data,
1595                        fresh = TRUE,
1596                        retain = TRUE,
1597                        verbose = FALSE,
1598                        stringsAsFactors = TRUE)
1599    x_dat <- juice(trained_rec, all_predictors(), composition = "data.frame")
1600    y_dat <- juice(trained_rec, all_outcomes(), composition = "data.frame")
1601    if(ncol(y_dat) > 1)
1602      stop("`rfe` doesn't support multivariate outcomes", call. = FALSE)
1603    y_dat <- y_dat[[1]]
1604    is_weight <- summary(trained_rec)$role == "case weight"
1605    if(any(is_weight))
1606      stop("`rfe` does not allow for weights.", call. = FALSE)
1607
1608    is_perf <- summary(trained_rec)$role == "performance var"
1609    if(any(is_perf)) {
1610      perf_data <- juice(trained_rec, has_role("performance var"))
1611    } else perf_data <- NULL
1612
1613    p <- ncol(x_dat)
1614    classLevels <- levels(y_dat)
1615
1616    # now do default metrics:
1617    if (is.null(metric))
1618      metric <- ifelse(is.factor(y_dat), "Accuracy", "RMSE")
1619
1620    maximize <-
1621      ifelse(metric %in% c("RMSE", "MAE", "logLoss"), FALSE, TRUE) # TODO make a function
1622
1623
1624
1625    if (is.null(rfeControl$index))
1626      rfeControl$index <- switch(
1627        tolower(rfeControl$method),
1628        cv = createFolds(y_dat, rfeControl$number, returnTrain = TRUE),
1629        repeatedcv = createMultiFolds(y_dat, rfeControl$number, rfeControl$repeats),
1630        loocv = createFolds(y_dat, length(y_dat), returnTrain = TRUE),
1631        boot = ,
1632        boot632 = createResample(y_dat, rfeControl$number),
1633        test = createDataPartition(y_dat, 1, rfeControl$p),
1634        lgocv = createDataPartition(y_dat, rfeControl$number, rfeControl$p)
1635      )
1636
1637    if (is.null(names(rfeControl$index)))
1638      names(rfeControl$index) <- prettySeq(rfeControl$index)
1639    if (is.null(rfeControl$indexOut)) {
1640      rfeControl$indexOut <- lapply(rfeControl$index,
1641                                    function(training, allSamples)
1642                                      allSamples[-unique(training)],
1643                                    allSamples = seq(along = y_dat))
1644      names(rfeControl$indexOut) <- prettySeq(rfeControl$indexOut)
1645    }
1646
1647    sizes <- sort(unique(sizes))
1648    if (any(sizes > p))
1649      warning("For the training set, the recipe generated fewer predictors ",
1650              "than the ", max(sizes), " expected in `sizes` and the number ",
1651              "of subsets will be truncated to be <= ", p, ".",
1652              call. = FALSE)
1653    sizes <- sizes[sizes <= p]
1654
1655    ## check summary function and metric
1656    testOutput <- data.frame(pred = sample(y_dat, min(10, length(y_dat))),
1657                             obs = sample(y_dat, min(10, length(y_dat))))
1658    if (is.factor(y_dat)) {
1659      for (i in seq(along = classLevels))
1660        testOutput[, classLevels[i]] <- runif(nrow(testOutput))
1661    }
1662    if(!is.null(perf_data))
1663      testOutput <- cbind(testOutput, perf_data)
1664
1665
1666    test <-
1667      rfeControl$functions$summary(testOutput, lev = classLevels)
1668    perfNames <- names(test)
1669
1670    if (!(metric %in% perfNames)) {
1671      warning(
1672        paste(
1673          "Metric '",
1674          metric,
1675          "' is not created by the summary function; '",
1676          perfNames[1],
1677          "' will be used instead",
1678          sep = ""
1679        )
1680      )
1681      metric <- perfNames[1]
1682    }
1683
1684    ## Set or check the seeds when needed
1685    totalSize <-
1686      if (any(sizes == p))
1687        length(sizes)
1688    else
1689      length(sizes) + 1
1690    if (is.null(rfeControl$seeds)) {
1691      seeds <- vector(mode = "list", length = length(rfeControl$index))
1692      seeds <-
1693        lapply(seeds, function(x)
1694          sample.int(n = 1000000, size = totalSize))
1695      seeds[[length(rfeControl$index) + 1]] <-
1696        sample.int(n = 1000000, size = 1)
1697      rfeControl$seeds <- seeds
1698    } else {
1699      if (!(length(rfeControl$seeds) == 1 && is.na(rfeControl$seeds))) {
1700        ## check versus number of tasks
1701        numSeeds <- unlist(lapply(rfeControl$seeds, length))
1702        badSeed <-
1703          (length(rfeControl$seeds) < length(rfeControl$index) + 1) ||
1704          (any(numSeeds[-length(numSeeds)] < totalSize))
1705        if (badSeed)
1706          stop(
1707            paste(
1708              "Bad seeds: the seed object should be a list of length",
1709              length(rfeControl$index) + 1,
1710              "with",
1711              length(rfeControl$index),
1712              "integer vectors of size",
1713              totalSize,
1714              "and the last list element having a",
1715              "single integer"
1716            )
1717          )
1718      }
1719    }
1720
1721    if (rfeControl$method == "LOOCV") {
1722      tmp <-
1723        rfe_rec_loo(
1724          rec = x,
1725          data = data,
1726          sizes = sizes,
1727          ctrl = rfeControl,
1728          lev = classLevels,
1729          ...
1730        )
1731      selectedVars <-
1732        do.call("c", tmp$everything[names(tmp$everything) == "finalVariables"])
1733      selectedVars <- do.call("rbind", selectedVars)
1734      externPerf <- tmp$performance
1735    } else {
1736      tmp <-
1737        rfe_rec_workflow(
1738          rec = x,
1739          data = data,
1740          sizes = sizes,
1741          ctrl = rfeControl,
1742          lev = classLevels,
1743          ...
1744        )
1745
1746      selectedVars <-
1747        do.call("rbind", tmp$everything[names(tmp$everything) == "selectedVars"])
1748      resamples <-
1749        do.call("rbind", tmp$everything[names(tmp$everything) == "resamples"])
1750      rownames(resamples) <- NULL
1751      externPerf <- tmp$performance
1752    }
1753    rownames(selectedVars) <- NULL
1754
1755    ## There may be variables selected that are not generated by the recipe
1756    ## created on the traning set.
1757
1758    all_var <- as.character(unique(selectedVars$var))
1759    x_names <- colnames(x_dat)
1760    orphans <- all_var[!(all_var %in% x_names)]
1761
1762    externPerf <- subset(externPerf, Variables <= length(x_names))
1763
1764    numResamples <- length(rfeControl$index)
1765    bestSubset <-
1766      rfeControl$functions$selectSize(
1767        x = subset(externPerf, Num_Resamples >= floor(.5*numResamples)),
1768        metric = metric,
1769        maximize = maximize
1770      )
1771
1772    bestVar <-
1773      rfeControl$functions$selectVar(subset(selectedVars, var %in% x_names), bestSubset)
1774    # In case of orpahns:
1775    bestVar <- bestVar[!is.na(bestVar)]
1776    bestSubset <- length(bestVar)
1777
1778    finalTime <-
1779      system.time(
1780        fit <- rfeControl$functions$fit(
1781          x_dat[, bestVar, drop = FALSE],
1782          y_dat,
1783          first = FALSE,
1784          last = TRUE,
1785          ...
1786        )
1787      )
1788
1789    if (is.factor(y_dat) & any(names(tmp$performance) == ".cell1")) {
1790      keepers <-
1791        c("Resample",
1792          "Variables",
1793          grep("\\.cell", names(tmp$performance), value = TRUE))
1794      resampledCM <-
1795        subset(tmp$performance, Variables == bestSubset)
1796      tmp$performance <-
1797        tmp$performance[,-grep("\\.cell", names(tmp$performance))]
1798    } else
1799      resampledCM <- NULL
1800
1801    if (!(rfeControl$method %in% c("LOOCV"))) {
1802      resamples <- switch(
1803        rfeControl$returnResamp,
1804        none = NULL,
1805        all = resamples,
1806        final = subset(resamples, Variables == bestSubset)
1807      )
1808    } else
1809      resamples <- NULL
1810
1811    endTime <- proc.time()
1812    times <- list(everything = endTime - startTime,
1813                  final = finalTime)
1814
1815    #########################################################################
1816    ## Now, based on probability or static ranking, figure out the best vars
1817    ## and the best subset size and fit final model
1818
1819    out <- structure(
1820      list(
1821        pred = if (rfeControl$saveDetails)
1822          do.call("rbind", tmp$everything[names(tmp$everything) == "predictions"])
1823        else
1824          NULL,
1825        variables = selectedVars,
1826        results = as.data.frame(externPerf, stringsAsFactors = FALSE),
1827        bestSubset = bestSubset,
1828        fit = fit,
1829        optVariables = bestVar,
1830        optsize = bestSubset,
1831        call = funcCall,
1832        control = rfeControl,
1833        resample = resamples,
1834        metric = metric,
1835        maximize = maximize,
1836        perfNames = perfNames,
1837        times = times,
1838        resampledCM = resampledCM,
1839        obsLevels = classLevels,
1840        dots = list(...),
1841        recipe = trained_rec
1842      ),
1843      class = "rfe"
1844    )
1845    if (rfeControl$timingSamps > 0) {
1846      out$times$prediction <-
1847        system.time(predict(out, x_dat[1:min(nrow(x_dat), rfeControl$timingSamps), , drop = FALSE]))
1848    } else
1849      out$times$prediction <- rep(NA, 3)
1850    out
1851  }
1852
1853
1854rfe_rec_workflow <- function(rec, data, sizes, ctrl, lev, ...) {
1855  loadNamespace("caret")
1856  loadNamespace("recipes")
1857
1858  resampleIndex <- ctrl$index
1859  if (ctrl$method %in% c("boot632")) {
1860    resampleIndex <- c(list("AllData" = rep(0, nrow(data))), resampleIndex)
1861    ctrl$indexOut <-
1862      c(list("AllData" = rep(0, nrow(data))),  ctrl$indexOut)
1863  }
1864
1865  `%op%` <- getOper(ctrl$allowParallel && foreach::getDoParWorkers() > 1)
1866  result <-
1867    foreach(
1868      iter = seq(along = resampleIndex),
1869      .combine = "c",
1870      .verbose = FALSE,
1871      .errorhandling = "stop",
1872      .packages = "caret"
1873    ) %op% {
1874      loadNamespace("caret")
1875      requireNamespace("plyr")
1876      requireNamespace("methods")
1877      loadNamespace("recipes")
1878
1879      if (names(resampleIndex)[iter] != "AllData") {
1880        modelIndex <- resampleIndex[[iter]]
1881        holdoutIndex <- ctrl$indexOut[[iter]]
1882      } else {
1883        modelIndex <- 1:nrow(data)
1884        holdoutIndex <- modelIndex
1885      }
1886
1887      seeds <-
1888        if (!(length(ctrl$seeds) == 1 &&
1889              is.na(ctrl$seeds)))
1890          ctrl$seeds[[iter]] else
1891            NA
1892
1893      if (ctrl$verbose)
1894        cat("+(rfe)",
1895            names(resampleIndex)[iter],
1896            "recipe",
1897            "\n")
1898
1899      trained_rec <- prep(
1900        rec, training = data[modelIndex,,drop = FALSE], fresh = TRUE,
1901        verbose = FALSE, stringsAsFactors = TRUE,
1902        retain = TRUE
1903      )
1904
1905      x <- juice(trained_rec, all_predictors(), composition = "data.frame")
1906      y <- juice(trained_rec, all_outcomes())[[1]]
1907      test_x <- bake(
1908        trained_rec,
1909        new_data = data[-modelIndex, , drop = FALSE],
1910        all_predictors(),
1911        composition = "data.frame"
1912      )
1913      test_y <- bake(
1914        trained_rec,
1915        new_data = data[-modelIndex, , drop = FALSE],
1916        all_outcomes()
1917      )[[1]]
1918
1919      is_perf <- summary(trained_rec)$role == "performance var"
1920      if(any(is_perf)) {
1921        test_perf <- bake(
1922          trained_rec,
1923          new_data = data[-modelIndex, , drop = FALSE],
1924          has_role("performance var"),
1925          composition = "data.frame"
1926        )
1927      } else test_perf <- NULL
1928
1929      p <- ncol(x)
1930
1931      if(length(sizes) > 0 && max(sizes) > p)
1932        sizes <- sizes[sizes <= p]
1933
1934      if (all(sizes < 2))
1935        stop(
1936          "After the recipe, there are less than two predictors remaining. `rfe` ",
1937          "requires at least two.",
1938          call. = FALSE
1939        )
1940
1941      if (length(sizes) == 0)
1942        stop(
1943          "After the recipe, there are only ",
1944          p,
1945          " predictors remaining. ",
1946          "The `sizes` values are inconsistent with this.",
1947          call. = FALSE
1948        )
1949
1950      if (ctrl$verbose)
1951        cat("-(rfe)",
1952            names(resampleIndex)[iter],
1953            "recipe",
1954            "\n")
1955
1956      rfeResults <- rfe_rec(
1957        x, y,
1958        test_x, test_y,
1959        test_perf,
1960        sizes, ctrl,
1961        label = names(resampleIndex)[iter],
1962        seeds = seeds,
1963        ...
1964      )
1965      resamples <-
1966        plyr::ddply(rfeResults$pred,
1967                    .(Variables),
1968                    ctrl$functions$summary,
1969                    lev = lev)
1970
1971      if (ctrl$saveDetails) {
1972        rfeResults$pred$Resample <- names(resampleIndex)[iter]
1973        ## If the user did not have nrow(x) in 'sizes', rfeIter added it.
1974        ## So, we need to find out how many set of predictions there are:
1975        nReps <- length(table(rfeResults$pred$Variables))
1976        rfeResults$pred$rowIndex <-
1977          rep(seq(along = y)[unique(holdoutIndex)], nReps)
1978      }
1979
1980      if (is.factor(y) && length(lev) <= 50) {
1981        cells <-
1982          plyr::ddply(rfeResults$pred, .(Variables), function(x)
1983            flatTable(x$pred, x$obs))
1984        resamples <- merge(resamples, cells)
1985      }
1986
1987      resamples$Resample <- names(resampleIndex)[iter]
1988      vars <- do.call("rbind", rfeResults$finalVariables)
1989      vars$Resample <- names(resampleIndex)[iter]
1990      list(
1991        resamples = resamples,
1992        selectedVars = vars,
1993        predictions = if (ctrl$saveDetails)
1994          rfeResults$pred else NULL
1995      )
1996    }
1997
1998  resamples <-
1999    do.call("rbind", result[names(result) == "resamples"])
2000  rownames(resamples) <- NULL
2001
2002  if (ctrl$method %in% c("boot632")) {
2003    perfNames <- names(resamples)
2004    perfNames <-
2005      perfNames[!(perfNames %in% c("Resample", "Variables"))]
2006    perfNames <- perfNames[!grepl("^cell[0-9]", perfNames)]
2007    apparent <- subset(resamples, Resample == "AllData")
2008    apparent <-
2009      apparent[, !grepl("^\\.cell|Resample", colnames(apparent)), drop = FALSE]
2010    names(apparent)[which(names(apparent) %in% perfNames)] <-
2011      paste(names(apparent)[which(names(apparent) %in% perfNames)],
2012            "Apparent", sep = "")
2013    names(apparent) <- gsub("^\\.", "", names(apparent))
2014    resamples <- subset(resamples, Resample != "AllData")
2015  }
2016
2017  externPerf <-
2018    plyr::ddply(resamples[, !grepl("\\.cell|Resample", colnames(resamples)), drop = FALSE],
2019                .(Variables),
2020                MeanSD,
2021                exclude = "Variables")
2022  numVars <-
2023    plyr::ddply(resamples[, !grepl("\\.cell|Resample", colnames(resamples)), drop = FALSE],
2024                .(Variables),
2025                function(x) c(Num_Resamples = nrow(x)))
2026
2027  externPerf <- merge(externPerf, numVars, by = "Variables", all = TRUE)
2028  externPerf <- externPerf[order(externPerf$Variables),, drop = FALSE]
2029
2030  if (ctrl$method %in% c("boot632")) {
2031    externPerf <- merge(externPerf, apparent)
2032    for (p in seq(along = perfNames)) {
2033      const <- 1 - exp(-1)
2034      externPerf[, perfNames[p]] <-
2035        (const * externPerf[, perfNames[p]]) +  ((1 - const) * externPerf[, paste(perfNames[p], "Apparent", sep = "")])
2036    }
2037    externPerf <-
2038      externPerf[,!(names(externPerf) %in% paste(perfNames, "Apparent", sep = ""))]
2039  }
2040  list(performance = externPerf, everything = result)
2041}
2042
2043rfe_rec_loo <- function(rec, data, sizes, ctrl, lev, ...) {
2044  loadNamespace("caret")
2045  loadNamespace("recipes")
2046
2047  resampleIndex <- ctrl$index
2048  `%op%` <- getOper(ctrl$allowParallel && getDoParWorkers() > 1)
2049  result <-
2050    foreach(
2051      iter = seq(along = resampleIndex),
2052      .combine = "c",
2053      .verbose = FALSE,
2054      .errorhandling = "stop",
2055      .packages = "caret"
2056    ) %op% {
2057
2058      loadNamespace("caret")
2059      loadNamespace("recipes")
2060
2061      requireNamespaceQuietStop("methods")
2062
2063      modelIndex <- resampleIndex[[iter]]
2064      holdoutIndex <- -unique(resampleIndex[[iter]])
2065
2066      seeds <-
2067        if (!(length(ctrl$seeds) == 1 &&
2068              is.na(ctrl$seeds)))
2069          ctrl$seeds[[iter]]  else NA
2070      if(ctrl$verbose)
2071        cat("Preparing recipe\n")
2072      trained_rec <- prep(
2073        rec, training = data[modelIndex,,drop = FALSE], fresh = TRUE,
2074        verbose = FALSE, stringsAsFactors = TRUE,
2075        retain = TRUE
2076      )
2077
2078      x <- juice(trained_rec, all_predictors(), composition = "data.frame")
2079      y <- juice(trained_rec, all_outcomes())[[1]]
2080      test_x <- bake(
2081        trained_rec,
2082        new_data = data[-modelIndex, , drop = FALSE],
2083        all_predictors(),
2084        composition = "data.frame"
2085      )
2086      test_y <- bake(
2087        trained_rec,
2088        new_data = data[-modelIndex, , drop = FALSE],
2089        all_outcomes()
2090      )[[1]]
2091
2092      is_perf <- summary(trained_rec)$role == "performance var"
2093      if(any(is_perf)) {
2094        test_perf <- bake(
2095          trained_rec,
2096          new_data = data[-modelIndex, , drop = FALSE],
2097          has_role("performance var"),
2098          composition = "data.frame"
2099        )
2100      } else test_perf <- NULL
2101
2102      p <- ncol(x)
2103
2104      if(length(sizes) > 0 && max(sizes) > p)
2105        sizes <- sizes[sizes <= p]
2106
2107      if (all(sizes < 2))
2108        stop(
2109          "After the recipe, there are less than two predictors remaining. `rfe` ",
2110          "requires at least two.",
2111          call. = FALSE
2112        )
2113
2114      if (length(sizes) == 0)
2115        stop(
2116          "After the recipe, there are only ",
2117          p,
2118          " predictors remaining. ",
2119          "The `sizes` values are inconsistent with this.",
2120          call. = FALSE
2121        )
2122
2123      rfeResults <- rfe_rec(
2124        x, y,
2125        test_x, test_y,
2126        test_perf,
2127        sizes, ctrl,
2128        label = names(resampleIndex)[iter],
2129        seeds = seeds,
2130        ...
2131      )
2132      rfeResults
2133    }
2134  preds <- do.call("rbind", result[names(result) == "pred"])
2135  resamples <-
2136    ddply(preds, .(Variables), ctrl$functions$summary, lev = lev)
2137  list(performance = resamples, everything = result)
2138}
2139
2140
2141
2142