1#' Backwards Feature Selection 2#' 3#' A simple backwards selection, a.k.a. recursive feature elimination (RFE), 4#' algorithm 5#' 6#' More details on this function can be found at 7#' \url{http://topepo.github.io/caret/recursive-feature-elimination.html}. 8#' 9#' This function implements backwards selection of predictors based on 10#' predictor importance ranking. The predictors are ranked and the less 11#' important ones are sequentially eliminated prior to modeling. The goal is to 12#' find a subset of predictors that can be used to produce an accurate model. 13#' The web page \url{http://topepo.github.io/caret/recursive-feature-elimination.html#rfe} 14#' has more details and examples related to this function. 15#' 16#' \code{rfe} can be used with "explicit parallelism", where different 17#' resamples (e.g. cross-validation group) can be split up and run on multiple 18#' machines or processors. By default, \code{rfe} will use a single processor 19#' on the host machine. As of version 4.99 of this package, the framework used 20#' for parallel processing uses the \pkg{foreach} package. To run the resamples 21#' in parallel, the code for \code{rfe} does not change; prior to the call to 22#' \code{rfe}, a parallel backend is registered with \pkg{foreach} (see the 23#' examples below). 24#' 25#' \code{rfeIter} is the basic algorithm while \code{rfe} wraps these 26#' operations inside of resampling. To avoid selection bias, it is better to 27#' use the function \code{rfe} than \code{rfeIter}. 28#' 29#' When updating a model, if the entire set of resamples were not saved using 30#' \code{rfeControl(returnResamp = "final")}, the existing resamples are 31#' removed with a warning. 32#' 33#' @aliases rfe rfe.default rfeIter predict.rfe update.rfe 34#' @param x A matrix or data frame of predictors for model training. This 35#' object must have unique column names. For the recipes method, \code{x} 36#' is a recipe object. 37#' @param y a vector of training set outcomes (either numeric or factor) 38#' @param testX a matrix or data frame of test set predictors. This must have 39#' the same column names as \code{x} 40#' @param testY a vector of test set outcomes 41#' @param sizes a numeric vector of integers corresponding to the number of 42#' features that should be retained 43#' @param metric a string that specifies what summary metric will be used to 44#' select the optimal model. By default, possible values are "RMSE" and 45#' "Rsquared" for regression and "Accuracy" and "Kappa" for classification. If 46#' custom performance metrics are used (via the \code{functions} argument in 47#' \code{\link{rfeControl}}, the value of \code{metric} should match one of the 48#' arguments. 49#' @param maximize a logical: should the metric be maximized or minimized? 50#' @param rfeControl a list of options, including functions for fitting and 51#' prediction. The web page 52#' \url{http://topepo.github.io/caret/recursive-feature-elimination.html#rfe} has more 53#' details and examples related to this function. 54#' @param object an object of class \code{rfe} 55#' @param size a single integers corresponding to the number of features that 56#' should be retained in the updated model 57#' @param label an optional character string to be printed when in verbose 58#' mode. 59#' @param seeds an optional vector of integers for the size. The vector should 60#' have length of \code{length(sizes)+1} 61#' @param \dots options to pass to the model fitting function (ignored in 62#' \code{predict.rfe}) 63#' @return A list with elements \item{finalVariables}{a list of size 64#' \code{length(sizes) + 1} containing the column names of the ``surviving'' 65#' predictors at each stage of selection. The first element corresponds to all 66#' the predictors (i.e. \code{size = ncol(x)})} \item{pred }{a data frame with 67#' columns for the test set outcome, the predicted outcome and the subset 68#' size.} 69#' @note We using a recipe as an input, there may be some subset 70#' sizes that are not well-replicated over resamples. `rfe` method 71#' will only consider subset sizes where at least half of the 72#' resamples have associated results in the search for an optimal 73#' subset size. 74#' @author Max Kuhn 75#' @seealso \code{\link{rfeControl}} 76#' @keywords models 77#' @examples 78#' 79#' \dontrun{ 80#' data(BloodBrain) 81#' 82#' x <- scale(bbbDescr[,-nearZeroVar(bbbDescr)]) 83#' x <- x[, -findCorrelation(cor(x), .8)] 84#' x <- as.data.frame(x, stringsAsFactors = TRUE) 85#' 86#' set.seed(1) 87#' lmProfile <- rfe(x, logBBB, 88#' sizes = c(2:25, 30, 35, 40, 45, 50, 55, 60, 65), 89#' rfeControl = rfeControl(functions = lmFuncs, 90#' number = 200)) 91#' set.seed(1) 92#' lmProfile2 <- rfe(x, logBBB, 93#' sizes = c(2:25, 30, 35, 40, 45, 50, 55, 60, 65), 94#' rfeControl = rfeControl(functions = lmFuncs, 95#' rerank = TRUE, 96#' number = 200)) 97#' 98#' xyplot(lmProfile$results$RMSE + lmProfile2$results$RMSE ~ 99#' lmProfile$results$Variables, 100#' type = c("g", "p", "l"), 101#' auto.key = TRUE) 102#' 103#' rfProfile <- rfe(x, logBBB, 104#' sizes = c(2, 5, 10, 20), 105#' rfeControl = rfeControl(functions = rfFuncs)) 106#' 107#' bagProfile <- rfe(x, logBBB, 108#' sizes = c(2, 5, 10, 20), 109#' rfeControl = rfeControl(functions = treebagFuncs)) 110#' 111#' set.seed(1) 112#' svmProfile <- rfe(x, logBBB, 113#' sizes = c(2, 5, 10, 20), 114#' rfeControl = rfeControl(functions = caretFuncs, 115#' number = 200), 116#' ## pass options to train() 117#' method = "svmRadial") 118#' 119#' ## classification 120#' 121#' data(mdrr) 122#' mdrrDescr <- mdrrDescr[,-nearZeroVar(mdrrDescr)] 123#' mdrrDescr <- mdrrDescr[, -findCorrelation(cor(mdrrDescr), .8)] 124#' 125#' set.seed(1) 126#' inTrain <- createDataPartition(mdrrClass, p = .75, list = FALSE)[,1] 127#' 128#' train <- mdrrDescr[ inTrain, ] 129#' test <- mdrrDescr[-inTrain, ] 130#' trainClass <- mdrrClass[ inTrain] 131#' testClass <- mdrrClass[-inTrain] 132#' 133#' set.seed(2) 134#' ldaProfile <- rfe(train, trainClass, 135#' sizes = c(1:10, 15, 30), 136#' rfeControl = rfeControl(functions = ldaFuncs, method = "cv")) 137#' plot(ldaProfile, type = c("o", "g")) 138#' 139#' postResample(predict(ldaProfile, test), testClass) 140#' 141#' } 142#' 143#' ####################################### 144#' ## Parallel Processing Example via multicore 145#' 146#' \dontrun{ 147#' library(doMC) 148#' 149#' ## Note: if the underlying model also uses foreach, the 150#' ## number of cores specified above will double (along with 151#' ## the memory requirements) 152#' registerDoMC(cores = 2) 153#' 154#' set.seed(1) 155#' lmProfile <- rfe(x, logBBB, 156#' sizes = c(2:25, 30, 35, 40, 45, 50, 55, 60, 65), 157#' rfeControl = rfeControl(functions = lmFuncs, 158#' number = 200)) 159#' 160#' 161#' } 162#' 163#' 164#' @export rfe 165rfe <- function (x, ...) UseMethod("rfe") 166 167#' @rdname rfe 168#' @method rfe default 169#' @importFrom stats predict runif 170#' @export 171"rfe.default" <- 172 function(x, y, 173 sizes = 2^(2:4), 174 metric = ifelse(is.factor(y), "Accuracy", "RMSE"), 175 maximize = ifelse(metric %in% c("RMSE", "MAE", "logLoss"), FALSE, TRUE), 176 rfeControl = rfeControl(), ...) 177 { 178 startTime <- proc.time() 179 funcCall <- match.call(expand.dots = TRUE) 180 if(!("caret" %in% loadedNamespaces())) loadNamespace("caret") 181 182 if(nrow(x) != length(y)) stop("there should be the same number of samples in x and y") 183 numFeat <- ncol(x) 184 classLevels <- levels(y) 185 186 if(is.null(rfeControl$index)) 187 rfeControl$index <- switch(tolower(rfeControl$method), 188 cv = createFolds(y, rfeControl$number, returnTrain = TRUE), 189 repeatedcv = createMultiFolds(y, rfeControl$number, rfeControl$repeats), 190 loocv = createFolds(y, length(y), returnTrain = TRUE), 191 boot =, boot632 = createResample(y, rfeControl$number), 192 test = createDataPartition(y, 1, rfeControl$p), 193 lgocv = createDataPartition(y, rfeControl$number, rfeControl$p)) 194 195 if(is.null(names(rfeControl$index))) names(rfeControl$index) <- prettySeq(rfeControl$index) 196 if(is.null(rfeControl$indexOut)){ 197 rfeControl$indexOut <- lapply(rfeControl$index, 198 function(training, allSamples) allSamples[-unique(training)], 199 allSamples = seq(along = y)) 200 names(rfeControl$indexOut) <- prettySeq(rfeControl$indexOut) 201 } 202 203 sizes <- sort(unique(sizes)) 204 sizes <- sizes[sizes <= ncol(x)] 205 206 ## check summary function and metric 207 testOutput <- data.frame(pred = sample(y, min(10, length(y))), 208 obs = sample(y, min(10, length(y)))) 209 210 if(is.factor(y)) 211 { 212 for(i in seq(along = classLevels)) testOutput[, classLevels[i]] <- runif(nrow(testOutput)) 213 } 214 215 test <- rfeControl$functions$summary(testOutput, lev = classLevels) 216 perfNames <- names(test) 217 218 if(!(metric %in% perfNames)) 219 { 220 warning(paste("Metric '", metric, "' is not created by the summary function; '", 221 perfNames[1], "' will be used instead", sep = "")) 222 metric <- perfNames[1] 223 } 224 225 ## Set or check the seeds when needed 226 totalSize <- if(any(sizes == ncol(x))) length(sizes) else length(sizes) + 1 227 if(is.null(rfeControl$seeds)) 228 { 229 seeds <- vector(mode = "list", length = length(rfeControl$index)) 230 seeds <- lapply(seeds, function(x) sample.int(n = 1000000, size = totalSize)) 231 seeds[[length(rfeControl$index) + 1]] <- sample.int(n = 1000000, size = 1) 232 rfeControl$seeds <- seeds 233 } else { 234 if(!(length(rfeControl$seeds) == 1 && is.na(rfeControl$seeds))) 235 { 236 ## check versus number of tasks 237 numSeeds <- unlist(lapply(rfeControl$seeds, length)) 238 badSeed <- (length(rfeControl$seeds) < length(rfeControl$index) + 1) || 239 (any(numSeeds[-length(numSeeds)] < totalSize)) 240 if(badSeed) stop(paste("Bad seeds: the seed object should be a list of length", 241 length(rfeControl$index) + 1, "with", 242 length(rfeControl$index), "integer vectors of size", 243 totalSize, "and the last list element having a", 244 "single integer")) 245 } 246 } 247 248 if(rfeControl$method == "LOOCV") 249 { 250 tmp <- looRfeWorkflow(x, y, sizes, ppOpts = NULL, ctrl = rfeControl, lev = classLevels, ...) 251 selectedVars <- do.call("c", tmp$everything[names(tmp$everything) == "finalVariables"]) 252 selectedVars <- do.call("rbind", selectedVars) 253 externPerf <- tmp$performance 254 } else { 255 tmp <- nominalRfeWorkflow(x, y, sizes, ppOpts = NULL, ctrl = rfeControl, lev = classLevels, ...) 256 selectedVars <- do.call("rbind", tmp$everything[names(tmp$everything) == "selectedVars"]) 257 resamples <- do.call("rbind", tmp$everything[names(tmp$everything) == "resamples"]) 258 rownames(resamples) <- NULL 259 externPerf <- tmp$performance 260 } 261 rownames(selectedVars) <- NULL 262 263 bestSubset <- rfeControl$functions$selectSize(x = externPerf, 264 metric = metric, 265 maximize = maximize) 266 267 bestVar <- rfeControl$functions$selectVar(selectedVars, bestSubset) 268 269 finalTime <- system.time( 270 fit <- rfeControl$functions$fit(x[, bestVar, drop = FALSE], 271 y, 272 first = FALSE, 273 last = TRUE, 274 ...)) 275 276 277 if(is.factor(y) & any(names(tmp$performance) == ".cell1")) 278 { 279 keepers <- c("Resample", "Variables", grep("\\.cell", names(tmp$performance), value = TRUE)) 280 resampledCM <- subset(tmp$performance, Variables == bestSubset) 281 tmp$performance <- tmp$performance[, -grep("\\.cell", names(tmp$performance))] 282 } else resampledCM <- NULL 283 284 if(!(rfeControl$method %in% c("LOOCV"))) { 285 resamples <- switch(rfeControl$returnResamp, 286 none = NULL, 287 all = resamples, 288 final = subset(resamples, Variables == bestSubset)) 289 } else resamples <- NULL 290 291 endTime <- proc.time() 292 times <- list(everything = endTime - startTime, 293 final = finalTime) 294 295 ######################################################################### 296 ## Now, based on probability or static ranking, figure out the best vars 297 ## and the best subset size and fit final model 298 299 out <- structure( 300 list( 301 pred = if(rfeControl$saveDetails) do.call("rbind", tmp$everything[names(tmp$everything) == "predictions"]) else NULL, 302 variables = selectedVars, 303 results = as.data.frame(externPerf, stringsAsFactors = TRUE), 304 bestSubset = bestSubset, 305 fit = fit, 306 optVariables = bestVar, 307 optsize = bestSubset, 308 call = funcCall, 309 control = rfeControl, 310 resample = resamples, 311 metric = metric, 312 maximize = maximize, 313 perfNames = perfNames, 314 times = times, 315 resampledCM = resampledCM, 316 obsLevels = classLevels, 317 dots = list(...)), 318 class = "rfe") 319 if(rfeControl$timingSamps > 0) 320 { 321 out$times$prediction <- system.time(predict(out, x[1:min(nrow(x), rfeControl$timingSamps),,drop = FALSE])) 322 } else out$times$prediction <- rep(NA, 3) 323 out 324 } 325 326#' @method rfe formula 327#' @inheritParams train 328#' @importFrom stats .getXlevels contrasts model.matrix model.response 329#' @rdname rfe 330#' @export 331rfe.formula <- function (form, data, ..., subset, na.action, contrasts = NULL) 332{ 333 m <- match.call(expand.dots = FALSE) 334 if (is.matrix(eval.parent(m$data))) m$data <- as.data.frame(data, stringsAsFactors = TRUE) 335 m$... <- m$contrasts <- NULL 336 m[[1]] <- as.name("model.frame") 337 m <- eval.parent(m) 338 Terms <- attr(m, "terms") 339 x <- model.matrix(Terms, m, contrasts) 340 cons <- attr(x, "contrast") 341 xint <- match("(Intercept)", colnames(x), nomatch = 0) 342 if (xint > 0) x <- x[, -xint, drop = FALSE] 343 y <- model.response(m) 344 res <- rfe(as.data.frame(x, stringsAsFactors = TRUE), y, ...) 345 res$terms <- Terms 346 res$coefnames <- colnames(x) 347 res$call <- match.call() 348 res$na.action <- attr(m, "na.action") 349 res$contrasts <- cons 350 res$xlevels <- .getXlevels(Terms, m) 351 class(res) <- c("rfe", "rfe.formula") 352 res 353} 354 355###################################################################### 356###################################################################### 357#' @method print rfe 358#' @export 359print.rfe <- function(x, top = 5, digits = max(3, getOption("digits") - 3), ...) 360{ 361 362 cat("\nRecursive feature selection\n\n") 363 364 resampleN <- unlist(lapply(x$control$index, length)) 365 numResamp <- length(resampleN) 366 367 resampText <- resampName(x) 368 cat("Outer resampling method:", resampText, "\n") 369 370 cat("\nResampling performance over subset size:\n\n") 371 x$results$Selected <- "" 372 x$results$Selected[x$results$Variables == x$bestSubset] <- "*" 373 print(format(x$results, digits = digits), row.names = FALSE) 374 cat("\n") 375 376 cat("The top ", 377 min(top, x$bestSubset), 378 " variables (out of ", 379 x$bestSubset, 380 "):\n ", 381 paste(x$optVariables[1:min(top, x$bestSubset)], collapse = ", "), 382 "\n\n", 383 sep = "") 384 385 invisible(x) 386} 387 388###################################################################### 389###################################################################### 390 391#' @rdname rfe 392#' @importFrom stats complete.cases 393#' @importFrom utils flush.console 394#' @export 395rfeIter <- function(x, y, 396 testX, testY, sizes, 397 rfeControl = rfeControl(), 398 label = "", 399 seeds = NA, 400 ...) 401{ 402 if(is.null(colnames(x))) stop("x must have column names") 403 404 if(is.null(testX) | is.null(testY)) stop("a test set must be specified") 405 if(is.null(sizes)) stop("please specify the number of features") 406 407 predictionMatrix <- matrix(NA, nrow = length(testY), ncol = length(sizes)) 408 p <- ncol(x) 409 410 retained <- colnames(x) 411 sizeValues <- sort(unique(c(sizes, ncol(x))), decreasing = TRUE) 412 sizeText <- format(sizeValues) 413 414 finalVariables <- vector(length(sizeValues), mode = "list") 415 for(k in seq(along = sizeValues)) 416 { 417 if(!any(is.na(seeds))) set.seed(seeds[k]) 418 if(rfeControl$verbose) 419 { 420 cat("+(rfe) fit", 421 ifelse(label != "", 422 label, ""), 423 "size:", sizeText[k], "\n") 424 } 425 flush.console() 426 fitObject <- rfeControl$functions$fit(x[,retained,drop = FALSE], y, 427 first = p == ncol(x[,retained,drop = FALSE]), 428 last = FALSE, 429 ...) 430 if(rfeControl$verbose) 431 { 432 cat("-(rfe) fit", 433 ifelse(label != "", 434 label, ""), 435 "size:", sizeText[k], "\n") 436 } 437 modelPred <- rfeControl$functions$pred(fitObject, testX[,retained,drop = FALSE]) 438 if(is.data.frame(modelPred) | is.matrix(modelPred)) 439 { 440 if(is.matrix(modelPred)) { 441 modelPred <- as.data.frame(modelPred, stringsAsFactors = TRUE) 442 ## in the case where the function returns a matrix with a single column 443 ## make sure that it is named pred 444 if(ncol(modelPred) == 1) names(modelPred) <- "pred" 445 } 446 modelPred$obs <- testY 447 modelPred$Variables <- sizeValues[k] 448 } else modelPred <- data.frame(pred = modelPred, obs = testY, Variables = sizeValues[k]) 449 450 ## save as a vector and rbind at end 451 rfePred <- if(k == 1) modelPred else rbind(rfePred, modelPred) 452 453 454 if(!exists("modImp")) ##todo: get away from this since it finds object in other spaces 455 { 456 if(rfeControl$verbose) 457 { 458 cat("+(rfe) imp", 459 ifelse(label != "", 460 label, ""), "\n") 461 } 462 modImp <- rfeControl$functions$rank(fitObject, x[,retained,drop = FALSE], y) 463 if(rfeControl$verbose) 464 { 465 cat("-(rfe) imp", 466 ifelse(label != "", 467 label, ""), "\n") 468 } 469 } else { 470 if(rfeControl$rerank) 471 { 472 if(rfeControl$verbose) 473 { 474 cat("+(rfe) imp", 475 ifelse(label != "", 476 label, ""), 477 "size:", sizeText[k], "\n") 478 } 479 modImp <- rfeControl$functions$rank(fitObject, x[,retained,drop = FALSE], y) 480 if(rfeControl$verbose) 481 { 482 cat("-(rfe) imp", 483 ifelse(label != "", 484 label, ""), 485 "size:", sizeText[k], "\n") 486 } 487 } 488 } 489 490 if(nrow(modImp) < sizeValues[k]) { 491 msg1 <- paste0("rfe is expecting ", sizeValues[k], 492 " importance values but only has ", nrow(modImp), ". ", 493 "This may be caused by having zero-variance predictors, ", 494 "excessively-correlated predictors, factor predictors ", 495 "that were expanded into dummy variables or you may have ", 496 "failed to drop one of your dummy variables.") 497 warning(msg1, call. = FALSE) 498 modImp <- repair_rank(modImp, colnames(x)) 499 } 500 if(any(!complete.cases(modImp))){ 501 warning(paste("There were missing importance values.", 502 "There may be linear dependencies in your predictor variables"), 503 call. = FALSE) 504 } 505 if (!any(names(modImp) == "var")) { 506 stop("The importance score data should include a column named `var`.") 507 } 508 finalVariables[[k]] <- subset(modImp, var %in% retained) 509 finalVariables[[k]]$Variables <- sizeValues[[k]] 510 if(k < length(sizeValues)) retained <- as.character(modImp$var)[1:sizeValues[k+1]] 511 } 512 list(finalVariables = finalVariables, pred = rfePred) 513 514} 515 516###################################################################### 517###################################################################### 518 519 520 521 522 523#' Plot RFE Performance Profiles 524#' 525#' These functions plot the resampling results for the candidate subset sizes 526#' evaluated during the recursive feature elimination (RFE) process 527#' 528#' These plots show the average performance versus the subset sizes. 529#' 530#' @aliases plot.rfe ggplot.rfe 531#' @param x an object of class \code{\link{rfe}}. 532#' @param metric What measure of performance to plot. Examples of possible 533#' values are "RMSE", "Rsquared", "Accuracy" or "Kappa". Other values can be 534#' used depending on what metrics have been calculated. 535#' @param \dots \code{plot} only: specifications to be passed to 536#' \code{\link[lattice]{xyplot}}. The function automatically sets some 537#' arguments (e.g. axis labels) but passing in values here will over-ride the 538#' defaults. 539#' @param data an object of class \code{\link{rfe}}. 540#' @param output either "data", "ggplot" or "layered". The first returns a data 541#' frame while the second returns a simple \code{ggplot} object with no layers. 542#' The third value returns a plot with a set of layers. 543#' @param mapping,environment unused arguments to make consistent with 544#' \pkg{ggplot2} generic method 545#' @return a lattice or ggplot object 546#' @note We using a recipe as an input, there may be some subset sizes that are 547#' not well-replicated over resamples. The `ggplot` method will only show 548#' subset sizes where at least half of the resamples have associated results. 549#' @author Max Kuhn 550#' @seealso \code{\link{rfe}}, \code{\link[lattice]{xyplot}}, 551#' \code{\link[ggplot2]{ggplot}} 552#' @references Kuhn (2008), ``Building Predictive Models in R Using the caret'' 553#' (\doi{10.18637/jss.v028.i05}) 554#' @keywords hplot 555#' @method plot rfe 556#' @export 557#' @examples 558#' 559#' \dontrun{ 560#' data(BloodBrain) 561#' 562#' x <- scale(bbbDescr[,-nearZeroVar(bbbDescr)]) 563#' x <- x[, -findCorrelation(cor(x), .8)] 564#' x <- as.data.frame(x, stringsAsFactors = TRUE) 565#' 566#' set.seed(1) 567#' lmProfile <- rfe(x, logBBB, 568#' sizes = c(2:25, 30, 35, 40, 45, 50, 55, 60, 65), 569#' rfeControl = rfeControl(functions = lmFuncs, 570#' number = 200)) 571#' plot(lmProfile) 572#' plot(lmProfile, metric = "Rsquared") 573#' ggplot(lmProfile) 574#' } 575#' @export plot.rfe 576plot.rfe <- function (x, 577 metric = x$metric, 578 ...) { 579 x$results$Selected <- "" 580 x$results$Selected[x$results$Variables == x$bestSubset] <- "*" 581 582 results <- x$results[, colnames(x$results) %in% c("Variables", "Selected", metric)] 583 metric <- metric[which(metric %in% colnames(results))] 584 585 plotForm <- as.formula(paste(metric, "~ Variables")) 586 panel.profile <- function(x, y, groups, ...) 587 { 588 panel.xyplot(x, y, ...) 589 panel.xyplot(x[groups == "*"], y[groups == "*"], pch = 16) 590 } 591 resampText <- resampName(x, FALSE) 592 resampText <- paste(metric, resampText) 593 out <- xyplot(plotForm, data = results, groups = Selected, panel = panel.profile, 594 ylab = resampText, 595 ...) 596 597 out 598} 599 600###################################################################### 601###################################################################### 602#' Controlling the Feature Selection Algorithms 603#' 604#' This function generates a control object that can be used to specify the 605#' details of the feature selection algorithms used in this package. 606#' 607#' More details on this function can be found at 608#' \url{http://topepo.github.io/caret/recursive-feature-elimination.html#rfe}. 609#' 610#' Backwards selection requires function to be specified for some operations. 611#' 612#' The \code{fit} function builds the model based on the current data set. The 613#' arguments for the function must be: \itemize{ \item\code{x} the current 614#' training set of predictor data with the appropriate subset of variables 615#' \item\code{y} the current outcome data (either a numeric or factor vector) 616#' \item\code{first} a single logical value for whether the current predictor 617#' set has all possible variables \item\code{last} similar to \code{first}, but 618#' \code{TRUE} when the last model is fit with the final subset size and 619#' predictors. \item\code{...}optional arguments to pass to the fit function 620#' in the call to \code{rfe} } The function should return a model object that 621#' can be used to generate predictions. 622#' 623#' The \code{pred} function returns a vector of predictions (numeric or 624#' factors) from the current model. The arguments are: \itemize{ 625#' \item\code{object} the model generated by the \code{fit} function 626#' \item\code{x} the current set of predictor set for the held-back samples } 627#' 628#' The \code{rank} function is used to return the predictors in the order of 629#' the most important to the least important. Inputs are: \itemize{ 630#' \item\code{object} the model generated by the \code{fit} function 631#' \item\code{x} the current set of predictor set for the training samples 632#' \item\code{y} the current training outcomes } The function should return a 633#' data frame with a column called \code{var} that has the current variable 634#' names. The first row should be the most important predictor etc. Other 635#' columns can be included in the output and will be returned in the final 636#' \code{rfe} object. 637#' 638#' The \code{selectSize} function determines the optimal number of predictors 639#' based on the resampling output. Inputs for the function are: \itemize{ 640#' \item\code{x}a matrix with columns for the performance metrics and the 641#' number of variables, called "\code{Variables}" \item\code{metric}a character 642#' string of the performance measure to optimize (e.g. "RMSE", "Rsquared", 643#' "Accuracy" or "Kappa") \item\code{maximize}a single logical for whether the 644#' metric should be maximized } This function should return an integer 645#' corresponding to the optimal subset size. \pkg{caret} comes with two 646#' examples functions for this purpose: \code{\link{pickSizeBest}} and 647#' \code{\link{pickSizeTolerance}}. 648#' 649#' After the optimal subset size is determined, the \code{selectVar} function 650#' will be used to calculate the best rankings for each variable across all the 651#' resampling iterations. Inputs for the function are: \itemize{ \item\code{y} 652#' a list of variables importance for each resampling iteration and each subset 653#' size (generated by the user-defined \code{rank} function). In the example, 654#' each each of the cross-validation groups the output of the \code{rank} 655#' function is saved for each of the subset sizes (including the original 656#' subset). If the rankings are not recomputed at each iteration, the values 657#' will be the same within each cross-validation iteration. \item\code{size} 658#' the integer returned by the \code{selectSize} function } This function 659#' should return a character string of predictor names (of length \code{size}) 660#' in the order of most important to least important 661#' 662#' Examples of these functions are included in the package: 663#' \code{\link{lmFuncs}}, \code{\link{rfFuncs}}, \code{\link{treebagFuncs}} and 664#' \code{\link{nbFuncs}}. 665#' 666#' Model details about these functions, including examples, are at 667#' \url{http://topepo.github.io/caret/recursive-feature-elimination.html}. . 668#' 669#' @param functions a list of functions for model fitting, prediction and 670#' variable importance (see Details below) 671#' @param rerank a logical: should variable importance be re-calculated each 672#' time features are removed? 673#' @param method The external resampling method: \code{boot}, \code{cv}, 674#' \code{LOOCV} or \code{LGOCV} (for repeated training/test splits 675#' @param number Either the number of folds or number of resampling iterations 676#' @param repeats For repeated k-fold cross-validation only: the number of 677#' complete sets of folds to compute 678#' @param saveDetails a logical to save the predictions and variable 679#' importances from the selection process 680#' @param verbose a logical to print a log for each external resampling 681#' iteration 682#' @param returnResamp A character string indicating how much of the resampled 683#' summary metrics should be saved. Values can be ``final'', ``all'' or 684#' ``none'' 685#' @param p For leave-group out cross-validation: the training percentage 686#' @param index a list with elements for each external resampling iteration. 687#' Each list element is the sample rows used for training at that iteration. 688#' @param indexOut a list (the same length as \code{index}) that dictates which 689#' sample are held-out for each resample. If \code{NULL}, then the unique set 690#' of samples not contained in \code{index} is used. 691#' @param timingSamps the number of training set samples that will be used to 692#' measure the time for predicting samples (zero indicates that the prediction 693#' time should not be estimated). 694#' @param seeds an optional set of integers that will be used to set the seed 695#' at each resampling iteration. This is useful when the models are run in 696#' parallel. A value of \code{NA} will stop the seed from being set within the 697#' worker processes while a value of \code{NULL} will set the seeds using a 698#' random set of integers. Alternatively, a list can be used. The list should 699#' have \code{B+1} elements where \code{B} is the number of resamples. The 700#' first \code{B} elements of the list should be vectors of integers of length 701#' \code{P} where \code{P} is the number of subsets being evaluated (including 702#' the full set). The last element of the list only needs to be a single 703#' integer (for the final model). See the Examples section below. 704#' @param allowParallel if a parallel backend is loaded and available, should 705#' the function use it? 706#' @return A list 707#' @author Max Kuhn 708#' @seealso \code{\link{rfe}}, \code{\link{lmFuncs}}, \code{\link{rfFuncs}}, 709#' \code{\link{treebagFuncs}}, \code{\link{nbFuncs}}, 710#' \code{\link{pickSizeBest}}, \code{\link{pickSizeTolerance}} 711#' @keywords utilities 712#' @examples 713#' 714#' \dontrun{ 715#' subsetSizes <- c(2, 4, 6, 8) 716#' set.seed(123) 717#' seeds <- vector(mode = "list", length = 51) 718#' for(i in 1:50) seeds[[i]] <- sample.int(1000, length(subsetSizes) + 1) 719#' seeds[[51]] <- sample.int(1000, 1) 720#' 721#' set.seed(1) 722#' rfMod <- rfe(bbbDescr, logBBB, 723#' sizes = subsetSizes, 724#' rfeControl = rfeControl(functions = rfFuncs, 725#' seeds = seeds, 726#' number = 50)) 727#' } 728#' 729#' @export rfeControl 730rfeControl <- function(functions = NULL, 731 rerank = FALSE, 732 method = "boot", 733 saveDetails = FALSE, 734 number = ifelse(method %in% c("cv", "repeatedcv"), 10, 25), 735 repeats = ifelse(method %in% c("cv", "repeatedcv"), 1, number), 736 verbose = FALSE, 737 returnResamp = "final", 738 p = .75, 739 index = NULL, 740 indexOut = NULL, 741 timingSamps = 0, 742 seeds = NA, 743 allowParallel = TRUE) 744{ 745 list( 746 functions = if(is.null(functions)) caretFuncs else functions, 747 rerank = rerank, 748 method = method, 749 saveDetails = saveDetails, 750 number = number, 751 repeats = repeats, 752 returnResamp = returnResamp, 753 verbose = verbose, 754 p = p, 755 index = index, 756 indexOut = indexOut, 757 timingSamps = timingSamps, 758 seeds = seeds, 759 allowParallel = allowParallel) 760} 761 762###################################################################### 763###################################################################### 764## some built-in functions for certain models 765 766#' @rdname caretFuncs 767#' @export 768pickSizeBest <- function(x, metric, maximize) 769{ 770 best <- if(maximize) which.max(x[,metric]) else which.min(x[,metric]) 771 min(x[best, "Variables"]) 772} 773 774#' @rdname caretFuncs 775#' @export 776pickSizeTolerance <- function(x, metric, tol = 1.5, maximize) 777{ 778 if(!maximize) 779 { 780 best <- min(x[,metric]) 781 perf <- (x[,metric] - best)/best * 100 782 flag <- perf <= tol 783 } else { 784 best <- max(x[,metric]) 785 perf <- (best - x[,metric])/best * 100 786 flag <- perf <= tol 787 } 788 min(x[flag, "Variables"]) 789} 790 791#' @rdname caretFuncs 792#' @export 793pickVars <- function(y, size) 794{ 795 finalImp <- ddply(y[, c("Overall", "var")], 796 .(var), 797 function(x) mean(x$Overall, na.rm = TRUE)) 798 names(finalImp)[2] <- "Overall" 799 finalImp <- finalImp[order(finalImp$Overall, decreasing = TRUE),] 800 as.character(finalImp$var[1:size]) 801} 802 803 804 805 806#' Backwards Feature Selection Helper Functions 807#' 808#' Ancillary functions for backwards selection 809#' 810#' This page describes the functions that are used in backwards selection (aka 811#' recursive feature elimination). The functions described here are passed to 812#' the algorithm via the \code{functions} argument of \code{\link{rfeControl}}. 813#' 814#' See \code{\link{rfeControl}} for details on how these functions should be 815#' defined. 816#' 817#' The 'pick' functions are used to find the appropriate subset size for 818#' different situations. \code{pickBest} will find the position associated with 819#' the numerically best value (see the \code{maximize} argument to help define 820#' this). 821#' 822#' \code{pickSizeTolerance} picks the lowest position (i.e. the smallest subset 823#' size) that has no more of an X percent loss in performances. When 824#' maximizing, it calculates (O-X)/O*100, where X is the set of performance 825#' values and O is max(X). This is the percent loss. When X is to be minimized, 826#' it uses (X-O)/O*100 (so that values greater than X have a positive "loss"). 827#' The function finds the smallest subset size that has a percent loss less 828#' than \code{tol}. 829#' 830#' Both of the 'pick' functions assume that the data are sorted from smallest 831#' subset size to largest. 832#' 833#' @aliases caretFuncs lmFuncs rfFuncs gamFuncs treebagFuncs ldaFuncs nbFuncs 834#' lrFuncs pickSizeBest pickSizeTolerance pickVars 835#' @param x a matrix or data frame with the performance metric of interest 836#' @param metric a character string with the name of the performance metric 837#' that should be used to choose the appropriate number of variables 838#' @param maximize a logical; should the metric be maximized? 839#' @param tol a scalar to denote the acceptable difference in optimal 840#' performance (see Details below) 841#' @param y a list of data frames with variables \code{Overall} and \code{var} 842#' @param size an integer for the number of variables to retain 843#' @author Max Kuhn 844#' @seealso \code{\link{rfeControl}}, \code{\link{rfe}} 845#' @keywords models 846#' @examples 847#' 848#' ## For picking subset sizes: 849#' ## Minimize the RMSE 850#' example <- data.frame(RMSE = c(1.2, 1.1, 1.05, 1.01, 1.01, 1.03, 1.00), 851#' Variables = 1:7) 852#' ## Percent Loss in performance (positive) 853#' example$PctLoss <- (example$RMSE - min(example$RMSE))/min(example$RMSE)*100 854#' 855#' xyplot(RMSE ~ Variables, data= example) 856#' xyplot(PctLoss ~ Variables, data= example) 857#' 858#' absoluteBest <- pickSizeBest(example, metric = "RMSE", maximize = FALSE) 859#' within5Pct <- pickSizeTolerance(example, metric = "RMSE", maximize = FALSE) 860#' 861#' cat("numerically optimal:", 862#' example$RMSE[absoluteBest], 863#' "RMSE in position", 864#' absoluteBest, "\n") 865#' cat("Accepting a 1.5 pct loss:", 866#' example$RMSE[within5Pct], 867#' "RMSE in position", 868#' within5Pct, "\n") 869#' 870#' ## Example where we would like to maximize 871#' example2 <- data.frame(Rsquared = c(0.4, 0.6, 0.94, 0.95, 0.95, 0.95, 0.95), 872#' Variables = 1:7) 873#' ## Percent Loss in performance (positive) 874#' example2$PctLoss <- (max(example2$Rsquared) - example2$Rsquared)/max(example2$Rsquared)*100 875#' 876#' xyplot(Rsquared ~ Variables, data= example2) 877#' xyplot(PctLoss ~ Variables, data= example2) 878#' 879#' absoluteBest2 <- pickSizeBest(example2, metric = "Rsquared", maximize = TRUE) 880#' within5Pct2 <- pickSizeTolerance(example2, metric = "Rsquared", maximize = TRUE) 881#' 882#' cat("numerically optimal:", 883#' example2$Rsquared[absoluteBest2], 884#' "R^2 in position", 885#' absoluteBest2, "\n") 886#' cat("Accepting a 1.5 pct loss:", 887#' example2$Rsquared[within5Pct2], 888#' "R^2 in position", 889#' within5Pct2, "\n") 890#' 891#' @export caretFuncs 892caretFuncs <- list(summary = defaultSummary, 893 fit = function(x, y, first, last, ...) train(x, y, ...), 894 pred = function(object, x) { 895 tmp <- predict(object, x) 896 if(object$modelType == "Classification" & object$control$classProbs) { 897 out <- cbind(data.frame(pred = tmp), 898 as.data.frame(predict(object, x, type = "prob"), stringsAsFactors = TRUE), stringsAsFactors = TRUE) 899 } else out <- tmp 900 out 901 }, 902 rank = function(object, x, y) { 903 vimp <- varImp(object, scale = FALSE)$importance 904 if(!is.data.frame(vimp)) vimp <- as.data.frame(vimp, stringsAsFactors = TRUE) 905 if(object$modelType == "Regression") { 906 vimp <- vimp[order(vimp[,1], decreasing = TRUE),,drop = FALSE] 907 } else { 908 if(all(levels(y) %in% colnames(vimp)) & !("Overall" %in% colnames(vimp))) { 909 avImp <- apply(vimp[, levels(y), drop = TRUE], 1, mean) 910 vimp$Overall <- avImp 911 } 912 } 913 vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE] 914 vimp$var <- rownames(vimp) 915 vimp 916 }, 917 selectSize = pickSizeBest, 918 selectVar = pickVars) 919 920## write a better imp sort function 921#' @rdname caretFuncs 922#' @importFrom stats predict 923#' @export 924ldaFuncs <- list(summary = defaultSummary, 925 fit = function(x, y, first, last, ...) 926 { 927 loadNamespace("MASS") 928 MASS::lda(x, y, ...) 929 }, 930 pred = function(object, x) 931 { 932 tmp <- predict(object, x) 933 out <- cbind(data.frame(pred = tmp$class), 934 as.data.frame(tmp$posterior, stringsAsFactors = FALSE), stringsAsFactors = TRUE) 935 out 936 }, 937 rank = function(object, x, y) 938 { 939 vimp <- filterVarImp(x, y, TRUE) 940 941 vimp$Overall <- apply(vimp, 1, mean) 942 vimp <- vimp[order(vimp$Overall, decreasing = TRUE),] 943 944 vimp <- as.data.frame(vimp, stringsAsFactors = TRUE)[, "Overall",drop = FALSE] 945 vimp$var <- rownames(vimp) 946 vimp 947 948 }, 949 selectSize = pickSizeBest, 950 selectVar = pickVars 951) 952 953#' @rdname caretFuncs 954#' @importFrom stats predict 955#' @export 956treebagFuncs <- list(summary = defaultSummary, 957 fit = function(x, y, first, last, ...) { 958 loadNamespace("ipred") 959 ipred::ipredbagg(y, x, ...) 960 }, 961 pred = function(object, x) { 962 tmp <- predict(object, x) 963 if(is.factor(object$y)) { 964 out <- cbind(data.frame(pred = tmp), 965 as.data.frame(predict(object, x, type = "prob"), stringsAsFactors = TRUE), stringsAsFactors = TRUE) 966 } else out <- tmp 967 out 968 }, 969 rank = function(object, x, y) { 970 vimp <- varImp(object) 971 vimp <- vimp[order(vimp$Overall, decreasing = TRUE),,drop = FALSE] 972 vimp$var <- rownames(vimp) 973 vimp 974 }, 975 selectSize = pickSizeBest, 976 selectVar = pickVars) 977 978#' @rdname caretFuncs 979#' @importFrom stats predict 980#' @export 981gamFuncs <- list(summary = defaultSummary, 982 fit = function(x, y, first, last, ...){ 983 loaded <- search() 984 gamLoaded <- any(loaded == "package:gam") 985 if(gamLoaded) detach(package:gam) 986 loadNamespace("mgcv") 987 gam <- get("gam", asNamespace("mgcv")) 988 dat <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE) 989 dat$y <- y 990 args <- list(formula = gamFormula(x, smoother = "s", y = "y"), 991 data = dat, 992 family = if(!is.factor(y)) gaussian else binomial) 993 do.call("gam", args) 994 }, 995 pred = function(object, x) { 996 if(!is.data.frame(x)) x <- as.data.frame(x, stringsAsFactors = TRUE) 997 loaded <- search() 998 gamLoaded <- any(loaded == "package:gam") 999 if(gamLoaded) detach(package:gam) 1000 loadNamespace("mgcv") 1001 rsp <- predict(object, newdata = x, type = "response") 1002 if(object$family$family == "binomial") { 1003 lvl <- levels(object$model$y) 1004 out <- data.frame(p1 = rsp, 1005 p2 = 1-rsp, 1006 pred = factor(ifelse(rsp > .5, lvl[2], lvl[1]), 1007 levels = lvl)) 1008 colnames(out)[1:2] <- make.names(lvl) 1009 out 1010 } else out <- data.frame(pred = rsp) 1011 out 1012 1013 }, 1014 rank = function(object, x, y) { 1015 1016 loaded <- search() 1017 gamLoaded <- any(loaded == "package:gam") 1018 if(gamLoaded) detach(package:gam) 1019 loadNamespace("mgcv") 1020 vimp <- varImp(object) 1021 vimp$var <- rownames(vimp) 1022 if(any(!(colnames(x) %in% rownames(vimp)))) { 1023 missing <- colnames(x)[!(colnames(x) %in% rownames(vimp))] 1024 tmpdf <- data.frame(var = missing, 1025 Overall = rep(0, length(missing))) 1026 vimp <- rbind(vimp, tmpdf) 1027 } 1028 vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE] 1029 vimp 1030 }, 1031 selectSize = pickSizeBest, 1032 selectVar = pickVars) 1033 1034#' @rdname caretFuncs 1035#' @importFrom stats predict 1036#' @export 1037rfFuncs <- list(summary = defaultSummary, 1038 fit = function(x, y, first, last, ...) { 1039 loadNamespace("randomForest") 1040 randomForest::randomForest(x, y, importance = TRUE, ...) 1041 }, 1042 pred = function(object, x) { 1043 tmp <- predict(object, x) 1044 if(is.factor(object$y)) { 1045 out <- cbind(data.frame(pred = tmp), 1046 as.data.frame(predict(object, x, type = "prob"), 1047 stringsAsFactors = TRUE)) 1048 } else out <- tmp 1049 out 1050 }, 1051 rank = function(object, x, y) { 1052 vimp <- varImp(object) 1053 1054 if(is.factor(y)) { 1055 if(all(levels(y) %in% colnames(vimp))) { 1056 avImp <- apply(vimp[, levels(y), drop = TRUE], 1, mean) 1057 vimp$Overall <- avImp 1058 } 1059 } 1060 1061 vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE] 1062 if (ncol(x) == 1) { 1063 vimp$var <- colnames(x) 1064 } else vimp$var <- rownames(vimp) 1065 vimp 1066 }, 1067 selectSize = pickSizeBest, 1068 selectVar = pickVars) 1069 1070 1071#' @rdname caretFuncs 1072#' @importFrom stats predict lm 1073#' @export 1074lmFuncs <- list(summary = defaultSummary, 1075 fit = function(x, y, first, last, ...) { 1076 tmp <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE) 1077 tmp$y <- y 1078 lm(y~., data = tmp) 1079 }, 1080 pred = function(object, x) { 1081 if(!is.data.frame(x)) x <- as.data.frame(x, stringsAsFactors = TRUE) 1082 predict(object, x) 1083 }, 1084 rank = function(object, x, y) { 1085 coefs <- abs(coef(object)) 1086 coefs <- coefs[names(coefs) != "(Intercept)"] 1087 coefs[is.na(coefs)] <- 0 1088 vimp <- data.frame(Overall = unname(coefs), 1089 var = names(coefs)) 1090 rownames(vimp) <- names(coefs) 1091 vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE] 1092 vimp 1093 }, 1094 selectSize = pickSizeBest, 1095 selectVar = pickVars) 1096 1097 1098#' @rdname caretFuncs 1099#' @importFrom stats predict 1100#' @export 1101nbFuncs <- list(summary = defaultSummary, 1102 fit = function(x, y, first, last, ...){ 1103 loadNamespace("klaR") 1104 klaR::NaiveBayes(x, y, usekernel = TRUE, fL = 2, ...) 1105 }, 1106 pred = function(object, x) { 1107 tmp <- predict(object, x) 1108 out <- cbind(data.frame(pred = tmp$class), 1109 as.data.frame(tmp$posterior, stringsAsFactors = TRUE)) 1110 out 1111 }, 1112 rank = function(object, x, y) { 1113 vimp <- filterVarImp(x, y) 1114 if(is.factor(y)) { 1115 avImp <- apply(vimp, 1, mean) 1116 vimp$Overall <- avImp 1117 } 1118 1119 vimp <- vimp[order(vimp$Overall,decreasing = TRUE),, drop = FALSE] 1120 1121 vimp$var <- rownames(vimp) 1122 vimp 1123 }, 1124 selectSize = pickSizeBest, 1125 selectVar = pickVars) 1126 1127 1128#' @rdname caretFuncs 1129#' @importFrom stats predict glm 1130#' @export 1131lrFuncs <- ldaFuncs 1132lrFuncs$fit <- function (x, y, first, last, ...) { 1133 tmp <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE) 1134 tmp$Class <- y 1135 glm(Class ~ ., data = tmp, family = "binomial") 1136} 1137lrFuncs$pred <- function (object, x) { 1138 if(!is.data.frame(x)) x <- as.data.frame(x, stringsAsFactors = TRUE) 1139 lvl <- levels(object$data$Class) 1140 tmp <- predict(object, x, type = "response") 1141 out <- data.frame(1-tmp, tmp) 1142 colnames(out) <- lvl 1143 out$pred <- factor(ifelse(tmp > .5, lvl[2], lvl[1]), 1144 levels = lvl) 1145 out 1146} 1147 1148lrFuncs$rank <- function (object, x, y) { 1149 vimp <- varImp(object, scale = FALSE) 1150 vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE] 1151 vimp$var <- rownames(vimp) 1152 vimp 1153} 1154 1155###################################################################### 1156###################################################################### 1157## lattice functions 1158 1159#' Lattice functions for plotting resampling results of recursive feature 1160#' selection 1161#' 1162#' A set of lattice functions are provided to plot the resampled performance 1163#' estimates (e.g. classification accuracy, RMSE) over different subset sizes. 1164#' 1165#' By default, only the resampling results for the optimal model are saved in 1166#' the \code{rfe} object. The function \code{\link{rfeControl}} can be used to 1167#' save all the results using the \code{returnResamp} argument. 1168#' 1169#' If leave-one-out or out-of-bag resampling was specified, plots cannot be 1170#' produced (see the \code{method} argument of \code{\link{rfeControl}}) 1171#' 1172#' @aliases xyplot.rfe stripplot.rfe densityplot.rfe histogram.rfe 1173#' @param x An object produced by \code{\link{rfe}} 1174#' @param data This argument is not used 1175#' @param metric A character string specifying the single performance metric 1176#' that will be plotted 1177#' @param \dots arguments to pass to either 1178#' \code{\link[lattice:histogram]{histogram}}, 1179#' \code{\link[lattice:histogram]{densityplot}}, 1180#' \code{\link[lattice:xyplot]{xyplot}} or 1181#' \code{\link[lattice:xyplot]{stripplot}} 1182#' @return A lattice plot object 1183#' @author Max Kuhn 1184#' @seealso \code{\link{rfe}}, \code{\link{rfeControl}}, 1185#' \code{\link[lattice:histogram]{histogram}}, 1186#' \code{\link[lattice:histogram]{densityplot}}, 1187#' \code{\link[lattice:xyplot]{xyplot}}, 1188#' \code{\link[lattice:xyplot]{stripplot}} 1189#' @keywords hplot 1190#' @examples 1191#' 1192#' \dontrun{ 1193#' library(mlbench) 1194#' n <- 100 1195#' p <- 40 1196#' sigma <- 1 1197#' set.seed(1) 1198#' sim <- mlbench.friedman1(n, sd = sigma) 1199#' x <- cbind(sim$x, matrix(rnorm(n * p), nrow = n)) 1200#' y <- sim$y 1201#' colnames(x) <- paste("var", 1:ncol(x), sep = "") 1202#' 1203#' normalization <- preProcess(x) 1204#' x <- predict(normalization, x) 1205#' x <- as.data.frame(x, stringsAsFactors = TRUE) 1206#' subsets <- c(10, 15, 20, 25) 1207#' 1208#' ctrl <- rfeControl( 1209#' functions = lmFuncs, 1210#' method = "cv", 1211#' verbose = FALSE, 1212#' returnResamp = "all") 1213#' 1214#' lmProfile <- rfe(x, y, 1215#' sizes = subsets, 1216#' rfeControl = ctrl) 1217#' xyplot(lmProfile) 1218#' stripplot(lmProfile) 1219#' 1220#' histogram(lmProfile) 1221#' densityplot(lmProfile) 1222#' } 1223#' 1224#' @importFrom stats as.formula 1225#' @export 1226densityplot.rfe <- function(x, 1227 data = NULL, 1228 metric = x$metric, 1229 ...) 1230{ 1231 if (!is.null(match.call()$data)) 1232 warning("explicit 'data' specification ignored") 1233 1234 if(x$control$method %in% c("oob", "LOOCV")) 1235 stop("Resampling plots cannot be done with leave-out-out CV or out-of-bag resampling") 1236 1237 data <- as.data.frame(x$resample, stringsAsFactors = TRUE) 1238 data$Variable <- factor(data$Variable, 1239 levels = paste(sort(unique(data$Variable)))) 1240 1241 form <- as.formula(paste("~", metric, "|Variable")) 1242 densityplot(form, data = data, ...) 1243} 1244 1245#' @importFrom stats as.formula 1246#' @export 1247histogram.rfe <- function(x, 1248 data = NULL, 1249 metric = x$metric, 1250 ...) 1251{ 1252 if (!is.null(match.call()$data)) 1253 warning("explicit 'data' specification ignored") 1254 1255 if(x$control$method %in% c("oob", "LOOCV")) 1256 stop("Resampling plots cannot be done with leave-out-out CV or out-of-bag resampling") 1257 1258 data <- as.data.frame(x$resample, stringsAsFactors = TRUE) 1259 data$Variable <- factor(data$Variable, 1260 levels = paste(sort(unique(data$Variable)))) 1261 1262 form <- as.formula(paste("~", metric, "|Variable")) 1263 histogram(form, data = data, ...) 1264} 1265 1266#' @importFrom stats as.formula 1267#' @export 1268stripplot.rfe <- function(x, 1269 data = NULL, 1270 metric = x$metric, 1271 ...) 1272{ 1273 if (!is.null(match.call()$data)) 1274 warning("explicit 'data' specification ignored") 1275 1276 if(x$control$method %in% c("oob", "LOOCV")) 1277 stop("Resampling plots cannot be done with leave-out-out CV or out-of-bag resampling") 1278 1279 data <- as.data.frame(x$resample, stringsAsFactors = TRUE) 1280 data$Variable <- factor(data$Variable, 1281 levels = paste(sort(unique(data$Variable)))) 1282 theDots <- list(...) 1283 if(any(names(theDots) == "horizontal")) 1284 { 1285 formText <- if(theDots$horizontal) paste("Variable ~", metric) else paste(metric, "~ Variable") 1286 } else formText <- paste("Variable ~", metric) 1287 1288 form <- as.formula(formText) 1289 1290 stripplot(form, data = data, ...) 1291 1292} 1293 1294#' @importFrom stats as.formula 1295#' @export 1296xyplot.rfe <- function(x, 1297 data = NULL, 1298 metric = x$metric, 1299 ...) 1300{ 1301 if (!is.null(match.call()$data)) 1302 warning("explicit 'data' specification ignored") 1303 1304 if(x$control$method %in% c("oob", "LOOCV")) 1305 stop("Resampling plots cannot be done with leave-out-out CV or out-of-bag resampling") 1306 1307 data <- as.data.frame(x$resample, stringsAsFactors = TRUE) 1308 1309 form <- as.formula(paste(metric, " ~ Variables")) 1310 xyplot(form, data = data, ...) 1311} 1312 1313###################################################################### 1314###################################################################### 1315## other functions 1316 1317#' @export 1318predictors.rfe <- function(x, ...) x$optVariables 1319 1320#' @export 1321varImp.rfe <- function(object, drop = FALSE, ...) 1322{ 1323 imp <- subset(object$variables, Variables == object$optsize) 1324 imp <- ddply(imp[, c("Overall", "var")], .(var), function(x) mean(x$Overall, rm.na = TRUE)) 1325 names(imp)[2] <- "Overall" 1326 1327 if(drop) imp <- subset(imp, var %in% object$optVar) 1328 rownames(imp) <- imp$var 1329 imp$var <- NULL 1330 imp[order(-imp$Overall),,drop = FALSE] 1331} 1332 1333#' @importFrom stats .checkMFClasses delete.response model.frame model.matrix na.omit 1334#' @export 1335predict.rfe <- function(object, newdata, ...) { 1336 if(length(list(...)) > 0) 1337 warning("... are ignored for predict.rfe") 1338 1339 if(inherits(object, "rfe.formula")) { 1340 newdata <- as.data.frame(newdata, stringsAsFactors = FALSE) 1341 rn <- row.names(newdata) 1342 Terms <- delete.response(object$terms) 1343 m <- model.frame(Terms, newdata, na.action = na.omit, 1344 xlev = object$xlevels) 1345 if (!is.null(cl <- attr(Terms, "dataClasses"))) 1346 .checkMFClasses(cl, m) 1347 keep <- match(row.names(m), rn) 1348 newdata <- model.matrix(Terms, m, contrasts = object$contrasts) 1349 xint <- match("(Intercept)", colnames(newdata), nomatch = 0) 1350 if (xint > 0) newdata <- newdata[, -xint, drop = FALSE] 1351 } else { 1352 if (any(names(object) == "recipe")) { 1353 newdata <- 1354 bake(object$recipe, newdata, all_predictors(), composition = "data.frame") 1355 } 1356 } 1357 checkCols <- object$optVar %in% colnames(newdata) 1358 if(!all(checkCols)) 1359 stop(paste("missing columns from newdata:", 1360 paste(object$optVar[!checkCols], collapse = ", "))) 1361 1362 newdata <- newdata[, object$optVar, drop = FALSE] 1363 object$control$functions$pred(object$fit, newdata) 1364} 1365 1366#' @rdname rfe 1367#' @method update rfe 1368#' @export 1369update.rfe <- function(object, x, y, size, ...) { 1370 size <- size[1] 1371 selectedVars <- object$variables 1372 bestVar <- object$control$functions$selectVar(selectedVars, size) 1373 1374 if (!is.null(object$recipe)) { 1375 if (is.null(object$recipe$template)) 1376 stop("Recipe is missing data to be juiced.", call. = FALSE) 1377 args <- 1378 list( 1379 x = juice(object$recipe, all_predictors(), composition = "data.frame"), 1380 y = juice(object$recipe, all_outcomes(), composition = "data.frame"), 1381 first = FALSE, last = TRUE 1382 ) 1383 args$y <- args$y[,1] 1384 } else { 1385 args <- 1386 list(x = x, y = y, first = FALSE, last = TRUE) 1387 } 1388 args$x <- args$x[, bestVar, drop = FALSE] 1389 1390 if (length(object$dots) > 0) 1391 args <- c(args, object$dots) 1392 1393 object$fit <- do.call(object$control$functions$fit, args) 1394 1395 object$bestSubset <- size 1396 object$bestVar <- bestVar 1397 1398 if (object$control$returnResamp == "final") { 1399 warning("The saved resamples are no longer appropriate and were removed") 1400 object$resampledCM <- object$resample <- NULL 1401 } 1402 object 1403} 1404 1405 1406repair_rank <- function(imp, nms, fill = -Inf) { 1407 no_val <- !(nms %in% imp$var) 1408 missing_rows <- imp[rep(1, sum(no_val)),] 1409 missing_rows$var <- nms[no_val] 1410 other_col <- colnames(imp)[colnames(imp) != "var"] 1411 for(i in other_col) missing_rows[, i] <- NA 1412 out <- rbind(imp, missing_rows) 1413 rownames(out) <- NULL 1414 out 1415} 1416 1417################################################################### 1418 1419rfe_rec <- function(x, y, test_x, test_y, perf_dat, 1420 sizes, rfeControl = rfeControl(), 1421 label = "", seeds = NA, ...) { 1422 p <- ncol(x) 1423 1424 if (length(sizes) > 0 && max(sizes) > p) 1425 sizes <- sizes[sizes <= p] 1426 1427 if (all(sizes < 2)) 1428 stop( 1429 "After the recipe, there are less than two predictors remaining. `rfe` ", 1430 "requires at least two.", 1431 call. = FALSE 1432 ) 1433 1434 if (length(sizes) == 0) 1435 stop( 1436 "After the recipe, there are only ", 1437 p, 1438 " predictors remaining. ", 1439 "The `sizes` values are inconsistent with this.", 1440 call. = FALSE 1441 ) 1442 1443 predictionMatrix <- 1444 matrix(NA, nrow = length(test_y), ncol = length(sizes)) 1445 1446 retained <- colnames(x) 1447 sizeValues <- sort(unique(c(sizes, p)), decreasing = TRUE) 1448 sizeText <- format(sizeValues) 1449 1450 finalVariables <- vector(length(sizeValues), mode = "list") 1451 for (k in seq(along = sizeValues)) { 1452 if (!any(is.na(seeds))) 1453 set.seed(seeds[k]) 1454 1455 if (rfeControl$verbose) { 1456 cat("+(rfe) fit", 1457 ifelse(label != "", 1458 label, ""), 1459 "size:", 1460 sizeText[k], 1461 "\n") 1462 } 1463 flush.console() 1464 fitObject <- 1465 rfeControl$functions$fit( 1466 x[, retained, drop = FALSE], y, 1467 first = p == ncol(x[, retained, drop = FALSE]), 1468 last = FALSE, 1469 ... 1470 ) 1471 if (rfeControl$verbose) { 1472 cat("-(rfe) fit", 1473 ifelse(label != "", 1474 label, ""), 1475 "size:", 1476 sizeText[k], 1477 "\n") 1478 } 1479 modelPred <- 1480 rfeControl$functions$pred(fitObject, test_x[, retained, drop = FALSE]) 1481 if (is.data.frame(modelPred) | is.matrix(modelPred)) { 1482 if (is.matrix(modelPred)) { 1483 modelPred <- as.data.frame(modelPred, stringsAsFactors = TRUE) 1484 ## in the case where the function returns a matrix with a single column 1485 ## make sure that it is named pred 1486 if (ncol(modelPred) == 1) 1487 names(modelPred) <- "pred" 1488 } 1489 modelPred$obs <- test_y 1490 modelPred$Variables <- sizeValues[k] 1491 } else 1492 modelPred <- 1493 data.frame(pred = modelPred, 1494 obs = test_y, 1495 Variables = sizeValues[k]) 1496 ## save as a vector and rbind at end 1497 rfePred <- if (k == 1) 1498 modelPred 1499 else 1500 rbind(rfePred, modelPred) 1501 1502 1503 if (!exists("modImp")) { 1504 ##todo: get away from this since it finds object in other spaces 1505 1506 if (rfeControl$verbose){ 1507 cat("+(rfe) imp", 1508 ifelse(label != "", 1509 label, ""), "\n") 1510 } 1511 modImp <- 1512 rfeControl$functions$rank(fitObject, x[, retained, drop = FALSE], y) 1513 if (rfeControl$verbose){ 1514 cat("-(rfe) imp", 1515 ifelse(label != "", 1516 label, ""), "\n") 1517 } 1518 } else { 1519 if (rfeControl$rerank){ 1520 if (rfeControl$verbose){ 1521 cat("+(rfe) imp", 1522 ifelse(label != "", 1523 label, ""), 1524 "size:", 1525 sizeText[k], 1526 "\n") 1527 } 1528 modImp <- 1529 rfeControl$functions$rank(fitObject, x[, retained, drop = FALSE], y) 1530 if (rfeControl$verbose){ 1531 cat("-(rfe) imp", 1532 ifelse(label != "", 1533 label, ""), 1534 "size:", 1535 sizeText[k], 1536 "\n") 1537 } 1538 } 1539 } 1540 1541 if (nrow(modImp) < sizeValues[k]) { 1542 msg1 <- paste0( 1543 "rfe is expecting ", 1544 sizeValues[k], 1545 " importance values but only has ", 1546 nrow(modImp), 1547 ". ", 1548 "This may be caused by having zero-variance predictors, ", 1549 "excessively-correlated predictors, factor predictors ", 1550 "that were expanded into dummy variables or you may have ", 1551 "failed to drop one of your dummy variables." 1552 ) 1553 warning(msg1, call. = FALSE) 1554 modImp <- repair_rank(modImp, colnames(x)) 1555 } 1556 if (any(!complete.cases(modImp))) { 1557 warning( 1558 paste( 1559 "There were missing importance values.", 1560 "There may be linear dependencies in your predictor variables" 1561 ), 1562 call. = FALSE 1563 ) 1564 } 1565 finalVariables[[k]] <- subset(modImp, var %in% retained) 1566 finalVariables[[k]]$Variables <- sizeValues[[k]] 1567 if (k < length(sizeValues)) 1568 retained <- as.character(modImp$var)[1:sizeValues[k + 1]] 1569 } 1570 list(finalVariables = finalVariables, pred = rfePred) 1571} 1572 1573#' @method rfe recipe 1574#' @rdname rfe 1575#' @export 1576"rfe.recipe" <- 1577 function(x, 1578 data, 1579 sizes = 2 ^ (2:4), 1580 metric = NULL, 1581 maximize = NULL, 1582 rfeControl = rfeControl(), 1583 ...) { 1584 startTime <- proc.time() 1585 funcCall <- match.call(expand.dots = TRUE) 1586 if (!("caret" %in% loadedNamespaces())) 1587 loadNamespace("caret") 1588 1589 ################################################################### 1590 1591 if(rfeControl$verbose) 1592 cat("Preparing recipe\n") 1593 1594 trained_rec <- prep(x, training = data, 1595 fresh = TRUE, 1596 retain = TRUE, 1597 verbose = FALSE, 1598 stringsAsFactors = TRUE) 1599 x_dat <- juice(trained_rec, all_predictors(), composition = "data.frame") 1600 y_dat <- juice(trained_rec, all_outcomes(), composition = "data.frame") 1601 if(ncol(y_dat) > 1) 1602 stop("`rfe` doesn't support multivariate outcomes", call. = FALSE) 1603 y_dat <- y_dat[[1]] 1604 is_weight <- summary(trained_rec)$role == "case weight" 1605 if(any(is_weight)) 1606 stop("`rfe` does not allow for weights.", call. = FALSE) 1607 1608 is_perf <- summary(trained_rec)$role == "performance var" 1609 if(any(is_perf)) { 1610 perf_data <- juice(trained_rec, has_role("performance var")) 1611 } else perf_data <- NULL 1612 1613 p <- ncol(x_dat) 1614 classLevels <- levels(y_dat) 1615 1616 # now do default metrics: 1617 if (is.null(metric)) 1618 metric <- ifelse(is.factor(y_dat), "Accuracy", "RMSE") 1619 1620 maximize <- 1621 ifelse(metric %in% c("RMSE", "MAE", "logLoss"), FALSE, TRUE) # TODO make a function 1622 1623 1624 1625 if (is.null(rfeControl$index)) 1626 rfeControl$index <- switch( 1627 tolower(rfeControl$method), 1628 cv = createFolds(y_dat, rfeControl$number, returnTrain = TRUE), 1629 repeatedcv = createMultiFolds(y_dat, rfeControl$number, rfeControl$repeats), 1630 loocv = createFolds(y_dat, length(y_dat), returnTrain = TRUE), 1631 boot = , 1632 boot632 = createResample(y_dat, rfeControl$number), 1633 test = createDataPartition(y_dat, 1, rfeControl$p), 1634 lgocv = createDataPartition(y_dat, rfeControl$number, rfeControl$p) 1635 ) 1636 1637 if (is.null(names(rfeControl$index))) 1638 names(rfeControl$index) <- prettySeq(rfeControl$index) 1639 if (is.null(rfeControl$indexOut)) { 1640 rfeControl$indexOut <- lapply(rfeControl$index, 1641 function(training, allSamples) 1642 allSamples[-unique(training)], 1643 allSamples = seq(along = y_dat)) 1644 names(rfeControl$indexOut) <- prettySeq(rfeControl$indexOut) 1645 } 1646 1647 sizes <- sort(unique(sizes)) 1648 if (any(sizes > p)) 1649 warning("For the training set, the recipe generated fewer predictors ", 1650 "than the ", max(sizes), " expected in `sizes` and the number ", 1651 "of subsets will be truncated to be <= ", p, ".", 1652 call. = FALSE) 1653 sizes <- sizes[sizes <= p] 1654 1655 ## check summary function and metric 1656 testOutput <- data.frame(pred = sample(y_dat, min(10, length(y_dat))), 1657 obs = sample(y_dat, min(10, length(y_dat)))) 1658 if (is.factor(y_dat)) { 1659 for (i in seq(along = classLevels)) 1660 testOutput[, classLevels[i]] <- runif(nrow(testOutput)) 1661 } 1662 if(!is.null(perf_data)) 1663 testOutput <- cbind(testOutput, perf_data) 1664 1665 1666 test <- 1667 rfeControl$functions$summary(testOutput, lev = classLevels) 1668 perfNames <- names(test) 1669 1670 if (!(metric %in% perfNames)) { 1671 warning( 1672 paste( 1673 "Metric '", 1674 metric, 1675 "' is not created by the summary function; '", 1676 perfNames[1], 1677 "' will be used instead", 1678 sep = "" 1679 ) 1680 ) 1681 metric <- perfNames[1] 1682 } 1683 1684 ## Set or check the seeds when needed 1685 totalSize <- 1686 if (any(sizes == p)) 1687 length(sizes) 1688 else 1689 length(sizes) + 1 1690 if (is.null(rfeControl$seeds)) { 1691 seeds <- vector(mode = "list", length = length(rfeControl$index)) 1692 seeds <- 1693 lapply(seeds, function(x) 1694 sample.int(n = 1000000, size = totalSize)) 1695 seeds[[length(rfeControl$index) + 1]] <- 1696 sample.int(n = 1000000, size = 1) 1697 rfeControl$seeds <- seeds 1698 } else { 1699 if (!(length(rfeControl$seeds) == 1 && is.na(rfeControl$seeds))) { 1700 ## check versus number of tasks 1701 numSeeds <- unlist(lapply(rfeControl$seeds, length)) 1702 badSeed <- 1703 (length(rfeControl$seeds) < length(rfeControl$index) + 1) || 1704 (any(numSeeds[-length(numSeeds)] < totalSize)) 1705 if (badSeed) 1706 stop( 1707 paste( 1708 "Bad seeds: the seed object should be a list of length", 1709 length(rfeControl$index) + 1, 1710 "with", 1711 length(rfeControl$index), 1712 "integer vectors of size", 1713 totalSize, 1714 "and the last list element having a", 1715 "single integer" 1716 ) 1717 ) 1718 } 1719 } 1720 1721 if (rfeControl$method == "LOOCV") { 1722 tmp <- 1723 rfe_rec_loo( 1724 rec = x, 1725 data = data, 1726 sizes = sizes, 1727 ctrl = rfeControl, 1728 lev = classLevels, 1729 ... 1730 ) 1731 selectedVars <- 1732 do.call("c", tmp$everything[names(tmp$everything) == "finalVariables"]) 1733 selectedVars <- do.call("rbind", selectedVars) 1734 externPerf <- tmp$performance 1735 } else { 1736 tmp <- 1737 rfe_rec_workflow( 1738 rec = x, 1739 data = data, 1740 sizes = sizes, 1741 ctrl = rfeControl, 1742 lev = classLevels, 1743 ... 1744 ) 1745 1746 selectedVars <- 1747 do.call("rbind", tmp$everything[names(tmp$everything) == "selectedVars"]) 1748 resamples <- 1749 do.call("rbind", tmp$everything[names(tmp$everything) == "resamples"]) 1750 rownames(resamples) <- NULL 1751 externPerf <- tmp$performance 1752 } 1753 rownames(selectedVars) <- NULL 1754 1755 ## There may be variables selected that are not generated by the recipe 1756 ## created on the traning set. 1757 1758 all_var <- as.character(unique(selectedVars$var)) 1759 x_names <- colnames(x_dat) 1760 orphans <- all_var[!(all_var %in% x_names)] 1761 1762 externPerf <- subset(externPerf, Variables <= length(x_names)) 1763 1764 numResamples <- length(rfeControl$index) 1765 bestSubset <- 1766 rfeControl$functions$selectSize( 1767 x = subset(externPerf, Num_Resamples >= floor(.5*numResamples)), 1768 metric = metric, 1769 maximize = maximize 1770 ) 1771 1772 bestVar <- 1773 rfeControl$functions$selectVar(subset(selectedVars, var %in% x_names), bestSubset) 1774 # In case of orpahns: 1775 bestVar <- bestVar[!is.na(bestVar)] 1776 bestSubset <- length(bestVar) 1777 1778 finalTime <- 1779 system.time( 1780 fit <- rfeControl$functions$fit( 1781 x_dat[, bestVar, drop = FALSE], 1782 y_dat, 1783 first = FALSE, 1784 last = TRUE, 1785 ... 1786 ) 1787 ) 1788 1789 if (is.factor(y_dat) & any(names(tmp$performance) == ".cell1")) { 1790 keepers <- 1791 c("Resample", 1792 "Variables", 1793 grep("\\.cell", names(tmp$performance), value = TRUE)) 1794 resampledCM <- 1795 subset(tmp$performance, Variables == bestSubset) 1796 tmp$performance <- 1797 tmp$performance[,-grep("\\.cell", names(tmp$performance))] 1798 } else 1799 resampledCM <- NULL 1800 1801 if (!(rfeControl$method %in% c("LOOCV"))) { 1802 resamples <- switch( 1803 rfeControl$returnResamp, 1804 none = NULL, 1805 all = resamples, 1806 final = subset(resamples, Variables == bestSubset) 1807 ) 1808 } else 1809 resamples <- NULL 1810 1811 endTime <- proc.time() 1812 times <- list(everything = endTime - startTime, 1813 final = finalTime) 1814 1815 ######################################################################### 1816 ## Now, based on probability or static ranking, figure out the best vars 1817 ## and the best subset size and fit final model 1818 1819 out <- structure( 1820 list( 1821 pred = if (rfeControl$saveDetails) 1822 do.call("rbind", tmp$everything[names(tmp$everything) == "predictions"]) 1823 else 1824 NULL, 1825 variables = selectedVars, 1826 results = as.data.frame(externPerf, stringsAsFactors = FALSE), 1827 bestSubset = bestSubset, 1828 fit = fit, 1829 optVariables = bestVar, 1830 optsize = bestSubset, 1831 call = funcCall, 1832 control = rfeControl, 1833 resample = resamples, 1834 metric = metric, 1835 maximize = maximize, 1836 perfNames = perfNames, 1837 times = times, 1838 resampledCM = resampledCM, 1839 obsLevels = classLevels, 1840 dots = list(...), 1841 recipe = trained_rec 1842 ), 1843 class = "rfe" 1844 ) 1845 if (rfeControl$timingSamps > 0) { 1846 out$times$prediction <- 1847 system.time(predict(out, x_dat[1:min(nrow(x_dat), rfeControl$timingSamps), , drop = FALSE])) 1848 } else 1849 out$times$prediction <- rep(NA, 3) 1850 out 1851 } 1852 1853 1854rfe_rec_workflow <- function(rec, data, sizes, ctrl, lev, ...) { 1855 loadNamespace("caret") 1856 loadNamespace("recipes") 1857 1858 resampleIndex <- ctrl$index 1859 if (ctrl$method %in% c("boot632")) { 1860 resampleIndex <- c(list("AllData" = rep(0, nrow(data))), resampleIndex) 1861 ctrl$indexOut <- 1862 c(list("AllData" = rep(0, nrow(data))), ctrl$indexOut) 1863 } 1864 1865 `%op%` <- getOper(ctrl$allowParallel && foreach::getDoParWorkers() > 1) 1866 result <- 1867 foreach( 1868 iter = seq(along = resampleIndex), 1869 .combine = "c", 1870 .verbose = FALSE, 1871 .errorhandling = "stop", 1872 .packages = "caret" 1873 ) %op% { 1874 loadNamespace("caret") 1875 requireNamespace("plyr") 1876 requireNamespace("methods") 1877 loadNamespace("recipes") 1878 1879 if (names(resampleIndex)[iter] != "AllData") { 1880 modelIndex <- resampleIndex[[iter]] 1881 holdoutIndex <- ctrl$indexOut[[iter]] 1882 } else { 1883 modelIndex <- 1:nrow(data) 1884 holdoutIndex <- modelIndex 1885 } 1886 1887 seeds <- 1888 if (!(length(ctrl$seeds) == 1 && 1889 is.na(ctrl$seeds))) 1890 ctrl$seeds[[iter]] else 1891 NA 1892 1893 if (ctrl$verbose) 1894 cat("+(rfe)", 1895 names(resampleIndex)[iter], 1896 "recipe", 1897 "\n") 1898 1899 trained_rec <- prep( 1900 rec, training = data[modelIndex,,drop = FALSE], fresh = TRUE, 1901 verbose = FALSE, stringsAsFactors = TRUE, 1902 retain = TRUE 1903 ) 1904 1905 x <- juice(trained_rec, all_predictors(), composition = "data.frame") 1906 y <- juice(trained_rec, all_outcomes())[[1]] 1907 test_x <- bake( 1908 trained_rec, 1909 new_data = data[-modelIndex, , drop = FALSE], 1910 all_predictors(), 1911 composition = "data.frame" 1912 ) 1913 test_y <- bake( 1914 trained_rec, 1915 new_data = data[-modelIndex, , drop = FALSE], 1916 all_outcomes() 1917 )[[1]] 1918 1919 is_perf <- summary(trained_rec)$role == "performance var" 1920 if(any(is_perf)) { 1921 test_perf <- bake( 1922 trained_rec, 1923 new_data = data[-modelIndex, , drop = FALSE], 1924 has_role("performance var"), 1925 composition = "data.frame" 1926 ) 1927 } else test_perf <- NULL 1928 1929 p <- ncol(x) 1930 1931 if(length(sizes) > 0 && max(sizes) > p) 1932 sizes <- sizes[sizes <= p] 1933 1934 if (all(sizes < 2)) 1935 stop( 1936 "After the recipe, there are less than two predictors remaining. `rfe` ", 1937 "requires at least two.", 1938 call. = FALSE 1939 ) 1940 1941 if (length(sizes) == 0) 1942 stop( 1943 "After the recipe, there are only ", 1944 p, 1945 " predictors remaining. ", 1946 "The `sizes` values are inconsistent with this.", 1947 call. = FALSE 1948 ) 1949 1950 if (ctrl$verbose) 1951 cat("-(rfe)", 1952 names(resampleIndex)[iter], 1953 "recipe", 1954 "\n") 1955 1956 rfeResults <- rfe_rec( 1957 x, y, 1958 test_x, test_y, 1959 test_perf, 1960 sizes, ctrl, 1961 label = names(resampleIndex)[iter], 1962 seeds = seeds, 1963 ... 1964 ) 1965 resamples <- 1966 plyr::ddply(rfeResults$pred, 1967 .(Variables), 1968 ctrl$functions$summary, 1969 lev = lev) 1970 1971 if (ctrl$saveDetails) { 1972 rfeResults$pred$Resample <- names(resampleIndex)[iter] 1973 ## If the user did not have nrow(x) in 'sizes', rfeIter added it. 1974 ## So, we need to find out how many set of predictions there are: 1975 nReps <- length(table(rfeResults$pred$Variables)) 1976 rfeResults$pred$rowIndex <- 1977 rep(seq(along = y)[unique(holdoutIndex)], nReps) 1978 } 1979 1980 if (is.factor(y) && length(lev) <= 50) { 1981 cells <- 1982 plyr::ddply(rfeResults$pred, .(Variables), function(x) 1983 flatTable(x$pred, x$obs)) 1984 resamples <- merge(resamples, cells) 1985 } 1986 1987 resamples$Resample <- names(resampleIndex)[iter] 1988 vars <- do.call("rbind", rfeResults$finalVariables) 1989 vars$Resample <- names(resampleIndex)[iter] 1990 list( 1991 resamples = resamples, 1992 selectedVars = vars, 1993 predictions = if (ctrl$saveDetails) 1994 rfeResults$pred else NULL 1995 ) 1996 } 1997 1998 resamples <- 1999 do.call("rbind", result[names(result) == "resamples"]) 2000 rownames(resamples) <- NULL 2001 2002 if (ctrl$method %in% c("boot632")) { 2003 perfNames <- names(resamples) 2004 perfNames <- 2005 perfNames[!(perfNames %in% c("Resample", "Variables"))] 2006 perfNames <- perfNames[!grepl("^cell[0-9]", perfNames)] 2007 apparent <- subset(resamples, Resample == "AllData") 2008 apparent <- 2009 apparent[, !grepl("^\\.cell|Resample", colnames(apparent)), drop = FALSE] 2010 names(apparent)[which(names(apparent) %in% perfNames)] <- 2011 paste(names(apparent)[which(names(apparent) %in% perfNames)], 2012 "Apparent", sep = "") 2013 names(apparent) <- gsub("^\\.", "", names(apparent)) 2014 resamples <- subset(resamples, Resample != "AllData") 2015 } 2016 2017 externPerf <- 2018 plyr::ddply(resamples[, !grepl("\\.cell|Resample", colnames(resamples)), drop = FALSE], 2019 .(Variables), 2020 MeanSD, 2021 exclude = "Variables") 2022 numVars <- 2023 plyr::ddply(resamples[, !grepl("\\.cell|Resample", colnames(resamples)), drop = FALSE], 2024 .(Variables), 2025 function(x) c(Num_Resamples = nrow(x))) 2026 2027 externPerf <- merge(externPerf, numVars, by = "Variables", all = TRUE) 2028 externPerf <- externPerf[order(externPerf$Variables),, drop = FALSE] 2029 2030 if (ctrl$method %in% c("boot632")) { 2031 externPerf <- merge(externPerf, apparent) 2032 for (p in seq(along = perfNames)) { 2033 const <- 1 - exp(-1) 2034 externPerf[, perfNames[p]] <- 2035 (const * externPerf[, perfNames[p]]) + ((1 - const) * externPerf[, paste(perfNames[p], "Apparent", sep = "")]) 2036 } 2037 externPerf <- 2038 externPerf[,!(names(externPerf) %in% paste(perfNames, "Apparent", sep = ""))] 2039 } 2040 list(performance = externPerf, everything = result) 2041} 2042 2043rfe_rec_loo <- function(rec, data, sizes, ctrl, lev, ...) { 2044 loadNamespace("caret") 2045 loadNamespace("recipes") 2046 2047 resampleIndex <- ctrl$index 2048 `%op%` <- getOper(ctrl$allowParallel && getDoParWorkers() > 1) 2049 result <- 2050 foreach( 2051 iter = seq(along = resampleIndex), 2052 .combine = "c", 2053 .verbose = FALSE, 2054 .errorhandling = "stop", 2055 .packages = "caret" 2056 ) %op% { 2057 2058 loadNamespace("caret") 2059 loadNamespace("recipes") 2060 2061 requireNamespaceQuietStop("methods") 2062 2063 modelIndex <- resampleIndex[[iter]] 2064 holdoutIndex <- -unique(resampleIndex[[iter]]) 2065 2066 seeds <- 2067 if (!(length(ctrl$seeds) == 1 && 2068 is.na(ctrl$seeds))) 2069 ctrl$seeds[[iter]] else NA 2070 if(ctrl$verbose) 2071 cat("Preparing recipe\n") 2072 trained_rec <- prep( 2073 rec, training = data[modelIndex,,drop = FALSE], fresh = TRUE, 2074 verbose = FALSE, stringsAsFactors = TRUE, 2075 retain = TRUE 2076 ) 2077 2078 x <- juice(trained_rec, all_predictors(), composition = "data.frame") 2079 y <- juice(trained_rec, all_outcomes())[[1]] 2080 test_x <- bake( 2081 trained_rec, 2082 new_data = data[-modelIndex, , drop = FALSE], 2083 all_predictors(), 2084 composition = "data.frame" 2085 ) 2086 test_y <- bake( 2087 trained_rec, 2088 new_data = data[-modelIndex, , drop = FALSE], 2089 all_outcomes() 2090 )[[1]] 2091 2092 is_perf <- summary(trained_rec)$role == "performance var" 2093 if(any(is_perf)) { 2094 test_perf <- bake( 2095 trained_rec, 2096 new_data = data[-modelIndex, , drop = FALSE], 2097 has_role("performance var"), 2098 composition = "data.frame" 2099 ) 2100 } else test_perf <- NULL 2101 2102 p <- ncol(x) 2103 2104 if(length(sizes) > 0 && max(sizes) > p) 2105 sizes <- sizes[sizes <= p] 2106 2107 if (all(sizes < 2)) 2108 stop( 2109 "After the recipe, there are less than two predictors remaining. `rfe` ", 2110 "requires at least two.", 2111 call. = FALSE 2112 ) 2113 2114 if (length(sizes) == 0) 2115 stop( 2116 "After the recipe, there are only ", 2117 p, 2118 " predictors remaining. ", 2119 "The `sizes` values are inconsistent with this.", 2120 call. = FALSE 2121 ) 2122 2123 rfeResults <- rfe_rec( 2124 x, y, 2125 test_x, test_y, 2126 test_perf, 2127 sizes, ctrl, 2128 label = names(resampleIndex)[iter], 2129 seeds = seeds, 2130 ... 2131 ) 2132 rfeResults 2133 } 2134 preds <- do.call("rbind", result[names(result) == "pred"]) 2135 resamples <- 2136 ddply(preds, .(Variables), ctrl$functions$summary, lev = lev) 2137 list(performance = resamples, everything = result) 2138} 2139 2140 2141 2142