1#' Lift Plot 2#' 3#' For classification models, this function creates a 'lift plot' that 4#' describes how well a model ranks samples for one class 5#' 6#' \code{lift.formula} is used to process the data and \code{xyplot.lift} is 7#' used to create the plot. 8#' 9#' To construct data for the the lift and gain plots, the following steps are 10#' used for each model: 11#' 12#' \enumerate{ \item The data are ordered by the numeric model prediction used 13#' on the right-hand side of the model formula \item Each unique value of the 14#' score is treated as a cut point \item The number of samples with true 15#' results equal to \code{class} are determined \item The lift is calculated as 16#' the ratio of the percentage of samples in each split corresponding to 17#' \code{class} over the same percentage in the entire data set} \code{lift} 18#' with \code{plot = "gain"} produces a plot of the cumulative lift values by 19#' the percentage of samples evaluated while \code{plot = "lift"} shows the cut 20#' point value versus the lift statistic. 21#' 22#' This implementation uses the \pkg{lattice} function 23#' \code{\link[lattice:xyplot]{xyplot}}, so plot elements can be changed via 24#' panel functions, \code{\link[lattice:trellis.par.get]{trellis.par.set}} or 25#' other means. \code{lift} uses the panel function \code{\link{panel.lift2}} 26#' by default, but it can be changes using 27#' \code{\link[lattice:update.trellis]{update.trellis}} (see the examples in 28#' \code{\link{panel.lift2}}). 29#' 30#' The following elements are set by default in the plot but can be changed by 31#' passing new values into \code{xyplot.lift}: \code{xlab = "\% Samples 32#' Tested"}, \code{ylab = "\% Samples Found"}, \code{type = "S"}, \code{ylim = 33#' extendrange(c(0, 100))} and \code{xlim = extendrange(c(0, 100))}. 34#' 35#' @aliases lift lift.formula lift.default xyplot.lift 36#' @param x a \code{lattice} formula (see \code{\link[lattice:xyplot]{xyplot}} 37#' for syntax) where the left-hand side of the formula is a factor class 38#' variable of the observed outcome and the right-hand side specifies one or 39#' model columns corresponding to a numeric ranking variable for a model (e.g. 40#' class probabilities). The classification variable should have two levels. 41#' @param data For \code{lift.formula}, a data frame (or more precisely, 42#' anything that is a valid \code{envir} argument in \code{eval}, e.g., a list 43#' or an environment) containing values for any variables in the formula, as 44#' well as \code{groups} and \code{subset} if applicable. If not found in 45#' \code{data}, or if \code{data} is unspecified, the variables are looked for 46#' in the environment of the formula. This argument is not used for 47#' \code{xyplot.lift} or \code{ggplot.lift}. 48#' @param class a character string for the class of interest 49#' @param subset An expression that evaluates to a logical or integer indexing 50#' vector. It is evaluated in \code{data}. Only the resulting rows of 51#' \code{data} are used for the plot. 52#' @param lattice.options A list that could be supplied to 53#' \code{\link[lattice:lattice.options]{lattice.options}} 54#' @param cuts If a single value is given, a sequence of values between 0 and 1 55#' are created with length \code{cuts}. If a vector, these values are used as 56#' the cuts. If \code{NULL}, each unique value of the model prediction is used. 57#' This is helpful when the data set is large. 58#' @param labels A named list of labels for keys. The list should have an 59#' element for each term on the right-hand side of the formula and the names 60#' should match the names of the models. 61#' @param plot Either "gain" (the default) or "lift". The former plots the 62#' number of samples called events versus the event rate while the latter shows 63#' the event cut-off versus the lift statistic. 64#' @param values A vector of numbers between 0 and 100 specifying reference 65#' values for the percentage of samples found (i.e. the y-axis). Corresponding 66#' points on the x-axis are found via interpolation and line segments are shown 67#' to indicate how many samples must be tested before these percentages are 68#' found. The lines use either the \code{plot.line} or \code{superpose.line} 69#' component of the current lattice theme to draw the lines (depending on 70#' whether groups were used. These values are only used when \code{type = 71#' "gain"}. 72#' @param \dots options to pass through to \code{\link[lattice:xyplot]{xyplot}} 73#' or the panel function (not used in \code{lift.formula}). 74#' @return \code{lift.formula} returns a list with elements: \item{data}{the 75#' data used for plotting} \item{cuts}{the number of cuts} \item{class}{the 76#' event class} \item{probNames}{the names of the model probabilities} 77#' \item{pct}{the baseline event rate} 78#' 79#' \code{xyplot.lift} returns a \pkg{lattice} object 80#' @author Max Kuhn, some \pkg{lattice} code and documentation by Deepayan 81#' Sarkar 82#' @seealso \code{\link[lattice:xyplot]{xyplot}}, 83#' \code{\link[lattice:trellis.par.get]{trellis.par.set}} 84#' @keywords hplot 85#' @examples 86#' 87#' set.seed(1) 88#' simulated <- data.frame(obs = factor(rep(letters[1:2], each = 100)), 89#' perfect = sort(runif(200), decreasing = TRUE), 90#' random = runif(200)) 91#' 92#' lift1 <- lift(obs ~ random, data = simulated) 93#' lift1 94#' xyplot(lift1) 95#' 96#' lift2 <- lift(obs ~ random + perfect, data = simulated) 97#' lift2 98#' xyplot(lift2, auto.key = list(columns = 2)) 99#' 100#' xyplot(lift2, auto.key = list(columns = 2), value = c(10, 30)) 101#' 102#' xyplot(lift2, plot = "lift", auto.key = list(columns = 2)) 103#' 104#' @export lift 105lift <- function(x, ...) UseMethod("lift") 106 107#' @rdname lift 108#' @method lift default 109#' @export 110lift.default <- function(x, ...) stop("'x' should be a formula") 111 112#' @rdname lift 113#' @method lift formula 114#' @export 115lift.formula <- function(x, data = NULL, 116 class = NULL, 117 subset = TRUE, 118 lattice.options = NULL, 119 cuts = NULL, 120 labels = NULL, ...) 121{ 122 123 if (!is.null(lattice.options)) { 124 oopt <- lattice.options(lattice.options) 125 on.exit(lattice.options(oopt), add = TRUE) 126 } 127 128 formula <- x 129 groups <- NULL 130 subset <- eval(substitute(subset), data, environment(x)) 131 132 form <- latticeParseFormula(formula, data, subset = subset, 133 groups = groups, multiple = TRUE, outer = TRUE, 134 subscripts = TRUE, drop = TRUE) 135 liftData <- data.frame(prob = form$y) 136 probNames <- strsplit(form$right.name, " + ", fixed = TRUE)[[1]] 137 138 if(!is.null(labels)) { 139 if(length(labels) != length(probNames)) 140 stop("labels should have an element for each term on the rhs of the formula") 141 if(!all(probNames %in% names(labels))) 142 stop(paste("labels should be a named vector or list with names:", 143 paste(probNames, collapse = ", "))) 144 } 145 146 liftData <- data.frame(liftClassVar = rep(form$left, length(probNames)), 147 liftProbVar = form$right) 148 liftData$liftModelVar <- if(length(probNames) > 1) form$condition[[length(form$condition)]] else probNames 149 150 if(length(form$condition) > 0 && any(names(form$condition) != "")) { 151 ind <- sum(names(form$condition) != "") 152 tmp <- as.data.frame(form$condition[1:ind], stringsAsFactors = TRUE) 153 liftData <- cbind(liftData, tmp) 154 } 155 if(!is.factor(liftData$liftClassVar)) 156 stop("the left-hand side of the formula must be a factor of classes") 157 158 splitVars <- names(liftData)[!(names(liftData) %in% c("liftClassVar", "liftProbVar"))] 159 160 if(is.null(class)) class <- levels(liftData$liftClassVar)[1] 161 plotData <- ddply(liftData, splitVars, liftCalc, class = class, cuts = cuts) 162 if(!is.null(labels)) { 163 plotData$originalName <- plotData$liftModelVar 164 plotData$liftModelVar <- as.character(plotData$liftModelVar) 165 for(i in seq(along = labels)) plotData$liftModelVar[plotData$liftModelVar == names(labels)[i]] <- labels[i] 166 plotData$liftModelVar <- factor(plotData$liftModelVar, 167 levels = labels) 168 } 169 out <- list(data = plotData, class = class, probNames = probNames, 170 pct = mean(liftData$liftClassVar == class)*100, call = match.call()) 171 class(out) <- "lift" 172 out 173} 174 175#' @rdname lift 176#' @method print lift 177#' @export 178print.lift <- function(x, ...) { 179 printCall(x$call) 180 cat("Models:", paste(unique(x$data$liftModelVar), collapse = ", "), "\n") 181 cat("Event: ", x$class, " (", round( x$pct, 1), "%)\n", sep = "") 182 invisible(x) 183} 184 185#' @method plot lift 186#' @export 187plot.lift <- function(x, y = NULL, ...) xyplot.lift(x = x, data = NULL, ...) 188 189#' @rdname lift 190#' @method xyplot lift 191#' @importFrom stats as.formula 192#' @importFrom grDevices extendrange 193#' @export 194xyplot.lift <- function(x, data = NULL, plot = "gain", values = NULL, ...){ 195 if(!(plot %in% c("lift", "gain"))) 196 stop("`plot`` should be either 'lift' or 'gain'", call. = FALSE) 197 if(plot == "gain") { 198 lFormula <- "CumEventPct ~ CumTestedPct" 199 rng <- extendrange(c(0, 100)) 200 opts <- list(...) 201 if(!any(names(opts) == "xlab")) opts$xlab <- "% Samples Tested" 202 if(!any(names(opts) == "ylab")) opts$ylab <- "% Samples Found" 203 if(!any(names(opts) == "type")) opts$type <- "l" 204 if(!any(names(opts) == "ylim")) opts$ylim <- rng 205 if(!any(names(opts) == "xlim")) opts$xlim <- rng 206 if(!any(names(opts) == "panel")) opts$panel <- panel.lift2 207 } else { 208 lFormula <- "lift ~ cuts" 209 x$data <- x$data[order(x$data$liftModelVar, x$data$cuts),] 210 rng <- extendrange(c(0, 100)) 211 opts <- list(...) 212 if(!any(names(opts) == "xlab")) opts$xlab <- "Cut-Off" 213 if(!any(names(opts) == "ylab")) opts$ylab <- "Lift" 214 if(!any(names(opts) == "type")) opts$type <- "l" 215 } 216 args <- list(x = as.formula(lFormula), 217 data = x$data, 218 pct = x$pc, 219 values = values) 220 if(length(x$probNames) > 1) args$groups <- x$data$liftModelVar 221 222 args <- c(args, opts) 223 do.call("xyplot", args) 224} 225 226#' @importFrom stats complete.cases 227liftCalc <- function(x, class = levels(x$liftClassVar)[1], cuts = NULL) { 228 x <- x[complete.cases(x),] 229 lvl <- levels(x$liftClassVar) 230 x <- x[order(x$liftProbVar, decreasing = TRUE),] 231 232 nEvents <- sum(x$liftClassVar == class) 233 baseline <- mean(x$liftClassVar == class) 234 if(!is.null(cuts)) { 235 if(length(cuts) == 1) { 236 cuts <- rev(seq(0, 1, length = cuts)) 237 } else { 238 cuts <- unique(c(1, sort(cuts, decreasing = TRUE), 0)) 239 } 240 } else { 241 cuts <- sort(unique(x$liftProbVar), decreasing = TRUE) 242 cuts <- unique(c(1, sort(cuts, decreasing = TRUE), 0)) 243 } 244 245 class2 <- levels(x$liftClassVar) 246 class2 <- class2[class2 != class] 247 tmp <- data.frame(cuts = cuts, 248 events = NA, 249 n = NA, 250 Sn = NA, 251 Sp = NA) 252 for(i in seq(along = cuts)) { 253 sub <- x$liftClassVar[x$liftProbVar >= tmp$cuts[i]] 254 tmp$n[i] <- length(sub) 255 tmp$events[i] <- sum(sub == class) 256 prd <- factor(ifelse(x$liftProbVar >= tmp$cuts[i], class, class2), 257 levels = levels(x$liftClassVar)) 258 tmp$Sn[i] <- sensitivity(prd, 259 x$liftClassVar, 260 positive = class) 261 tmp$Sp[i] <- specificity(prd, 262 x$liftClassVar, 263 negative = class2) 264 } 265 266 tmp$EventPct <- ifelse(tmp$n > 0, tmp$events/tmp$n*100, 0) 267 tmp$CumEventPct <- tmp$events/nEvents*100 268 tmp$lift <- tmp$events/tmp$n/baseline 269 tmp$CumTestedPct <- tmp$n/nrow(x)*100 270 tmp 271} 272 273#' @export 274panel.lift <- function(x, y, ...) { 275 panel.xyplot(x, y, ...) 276 panel.abline(0, 1, col = "black") 277} 278 279 280 281#' Lattice Panel Functions for Lift Plots 282#' 283#' Two panel functions that be used in conjunction with \code{\link{lift}}. 284#' 285#' \code{panel.lift} plots the data with a simple (black) 45 degree reference 286#' line. 287#' 288#' \code{panel.lift2} is the default for \code{\link{lift}} and plots the data 289#' points with a shaded region encompassing the space between to the random 290#' model and perfect model trajectories. The color of the region is determined 291#' by the lattice \code{reference.line} information (see example below). 292#' 293#' @aliases panel.lift panel.lift2 294#' @param x the percentage of searched to be plotted in the scatterplot 295#' @param y the percentage of events found to be plotted in the scatterplot 296#' @param pct the baseline percentage of true events in the data 297#' @param values A vector of numbers between 0 and 100 specifying reference 298#' values for the percentage of samples found (i.e. the y-axis). Corresponding 299#' points on the x-axis are found via interpolation and line segments are shown 300#' to indicate how many samples must be tested before these percentages are 301#' found. The lines use either the \code{plot.line} or \code{superpose.line} 302#' component of the current lattice theme to draw the lines (depending on 303#' whether groups were used 304#' @param \dots options to pass to 305#' \code{\link[lattice:panel.xyplot]{panel.xyplot}} 306#' @author Max Kuhn 307#' @seealso \code{\link{lift}}, 308#' \code{\link[lattice:panel.xyplot]{panel.xyplot}}, 309#' \code{\link[lattice:xyplot]{xyplot}}, 310#' \link[lattice:trellis.par.get]{trellis.par.set} 311#' @keywords hplot 312#' @examples 313#' 314#' set.seed(1) 315#' simulated <- data.frame(obs = factor(rep(letters[1:2], each = 100)), 316#' perfect = sort(runif(200), decreasing = TRUE), 317#' random = runif(200)) 318#' 319#' regionInfo <- trellis.par.get("reference.line") 320#' regionInfo$col <- "lightblue" 321#' trellis.par.set("reference.line", regionInfo) 322#' 323#' lift2 <- lift(obs ~ random + perfect, data = simulated) 324#' lift2 325#' xyplot(lift2, auto.key = list(columns = 2)) 326#' 327#' ## use a different panel function 328#' xyplot(lift2, panel = panel.lift) 329#' 330#' @export panel.lift2 331panel.lift2 <- function (x, y, pct = 0, values = NULL, ...) { 332 polyx <- c(0, pct, 100, 0) 333 polyy <- c(0, 100, 100, 0) 334 regionStyle <- trellis.par.get("reference.line") 335 336 panel.polygon(polyx, polyy, 337 col = regionStyle$col, 338 border = regionStyle$col) 339 panel.xyplot(x, y, ...) 340 if(!is.null(values)){ 341 theDots <- list(...) 342 if(any(names(theDots) == "groups")) { 343 dat <- data.frame(x = x, y = y, groups = theDots$groups) 344 ung <- unique(dat$groups) 345 for(i in seq(along = ung)) { 346 dat0 <- subset(dat, groups == ung[i]) 347 plotRef(dat0$x, dat0$y, values, iter = i) 348 } 349 350 } else plotRef(x, y, values) 351 352 } 353} 354 355#' @importFrom stats approx 356plotRef <- function(x, y, v, iter = 0) { 357 if(iter == 0) { 358 lineStyle <- trellis.par.get("plot.line") 359 } else { 360 lineStyle <- trellis.par.get("superpose.line") 361 lineStyle <- lapply(lineStyle, function(x, i) x[min(length(x), i)], i = iter) 362 } 363 tmp_dat <- data.frame(CumTestedPct = x, 364 CumEventPct = y) 365 ref_values <- get_ref_point(tmp_dat, v = v) 366 ref_values <- ref_values[!is.na(ref_values$CumTestedPct), ] 367 if(nrow(ref_values) > 0) { 368 for(i in 1:nrow(ref_values)) { 369 panel.segments(x0 = ref_values$CumTestedPct[i], 370 x1 = ref_values$CumTestedPct[i], 371 y0 = 0, 372 y1 = ref_values$CumEventPct[i], 373 lty = lineStyle$lty, col = lineStyle$col, 374 alpha = lineStyle$alpha, lwd = lineStyle$lwd) 375 panel.segments(x0 = 0, 376 x1 = ref_values$CumTestedPct[i], 377 y0 = ref_values$CumEventPct[i], 378 y1 = ref_values$CumEventPct[i], 379 lty = lineStyle$lty, col = lineStyle$col, 380 alpha = lineStyle$alpha, lwd = lineStyle$lwd) 381 } 382 } 383} 384 385 386 387utils::globalVariables(c("CumEventPct", "CumTestedPct", 388 "cuts", "x1", "x2", "y1", "y2")) 389#' @rdname lift 390#' @param mapping,environment Not used (required for \code{ggplot} consistency). 391#' @method ggplot lift 392#' @export 393ggplot.lift <- function (data = NULL, mapping = NULL, plot = "gain", values = NULL, ..., 394 environment = NULL) { 395 if(!(plot %in% c("lift", "gain"))) 396 stop("`plot`` should be either 'lift' or 'gain'", call. = FALSE) 397 names(data$data)[names(data$data) == "liftModelVar"] <- "Model" 398 nmod <- length(unique(data$data$Model)) 399 if(plot == "gain") { 400 lines1 <- data.frame(x1 = 0, x2 = 100, y1 = 0, y2 = 100) 401 lines2 <- data.frame(x1 = 0, x2 = data$pct, y1 = 0, y2 = 100) 402 lines3 <- data.frame(x1 = data$pct, x2 = 100, y1 = 100, y2 = 100) 403 rng <- extendrange(c(0, 100)) 404 res <- ggplot(data$data, aes(x = CumTestedPct, y = CumEventPct)) + 405 geom_segment(data = lines1, 406 aes(x = x1, y = y1, xend = x2, yend = y2), 407 alpha = .2, lty = 2) + 408 geom_segment(data = lines2, 409 aes(x = x1, y = y1, xend = x2, yend = y2), 410 alpha = .2, lty = 2) + 411 geom_segment(data = lines3, 412 aes(x = x1, y = y1, xend = x2, yend = y2), 413 alpha = .2, lty = 2) + 414 xlab("% Samples Tested") + ylab("% Samples Found") + 415 xlim(rng) + ylim(rng) 416 res <- if(nmod == 1) 417 res + geom_line() 418 else 419 res + geom_line(aes(col = Model)) 420 if(!is.null(values)) { 421 ref_values <- ddply(data$data, .(Model), get_ref_point, v = values) 422 ref_values <- ref_values[!is.na(ref_values$CumTestedPct),] 423 if(nrow(ref_values) > 0) { 424 if(nmod > 1) { 425 res <- res + 426 geom_segment(data = ref_values, 427 aes(x = CumTestedPct, y = CumEventPct, 428 xend = CumTestedPct, yend = 0, 429 color = Model))+ 430 geom_segment(data = ref_values, 431 aes(x = CumTestedPct, y = CumEventPct, 432 xend = 0, yend = CumEventPct, 433 color = Model)) 434 } else { 435 res <- res + 436 geom_segment(data = ref_values, 437 aes(x = CumTestedPct, y = CumEventPct, 438 xend = CumTestedPct, yend = 0)) + 439 geom_segment(data = ref_values, 440 aes(x = CumTestedPct, y = CumEventPct, 441 xend = 0, yend = CumEventPct)) 442 } 443 } 444 } 445 } else { 446 data$data <- data$data[!is.na(data$data$lift),] 447 res <- ggplot(data$data, aes(x = cuts, y = lift)) + 448 xlab("Cut-Off") + ylab("Lift") 449 res <- if(nmod == 1) 450 res + geom_line() 451 else 452 res + geom_line(aes(col = Model)) 453 } 454 res 455} 456 457 458get_ref_point <- function(dat, v, window = 5) { 459 x <- dat$CumTestedPct 460 y <- dat$CumEventPct 461 erx <- extendrange(x) 462 ery <- extendrange(y) 463 464 res <- data.frame(CumEventPct = v, 465 CumTestedPct = NA) 466 467 for(i in seq(along = v)) { 468 nearest <- which.min((y - v[i])^2) 469 index <- max(1, nearest - window):min(length(y), nearest + window) 470 res$CumTestedPct[i] <- 471 if (length(index) > 2) 472 approx(y[index], x[index], xout = v[i])$y 473 else 474 NA 475 } 476 res 477} 478 479