1#' Lift Plot
2#'
3#' For classification models, this function creates a 'lift plot' that
4#' describes how well a model ranks samples for one class
5#'
6#' \code{lift.formula} is used to process the data and \code{xyplot.lift} is
7#' used to create the plot.
8#'
9#' To construct data for the the lift and gain plots, the following steps are
10#' used for each model:
11#'
12#' \enumerate{ \item The data are ordered by the numeric model prediction used
13#' on the right-hand side of the model formula \item Each unique value of the
14#' score is treated as a cut point \item The number of samples with true
15#' results equal to \code{class} are determined \item The lift is calculated as
16#' the ratio of the percentage of samples in each split corresponding to
17#' \code{class} over the same percentage in the entire data set} \code{lift}
18#' with \code{plot = "gain"} produces a plot of the cumulative lift values by
19#' the percentage of samples evaluated while \code{plot = "lift"} shows the cut
20#' point value versus the lift statistic.
21#'
22#' This implementation uses the \pkg{lattice} function
23#' \code{\link[lattice:xyplot]{xyplot}}, so plot elements can be changed via
24#' panel functions, \code{\link[lattice:trellis.par.get]{trellis.par.set}} or
25#' other means. \code{lift} uses the panel function \code{\link{panel.lift2}}
26#' by default, but it can be changes using
27#' \code{\link[lattice:update.trellis]{update.trellis}} (see the examples in
28#' \code{\link{panel.lift2}}).
29#'
30#' The following elements are set by default in the plot but can be changed by
31#' passing new values into \code{xyplot.lift}: \code{xlab = "\% Samples
32#' Tested"}, \code{ylab = "\% Samples Found"}, \code{type = "S"}, \code{ylim =
33#' extendrange(c(0, 100))} and \code{xlim = extendrange(c(0, 100))}.
34#'
35#' @aliases lift lift.formula lift.default xyplot.lift
36#' @param x a \code{lattice} formula (see \code{\link[lattice:xyplot]{xyplot}}
37#' for syntax) where the left-hand side of the formula is a factor class
38#' variable of the observed outcome and the right-hand side specifies one or
39#' model columns corresponding to a numeric ranking variable for a model (e.g.
40#' class probabilities). The classification variable should have two levels.
41#' @param data For \code{lift.formula}, a data frame (or more precisely,
42#' anything that is a valid \code{envir} argument in \code{eval}, e.g., a list
43#' or an environment) containing values for any variables in the formula, as
44#' well as \code{groups} and \code{subset} if applicable. If not found in
45#' \code{data}, or if \code{data} is unspecified, the variables are looked for
46#' in the environment of the formula. This argument is not used for
47#' \code{xyplot.lift} or \code{ggplot.lift}.
48#' @param class a character string for the class of interest
49#' @param subset An expression that evaluates to a logical or integer indexing
50#' vector. It is evaluated in \code{data}. Only the resulting rows of
51#' \code{data} are used for the plot.
52#' @param lattice.options A list that could be supplied to
53#' \code{\link[lattice:lattice.options]{lattice.options}}
54#' @param cuts If a single value is given, a sequence of values between 0 and 1
55#' are created with length \code{cuts}. If a vector, these values are used as
56#' the cuts. If \code{NULL}, each unique value of the model prediction is used.
57#' This is helpful when the data set is large.
58#' @param labels A named list of labels for keys. The list should have an
59#' element for each term on the right-hand side of the formula and the names
60#' should match the names of the models.
61#' @param plot Either "gain" (the default) or "lift". The former plots the
62#' number of samples called events versus the event rate while the latter shows
63#' the event cut-off versus the lift statistic.
64#' @param values A vector of numbers between 0 and 100 specifying reference
65#' values for the percentage of samples found (i.e. the y-axis). Corresponding
66#' points on the x-axis are found via interpolation and line segments are shown
67#' to indicate how many samples must be tested before these percentages are
68#' found. The lines use either the \code{plot.line} or \code{superpose.line}
69#' component of the current lattice theme to draw the lines (depending on
70#' whether groups were used. These values are only used when \code{type =
71#' "gain"}.
72#' @param \dots options to pass through to \code{\link[lattice:xyplot]{xyplot}}
73#' or the panel function (not used in \code{lift.formula}).
74#' @return \code{lift.formula} returns a list with elements: \item{data}{the
75#' data used for plotting} \item{cuts}{the number of cuts} \item{class}{the
76#' event class} \item{probNames}{the names of the model probabilities}
77#' \item{pct}{the baseline event rate}
78#'
79#' \code{xyplot.lift} returns a \pkg{lattice} object
80#' @author Max Kuhn, some \pkg{lattice} code and documentation by Deepayan
81#' Sarkar
82#' @seealso \code{\link[lattice:xyplot]{xyplot}},
83#' \code{\link[lattice:trellis.par.get]{trellis.par.set}}
84#' @keywords hplot
85#' @examples
86#'
87#' set.seed(1)
88#' simulated <- data.frame(obs = factor(rep(letters[1:2], each = 100)),
89#'                         perfect = sort(runif(200), decreasing = TRUE),
90#'                         random = runif(200))
91#'
92#' lift1 <- lift(obs ~ random, data = simulated)
93#' lift1
94#' xyplot(lift1)
95#'
96#' lift2 <- lift(obs ~ random + perfect, data = simulated)
97#' lift2
98#' xyplot(lift2, auto.key = list(columns = 2))
99#'
100#' xyplot(lift2, auto.key = list(columns = 2), value = c(10, 30))
101#'
102#' xyplot(lift2, plot = "lift", auto.key = list(columns = 2))
103#'
104#' @export lift
105lift <- function(x, ...) UseMethod("lift")
106
107#' @rdname lift
108#' @method lift default
109#' @export
110lift.default <- function(x, ...) stop("'x' should be a formula")
111
112#' @rdname lift
113#' @method lift formula
114#' @export
115lift.formula <- function(x, data = NULL,
116                         class = NULL,
117                         subset = TRUE,
118                         lattice.options = NULL,
119                         cuts = NULL,
120                         labels = NULL, ...)
121{
122
123  if (!is.null(lattice.options)) {
124    oopt <- lattice.options(lattice.options)
125    on.exit(lattice.options(oopt), add = TRUE)
126  }
127
128  formula <- x
129  groups  <- NULL
130  subset <- eval(substitute(subset), data, environment(x))
131
132  form <- latticeParseFormula(formula, data, subset = subset,
133                              groups = groups, multiple = TRUE, outer = TRUE,
134                              subscripts = TRUE, drop = TRUE)
135  liftData <- data.frame(prob = form$y)
136  probNames <- strsplit(form$right.name, " + ", fixed = TRUE)[[1]]
137
138  if(!is.null(labels)) {
139    if(length(labels) != length(probNames))
140      stop("labels should have an element for each term on the rhs of the formula")
141    if(!all(probNames %in% names(labels)))
142      stop(paste("labels should be a named vector or list with names:",
143                 paste(probNames, collapse = ", ")))
144  }
145
146  liftData <- data.frame(liftClassVar = rep(form$left, length(probNames)),
147                         liftProbVar = form$right)
148  liftData$liftModelVar <- if(length(probNames) > 1) form$condition[[length(form$condition)]] else probNames
149
150  if(length(form$condition) > 0 && any(names(form$condition) != "")) {
151    ind <- sum(names(form$condition) != "")
152    tmp <- as.data.frame(form$condition[1:ind], stringsAsFactors = TRUE)
153    liftData <- cbind(liftData, tmp)
154  }
155  if(!is.factor(liftData$liftClassVar))
156    stop("the left-hand side of the formula must be a factor of classes")
157
158  splitVars <- names(liftData)[!(names(liftData) %in% c("liftClassVar", "liftProbVar"))]
159
160  if(is.null(class)) class <- levels(liftData$liftClassVar)[1]
161  plotData <- ddply(liftData, splitVars, liftCalc, class = class, cuts = cuts)
162  if(!is.null(labels)) {
163    plotData$originalName <- plotData$liftModelVar
164    plotData$liftModelVar <- as.character(plotData$liftModelVar)
165    for(i in seq(along = labels)) plotData$liftModelVar[plotData$liftModelVar == names(labels)[i]] <- labels[i]
166    plotData$liftModelVar <- factor(plotData$liftModelVar,
167                                    levels = labels)
168  }
169  out <- list(data = plotData, class = class, probNames = probNames,
170              pct =  mean(liftData$liftClassVar == class)*100, call = match.call())
171  class(out) <- "lift"
172  out
173}
174
175#' @rdname lift
176#' @method print lift
177#' @export
178print.lift <- function(x, ...) {
179  printCall(x$call)
180  cat("Models:", paste(unique(x$data$liftModelVar), collapse = ", "), "\n")
181  cat("Event: ", x$class, " (", round( x$pct, 1), "%)\n", sep = "")
182  invisible(x)
183}
184
185#' @method plot lift
186#' @export
187plot.lift <- function(x, y = NULL, ...) xyplot.lift(x = x, data = NULL, ...)
188
189#' @rdname lift
190#' @method xyplot lift
191#' @importFrom stats as.formula
192#' @importFrom grDevices extendrange
193#' @export
194xyplot.lift <- function(x, data = NULL, plot = "gain", values = NULL, ...){
195  if(!(plot %in% c("lift", "gain")))
196    stop("`plot`` should be either 'lift' or 'gain'", call. = FALSE)
197  if(plot == "gain") {
198    lFormula <- "CumEventPct ~ CumTestedPct"
199    rng <- extendrange(c(0, 100))
200    opts <- list(...)
201    if(!any(names(opts) == "xlab")) opts$xlab <- "% Samples Tested"
202    if(!any(names(opts) == "ylab")) opts$ylab <- "% Samples Found"
203    if(!any(names(opts) == "type")) opts$type <- "l"
204    if(!any(names(opts) == "ylim")) opts$ylim <- rng
205    if(!any(names(opts) == "xlim")) opts$xlim <- rng
206    if(!any(names(opts) == "panel")) opts$panel <- panel.lift2
207  } else {
208    lFormula <- "lift ~ cuts"
209    x$data <- x$data[order(x$data$liftModelVar, x$data$cuts),]
210    rng <- extendrange(c(0, 100))
211    opts <- list(...)
212    if(!any(names(opts) == "xlab")) opts$xlab <- "Cut-Off"
213    if(!any(names(opts) == "ylab")) opts$ylab <- "Lift"
214    if(!any(names(opts) == "type")) opts$type <- "l"
215  }
216  args <- list(x = as.formula(lFormula),
217               data = x$data,
218               pct = x$pc,
219               values = values)
220  if(length(x$probNames) > 1) args$groups <- x$data$liftModelVar
221
222  args <- c(args, opts)
223  do.call("xyplot", args)
224}
225
226#' @importFrom stats complete.cases
227liftCalc <- function(x, class = levels(x$liftClassVar)[1], cuts = NULL) {
228  x <- x[complete.cases(x),]
229  lvl <- levels(x$liftClassVar)
230  x <- x[order(x$liftProbVar, decreasing = TRUE),]
231
232  nEvents <- sum(x$liftClassVar == class)
233  baseline <- mean(x$liftClassVar == class)
234  if(!is.null(cuts)) {
235    if(length(cuts) == 1) {
236      cuts <- rev(seq(0, 1, length = cuts))
237    } else {
238      cuts <- unique(c(1, sort(cuts, decreasing = TRUE), 0))
239    }
240  } else {
241    cuts <- sort(unique(x$liftProbVar), decreasing = TRUE)
242    cuts <- unique(c(1, sort(cuts, decreasing = TRUE), 0))
243  }
244
245  class2 <- levels(x$liftClassVar)
246  class2 <- class2[class2 != class]
247  tmp <- data.frame(cuts = cuts,
248                    events = NA,
249                    n = NA,
250                    Sn = NA,
251                    Sp = NA)
252  for(i in seq(along = cuts)) {
253    sub <- x$liftClassVar[x$liftProbVar >= tmp$cuts[i]]
254    tmp$n[i] <- length(sub)
255    tmp$events[i] <- sum(sub == class)
256    prd <- factor(ifelse(x$liftProbVar >= tmp$cuts[i], class, class2),
257                  levels = levels(x$liftClassVar))
258    tmp$Sn[i] <- sensitivity(prd,
259                             x$liftClassVar,
260                             positive = class)
261    tmp$Sp[i] <- specificity(prd,
262                             x$liftClassVar,
263                             negative = class2)
264  }
265
266  tmp$EventPct <- ifelse(tmp$n > 0, tmp$events/tmp$n*100, 0)
267  tmp$CumEventPct <- tmp$events/nEvents*100
268  tmp$lift <- tmp$events/tmp$n/baseline
269  tmp$CumTestedPct <- tmp$n/nrow(x)*100
270  tmp
271}
272
273#' @export
274panel.lift <- function(x,  y, ...) {
275  panel.xyplot(x, y, ...)
276  panel.abline(0, 1, col = "black")
277}
278
279
280
281#' Lattice Panel Functions for Lift Plots
282#'
283#' Two panel functions that be used in conjunction with \code{\link{lift}}.
284#'
285#' \code{panel.lift} plots the data with a simple (black) 45 degree reference
286#' line.
287#'
288#' \code{panel.lift2} is the default for \code{\link{lift}} and plots the data
289#' points with a shaded region encompassing the space between to the random
290#' model and perfect model trajectories. The color of the region is determined
291#' by the lattice \code{reference.line} information (see example below).
292#'
293#' @aliases panel.lift panel.lift2
294#' @param x the percentage of searched to be plotted in the scatterplot
295#' @param y the percentage of events found to be plotted in the scatterplot
296#' @param pct the baseline percentage of true events in the data
297#' @param values A vector of numbers between 0 and 100 specifying reference
298#' values for the percentage of samples found (i.e. the y-axis). Corresponding
299#' points on the x-axis are found via interpolation and line segments are shown
300#' to indicate how many samples must be tested before these percentages are
301#' found. The lines use either the \code{plot.line} or \code{superpose.line}
302#' component of the current lattice theme to draw the lines (depending on
303#' whether groups were used
304#' @param \dots options to pass to
305#' \code{\link[lattice:panel.xyplot]{panel.xyplot}}
306#' @author Max Kuhn
307#' @seealso \code{\link{lift}},
308#' \code{\link[lattice:panel.xyplot]{panel.xyplot}},
309#' \code{\link[lattice:xyplot]{xyplot}},
310#' \link[lattice:trellis.par.get]{trellis.par.set}
311#' @keywords hplot
312#' @examples
313#'
314#' set.seed(1)
315#' simulated <- data.frame(obs = factor(rep(letters[1:2], each = 100)),
316#'                         perfect = sort(runif(200), decreasing = TRUE),
317#'                         random = runif(200))
318#'
319#' regionInfo <- trellis.par.get("reference.line")
320#' regionInfo$col <- "lightblue"
321#' trellis.par.set("reference.line", regionInfo)
322#'
323#' lift2 <- lift(obs ~ random + perfect, data = simulated)
324#' lift2
325#' xyplot(lift2, auto.key = list(columns = 2))
326#'
327#' ## use a different panel function
328#' xyplot(lift2, panel = panel.lift)
329#'
330#' @export panel.lift2
331panel.lift2 <- function (x, y, pct = 0, values = NULL, ...)  {
332  polyx <- c(0, pct, 100, 0)
333  polyy <- c(0, 100, 100, 0)
334  regionStyle <- trellis.par.get("reference.line")
335
336  panel.polygon(polyx, polyy,
337                col = regionStyle$col,
338                border = regionStyle$col)
339  panel.xyplot(x, y, ...)
340  if(!is.null(values)){
341    theDots <- list(...)
342    if(any(names(theDots) == "groups")) {
343      dat <- data.frame(x = x, y = y, groups = theDots$groups)
344      ung <- unique(dat$groups)
345      for(i in seq(along = ung))  {
346        dat0 <- subset(dat, groups == ung[i])
347        plotRef(dat0$x, dat0$y, values, iter = i)
348      }
349
350    } else plotRef(x, y, values)
351
352  }
353}
354
355#' @importFrom stats approx
356plotRef <- function(x, y, v, iter = 0) {
357  if(iter == 0) {
358    lineStyle <- trellis.par.get("plot.line")
359  } else {
360    lineStyle <- trellis.par.get("superpose.line")
361    lineStyle <- lapply(lineStyle, function(x, i) x[min(length(x), i)], i = iter)
362  }
363  tmp_dat <- data.frame(CumTestedPct = x,
364                        CumEventPct = y)
365  ref_values <- get_ref_point(tmp_dat, v = v)
366  ref_values <- ref_values[!is.na(ref_values$CumTestedPct), ]
367  if(nrow(ref_values) > 0) {
368    for(i in 1:nrow(ref_values)) {
369      panel.segments(x0 = ref_values$CumTestedPct[i],
370                     x1 = ref_values$CumTestedPct[i],
371                     y0 = 0,
372                     y1 = ref_values$CumEventPct[i],
373                     lty = lineStyle$lty, col = lineStyle$col,
374                     alpha = lineStyle$alpha, lwd = lineStyle$lwd)
375      panel.segments(x0 = 0,
376                     x1 = ref_values$CumTestedPct[i],
377                     y0 = ref_values$CumEventPct[i],
378                     y1 = ref_values$CumEventPct[i],
379                     lty = lineStyle$lty, col = lineStyle$col,
380                     alpha = lineStyle$alpha, lwd = lineStyle$lwd)
381    }
382  }
383}
384
385
386
387utils::globalVariables(c("CumEventPct", "CumTestedPct",
388                         "cuts", "x1", "x2", "y1", "y2"))
389#' @rdname lift
390#' @param mapping,environment  Not used (required for \code{ggplot} consistency).
391#' @method ggplot lift
392#' @export
393ggplot.lift <- function (data = NULL, mapping = NULL, plot = "gain", values = NULL, ...,
394                 environment = NULL) {
395  if(!(plot %in% c("lift", "gain")))
396    stop("`plot`` should be either 'lift' or 'gain'", call. = FALSE)
397  names(data$data)[names(data$data) == "liftModelVar"] <- "Model"
398  nmod <- length(unique(data$data$Model))
399  if(plot == "gain") {
400    lines1 <- data.frame(x1 = 0, x2 = 100, y1 = 0, y2 = 100)
401    lines2 <- data.frame(x1 = 0, x2 = data$pct, y1 = 0, y2 = 100)
402    lines3 <- data.frame(x1 = data$pct, x2 = 100, y1 = 100, y2 = 100)
403    rng <- extendrange(c(0, 100))
404    res <- ggplot(data$data, aes(x = CumTestedPct, y = CumEventPct)) +
405      geom_segment(data = lines1,
406                   aes(x = x1, y = y1, xend = x2, yend = y2),
407                   alpha = .2, lty = 2) +
408      geom_segment(data = lines2,
409                   aes(x = x1, y = y1, xend = x2, yend = y2),
410                   alpha = .2, lty = 2) +
411      geom_segment(data = lines3,
412                   aes(x = x1, y = y1, xend = x2, yend = y2),
413                   alpha = .2, lty = 2) +
414      xlab("% Samples Tested") + ylab("% Samples Found") +
415      xlim(rng) + ylim(rng)
416    res <- if(nmod == 1)
417      res + geom_line()
418    else
419      res + geom_line(aes(col = Model))
420    if(!is.null(values)) {
421      ref_values <- ddply(data$data, .(Model), get_ref_point, v = values)
422      ref_values <- ref_values[!is.na(ref_values$CumTestedPct),]
423      if(nrow(ref_values) > 0) {
424        if(nmod > 1) {
425          res <- res +
426            geom_segment(data = ref_values,
427                         aes(x = CumTestedPct, y = CumEventPct,
428                             xend = CumTestedPct, yend = 0,
429                             color = Model))+
430            geom_segment(data = ref_values,
431                         aes(x = CumTestedPct, y = CumEventPct,
432                             xend = 0, yend = CumEventPct,
433                             color = Model))
434        } else {
435          res <- res +
436            geom_segment(data = ref_values,
437                         aes(x = CumTestedPct, y = CumEventPct,
438                             xend = CumTestedPct, yend = 0)) +
439            geom_segment(data = ref_values,
440                         aes(x = CumTestedPct, y = CumEventPct,
441                             xend = 0, yend = CumEventPct))
442        }
443      }
444    }
445  } else {
446    data$data <- data$data[!is.na(data$data$lift),]
447    res <- ggplot(data$data, aes(x = cuts, y = lift)) +
448      xlab("Cut-Off") + ylab("Lift")
449    res <- if(nmod == 1)
450      res + geom_line()
451    else
452      res + geom_line(aes(col = Model))
453  }
454  res
455}
456
457
458get_ref_point <- function(dat, v, window = 5) {
459  x <- dat$CumTestedPct
460  y <- dat$CumEventPct
461  erx <- extendrange(x)
462  ery <- extendrange(y)
463
464  res <- data.frame(CumEventPct = v,
465                    CumTestedPct = NA)
466
467  for(i in seq(along = v)) {
468    nearest <- which.min((y - v[i])^2)
469    index <- max(1, nearest - window):min(length(y), nearest + window)
470    res$CumTestedPct[i] <-
471      if (length(index) > 2)
472        approx(y[index], x[index], xout = v[i])$y
473    else
474      NA
475  }
476  res
477}
478
479