1#' A box and whiskers plot (in the style of Tukey)
2#'
3#' The boxplot compactly displays the distribution of a continuous variable.
4#' It visualises five summary statistics (the median, two hinges
5#' and two whiskers), and all "outlying" points individually.
6#'
7#' @eval rd_orientation()
8#'
9#' @section Summary statistics:
10#' The lower and upper hinges correspond to the first and third quartiles
11#' (the 25th and 75th percentiles). This differs slightly from the method used
12#' by the [boxplot()] function, and may be apparent with small samples.
13#' See [boxplot.stats()] for for more information on how hinge
14#' positions are calculated for [boxplot()].
15#'
16#' The upper whisker extends from the hinge to the largest value no further than
17#' 1.5 * IQR from the hinge (where IQR is the inter-quartile range, or distance
18#' between the first and third quartiles). The lower whisker extends from the
19#' hinge to the smallest value at most 1.5 * IQR of the hinge. Data beyond the
20#' end of the whiskers are called "outlying" points and are plotted
21#' individually.
22#'
23#' In a notched box plot, the notches extend `1.58 * IQR / sqrt(n)`.
24#' This gives a roughly 95% confidence interval for comparing medians.
25#' See McGill et al. (1978) for more details.
26#'
27#' @eval rd_aesthetics("geom", "boxplot")
28#'
29#' @seealso [geom_quantile()] for continuous `x`,
30#'   [geom_violin()] for a richer display of the distribution, and
31#'   [geom_jitter()] for a useful technique for small data.
32#' @inheritParams layer
33#' @inheritParams geom_bar
34#' @param geom,stat Use to override the default connection between
35#'   `geom_boxplot()` and `stat_boxplot()`.
36#' @param outlier.colour,outlier.color,outlier.fill,outlier.shape,outlier.size,outlier.stroke,outlier.alpha
37#'   Default aesthetics for outliers. Set to `NULL` to inherit from the
38#'   aesthetics used for the box.
39#'
40#'   In the unlikely event you specify both US and UK spellings of colour, the
41#'   US spelling will take precedence.
42#'
43#'   Sometimes it can be useful to hide the outliers, for example when overlaying
44#'   the raw data points on top of the boxplot. Hiding the outliers can be achieved
45#'   by setting `outlier.shape = NA`. Importantly, this does not remove the outliers,
46#'   it only hides them, so the range calculated for the y-axis will be the
47#'   same with outliers shown and outliers hidden.
48#'
49#' @param notch If `FALSE` (default) make a standard box plot. If
50#'   `TRUE`, make a notched box plot. Notches are used to compare groups;
51#'   if the notches of two boxes do not overlap, this suggests that the medians
52#'   are significantly different.
53#' @param notchwidth For a notched box plot, width of the notch relative to
54#'   the body (defaults to `notchwidth = 0.5`).
55#' @param varwidth If `FALSE` (default) make a standard box plot. If
56#'   `TRUE`, boxes are drawn with widths proportional to the
57#'   square-roots of the number of observations in the groups (possibly
58#'   weighted, using the `weight` aesthetic).
59#' @export
60#' @references McGill, R., Tukey, J. W. and Larsen, W. A. (1978) Variations of
61#'     box plots. The American Statistician 32, 12-16.
62#' @examples
63#' p <- ggplot(mpg, aes(class, hwy))
64#' p + geom_boxplot()
65#' # Orientation follows the discrete axis
66#' ggplot(mpg, aes(hwy, class)) + geom_boxplot()
67#'
68#' p + geom_boxplot(notch = TRUE)
69#' p + geom_boxplot(varwidth = TRUE)
70#' p + geom_boxplot(fill = "white", colour = "#3366FF")
71#' # By default, outlier points match the colour of the box. Use
72#' # outlier.colour to override
73#' p + geom_boxplot(outlier.colour = "red", outlier.shape = 1)
74#' # Remove outliers when overlaying boxplot with original data points
75#' p + geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2)
76#'
77#' # Boxplots are automatically dodged when any aesthetic is a factor
78#' p + geom_boxplot(aes(colour = drv))
79#'
80#' # You can also use boxplots with continuous x, as long as you supply
81#' # a grouping variable. cut_width is particularly useful
82#' ggplot(diamonds, aes(carat, price)) +
83#'   geom_boxplot()
84#' ggplot(diamonds, aes(carat, price)) +
85#'   geom_boxplot(aes(group = cut_width(carat, 0.25)))
86#' # Adjust the transparency of outliers using outlier.alpha
87#' ggplot(diamonds, aes(carat, price)) +
88#'   geom_boxplot(aes(group = cut_width(carat, 0.25)), outlier.alpha = 0.1)
89#'
90#' \donttest{
91#' # It's possible to draw a boxplot with your own computations if you
92#' # use stat = "identity":
93#' y <- rnorm(100)
94#' df <- data.frame(
95#'   x = 1,
96#'   y0 = min(y),
97#'   y25 = quantile(y, 0.25),
98#'   y50 = median(y),
99#'   y75 = quantile(y, 0.75),
100#'   y100 = max(y)
101#' )
102#' ggplot(df, aes(x)) +
103#'   geom_boxplot(
104#'    aes(ymin = y0, lower = y25, middle = y50, upper = y75, ymax = y100),
105#'    stat = "identity"
106#'  )
107#' }
108geom_boxplot <- function(mapping = NULL, data = NULL,
109                         stat = "boxplot", position = "dodge2",
110                         ...,
111                         outlier.colour = NULL,
112                         outlier.color = NULL,
113                         outlier.fill = NULL,
114                         outlier.shape = 19,
115                         outlier.size = 1.5,
116                         outlier.stroke = 0.5,
117                         outlier.alpha = NULL,
118                         notch = FALSE,
119                         notchwidth = 0.5,
120                         varwidth = FALSE,
121                         na.rm = FALSE,
122                         orientation = NA,
123                         show.legend = NA,
124                         inherit.aes = TRUE) {
125
126  # varwidth = TRUE is not compatible with preserve = "total"
127  if (is.character(position)) {
128    if (varwidth == TRUE) position <- position_dodge2(preserve = "single")
129  } else {
130    if (identical(position$preserve, "total") & varwidth == TRUE) {
131      warn("Can't preserve total widths when varwidth = TRUE.")
132      position$preserve <- "single"
133    }
134  }
135
136  layer(
137    data = data,
138    mapping = mapping,
139    stat = stat,
140    geom = GeomBoxplot,
141    position = position,
142    show.legend = show.legend,
143    inherit.aes = inherit.aes,
144    params = list(
145      outlier.colour = outlier.color %||% outlier.colour,
146      outlier.fill = outlier.fill,
147      outlier.shape = outlier.shape,
148      outlier.size = outlier.size,
149      outlier.stroke = outlier.stroke,
150      outlier.alpha = outlier.alpha,
151      notch = notch,
152      notchwidth = notchwidth,
153      varwidth = varwidth,
154      na.rm = na.rm,
155      orientation = orientation,
156      ...
157    )
158  )
159}
160
161#' @rdname ggplot2-ggproto
162#' @format NULL
163#' @usage NULL
164#' @export
165GeomBoxplot <- ggproto("GeomBoxplot", Geom,
166
167  # need to declare `width` here in case this geom is used with a stat that
168  # doesn't have a `width` parameter (e.g., `stat_identity`).
169  extra_params = c("na.rm", "width", "orientation"),
170
171  setup_params = function(data, params) {
172    params$flipped_aes <- has_flipped_aes(data, params)
173    params
174  },
175
176  setup_data = function(data, params) {
177    data$flipped_aes <- params$flipped_aes
178    data <- flip_data(data, params$flipped_aes)
179    data$width <- data$width %||%
180      params$width %||% (resolution(data$x, FALSE) * 0.9)
181
182    if (!is.null(data$outliers)) {
183      suppressWarnings({
184        out_min <- vapply(data$outliers, min, numeric(1))
185        out_max <- vapply(data$outliers, max, numeric(1))
186      })
187
188      data$ymin_final  <- pmin(out_min, data$ymin)
189      data$ymax_final  <- pmax(out_max, data$ymax)
190    }
191
192    # if `varwidth` not requested or not available, don't use it
193    if (is.null(params) || is.null(params$varwidth) || !params$varwidth || is.null(data$relvarwidth)) {
194      data$xmin <- data$x - data$width / 2
195      data$xmax <- data$x + data$width / 2
196    } else {
197      # make `relvarwidth` relative to the size of the largest group
198      data$relvarwidth <- data$relvarwidth / max(data$relvarwidth)
199      data$xmin <- data$x - data$relvarwidth * data$width / 2
200      data$xmax <- data$x + data$relvarwidth * data$width / 2
201    }
202    data$width <- NULL
203    if (!is.null(data$relvarwidth)) data$relvarwidth <- NULL
204
205    flip_data(data, params$flipped_aes)
206  },
207
208  draw_group = function(data, panel_params, coord, fatten = 2,
209                        outlier.colour = NULL, outlier.fill = NULL,
210                        outlier.shape = 19,
211                        outlier.size = 1.5, outlier.stroke = 0.5,
212                        outlier.alpha = NULL,
213                        notch = FALSE, notchwidth = 0.5, varwidth = FALSE, flipped_aes = FALSE) {
214    data <- flip_data(data, flipped_aes)
215    # this may occur when using geom_boxplot(stat = "identity")
216    if (nrow(data) != 1) {
217      abort("Can't draw more than one boxplot per group. Did you forget aes(group = ...)?")
218    }
219
220    common <- list(
221      colour = data$colour,
222      size = data$size,
223      linetype = data$linetype,
224      fill = alpha(data$fill, data$alpha),
225      group = data$group
226    )
227
228    whiskers <- new_data_frame(c(
229      list(
230        x = c(data$x, data$x),
231        xend = c(data$x, data$x),
232        y = c(data$upper, data$lower),
233        yend = c(data$ymax, data$ymin),
234        alpha = c(NA_real_, NA_real_)
235      ),
236      common
237    ), n = 2)
238    whiskers <- flip_data(whiskers, flipped_aes)
239
240    box <- new_data_frame(c(
241      list(
242        xmin = data$xmin,
243        xmax = data$xmax,
244        ymin = data$lower,
245        y = data$middle,
246        ymax = data$upper,
247        ynotchlower = ifelse(notch, data$notchlower, NA),
248        ynotchupper = ifelse(notch, data$notchupper, NA),
249        notchwidth = notchwidth,
250        alpha = data$alpha
251      ),
252      common
253    ))
254    box <- flip_data(box, flipped_aes)
255
256    if (!is.null(data$outliers) && length(data$outliers[[1]] >= 1)) {
257      outliers <- new_data_frame(list(
258        y = data$outliers[[1]],
259        x = data$x[1],
260        colour = outlier.colour %||% data$colour[1],
261        fill = outlier.fill %||% data$fill[1],
262        shape = outlier.shape %||% data$shape[1],
263        size = outlier.size %||% data$size[1],
264        stroke = outlier.stroke %||% data$stroke[1],
265        fill = NA,
266        alpha = outlier.alpha %||% data$alpha[1]
267      ), n = length(data$outliers[[1]]))
268      outliers <- flip_data(outliers, flipped_aes)
269
270      outliers_grob <- GeomPoint$draw_panel(outliers, panel_params, coord)
271    } else {
272      outliers_grob <- NULL
273    }
274
275    ggname("geom_boxplot", grobTree(
276      outliers_grob,
277      GeomSegment$draw_panel(whiskers, panel_params, coord),
278      GeomCrossbar$draw_panel(box, fatten = fatten, panel_params, coord, flipped_aes = flipped_aes)
279    ))
280  },
281
282  draw_key = draw_key_boxplot,
283
284  default_aes = aes(weight = 1, colour = "grey20", fill = "white", size = 0.5,
285    alpha = NA, shape = 19, linetype = "solid"),
286
287  required_aes = c("x|y", "lower|xlower", "upper|xupper", "middle|xmiddle", "ymin|xmin", "ymax|xmax")
288)
289