1#' Summarise y values at unique/binned x
2#'
3#' `stat_summary()` operates on unique `x` or `y`; `stat_summary_bin()`
4#' operates on binned `x` or `y`. They are more flexible versions of
5#' [stat_bin()]: instead of just counting, they can compute any
6#' aggregate.
7#'
8#' @eval rd_orientation()
9#'
10#' @eval rd_aesthetics("stat", "summary")
11#' @seealso [geom_errorbar()], [geom_pointrange()],
12#'  [geom_linerange()], [geom_crossbar()] for geoms to
13#'  display summarised data
14#' @inheritParams stat_identity
15#' @section Summary functions:
16#' You can either supply summary functions individually (`fun`,
17#' `fun.max`, `fun.min`), or as a single function (`fun.data`):
18#'
19#' \describe{
20#'   \item{fun.data}{Complete summary function. Should take numeric vector as
21#'      input and return data frame as output}
22#'   \item{fun.min}{min summary function (should take numeric vector and
23#'     return single number)}
24#'   \item{fun}{main summary function (should take numeric vector and return
25#'     single number)}
26#'   \item{fun.max}{max summary function (should take numeric vector and
27#'     return single number)}
28#' }
29#'
30#' A simple vector function is easiest to work with as you can return a single
31#' number, but is somewhat less flexible. If your summary function computes
32#' multiple values at once (e.g. min and max), use `fun.data`.
33#'
34#' `fun.data` will receive data as if it was oriented along the x-axis and
35#' should return a data.frame that corresponds to that orientation. The layer
36#' will take care of flipping the input and output if it is oriented along the
37#' y-axis.
38#'
39#' If no aggregation functions are supplied, will default to
40#' [mean_se()].
41#'
42#' @param fun.data A function that is given the complete data and should
43#'   return a data frame with variables `ymin`, `y`, and `ymax`.
44#' @param fun.min,fun,fun.max Alternatively, supply three individual
45#'   functions that are each passed a vector of values and should return a
46#'   single number.
47#' @param fun.ymin,fun.y,fun.ymax Deprecated, use the versions specified above
48#'   instead.
49#' @param fun.args Optional additional arguments passed on to the functions.
50#' @export
51#' @examples
52#' d <- ggplot(mtcars, aes(cyl, mpg)) + geom_point()
53#' d + stat_summary(fun.data = "mean_cl_boot", colour = "red", size = 2)
54#'
55#' # Orientation follows the discrete axis
56#' ggplot(mtcars, aes(mpg, factor(cyl))) +
57#'   geom_point() +
58#'   stat_summary(fun.data = "mean_cl_boot", colour = "red", size = 2)
59#'
60#' # You can supply individual functions to summarise the value at
61#' # each x:
62#' d + stat_summary(fun = "median", colour = "red", size = 2, geom = "point")
63#' d + stat_summary(fun = "mean", colour = "red", size = 2, geom = "point")
64#' d + aes(colour = factor(vs)) + stat_summary(fun = mean, geom="line")
65#'
66#' d + stat_summary(fun = mean, fun.min = min, fun.max = max, colour = "red")
67#'
68#' d <- ggplot(diamonds, aes(cut))
69#' d + geom_bar()
70#' d + stat_summary(aes(y = price), fun = "mean", geom = "bar")
71#'
72#' # Orientation of stat_summary_bin is ambiguous and must be specified directly
73#' ggplot(diamonds, aes(carat, price)) +
74#'   stat_summary_bin(fun = "mean", geom = "bar", orientation = 'y')
75#'
76#' \donttest{
77#' # Don't use ylim to zoom into a summary plot - this throws the
78#' # data away
79#' p <- ggplot(mtcars, aes(cyl, mpg)) +
80#'   stat_summary(fun = "mean", geom = "point")
81#' p
82#' p + ylim(15, 30)
83#' # Instead use coord_cartesian
84#' p + coord_cartesian(ylim = c(15, 30))
85#'
86#' # A set of useful summary functions is provided from the Hmisc package:
87#' stat_sum_df <- function(fun, geom="crossbar", ...) {
88#'   stat_summary(fun.data = fun, colour = "red", geom = geom, width = 0.2, ...)
89#' }
90#' d <- ggplot(mtcars, aes(cyl, mpg)) + geom_point()
91#' # The crossbar geom needs grouping to be specified when used with
92#' # a continuous x axis.
93#' d + stat_sum_df("mean_cl_boot", mapping = aes(group = cyl))
94#' d + stat_sum_df("mean_sdl", mapping = aes(group = cyl))
95#' d + stat_sum_df("mean_sdl", fun.args = list(mult = 1), mapping = aes(group = cyl))
96#' d + stat_sum_df("median_hilow", mapping = aes(group = cyl))
97#'
98#' # An example with highly skewed distributions:
99#' if (require("ggplot2movies")) {
100#' set.seed(596)
101#' mov <- movies[sample(nrow(movies), 1000), ]
102#'  m2 <-
103#'    ggplot(mov, aes(x = factor(round(rating)), y = votes)) +
104#'    geom_point()
105#'  m2 <-
106#'    m2 +
107#'    stat_summary(
108#'      fun.data = "mean_cl_boot",
109#'      geom = "crossbar",
110#'      colour = "red", width = 0.3
111#'    ) +
112#'    xlab("rating")
113#' m2
114#' # Notice how the overplotting skews off visual perception of the mean
115#' # supplementing the raw data with summary statistics is _very_ important
116#'
117#' # Next, we'll look at votes on a log scale.
118#'
119#' # Transforming the scale means the data are transformed
120#' # first, after which statistics are computed:
121#' m2 + scale_y_log10()
122#' # Transforming the coordinate system occurs after the
123#' # statistic has been computed. This means we're calculating the summary on the raw data
124#' # and stretching the geoms onto the log scale.  Compare the widths of the
125#' # standard errors.
126#' m2 + coord_trans(y="log10")
127#' }
128#' }
129stat_summary <- function(mapping = NULL, data = NULL,
130                         geom = "pointrange", position = "identity",
131                         ...,
132                         fun.data = NULL,
133                         fun = NULL,
134                         fun.max = NULL,
135                         fun.min = NULL,
136                         fun.args = list(),
137                         na.rm = FALSE,
138                         orientation = NA,
139                         show.legend = NA,
140                         inherit.aes = TRUE,
141                         fun.y, fun.ymin, fun.ymax) {
142  if (!missing(fun.y)) {
143    warn("`fun.y` is deprecated. Use `fun` instead.")
144    fun = fun %||% fun.y
145  }
146  if (!missing(fun.ymin)) {
147    warn("`fun.ymin` is deprecated. Use `fun.min` instead.")
148    fun.min = fun.min %||% fun.ymin
149  }
150  if (!missing(fun.ymax)) {
151    warn("`fun.ymax` is deprecated. Use `fun.max` instead.")
152    fun.max = fun.max %||% fun.ymax
153  }
154  layer(
155    data = data,
156    mapping = mapping,
157    stat = StatSummary,
158    geom = geom,
159    position = position,
160    show.legend = show.legend,
161    inherit.aes = inherit.aes,
162    params = list(
163      fun.data = fun.data,
164      fun = fun,
165      fun.max = fun.max,
166      fun.min = fun.min,
167      fun.args = fun.args,
168      na.rm = na.rm,
169      orientation = orientation,
170      ...
171    )
172  )
173}
174
175#' @rdname ggplot2-ggproto
176#' @format NULL
177#' @usage NULL
178#' @export
179StatSummary <- ggproto("StatSummary", Stat,
180  required_aes = c("x", "y"),
181
182  extra_params = c("na.rm", "orientation"),
183  setup_params = function(data, params) {
184    params$flipped_aes <- has_flipped_aes(data, params)
185    params
186  },
187
188  compute_panel = function(data, scales, fun.data = NULL, fun = NULL,
189                     fun.max = NULL, fun.min = NULL, fun.args = list(),
190                     na.rm = FALSE, flipped_aes = FALSE) {
191    data <- flip_data(data, flipped_aes)
192    fun <- make_summary_fun(fun.data, fun, fun.max, fun.min, fun.args)
193    summarised <- summarise_by_x(data, fun)
194    summarised$flipped_aes <- flipped_aes
195    flip_data(summarised, flipped_aes)
196  }
197)
198
199# Summarise a data.frame by parts
200# Summarise a data frame by unique value of x
201#
202# This function is used by [stat_summary()] to break a
203# data.frame into pieces, summarise each piece, and join the pieces
204# back together, retaining original columns unaffected by the summary.
205#
206# @param [data.frame()] to summarise
207# @param vector to summarise by
208# @param summary function (must take and return a data.frame)
209# @param other arguments passed on to summary function
210# @keyword internal
211summarise_by_x <- function(data, summary, ...) {
212  summary <- dapply(data, c("group", "x"), summary, ...)
213  unique <- dapply(data, c("group", "x"), uniquecols)
214  unique$y <- NULL
215
216  merge(summary, unique, by = c("x", "group"), sort = FALSE)
217}
218
219#' A selection of summary functions from Hmisc
220#'
221#' @description
222#' These are wrappers around functions from \pkg{Hmisc} designed to make them
223#' easier to use with [stat_summary()]. See the Hmisc documentation
224#' for more details:
225#'
226#'  - [Hmisc::smean.cl.boot()]
227#'  - [Hmisc::smean.cl.normal()]
228#'  - [Hmisc::smean.sdl()]
229#'  - [Hmisc::smedian.hilow()]
230#'
231#' @param x a numeric vector
232#' @param ... other arguments passed on to the respective Hmisc function.
233#' @return A data frame with columns `y`, `ymin`, and `ymax`.
234#' @name hmisc
235#' @examples
236#' if (requireNamespace("Hmisc", quietly = TRUE)) {
237#' x <- rnorm(100)
238#' mean_cl_boot(x)
239#' mean_cl_normal(x)
240#' mean_sdl(x)
241#' median_hilow(x)
242#' }
243NULL
244
245wrap_hmisc <- function(fun) {
246
247  function(x, ...) {
248    check_installed("Hmisc")
249
250    fun <- getExportedValue("Hmisc", fun)
251    result <- do.call(fun, list(x = quote(x), ...))
252
253    rename(
254      new_data_frame(as.list(result)),
255      c(Median = "y", Mean = "y", Lower = "ymin", Upper = "ymax")
256    )
257  }
258}
259#' @export
260#' @rdname hmisc
261mean_cl_boot <- wrap_hmisc("smean.cl.boot")
262#' @export
263#' @rdname hmisc
264mean_cl_normal <- wrap_hmisc("smean.cl.normal")
265#' @export
266#' @rdname hmisc
267mean_sdl <- wrap_hmisc("smean.sdl")
268#' @export
269#' @rdname hmisc
270median_hilow <- wrap_hmisc("smedian.hilow")
271
272#' Calculate mean and standard error of the mean
273#'
274#' For use with [stat_summary()]
275#'
276#' @param x numeric vector.
277#' @param mult number of multiples of standard error.
278#' @return A data frame with three columns:
279#' \describe{
280#'     \item{`y`}{ The mean.}
281#'     \item{`ymin`}{ The mean minus the multiples of the standard error.}
282#'     \item{`ymax`}{ The mean plus the multiples of the standard error.}
283#' }
284#' @export
285#' @examples
286#' x <- rnorm(100)
287#' mean_se(x)
288mean_se <- function(x, mult = 1) {
289  x <- stats::na.omit(x)
290  se <- mult * sqrt(stats::var(x) / length(x))
291  mean <- mean(x)
292  new_data_frame(list(y = mean, ymin = mean - se, ymax = mean + se), n = 1)
293}
294