1#' Summarise y values at unique/binned x 2#' 3#' `stat_summary()` operates on unique `x` or `y`; `stat_summary_bin()` 4#' operates on binned `x` or `y`. They are more flexible versions of 5#' [stat_bin()]: instead of just counting, they can compute any 6#' aggregate. 7#' 8#' @eval rd_orientation() 9#' 10#' @eval rd_aesthetics("stat", "summary") 11#' @seealso [geom_errorbar()], [geom_pointrange()], 12#' [geom_linerange()], [geom_crossbar()] for geoms to 13#' display summarised data 14#' @inheritParams stat_identity 15#' @section Summary functions: 16#' You can either supply summary functions individually (`fun`, 17#' `fun.max`, `fun.min`), or as a single function (`fun.data`): 18#' 19#' \describe{ 20#' \item{fun.data}{Complete summary function. Should take numeric vector as 21#' input and return data frame as output} 22#' \item{fun.min}{min summary function (should take numeric vector and 23#' return single number)} 24#' \item{fun}{main summary function (should take numeric vector and return 25#' single number)} 26#' \item{fun.max}{max summary function (should take numeric vector and 27#' return single number)} 28#' } 29#' 30#' A simple vector function is easiest to work with as you can return a single 31#' number, but is somewhat less flexible. If your summary function computes 32#' multiple values at once (e.g. min and max), use `fun.data`. 33#' 34#' `fun.data` will receive data as if it was oriented along the x-axis and 35#' should return a data.frame that corresponds to that orientation. The layer 36#' will take care of flipping the input and output if it is oriented along the 37#' y-axis. 38#' 39#' If no aggregation functions are supplied, will default to 40#' [mean_se()]. 41#' 42#' @param fun.data A function that is given the complete data and should 43#' return a data frame with variables `ymin`, `y`, and `ymax`. 44#' @param fun.min,fun,fun.max Alternatively, supply three individual 45#' functions that are each passed a vector of values and should return a 46#' single number. 47#' @param fun.ymin,fun.y,fun.ymax Deprecated, use the versions specified above 48#' instead. 49#' @param fun.args Optional additional arguments passed on to the functions. 50#' @export 51#' @examples 52#' d <- ggplot(mtcars, aes(cyl, mpg)) + geom_point() 53#' d + stat_summary(fun.data = "mean_cl_boot", colour = "red", size = 2) 54#' 55#' # Orientation follows the discrete axis 56#' ggplot(mtcars, aes(mpg, factor(cyl))) + 57#' geom_point() + 58#' stat_summary(fun.data = "mean_cl_boot", colour = "red", size = 2) 59#' 60#' # You can supply individual functions to summarise the value at 61#' # each x: 62#' d + stat_summary(fun = "median", colour = "red", size = 2, geom = "point") 63#' d + stat_summary(fun = "mean", colour = "red", size = 2, geom = "point") 64#' d + aes(colour = factor(vs)) + stat_summary(fun = mean, geom="line") 65#' 66#' d + stat_summary(fun = mean, fun.min = min, fun.max = max, colour = "red") 67#' 68#' d <- ggplot(diamonds, aes(cut)) 69#' d + geom_bar() 70#' d + stat_summary(aes(y = price), fun = "mean", geom = "bar") 71#' 72#' # Orientation of stat_summary_bin is ambiguous and must be specified directly 73#' ggplot(diamonds, aes(carat, price)) + 74#' stat_summary_bin(fun = "mean", geom = "bar", orientation = 'y') 75#' 76#' \donttest{ 77#' # Don't use ylim to zoom into a summary plot - this throws the 78#' # data away 79#' p <- ggplot(mtcars, aes(cyl, mpg)) + 80#' stat_summary(fun = "mean", geom = "point") 81#' p 82#' p + ylim(15, 30) 83#' # Instead use coord_cartesian 84#' p + coord_cartesian(ylim = c(15, 30)) 85#' 86#' # A set of useful summary functions is provided from the Hmisc package: 87#' stat_sum_df <- function(fun, geom="crossbar", ...) { 88#' stat_summary(fun.data = fun, colour = "red", geom = geom, width = 0.2, ...) 89#' } 90#' d <- ggplot(mtcars, aes(cyl, mpg)) + geom_point() 91#' # The crossbar geom needs grouping to be specified when used with 92#' # a continuous x axis. 93#' d + stat_sum_df("mean_cl_boot", mapping = aes(group = cyl)) 94#' d + stat_sum_df("mean_sdl", mapping = aes(group = cyl)) 95#' d + stat_sum_df("mean_sdl", fun.args = list(mult = 1), mapping = aes(group = cyl)) 96#' d + stat_sum_df("median_hilow", mapping = aes(group = cyl)) 97#' 98#' # An example with highly skewed distributions: 99#' if (require("ggplot2movies")) { 100#' set.seed(596) 101#' mov <- movies[sample(nrow(movies), 1000), ] 102#' m2 <- 103#' ggplot(mov, aes(x = factor(round(rating)), y = votes)) + 104#' geom_point() 105#' m2 <- 106#' m2 + 107#' stat_summary( 108#' fun.data = "mean_cl_boot", 109#' geom = "crossbar", 110#' colour = "red", width = 0.3 111#' ) + 112#' xlab("rating") 113#' m2 114#' # Notice how the overplotting skews off visual perception of the mean 115#' # supplementing the raw data with summary statistics is _very_ important 116#' 117#' # Next, we'll look at votes on a log scale. 118#' 119#' # Transforming the scale means the data are transformed 120#' # first, after which statistics are computed: 121#' m2 + scale_y_log10() 122#' # Transforming the coordinate system occurs after the 123#' # statistic has been computed. This means we're calculating the summary on the raw data 124#' # and stretching the geoms onto the log scale. Compare the widths of the 125#' # standard errors. 126#' m2 + coord_trans(y="log10") 127#' } 128#' } 129stat_summary <- function(mapping = NULL, data = NULL, 130 geom = "pointrange", position = "identity", 131 ..., 132 fun.data = NULL, 133 fun = NULL, 134 fun.max = NULL, 135 fun.min = NULL, 136 fun.args = list(), 137 na.rm = FALSE, 138 orientation = NA, 139 show.legend = NA, 140 inherit.aes = TRUE, 141 fun.y, fun.ymin, fun.ymax) { 142 if (!missing(fun.y)) { 143 warn("`fun.y` is deprecated. Use `fun` instead.") 144 fun = fun %||% fun.y 145 } 146 if (!missing(fun.ymin)) { 147 warn("`fun.ymin` is deprecated. Use `fun.min` instead.") 148 fun.min = fun.min %||% fun.ymin 149 } 150 if (!missing(fun.ymax)) { 151 warn("`fun.ymax` is deprecated. Use `fun.max` instead.") 152 fun.max = fun.max %||% fun.ymax 153 } 154 layer( 155 data = data, 156 mapping = mapping, 157 stat = StatSummary, 158 geom = geom, 159 position = position, 160 show.legend = show.legend, 161 inherit.aes = inherit.aes, 162 params = list( 163 fun.data = fun.data, 164 fun = fun, 165 fun.max = fun.max, 166 fun.min = fun.min, 167 fun.args = fun.args, 168 na.rm = na.rm, 169 orientation = orientation, 170 ... 171 ) 172 ) 173} 174 175#' @rdname ggplot2-ggproto 176#' @format NULL 177#' @usage NULL 178#' @export 179StatSummary <- ggproto("StatSummary", Stat, 180 required_aes = c("x", "y"), 181 182 extra_params = c("na.rm", "orientation"), 183 setup_params = function(data, params) { 184 params$flipped_aes <- has_flipped_aes(data, params) 185 params 186 }, 187 188 compute_panel = function(data, scales, fun.data = NULL, fun = NULL, 189 fun.max = NULL, fun.min = NULL, fun.args = list(), 190 na.rm = FALSE, flipped_aes = FALSE) { 191 data <- flip_data(data, flipped_aes) 192 fun <- make_summary_fun(fun.data, fun, fun.max, fun.min, fun.args) 193 summarised <- summarise_by_x(data, fun) 194 summarised$flipped_aes <- flipped_aes 195 flip_data(summarised, flipped_aes) 196 } 197) 198 199# Summarise a data.frame by parts 200# Summarise a data frame by unique value of x 201# 202# This function is used by [stat_summary()] to break a 203# data.frame into pieces, summarise each piece, and join the pieces 204# back together, retaining original columns unaffected by the summary. 205# 206# @param [data.frame()] to summarise 207# @param vector to summarise by 208# @param summary function (must take and return a data.frame) 209# @param other arguments passed on to summary function 210# @keyword internal 211summarise_by_x <- function(data, summary, ...) { 212 summary <- dapply(data, c("group", "x"), summary, ...) 213 unique <- dapply(data, c("group", "x"), uniquecols) 214 unique$y <- NULL 215 216 merge(summary, unique, by = c("x", "group"), sort = FALSE) 217} 218 219#' A selection of summary functions from Hmisc 220#' 221#' @description 222#' These are wrappers around functions from \pkg{Hmisc} designed to make them 223#' easier to use with [stat_summary()]. See the Hmisc documentation 224#' for more details: 225#' 226#' - [Hmisc::smean.cl.boot()] 227#' - [Hmisc::smean.cl.normal()] 228#' - [Hmisc::smean.sdl()] 229#' - [Hmisc::smedian.hilow()] 230#' 231#' @param x a numeric vector 232#' @param ... other arguments passed on to the respective Hmisc function. 233#' @return A data frame with columns `y`, `ymin`, and `ymax`. 234#' @name hmisc 235#' @examples 236#' if (requireNamespace("Hmisc", quietly = TRUE)) { 237#' x <- rnorm(100) 238#' mean_cl_boot(x) 239#' mean_cl_normal(x) 240#' mean_sdl(x) 241#' median_hilow(x) 242#' } 243NULL 244 245wrap_hmisc <- function(fun) { 246 247 function(x, ...) { 248 check_installed("Hmisc") 249 250 fun <- getExportedValue("Hmisc", fun) 251 result <- do.call(fun, list(x = quote(x), ...)) 252 253 rename( 254 new_data_frame(as.list(result)), 255 c(Median = "y", Mean = "y", Lower = "ymin", Upper = "ymax") 256 ) 257 } 258} 259#' @export 260#' @rdname hmisc 261mean_cl_boot <- wrap_hmisc("smean.cl.boot") 262#' @export 263#' @rdname hmisc 264mean_cl_normal <- wrap_hmisc("smean.cl.normal") 265#' @export 266#' @rdname hmisc 267mean_sdl <- wrap_hmisc("smean.sdl") 268#' @export 269#' @rdname hmisc 270median_hilow <- wrap_hmisc("smedian.hilow") 271 272#' Calculate mean and standard error of the mean 273#' 274#' For use with [stat_summary()] 275#' 276#' @param x numeric vector. 277#' @param mult number of multiples of standard error. 278#' @return A data frame with three columns: 279#' \describe{ 280#' \item{`y`}{ The mean.} 281#' \item{`ymin`}{ The mean minus the multiples of the standard error.} 282#' \item{`ymax`}{ The mean plus the multiples of the standard error.} 283#' } 284#' @export 285#' @examples 286#' x <- rnorm(100) 287#' mean_se(x) 288mean_se <- function(x, mult = 1) { 289 x <- stats::na.omit(x) 290 se <- mult * sqrt(stats::var(x) / length(x)) 291 mean <- mean(x) 292 new_data_frame(list(y = mean, ymin = mean - se, ymax = mean + se), n = 1) 293} 294