#' Plot feature importance as a bar graph
#'
#' Represents previously calculated feature importance as a bar graph.
#' \code{xgb.plot.importance} uses base R graphics, while \code{xgb.ggplot.importance} uses the ggplot backend.
#'
#' @param importance_matrix a \code{data.table} returned by \code{\link{xgb.importance}}.
#'        It is not modified by this function.
#' @param top_n maximal number of top features to include into the plot.
#' @param measure the name of importance measure to plot.
#'        When \code{NULL}, 'Gain' would be used for trees and 'Weight' would be used for gblinear.
#' @param rel_to_first whether importance values should be represented as relative to the highest ranked feature.
#'        See Details.
#' @param left_margin (base R barplot) allows to adjust the left margin size to fit feature names.
#'        When it is NULL, the existing \code{par('mar')} is used.
#' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}.
#'        When it is NULL, \code{2.5 / log2(1 + number of plotted features)} is used.
#' @param plot (base R barplot) whether a barplot should be produced.
#'        If FALSE, only a data.table is returned.
#' @param n_clusters (ggplot only) a \code{numeric} vector containing the min and the max range
#'        of the possible number of clusters of bars.
#' @param ... other parameters passed to \code{barplot} (except horiz, border, cex.names, names.arg, and las).
#'
#' @details
#' The graph represents each feature as a horizontal bar of length proportional to the importance of a feature.
#' Features are shown ranked in a decreasing importance order.
#' It works for importances from both \code{gblinear} and \code{gbtree} models.
#'
#' If \code{importance_matrix} contains several rows for the same feature, all of its
#' non-\code{Feature} columns are summed up per feature before plotting, so that every
#' feature is represented by exactly one bar.
#'
#' When \code{rel_to_first = FALSE}, the values would be plotted as they were in \code{importance_matrix}.
#' For gbtree model, that would mean being normalized to the total of 1
#' ("what is feature's importance contribution relative to the whole model?").
#' For linear models, \code{rel_to_first = FALSE} would show actual values of the coefficients.
#' Setting \code{rel_to_first = TRUE} allows to see the picture from the perspective of
#' "what is feature's importance contribution relative to the most important feature?"
#'
#' The ggplot-backend method also performs 1-D clustering of the importance values,
#' with bar colors corresponding to different clusters that have somewhat similar importance values.
#'
#' @return
#' The \code{xgb.plot.importance} function creates a \code{barplot} (when \code{plot=TRUE})
#' and silently returns a processed data.table with \code{top_n} features sorted by importance.
#' The plotted values are stored in its \code{Importance} column.
#'
#' The \code{xgb.ggplot.importance} function returns a ggplot graph which could be customized afterwards.
#' E.g., to change the title of the graph, add \code{+ ggtitle("A GRAPH NAME")} to the result.
#'
#' @seealso
#' \code{\link[graphics]{barplot}}.
#'
#' @examples
#' data(agaricus.train)
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#'
#' importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)
#'
#' xgb.plot.importance(importance_matrix, rel_to_first = TRUE, xlab = "Relative importance")
#'
#' (gg <- xgb.ggplot.importance(importance_matrix, measure = "Frequency", rel_to_first = TRUE))
#' gg + ggplot2::ylab("Frequency")
#'
#' @rdname xgb.plot.importance
#' @export
xgb.plot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL,
                                rel_to_first = FALSE, left_margin = 10, cex = NULL, plot = TRUE, ...) {
  # NOTE(review): check.deprecation() is defined elsewhere in the package;
  # presumably it reports deprecated argument names found in `...` - confirm.
  check.deprecation(...)
  if (!is.data.table(importance_matrix)) {
    stop("importance_matrix: must be a data.table")
  }

  # Pick the default measure from the available columns, or validate the requested one.
  imp_names <- colnames(importance_matrix)
  if (is.null(measure)) {
    if (all(c("Feature", "Gain") %in% imp_names)) {
      measure <- "Gain"
    } else if (all(c("Feature", "Weight") %in% imp_names)) {
      measure <- "Weight"
    } else {
      stop("Importance matrix column names are not as expected!")
    }
  } else {
    if (!measure %in% imp_names) {
      stop("Invalid `measure`")
    }
    if (!"Feature" %in% imp_names) {
      stop("Importance matrix column names are not as expected!")
    }
  }

  # Aggregate, in case the values were not yet summed up by feature.
  # This builds a NEW table with one row per feature: using `:=` directly on the
  # input would neither collapse duplicated features nor leave the caller's
  # data.table untouched (`:=` assigns by reference).
  importance_matrix <- importance_matrix[, lapply(.SD, sum), by = Feature,
                                         .SDcols = setdiff(imp_names, "Feature")]
  # Safe to assign by reference now: the aggregated table is local to this call.
  importance_matrix[, Importance := get(measure)]

  # make sure it's ordered (by magnitude, since linear coefficients may be negative)
  importance_matrix <- importance_matrix[order(-abs(Importance))]

  if (!is.null(top_n)) {
    top_n <- min(top_n, nrow(importance_matrix))
    importance_matrix <- head(importance_matrix, top_n)
  }
  if (rel_to_first) {
    importance_matrix[, Importance := Importance / max(abs(Importance))]
  }
  if (is.null(cex)) {
    # shrink the feature labels as the number of bars grows
    cex <- 2.5 / log2(1 + nrow(importance_matrix))
  }

  if (plot) {
    original_mar <- par("mar")

    # reset margins so this function doesn't have side effects
    on.exit(par(mar = original_mar), add = TRUE)

    mar <- original_mar
    if (!is.null(left_margin)) {
      mar[2] <- left_margin
    }
    par(mar = mar)

    # reverse the order of rows to have the highest ranked at the top
    importance_matrix[rev(seq_len(nrow(importance_matrix))),
                      barplot(Importance, horiz = TRUE, border = NA, cex.names = cex,
                              names.arg = Feature, las = 1, ...)]
  }

  invisible(importance_matrix)
}

# Avoid NOTEs about undefined global variables during CRAN check.
# These names are never declared as variables: they are column names
# (and the `.SD` special symbol) resolved by data.table inside `[`.
globalVariables(c("Feature", "Importance", ".SD"))