1# 2# This file is for the low level reusable utility functions 3# that are not supposed to be visible to a user. 4# 5 6# 7# General helper utilities ---------------------------------------------------- 8# 9 10# SQL-style NVL shortcut. 11NVL <- function(x, val) { 12 if (is.null(x)) 13 return(val) 14 if (is.vector(x)) { 15 x[is.na(x)] <- val 16 return(x) 17 } 18 if (typeof(x) == 'closure') 19 return(x) 20 stop("typeof(x) == ", typeof(x), " is not supported by NVL") 21} 22 23# List of classification and ranking objectives 24.CLASSIFICATION_OBJECTIVES <- function() { 25 return(c('binary:logistic', 'binary:logitraw', 'binary:hinge', 'multi:softmax', 26 'multi:softprob', 'rank:pairwise', 'rank:ndcg', 'rank:map')) 27} 28 29 30# 31# Low-level functions for boosting -------------------------------------------- 32# 33 34# Merges booster params with whatever is provided in ... 35# plus runs some checks 36check.booster.params <- function(params, ...) { 37 if (!identical(class(params), "list")) 38 stop("params must be a list") 39 40 # in R interface, allow for '.' instead of '_' in parameter names 41 names(params) <- gsub("\\.", "_", names(params)) 42 43 # merge parameters from the params and the dots-expansion 44 dot_params <- list(...) 45 names(dot_params) <- gsub("\\.", "_", names(dot_params)) 46 if (length(intersect(names(params), 47 names(dot_params))) > 0) 48 stop("Same parameters in 'params' and in the call are not allowed. Please check your 'params' list.") 49 params <- c(params, dot_params) 50 51 # providing a parameter multiple times makes sense only for 'eval_metric' 52 name_freqs <- table(names(params)) 53 multi_names <- setdiff(names(name_freqs[name_freqs > 1]), 'eval_metric') 54 if (length(multi_names) > 0) { 55 warning("The following parameters were provided multiple times:\n\t", 56 paste(multi_names, collapse = ', '), "\n Only the last value for each of them will be used.\n") 57 # While xgboost internals would choose the last value for a multiple-times parameter, 58 # enforce it here in R as well (b/c multi-parameters might be used further in R code, 59 # and R takes the 1st value when multiple elements with the same name are present in a list). 60 for (n in multi_names) { 61 del_idx <- which(n == names(params)) 62 del_idx <- del_idx[-length(del_idx)] 63 params[[del_idx]] <- NULL 64 } 65 } 66 67 # for multiclass, expect num_class to be set 68 if (typeof(params[['objective']]) == "character" && 69 substr(NVL(params[['objective']], 'x'), 1, 6) == 'multi:' && 70 as.numeric(NVL(params[['num_class']], 0)) < 2) { 71 stop("'num_class' > 1 parameter must be set for multiclass classification") 72 } 73 74 # monotone_constraints parser 75 76 if (!is.null(params[['monotone_constraints']]) && 77 typeof(params[['monotone_constraints']]) != "character") { 78 vec2str <- paste(params[['monotone_constraints']], collapse = ',') 79 vec2str <- paste0('(', vec2str, ')') 80 params[['monotone_constraints']] <- vec2str 81 } 82 83 # interaction constraints parser (convert from list of column indices to string) 84 if (!is.null(params[['interaction_constraints']]) && 85 typeof(params[['interaction_constraints']]) != "character"){ 86 # check input class 87 if (!identical(class(params[['interaction_constraints']]), 'list')) stop('interaction_constraints should be class list') 88 if (!all(unique(sapply(params[['interaction_constraints']], class)) %in% c('numeric', 'integer'))) { 89 stop('interaction_constraints should be a list of numeric/integer vectors') 90 } 91 92 # recast parameter as string 93 interaction_constraints <- sapply(params[['interaction_constraints']], function(x) paste0('[', paste(x, collapse = ','), ']')) 94 params[['interaction_constraints']] <- paste0('[', paste(interaction_constraints, collapse = ','), ']') 95 } 96 return(params) 97} 98 99 100# Performs some checks related to custom objective function. 101# WARNING: has side-effects and can modify 'params' and 'obj' in its calling frame 102check.custom.obj <- function(env = parent.frame()) { 103 if (!is.null(env$params[['objective']]) && !is.null(env$obj)) 104 stop("Setting objectives in 'params' and 'obj' at the same time is not allowed") 105 106 if (!is.null(env$obj) && typeof(env$obj) != 'closure') 107 stop("'obj' must be a function") 108 109 # handle the case when custom objective function was provided through params 110 if (!is.null(env$params[['objective']]) && 111 typeof(env$params$objective) == 'closure') { 112 env$obj <- env$params$objective 113 env$params$objective <- NULL 114 } 115} 116 117# Performs some checks related to custom evaluation function. 118# WARNING: has side-effects and can modify 'params' and 'feval' in its calling frame 119check.custom.eval <- function(env = parent.frame()) { 120 if (!is.null(env$params[['eval_metric']]) && !is.null(env$feval)) 121 stop("Setting evaluation metrics in 'params' and 'feval' at the same time is not allowed") 122 123 if (!is.null(env$feval) && typeof(env$feval) != 'closure') 124 stop("'feval' must be a function") 125 126 # handle a situation when custom eval function was provided through params 127 if (!is.null(env$params[['eval_metric']]) && 128 typeof(env$params$eval_metric) == 'closure') { 129 env$feval <- env$params$eval_metric 130 env$params$eval_metric <- NULL 131 } 132 133 # require maximize to be set when custom feval and early stopping are used together 134 if (!is.null(env$feval) && 135 is.null(env$maximize) && ( 136 !is.null(env$early_stopping_rounds) || 137 has.callbacks(env$callbacks, 'cb.early.stop'))) 138 stop("Please set 'maximize' to indicate whether the evaluation metric needs to be maximized or not") 139} 140 141 142# Update a booster handle for an iteration with dtrain data 143xgb.iter.update <- function(booster_handle, dtrain, iter, obj = NULL) { 144 if (!identical(class(booster_handle), "xgb.Booster.handle")) { 145 stop("booster_handle must be of xgb.Booster.handle class") 146 } 147 if (!inherits(dtrain, "xgb.DMatrix")) { 148 stop("dtrain must be of xgb.DMatrix class") 149 } 150 151 if (is.null(obj)) { 152 .Call(XGBoosterUpdateOneIter_R, booster_handle, as.integer(iter), dtrain) 153 } else { 154 pred <- predict(booster_handle, dtrain, outputmargin = TRUE, training = TRUE, 155 ntreelimit = 0) 156 gpair <- obj(pred, dtrain) 157 .Call(XGBoosterBoostOneIter_R, booster_handle, dtrain, gpair$grad, gpair$hess) 158 } 159 return(TRUE) 160} 161 162 163# Evaluate one iteration. 164# Returns a named vector of evaluation metrics 165# with the names in a 'datasetname-metricname' format. 166xgb.iter.eval <- function(booster_handle, watchlist, iter, feval = NULL) { 167 if (!identical(class(booster_handle), "xgb.Booster.handle")) 168 stop("class of booster_handle must be xgb.Booster.handle") 169 170 if (length(watchlist) == 0) 171 return(NULL) 172 173 evnames <- names(watchlist) 174 if (is.null(feval)) { 175 msg <- .Call(XGBoosterEvalOneIter_R, booster_handle, as.integer(iter), watchlist, as.list(evnames)) 176 mat <- matrix(strsplit(msg, '\\s+|:')[[1]][-1], nrow = 2) 177 res <- structure(as.numeric(mat[2, ]), names = mat[1, ]) 178 } else { 179 res <- sapply(seq_along(watchlist), function(j) { 180 w <- watchlist[[j]] 181 ## predict using all trees 182 preds <- predict(booster_handle, w, outputmargin = TRUE, iterationrange = c(1, 1)) 183 eval_res <- feval(preds, w) 184 out <- eval_res$value 185 names(out) <- paste0(evnames[j], "-", eval_res$metric) 186 out 187 }) 188 } 189 return(res) 190} 191 192 193# 194# Helper functions for cross validation --------------------------------------- 195# 196 197# Possibly convert the labels into factors, depending on the objective. 198# The labels are converted into factors only when the given objective refers to the classification 199# or ranking tasks. 200convert.labels <- function(labels, objective_name) { 201 if (objective_name %in% .CLASSIFICATION_OBJECTIVES()) { 202 return(as.factor(labels)) 203 } else { 204 return(labels) 205 } 206} 207 208# Generates random (stratified if needed) CV folds 209generate.cv.folds <- function(nfold, nrows, stratified, label, params) { 210 211 # cannot do it for rank 212 objective <- params$objective 213 if (is.character(objective) && strtrim(objective, 5) == 'rank:') { 214 stop("\n\tAutomatic generation of CV-folds is not implemented for ranking!\n", 215 "\tConsider providing pre-computed CV-folds through the 'folds=' parameter.\n") 216 } 217 # shuffle 218 rnd_idx <- sample.int(nrows) 219 if (stratified && 220 length(label) == length(rnd_idx)) { 221 y <- label[rnd_idx] 222 # WARNING: some heuristic logic is employed to identify classification setting! 223 # - For classification, need to convert y labels to factor before making the folds, 224 # and then do stratification by factor levels. 225 # - For regression, leave y numeric and do stratification by quantiles. 226 if (is.character(objective)) { 227 y <- convert.labels(y, params$objective) 228 } else { 229 # If no 'objective' given in params, it means that user either wants to 230 # use the default 'reg:squarederror' objective or has provided a custom 231 # obj function. Here, assume classification setting when y has 5 or less 232 # unique values: 233 if (length(unique(y)) <= 5) { 234 y <- factor(y) 235 } 236 } 237 folds <- xgb.createFolds(y, nfold) 238 } else { 239 # make simple non-stratified folds 240 kstep <- length(rnd_idx) %/% nfold 241 folds <- list() 242 for (i in seq_len(nfold - 1)) { 243 folds[[i]] <- rnd_idx[seq_len(kstep)] 244 rnd_idx <- rnd_idx[-seq_len(kstep)] 245 } 246 folds[[nfold]] <- rnd_idx 247 } 248 return(folds) 249} 250 251# Creates CV folds stratified by the values of y. 252# It was borrowed from caret::createFolds and simplified 253# by always returning an unnamed list of fold indices. 254xgb.createFolds <- function(y, k = 10) 255{ 256 if (is.numeric(y)) { 257 ## Group the numeric data based on their magnitudes 258 ## and sample within those groups. 259 260 ## When the number of samples is low, we may have 261 ## issues further slicing the numeric data into 262 ## groups. The number of groups will depend on the 263 ## ratio of the number of folds to the sample size. 264 ## At most, we will use quantiles. If the sample 265 ## is too small, we just do regular unstratified 266 ## CV 267 cuts <- floor(length(y) / k) 268 if (cuts < 2) cuts <- 2 269 if (cuts > 5) cuts <- 5 270 y <- cut(y, 271 unique(stats::quantile(y, probs = seq(0, 1, length = cuts))), 272 include.lowest = TRUE) 273 } 274 275 if (k < length(y)) { 276 ## reset levels so that the possible levels and 277 ## the levels in the vector are the same 278 y <- factor(as.character(y)) 279 numInClass <- table(y) 280 foldVector <- vector(mode = "integer", length(y)) 281 282 ## For each class, balance the fold allocation as far 283 ## as possible, then resample the remainder. 284 ## The final assignment of folds is also randomized. 285 for (i in seq_along(numInClass)) { 286 ## create a vector of integers from 1:k as many times as possible without 287 ## going over the number of samples in the class. Note that if the number 288 ## of samples in a class is less than k, nothing is produced here. 289 seqVector <- rep(seq_len(k), numInClass[i] %/% k) 290 ## add enough random integers to get length(seqVector) == numInClass[i] 291 if (numInClass[i] %% k > 0) seqVector <- c(seqVector, sample.int(k, numInClass[i] %% k)) 292 ## shuffle the integers for fold assignment and assign to this classes's data 293 ## seqVector[sample.int(length(seqVector))] is used to handle length(seqVector) == 1 294 foldVector[y == dimnames(numInClass)$y[i]] <- seqVector[sample.int(length(seqVector))] 295 } 296 } else { 297 foldVector <- seq(along = y) 298 } 299 300 out <- split(seq(along = y), foldVector) 301 names(out) <- NULL 302 out 303} 304 305 306# 307# Deprectaion notice utilities ------------------------------------------------ 308# 309 310#' Deprecation notices. 311#' 312#' At this time, some of the parameter names were changed in order to make the code style more uniform. 313#' The deprecated parameters would be removed in the next release. 314#' 315#' To see all the current deprecated and new parameters, check the \code{xgboost:::depr_par_lut} table. 316#' 317#' A deprecation warning is shown when any of the deprecated parameters is used in a call. 318#' An additional warning is shown when there was a partial match to a deprecated parameter 319#' (as R is able to partially match parameter names). 320#' 321#' @name xgboost-deprecated 322NULL 323 324#' Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of 325#' models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}. 326#' 327#' It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or 328#' \code{\link[base]{save}}) to persist R objects to the disk. While it is possible to persist 329#' \code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if 330#' the model is to be accessed in the future. If you train a model with the current version of 331#' XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be 332#' accessible in later releases of XGBoost. To ensure that your model can be accessed in future 333#' releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead. 334#' 335#' @details 336#' Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into 337#' the JSON format by specifying the JSON extension. To read the model back, use 338#' \code{\link{xgb.load}}. 339#' 340#' Use \code{\link{xgb.save.raw}} to save the XGBoost model as a sequence (vector) of raw bytes 341#' in a future-proof manner. Future releases of XGBoost will be able to read the raw bytes and 342#' re-construct the corresponding model. To read the model back, use \code{\link{xgb.load.raw}}. 343#' The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model 344#' as part of another R object. 345#' 346#' Note: Do not use \code{\link{xgb.serialize}} to store models long-term. It persists not only the 347#' model but also internal configurations and parameters, and its format is not stable across 348#' multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing. 349#' 350#' For more details and explanation about model persistence and archival, consult the page 351#' \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}. 352#' 353#' @examples 354#' data(agaricus.train, package='xgboost') 355#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, 356#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") 357#' 358#' # Save as a stand-alone file; load it with xgb.load() 359#' xgb.save(bst, 'xgb.model') 360#' bst2 <- xgb.load('xgb.model') 361#' 362#' # Save as a stand-alone file (JSON); load it with xgb.load() 363#' xgb.save(bst, 'xgb.model.json') 364#' bst2 <- xgb.load('xgb.model.json') 365#' if (file.exists('xgb.model.json')) file.remove('xgb.model.json') 366#' 367#' # Save as a raw byte vector; load it with xgb.load.raw() 368#' xgb_bytes <- xgb.save.raw(bst) 369#' bst2 <- xgb.load.raw(xgb_bytes) 370#' 371#' # Persist XGBoost model as part of another R object 372#' obj <- list(xgb_model_bytes = xgb.save.raw(bst), description = "My first XGBoost model") 373#' # Persist the R object. Here, saveRDS() is okay, since it doesn't persist 374#' # xgb.Booster directly. What's being persisted is the future-proof byte representation 375#' # as given by xgb.save.raw(). 376#' saveRDS(obj, 'my_object.rds') 377#' # Read back the R object 378#' obj2 <- readRDS('my_object.rds') 379#' # Re-construct xgb.Booster object from the bytes 380#' bst2 <- xgb.load.raw(obj2$xgb_model_bytes) 381#' if (file.exists('my_object.rds')) file.remove('my_object.rds') 382#' 383#' @name a-compatibility-note-for-saveRDS-save 384NULL 385 386# Lookup table for the deprecated parameters bookkeeping 387depr_par_lut <- matrix(c( 388 'print.every.n', 'print_every_n', 389 'early.stop.round', 'early_stopping_rounds', 390 'training.data', 'data', 391 'with.stats', 'with_stats', 392 'numberOfClusters', 'n_clusters', 393 'features.keep', 'features_keep', 394 'plot.height', 'plot_height', 395 'plot.width', 'plot_width', 396 'n_first_tree', 'trees', 397 'dummy', 'DUMMY' 398), ncol = 2, byrow = TRUE) 399colnames(depr_par_lut) <- c('old', 'new') 400 401# Checks the dot-parameters for deprecated names 402# (including partial matching), gives a deprecation warning, 403# and sets new parameters to the old parameters' values within its parent frame. 404# WARNING: has side-effects 405check.deprecation <- function(..., env = parent.frame()) { 406 pars <- list(...) 407 # exact and partial matches 408 all_match <- pmatch(names(pars), depr_par_lut[, 1]) 409 # indices of matched pars' names 410 idx_pars <- which(!is.na(all_match)) 411 if (length(idx_pars) == 0) return() 412 # indices of matched LUT rows 413 idx_lut <- all_match[idx_pars] 414 # which of idx_lut were the exact matches? 415 ex_match <- depr_par_lut[idx_lut, 1] %in% names(pars) 416 for (i in seq_along(idx_pars)) { 417 pars_par <- names(pars)[idx_pars[i]] 418 old_par <- depr_par_lut[idx_lut[i], 1] 419 new_par <- depr_par_lut[idx_lut[i], 2] 420 if (!ex_match[i]) { 421 warning("'", pars_par, "' was partially matched to '", old_par, "'") 422 } 423 .Deprecated(new_par, old = old_par, package = 'xgboost') 424 if (new_par != 'NULL') { 425 eval(parse(text = paste(new_par, '<-', pars[[pars_par]])), envir = env) 426 } 427 } 428} 429