1#
2# This file is for the low level reusable utility functions
3# that are not supposed to be visible to a user.
4#
5
6#
7# General helper utilities ----------------------------------------------------
8#
9
10# SQL-style NVL shortcut.
11NVL <- function(x, val) {
12  if (is.null(x))
13    return(val)
14  if (is.vector(x)) {
15    x[is.na(x)] <- val
16    return(x)
17  }
18  if (typeof(x) == 'closure')
19    return(x)
20  stop("typeof(x) == ", typeof(x), " is not supported by NVL")
21}
22
23# List of classification and ranking objectives
24.CLASSIFICATION_OBJECTIVES <- function() {
25  return(c('binary:logistic', 'binary:logitraw', 'binary:hinge', 'multi:softmax',
26           'multi:softprob', 'rank:pairwise', 'rank:ndcg', 'rank:map'))
27}
28
29
30#
31# Low-level functions for boosting --------------------------------------------
32#
33
34# Merges booster params with whatever is provided in ...
35# plus runs some checks
36check.booster.params <- function(params, ...) {
37  if (!identical(class(params), "list"))
38    stop("params must be a list")
39
40  # in R interface, allow for '.' instead of '_' in parameter names
41  names(params) <- gsub("\\.", "_", names(params))
42
43  # merge parameters from the params and the dots-expansion
44  dot_params <- list(...)
45  names(dot_params) <- gsub("\\.", "_", names(dot_params))
46  if (length(intersect(names(params),
47                       names(dot_params))) > 0)
48    stop("Same parameters in 'params' and in the call are not allowed. Please check your 'params' list.")
49  params <- c(params, dot_params)
50
51  # providing a parameter multiple times makes sense only for 'eval_metric'
52  name_freqs <- table(names(params))
53  multi_names <- setdiff(names(name_freqs[name_freqs > 1]), 'eval_metric')
54  if (length(multi_names) > 0) {
55    warning("The following parameters were provided multiple times:\n\t",
56            paste(multi_names, collapse = ', '), "\n  Only the last value for each of them will be used.\n")
57    # While xgboost internals would choose the last value for a multiple-times parameter,
58    # enforce it here in R as well (b/c multi-parameters might be used further in R code,
59    # and R takes the 1st value when multiple elements with the same name are present in a list).
60    for (n in multi_names) {
61      del_idx <- which(n == names(params))
62      del_idx <- del_idx[-length(del_idx)]
63      params[[del_idx]] <- NULL
64    }
65  }
66
67  # for multiclass, expect num_class to be set
68  if (typeof(params[['objective']]) == "character" &&
69      substr(NVL(params[['objective']], 'x'), 1, 6) == 'multi:' &&
70      as.numeric(NVL(params[['num_class']], 0)) < 2) {
71        stop("'num_class' > 1 parameter must be set for multiclass classification")
72  }
73
74  # monotone_constraints parser
75
76  if (!is.null(params[['monotone_constraints']]) &&
77      typeof(params[['monotone_constraints']]) != "character") {
78        vec2str <- paste(params[['monotone_constraints']], collapse = ',')
79        vec2str <- paste0('(', vec2str, ')')
80        params[['monotone_constraints']] <- vec2str
81  }
82
83  # interaction constraints parser (convert from list of column indices to string)
84  if (!is.null(params[['interaction_constraints']]) &&
85      typeof(params[['interaction_constraints']]) != "character"){
86    # check input class
87    if (!identical(class(params[['interaction_constraints']]), 'list')) stop('interaction_constraints should be class list')
88    if (!all(unique(sapply(params[['interaction_constraints']], class)) %in% c('numeric', 'integer'))) {
89      stop('interaction_constraints should be a list of numeric/integer vectors')
90    }
91
92    # recast parameter as string
93    interaction_constraints <- sapply(params[['interaction_constraints']], function(x) paste0('[', paste(x, collapse = ','), ']'))
94    params[['interaction_constraints']] <- paste0('[', paste(interaction_constraints, collapse = ','), ']')
95  }
96  return(params)
97}
98
99
100# Performs some checks related to custom objective function.
101# WARNING: has side-effects and can modify 'params' and 'obj' in its calling frame
102check.custom.obj <- function(env = parent.frame()) {
103  if (!is.null(env$params[['objective']]) && !is.null(env$obj))
104    stop("Setting objectives in 'params' and 'obj' at the same time is not allowed")
105
106  if (!is.null(env$obj) && typeof(env$obj) != 'closure')
107    stop("'obj' must be a function")
108
109  # handle the case when custom objective function was provided through params
110  if (!is.null(env$params[['objective']]) &&
111      typeof(env$params$objective) == 'closure') {
112    env$obj <- env$params$objective
113    env$params$objective <- NULL
114  }
115}
116
117# Performs some checks related to custom evaluation function.
118# WARNING: has side-effects and can modify 'params' and 'feval' in its calling frame
119check.custom.eval <- function(env = parent.frame()) {
120  if (!is.null(env$params[['eval_metric']]) && !is.null(env$feval))
121    stop("Setting evaluation metrics in 'params' and 'feval' at the same time is not allowed")
122
123  if (!is.null(env$feval) && typeof(env$feval) != 'closure')
124    stop("'feval' must be a function")
125
126  # handle a situation when custom eval function was provided through params
127  if (!is.null(env$params[['eval_metric']]) &&
128      typeof(env$params$eval_metric) == 'closure') {
129    env$feval <- env$params$eval_metric
130    env$params$eval_metric <- NULL
131  }
132
133  # require maximize to be set when custom feval and early stopping are used together
134  if (!is.null(env$feval) &&
135      is.null(env$maximize) && (
136        !is.null(env$early_stopping_rounds) ||
137        has.callbacks(env$callbacks, 'cb.early.stop')))
138    stop("Please set 'maximize' to indicate whether the evaluation metric needs to be maximized or not")
139}
140
141
142# Update a booster handle for an iteration with dtrain data
143xgb.iter.update <- function(booster_handle, dtrain, iter, obj = NULL) {
144  if (!identical(class(booster_handle), "xgb.Booster.handle")) {
145    stop("booster_handle must be of xgb.Booster.handle class")
146  }
147  if (!inherits(dtrain, "xgb.DMatrix")) {
148    stop("dtrain must be of xgb.DMatrix class")
149  }
150
151  if (is.null(obj)) {
152    .Call(XGBoosterUpdateOneIter_R, booster_handle, as.integer(iter), dtrain)
153  } else {
154    pred <- predict(booster_handle, dtrain, outputmargin = TRUE, training = TRUE,
155                    ntreelimit = 0)
156    gpair <- obj(pred, dtrain)
157    .Call(XGBoosterBoostOneIter_R, booster_handle, dtrain, gpair$grad, gpair$hess)
158  }
159  return(TRUE)
160}
161
162
163# Evaluate one iteration.
164# Returns a named vector of evaluation metrics
165# with the names in a 'datasetname-metricname' format.
166xgb.iter.eval <- function(booster_handle, watchlist, iter, feval = NULL) {
167  if (!identical(class(booster_handle), "xgb.Booster.handle"))
168    stop("class of booster_handle must be xgb.Booster.handle")
169
170  if (length(watchlist) == 0)
171    return(NULL)
172
173  evnames <- names(watchlist)
174  if (is.null(feval)) {
175    msg <- .Call(XGBoosterEvalOneIter_R, booster_handle, as.integer(iter), watchlist, as.list(evnames))
176    mat <- matrix(strsplit(msg, '\\s+|:')[[1]][-1], nrow = 2)
177    res <- structure(as.numeric(mat[2, ]), names = mat[1, ])
178  } else {
179    res <- sapply(seq_along(watchlist), function(j) {
180      w <- watchlist[[j]]
181      ## predict using all trees
182      preds <- predict(booster_handle, w, outputmargin = TRUE, iterationrange = c(1, 1))
183      eval_res <- feval(preds, w)
184      out <- eval_res$value
185      names(out) <- paste0(evnames[j], "-", eval_res$metric)
186      out
187    })
188  }
189  return(res)
190}
191
192
193#
194# Helper functions for cross validation ---------------------------------------
195#
196
197# Possibly convert the labels into factors, depending on the objective.
198# The labels are converted into factors only when the given objective refers to the classification
199# or ranking tasks.
200convert.labels <- function(labels, objective_name) {
201  if (objective_name %in% .CLASSIFICATION_OBJECTIVES()) {
202    return(as.factor(labels))
203  } else {
204    return(labels)
205  }
206}
207
208# Generates random (stratified if needed) CV folds
209generate.cv.folds <- function(nfold, nrows, stratified, label, params) {
210
211  # cannot do it for rank
212  objective <- params$objective
213  if (is.character(objective) && strtrim(objective, 5) == 'rank:') {
214    stop("\n\tAutomatic generation of CV-folds is not implemented for ranking!\n",
215         "\tConsider providing pre-computed CV-folds through the 'folds=' parameter.\n")
216  }
217  # shuffle
218  rnd_idx <- sample.int(nrows)
219  if (stratified &&
220      length(label) == length(rnd_idx)) {
221    y <- label[rnd_idx]
222    # WARNING: some heuristic logic is employed to identify classification setting!
223    #  - For classification, need to convert y labels to factor before making the folds,
224    #    and then do stratification by factor levels.
225    #  - For regression, leave y numeric and do stratification by quantiles.
226    if (is.character(objective)) {
227      y <- convert.labels(y, params$objective)
228    } else {
229      # If no 'objective' given in params, it means that user either wants to
230      # use the default 'reg:squarederror' objective or has provided a custom
231      # obj function.  Here, assume classification setting when y has 5 or less
232      # unique values:
233      if (length(unique(y)) <= 5) {
234        y <- factor(y)
235      }
236    }
237    folds <- xgb.createFolds(y, nfold)
238  } else {
239    # make simple non-stratified folds
240    kstep <- length(rnd_idx) %/% nfold
241    folds <- list()
242    for (i in seq_len(nfold - 1)) {
243      folds[[i]] <- rnd_idx[seq_len(kstep)]
244      rnd_idx <- rnd_idx[-seq_len(kstep)]
245    }
246    folds[[nfold]] <- rnd_idx
247  }
248  return(folds)
249}
250
251# Creates CV folds stratified by the values of y.
252# It was borrowed from caret::createFolds and simplified
253# by always returning an unnamed list of fold indices.
254xgb.createFolds <- function(y, k = 10)
255{
256  if (is.numeric(y)) {
257    ## Group the numeric data based on their magnitudes
258    ## and sample within those groups.
259
260    ## When the number of samples is low, we may have
261    ## issues further slicing the numeric data into
262    ## groups. The number of groups will depend on the
263    ## ratio of the number of folds to the sample size.
264    ## At most, we will use quantiles. If the sample
265    ## is too small, we just do regular unstratified
266    ## CV
267    cuts <- floor(length(y) / k)
268    if (cuts < 2) cuts <- 2
269    if (cuts > 5) cuts <- 5
270    y <- cut(y,
271             unique(stats::quantile(y, probs = seq(0, 1, length = cuts))),
272             include.lowest = TRUE)
273  }
274
275  if (k < length(y)) {
276    ## reset levels so that the possible levels and
277    ## the levels in the vector are the same
278    y <- factor(as.character(y))
279    numInClass <- table(y)
280    foldVector <- vector(mode = "integer", length(y))
281
282    ## For each class, balance the fold allocation as far
283    ## as possible, then resample the remainder.
284    ## The final assignment of folds is also randomized.
285    for (i in seq_along(numInClass)) {
286      ## create a vector of integers from 1:k as many times as possible without
287      ## going over the number of samples in the class. Note that if the number
288      ## of samples in a class is less than k, nothing is produced here.
289      seqVector <- rep(seq_len(k), numInClass[i] %/% k)
290      ## add enough random integers to get  length(seqVector) == numInClass[i]
291      if (numInClass[i] %% k > 0) seqVector <- c(seqVector, sample.int(k, numInClass[i] %% k))
292      ## shuffle the integers for fold assignment and assign to this classes's data
293      ## seqVector[sample.int(length(seqVector))] is used to handle length(seqVector) == 1
294      foldVector[y == dimnames(numInClass)$y[i]] <- seqVector[sample.int(length(seqVector))]
295    }
296  } else {
297    foldVector <- seq(along = y)
298  }
299
300  out <- split(seq(along = y), foldVector)
301  names(out) <- NULL
302  out
303}
304
305
306#
307# Deprectaion notice utilities ------------------------------------------------
308#
309
310#' Deprecation notices.
311#'
312#' At this time, some of the parameter names were changed in order to make the code style more uniform.
313#' The deprecated parameters would be removed in the next release.
314#'
315#' To see all the current deprecated and new parameters, check the \code{xgboost:::depr_par_lut} table.
316#'
317#' A deprecation warning is shown when any of the deprecated parameters is used in a call.
318#' An additional warning is shown when there was a partial match to a deprecated parameter
319#' (as R is able to partially match parameter names).
320#'
321#' @name xgboost-deprecated
322NULL
323
324#' Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of
325#' models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}.
326#'
327#' It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or
328#' \code{\link[base]{save}}) to persist R objects to the disk. While it is possible to persist
329#' \code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if
330#' the model is to be accessed in the future. If you train a model with the current version of
331#' XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be
332#' accessible in later releases of XGBoost. To ensure that your model can be accessed in future
333#' releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead.
334#'
335#' @details
336#' Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into
337#' the JSON format by specifying the JSON extension. To read the model back, use
338#' \code{\link{xgb.load}}.
339#'
340#' Use \code{\link{xgb.save.raw}} to save the XGBoost model as a sequence (vector) of raw bytes
341#' in a future-proof manner. Future releases of XGBoost will be able to read the raw bytes and
342#' re-construct the corresponding model. To read the model back, use \code{\link{xgb.load.raw}}.
343#' The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model
344#' as part of another R object.
345#'
346#' Note: Do not use \code{\link{xgb.serialize}} to store models long-term. It persists not only the
347#' model but also internal configurations and parameters, and its format is not stable across
348#' multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing.
349#'
350#' For more details and explanation about model persistence and archival, consult the page
351#' \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}.
352#'
353#' @examples
354#' data(agaricus.train, package='xgboost')
355#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
356#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
357#'
358#' # Save as a stand-alone file; load it with xgb.load()
359#' xgb.save(bst, 'xgb.model')
360#' bst2 <- xgb.load('xgb.model')
361#'
362#' # Save as a stand-alone file (JSON); load it with xgb.load()
363#' xgb.save(bst, 'xgb.model.json')
364#' bst2 <- xgb.load('xgb.model.json')
365#' if (file.exists('xgb.model.json')) file.remove('xgb.model.json')
366#'
367#' # Save as a raw byte vector; load it with xgb.load.raw()
368#' xgb_bytes <- xgb.save.raw(bst)
369#' bst2 <- xgb.load.raw(xgb_bytes)
370#'
371#' # Persist XGBoost model as part of another R object
372#' obj <- list(xgb_model_bytes = xgb.save.raw(bst), description = "My first XGBoost model")
373#' # Persist the R object. Here, saveRDS() is okay, since it doesn't persist
374#' # xgb.Booster directly. What's being persisted is the future-proof byte representation
375#' # as given by xgb.save.raw().
376#' saveRDS(obj, 'my_object.rds')
377#' # Read back the R object
378#' obj2 <- readRDS('my_object.rds')
379#' # Re-construct xgb.Booster object from the bytes
380#' bst2 <- xgb.load.raw(obj2$xgb_model_bytes)
381#' if (file.exists('my_object.rds')) file.remove('my_object.rds')
382#'
383#' @name a-compatibility-note-for-saveRDS-save
384NULL
385
386# Lookup table for the deprecated parameters bookkeeping
387depr_par_lut <- matrix(c(
388  'print.every.n', 'print_every_n',
389  'early.stop.round', 'early_stopping_rounds',
390  'training.data', 'data',
391  'with.stats', 'with_stats',
392  'numberOfClusters', 'n_clusters',
393  'features.keep', 'features_keep',
394  'plot.height', 'plot_height',
395  'plot.width', 'plot_width',
396  'n_first_tree', 'trees',
397  'dummy', 'DUMMY'
398), ncol = 2, byrow = TRUE)
399colnames(depr_par_lut) <- c('old', 'new')
400
401# Checks the dot-parameters for deprecated names
402# (including partial matching), gives a deprecation warning,
403# and sets new parameters to the old parameters' values within its parent frame.
404# WARNING: has side-effects
405check.deprecation <- function(..., env = parent.frame()) {
406  pars <- list(...)
407  # exact and partial matches
408  all_match <- pmatch(names(pars), depr_par_lut[, 1])
409  # indices of matched pars' names
410  idx_pars <- which(!is.na(all_match))
411  if (length(idx_pars) == 0) return()
412  # indices of matched LUT rows
413  idx_lut <- all_match[idx_pars]
414  # which of idx_lut were the exact matches?
415  ex_match <- depr_par_lut[idx_lut, 1] %in% names(pars)
416  for (i in seq_along(idx_pars)) {
417    pars_par <- names(pars)[idx_pars[i]]
418    old_par <- depr_par_lut[idx_lut[i], 1]
419    new_par <- depr_par_lut[idx_lut[i], 2]
420    if (!ex_match[i]) {
421      warning("'", pars_par, "' was partially matched to '", old_par, "'")
422    }
423    .Deprecated(new_par, old = old_par, package = 'xgboost')
424    if (new_par != 'NULL') {
425      eval(parse(text = paste(new_par, '<-', pars[[pars_par]])), envir = env)
426    }
427  }
428}
429