require(xgboost)

context("basic functions")

data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
train <- agaricus.train
test <- agaricus.test
set.seed(1994)

# disable some tests for Win32
windows_flag <- .Platform$OS.type == "windows" &&
                .Machine$sizeof.pointer != 8
solaris_flag <- (Sys.info()['sysname'] == "SunOS")

test_that("train and predict binary classification", {
  nrounds <- 2
  expect_output(
    bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
                   eta = 1, nthread = 2, nrounds = nrounds, objective = "binary:logistic",
                   eval_metric = "error")
  , "train-error")
  expect_equal(class(bst), "xgb.Booster")
  expect_equal(bst$niter, nrounds)
  expect_false(is.null(bst$evaluation_log))
  expect_equal(nrow(bst$evaluation_log), nrounds)
  expect_lt(bst$evaluation_log[, min(train_error)], 0.03)

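  # predict on the held-out test set; agaricus.test has 1611 rows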
  pred <- predict(bst, test$data)
  expect_length(pred, 1611)

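  # predict with only the first tree; its error should match the first entry of the evaluation log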
  pred1 <- predict(bst, train$data, ntreelimit = 1)
  expect_length(pred1, 6513)
  err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label)
  err_log <- bst$evaluation_log[1, train_error]
  expect_lt(abs(err_pred1 - err_log), 10e-6)

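  # iterationrange = c(1, 2) selects only the first boosting round, so it should agree with ntreelimit = 1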
  pred2 <- predict(bst, train$data, iterationrange = c(1, 2))
  expect_length(pred2, 6513)
  expect_equal(pred1, pred2)
})

test_that("parameter validation works", {
  p <- list(foo = "bar")
  nrounds <- 1
  set.seed(1994)

  d <- cbind(
    x1 = rnorm(10),
    x2 = rnorm(10),
    x3 = rnorm(10))
  y <- d[, "x1"] + d[, "x2"]^2 +
    ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) +
    rnorm(10)
  dtrain <- xgb.DMatrix(data = d, info = list(label = y))

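  # training with only recognised parameters should produce no output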
  correct <- function() {
    params <- list(max_depth = 2, booster = "dart",
                   rate_drop = 0.5, one_drop = TRUE,
                   objective = "reg:squarederror")
    xgb.train(params = params, data = dtrain, nrounds = nrounds)
  }
  expect_silent(correct())
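  # unrecognised parameters should be reported in the training output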
  incorrect <- function() {
    params <- list(max_depth = 2, booster = "dart",
                   rate_drop = 0.5, one_drop = TRUE,
                   objective = "reg:squarederror",
                   foo = "bar", bar = "foo")
    output <- capture.output(
      xgb.train(params = params, data = dtrain, nrounds = nrounds))
    print(output)
  }
  expect_output(incorrect(), '\\\\"bar\\\\", \\\\"foo\\\\"')
})


test_that("dart prediction works", {
  nrounds <- 32
  set.seed(1994)

  d <- cbind(
    x1 = rnorm(100),
    x2 = rnorm(100),
    x3 = rnorm(100))
  y <- d[, "x1"] + d[, "x2"]^2 +
    ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) +
    rnorm(100)

  set.seed(1994)
  booster_by_xgboost <- xgboost(data = d, label = y, max_depth = 2, booster = "dart",
                                rate_drop = 0.5, one_drop = TRUE,
                                eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror")
  pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0)
  pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds)
  expect_true(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))

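  # with training = TRUE, DART applies dropout during prediction, so results differ from regular inference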
  pred_by_xgboost_2 <- predict(booster_by_xgboost, newdata = d, training = TRUE)
  expect_false(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE)))

  set.seed(1994)
  dtrain <- xgb.DMatrix(data = d, info = list(label = y))
  booster_by_train <- xgb.train(params = list(
                                    booster = "dart",
                                    max_depth = 2,
                                    eta = 1,
                                    rate_drop = 0.5,
                                    one_drop = TRUE,
                                    nthread = 1,
                                    tree_method = "exact",
                                    objective = "reg:squarederror"
                                ),
                                data = dtrain,
                                nrounds = nrounds
                                )
  pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, ntreelimit = 0)
  pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds)
  pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)

  expect_true(all(matrix(pred_by_train_0, byrow = TRUE) == matrix(pred_by_xgboost_0, byrow = TRUE)))
  expect_true(all(matrix(pred_by_train_1, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
  expect_true(all(matrix(pred_by_train_2, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE)))
})

test_that("train and predict softprob", {
  lb <- as.numeric(iris$Species) - 1
  set.seed(11)
  expect_output(
    bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
                   max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
                   objective = "multi:softprob", num_class = 3, eval_metric = "merror")
  , "train-merror")
  expect_false(is.null(bst$evaluation_log))
  expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
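  # multi:softprob grows one tree per class at each iteration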
  expect_equal(bst$niter * 3, xgb.ntree(bst))
  pred <- predict(bst, as.matrix(iris[, -5]))
  expect_length(pred, nrow(iris) * 3)
  # row sums add up to total probability of 1:
  expect_equal(rowSums(matrix(pred, ncol = 3, byrow = TRUE)), rep(1, nrow(iris)), tolerance = 1e-7)
  # manually calculate error at the last iteration:
  mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE)
  expect_equal(as.numeric(t(mpred)), pred)
  pred_labels <- max.col(mpred) - 1
  err <- sum(pred_labels != lb) / length(lb)
  expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6)
  # manually calculate error at the 1st iteration:
  mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 1)
  pred_labels <- max.col(mpred) - 1
  err <- sum(pred_labels != lb) / length(lb)
  expect_equal(bst$evaluation_log[1, train_merror], err, tolerance = 5e-6)

  mpred1 <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 2))
  expect_equal(mpred, mpred1)

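  # a 10-class problem on random data exercises prediction reshaping with strict_shape = FALSE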
  d <- cbind(
    x1 = rnorm(100),
    x2 = rnorm(100),
    x3 = rnorm(100)
  )
  y <- sample.int(10, 100, replace = TRUE) - 1
  dtrain <- xgb.DMatrix(data = d, info = list(label = y))
  booster <- xgb.train(
    params = list(tree_method = "hist"), data = dtrain, nrounds = 4, num_class = 10,
    objective = "multi:softprob"
  )
  predt <- predict(booster, as.matrix(d), reshape = TRUE, strict_shape = FALSE)
  expect_equal(ncol(predt), 10)
  expect_equal(rowSums(predt), rep(1, 100), tolerance = 1e-7)
})

test_that("train and predict softmax", {
  lb <- as.numeric(iris$Species) - 1
  set.seed(11)
  expect_output(
    bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
                   max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
                   objective = "multi:softmax", num_class = 3, eval_metric = "merror")
  , "train-merror")
  expect_false(is.null(bst$evaluation_log))
  expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
  expect_equal(bst$niter * 3, xgb.ntree(bst))

  pred <- predict(bst, as.matrix(iris[, -5]))
  expect_length(pred, nrow(iris))
  err <- sum(pred != lb) / length(lb)
  expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6)
})

test_that("train and predict RF", {
  set.seed(11)
  lb <- train$label
  # single iteration
  bst <- xgboost(data = train$data, label = lb, max_depth = 5,
                 nthread = 2, nrounds = 1, objective = "binary:logistic", eval_metric = "error",
                 num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1)
  expect_equal(bst$niter, 1)
  expect_equal(xgb.ntree(bst), 20)

  pred <- predict(bst, train$data)
  pred_err <- sum((pred > 0.5) != lb) / length(lb)
  expect_lt(abs(bst$evaluation_log[1, train_error] - pred_err), 10e-6)
  #expect_lt(pred_err, 0.03)

  pred <- predict(bst, train$data, ntreelimit = 20)
  pred_err_20 <- sum((pred > 0.5) != lb) / length(lb)
  expect_equal(pred_err_20, pred_err)

  pred1 <- predict(bst, train$data, iterationrange = c(1, 2))
  expect_equal(pred, pred1)
})

test_that("train and predict RF with softprob", {
  lb <- as.numeric(iris$Species) - 1
  nrounds <- 15
  set.seed(11)
  bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
                 max_depth = 3, eta = 0.9, nthread = 2, nrounds = nrounds,
                 objective = "multi:softprob", eval_metric = "merror",
                 num_class = 3, verbose = 0,
                 num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5)
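  # total trees = nrounds * num_class * num_parallel_tree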
  expect_equal(bst$niter, 15)
  expect_equal(xgb.ntree(bst), 15 * 3 * 4)
  # predict for all iterations:
  pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE)
  expect_equal(dim(pred), c(nrow(iris), 3))
  pred_labels <- max.col(pred) - 1
  err <- sum(pred_labels != lb) / length(lb)
  expect_equal(bst$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6)
  # predict for 7 iterations and adjust for 4 parallel trees per iteration
  pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 7 * 4)
  err <- sum((max.col(pred) - 1) != lb) / length(lb)
  expect_equal(bst$evaluation_log[7, train_merror], err, tolerance = 5e-6)
})

test_that("use of multiple eval metrics works", {
  expect_output(
    bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
                   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic",
                   eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss")
  , "train-error.*train-auc.*train-logloss")
  expect_false(is.null(bst$evaluation_log))
  expect_equal(dim(bst$evaluation_log), c(2, 4))
  expect_equal(colnames(bst$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
})


test_that("training continuation works", {
  dtrain <- xgb.DMatrix(train$data, label = train$label)
  watchlist <- list(train = dtrain)
  param <- list(objective = "binary:logistic", max_depth = 2, eta = 1, nthread = 2)

  # for the reference, use 4 iterations at once:
  set.seed(11)
  bst <- xgb.train(param, dtrain, nrounds = 4, watchlist, verbose = 0)
  # first two iterations:
  set.seed(11)
  bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
  # continue for two more:
  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1)
  if (!windows_flag && !solaris_flag)
    expect_equal(bst$raw, bst2$raw)
  expect_false(is.null(bst2$evaluation_log))
  expect_equal(dim(bst2$evaluation_log), c(4, 2))
  expect_equal(bst2$evaluation_log, bst$evaluation_log)
  # test continuing from raw model data
  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1$raw)
  if (!windows_flag && !solaris_flag)
    expect_equal(bst$raw, bst2$raw)
  expect_equal(dim(bst2$evaluation_log), c(2, 2))
  # test continuing from a model in file
  xgb.save(bst1, "xgboost.json")
  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.json")
  if (!windows_flag && !solaris_flag)
    expect_equal(bst$raw, bst2$raw)
  expect_equal(dim(bst2$evaluation_log), c(2, 2))
  file.remove("xgboost.json")
})

test_that("model serialization works", {
  out_path <- "model_serialization"
  dtrain <- xgb.DMatrix(train$data, label = train$label)
  watchlist <- list(train = dtrain)
  param <- list(objective = "binary:logistic")
  booster <- xgb.train(param, dtrain, nrounds = 4, watchlist)
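  # serialization should round-trip: unserializing and re-serializing yields identical raw bytes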
  raw <- xgb.serialize(booster)
  saveRDS(raw, out_path)
  raw <- readRDS(out_path)

  loaded <- xgb.unserialize(raw)
  raw_from_loaded <- xgb.serialize(loaded)
  expect_equal(raw, raw_from_loaded)
  file.remove(out_path)
})

test_that("xgb.cv works", {
  set.seed(11)
  expect_output(
    cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5,
                 eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
                 eval_metric = "error", verbose = TRUE)
  , "train-error:")
  expect_is(cv, 'xgb.cv.synchronous')
  expect_false(is.null(cv$evaluation_log))
  expect_lt(cv$evaluation_log[, min(test_error_mean)], 0.03)
  expect_lt(cv$evaluation_log[, min(test_error_std)], 0.008)
  expect_equal(cv$niter, 2)
  expect_true(!is.null(cv$folds) && is.list(cv$folds))
  expect_length(cv$folds, 5)
  expect_true(!is.null(cv$params) && is.list(cv$params))
  expect_false(is.null(cv$callbacks))
  expect_false(is.null(cv$call))
})

test_that("xgb.cv works with stratified folds", {
  dtrain <- xgb.DMatrix(train$data, label = train$label)
  set.seed(314159)
  cv <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5,
               eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
               verbose = TRUE, stratified = FALSE)
  set.seed(314159)
  cv2 <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5,
                eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
                verbose = TRUE, stratified = TRUE)
  # Stratified folds should result in different evaluation logs
  expect_true(all(cv$evaluation_log[, test_logloss_mean] != cv2$evaluation_log[, test_logloss_mean]))
})

test_that("train and predict with non-strict classes", {
  # standard dense matrix input
  train_dense <- as.matrix(train$data)
  bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
                 eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
  pr0 <- predict(bst, train_dense)

  # dense matrix-like input of non-matrix class
  class(train_dense) <- 'shmatrix'
  expect_true(is.matrix(train_dense))
  expect_error(
    bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
                   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
    , regexp = NA)
  expect_error(pr <- predict(bst, train_dense), regexp = NA)
  expect_equal(pr0, pr)

  # dense matrix-like input of non-matrix class with some inheritance
  class(train_dense) <- c('pphmatrix', 'shmatrix')
  expect_true(is.matrix(train_dense))
  expect_error(
    bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
                   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
    , regexp = NA)
  expect_error(pr <- predict(bst, train_dense), regexp = NA)
  expect_equal(pr0, pr)

  # when someone inherits from xgb.Booster, it should still be possible to use it as xgb.Booster
  class(bst) <- c('super.Booster', 'xgb.Booster')
  expect_error(pr <- predict(bst, train_dense), regexp = NA)
  expect_equal(pr0, pr)
})

test_that("max_delta_step works", {
  dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
  watchlist <- list(train = dtrain)
  param <- list(objective = "binary:logistic", eval_metric = "logloss", max_depth = 2, nthread = 2, eta = 0.5)
  nrounds <- 5
  # model with no restriction on max_delta_step
  bst1 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1)
  # model with restricted max_delta_step
  bst2 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1, max_delta_step = 1)
  # the no-restriction model is expected to have consistently lower loss during the initial iterations
  expect_true(all(bst1$evaluation_log$train_logloss < bst2$evaluation_log$train_logloss))
  expect_lt(mean(bst1$evaluation_log$train_logloss) / mean(bst2$evaluation_log$train_logloss), 0.8)
})

test_that("colsample_bytree works", {
  # Randomly generate data matrix by sampling from uniform distribution [-1, 1]
  set.seed(1)
  train_x <- matrix(runif(1000, min = -1, max = 1), ncol = 100)
  train_y <- as.numeric(rowSums(train_x) > 0)
  test_x <- matrix(runif(1000, min = -1, max = 1), ncol = 100)
  test_y <- as.numeric(rowSums(test_x) > 0)
  colnames(train_x) <- paste0("Feature_", sprintf("%03d", 1:100))
  colnames(test_x) <- paste0("Feature_", sprintf("%03d", 1:100))
  dtrain <- xgb.DMatrix(train_x, label = train_y)
  dtest <- xgb.DMatrix(test_x, label = test_y)
  watchlist <- list(train = dtrain, eval = dtest)
  ## Use colsample_bytree = 0.01, so that roughly one out of 100 features is chosen for
  ## each tree
  param <- list(max_depth = 2, eta = 0, nthread = 2,
                colsample_bytree = 0.01, objective = "binary:logistic",
                eval_metric = "auc")
  set.seed(2)
  bst <- xgb.train(param, dtrain, nrounds = 100, watchlist, verbose = 0)
  xgb.importance(model = bst)
  # If colsample_bytree works properly, a variety of features should be used
  # in the 100 trees
  expect_gte(nrow(xgb.importance(model = bst)), 30)
})

test_that("Configuration works", {
  bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
                 eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic",
                 eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss")
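  # assigning a model's own configuration back to it should be a no-op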
  config <- xgb.config(bst)
  xgb.config(bst) <- config
  reloaded_config <- xgb.config(bst)
  expect_equal(config, reloaded_config)
})

test_that("strict_shape works", {
  n_rounds <- 2

  test_strict_shape <- function(bst, X, n_groups) {
    predt <- predict(bst, X, strict_shape = TRUE)
    margin <- predict(bst, X, outputmargin = TRUE, strict_shape = TRUE)
    contri <- predict(bst, X, predcontrib = TRUE, strict_shape = TRUE)
    interact <- predict(bst, X, predinteraction = TRUE, strict_shape = TRUE)
    leaf <- predict(bst, X, predleaf = TRUE, strict_shape = TRUE)

    n_rows <- nrow(X)
    n_cols <- ncol(X)

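    # strict_shape returns (groups, rows) arrays; contributions and interactions carry an extra slot for the bias term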
    expect_equal(dim(predt), c(n_groups, n_rows))
    expect_equal(dim(margin), c(n_groups, n_rows))
    expect_equal(dim(contri), c(n_cols + 1, n_groups, n_rows))
    expect_equal(dim(interact), c(n_cols + 1, n_cols + 1, n_groups, n_rows))
    expect_equal(dim(leaf), c(1, n_groups, n_rounds, n_rows))

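    # per-class feature contributions should sum to the corresponding margin prediction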
    if (n_groups != 1) {
      for (g in seq_len(n_groups)) {
        expect_lt(max(abs(colSums(contri[, g, ]) - margin[g, ])), 1e-5)
      }
    }
  }

  test_iris <- function() {
    y <- as.numeric(iris$Species) - 1
    X <- as.matrix(iris[, -5])

    bst <- xgboost(data = X, label = y,
                   max_depth = 2, nrounds = n_rounds,
                   objective = "multi:softprob", num_class = 3, eval_metric = "merror")

    test_strict_shape(bst, X, 3)
  }


  test_agaricus <- function() {
    data(agaricus.train, package = 'xgboost')
    X <- agaricus.train$data
    y <- agaricus.train$label

    bst <- xgboost(data = X, label = y, max_depth = 2,
                   nrounds = n_rounds, objective = "binary:logistic",
                   eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss")

    test_strict_shape(bst, X, 1)
  }

  test_iris()
  test_agaricus()
})