1library(testthat)
2context('Test helper functions')
3
4require(xgboost)
5require(data.table)
6require(Matrix)
7require(vcd, quietly = TRUE)
8
9float_tolerance <- 5e-6
10
11# disable some tests for 32-bit environment
12flag_32bit <- .Machine$sizeof.pointer != 8
13
14set.seed(1982)
15data(Arthritis)
16df <- data.table(Arthritis, keep.rownames = FALSE)
17df[, AgeDiscret := as.factor(round(Age / 10, 0))]
18df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
19df[, ID := NULL]
20sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df) # nolint
21label <- df[, ifelse(Improved == "Marked", 1, 0)]
22
23# binary
24nrounds <- 12
25bst.Tree <- xgboost(data = sparse_matrix, label = label, max_depth = 9,
26                    eta = 1, nthread = 2, nrounds = nrounds, verbose = 0,
27                    objective = "binary:logistic", booster = "gbtree")
28
29bst.GLM <- xgboost(data = sparse_matrix, label = label,
30                   eta = 1, nthread = 1, nrounds = nrounds, verbose = 0,
31                   objective = "binary:logistic", booster = "gblinear")
32
33feature.names <- colnames(sparse_matrix)
34
35# multiclass
36mlabel <- as.numeric(iris$Species) - 1
37nclass <- 3
38mbst.Tree <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0,
39                     max_depth = 3, eta = 0.5, nthread = 2, nrounds = nrounds,
40                     objective = "multi:softprob", num_class = nclass, base_score = 0)
41
42mbst.GLM <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0,
43                    booster = "gblinear", eta = 0.1, nthread = 1, nrounds = nrounds,
44                    objective = "multi:softprob", num_class = nclass, base_score = 0)
45
46
47test_that("xgb.dump works", {
48  if (!flag_32bit)
49    expect_length(xgb.dump(bst.Tree), 200)
50  dump_file <- file.path(tempdir(), 'xgb.model.dump')
51  expect_true(xgb.dump(bst.Tree, dump_file, with_stats = TRUE))
52  expect_true(file.exists(dump_file))
53  expect_gt(file.size(dump_file), 8000)
54
55  # JSON format
56  dmp <- xgb.dump(bst.Tree, dump_format = "json")
57  expect_length(dmp, 1)
58  if (!flag_32bit)
59    expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 188)
60})
61
62test_that("xgb.dump works for gblinear", {
63  expect_length(xgb.dump(bst.GLM), 14)
64  # also make sure that it works properly for a sparse model where some coefficients
65  # are 0 from setting large L1 regularization:
66  bst.GLM.sp <- xgboost(data = sparse_matrix, label = label, eta = 1, nthread = 2, nrounds = 1,
67                        alpha = 2, objective = "binary:logistic", booster = "gblinear")
68  d.sp <- xgb.dump(bst.GLM.sp)
69  expect_length(d.sp, 14)
70  expect_gt(sum(d.sp == "0"), 0)
71
72  # JSON format
73  dmp <- xgb.dump(bst.GLM.sp, dump_format = "json")
74  expect_length(dmp, 1)
75  expect_length(grep('\\d', strsplit(dmp, '\n')[[1]]), 11)
76})
77
78test_that("predict leafs works", {
79  # no error for gbtree
80  expect_error(pred_leaf <- predict(bst.Tree, sparse_matrix, predleaf = TRUE), regexp = NA)
81  expect_equal(dim(pred_leaf), c(nrow(sparse_matrix), nrounds))
82  # error for gblinear
83  expect_error(predict(bst.GLM, sparse_matrix, predleaf = TRUE))
84})
85
86test_that("predict feature contributions works", {
87  # gbtree binary classifier
88  expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE), regexp = NA)
89  expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
90  expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
91  pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
92  expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
93  # must work with data that has no column names
94  X <- sparse_matrix
95  colnames(X) <- NULL
96  expect_error(pred_contr_ <- predict(bst.Tree, X, predcontrib = TRUE), regexp = NA)
97  expect_equal(pred_contr, pred_contr_, check.attributes = FALSE,
98               tolerance = float_tolerance)
99
100  # gbtree binary classifier (approximate method)
101  expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE, approxcontrib = TRUE), regexp = NA)
102  expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
103  expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
104  pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
105  expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
106
107  # gblinear binary classifier
108  expect_error(pred_contr <- predict(bst.GLM, sparse_matrix, predcontrib = TRUE), regexp = NA)
109  expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
110  expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
111  pred <- predict(bst.GLM, sparse_matrix, outputmargin = TRUE)
112  expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
113  # manual calculation of linear terms
114  coefs <- as.numeric(xgb.dump(bst.GLM)[-c(1, 2, 4)])
115  coefs <- c(coefs[-1], coefs[1]) # intercept must be the last
116  pred_contr_manual <- sweep(cbind(sparse_matrix, 1), 2, coefs, FUN = "*")
117  expect_equal(as.numeric(pred_contr), as.numeric(pred_contr_manual),
118               tolerance = float_tolerance)
119
120  # gbtree multiclass
121  pred <- predict(mbst.Tree, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
122  pred_contr <- predict(mbst.Tree, as.matrix(iris[, -5]), predcontrib = TRUE)
123  expect_is(pred_contr, "list")
124  expect_length(pred_contr, 3)
125  for (g in seq_along(pred_contr)) {
126    expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
127    expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 1e-5)
128  }
129
130  # gblinear multiclass (set base_score = 0, which is base margin in multiclass)
131  pred <- predict(mbst.GLM, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
132  pred_contr <- predict(mbst.GLM, as.matrix(iris[, -5]), predcontrib = TRUE)
133  expect_length(pred_contr, 3)
134  coefs_all <- matrix(
135    data = as.numeric(xgb.dump(mbst.GLM)[-c(1, 2, 6)]),
136    ncol = 3,
137    byrow = TRUE
138  )
139  for (g in seq_along(pred_contr)) {
140    expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
141    expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), float_tolerance)
142    # manual calculation of linear terms
143    coefs <- c(coefs_all[-1, g], coefs_all[1, g]) # intercept needs to be the last
144    pred_contr_manual <- sweep(as.matrix(cbind(iris[, -5], 1)), 2, coefs, FUN = "*")
145    expect_equal(as.numeric(pred_contr[[g]]), as.numeric(pred_contr_manual),
146                 tolerance = float_tolerance)
147  }
148})
149
150test_that("SHAPs sum to predictions, with or without DART", {
151  d <- cbind(
152    x1 = rnorm(100),
153    x2 = rnorm(100),
154    x3 = rnorm(100))
155  y <- d[, "x1"] + d[, "x2"]^2 +
156    ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) +
157    rnorm(100)
158  nrounds <- 30
159
160  for (booster in list("gbtree", "dart")) {
161    fit <- xgboost(
162      params = c(
163        list(
164          booster = booster,
165          objective = "reg:squarederror",
166          eval_metric = "rmse"),
167        if (booster == "dart")
168          list(rate_drop = .01, one_drop = TRUE)),
169      data = d,
170      label = y,
171      nrounds = nrounds)
172
173    pr <- function(...)
174      predict(fit, newdata = d, ...)
175    pred <- pr()
176    shap <- pr(predcontrib = TRUE)
177    shapi <- pr(predinteraction = TRUE)
178    tol <- 1e-5
179
180    expect_equal(rowSums(shap), pred, tol = tol)
181    expect_equal(apply(shapi, 1, sum), pred, tol = tol)
182    for (i in seq_len(nrow(d)))
183      for (f in list(rowSums, colSums))
184        expect_equal(f(shapi[i, , ]), shap[i, ], tol = tol)
185  }
186})
187
188test_that("xgb-attribute functionality", {
189  val <- "my attribute value"
190  list.val <- list(my_attr = val, a = 123, b = 'ok')
191  list.ch <- list.val[order(names(list.val))]
192  list.ch <- lapply(list.ch, as.character)
193  # note: iter is 0-index in xgb attributes
194  list.default <- list(niter = as.character(nrounds - 1))
195  list.ch <- c(list.ch, list.default)
196  # proper input:
197  expect_error(xgb.attr(bst.Tree, NULL))
198  expect_error(xgb.attr(val, val))
199  # set & get:
200  expect_null(xgb.attr(bst.Tree, "asdf"))
201  expect_equal(xgb.attributes(bst.Tree), list.default)
202  xgb.attr(bst.Tree, "my_attr") <- val
203  expect_equal(xgb.attr(bst.Tree, "my_attr"), val)
204  xgb.attributes(bst.Tree) <- list.val
205  expect_equal(xgb.attributes(bst.Tree), list.ch)
206  # serializing:
207  xgb.save(bst.Tree, 'xgb.model')
208  bst <- xgb.load('xgb.model')
209  if (file.exists('xgb.model')) file.remove('xgb.model')
210  expect_equal(xgb.attr(bst, "my_attr"), val)
211  expect_equal(xgb.attributes(bst), list.ch)
212  # deletion:
213  xgb.attr(bst, "my_attr") <- NULL
214  expect_null(xgb.attr(bst, "my_attr"))
215  expect_equal(xgb.attributes(bst), list.ch[c("a", "b", "niter")])
216  xgb.attributes(bst) <- list(a = NULL, b = NULL)
217  expect_equal(xgb.attributes(bst), list.default)
218  xgb.attributes(bst) <- list(niter = NULL)
219  expect_null(xgb.attributes(bst))
220})
221
222if (grepl('Windows', Sys.info()[['sysname']]) ||
223    grepl('Linux', Sys.info()[['sysname']]) ||
224    grepl('Darwin', Sys.info()[['sysname']])) {
225    test_that("xgb-attribute numeric precision", {
226      # check that lossless conversion works with 17 digits
227      # numeric -> character -> numeric
228      X <- 10^runif(100, -20, 20)
229      if (capabilities('long.double')) {
230          X2X <- as.numeric(format(X, digits = 17))
231          expect_equal(X, X2X, tolerance = float_tolerance)
232      }
233      # retrieved attributes to be the same as written
234      for (x in X) {
235        xgb.attr(bst.Tree, "x") <- x
236        expect_equal(as.numeric(xgb.attr(bst.Tree, "x")), x, tolerance = float_tolerance)
237        xgb.attributes(bst.Tree) <- list(a = "A", b = x)
238        expect_equal(as.numeric(xgb.attr(bst.Tree, "b")), x, tolerance = float_tolerance)
239      }
240    })
241}
242
243test_that("xgb.Booster serializing as R object works", {
244  saveRDS(bst.Tree, 'xgb.model.rds')
245  bst <- readRDS('xgb.model.rds')
246  dtrain <- xgb.DMatrix(sparse_matrix, label = label)
247  expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance)
248  expect_equal(xgb.dump(bst.Tree), xgb.dump(bst))
249  xgb.save(bst, 'xgb.model')
250  if (file.exists('xgb.model')) file.remove('xgb.model')
251  bst <- readRDS('xgb.model.rds')
252  if (file.exists('xgb.model.rds')) file.remove('xgb.model.rds')
253  nil_ptr <- new("externalptr")
254  class(nil_ptr) <- "xgb.Booster.handle"
255  expect_true(identical(bst$handle, nil_ptr))
256  bst <- xgb.Booster.complete(bst)
257  expect_true(!identical(bst$handle, nil_ptr))
258  expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance)
259})
260
261test_that("xgb.model.dt.tree works with and without feature names", {
262  names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
263  dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
264  expect_equal(names.dt.trees, names(dt.tree))
265  if (!flag_32bit)
266    expect_equal(dim(dt.tree), c(188, 10))
267  expect_output(str(dt.tree), 'Feature.*\\"Age\\"')
268
269  dt.tree.0 <- xgb.model.dt.tree(model = bst.Tree)
270  expect_equal(dt.tree, dt.tree.0)
271
272  # when model contains no feature names:
273  bst.Tree.x <- bst.Tree
274  bst.Tree.x$feature_names <- NULL
275  dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x)
276  expect_output(str(dt.tree.x), 'Feature.*\\"3\\"')
277  expect_equal(dt.tree[, -4, with = FALSE], dt.tree.x[, -4, with = FALSE])
278
279  # using integer node ID instead of character
280  dt.tree.int <- xgb.model.dt.tree(model = bst.Tree, use_int_id = TRUE)
281  expect_equal(as.integer(tstrsplit(dt.tree$Yes, '-')[[2]]), dt.tree.int$Yes)
282  expect_equal(as.integer(tstrsplit(dt.tree$No, '-')[[2]]), dt.tree.int$No)
283  expect_equal(as.integer(tstrsplit(dt.tree$Missing, '-')[[2]]), dt.tree.int$Missing)
284})
285
286test_that("xgb.model.dt.tree throws error for gblinear", {
287  expect_error(xgb.model.dt.tree(model = bst.GLM))
288})
289
290test_that("xgb.importance works with and without feature names", {
291  importance.Tree <- xgb.importance(feature_names = feature.names, model = bst.Tree)
292  if (!flag_32bit)
293    expect_equal(dim(importance.Tree), c(7, 4))
294  expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
295  expect_output(str(importance.Tree), 'Feature.*\\"Age\\"')
296
297  importance.Tree.0 <- xgb.importance(model = bst.Tree)
298  expect_equal(importance.Tree, importance.Tree.0, tolerance = float_tolerance)
299
300  # when model contains no feature names:
301  bst.Tree.x <- bst.Tree
302  bst.Tree.x$feature_names <- NULL
303  importance.Tree.x <- xgb.importance(model = bst.Tree)
304  expect_equal(importance.Tree[, -1, with = FALSE], importance.Tree.x[, -1, with = FALSE],
305               tolerance = float_tolerance)
306
307  imp2plot <- xgb.plot.importance(importance_matrix = importance.Tree)
308  expect_equal(colnames(imp2plot), c("Feature", "Gain", "Cover", "Frequency", "Importance"))
309  xgb.ggplot.importance(importance_matrix = importance.Tree)
310
311  # for multiclass
312  imp.Tree <- xgb.importance(model = mbst.Tree)
313  expect_equal(dim(imp.Tree), c(4, 4))
314
315  trees <- seq(from = 0, by = 2, length.out = 2)
316  importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)
317
318  importance_from_dump <- function() {
319    model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
320    imp <- xgb.model.dt.tree(
321      feature_names = feature.names,
322      text = model_text_dump,
323      trees = trees
324    )[
325      Feature != "Leaf", .(
326        Gain = sum(Quality),
327        Cover = sum(Cover),
328        Frequency = .N
329      ),
330      by = Feature
331    ][
332      , `:=`(
333        Gain = Gain / sum(Gain),
334        Cover = Cover / sum(Cover),
335        Frequency = Frequency / sum(Frequency)
336      )
337    ][
338      order(Gain, decreasing = TRUE)
339    ]
340    imp
341  }
342  expect_equal(importance_from_dump(), importance, tolerance = 1e-6)
343})
344
345test_that("xgb.importance works with GLM model", {
346  importance.GLM <- xgb.importance(feature_names = feature.names, model = bst.GLM)
347  expect_equal(dim(importance.GLM), c(10, 2))
348  expect_equal(colnames(importance.GLM), c("Feature", "Weight"))
349  xgb.importance(model = bst.GLM)
350  imp2plot <- xgb.plot.importance(importance.GLM)
351  expect_equal(colnames(imp2plot), c("Feature", "Weight", "Importance"))
352  xgb.ggplot.importance(importance.GLM)
353
354  # for multiclass
355  imp.GLM <- xgb.importance(model = mbst.GLM)
356  expect_equal(dim(imp.GLM), c(12, 3))
357  expect_equal(imp.GLM$Class, rep(0:2, each = 4))
358})
359
360test_that("xgb.model.dt.tree and xgb.importance work with a single split model", {
361  bst1 <- xgboost(data = sparse_matrix, label = label, max_depth = 1,
362                  eta = 1, nthread = 2, nrounds = 1, verbose = 0,
363                  objective = "binary:logistic")
364  expect_error(dt <- xgb.model.dt.tree(model = bst1), regexp = NA) # no error
365  expect_equal(nrow(dt), 3)
366  expect_error(imp <- xgb.importance(model = bst1), regexp = NA) # no error
367  expect_equal(nrow(imp), 1)
368  expect_equal(imp$Gain, 1)
369})
370
371test_that("xgb.plot.tree works with and without feature names", {
372  expect_silent(xgb.plot.tree(feature_names = feature.names, model = bst.Tree))
373  expect_silent(xgb.plot.tree(model = bst.Tree))
374})
375
376test_that("xgb.plot.multi.trees works with and without feature names", {
377  xgb.plot.multi.trees(model = bst.Tree, feature_names = feature.names, features_keep = 3)
378  xgb.plot.multi.trees(model = bst.Tree, features_keep = 3)
379})
380
381test_that("xgb.plot.deepness works", {
382  d2p <- xgb.plot.deepness(model = bst.Tree)
383  expect_equal(colnames(d2p), c("ID", "Tree", "Depth", "Cover", "Weight"))
384  xgb.plot.deepness(model = bst.Tree, which = "med.depth")
385  xgb.ggplot.deepness(model = bst.Tree)
386})
387
388test_that("xgb.shap.data works when top_n is provided", {
389  data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2)
390  expect_equal(names(data_list), c("data", "shap_contrib"))
391  expect_equal(NCOL(data_list$data), 2)
392  expect_equal(NCOL(data_list$shap_contrib), 2)
393  expect_equal(NROW(data_list$data), NROW(data_list$shap_contrib))
394  expect_gt(length(colnames(data_list$data)), 0)
395  expect_gt(length(colnames(data_list$shap_contrib)), 0)
396
397  # for multiclass without target class provided
398  data_list <- xgb.shap.data(data = as.matrix(iris[, -5]), model = mbst.Tree, top_n = 2)
399  expect_equal(dim(data_list$shap_contrib), c(nrow(iris), 2))
400  # for multiclass with target class provided
401  data_list <- xgb.shap.data(data = as.matrix(iris[, -5]), model = mbst.Tree, top_n = 2, target_class = 0)
402  expect_equal(dim(data_list$shap_contrib), c(nrow(iris), 2))
403})
404
405test_that("xgb.shap.data works with subsampling", {
406  data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2, subsample = 0.8)
407  expect_equal(NROW(data_list$data), as.integer(0.8 * nrow(sparse_matrix)))
408  expect_equal(NROW(data_list$data), NROW(data_list$shap_contrib))
409})
410
411test_that("prepare.ggplot.shap.data works", {
412  data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2)
413  plot_data <- prepare.ggplot.shap.data(data_list, normalize = TRUE)
414  expect_s3_class(plot_data, "data.frame")
415  expect_equal(names(plot_data), c("id", "feature", "feature_value", "shap_value"))
416  expect_s3_class(plot_data$feature, "factor")
417  # Each observation should have 1 row for each feature
418  expect_equal(nrow(plot_data), nrow(sparse_matrix) * 2)
419})
420
421test_that("xgb.plot.shap works", {
422  sh <- xgb.plot.shap(data = sparse_matrix, model = bst.Tree, top_n = 2, col = 4)
423  expect_equal(names(sh), c("data", "shap_contrib"))
424})
425
426test_that("xgb.plot.shap.summary works", {
427  expect_silent(xgb.plot.shap.summary(data = sparse_matrix, model = bst.Tree, top_n = 2))
428  expect_silent(xgb.ggplot.shap.summary(data = sparse_matrix, model = bst.Tree, top_n = 2))
429})
430
431test_that("check.deprecation works", {
432  ttt <- function(a = NNULL, DUMMY=NULL, ...) {
433    check.deprecation(...)
434    as.list((environment()))
435  }
436  res <- ttt(a = 1, DUMMY = 2, z = 3)
437  expect_equal(res, list(a = 1, DUMMY = 2))
438  expect_warning(
439    res <- ttt(a = 1, dummy = 22, z = 3)
440  , "\'dummy\' is deprecated")
441  expect_equal(res, list(a = 1, DUMMY = 22))
442  expect_warning(
443    res <- ttt(a = 1, dumm = 22, z = 3)
444  , "\'dumm\' was partially matched to \'dummy\'")
445  expect_equal(res, list(a = 1, DUMMY = 22))
446})
447
448test_that('convert.labels works', {
449  y <- c(0, 1, 0, 0, 1)
450  for (objective in c('binary:logistic', 'binary:logitraw', 'binary:hinge')) {
451    res <- xgboost:::convert.labels(y, objective_name = objective)
452    expect_s3_class(res, 'factor')
453    expect_equal(res, factor(res))
454  }
455  y <- c(0, 1, 3, 2, 1, 4)
456  for (objective in c('multi:softmax', 'multi:softprob', 'rank:pairwise', 'rank:ndcg',
457                      'rank:map')) {
458    res <- xgboost:::convert.labels(y, objective_name = objective)
459    expect_s3_class(res, 'factor')
460    expect_equal(res, factor(res))
461  }
462  y <- c(1.2, 3.0, -1.0, 10.0)
463  for (objective in c('reg:squarederror', 'reg:squaredlogerror', 'reg:logistic',
464                      'reg:pseudohubererror', 'count:poisson', 'survival:cox', 'survival:aft',
465                      'reg:gamma', 'reg:tweedie')) {
466    res <- xgboost:::convert.labels(y, objective_name = objective)
467    expect_equal(class(res), 'numeric')
468  }
469})
470