# Tests for xgboost helper functions: model dumping, prediction modes
# (leaf / contribution / interaction), attributes, serialization,
# tree/importance introspection, and plotting helpers.
library(testthat)
context('Test helper functions')

# Mandatory deps: library() errors loudly if missing (require() would
# silently return FALSE and let tests fail with confusing messages).
library(xgboost)
library(data.table)
library(Matrix)
library(vcd, quietly = TRUE)

float_tolerance <- 5e-6

# disable some tests for 32-bit environment
flag_32bit <- .Machine$sizeof.pointer != 8

# ---- Shared fixtures: data and pre-trained models ----
set.seed(1982)
data(Arthritis)
df <- data.table(Arthritis, keep.rownames = FALSE)
df[, AgeDiscret := as.factor(round(Age / 10, 0))]
df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
df[, ID := NULL]
sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df) # nolint
label <- df[, ifelse(Improved == "Marked", 1, 0)]

# binary
nrounds <- 12
bst.Tree <- xgboost(data = sparse_matrix, label = label, max_depth = 9,
                    eta = 1, nthread = 2, nrounds = nrounds, verbose = 0,
                    objective = "binary:logistic", booster = "gbtree")

bst.GLM <- xgboost(data = sparse_matrix, label = label,
                   eta = 1, nthread = 1, nrounds = nrounds, verbose = 0,
                   objective = "binary:logistic", booster = "gblinear")

feature.names <- colnames(sparse_matrix)

# multiclass
mlabel <- as.numeric(iris$Species) - 1
nclass <- 3
mbst.Tree <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0,
                     max_depth = 3, eta = 0.5, nthread = 2, nrounds = nrounds,
                     objective = "multi:softprob", num_class = nclass, base_score = 0)

mbst.GLM <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0,
                    booster = "gblinear", eta = 0.1, nthread = 1, nrounds = nrounds,
                    objective = "multi:softprob", num_class = nclass, base_score = 0)


test_that("xgb.dump works", {
  if (!flag_32bit)
    expect_length(xgb.dump(bst.Tree), 200)
  dump_file <- file.path(tempdir(), 'xgb.model.dump')
  expect_true(xgb.dump(bst.Tree, dump_file, with_stats = TRUE))
  expect_true(file.exists(dump_file))
  expect_gt(file.size(dump_file), 8000)

  # JSON format
  dmp <- xgb.dump(bst.Tree, dump_format = "json")
  expect_length(dmp, 1)
  if (!flag_32bit)
    expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 188)
})

test_that("xgb.dump works for gblinear", {
  expect_length(xgb.dump(bst.GLM), 14)
  # also make sure that it works properly for a sparse model where some coefficients
  # are 0 from setting large L1 regularization:
  bst.GLM.sp <- xgboost(data = sparse_matrix, label = label, eta = 1, nthread = 2, nrounds = 1,
                        alpha = 2, objective = "binary:logistic", booster = "gblinear")
  d.sp <- xgb.dump(bst.GLM.sp)
  expect_length(d.sp, 14)
  expect_gt(sum(d.sp == "0"), 0)

  # JSON format
  dmp <- xgb.dump(bst.GLM.sp, dump_format = "json")
  expect_length(dmp, 1)
  expect_length(grep('\\d', strsplit(dmp, '\n')[[1]]), 11)
})

test_that("predict leafs works", {
  # no error for gbtree
  expect_error(pred_leaf <- predict(bst.Tree, sparse_matrix, predleaf = TRUE), regexp = NA)
  expect_equal(dim(pred_leaf), c(nrow(sparse_matrix), nrounds))
  # error for gblinear
  expect_error(predict(bst.GLM, sparse_matrix, predleaf = TRUE))
})

test_that("predict feature contributions works", {
  # gbtree binary classifier
  expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE), regexp = NA)
  expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
  expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
  pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
  # SHAP values must sum (with bias) to the margin prediction
  expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
  # must work with data that has no column names
  X <- sparse_matrix
  colnames(X) <- NULL
  expect_error(pred_contr_ <- predict(bst.Tree, X, predcontrib = TRUE), regexp = NA)
  expect_equal(pred_contr, pred_contr_, check.attributes = FALSE,
               tolerance = float_tolerance)

  # gbtree binary classifier (approximate method)
  expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE, approxcontrib = TRUE), regexp = NA)
  expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
  expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
  pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
  expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)

  # gblinear binary classifier
  expect_error(pred_contr <- predict(bst.GLM, sparse_matrix, predcontrib = TRUE), regexp = NA)
  expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
  expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
  pred <- predict(bst.GLM, sparse_matrix, outputmargin = TRUE)
  expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
  # manual calculation of linear terms
  coefs <- as.numeric(xgb.dump(bst.GLM)[-c(1, 2, 4)])
  coefs <- c(coefs[-1], coefs[1]) # intercept must be the last
  pred_contr_manual <- sweep(cbind(sparse_matrix, 1), 2, coefs, FUN = "*")
  expect_equal(as.numeric(pred_contr), as.numeric(pred_contr_manual),
               tolerance = float_tolerance)

  # gbtree multiclass
  pred <- predict(mbst.Tree, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
  pred_contr <- predict(mbst.Tree, as.matrix(iris[, -5]), predcontrib = TRUE)
  expect_is(pred_contr, "list")
  expect_length(pred_contr, 3)
  for (g in seq_along(pred_contr)) {
    expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
    expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 1e-5)
  }

  # gblinear multiclass (set base_score = 0, which is base margin in multiclass)
  pred <- predict(mbst.GLM, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
  pred_contr <- predict(mbst.GLM, as.matrix(iris[, -5]), predcontrib = TRUE)
  expect_length(pred_contr, 3)
  coefs_all <- matrix(
    data = as.numeric(xgb.dump(mbst.GLM)[-c(1, 2, 6)]),
    ncol = 3,
    byrow = TRUE
  )
  for (g in seq_along(pred_contr)) {
    expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
    expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])),
              float_tolerance)
    # manual calculation of linear terms
    coefs <- c(coefs_all[-1, g], coefs_all[1, g]) # intercept needs to be the last
    pred_contr_manual <- sweep(as.matrix(cbind(iris[, -5], 1)), 2, coefs, FUN = "*")
    expect_equal(as.numeric(pred_contr[[g]]), as.numeric(pred_contr_manual),
                 tolerance = float_tolerance)
  }
})

test_that("SHAPs sum to predictions, with or without DART", {
  d <- cbind(
    x1 = rnorm(100),
    x2 = rnorm(100),
    x3 = rnorm(100))
  y <- d[, "x1"] + d[, "x2"]^2 +
    ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) +
    rnorm(100)
  nrounds <- 30

  for (booster in list("gbtree", "dart")) {
    fit <- xgboost(
      params = c(
        list(
          booster = booster,
          objective = "reg:squarederror",
          eval_metric = "rmse"),
        if (booster == "dart")
          list(rate_drop = .01, one_drop = TRUE)),
      data = d,
      label = y,
      nrounds = nrounds)

    pr <- function(...)
      predict(fit, newdata = d, ...)
    pred <- pr()
    shap <- pr(predcontrib = TRUE)
    shapi <- pr(predinteraction = TRUE)
    tol <- 1e-5

    # NOTE(review): spelled out 'tolerance' — the previous partially-matched
    # 'tol' would fall into '...' under testthat 3e and be silently ignored.
    expect_equal(rowSums(shap), pred, tolerance = tol)
    expect_equal(apply(shapi, 1, sum), pred, tolerance = tol)
    # interaction values must marginalize to the per-feature SHAP values
    for (i in seq_len(nrow(d)))
      for (f in list(rowSums, colSums))
        expect_equal(f(shapi[i, , ]), shap[i, ], tolerance = tol)
  }
})

test_that("xgb-attribute functionality", {
  val <- "my attribute value"
  list.val <- list(my_attr = val, a = 123, b = 'ok')
  list.ch <- list.val[order(names(list.val))]
  list.ch <- lapply(list.ch, as.character)
  # note: iter is 0-index in xgb attributes
  list.default <- list(niter = as.character(nrounds - 1))
  list.ch <- c(list.ch, list.default)
  # proper input:
  expect_error(xgb.attr(bst.Tree, NULL))
  expect_error(xgb.attr(val, val))
  # set & get:
  expect_null(xgb.attr(bst.Tree, "asdf"))
  expect_equal(xgb.attributes(bst.Tree), list.default)
  xgb.attr(bst.Tree, "my_attr") <- val
  expect_equal(xgb.attr(bst.Tree, "my_attr"), val)
  xgb.attributes(bst.Tree) <- list.val
  expect_equal(xgb.attributes(bst.Tree), list.ch)
  # serializing (to tempdir, so the working directory stays clean):
  model_file <- file.path(tempdir(), 'xgb.model')
  xgb.save(bst.Tree, model_file)
  bst <- xgb.load(model_file)
  if (file.exists(model_file)) file.remove(model_file)
  expect_equal(xgb.attr(bst, "my_attr"), val)
  expect_equal(xgb.attributes(bst), list.ch)
  # deletion:
  xgb.attr(bst, "my_attr") <- NULL
  expect_null(xgb.attr(bst, "my_attr"))
  expect_equal(xgb.attributes(bst), list.ch[c("a", "b", "niter")])
  xgb.attributes(bst) <- list(a = NULL, b = NULL)
  expect_equal(xgb.attributes(bst), list.default)
  xgb.attributes(bst) <- list(niter = NULL)
  expect_null(xgb.attributes(bst))
})

if (grepl('Windows', Sys.info()[['sysname']]) ||
    grepl('Linux', Sys.info()[['sysname']]) ||
    grepl('Darwin', Sys.info()[['sysname']])) {
  test_that("xgb-attribute numeric precision", {
    # check that lossless conversion works with 17 digits
    # numeric -> character -> numeric
    X <- 10^runif(100, -20, 20)
    if (capabilities('long.double')) {
      X2X <- as.numeric(format(X, digits = 17))
      expect_equal(X, X2X, tolerance = float_tolerance)
    }
    # retrieved attributes to be the same as written
    for (x in X) {
      xgb.attr(bst.Tree, "x") <- x
      expect_equal(as.numeric(xgb.attr(bst.Tree, "x")), x, tolerance = float_tolerance)
      xgb.attributes(bst.Tree) <- list(a = "A", b = x)
      expect_equal(as.numeric(xgb.attr(bst.Tree, "b")), x, tolerance = float_tolerance)
    }
  })
}

test_that("xgb.Booster serializing as R object works", {
  # write model files to tempdir so the working directory stays clean
  rds_file <- file.path(tempdir(), 'xgb.model.rds')
  saveRDS(bst.Tree, rds_file)
  bst <- readRDS(rds_file)
  dtrain <- xgb.DMatrix(sparse_matrix, label = label)
  expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance)
  expect_equal(xgb.dump(bst.Tree), xgb.dump(bst))
  model_file <- file.path(tempdir(), 'xgb.model')
  xgb.save(bst, model_file)
  if (file.exists(model_file)) file.remove(model_file)
  bst <- readRDS(rds_file)
  if (file.exists(rds_file)) file.remove(rds_file)
  # a freshly-deserialized booster has a nil handle until completed
  nil_ptr <- new("externalptr")
  class(nil_ptr) <- "xgb.Booster.handle"
  expect_true(identical(bst$handle, nil_ptr))
  bst <- xgb.Booster.complete(bst)
  expect_true(!identical(bst$handle, nil_ptr))
  expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance)
})

test_that("xgb.model.dt.tree works with and without feature names", {
  names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
  dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
  expect_equal(names.dt.trees, names(dt.tree))
  if (!flag_32bit)
    expect_equal(dim(dt.tree), c(188, 10))
  expect_output(str(dt.tree), 'Feature.*\\"Age\\"')

  dt.tree.0 <- xgb.model.dt.tree(model = bst.Tree)
  expect_equal(dt.tree, dt.tree.0)

  # when model contains no feature names:
  bst.Tree.x <- bst.Tree
  bst.Tree.x$feature_names <- NULL
  dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x)
  expect_output(str(dt.tree.x), 'Feature.*\\"3\\"')
  expect_equal(dt.tree[, -4, with = FALSE], dt.tree.x[, -4, with = FALSE])

  # using integer node ID instead of character
  dt.tree.int <- xgb.model.dt.tree(model = bst.Tree, use_int_id = TRUE)
  expect_equal(as.integer(tstrsplit(dt.tree$Yes, '-')[[2]]), dt.tree.int$Yes)
  expect_equal(as.integer(tstrsplit(dt.tree$No, '-')[[2]]), dt.tree.int$No)
  expect_equal(as.integer(tstrsplit(dt.tree$Missing, '-')[[2]]), dt.tree.int$Missing)
})

test_that("xgb.model.dt.tree throws error for gblinear", {
  expect_error(xgb.model.dt.tree(model = bst.GLM))
})

test_that("xgb.importance works with and without feature names", {
  importance.Tree <- xgb.importance(feature_names = feature.names, model = bst.Tree)
  if (!flag_32bit)
    expect_equal(dim(importance.Tree), c(7, 4))
  expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
  expect_output(str(importance.Tree), 'Feature.*\\"Age\\"')

  importance.Tree.0 <- xgb.importance(model = bst.Tree)
  expect_equal(importance.Tree, importance.Tree.0, tolerance = float_tolerance)

  # when model contains no feature names:
  bst.Tree.x <- bst.Tree
  bst.Tree.x$feature_names <- NULL
  # FIX: use the stripped model (bst.Tree.x); passing bst.Tree here never
  # exercised the no-feature-names code path this branch is meant to test.
  importance.Tree.x <- xgb.importance(model = bst.Tree.x)
  expect_equal(importance.Tree[, -1, with = FALSE], importance.Tree.x[, -1, with = FALSE],
               tolerance = float_tolerance)

  imp2plot <- xgb.plot.importance(importance_matrix = importance.Tree)
  expect_equal(colnames(imp2plot), c("Feature", "Gain", "Cover", "Frequency", "Importance"))
  xgb.ggplot.importance(importance_matrix = importance.Tree)

  # for multiclass
  imp.Tree <- xgb.importance(model = mbst.Tree)
  expect_equal(dim(imp.Tree), c(4, 4))

  trees <- seq(from = 0, by = 2, length.out = 2)
  importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)

  # cross-check importance against statistics aggregated from a text dump
  importance_from_dump <- function() {
    model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
    imp <- xgb.model.dt.tree(
      feature_names = feature.names,
      text = model_text_dump,
      trees = trees
    )[
      Feature != "Leaf", .(
        Gain = sum(Quality),
        Cover = sum(Cover),
        Frequency = .N
      ),
      by = Feature
    ][
      , `:=`(
        Gain = Gain / sum(Gain),
        Cover = Cover / sum(Cover),
        Frequency = Frequency / sum(Frequency)
      )
    ][
      order(Gain, decreasing = TRUE)
    ]
    imp
  }
  expect_equal(importance_from_dump(), importance, tolerance = 1e-6)
})

test_that("xgb.importance works with GLM model", {
  importance.GLM <- xgb.importance(feature_names = feature.names, model = bst.GLM)
  expect_equal(dim(importance.GLM), c(10, 2))
  expect_equal(colnames(importance.GLM), c("Feature", "Weight"))
  xgb.importance(model = bst.GLM)
  imp2plot <- xgb.plot.importance(importance.GLM)
  expect_equal(colnames(imp2plot), c("Feature", "Weight", "Importance"))
  xgb.ggplot.importance(importance.GLM)

  # for multiclass
  imp.GLM <- xgb.importance(model = mbst.GLM)
  expect_equal(dim(imp.GLM), c(12, 3))
  expect_equal(imp.GLM$Class, rep(0:2, each = 4))
})

test_that("xgb.model.dt.tree and xgb.importance work with a single split model", {
  bst1 <- xgboost(data = sparse_matrix, label = label, max_depth = 1,
                  eta = 1, nthread = 2, nrounds = 1, verbose = 0,
                  objective = "binary:logistic")
  expect_error(dt <- xgb.model.dt.tree(model = bst1), regexp = NA) # no error
  expect_equal(nrow(dt), 3)
  expect_error(imp <- xgb.importance(model = bst1), regexp = NA) # no error
  expect_equal(nrow(imp), 1)
  expect_equal(imp$Gain, 1)
})

test_that("xgb.plot.tree works with and without feature names", {
  expect_silent(xgb.plot.tree(feature_names = feature.names, model = bst.Tree))
  expect_silent(xgb.plot.tree(model = bst.Tree))
})

test_that("xgb.plot.multi.trees works with and without feature names", {
  xgb.plot.multi.trees(model = bst.Tree, feature_names = feature.names, features_keep = 3)
  xgb.plot.multi.trees(model = bst.Tree, features_keep = 3)
})

test_that("xgb.plot.deepness works", {
  d2p <- xgb.plot.deepness(model = bst.Tree)
  expect_equal(colnames(d2p), c("ID", "Tree", "Depth", "Cover", "Weight"))
  xgb.plot.deepness(model = bst.Tree, which = "med.depth")
  xgb.ggplot.deepness(model = bst.Tree)
})

test_that("xgb.shap.data works when top_n is provided", {
  data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2)
  expect_equal(names(data_list), c("data", "shap_contrib"))
  expect_equal(NCOL(data_list$data), 2)
  expect_equal(NCOL(data_list$shap_contrib), 2)
  expect_equal(NROW(data_list$data), NROW(data_list$shap_contrib))
  expect_gt(length(colnames(data_list$data)), 0)
  expect_gt(length(colnames(data_list$shap_contrib)), 0)

  # for multiclass without target class provided
  data_list <- xgb.shap.data(data = as.matrix(iris[, -5]), model = mbst.Tree, top_n = 2)
  expect_equal(dim(data_list$shap_contrib), c(nrow(iris), 2))
  # for multiclass with target class provided
  data_list <- xgb.shap.data(data = as.matrix(iris[, -5]), model = mbst.Tree, top_n = 2, target_class = 0)
  expect_equal(dim(data_list$shap_contrib), c(nrow(iris), 2))
})

test_that("xgb.shap.data works with subsampling", {
  data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2, subsample = 0.8)
  expect_equal(NROW(data_list$data), as.integer(0.8 * nrow(sparse_matrix)))
  expect_equal(NROW(data_list$data), NROW(data_list$shap_contrib))
})

test_that("prepare.ggplot.shap.data works", {
  data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2)
  plot_data <- prepare.ggplot.shap.data(data_list, normalize = TRUE)
  expect_s3_class(plot_data, "data.frame")
  expect_equal(names(plot_data), c("id", "feature", "feature_value", "shap_value"))
  expect_s3_class(plot_data$feature, "factor")
  # Each observation should have 1 row for each feature
  expect_equal(nrow(plot_data), nrow(sparse_matrix) * 2)
})

test_that("xgb.plot.shap works", {
  sh <- xgb.plot.shap(data = sparse_matrix, model = bst.Tree, top_n = 2, col = 4)
  expect_equal(names(sh), c("data", "shap_contrib"))
})

test_that("xgb.plot.shap.summary works", {
  expect_silent(xgb.plot.shap.summary(data = sparse_matrix, model = bst.Tree, top_n = 2))
  expect_silent(xgb.ggplot.shap.summary(data = sparse_matrix, model = bst.Tree, top_n = 2))
})

test_that("check.deprecation works", {
  # FIX: default was the typo 'NNULL'; harmless only because lazy evaluation
  # never forced it ('a' is always supplied below), but evaluating the
  # default would have errored with "object 'NNULL' not found".
  ttt <- function(a = NULL, DUMMY = NULL, ...) {
    check.deprecation(...)
    as.list((environment()))
  }
  res <- ttt(a = 1, DUMMY = 2, z = 3)
  expect_equal(res, list(a = 1, DUMMY = 2))
  expect_warning(
    res <- ttt(a = 1, dummy = 22, z = 3)
    , "\'dummy\' is deprecated")
  expect_equal(res, list(a = 1, DUMMY = 22))
  expect_warning(
    res <- ttt(a = 1, dumm = 22, z = 3)
    , "\'dumm\' was partially matched to \'dummy\'")
  expect_equal(res, list(a = 1, DUMMY = 22))
})

test_that('convert.labels works', {
  y <- c(0, 1, 0, 0, 1)
  for (objective in c('binary:logistic', 'binary:logitraw', 'binary:hinge')) {
    res <- xgboost:::convert.labels(y, objective_name = objective)
    expect_s3_class(res, 'factor')
    expect_equal(res, factor(res))
  }
  y <- c(0, 1, 3, 2, 1, 4)
  for (objective in c('multi:softmax', 'multi:softprob', 'rank:pairwise', 'rank:ndcg',
                      'rank:map')) {
    res <- xgboost:::convert.labels(y, objective_name = objective)
    expect_s3_class(res, 'factor')
    expect_equal(res, factor(res))
  }
  y <- c(1.2, 3.0, -1.0, 10.0)
  for (objective in c('reg:squarederror', 'reg:squaredlogerror', 'reg:logistic',
                      'reg:pseudohubererror', 'count:poisson', 'survival:cox', 'survival:aft',
                      'reg:gamma', 'reg:tweedie')) {
    res <- xgboost:::convert.labels(y, objective_name = objective)
    expect_equal(class(res), 'numeric')
  }
})