# Basic walkthrough: train xgboost on the agaricus (mushroom) data.
# library() (not require()) so a missing dependency fails loudly up front.
library(xgboost)
library(methods)

# we load in the agaricus dataset
# In this example, we are aiming to predict whether a mushroom is edible
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
train <- agaricus.train
test <- agaricus.test
# the loaded data is stored in a sparseMatrix, and label is a numeric vector in {0,1}
class(train$label)
class(train$data)

#-------------Basic Training using XGBoost-----------------
# this is the basic usage of xgboost: you can put a matrix in the data field
# note: we are putting in a sparse matrix here, xgboost naturally handles sparse input
# use a sparse matrix when your feature is sparse (e.g. when you are using one-hot encoding)
print("Training xgboost with sparseMatrix")
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1, nrounds = 2,
               nthread = 2, objective = "binary:logistic")

# alternatively, you can put in a dense matrix, i.e. a basic R matrix
print("Training xgboost with Matrix")
bst <- xgboost(data = as.matrix(train$data), label = train$label, max_depth = 2, eta = 1,
               nrounds = 2, nthread = 2, objective = "binary:logistic")

# you can also put in an xgb.DMatrix object, which stores label, data and other
# meta data needed for advanced features
print("Training xgboost with xgb.DMatrix")
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, nthread = 2,
               objective = "binary:logistic")

# Verbose = 0,1,2
print("Train xgboost with verbose 0, no message")
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
               nthread = 2, objective = "binary:logistic", verbose = 0)
print("Train xgboost with verbose 1, print evaluation metric")
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
               nthread = 2, objective = "binary:logistic", verbose = 1)
print("Train xgboost with verbose 2, also print information about tree")
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
               nthread = 2, objective = "binary:logistic", verbose = 2)

# you can also specify data as a file path to a LIBSVM format input
# since we do not have this file with us, the following line is just for illustration
# bst <- xgboost(data = 'agaricus.train.svm', max_depth = 2, eta = 1, nrounds = 2,
#                objective = "binary:logistic")

#--------------------basic prediction using xgboost--------------
# you can do prediction using the following line
# you can put in Matrix, sparseMatrix, or xgb.DMatrix
pred <- predict(bst, test$data)
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err))

#-------------------save and load models-------------------------
# write demo artifacts under tempdir() so the script does not litter the
# working directory (consistent with the tree-dump path further below)
model_path <- file.path(tempdir(), "xgboost.model")
# save model to a binary local file
xgb.save(bst, model_path)
# load the binary model back into R
bst2 <- xgb.load(model_path)
pred2 <- predict(bst2, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred))))

# save model to R's raw vector
raw <- xgb.save.raw(bst)
# load the raw-vector model back into R
bst3 <- xgb.load(raw)
pred3 <- predict(bst3, test$data)
# pred3 should be identical to pred
print(paste("sum(abs(pred3-pred))=", sum(abs(pred3 - pred))))

#----------------Advanced features --------------
# to use advanced features, we need to put data in xgb.DMatrix
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
dtest <- xgb.DMatrix(data = test$data, label = test$label)

#---------------Using watchlist----------------
# watchlist is a list of xgb.DMatrix, each of them tagged with a name;
# it allows us to monitor the evaluation result on all data in the list
watchlist <- list(train = dtrain, test = dtest)
# to train with a watchlist, use xgb.train, which has more advanced features
print("Train xgboost using xgb.train with watchlist")
bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
                 watchlist = watchlist, nthread = 2,
                 objective = "binary:logistic")
# we can change evaluation metrics, or use multiple evaluation metrics
print("train xgboost using xgb.train with watchlist, watch logloss and error")
bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
                 watchlist = watchlist,
                 eval_metric = "error", eval_metric = "logloss",
                 nthread = 2, objective = "binary:logistic")

# an xgb.DMatrix can also be saved using xgb.DMatrix.save
dmatrix_path <- file.path(tempdir(), "dtrain.buffer")
xgb.DMatrix.save(dtrain, dmatrix_path)
# to load it back in, simply call xgb.DMatrix on the path
dtrain2 <- xgb.DMatrix(dmatrix_path)
bst <- xgb.train(data = dtrain2, max_depth = 2, eta = 1, nrounds = 2,
                 watchlist = watchlist, nthread = 2,
                 objective = "binary:logistic")

# information can be extracted from an xgb.DMatrix using getinfo
label <- getinfo(dtest, "label")
pred <- predict(bst, dtest)
# classification error = fraction of mispredicted labels at the 0.5 cutoff
# (same vectorized mean() idiom as the earlier test-error computation)
err <- mean(as.numeric(pred > 0.5) != label)
print(paste("test-error=", err))

# You can dump the tree you learned using xgb.dump into a text file
dump_path <- file.path(tempdir(), 'dump.raw.txt')
xgb.dump(bst, dump_path, with_stats = TRUE)

# Finally, you can check which features are the most important.
print("Most important features (look at column Gain):")
imp_matrix <- xgb.importance(feature_names = colnames(train$data), model = bst)
print(imp_matrix)

# Feature importance bar plot by gain
print("Feature importance Plot : ")
print(xgb.plot.importance(importance_matrix = imp_matrix))