require(xgboost)
require(methods)

# we load in the agaricus dataset
# In this example, we are aiming to predict whether a mushroom is edible
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
train <- agaricus.train
test <- agaricus.test
# the loaded data is stored in a sparse Matrix (dgCMatrix), and the label is a numeric vector in {0,1}
class(train$label)
class(train$data)
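
# (optional) a quick sanity check on the loaded objects, using base R / Matrix only:
# one row per mushroom, one column per one-hot encoded feature
dim(train$data)
length(train$label)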

#-------------Basic Training using XGBoost-----------------
# this is the basic usage of xgboost: you can put a matrix in the data field
# note: we are putting in a sparse matrix here, xgboost naturally handles sparse input
# use a sparse matrix when your features are sparse (e.g. when you are using one-hot encoded vectors)
print("Training xgboost with sparseMatrix")
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1, nrounds = 2,
               nthread = 2, objective = "binary:logistic")
# alternatively, you can put in a dense matrix, i.e. a basic R matrix
print("Training xgboost with Matrix")
bst <- xgboost(data = as.matrix(train$data), label = train$label, max_depth = 2, eta = 1, nrounds = 2,
               nthread = 2, objective = "binary:logistic")

# you can also put in an xgb.DMatrix object, which stores label, data and other metadata needed for advanced features
print("Training xgboost with xgb.DMatrix")
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, nthread = 2,
               objective = "binary:logistic")

# Verbose = 0,1,2
print("Train xgboost with verbose 0, no message")
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
               nthread = 2, objective = "binary:logistic", verbose = 0)
print("Train xgboost with verbose 1, print evaluation metric")
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
               nthread = 2, objective = "binary:logistic", verbose = 1)
print("Train xgboost with verbose 2, also print information about trees")
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
               nthread = 2, objective = "binary:logistic", verbose = 2)
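
# (optional sketch) when training for many rounds with verbose > 0, the per-round log gets
# noisy; print_every_n thins it out (see ?xgb.train); bst_thin is just an illustrative name
bst_thin <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 10,
                    nthread = 2, objective = "binary:logistic", verbose = 1, print_every_n = 5)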

# you can also specify data as a file path to a LIBSVM format input
# since we do not have this file with us, the following line is just for illustration
# bst <- xgboost(data = 'agaricus.train.svm', max_depth = 2, eta = 1, nrounds = 2, objective = "binary:logistic")

#--------------------basic prediction using xgboost--------------
# you can do prediction using the following line
# you can put in Matrix, sparseMatrix, or xgb.DMatrix
pred <- predict(bst, test$data)
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err))
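
# the same prediction through an xgb.DMatrix, to illustrate the point above;
# the result should match the prediction on the raw sparse matrix
pred_dm <- predict(bst, xgb.DMatrix(test$data))
print(paste("max(abs(pred_dm-pred))=", max(abs(pred_dm - pred))))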

#-------------------save and load models-------------------------
# save model to binary local file
xgb.save(bst, "xgboost.model")
# load binary model to R
bst2 <- xgb.load("xgboost.model")
pred2 <- predict(bst2, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred))))

# save model to R's raw vector
raw <- xgb.save.raw(bst)
# load the model back from the raw vector
# (newer xgboost versions also provide xgb.load.raw for this)
bst3 <- xgb.load(raw)
pred3 <- predict(bst3, test$data)
# pred3 should be identical to pred
print(paste("sum(abs(pred3-pred))=", sum(abs(pred3 - pred))))

#----------------Advanced features --------------
# to use advanced features, we need to put data in an xgb.DMatrix
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
dtest <- xgb.DMatrix(data = test$data, label = test$label)
#---------------Using watchlist----------------
# a watchlist is a named list of xgb.DMatrix objects
watchlist <- list(train = dtrain, test = dtest)
# to train with a watchlist, use xgb.train, which offers more advanced features
# the watchlist allows us to monitor the evaluation result on all data in the list
print("Train xgboost using xgb.train with watchlist")
bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist,
                 nthread = 2, objective = "binary:logistic")
# we can change evaluation metrics, or use multiple evaluation metrics
print("train xgboost using xgb.train with watchlist, watch logloss and error")
bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist,
                 eval_metric = "error", eval_metric = "logloss",
                 nthread = 2, objective = "binary:logistic")
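
# (optional sketch) a watchlist also enables early stopping: with early_stopping_rounds set,
# training stops once the last metric on the last watchlist entry has not improved for that
# many rounds (see ?xgb.train); nrounds = 20 is just an illustrative upper bound
bst_es <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 20, watchlist = watchlist,
                    early_stopping_rounds = 3,
                    nthread = 2, objective = "binary:logistic")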

# an xgb.DMatrix can also be saved using xgb.DMatrix.save
xgb.DMatrix.save(dtrain, "dtrain.buffer")
# to load it back in, simply call xgb.DMatrix on the file path
dtrain2 <- xgb.DMatrix("dtrain.buffer")
bst <- xgb.train(data = dtrain2, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist,
                 nthread = 2, objective = "binary:logistic")
# information can be extracted from an xgb.DMatrix using getinfo
label <- getinfo(dtest, "label")
pred <- predict(bst, dtest)
err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label)
print(paste("test-error=", err))
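
# the counterpart of getinfo is setinfo; as a small sketch, attach the label to a
# freshly built test matrix (dtest2 is just an illustrative name)
dtest2 <- xgb.DMatrix(test$data)
setinfo(dtest2, "label", test$label)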

# You can dump the trees you learned using xgb.dump into a text file
dump_path <- file.path(tempdir(), 'dump.raw.txt')
xgb.dump(bst, dump_path, with_stats = TRUE)
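
# when no file name is given, xgb.dump returns the dump as a character vector,
# which is handy for a quick look at the first trees
head(xgb.dump(bst, with_stats = TRUE))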

# Finally, you can check which features are the most important.
print("Most important features (look at column Gain):")
imp_matrix <- xgb.importance(feature_names = colnames(train$data), model = bst)
print(imp_matrix)

# Feature importance bar plot by gain
print("Feature importance plot:")
print(xgb.plot.importance(importance_matrix = imp_matrix))
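
# (optional cleanup, base R only) remove the local files created by this walkthrough
file.remove("xgboost.model")
file.remove("dtrain.buffer")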