import numpy as np
import scipy.sparse
import pickle
import xgboost as xgb
import os

# Make sure the demo knows where to load the data.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
XGBOOST_ROOT_DIR = os.path.dirname(os.path.dirname(CURRENT_DIR))
DEMO_DIR = os.path.join(XGBOOST_ROOT_DIR, 'demo')

# Simple example.
# Load data from a LIBSVM text file; a binary buffer generated by xgboost works too.
dtrain = xgb.DMatrix(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.train?indexing_mode=1'))
dtest = xgb.DMatrix(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.test?indexing_mode=1'))
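
# As an aside, DMatrix can also be built from other text formats via URI
# parameters, e.g. CSV with the label in the first column. A sketch only:
# the 'train.csv' file name is hypothetical and not part of this demo's data.
# dcsv = xgb.DMatrix('train.csv?format=csv&label_column=0')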

# specify parameters via dict; the definitions are the same as in the C++ version
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
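
# A minimal sketch of extending the parameters: extra entries such as an
# evaluation metric can be added to the same dict (multiple metrics go in a
# list). Left commented out so the demo's default behaviour is unchanged.
# param['eval_metric'] = ['auc', 'logloss']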

# specify the validation sets to watch performance on
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)
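
# A sketch of training with early stopping: xgb.train stops when the metric on
# the last entry of the watchlist has not improved for the given number of
# rounds. Commented out because it would replace the booster trained above.
# bst = xgb.train(param, dtrain, num_boost_round=50, evals=watchlist,
#                 early_stopping_rounds=5)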

# run prediction on the test set and compute the classification error
preds = bst.predict(dtest)
labels = dtest.get_label()
print('error=%f' %
      (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) /
       float(len(preds))))
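
# The same error rate computed with vectorized numpy operations; a small
# cross-check of the loop-based value printed above.
err_vectorized = np.sum((preds > 0.5).astype(int) != labels) / len(preds)
print('error (vectorized)=%f' % err_vectorized)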
bst.save_model('0001.model')
# dump model to a text file
bst.dump_model('dump.raw.txt')
# dump model with a feature map for readable feature names
bst.dump_model('dump.nice.txt', os.path.join(DEMO_DIR, 'data/featmap.txt'))
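
# A small aside: per-feature split counts can also be queried directly from
# the booster instead of parsing the text dump.
scores = bst.get_score(importance_type='weight')
print('feature scores: %s' % scores)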

# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
# save model
bst.save_model('xgb.model')
# load model and data back in
bst2 = xgb.Booster(model_file='xgb.model')
dtest2 = xgb.DMatrix('dtest.buffer')
preds2 = bst2.predict(dtest2)
# assert the predictions are the same
assert np.sum(np.abs(preds2 - preds)) == 0

# alternatively, the booster can be pickled
pks = pickle.dumps(bst2)
# load the booster back from the pickled bytes
bst3 = pickle.loads(pks)
preds3 = bst3.predict(dtest2)
# assert the predictions are the same
assert np.sum(np.abs(preds3 - preds)) == 0
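
# With a sufficiently recent XGBoost, the model can also be saved in JSON
# format simply by using a '.json' file extension; a sketch, commented out to
# keep this demo's output files unchanged.
# bst.save_model('xgb.json')
# bst4 = xgb.Booster(model_file='xgb.json')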

###
# build DMatrix from scipy.sparse
print('start running example of building DMatrix from scipy.sparse CSR matrix')
labels = []
row = []
col = []
dat = []
# parse the LIBSVM text format by hand: one "label index:value ..." line per row
with open(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.train')) as f:
    for i, line in enumerate(f):
        arr = line.split()
        labels.append(int(arr[0]))
        for it in arr[1:]:
            k, v = it.split(':')
            row.append(i)
            col.append(int(k))
            dat.append(float(v))
csr = scipy.sparse.csr_matrix((dat, (row, col)))
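# Note: the shape above is inferred from the largest row/column index seen in
# the data. If the trailing feature columns happened to be all zero, passing
# an explicit shape would keep the feature count consistent with dtest; the
# n_features value below is a placeholder for this sketch.
# csr = scipy.sparse.csr_matrix((dat, (row, col)), shape=(len(labels), n_features))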
dtrain = xgb.DMatrix(csr, label=labels)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, watchlist)

print('start running example of building DMatrix from scipy.sparse CSC matrix')
# a DMatrix can also be constructed from a CSC matrix
csc = scipy.sparse.csc_matrix((dat, (row, col)))
dtrain = xgb.DMatrix(csc, label=labels)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, watchlist)

print('start running example of building DMatrix from numpy array')
# NOTE: npymat is a dense numpy array; internally it is converted to a
# scipy.sparse.csr_matrix and then to a DMatrix
npymat = csr.toarray()
dtrain = xgb.DMatrix(npymat, label=labels)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, watchlist)
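
# DMatrix also accepts a pandas DataFrame directly; a sketch, commented out so
# this demo does not require pandas to be installed.
# import pandas as pd
# df = pd.DataFrame(npymat)
# dtrain = xgb.DMatrix(df, label=labels)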