# -*- coding: utf-8 -*-
import numpy as np
import os
import xgboost as xgb
import pytest
import json
from pathlib import Path
import tempfile
import testing as tm

dpath = 'demo/data/'
rng = np.random.RandomState(1994)
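# NOTE: dpath assumes the tests are run from the repository root, where the
# bundled agaricus (mushroom) demo dataset lives in LIBSVM text format; the
# fixed seed keeps randomly generated inputs reproducible across runs.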


class TestBasic:
    def test_compat(self):
        from xgboost.compat import lazy_isinstance
        a = np.array([1, 2, 3])
        assert lazy_isinstance(a, 'numpy', 'ndarray')
        assert not lazy_isinstance(a, 'numpy', 'dataframe')

    def test_basic(self):
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        param = {'max_depth': 2, 'eta': 1,
                 'objective': 'binary:logistic'}
        # specify validation set to watch performance
        watchlist = [(dtrain, 'train')]
        num_round = 2
        bst = xgb.train(param, dtrain, num_round, watchlist, verbose_eval=True)

        preds = bst.predict(dtrain)
        labels = dtrain.get_label()
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        # error must be smaller than 10%
        assert err < 0.1

        preds = bst.predict(dtest)
        labels = dtest.get_label()
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        # error must be smaller than 10%
        assert err < 0.1

        with tempfile.TemporaryDirectory() as tmpdir:
            dtest_path = os.path.join(tmpdir, 'dtest.dmatrix')
            # save dmatrix into binary buffer
            dtest.save_binary(dtest_path)
            # save model
            model_path = os.path.join(tmpdir, 'model.booster')
            bst.save_model(model_path)
            # load model and data in
            bst2 = xgb.Booster(model_file=model_path)
            dtest2 = xgb.DMatrix(dtest_path)
            preds2 = bst2.predict(dtest2)
            # assert they are the same
            assert np.sum(np.abs(preds2 - preds)) == 0

    def test_metric_config(self):
        # Make sure the metric configuration is handled by the Booster wrapper so
        # that the list ['error', 'auc'] is not passed down to core as a raw string.
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                 'objective': 'binary:logistic', 'eval_metric': ['error', 'auc']}
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 2
        booster = xgb.train(param, dtrain, num_round, watchlist)
        predt_0 = booster.predict(dtrain)
        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, 'model.json')
            booster.save_model(path)

            booster = xgb.Booster(params=param, model_file=path)
            predt_1 = booster.predict(dtrain)
            np.testing.assert_allclose(predt_0, predt_1)

    def test_record_results(self):
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                 'objective': 'binary:logistic', 'eval_metric': 'error'}
        # specify validation sets to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 2
        result = {}
        res2 = {}
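        # Both mechanisms below should record the same nested history, roughly of
        # the form {'train': {'error': [...]}, 'eval': {'error': [...]}}, with one
        # entry per boosting round for each watched dataset and metric.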
        xgb.train(param, dtrain, num_round, watchlist,
                  callbacks=[xgb.callback.record_evaluation(result)])
        xgb.train(param, dtrain, num_round, watchlist,
                  evals_result=res2)
        assert result['train']['error'][0] < 0.1
        assert res2 == result

    def test_multiclass(self):
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        param = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'num_class': 2}
        # specify validation sets to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 2
        bst = xgb.train(param, dtrain, num_round, watchlist)
        # this is prediction
        preds = bst.predict(dtest)
        labels = dtest.get_label()
        err = sum(1 for i in range(len(preds))
                  if preds[i] != labels[i]) / float(len(preds))
        # error must be smaller than 10%
        assert err < 0.1

        with tempfile.TemporaryDirectory() as tmpdir:
            dtest_path = os.path.join(tmpdir, 'dtest.buffer')
            model_path = os.path.join(tmpdir, 'xgb.model')
            # save dmatrix into binary buffer
            dtest.save_binary(dtest_path)
            # save model
            bst.save_model(model_path)
            # load model and data in
            bst2 = xgb.Booster(model_file=model_path)
            dtest2 = xgb.DMatrix(dtest_path)
            preds2 = bst2.predict(dtest2)
            # assert they are the same
            assert np.sum(np.abs(preds2 - preds)) == 0

    def test_dump(self):
        data = np.random.randn(100, 2)
        target = np.array([0, 1] * 50)
        features = ['Feature1', 'Feature2']

        dm = xgb.DMatrix(data, label=target, feature_names=features)
        params = {'objective': 'binary:logistic',
                  'eval_metric': 'logloss',
                  'eta': 0.3,
                  'max_depth': 1}

        bst = xgb.train(params, dm, num_boost_round=1)

        # a single boosting round with max_depth=1 should produce exactly one stump
        dump1 = bst.get_dump()
        assert len(dump1) == 1, 'Expected only 1 tree to be dumped.'
        assert len(dump1[0].splitlines()) == 3, \
            'Expected 1 root and 2 leaves - 3 lines in dump.'

        dump2 = bst.get_dump(with_stats=True)
        assert dump2[0].count('\n') == 3, 'Expected 1 root and 2 leaves - 3 lines in dump.'
        msg = 'Expected more info when with_stats=True is given.'
        assert dump2[0].find('\n') > dump1[0].find('\n'), msg

        dump3 = bst.get_dump(dump_format="json")
        dump3j = json.loads(dump3[0])
        assert dump3j['nodeid'] == 0, 'Expected the root node on top.'

        dump4 = bst.get_dump(dump_format="json", with_stats=True)
        dump4j = json.loads(dump4[0])
        assert 'gain' in dump4j, "Expected 'gain' to be dumped in JSON."

        with pytest.raises(ValueError):
            bst.get_dump(fmap="foo")

    def test_feature_score(self):
        rng = np.random.RandomState(0)
        data = rng.randn(100, 2)
        target = np.array([0, 1] * 50)
        features = ["F0"]
        with pytest.raises(ValueError):
            xgb.DMatrix(data, label=target, feature_names=features)

        params = {"objective": "binary:logistic"}
        dm = xgb.DMatrix(data, label=target, feature_names=["F0", "F1"])
        booster = xgb.train(params, dm, num_boost_round=1)
        # No error here: feature names may be assigned before the booster has seen
        # any data, so it does not yet know the actual number of features.
        booster.feature_names = ["F0"]
        with pytest.raises(ValueError):
            booster.get_fscore()

        booster.feature_names = None
        # Use JSON to make sure the output has native Python type
        scores = json.loads(json.dumps(booster.get_fscore()))
        np.testing.assert_allclose(scores["f0"], 6.0)

    def test_load_file_invalid(self):
        with pytest.raises(xgb.core.XGBoostError):
            xgb.Booster(model_file='incorrect_path')

        with pytest.raises(xgb.core.XGBoostError):
            xgb.Booster(model_file=u'不正なパス')

    def test_dmatrix_numpy_init_omp(self):
        rows = [1000, 11326, 15000]
        cols = 50
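        # Construct DMatrix objects from dense numpy arrays of several sizes;
        # nthread=0 is assumed to fall back to XGBoost's default thread count,
        # while nthread=10 requests an explicit number of workers, exercising
        # the multi-threaded initialization path.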
        for row in rows:
            X = np.random.randn(row, cols)
            y = np.random.randn(row).astype('f')
            dm = xgb.DMatrix(X, y, nthread=0)
            np.testing.assert_array_equal(dm.get_label(), y)
            assert dm.num_row() == row
            assert dm.num_col() == cols

            dm = xgb.DMatrix(X, y, nthread=10)
            np.testing.assert_array_equal(dm.get_label(), y)
            assert dm.num_row() == row
            assert dm.num_col() == cols

    def test_cv(self):
        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
        params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                  'objective': 'binary:logistic'}

        # with as_pandas=False, xgb.cv returns a plain dict of evaluation history
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=False)
        assert isinstance(cv, dict)
        assert len(cv) == 4

    def test_cv_no_shuffle(self):
        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
        params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                  'objective': 'binary:logistic'}

        # with as_pandas=False, xgb.cv returns a plain dict of evaluation history
        cv = xgb.cv(params, dm, num_boost_round=10, shuffle=False, nfold=10,
                    as_pandas=False)
        assert isinstance(cv, dict)
        assert len(cv) == 4

    def test_cv_explicit_fold_indices(self):
        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
        params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective':
                  'binary:logistic'}
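        # xgb.cv also accepts pre-defined folds as a list of
        # (train_indices, test_indices) pairs, overriding the automatic
        # nfold splitting.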
        folds = [
            # Train        Test
            ([1, 3], [5, 8]),
            ([7, 9], [23, 43]),
        ]

        # with as_pandas=False, xgb.cv returns a plain dict of evaluation history
        cv = xgb.cv(params, dm, num_boost_round=10, folds=folds,
                    as_pandas=False)
        assert isinstance(cv, dict)
        assert len(cv) == 4

    @pytest.mark.skipif(**tm.skip_s390x())
    def test_cv_explicit_fold_indices_labels(self):
        params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective':
                  'reg:squarederror'}
        N = 100
        F = 3
        dm = xgb.DMatrix(data=np.random.randn(N, F), label=np.arange(N))
        folds = [
            # Train        Test
            ([1, 3], [5, 8]),
            ([7, 9], [23, 43, 11]),
        ]

        # Use callback to log the test labels in each fold
        def cb(cbackenv):
            print([fold.dtest.get_label() for fold in cbackenv.cvfolds])

        # Run cross validation and capture standard out to test callback result
        with tm.captured_output() as (out, err):
            xgb.cv(
                params, dm, num_boost_round=1, folds=folds, callbacks=[cb],
                as_pandas=False
            )
            output = out.getvalue().strip()
        solution = ('[array([5., 8.], dtype=float32), array([23., 43., 11.],' +
                    ' dtype=float32)]')
        assert output == solution


class TestBasicPathLike:
    """Unit tests using pathlib.Path for file interaction."""
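    # Every call below passes a pathlib.Path object straight to the XGBoost API,
    # so these tests double as a check that os.PathLike inputs are accepted
    # wherever a file name is expected (DMatrix, save_binary, save_model,
    # load_model).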

    def test_DMatrix_init_from_path(self):
        """Initialization from the data path."""
        dpath = Path('demo/data')
        dtrain = xgb.DMatrix(dpath / 'agaricus.txt.train')
        assert dtrain.num_row() == 6513
        assert dtrain.num_col() == 127

    def test_DMatrix_save_to_path(self):
        """Saving to a binary file using pathlib from a DMatrix."""
        data = np.random.randn(100, 2)
        target = np.array([0, 1] * 50)
        features = ['Feature1', 'Feature2']

        dm = xgb.DMatrix(data, label=target, feature_names=features)

        # save, assert exists, remove file
        binary_path = Path("dtrain.bin")
        dm.save_binary(binary_path)
        assert binary_path.exists()
        Path.unlink(binary_path)

    def test_Booster_init_invalid_path(self):
        """An invalid model_file path should raise XGBoostError."""
        with pytest.raises(xgb.core.XGBoostError):
            xgb.Booster(model_file=Path("invalidpath"))

    def test_Booster_save_and_load(self):
        """Saving and loading model files from paths."""
        save_path = Path("saveload.model")

        data = np.random.randn(100, 2)
        target = np.array([0, 1] * 50)
        features = ['Feature1', 'Feature2']

        dm = xgb.DMatrix(data, label=target, feature_names=features)
        params = {'objective': 'binary:logistic',
                  'eval_metric': 'logloss',
                  'eta': 0.3,
                  'max_depth': 1}

        bst = xgb.train(params, dm, num_boost_round=1)

        # save, assert exists
        bst.save_model(save_path)
        assert save_path.exists()

        def dump_assertions(dump):
            """Assertions for the expected dump from Booster"""
            assert len(dump) == 1, 'Expected only 1 tree to be dumped.'
            assert len(dump[0].splitlines()) == 3, 'Expected 1 root and 2 leaves - 3 lines.'

        # load the model again using Path
        bst2 = xgb.Booster(model_file=save_path)
        dump2 = bst2.get_dump()
        dump_assertions(dump2)

        # load again using load_model
        bst3 = xgb.Booster()
        bst3.load_model(save_path)
        dump3 = bst3.get_dump()
        dump_assertions(dump3)

        # remove file
        Path.unlink(save_path)