# -*- coding: utf-8 -*-
import numpy as np
import os
import xgboost as xgb
import pytest
import json
from pathlib import Path
import tempfile
import testing as tm

dpath = 'demo/data/'
rng = np.random.RandomState(1994)


class TestBasic:
    def test_compat(self):
        from xgboost.compat import lazy_isinstance
        a = np.array([1, 2, 3])
        assert lazy_isinstance(a, 'numpy', 'ndarray')
        assert not lazy_isinstance(a, 'numpy', 'dataframe')

    def test_basic(self):
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        param = {'max_depth': 2, 'eta': 1,
                 'objective': 'binary:logistic'}
        # specify validation sets to watch performance
        watchlist = [(dtrain, 'train')]
        num_round = 2
        bst = xgb.train(param, dtrain, num_round, watchlist, verbose_eval=True)

        preds = bst.predict(dtrain)
        labels = dtrain.get_label()
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        # error must be smaller than 10%
        assert err < 0.1

        preds = bst.predict(dtest)
        labels = dtest.get_label()
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        # error must be smaller than 10%
        assert err < 0.1

        with tempfile.TemporaryDirectory() as tmpdir:
            dtest_path = os.path.join(tmpdir, 'dtest.dmatrix')
            # save dmatrix into binary buffer
            dtest.save_binary(dtest_path)
            # save model
            model_path = os.path.join(tmpdir, 'model.booster')
            bst.save_model(model_path)
            # load model and data in
            bst2 = xgb.Booster(model_file=model_path)
            dtest2 = xgb.DMatrix(dtest_path)
            preds2 = bst2.predict(dtest2)
            # assert they are the same
            assert np.sum(np.abs(preds2 - preds)) == 0

    def test_metric_config(self):
        # Make sure that the metric configuration happens in booster so the
        # string `['error', 'auc']` doesn't get passed down to core.
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                 'objective': 'binary:logistic', 'eval_metric': ['error', 'auc']}
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 2
        booster = xgb.train(param, dtrain, num_round, watchlist)
        predt_0 = booster.predict(dtrain)
        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, 'model.json')
            booster.save_model(path)

            booster = xgb.Booster(params=param, model_file=path)
            predt_1 = booster.predict(dtrain)
            np.testing.assert_allclose(predt_0, predt_1)

    def test_record_results(self):
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                 'objective': 'binary:logistic', 'eval_metric': 'error'}
        # specify validation sets to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 2
        result = {}
        res2 = {}
        xgb.train(param, dtrain, num_round, watchlist,
                  callbacks=[xgb.callback.record_evaluation(result)])
        xgb.train(param, dtrain, num_round, watchlist,
                  evals_result=res2)
        assert result['train']['error'][0] < 0.1
        assert res2 == result

    def test_multiclass(self):
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        param = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'num_class': 2}
        # specify validation sets to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 2
        bst = xgb.train(param, dtrain, num_round, watchlist)
        # this is prediction
        preds = bst.predict(dtest)
        labels = dtest.get_label()
        err = sum(1 for i in range(len(preds))
                  if preds[i] != labels[i]) / float(len(preds))
        # error must be smaller than 10%
        assert err < 0.1

        with tempfile.TemporaryDirectory() as tmpdir:
            dtest_path = os.path.join(tmpdir, 'dtest.buffer')
            model_path = os.path.join(tmpdir, 'xgb.model')
            # save dmatrix into binary buffer
            dtest.save_binary(dtest_path)
            # save model
            bst.save_model(model_path)
            # load model and data in
            bst2 = xgb.Booster(model_file=model_path)
            dtest2 = xgb.DMatrix(dtest_path)
            preds2 = bst2.predict(dtest2)
            # assert they are the same
            assert np.sum(np.abs(preds2 - preds)) == 0

    def test_dump(self):
        data = np.random.randn(100, 2)
        target = np.array([0, 1] * 50)
        features = ['Feature1', 'Feature2']

        dm = xgb.DMatrix(data, label=target, feature_names=features)
        params = {'objective': 'binary:logistic',
                  'eval_metric': 'logloss',
                  'eta': 0.3,
                  'max_depth': 1}

        bst = xgb.train(params, dm, num_boost_round=1)

        # number of dumped trees should == number of boosting rounds
        dump1 = bst.get_dump()
        assert len(dump1) == 1, 'Expected only 1 tree to be dumped.'
        assert len(dump1[0].splitlines()) == 3, \
            'Expected 1 root and 2 leaves - 3 lines in dump.'

        dump2 = bst.get_dump(with_stats=True)
        assert dump2[0].count('\n') == 3, \
            'Expected 1 root and 2 leaves - 3 lines in dump.'
        msg = 'Expected more info when with_stats=True is given.'
        assert dump2[0].find('\n') > dump1[0].find('\n'), msg

        dump3 = bst.get_dump(dump_format="json")
        dump3j = json.loads(dump3[0])
        assert dump3j['nodeid'] == 0, 'Expected the root node on top.'

        dump4 = bst.get_dump(dump_format="json", with_stats=True)
        dump4j = json.loads(dump4[0])
        assert 'gain' in dump4j, "Expected 'gain' to be dumped in JSON."

        with pytest.raises(ValueError):
            bst.get_dump(fmap="foo")

    def test_feature_score(self):
        rng = np.random.RandomState(0)
        data = rng.randn(100, 2)
        target = np.array([0, 1] * 50)
        features = ["F0"]
        with pytest.raises(ValueError):
            xgb.DMatrix(data, label=target, feature_names=features)

        params = {"objective": "binary:logistic"}
        dm = xgb.DMatrix(data, label=target, feature_names=["F0", "F1"])
        booster = xgb.train(params, dm, num_boost_round=1)
        # No error yet: feature names may be assigned before the booster has seen
        # any data, so the booster doesn't know the actual number of features.
        booster.feature_names = ["F0"]
        with pytest.raises(ValueError):
            booster.get_fscore()

        booster.feature_names = None
        # Use JSON to make sure the output has native Python type
        scores = json.loads(json.dumps(booster.get_fscore()))
        np.testing.assert_allclose(scores["f0"], 6.0)

    def test_load_file_invalid(self):
        with pytest.raises(xgb.core.XGBoostError):
            xgb.Booster(model_file='incorrect_path')

        with pytest.raises(xgb.core.XGBoostError):
            xgb.Booster(model_file=u'不正なパス')

    def test_dmatrix_numpy_init_omp(self):
        rows = [1000, 11326, 15000]
        cols = 50
        for row in rows:
            X = np.random.randn(row, cols)
            y = np.random.randn(row).astype('f')
            dm = xgb.DMatrix(X, y, nthread=0)
            np.testing.assert_array_equal(dm.get_label(), y)
            assert dm.num_row() == row
            assert dm.num_col() == cols

            dm = xgb.DMatrix(X, y, nthread=10)
            np.testing.assert_array_equal(dm.get_label(), y)
            assert dm.num_row() == row
            assert dm.num_col() == cols

    def test_cv(self):
        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
        params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                  'objective': 'binary:logistic'}

        # with as_pandas=False the result is a plain dict of metric histories
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=False)
        assert isinstance(cv, dict)
        assert len(cv) == 4

    def test_cv_no_shuffle(self):
        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
        params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                  'objective': 'binary:logistic'}

        # with as_pandas=False the result is a plain dict of metric histories
        cv = xgb.cv(params, dm, num_boost_round=10, shuffle=False, nfold=10,
                    as_pandas=False)
        assert isinstance(cv, dict)
        assert len(cv) == 4

    def test_cv_explicit_fold_indices(self):
        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
        params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                  'objective': 'binary:logistic'}
        folds = [
            # Train    Test
            ([1, 3], [5, 8]),
            ([7, 9], [23, 43]),
        ]

        # with as_pandas=False the result is a plain dict of metric histories
        cv = xgb.cv(params, dm, num_boost_round=10, folds=folds,
                    as_pandas=False)
        assert isinstance(cv, dict)
        assert len(cv) == 4

    @pytest.mark.skipif(**tm.skip_s390x())
    def test_cv_explicit_fold_indices_labels(self):
        params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                  'objective': 'reg:squarederror'}
        N = 100
        F = 3
        dm = xgb.DMatrix(data=np.random.randn(N, F), label=np.arange(N))
        folds = [
            # Train    Test
            ([1, 3], [5, 8]),
            ([7, 9], [23, 43, 11]),
        ]

        # Use callback to log the test labels in each fold
        def cb(cbackenv):
            print([fold.dtest.get_label() for fold in cbackenv.cvfolds])

        # Run cross validation and capture standard out to test callback result
        with tm.captured_output() as (out, err):
            xgb.cv(
                params, dm, num_boost_round=1, folds=folds, callbacks=[cb],
                as_pandas=False
            )
            output = out.getvalue().strip()
        solution = ('[array([5., 8.], dtype=float32), array([23., 43., 11.],' +
                    ' dtype=float32)]')
        assert output == solution


class TestBasicPathLike:
    """Unit tests using pathlib.Path for file interaction."""

    def test_DMatrix_init_from_path(self):
        """Initialization from the data path."""
        dpath = Path('demo/data')
        dtrain = xgb.DMatrix(dpath / 'agaricus.txt.train')
        assert dtrain.num_row() == 6513
        assert dtrain.num_col() == 127

    def test_DMatrix_save_to_path(self):
        """Saving to a binary file using pathlib from a DMatrix."""
        data = np.random.randn(100, 2)
        target = np.array([0, 1] * 50)
        features = ['Feature1', 'Feature2']

        dm = xgb.DMatrix(data, label=target, feature_names=features)

        # save, assert exists, remove file
        binary_path = Path("dtrain.bin")
        dm.save_binary(binary_path)
        assert binary_path.exists()
        Path.unlink(binary_path)

    def test_Booster_init_invalid_path(self):
        """An invalid model_file path should raise XGBoostError."""
        with pytest.raises(xgb.core.XGBoostError):
            xgb.Booster(model_file=Path("invalidpath"))

    def test_Booster_save_and_load(self):
        """Saving and loading model files from paths."""
        save_path = Path("saveload.model")

        data = np.random.randn(100, 2)
        target = np.array([0, 1] * 50)
        features = ['Feature1', 'Feature2']

        dm = xgb.DMatrix(data, label=target, feature_names=features)
        params = {'objective': 'binary:logistic',
                  'eval_metric': 'logloss',
                  'eta': 0.3,
                  'max_depth': 1}

        bst = xgb.train(params, dm, num_boost_round=1)

        # save, assert exists
        bst.save_model(save_path)
        assert save_path.exists()

        def dump_assertions(dump):
            """Assertions for the expected dump from Booster"""
            assert len(dump) == 1, 'Expected only 1 tree to be dumped.'
            assert len(dump[0].splitlines()) == 3, \
                'Expected 1 root and 2 leaves - 3 lines.'

        # load the model again using Path
        bst2 = xgb.Booster(model_file=save_path)
        dump2 = bst2.get_dump()
        dump_assertions(dump2)

        # load again using load_model
        bst3 = xgb.Booster()
        bst3.load_model(save_path)
        dump3 = bst3.get_dump()
        dump_assertions(dump3)

        # remove file
        Path.unlink(save_path)