# coding: utf-8
"""Shared helpers for the XGBoost Python test-suite.

Contains skip-condition helpers (``no_*``), small dataset loaders cached with
joblib, hypothesis strategies that draw example datasets, custom
objective/metric functions and a few context managers.
"""
import importlib
import os
import urllib
# ``import urllib`` alone does not guarantee that the ``request`` submodule is
# loaded; ``get_mq2008`` needs ``urllib.request.urlretrieve``.
import urllib.request
import zipfile
import sys
from contextlib import contextmanager
from io import StringIO
from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
from xgboost.compat import DASK_INSTALLED
import pytest
import gc
import xgboost as xgb
import numpy as np
import platform

hypothesis = pytest.importorskip('hypothesis')
sklearn = pytest.importorskip('sklearn')
from hypothesis import strategies
from hypothesis.extra.numpy import arrays
from joblib import Memory
from sklearn import datasets

try:
    import cupy as cp
except ImportError:
    cp = None

memory = Memory('./cachedir', verbose=0)


def _no_module(module_name, reason):
    """Return a skip-condition dict that is true when a module can't be imported.

    The result has the keys ``condition`` and ``reason``, the same shape every
    public ``no_*`` helper in this file returns (presumably meant to be passed
    as keyword arguments to ``pytest.mark.skipif`` -- confirm against callers).
    """
    try:
        importlib.import_module(module_name)
    except ImportError:
        return {'condition': True, 'reason': reason}
    return {'condition': False, 'reason': reason}


def no_sklearn():
    """Skip condition: scikit-learn is not installed."""
    return {'condition': not SKLEARN_INSTALLED,
            'reason': 'Scikit-Learn is not installed'}


def no_dask():
    """Skip condition: dask is not installed."""
    return {'condition': not DASK_INSTALLED,
            'reason': 'Dask is not installed'}


def no_pandas():
    """Skip condition: pandas is not installed."""
    return {'condition': not PANDAS_INSTALLED,
            'reason': 'Pandas is not installed.'}


def no_modin():
    """Skip condition: ``modin.pandas`` cannot be imported."""
    return _no_module('modin.pandas', 'Modin is not installed.')


def no_dt():
    """Skip condition: no import spec can be found for ``datatable``.

    Unlike the other helpers this only looks the module up; it does not
    import it.
    """
    import importlib.util
    spec = importlib.util.find_spec('datatable')
    return {'condition': spec is None,
            'reason': 'Datatable is not installed.'}


def no_matplotlib():
    """Skip condition: ``matplotlib.pyplot`` cannot be imported."""
    return _no_module('matplotlib.pyplot', 'Matplotlib is not installed.')


def no_dask_cuda():
    """Skip condition: ``dask_cuda`` cannot be imported."""
    return _no_module('dask_cuda', 'dask_cuda is not installed.')


def no_cudf():
    """Skip condition: ``cudf`` cannot be imported."""
    return _no_module('cudf', 'CUDF is not installed')


def no_cupy():
    """Skip condition: ``cupy`` cannot be imported."""
    return _no_module('cupy', 'cupy is not installed.')


def no_dask_cudf():
    """Skip condition: ``dask_cudf`` cannot be imported."""
    return _no_module('dask_cudf', 'dask_cudf is not installed.')


def no_json_schema():
    """Skip condition: ``jsonschema`` cannot be imported."""
    return _no_module('jsonschema', 'jsonschema is not installed')


def no_graphviz():
    """Skip condition: ``graphviz`` cannot be imported."""
    return _no_module('graphviz', 'graphviz is not installed')


def no_multiple(*args):
    """Combine several skip-condition dicts into one.

    Returns the condition and reason of the first argument whose
    ``condition`` is truthy.  When none is, the condition is ``False`` and
    the reason is the empty string.
    """
    for arg in args:
        if arg['condition']:
            return {'condition': arg['condition'], 'reason': arg['reason']}
    return {'condition': False, 'reason': ''}


def skip_s390x():
    """Skip condition: the current machine type is ``s390x``."""
    condition = platform.machine() == "s390x"
    reason = "Known to fail on s390x"
    return {"condition": condition, "reason": reason}


class IteratorForTest(xgb.core.DataIter):
    """Data iterator that feeds pre-split batches of ``X`` and ``y`` to XGBoost.

    ``X`` and ``y`` are equally long sequences of batches; batch ``i`` of
    ``X`` is paired with batch ``i`` of ``y``.
    """

    def __init__(self, X, y):
        assert len(X) == len(y)
        self.X = X
        self.y = y
        self.it = 0  # index of the next batch to hand out
        super().__init__("./")

    def next(self, input_data):
        """Pass the next batch to ``input_data``.

        Returns 0 when all batches have been consumed, 1 otherwise.
        """
        if self.it == len(self.X):
            return 0
        # Use copy to make sure the iterator doesn't hold a reference to the data.
        input_data(data=self.X[self.it].copy(), label=self.y[self.it].copy())
        gc.collect()  # clear up the copy, see if XGBoost access freed memory.
        self.it += 1
        return 1

    def reset(self):
        """Rewind to the first batch."""
        self.it = 0

    def as_arrays(self):
        """Return all batches concatenated along axis 0 as ``(X, y)``."""
        X = np.concatenate(self.X, axis=0)
        y = np.concatenate(self.y, axis=0)
        return X, y


# Contains a dataset in numpy format as well as the relevant objective and metric
class TestDataset:
    """A named dataset together with the objective and metric to train it with.

    ``get_dataset`` is called once, at construction, and must return
    ``(X, y)``.  ``w`` (weights) and ``margin`` (base margin) start as ``None``
    and may be assigned afterwards.
    """

    def __init__(self, name, get_dataset, objective, metric):
        self.name = name
        self.objective = objective
        self.metric = metric
        self.X, self.y = get_dataset()
        self.w = None
        self.margin = None

    def set_params(self, params_in):
        """Write objective, metric (and ``num_class`` for multi:softmax) into
        ``params_in`` in place and return it."""
        params_in['objective'] = self.objective
        params_in['eval_metric'] = self.metric
        if self.objective == "multi:softmax":
            params_in["num_class"] = int(np.max(self.y) + 1)
        return params_in

    def get_dmat(self):
        """Build an ``xgb.DMatrix`` from the in-memory arrays."""
        return xgb.DMatrix(self.X, self.y, self.w, base_margin=self.margin)

    def get_device_dmat(self):
        """Build an ``xgb.DeviceQuantileDMatrix`` from cupy copies of the data.

        Requires cupy; ``cp`` is ``None`` when the import at the top of the
        file failed.
        """
        w = None if self.w is None else cp.array(self.w)
        X = cp.array(self.X, dtype=np.float32)
        y = cp.array(self.y, dtype=np.float32)
        return xgb.DeviceQuantileDMatrix(X, y, w, base_margin=self.margin)

    def get_external_dmat(self):
        """Build an ``xgb.DMatrix`` from the data split into 10 batches and
        fed through ``IteratorForTest``.

        Asserts that every batch is non-empty, so the dataset must have
        enough rows to fill all 10 batches.
        """
        n_samples = self.X.shape[0]
        n_batches = 10
        per_batch = n_samples // n_batches + 1

        predictor = []
        response = []
        for i in range(n_batches):
            beg = i * per_batch
            end = min((i + 1) * per_batch, n_samples)
            assert end != beg
            X = self.X[beg: end, ...]
            y = self.y[beg: end]
            predictor.append(X)
            response.append(y)

        it = IteratorForTest(predictor, response)
        return xgb.DMatrix(it)

    def __repr__(self):
        return self.name


@memory.cache
def get_boston():
    """Return ``(data, target)`` of scikit-learn's Boston dataset."""
    data = datasets.load_boston()
    return data.data, data.target


@memory.cache
def get_digits():
    """Return ``(data, target)`` of scikit-learn's digits dataset."""
    data = datasets.load_digits()
    return data.data, data.target


@memory.cache
def get_cancer():
    """Return ``(data, target)`` of scikit-learn's breast cancer dataset."""
    data = datasets.load_breast_cancer()
    return data.data, data.target


@memory.cache
def get_sparse():
    """Return a 2000-sample regression problem with entries replaced by NaN.

    Each entry of ``X`` is independently set to NaN with probability 0.75,
    using a fixed random seed.
    """
    rng = np.random.RandomState(199)
    n = 2000
    sparsity = 0.75
    X, y = datasets.make_regression(n, random_state=rng)
    flag = rng.binomial(1, sparsity, X.shape)
    # Vectorised equivalent of visiting every (i, j) with flag[i, j] set.
    X[flag.astype(bool)] = np.nan
    return X, y


@memory.cache
def get_mq2008(dpath):
    """Download (if needed), extract and load fold 1 of the MQ2008 dataset.

    The archive is stored in and extracted into ``dpath``.  Returns the
    9-tuple ``(x_train, y_train, qid_train, x_test, y_test, qid_test,
    x_valid, y_valid, qid_valid)``.
    """
    from sklearn.datasets import load_svmlight_files

    src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip'
    # Join every path the same way so the result does not depend on whether
    # ``dpath`` ends with a separator.
    target = os.path.join(dpath, 'MQ2008.zip')
    if not os.path.exists(target):
        urllib.request.urlretrieve(url=src, filename=target)

    with zipfile.ZipFile(target, 'r') as f:
        f.extractall(path=dpath)

    fold = os.path.join(dpath, 'MQ2008', 'Fold1')
    (x_train, y_train, qid_train, x_test, y_test, qid_test,
     x_valid, y_valid, qid_valid) = load_svmlight_files(
         (os.path.join(fold, 'train.txt'),
          os.path.join(fold, 'test.txt'),
          os.path.join(fold, 'vali.txt')),
         query_id=True, zero_based=False)

    return (x_train, y_train, qid_train, x_test, y_test, qid_test,
            x_valid, y_valid, qid_valid)


@memory.cache
def make_categorical(
    n_samples: int, n_features: int, n_categories: int, onehot: bool
):
    """Generate a random categorical data frame and a label derived from it.

    ``n_features + 1`` integer columns with values in ``[0, n_categories)``
    are drawn with a fixed seed.  The first column seeds the label, the
    remaining ``n_features`` columns are added to it, then 1 is added.  The
    feature columns are returned as pandas categoricals, or one-hot encoded
    with ``pd.get_dummies`` when ``onehot`` is true.

    Returns ``(df, label)``.
    """
    import pandas as pd

    rng = np.random.RandomState(1994)

    pd_dict = {}
    for i in range(n_features + 1):
        c = rng.randint(low=0, high=n_categories, size=n_samples)
        pd_dict[str(i)] = pd.Series(c, dtype=np.int64)

    df = pd.DataFrame(pd_dict)
    label = df.iloc[:, 0]
    df = df.iloc[:, 1:]
    for i in range(0, n_features):
        label += df.iloc[:, i]
    label += 1

    df = df.astype("category")
    categories = np.arange(0, n_categories)
    for col in df.columns:
        df[col] = df[col].cat.set_categories(categories)

    if onehot:
        return pd.get_dummies(df), label
    return df, label


_unweighted_datasets_strategy = strategies.sampled_from(
    [TestDataset('boston', get_boston, 'reg:squarederror', 'rmse'),
     TestDataset('digits', get_digits, 'multi:softmax', 'mlogloss'),
     TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
     TestDataset("sparse", get_sparse, "reg:squarederror", "rmse"),
     TestDataset("empty", lambda: (np.empty((0, 100)), np.empty(0)), "reg:squarederror",
                 "rmse")])


@strategies.composite
def _dataset_weight_margin(draw):
    """Draw a dataset and randomly attach weights and/or a base margin."""
    data = draw(_unweighted_datasets_strategy)
    # The TestDataset objects are shared between draws.  Reset the optional
    # fields so values attached by an earlier example cannot leak into this
    # one when the booleans below come out false.
    data.w = None
    data.margin = None
    if draw(strategies.booleans()):
        data.w = draw(arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0)))
    if draw(strategies.booleans()):
        num_class = 1
        if data.objective == "multi:softmax":
            num_class = int(np.max(data.y) + 1)
        data.margin = draw(
            arrays(np.float64, (len(data.y) * num_class), elements=strategies.floats(0.5, 1.0)))

    return data


# A strategy for drawing from a set of example datasets
# May add random weights to the dataset
dataset_strategy = _dataset_weight_margin()


def non_increasing(L, tolerance=1e-4):
    """Return True when no element exceeds its predecessor by ``tolerance`` or more."""
    return all((y - x) < tolerance for x, y in zip(L, L[1:]))


def eval_error_metric(predt, dtrain: xgb.DMatrix):
    """Custom error metric: count predictions on the wrong side of 0.5.

    Returns ``('CustomErr', value)``; the value is 0 for empty predictions.
    """
    if predt.size == 0:
        return "CustomErr", 0
    label = dtrain.get_label()
    r = np.zeros(predt.shape)
    gt = predt > 0.5
    r[gt] = 1 - label[gt]
    le = predt <= 0.5
    r[le] = label[le]
    return 'CustomErr', np.sum(r)


def softmax(x):
    """Return ``exp(x)`` normalised by the sum over all elements of ``x``."""
    e = np.exp(x)
    return e / np.sum(e)


def softprob_obj(classes):
    """Create a custom multi-class softprob objective for ``classes`` classes.

    The returned ``objective(labels, predt)`` computes the gradient and
    hessian for every (row, class) pair and returns them reshaped to
    ``(rows * classes, 1)``.
    """
    def objective(labels, predt):
        rows = labels.shape[0]
        grad = np.zeros((rows, classes), dtype=float)
        hess = np.zeros((rows, classes), dtype=float)
        eps = 1e-6
        for r in range(predt.shape[0]):
            target = labels[r]
            # The original check used ``or`` and therefore could never fail.
            assert 0 <= target < classes
            p = softmax(predt[r, :])
            for c in range(predt.shape[1]):
                g = p[c] - 1.0 if c == target else p[c]
                # Keep the hessian strictly positive.
                h = max((2.0 * p[c] * (1.0 - p[c])).item(), eps)
                grad[r, c] = g
                hess[r, c] = h

        grad = grad.reshape((rows * classes, 1))
        hess = hess.reshape((rows * classes, 1))
        return grad, hess

    return objective


class DirectoryExcursion:
    """Change directory. Change back and optionally clean up the directory on exit.

    With ``cleanup=True`` every file that exists under ``path`` on exit but
    did not exist on entry is removed.
    """

    def __init__(self, path: os.PathLike, cleanup=False):
        self.path = path
        self.curdir = os.path.normpath(os.path.abspath(os.path.curdir))
        self.cleanup = cleanup
        self.files = set()
        self._root = None  # absolute form of ``path``, resolved on entry

    def _list_files(self):
        """Return the set of all file paths found below the resolved root."""
        return {
            os.path.join(root, f)
            for root, subdir, files in os.walk(self._root) for f in files
        }

    def __enter__(self):
        # Resolve before changing directory: a relative ``path`` would
        # otherwise be walked relative to the new working directory, yield an
        # empty snapshot, and make ``__exit__`` delete pre-existing files.
        self._root = os.path.abspath(self.path)
        os.chdir(self.path)
        if self.cleanup:
            self.files = self._list_files()

    def __exit__(self, *args):
        os.chdir(self.curdir)
        if self.cleanup:
            files = self._list_files()
            diff = files.difference(self.files)
            for f in diff:
                os.remove(f)


@contextmanager
def captured_output():
    """Reassign stdout temporarily in order to test printed statements
    Taken from:
    https://stackoverflow.com/questions/4219717/how-to-assert-output-with-nosetest-unittest-in-python

    Also works for pytest.

    Yields the ``(stdout, stderr)`` replacement ``StringIO`` objects and
    restores the real streams on exit, even when the body raises.
    """
    new_out, new_err = StringIO(), StringIO()
    old_out, old_err = sys.stdout, sys.stderr
    try:
        sys.stdout, sys.stderr = new_out, new_err
        yield sys.stdout, sys.stderr
    finally:
        sys.stdout, sys.stderr = old_out, old_err


try:
    # Python 3.7+
    from contextlib import nullcontext as noop_context
except ImportError:
    # Python 3.6
    from contextlib import suppress as noop_context


CURDIR = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
PROJECT_ROOT = os.path.normpath(
    os.path.join(CURDIR, os.path.pardir, os.path.pardir))