1# coding: utf-8
2import os
3import urllib
4import zipfile
5import sys
6from contextlib import contextmanager
7from io import StringIO
8from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
9from xgboost.compat import DASK_INSTALLED
10import pytest
11import gc
12import xgboost as xgb
13import numpy as np
14import platform
15
16hypothesis = pytest.importorskip('hypothesis')
17sklearn = pytest.importorskip('sklearn')
18from hypothesis import strategies
19from hypothesis.extra.numpy import arrays
20from joblib import Memory
21from sklearn import datasets
22
# cupy is optional: when it cannot be imported, `cp` is set to None so this
# module can still be imported without it.
try:
    import cupy as cp
except ImportError:
    cp = None

# joblib cache rooted at './cachedir'; used by the `@memory.cache` decorated
# dataset loaders in this module.
memory = Memory('./cachedir', verbose=0)
29
30
def no_sklearn():
    """Return a skip descriptor that is active when scikit-learn is unavailable.

    The result is a dict with a boolean 'condition' and a 'reason' string
    (presumably unpacked into a pytest skip marker -- usage not shown here).
    """
    missing = not SKLEARN_INSTALLED
    return dict(condition=missing, reason='Scikit-Learn is not installed')
34
35
def no_dask():
    """Return a skip descriptor that is active when Dask is unavailable.

    The result is a dict with a boolean 'condition' and a 'reason' string.
    """
    missing = not DASK_INSTALLED
    return dict(condition=missing, reason='Dask is not installed')
39
40
def no_pandas():
    """Return a skip descriptor that is active when pandas is unavailable.

    The result is a dict with a boolean 'condition' and a 'reason' string.
    """
    missing = not PANDAS_INSTALLED
    return dict(condition=missing, reason='Pandas is not installed.')
44
45
def no_modin():
    """Return a skip descriptor that is active when `modin.pandas` cannot be imported.

    The result is a dict with a boolean 'condition' (True when the import
    fails) and a fixed 'reason' string.
    """
    try:
        import modin.pandas as _  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {'condition': missing, 'reason': 'Modin is not installed.'}
53
54
def no_dt():
    """Return a skip descriptor that is active when `datatable` cannot be located.

    Only the module spec is looked up; the package itself is not imported.
    """
    from importlib.util import find_spec

    missing = find_spec('datatable') is None
    return dict(condition=missing, reason='Datatable is not installed.')
60
61
def no_matplotlib():
    """Return a skip descriptor that is active when `matplotlib.pyplot` cannot be imported.

    The result is a dict with a boolean 'condition' (True when the import
    fails) and a fixed 'reason' string.
    """
    try:
        import matplotlib.pyplot as _  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {'condition': missing, 'reason': 'Matplotlib is not installed.'}
71
72
def no_dask_cuda():
    """Return a skip descriptor that is active when `dask_cuda` cannot be imported.

    The result is a dict with a boolean 'condition' (True when the import
    fails) and a fixed 'reason' string.
    """
    try:
        import dask_cuda as _  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {'condition': missing, 'reason': 'dask_cuda is not installed.'}
80
81
def no_cudf():
    """Return a skip descriptor that is active when `cudf` cannot be imported.

    The result is a dict with a boolean 'condition' (True when the import
    fails) and a fixed 'reason' string.
    """
    reason = 'CUDF is not installed'
    try:
        import cudf  # noqa
    except ImportError:
        return {'condition': True, 'reason': reason}
    return {'condition': False, 'reason': reason}
91
92
def no_cupy():
    """Return a skip descriptor that is active when `cupy` cannot be imported.

    The result is a dict with a boolean 'condition' (True when the import
    fails) and a fixed 'reason' string.
    """
    try:
        import cupy as _  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {'condition': missing, 'reason': 'cupy is not installed.'}
100
101
def no_dask_cudf():
    """Return a skip descriptor that is active when `dask_cudf` cannot be imported.

    The result is a dict with a boolean 'condition' (True when the import
    fails) and a fixed 'reason' string.
    """
    try:
        import dask_cudf as _  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {'condition': missing, 'reason': 'dask_cudf is not installed.'}
109
110
def no_json_schema():
    """Return a skip descriptor that is active when `jsonschema` cannot be imported.

    The result is a dict with a boolean 'condition' (True when the import
    fails) and a fixed 'reason' string.
    """
    try:
        import jsonschema  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {'condition': missing, 'reason': 'jsonschema is not installed'}
118
119
def no_graphviz():
    """Return a skip descriptor that is active when `graphviz` cannot be imported.

    The result is a dict with a boolean 'condition' (True when the import
    fails) and a fixed 'reason' string.
    """
    try:
        import graphviz  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {'condition': missing, 'reason': 'graphviz is not installed'}
127
128
def no_multiple(*args):
    """Combine several skip descriptors into one.

    Each argument is a dict with 'condition' and 'reason' keys.  The result
    carries the condition and reason of the first argument whose condition is
    truthy; when none is, the reason is the empty string.
    """
    condition, reason = False, ''
    for descriptor in args:
        flag = descriptor['condition']
        condition = (condition or flag)
        if flag:
            # The first active descriptor decides the reported reason.
            reason = descriptor['reason']
            break
    return dict(condition=condition, reason=reason)
138
139
def skip_s390x():
    """Return a skip descriptor that is active when running on an s390x machine.

    The machine type is taken from `platform.machine()`.
    """
    on_s390x = platform.machine() == "s390x"
    return {"condition": on_s390x, "reason": "Known to fail on s390x"}
144
145
class IteratorForTest(xgb.core.DataIter):
    """Data iterator over pre-split batches of features and labels.

    `X` and `y` are equally long sequences of batches; each call to `next`
    hands one batch (as fresh copies) to the supplied callback.
    """

    def __init__(self, X, y):
        assert len(X) == len(y)
        self.X, self.y = X, y
        self.it = 0  # index of the next batch to hand out
        super().__init__("./")

    def next(self, input_data):
        """Feed the next batch to `input_data`; return 1, or 0 when exhausted."""
        if self.it != len(self.X):
            # Pass the copies as temporaries so that nothing on the Python side
            # keeps them alive once the callback returns.
            input_data(data=self.X[self.it].copy(), label=self.y[self.it].copy())
            # Force collection of the copies to check XGBoost does not touch
            # freed memory afterwards.
            gc.collect()
            self.it += 1
            return 1
        return 0

    def reset(self):
        """Rewind to the first batch."""
        self.it = 0

    def as_arrays(self):
        """Return all batches concatenated along axis 0 as `(X, y)`."""
        return np.concatenate(self.X, axis=0), np.concatenate(self.y, axis=0)
170
171
172# Contains a dataset in numpy format as well as the relevant objective and metric
class TestDataset:
    """A named dataset in numpy format bundled with its objective and metric.

    `get_dataset` is called once at construction and must return `(X, y)`.
    Optional sample weights (`w`) and base margin (`margin`) start as None.
    """

    def __init__(self, name, get_dataset, objective, metric):
        self.name = name
        self.objective = objective
        self.metric = metric
        features, labels = get_dataset()
        self.X = features
        self.y = labels
        self.w = None
        self.margin = None

    def set_params(self, params_in):
        """Write objective/metric (and num_class for multi:softmax) into `params_in`.

        The dict is modified in place and also returned.
        """
        params_in.update(objective=self.objective, eval_metric=self.metric)
        if self.objective == "multi:softmax":
            # Number of classes is derived from the largest label value.
            params_in["num_class"] = int(np.max(self.y) + 1)
        return params_in

    def get_dmat(self):
        """Build a DMatrix from the in-memory arrays, weights and base margin."""
        return xgb.DMatrix(self.X, self.y, self.w, base_margin=self.margin)

    def get_device_dmat(self):
        """Build a DeviceQuantileDMatrix from cupy copies of the data.

        Uses the module-level `cp`, which is None when cupy is not importable.
        """
        if self.w is None:
            w = None
        else:
            w = cp.array(self.w)
        X = cp.array(self.X, dtype=np.float32)
        y = cp.array(self.y, dtype=np.float32)
        return xgb.DeviceQuantileDMatrix(X, y, w, base_margin=self.margin)

    def get_external_dmat(self):
        """Build a DMatrix by feeding the data in 10 row batches via IteratorForTest.

        Only X and y are passed on; weights and base margin are not used here.
        """
        n_samples = self.X.shape[0]
        n_batches = 10
        per_batch = n_samples // n_batches + 1

        predictor, response = [], []
        for batch in range(n_batches):
            beg = batch * per_batch
            end = min(beg + per_batch, n_samples)
            assert end != beg
            predictor.append(self.X[beg: end, ...])
            response.append(self.y[beg: end])

        return xgb.DMatrix(IteratorForTest(predictor, response))

    def __repr__(self):
        return self.name
219
220
@memory.cache
def get_boston():
    """Load the scikit-learn Boston dataset and return `(data, target)`."""
    bunch = datasets.load_boston()
    features, target = bunch.data, bunch.target
    return features, target
225
226
@memory.cache
def get_digits():
    """Load the scikit-learn digits dataset and return `(data, target)`."""
    bunch = datasets.load_digits()
    features, target = bunch.data, bunch.target
    return features, target
231
232
@memory.cache
def get_cancer():
    """Load the scikit-learn breast cancer dataset and return `(data, target)`."""
    bunch = datasets.load_breast_cancer()
    features, target = bunch.data, bunch.target
    return features, target
237
238
@memory.cache
def get_sparse():
    """Return a regression dataset `(X, y)` in which many entries of X are NaN.

    2000 samples are generated with `make_regression` from a fixed seed; each
    entry of X is then independently replaced by NaN with probability 0.75.
    """
    rng = np.random.RandomState(199)
    n = 2000
    sparsity = 0.75
    X, y = datasets.make_regression(n, random_state=rng)
    # 0/1 mask drawn after the data so the random stream is consumed in a fixed order.
    flag = rng.binomial(1, sparsity, X.shape)
    # Boolean-mask assignment replaces the element-by-element Python loop.
    X[flag.astype(bool)] = np.nan
    return X, y
251
252
@memory.cache
def get_mq2008(dpath):
    """Download (if needed), extract and load fold 1 of the MQ2008 ranking dataset.

    Parameters
    ----------
    dpath :
        Directory in which the archive is stored and extracted.  A trailing
        path separator is not required.

    Returns
    -------
    tuple
        ``(x_train, y_train, qid_train, x_test, y_test, qid_test,
        x_valid, y_valid, qid_valid)`` as returned by ``load_svmlight_files``.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request
    from sklearn.datasets import load_svmlight_files

    src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip'
    target = os.path.join(dpath, 'MQ2008.zip')
    if not os.path.exists(target):
        urllib.request.urlretrieve(url=src, filename=target)

    with zipfile.ZipFile(target, 'r') as f:
        f.extractall(path=dpath)

    # Build every path with os.path.join so the archive and the extracted
    # files are located consistently whether or not `dpath` ends with a
    # separator.
    fold = os.path.join(dpath, 'MQ2008', 'Fold1')
    (x_train, y_train, qid_train, x_test, y_test, qid_test,
     x_valid, y_valid, qid_valid) = load_svmlight_files(
         (os.path.join(fold, 'train.txt'),
          os.path.join(fold, 'test.txt'),
          os.path.join(fold, 'vali.txt')),
         query_id=True, zero_based=False)

    return (x_train, y_train, qid_train, x_test, y_test, qid_test,
            x_valid, y_valid, qid_valid)
274
275
@memory.cache
def make_categorical(
    n_samples: int, n_features: int, n_categories: int, onehot: bool
):
    """Generate a random categorical dataset with a fixed seed.

    `n_features + 1` integer columns with values in `[0, n_categories)` are
    drawn.  The label is the row-wise sum of all of them plus one; the first
    column is then dropped from the features.  The remaining columns are
    converted to pandas categoricals over `0 .. n_categories - 1`.

    Returns `(features, label)`; the features are one-hot encoded with
    `pd.get_dummies` when `onehot` is true.
    """
    import pandas as pd

    rng = np.random.RandomState(1994)

    # Columns are drawn in index order so the random stream is reproducible.
    columns = {
        str(idx): pd.Series(
            rng.randint(low=0, high=n_categories, size=n_samples), dtype=np.int64
        )
        for idx in range(n_features + 1)
    }
    frame = pd.DataFrame(columns)

    # Column "0" seeds the label; the other columns become the features.
    label = frame.iloc[:, 0]
    frame = frame.iloc[:, 1:]
    for idx in range(n_features):
        label += frame.iloc[:, idx]
    label += 1

    frame = frame.astype("category")
    all_categories = np.arange(0, n_categories)
    for name in frame.columns:
        # Give every column the full category set, including unseen values.
        frame[name] = frame[name].cat.set_categories(all_categories)

    if not onehot:
        return frame, label
    return pd.get_dummies(frame), label
304
305
# Strategy sampling one of the example datasets, without weights or base margin.
_unweighted_datasets_strategy = strategies.sampled_from([
    TestDataset('boston', get_boston, 'reg:squarederror', 'rmse'),
    TestDataset('digits', get_digits, 'multi:softmax', 'mlogloss'),
    TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
    TestDataset("sparse", get_sparse, "reg:squarederror", "rmse"),
    # Dataset with zero rows and 100 columns.
    TestDataset(
        "empty",
        lambda: (np.empty((0, 100)), np.empty(0)),
        "reg:squarederror",
        "rmse",
    ),
])
314
315
@strategies.composite
def _dataset_weight_margin(draw):
    """Draw an example dataset and optionally attach random weights and base margin.

    Weights are drawn from [0.1, 2.0], one per label.  The base margin is
    drawn from [0.5, 1.0] with one value per label and class (the class count
    is derived from the labels for `multi:softmax`, otherwise 1).

    Returns a TestDataset that shares X and y with the sampled dataset but
    carries its own `w` and `margin`.
    """
    import copy

    # `sampled_from` hands back the very same TestDataset objects on every
    # draw.  Work on a shallow copy so weights/margins set here do not leak
    # into later examples that draw the same dataset.
    data = copy.copy(draw(_unweighted_datasets_strategy))
    if draw(strategies.booleans()):
        data.w = draw(arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0)))
    if draw(strategies.booleans()):
        num_class = 1
        if data.objective == "multi:softmax":
            num_class = int(np.max(data.y) + 1)
        data.margin = draw(
            arrays(np.float64, (len(data.y) * num_class), elements=strategies.floats(0.5, 1.0)))

    return data
329
330
# A strategy for drawing from a set of example datasets.
# May add random weights and/or a random base margin to the dataset.
dataset_strategy = _dataset_weight_margin()
334
335
def non_increasing(L, tolerance=1e-4):
    """Return True when no element of `L` exceeds its predecessor by `tolerance` or more.

    Empty and single-element sequences are considered non-increasing.
    """
    for previous, current in zip(L, L[1:]):
        if not (current - previous) < tolerance:
            return False
    return True
338
339
def eval_error_metric(predt, dtrain: xgb.DMatrix):
    """Custom error metric: sum of per-row errors at a 0.5 threshold.

    For predictions above 0.5 the error is `1 - label`, otherwise it is the
    label itself.  Returns the pair `('CustomErr', value)`; the value is 0 for
    an empty prediction array.
    """
    label = dtrain.get_label()
    if predt.size == 0:
        return "CustomErr", 0

    errors = np.zeros(predt.shape)
    above = predt > 0.5
    below = predt <= 0.5
    errors[above] = 1 - label[above]
    errors[below] = label[below]
    return 'CustomErr', np.sum(errors)
350
351
def softmax(x):
    """Return the softmax of `x`, normalised over all of its elements.

    The maximum is subtracted before exponentiation so that large inputs do
    not overflow to inf (which previously produced NaN results).  The result
    is mathematically unchanged by this shift.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(x)
    e = np.exp(x - np.max(x))
    return e / np.sum(e)
355
356
def softprob_obj(classes):
    """Create a custom multi-class softmax objective for `classes` classes.

    The returned `objective(labels, predt)` computes, for every row and class,
    the gradient `p - 1` for the target class and `p` otherwise, and the
    hessian `max(2 * p * (1 - p), 1e-6)`, where `p` is the softmax of the
    row's predictions.  Both are returned reshaped to `(rows * classes, 1)`.
    """
    def objective(labels, predt):
        rows = labels.shape[0]
        grad = np.zeros((rows, classes), dtype=float)
        hess = np.zeros((rows, classes), dtype=float)
        eps = 1e-6  # lower bound that keeps the hessian strictly positive
        for r in range(predt.shape[0]):
            target = labels[r]
            # The previous check used `or` and was therefore always true;
            # valid class labels are 0 .. classes - 1.
            assert 0 <= target < classes
            p = softmax(predt[r, :])
            for c in range(predt.shape[1]):
                g = p[c] - 1.0 if c == target else p[c]
                h = max((2.0 * p[c] * (1.0 - p[c])).item(), eps)
                grad[r, c] = g
                hess[r, c] = h

        grad = grad.reshape((rows * classes, 1))
        hess = hess.reshape((rows * classes, 1))
        return grad, hess

    return objective
378
379
class DirectoryExcursion:
    """Context manager that temporarily changes the working directory."""

    def __init__(self, path: os.PathLike, cleanup=False):
        '''Change directory.  Change back and optionally clean up the directory on exit.

        Parameters
        ----------
        path :
            Directory to change into on enter.
        cleanup :
            When true, files that appear under `path` while the context is
            active are removed on exit.  Files that already existed are kept.

        '''
        self.path = path
        # Directory to return to: the working directory at construction time.
        self.curdir = os.path.normpath(os.path.abspath(os.path.curdir))
        self.cleanup = cleanup
        # Files present under `path` when the context was entered.
        self.files = set()
        self._root = None

    def _snapshot(self):
        # All files currently below the resolved target directory.
        return {
            os.path.join(root, f)
            for root, _, files in os.walk(self._root) for f in files
        }

    def __enter__(self):
        # Resolve the target before changing directory.  A relative `path`
        # would otherwise be walked relative to the new working directory,
        # giving a wrong snapshot and causing pre-existing files to be
        # deleted on exit.
        self._root = os.path.abspath(self.path)
        os.chdir(self._root)
        if self.cleanup:
            self.files = self._snapshot()
        return self

    def __exit__(self, *args):
        os.chdir(self.curdir)
        if self.cleanup:
            # Remove only the files created while the context was active.
            for f in self._snapshot().difference(self.files):
                os.remove(f)
408
409
@contextmanager
def captured_output():
    """Temporarily replace stdout and stderr with in-memory buffers.

    Yields the `(stdout, stderr)` StringIO pair so that printed output can be
    inspected; the original streams are restored on exit, even on error.

    Taken from:
    https://stackoverflow.com/questions/4219717/how-to-assert-output-with-nosetest-unittest-in-python

    Also works for pytest.

    """
    buffers = (StringIO(), StringIO())
    saved = (sys.stdout, sys.stderr)
    try:
        sys.stdout, sys.stderr = buffers
        yield sys.stdout, sys.stderr
    finally:
        sys.stdout, sys.stderr = saved
426
427
# `noop_context` is a context manager that does nothing; its implementation
# depends on what the running Python's contextlib provides.
try:
    # Python 3.7+
    from contextlib import nullcontext as noop_context
except ImportError:
    # Python 3.6: `suppress` called without exception types suppresses nothing.
    from contextlib import suppress as noop_context
434
435
# Absolute, normalised path of the directory containing this file.
CURDIR = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
# Directory two levels above CURDIR.
PROJECT_ROOT = os.path.normpath(
    os.path.join(CURDIR, os.path.pardir, os.path.pardir))
439