"""Tests for building ``xgboost.DMatrix`` objects from data iterators."""
import xgboost as xgb
from xgboost.data import SingleBatchInternalIter as SingleBatch
import numpy as np
from testing import IteratorForTest
from typing import Tuple, List
import pytest
from hypothesis import given, strategies, settings
from scipy.sparse import csr_matrix


def make_batches(
    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
) -> Tuple[List[np.ndarray], List[np.ndarray]]:
    """Generate ``n_batches`` random feature/label batches.

    Parameters
    ----------
    n_samples_per_batch :
        Number of rows in every batch.
    n_features :
        Number of columns in every feature batch.
    n_batches :
        How many batches to generate.
    use_cupy :
        Draw the batches with ``cupy.random`` instead of ``numpy.random``.

    Returns
    -------
    A pair of lists: feature batches of shape
    ``(n_samples_per_batch, n_features)`` and label batches of shape
    ``(n_samples_per_batch,)``.  The generator is seeded with 1994.
    """
    if use_cupy:
        # Imported lazily so the module can still be loaded without cupy.
        import cupy

        generator = cupy.random.RandomState(1994)
    else:
        generator = np.random.RandomState(1994)

    features = []
    labels = []
    for _ in range(n_batches):
        # Draw the features before the labels for each batch; the order of the
        # draws determines the generated values.
        features.append(generator.randn(n_samples_per_batch, n_features))
        labels.append(generator.randn(n_samples_per_batch))
    return features, labels


def test_single_batch(tree_method: str = "approx") -> None:
    """Check that a single-batch iterator yields the same model as plain input.

    The breast cancer dataset is fed through ``SingleBatch`` as a numpy array,
    a pandas data frame and a scipy CSR matrix, and the dumped models are
    compared against their reference counterparts.
    """
    from sklearn.datasets import load_breast_cancer

    n_rounds = 10

    def fit(dtrain: xgb.DMatrix) -> xgb.Booster:
        # Every model in this test is trained with the same configuration.
        return xgb.train(
            {"tree_method": tree_method}, dtrain, num_boost_round=n_rounds
        )

    # numpy input: single-batch iterator versus direct construction.
    X, y = load_breast_cancer(return_X_y=True)
    X = X.astype(np.float32)
    y = y.astype(np.float32)

    from_it = fit(xgb.DMatrix(SingleBatch(data=X, label=y)))
    from_dmat = fit(xgb.DMatrix(X, y))
    assert from_it.get_dump() == from_dmat.get_dump()

    # pandas input through the single-batch iterator.
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    X = X.astype(np.float32)
    from_pd = fit(xgb.DMatrix(SingleBatch(data=X, label=y)))
    # Drop the feature info so that the text dump matches the one produced
    # from the unnamed numpy input exactly.
    from_pd.feature_names = None
    from_pd.feature_types = None

    assert from_pd.get_dump() == from_it.get_dump()

    # CSR input through the single-batch iterator.
    X, y = load_breast_cancer(return_X_y=True)
    X = csr_matrix(X)
    from_it = fit(xgb.DMatrix(SingleBatch(data=X, label=y)))

    # Dense input with zeros declared as missing is compared with the CSR model.
    X, y = load_breast_cancer(return_X_y=True)
    from_np = fit(xgb.DMatrix(SingleBatch(data=X, label=y), missing=0.0))
    assert from_np.get_dump() == from_it.get_dump()


def run_data_iterator(
    n_samples_per_batch: int,
    n_features: int,
    n_batches: int,
    tree_method: str,
    use_cupy: bool,
) -> None:
    """Compare training on an iterator-backed DMatrix with an array-backed one.

    Parameters
    ----------
    n_samples_per_batch :
        Number of rows in every generated batch.
    n_features :
        Number of feature columns.
    n_batches :
        Number of batches; with zero batches, constructing the DMatrix is
        expected to raise ``ValueError``.
    tree_method :
        Tree method passed to ``xgb.train``.
    use_cupy :
        Generate the batches with cupy instead of numpy.
    """
    n_rounds = 2
    expected_rows = n_samples_per_batch * n_batches

    batches = make_batches(n_samples_per_batch, n_features, n_batches, use_cupy)
    it = IteratorForTest(*batches)

    if n_batches == 0:
        with pytest.raises(ValueError, match="1 batch"):
            xgb.DMatrix(it)
        return

    def fit_and_predict(dtrain: xgb.DMatrix) -> Tuple[dict, np.ndarray]:
        # Train on `dtrain`, evaluating on the training data itself, and
        # return the evaluation history together with the predictions.
        history: xgb.callback.EvaluationMonitor.EvalsLog = {}
        booster = xgb.train(
            {"tree_method": tree_method, "max_depth": 2},
            dtrain,
            num_boost_round=n_rounds,
            evals=[(dtrain, "Train")],
            evals_result=history,
            verbose_eval=False,
        )
        return history, booster.predict(dtrain)

    # Model trained on the DMatrix built from the iterator.
    dmat_from_it = xgb.DMatrix(it)
    assert dmat_from_it.num_row() == expected_rows
    assert dmat_from_it.num_col() == n_features
    results_from_it, it_predt = fit_and_predict(dmat_from_it)

    # Model trained on the DMatrix built from the iterator's data as arrays.
    X, y = it.as_arrays()
    dmat_from_arrays = xgb.DMatrix(X, y)
    assert dmat_from_arrays.num_row() == expected_rows
    assert dmat_from_arrays.num_col() == n_features
    results_from_arrays, arr_predt = fit_and_predict(dmat_from_arrays)

    if tree_method == "gpu_hist":
        # Model can be sensitive to quantiles, use 1e-2 to relax the test.
        np.testing.assert_allclose(it_predt, arr_predt, rtol=1e-2)
        rtol = 1e-6
    else:
        # Flaky for the other tree methods: predictions are not compared and
        # the metric tolerance is loose.
        rtol = 1e-1

    np.testing.assert_allclose(
        results_from_it["Train"]["rmse"],
        results_from_arrays["Train"]["rmse"],
        rtol=rtol,
    )


@given(
    strategies.integers(0, 1024), strategies.integers(1, 7), strategies.integers(0, 13)
)
@settings(deadline=None)
def test_data_iterator(
    n_samples_per_batch: int, n_features: int, n_batches: int
) -> None:
    """Run the iterator comparison for the "approx" and "hist" tree methods."""
    for tree_method in ("approx", "hist"):
        run_data_iterator(
            n_samples_per_batch, n_features, n_batches, tree_method, False
        )