1import xgboost as xgb
2from xgboost.data import SingleBatchInternalIter as SingleBatch
3import numpy as np
4from testing import IteratorForTest
5from typing import Tuple, List
6import pytest
7from hypothesis import given, strategies, settings
8from scipy.sparse import csr_matrix
9
10
def make_batches(
    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
) -> Tuple[List[np.ndarray], List[np.ndarray]]:
    """Create reproducible random feature/label batches.

    Parameters
    ----------
    n_samples_per_batch :
        Number of rows in every batch.
    n_features :
        Number of columns in every feature batch.
    n_batches :
        How many batches to generate; ``0`` yields two empty lists.
    use_cupy :
        Draw the samples with ``cupy.random`` instead of ``numpy.random``.

    Returns
    -------
    A ``(features, labels)`` pair of lists, each holding ``n_batches`` arrays
    drawn with ``randn`` from a ``RandomState`` seeded with 1994.
    """
    if use_cupy:
        import cupy

        random_module = cupy.random
    else:
        random_module = np.random
    generator = random_module.RandomState(1994)

    # Within a batch the features are drawn before the labels (tuple elements
    # are evaluated left to right), so the random stream is consumed in a
    # fixed, interleaved order.
    pairs = [
        (
            generator.randn(n_samples_per_batch, n_features),
            generator.randn(n_samples_per_batch),
        )
        for _ in range(n_batches)
    ]
    features = [pair[0] for pair in pairs]
    labels = [pair[1] for pair in pairs]
    return features, labels
28
29
def test_single_batch(tree_method: str = "approx") -> None:
    """Compare models trained from ``SingleBatch``-wrapped inputs.

    Using the breast cancer dataset, three pairs of boosters are trained and
    their text dumps are required to be equal:

    1. float32 numpy arrays wrapped in ``SingleBatch`` vs. the same arrays
       passed to ``xgb.DMatrix`` directly;
    2. a float32 pandas frame wrapped in ``SingleBatch`` vs. the numpy
       ``SingleBatch`` model (after clearing feature names/types);
    3. a CSR matrix wrapped in ``SingleBatch`` vs. the dense arrays wrapped in
       ``SingleBatch`` with ``missing=0.0``.
    """
    from sklearn.datasets import load_breast_cancer

    num_rounds = 10

    def fit(dmatrix: xgb.DMatrix) -> xgb.Booster:
        # A fresh parameter dict is built for every training run.
        return xgb.train(
            {"tree_method": tree_method}, dmatrix, num_boost_round=num_rounds
        )

    # --- dense float32 arrays: iterator-backed vs. direct construction ---
    data, target = load_breast_cancer(return_X_y=True)
    data = data.astype(np.float32)
    target = target.astype(np.float32)

    booster_iter = fit(xgb.DMatrix(SingleBatch(data=data, label=target)))
    booster_direct = fit(xgb.DMatrix(data, target))
    assert booster_iter.get_dump() == booster_direct.get_dump()

    # --- pandas input through the iterator ---
    frame, series = load_breast_cancer(return_X_y=True, as_frame=True)
    frame = frame.astype(np.float32)
    booster_pandas = fit(xgb.DMatrix(SingleBatch(data=frame, label=series)))
    # Drop the feature info so the text dump matches the one produced from
    # plain arrays exactly.
    booster_pandas.feature_names = None
    booster_pandas.feature_types = None

    assert booster_pandas.get_dump() == booster_iter.get_dump()

    # --- sparse input vs. dense input with zero marked as missing ---
    data, target = load_breast_cancer(return_X_y=True)
    sparse_data = csr_matrix(data)
    booster_sparse = fit(xgb.DMatrix(SingleBatch(data=sparse_data, label=target)))

    data, target = load_breast_cancer(return_X_y=True)
    # NOTE(review): missing=0.0 presumably makes dense zeros behave like the
    # entries absent from the CSR matrix -- confirm against xgboost docs.
    dense_dmatrix = xgb.DMatrix(SingleBatch(data=data, label=target), missing=0.0)
    booster_dense = fit(dense_dmatrix)
    assert booster_dense.get_dump() == booster_sparse.get_dump()
64
65
def run_data_iterator(
    n_samples_per_batch: int,
    n_features: int,
    n_batches: int,
    tree_method: str,
    use_cupy: bool,
) -> None:
    """Train from a batch iterator and from its concatenated arrays, then compare.

    Parameters
    ----------
    n_samples_per_batch, n_features, n_batches :
        Shape of the random data produced by ``make_batches``.
    tree_method :
        Value for the ``tree_method`` training parameter.
    use_cupy :
        Forwarded to ``make_batches`` to select cupy-generated batches.

    With zero batches, constructing the ``DMatrix`` must raise a ``ValueError``
    whose message matches ``"1 batch"``; nothing else is checked in that case.
    Otherwise the training RMSE histories of both models must agree, and for
    ``"gpu_hist"`` the predictions are compared as well.
    """
    num_rounds = 2

    batch_features, batch_labels = make_batches(
        n_samples_per_batch, n_features, n_batches, use_cupy
    )
    batch_iter = IteratorForTest(batch_features, batch_labels)

    if n_batches == 0:
        with pytest.raises(ValueError, match="1 batch"):
            xgb.DMatrix(batch_iter)
        return

    expected_rows = n_samples_per_batch * n_batches

    def check_and_fit(dmatrix: xgb.DMatrix) -> tuple:
        """Validate the matrix shape, train on it, return (predictions, log)."""
        assert dmatrix.num_row() == expected_rows
        assert dmatrix.num_col() == n_features

        history: xgb.callback.EvaluationMonitor.EvalsLog = {}
        booster = xgb.train(
            {"tree_method": tree_method, "max_depth": 2},
            dmatrix,
            num_boost_round=num_rounds,
            evals=[(dmatrix, "Train")],
            evals_result=history,
            verbose_eval=False,
        )
        return booster.predict(dmatrix), history

    # Model built from the iterator.
    iter_predt, iter_history = check_and_fit(xgb.DMatrix(batch_iter))

    # Model built from the same data materialised as whole arrays.
    full_X, full_y = batch_iter.as_arrays()
    array_predt, array_history = check_and_fit(xgb.DMatrix(full_X, full_y))

    if tree_method == "gpu_hist":
        # Model can be sensitive to quantiles, use 1e-2 to relax the test.
        np.testing.assert_allclose(iter_predt, array_predt, rtol=1e-2)
        rmse_rtol = 1e-6
    else:
        # Flaky for the other tree methods: predictions are not compared and
        # the metric tolerance is loosened.
        rmse_rtol = 1e-1

    np.testing.assert_allclose(
        iter_history["Train"]["rmse"],
        array_history["Train"]["rmse"],
        rtol=rmse_rtol,
    )
126
127
@given(
    strategies.integers(0, 1024),
    strategies.integers(1, 7),
    strategies.integers(0, 13),
)
@settings(deadline=None)
def test_data_iterator(
    n_samples_per_batch: int, n_features: int, n_batches: int
) -> None:
    """Property test: run the iterator comparison on CPU for each tree method.

    Hypothesis supplies 0-1024 samples per batch, 1-7 features and 0-13
    batches; each drawn combination is run with ``"approx"`` and then
    ``"hist"``, without cupy.
    """
    for method in ("approx", "hist"):
        run_data_iterator(n_samples_per_batch, n_features, n_batches, method, False)
137