from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification, make_regression
import numpy as np
import pytest

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator


@pytest.mark.parametrize("seed", range(5))
@pytest.mark.parametrize("min_samples_leaf", (1, 20))
@pytest.mark.parametrize(
    "n_samples, max_leaf_nodes",
    [
        (255, 4096),
        (1000, 8),
    ],
)
def test_same_predictions_regression(
    seed, min_samples_leaf, n_samples, max_leaf_nodes
):
    # Make sure sklearn has the same predictions as lightgbm for easy targets.
    #
    # In particular, when the size of the trees is bounded and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and sklearn should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore the
    #   predictions on the test set might differ if the structure of the tree
    #   is not exactly the same. To avoid this issue we only compare the
    #   predictions on the test set when the number of samples is large enough
    #   and max_leaf_nodes is low enough.
    # - To avoid discrepancies caused by small differences in the binning
    #   strategy, the data is pre-binned when n_samples > 255.
    # - We don't check the absolute_error loss here. This is because
    #   LightGBM's computation of the median (used for the initial value of
    #   raw_prediction) is a bit off (e.g. it returns midpoints when there is
    #   no need to). Since these tests only run 1 iteration, the discrepancy
    #   between the initial values leads to fairly large differences in the
    #   predictions. These differences are much smaller with more iterations.
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
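    # a single boosting iteration with learning_rate=1 (set below) means each
    # model's prediction is just its baseline value plus the output of one
    # tree, so structural differences between the trees show up directly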
    max_bins = 255

    X, y = make_regression(
        n_samples=n_samples, n_features=5, n_informative=5, random_state=0
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
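        # (n_bins=max_bins + 1 because sklearn reserves one extra bin for
        # missing values)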
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingRegressor(
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
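    # get_equivalent_estimator returns a lightgbm.LGBMRegressor whose
    # hyperparameters (e.g. num_leaves, min_data_in_leaf) mirror those of the
    # sklearn estimator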
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    # less than ~1% of the predictions may differ by more than 1e-3
    assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-3) < 0.011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        # less than 1% of the predictions may differ by more than 1e-4
        assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-4) < 0.01


@pytest.mark.parametrize("seed", range(5))
@pytest.mark.parametrize("min_samples_leaf", (1, 20))
@pytest.mark.parametrize(
    "n_samples, max_leaf_nodes",
    [
        (255, 4096),
        (1000, 8),
    ],
)
def test_same_predictions_classification(
    seed, min_samples_leaf, n_samples, max_leaf_nodes
):
    # Same as test_same_predictions_regression but for binary classification
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    n_classes = 2
    max_bins = 255

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="binary_crossentropy",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
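    # assert_almost_equal defaults to decimal=7, so the training accuracies
    # must match almost exactly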
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)


@pytest.mark.parametrize("seed", range(5))
@pytest.mark.parametrize("min_samples_leaf", (1, 20))
@pytest.mark.parametrize(
    "n_samples, max_leaf_nodes",
    [
        (255, 4096),
        (10000, 8),
    ],
)
def test_same_predictions_multiclass_classification(
    seed, min_samples_leaf, n_samples, max_leaf_nodes
):
    # Same as test_same_predictions_regression but for multiclass
    # classification
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    n_classes = 3
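    # with 3 classes, each boosting iteration fits one tree per class in both
    # libraries, so even a single iteration compares three trees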
    max_iter = 1
    max_bins = 255
    lr = 1

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        n_clusters_per_class=1,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="categorical_crossentropy",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=lr,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
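    # n_classes is forwarded so that the lightgbm estimator is set up with a
    # multiclass objective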
    est_lightgbm = get_equivalent_estimator(
        est_sklearn, lib="lightgbm", n_classes=n_classes
    )

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_sklearn = est_sklearn.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)

    np.testing.assert_allclose(acc_lightgbm, acc_sklearn, rtol=0, atol=5e-2)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_sklearn = est_sklearn.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)