from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification, make_regression
import numpy as np
import pytest

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator


@pytest.mark.parametrize("seed", range(5))
@pytest.mark.parametrize("min_samples_leaf", (1, 20))
@pytest.mark.parametrize(
    "n_samples, max_leaf_nodes",
    [
        (255, 4096),
        (1000, 8),
    ],
)
def test_same_predictions_regression(seed, min_samples_leaf, n_samples, max_leaf_nodes):
    # Make sure sklearn has the same predictions as lightgbm for easy targets.
    #
    # In particular when the size of the trees is bounded and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and sklearn should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore the
    #   predictions on the test set might differ if the structure of the tree
    #   is not exactly the same. To avoid this issue we only compare the
    #   predictions on the test set when the number of samples is large enough
    #   and max_leaf_nodes is low enough.
    # - To ignore discrepancies caused by small differences in the binning
    #   strategy, data is pre-binned if n_samples > 255.
    # - We don't check the absolute_error loss here. This is because
    #   LightGBM's computation of the median (used for the initial value of
    #   raw_prediction) is a bit off (they'll e.g. return midpoints when there
    #   is no need to.). Since these tests only run 1 iteration, the
    #   discrepancy between the initial values leads to biggish differences in
    #   the predictions. These differences are much smaller with more
    #   iterations.
    pytest.importorskip("lightgbm")

    # `seed` only drives the train/test split; the dataset itself is fixed
    # (random_state=0 below).
    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_regression(
        n_samples=n_samples, n_features=5, n_informative=5, random_state=0
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingRegressor(
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    # at most ~1% (strictly less than 1.1%) of the predictions are different
    # up to the 3rd decimal
    assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-3) < 0.011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        # less than 1% of the predictions are different up to the 4th decimal
        assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-4) < 0.01


@pytest.mark.parametrize("seed", range(5))
@pytest.mark.parametrize("min_samples_leaf", (1, 20))
@pytest.mark.parametrize(
    "n_samples, max_leaf_nodes",
    [
        (255, 4096),
        (1000, 8),
    ],
)
def test_same_predictions_classification(
    seed, min_samples_leaf, n_samples, max_leaf_nodes
):
    # Same as test_same_predictions_regression but for binary classification
    pytest.importorskip("lightgbm")

    # `seed` only drives the train/test split; the dataset itself is fixed
    # (random_state=0 below).
    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    n_classes = 2
    max_bins = 255

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="binary_crossentropy",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    # more than 89% of the predicted labels agree on the training set
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    # Held-out predictions are only compared when the trees are small and the
    # dataset is large (see the notes in test_same_predictions_regression).
    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)


@pytest.mark.parametrize("seed", range(5))
@pytest.mark.parametrize("min_samples_leaf", (1, 20))
@pytest.mark.parametrize(
    "n_samples, max_leaf_nodes",
    [
        (255, 4096),
        (10000, 8),
    ],
)
def test_same_predictions_multiclass_classification(
    seed, min_samples_leaf, n_samples, max_leaf_nodes
):
    # Same as test_same_predictions_regression but for multiclass
    # classification
    pytest.importorskip("lightgbm")

    # `seed` only drives the train/test split; the dataset itself is fixed
    # (random_state=0 below).
    rng = np.random.RandomState(seed=seed)
    n_classes = 3
    max_iter = 1
    max_bins = 255
    lr = 1

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        n_clusters_per_class=1,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="categorical_crossentropy",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=lr,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(
        est_sklearn, lib="lightgbm", n_classes=n_classes
    )

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_sklearn = est_sklearn.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)

    np.testing.assert_allclose(acc_lightgbm, acc_sklearn, rtol=0, atol=5e-2)

    # Held-out predictions are only compared when the trees are small and the
    # dataset is large (see the notes in test_same_predictions_regression).
    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        # Probabilities must be evaluated on the held-out set here: the
        # training-set probabilities were already compared above.
        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_sklearn = est_sklearn.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)