import numpy as np
import pytest

from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
from sklearn.ensemble._hist_gradient_boosting.splitting import (
    Splitter,
    compute_node_value,
)
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads

n_threads = _openmp_effective_n_threads()


def is_increasing(a):
    return (np.diff(a) >= 0.0).all()


def is_decreasing(a):
    return (np.diff(a) <= 0.0).all()


def assert_leaves_values_monotonic(predictor, monotonic_cst):
    # make sure leaf values (from left to right) are either all increasing
    # or all decreasing (or neither) depending on the monotonic constraint.
    nodes = predictor.nodes

    def get_leaves_values():
        """get leaf values from left to right"""
        values = []

        def depth_first_collect_leaf_values(node_idx):
            node = nodes[node_idx]
            if node["is_leaf"]:
                values.append(node["value"])
                return
            depth_first_collect_leaf_values(node["left"])
            depth_first_collect_leaf_values(node["right"])

        depth_first_collect_leaf_values(0)  # start at root (0)
        return values

    values = get_leaves_values()

    if monotonic_cst == MonotonicConstraint.NO_CST:
        # some increasing, some decreasing
        assert not is_increasing(values) and not is_decreasing(values)
    elif monotonic_cst == MonotonicConstraint.POS:
        # all increasing
        assert is_increasing(values)
    else:  # NEG
        # all decreasing
        assert is_decreasing(values)


def assert_children_values_monotonic(predictor, monotonic_cst):
    # Make sure sibling values respect the monotonic constraints. The left
    # child should be lower (resp. greater) than the right child if the
    # constraint is POS (resp. NEG).
    # Note that this property alone isn't enough to ensure full monotonicity,
    # since we also need to guarantee that all the descendants of the left
    # child won't be greater (resp. lower) than the right child, or its
    # descendants. That's why we need to bound the predicted values (this is
    # tested in assert_children_values_bounded).
    nodes = predictor.nodes
    left_lower = []
    left_greater = []
    for node in nodes:
        if node["is_leaf"]:
            continue

        left_idx = node["left"]
        right_idx = node["right"]

        if nodes[left_idx]["value"] < nodes[right_idx]["value"]:
            left_lower.append(node)
        elif nodes[left_idx]["value"] > nodes[right_idx]["value"]:
            left_greater.append(node)

    if monotonic_cst == MonotonicConstraint.NO_CST:
        assert left_lower and left_greater
    elif monotonic_cst == MonotonicConstraint.POS:
        assert left_lower and not left_greater
    else:  # NEG
        assert not left_lower and left_greater
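

# A toy numeric illustration of the note above, assuming a POS constraint:
# with sibling values 5 and 10, a descendant of the left child at 9 and a
# descendant of the right child at 6 would keep each local sibling pair
# ordered, yet the leaves would read ..., 9, 6, ... from left to right and
# global monotonicity would be broken. Bounding both subtrees by the middle
# value (7.5 here) rules this out; assert_children_values_bounded below
# checks exactly that.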


def assert_children_values_bounded(grower, monotonic_cst):
    # Make sure that the values of the children of a node are bounded by the
    # middle value between that node and its sibling (if there is a monotonic
    # constraint).
    # As a bonus, we also check that the sibling values are properly ordered,
    # which is slightly redundant with assert_children_values_monotonic (but
    # this check is done on the grower nodes whereas
    # assert_children_values_monotonic is done on the predictor nodes).

    if monotonic_cst == MonotonicConstraint.NO_CST:
        return

    def recursively_check_children_node_values(node, right_sibling=None):
        if node.is_leaf:
            return
        if right_sibling is not None:
            middle = (node.value + right_sibling.value) / 2
            if monotonic_cst == MonotonicConstraint.POS:
                assert node.left_child.value <= node.right_child.value <= middle
                if not right_sibling.is_leaf:
                    assert (
                        middle
                        <= right_sibling.left_child.value
                        <= right_sibling.right_child.value
                    )
            else:  # NEG
                assert node.left_child.value >= node.right_child.value >= middle
                if not right_sibling.is_leaf:
                    assert (
                        middle
                        >= right_sibling.left_child.value
                        >= right_sibling.right_child.value
                    )

        recursively_check_children_node_values(
            node.left_child, right_sibling=node.right_child
        )
        recursively_check_children_node_values(node.right_child)

    recursively_check_children_node_values(grower.root)


@pytest.mark.parametrize("seed", range(3))
@pytest.mark.parametrize(
    "monotonic_cst",
    (
        MonotonicConstraint.NO_CST,
        MonotonicConstraint.POS,
        MonotonicConstraint.NEG,
    ),
)
def test_nodes_values(monotonic_cst, seed):
    # Build a single tree with only one feature, and make sure the node
    # values respect the monotonic constraints.

    # Considering the following tree with a monotonic POS constraint, we
    # should have:
    #
    #          root
    #         /    \
    #        5     10    # middle = 7.5
    #       / \   / \
    #      a   b c   d
    #
    # a <= b and c <= d  (assert_children_values_monotonic)
    # a, b <= middle <= c, d  (assert_children_values_bounded)
    # a <= b <= c <= d  (assert_leaves_values_monotonic)
    #
    # The last one is a consequence of the others, but it can't hurt to check.

    rng = np.random.RandomState(seed)
    n_samples = 1000
    n_features = 1
    X_binned = rng.randint(0, 255, size=(n_samples, n_features), dtype=np.uint8)
    X_binned = np.asfortranarray(X_binned)

    gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    grower = TreeGrower(
        X_binned, gradients, hessians, monotonic_cst=[monotonic_cst], shrinkage=0.1
    )
    grower.grow()

    # grow() will shrink the leaf values at the very end. For our comparison
    # tests, we need to revert the shrinkage of the leaves, else we would
    # compare the value of a leaf (shrunk) with a node (not shrunk) and the
    # test would not be correct.
    for leaf in grower.finalized_leaves:
        leaf.value /= grower.shrinkage

    # We pass undefined binning_thresholds because we won't use predict anyway
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((X_binned.shape[1], X_binned.max() + 1))
    )

    # The consistency of the bounds can only be checked on the tree grower
    # as the node bounds are not copied into the predictor tree. The
    # consistency checks on the values of node children and leaves can be
    # done either on the grower tree or on the predictor tree. We only
    # do those checks on the predictor tree as the latter is derived from
    # the former.
    assert_children_values_monotonic(predictor, monotonic_cst)
    assert_children_values_bounded(grower, monotonic_cst)
    assert_leaves_values_monotonic(predictor, monotonic_cst)
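

# The sketch below ties the low-level checks of test_nodes_values to the
# public estimator: it fits a single-iteration, single-feature model with a
# POS constraint and re-uses assert_leaves_values_monotonic on the resulting
# predictor. Note that it reaches into the private `_predictors` attribute,
# which is an assumption about the estimator's internals and may change
# across versions.
def test_leaves_values_monotonic_through_estimator():
    rng = np.random.RandomState(0)
    n_samples = 1000
    X = rng.rand(n_samples, 1)
    y = 5 * X[:, 0] + rng.normal(scale=0.1, size=n_samples)

    gbdt = HistGradientBoostingRegressor(max_iter=1, monotonic_cst=[1])
    gbdt.fit(X, y)

    # first (and only) boosting iteration, first (and only) output dimension
    predictor = gbdt._predictors[0][0]
    assert_leaves_values_monotonic(predictor, MonotonicConstraint.POS)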


@pytest.mark.parametrize("seed", range(3))
def test_predictions(seed):
    # Train a model with a POS constraint on the first feature and a NEG
    # constraint on the second feature, and make sure the constraints are
    # respected by checking the predictions.
    # Test adapted from lightgbm's test_monotone_constraint(), itself inspired
    # by https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html

    rng = np.random.RandomState(seed)

    n_samples = 1000
    f_0 = rng.rand(n_samples)  # positive correlation with y
    f_1 = rng.rand(n_samples)  # negative correlation with y
    X = np.c_[f_0, f_1]
    noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
    y = 5 * f_0 + np.sin(10 * np.pi * f_0) - 5 * f_1 - np.cos(10 * np.pi * f_1) + noise

    gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1])
    gbdt.fit(X, y)

    linspace = np.linspace(0, 1, 100)
    sin = np.sin(linspace)
    constant = np.full_like(linspace, fill_value=0.5)

    # We now assert that the predictions properly respect the constraints, on
    # each feature. When testing for a feature we need to set the other one
    # to a constant, because the monotonic constraints are only an "all else
    # being equal" type of constraints:
    # a constraint on the first feature only means that
    # x0 < x0' => f(x0, x1) < f(x0', x1)
    # while x1 stays constant.
    # The constraint does not guarantee that
    # x0 < x0' => f(x0, x1) < f(x0', x1')

    # First feature (POS)
    # assert pred is all increasing when f_0 is all increasing
    X = np.c_[linspace, constant]
    pred = gbdt.predict(X)
    assert is_increasing(pred)
    # assert pred actually follows the variations of f_0
    X = np.c_[sin, constant]
    pred = gbdt.predict(X)
    assert np.all((np.diff(pred) >= 0) == (np.diff(sin) >= 0))

    # Second feature (NEG)
    # assert pred is all decreasing when f_1 is all increasing
    X = np.c_[constant, linspace]
    pred = gbdt.predict(X)
    assert is_decreasing(pred)
    # assert pred actually follows the inverse variations of f_1
    X = np.c_[constant, sin]
    pred = gbdt.predict(X)
    assert ((np.diff(pred) <= 0) == (np.diff(sin) >= 0)).all()
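

# The grid-based checks above hold one feature at a constant while sweeping
# the other. The sketch below probes the same "all else being equal" property
# in a complementary way, on random pairs of points that differ in a single
# feature: bumping f_0 up can never decrease the prediction, and bumping f_1
# up can never increase it. The dataset and the bump size of 0.1 are
# illustrative choices, not taken from the tests above.
def test_predictions_pairwise():
    rng = np.random.RandomState(0)
    n_samples = 500
    X = rng.rand(n_samples, 2)
    y = X[:, 0] - X[:, 1] + rng.normal(scale=0.01, size=n_samples)

    gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1])
    gbdt.fit(X, y)

    X_test = rng.rand(100, 2)

    # increase f_0 (POS constraint), keep f_1 fixed: predictions cannot drop
    X_bumped = X_test.copy()
    X_bumped[:, 0] += 0.1
    assert np.all(gbdt.predict(X_bumped) >= gbdt.predict(X_test))

    # increase f_1 (NEG constraint), keep f_0 fixed: predictions cannot rise
    X_bumped = X_test.copy()
    X_bumped[:, 1] += 0.1
    assert np.all(gbdt.predict(X_bumped) <= gbdt.predict(X_test))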


def test_input_error():
    X = [[1, 2], [2, 3], [3, 4]]
    y = [0, 1, 2]

    gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, 0, -1])
    with pytest.raises(
        ValueError, match="monotonic_cst has shape 3 but the input data"
    ):
        gbdt.fit(X, y)

    for monotonic_cst in ([1, 3], [1, -3]):
        gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
        with pytest.raises(
            ValueError, match="must be None or an array-like of -1, 0 or 1"
        ):
            gbdt.fit(X, y)

    gbdt = HistGradientBoostingClassifier(monotonic_cst=[0, 1])
    with pytest.raises(
        ValueError,
        match="monotonic constraints are not supported for multiclass classification",
    ):
        gbdt.fit(X, y)


def test_bounded_value_min_gain_to_split():
    # The purpose of this test is to show that when computing the gain at a
    # given split, the value of the current node should be properly bounded
    # to respect the monotonic constraints, because it strongly interacts
    # with min_gain_to_split. We build a simple example where the gradients
    # are [1, 1, 100, 1, 1] (hessians are all ones). The best split happens
    # on the 3rd bin, and depending on whether the value of the node is
    # bounded or not, the min_gain_to_split constraint is or isn't satisfied.
    l2_regularization = 0
    min_hessian_to_split = 0
    min_samples_leaf = 1
    n_bins = n_samples = 5
    X_binned = np.arange(n_samples).reshape(-1, 1).astype(X_BINNED_DTYPE)
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
    all_gradients = np.array([1, 1, 100, 1, 1], dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = all_hessians.sum()
    hessians_are_constant = False

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )
    n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32)
    has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    children_lower_bound, children_upper_bound = -np.inf, np.inf

    min_gain_to_split = 2000
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    histograms = builder.compute_histograms_brute(sample_indices)

    # Since the gradient array is [1, 1, 100, 1, 1], the max possible gain
    # happens on the 3rd bin (or equivalently on the 2nd) and is equal to
    # about 1307, which is less than min_gain_to_split = 2000, so the node is
    # considered unsplittable (gain = -1)
    current_lower_bound, current_upper_bound = -np.inf, np.inf
    value = compute_node_value(
        sum_gradients,
        sum_hessians,
        current_lower_bound,
        current_upper_bound,
        l2_regularization,
    )
    # the unbounded value is equal to -sum_gradients / sum_hessians
    assert value == pytest.approx(-104 / 5)
    split_info = splitter.find_node_split(
        n_samples,
        histograms,
        sum_gradients,
        sum_hessians,
        value,
        lower_bound=children_lower_bound,
        upper_bound=children_upper_bound,
    )
    assert split_info.gain == -1  # min_gain_to_split not respected

    # Here again the max possible gain is on the 3rd bin, but we now cap the
    # value of the node into [-10, inf].
    # This means the gain is now about 2430, which is more than the
    # min_gain_to_split constraint.
    current_lower_bound, current_upper_bound = -10, np.inf
    value = compute_node_value(
        sum_gradients,
        sum_hessians,
        current_lower_bound,
        current_upper_bound,
        l2_regularization,
    )
    assert value == -10
    split_info = splitter.find_node_split(
        n_samples,
        histograms,
        sum_gradients,
        sum_hessians,
        value,
        lower_bound=children_lower_bound,
        upper_bound=children_upper_bound,
    )
    assert split_info.gain > min_gain_to_split
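

# The "about 1307" and "about 2430" figures quoted in
# test_bounded_value_min_gain_to_split can be re-derived by hand. The helper
# below is an illustrative sketch, not the library's internal Cython routine:
# it assumes gain = G_left**2 / H_left + G_right**2 / H_right + value * G,
# i.e. an XGBoost-style split gain without its 1/2 factor, with
# l2_regularization=0, and with the parent term computed from the node's
# (possibly bounded) value.
def _expected_gain(gradients, hessians, split_idx, node_value):
    # sums of gradients / hessians in the two children
    g_left = gradients[:split_idx].sum()
    h_left = hessians[:split_idx].sum()
    g_right = gradients[split_idx:].sum()
    h_right = hessians[split_idx:].sum()
    # loss of the current node, computed from its (possibly bounded) value
    loss_current_node = node_value * gradients.sum()
    return loss_current_node + g_left**2 / h_left + g_right**2 / h_right


def test_expected_gain_arithmetic():
    gradients = np.array([1, 1, 100, 1, 1], dtype=np.float64)
    hessians = np.ones_like(gradients)

    # split so that the large gradient (100) falls in the left child;
    # the unbounded node value is -sum_gradients / sum_hessians = -104 / 5
    gain = _expected_gain(gradients, hessians, split_idx=3, node_value=-104 / 5)
    assert gain == pytest.approx(1306.8)  # "about 1307", below 2000

    # with the node value capped at -10, the gain clears min_gain_to_split
    gain = _expected_gain(gradients, hessians, split_idx=3, node_value=-10)
    assert gain == pytest.approx(2430)  # "about 2430", above 2000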