import numpy as np
import pytest

from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
from sklearn.ensemble._hist_gradient_boosting.splitting import (
    Splitter,
    compute_node_value,
)
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads

n_threads = _openmp_effective_n_threads()


def is_increasing(a):
    return (np.diff(a) >= 0.0).all()


def is_decreasing(a):
    return (np.diff(a) <= 0.0).all()


def assert_leaves_values_monotonic(predictor, monotonic_cst):
    # Make sure the leaf values (from left to right) are either all increasing
    # or all decreasing (or neither), depending on the monotonic constraint.
    nodes = predictor.nodes

    def get_leaves_values():
        """Get the leaf values, from left to right."""
        values = []

        def depth_first_collect_leaf_values(node_idx):
            node = nodes[node_idx]
            if node["is_leaf"]:
                values.append(node["value"])
                return
            depth_first_collect_leaf_values(node["left"])
            depth_first_collect_leaf_values(node["right"])

        depth_first_collect_leaf_values(0)  # start at root (0)
        return values

    values = get_leaves_values()

    if monotonic_cst == MonotonicConstraint.NO_CST:
        # some increasing, some decreasing
        assert not is_increasing(values) and not is_decreasing(values)
    elif monotonic_cst == MonotonicConstraint.POS:
        # all increasing
        assert is_increasing(values)
    else:  # NEG
        # all decreasing
        assert is_decreasing(values)


def assert_children_values_monotonic(predictor, monotonic_cst):
    # Make sure sibling values respect the monotonic constraints: the left
    # child should be lower (resp. greater) than the right child if the
    # constraint is POS (resp. NEG).
    # Note that this property alone isn't enough to ensure full monotonicity,
    # since we also need to guarantee that all the descendants of the left
    # child won't be greater (resp. lower) than the right child, or its
    # descendants. That's why we need to bound the predicted values (this is
    # tested in assert_children_values_bounded).
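    # As a purely hypothetical illustration (numbers not taken from a real
    # tree): with a POS constraint, a left child valued at 5 whose own right
    # child reaches 9, next to a right sibling valued at 8, satisfies the
    # sibling ordering (5 <= 8) but breaks leaf monotonicity (9 > 8); bounding
    # the left child's descendants by the siblings' middle value (6.5) is what
    # prevents this.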
    nodes = predictor.nodes
    left_lower = []
    left_greater = []
    for node in nodes:
        if node["is_leaf"]:
            continue

        left_idx = node["left"]
        right_idx = node["right"]

        if nodes[left_idx]["value"] < nodes[right_idx]["value"]:
            left_lower.append(node)
        elif nodes[left_idx]["value"] > nodes[right_idx]["value"]:
            left_greater.append(node)

    if monotonic_cst == MonotonicConstraint.NO_CST:
        assert left_lower and left_greater
    elif monotonic_cst == MonotonicConstraint.POS:
        assert left_lower and not left_greater
    else:  # NEG
        assert not left_lower and left_greater


def assert_children_values_bounded(grower, monotonic_cst):
    # Make sure that the values of the children of a node are bounded by the
    # middle value between that node and its sibling (if there is a monotonic
    # constraint).
    # As a bonus, we also check that the sibling values are properly ordered,
    # which is slightly redundant with assert_children_values_monotonic (but
    # this check is done on the grower nodes whereas
    # assert_children_values_monotonic is done on the predictor nodes).

    if monotonic_cst == MonotonicConstraint.NO_CST:
        return

    def recursively_check_children_node_values(node, right_sibling=None):
        if node.is_leaf:
            return
        if right_sibling is not None:
            middle = (node.value + right_sibling.value) / 2
            if monotonic_cst == MonotonicConstraint.POS:
                assert node.left_child.value <= node.right_child.value <= middle
                if not right_sibling.is_leaf:
                    assert (
                        middle
                        <= right_sibling.left_child.value
                        <= right_sibling.right_child.value
                    )
            else:  # NEG
                assert node.left_child.value >= node.right_child.value >= middle
                if not right_sibling.is_leaf:
                    assert (
                        middle
                        >= right_sibling.left_child.value
                        >= right_sibling.right_child.value
                    )

        recursively_check_children_node_values(
            node.left_child, right_sibling=node.right_child
        )
        recursively_check_children_node_values(node.right_child)

    recursively_check_children_node_values(grower.root)


@pytest.mark.parametrize("seed", range(3))
@pytest.mark.parametrize(
    "monotonic_cst",
    (
        MonotonicConstraint.NO_CST,
        MonotonicConstraint.POS,
        MonotonicConstraint.NEG,
    ),
)
def test_nodes_values(monotonic_cst, seed):
    # Build a single tree with only one feature, and make sure the node
    # values respect the monotonic constraints.

    # Considering the following tree with a monotonic POS constraint, we
    # should have:
    #
    #       root
    #      /    \
    #     5     10    # middle = 7.5
    #    / \   / \
    #   a  b  c  d
    #
    # a <= b and c <= d  (assert_children_values_monotonic)
    # a, b <= middle <= c, d (assert_children_values_bounded)
    # a <= b <= c <= d (assert_leaves_values_monotonic)
    #
    # The last one is a consequence of the others, but it can't hurt to check.

    rng = np.random.RandomState(seed)
    n_samples = 1000
    n_features = 1
    X_binned = rng.randint(0, 255, size=(n_samples, n_features), dtype=np.uint8)
    X_binned = np.asfortranarray(X_binned)

    gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    grower = TreeGrower(
        X_binned, gradients, hessians, monotonic_cst=[monotonic_cst], shrinkage=0.1
    )
    grower.grow()

    # grow() will shrink the leaf values at the very end. For our comparison
    # tests, we need to revert the shrinkage of the leaves, otherwise we would
    # compare the value of a leaf (shrunk) with a node (not shrunk) and the
    # test would not be correct.
    for leaf in grower.finalized_leaves:
        leaf.value /= grower.shrinkage

    # We pass undefined binning_thresholds because we won't use predict anyway.
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((X_binned.shape[1], X_binned.max() + 1))
    )

    # The consistency of the bounds can only be checked on the tree grower
    # as the node bounds are not copied into the predictor tree. The
    # consistency checks on the values of node children and leaves can be
    # done either on the grower tree or on the predictor tree. We only
    # do those checks on the predictor tree as the latter is derived from
    # the former.
    assert_children_values_monotonic(predictor, monotonic_cst)
    assert_children_values_bounded(grower, monotonic_cst)
    assert_leaves_values_monotonic(predictor, monotonic_cst)


@pytest.mark.parametrize("seed", range(3))
def test_predictions(seed):
    # Train a model with a POS constraint on the first feature and a NEG
    # constraint on the second feature, and make sure the constraints are
    # respected by checking the predictions.
    # Test adapted from lightgbm's test_monotone_constraint(), itself inspired
    # by https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html

    rng = np.random.RandomState(seed)

    n_samples = 1000
    f_0 = rng.rand(n_samples)  # positive correlation with y
    f_1 = rng.rand(n_samples)  # negative correlation with y
    X = np.c_[f_0, f_1]
    noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
    y = 5 * f_0 + np.sin(10 * np.pi * f_0) - 5 * f_1 - np.cos(10 * np.pi * f_1) + noise

    gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1])
    gbdt.fit(X, y)

    linspace = np.linspace(0, 1, 100)
    sin = np.sin(linspace)
    constant = np.full_like(linspace, fill_value=0.5)

    # We now assert that the predictions properly respect the constraints, on
    # each feature. When testing for a feature we need to set the other one to
    # a constant, because the monotonic constraints are only an "all else
    # being equal" type of constraint:
    # a constraint on the first feature only means that
    # x0 < x0' => f(x0, x1) <= f(x0', x1)
    # while x1 stays constant.
    # The constraint does not guarantee that
    # x0 < x0' => f(x0, x1) <= f(x0', x1')
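    # For instance (purely illustrative numbers), even with the POS constraint
    # on the first feature, f(0.2, 0.1) may exceed f(0.3, 0.9) because the
    # second, NEG-constrained feature changed as well.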

    # First feature (POS)
    # assert pred is all increasing when f_0 is all increasing
    X = np.c_[linspace, constant]
    pred = gbdt.predict(X)
    assert is_increasing(pred)
    # assert pred actually follows the variations of f_0
    X = np.c_[sin, constant]
    pred = gbdt.predict(X)
    assert np.all((np.diff(pred) >= 0) == (np.diff(sin) >= 0))

    # Second feature (NEG)
    # assert pred is all decreasing when f_1 is all increasing
    X = np.c_[constant, linspace]
    pred = gbdt.predict(X)
    assert is_decreasing(pred)
    # assert pred actually follows the inverse variations of f_1
    X = np.c_[constant, sin]
    pred = gbdt.predict(X)
    assert ((np.diff(pred) <= 0) == (np.diff(sin) >= 0)).all()


def test_input_error():
    X = [[1, 2], [2, 3], [3, 4]]
    y = [0, 1, 2]

    gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, 0, -1])
    with pytest.raises(
        ValueError, match="monotonic_cst has shape 3 but the input data"
    ):
        gbdt.fit(X, y)

    for monotonic_cst in ([1, 3], [1, -3]):
        gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
        with pytest.raises(
            ValueError, match="must be None or an array-like of -1, 0 or 1"
        ):
            gbdt.fit(X, y)

    gbdt = HistGradientBoostingClassifier(monotonic_cst=[0, 1])
    with pytest.raises(
        ValueError,
        match="monotonic constraints are not supported for multiclass classification",
    ):
        gbdt.fit(X, y)


def test_bounded_value_min_gain_to_split():
    # The purpose of this test is to show that when computing the gain at a
    # given split, the value of the current node should be properly bounded to
    # respect the monotonic constraints, because it strongly interacts with
    # min_gain_to_split. We build a simple example where gradients are [1, 1,
    # 100, 1, 1] (hessians are all ones). The best split happens on the 3rd
    # bin, and depending on whether the value of the node is bounded or not,
    # the min_gain_to_split constraint is or isn't satisfied.
    l2_regularization = 0
    min_hessian_to_split = 0
    min_samples_leaf = 1
    n_bins = n_samples = 5
    X_binned = np.arange(n_samples).reshape(-1, 1).astype(X_BINNED_DTYPE)
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
    all_gradients = np.array([1, 1, 100, 1, 1], dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = all_hessians.sum()
    hessians_are_constant = False

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )
    n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32)
    has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    children_lower_bound, children_upper_bound = -np.inf, np.inf

    min_gain_to_split = 2000
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    histograms = builder.compute_histograms_brute(sample_indices)

    # Since the gradient array is [1, 1, 100, 1, 1],
    # the max possible gain happens on the 3rd bin (or equivalently on the 2nd)
    # and is equal to about 1307, which is less than min_gain_to_split = 2000,
    # so the node is considered unsplittable (gain = -1).
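    # A rough sanity check of that figure (assuming the standard second-order
    # gain G_L**2 / H_L + G_R**2 / H_R - G**2 / H, with no l2 term since
    # l2_regularization=0): splitting after the 3rd bin gives
    # G_L = 102, H_L = 3, G_R = 2, H_R = 2, G = 104, H = 5, so the gain is
    # 102**2 / 3 + 2**2 / 2 - 104**2 / 5 = 3468 + 2 - 2163.2 ~= 1307.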
    current_lower_bound, current_upper_bound = -np.inf, np.inf
    value = compute_node_value(
        sum_gradients,
        sum_hessians,
        current_lower_bound,
        current_upper_bound,
        l2_regularization,
    )
    # the unbounded value is equal to -sum_gradients / sum_hessians
    assert value == pytest.approx(-104 / 5)
    split_info = splitter.find_node_split(
        n_samples,
        histograms,
        sum_gradients,
        sum_hessians,
        value,
        lower_bound=children_lower_bound,
        upper_bound=children_upper_bound,
    )
    assert split_info.gain == -1  # min_gain_to_split not respected

    # Here again the max possible gain is on the 3rd bin, but we now clip the
    # value of the node to [-10, inf].
    # This means the gain is now about 2430, which is more than the
    # min_gain_to_split constraint.
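    # Rough arithmetic for that figure (assuming the parent's loss is
    # evaluated at its clipped value, i.e. roughly sum_gradients * value):
    # the gain becomes 102**2 / 3 + 2**2 / 2 - 104 * 10 = 3468 + 2 - 1040
    # = 2430, which exceeds min_gain_to_split = 2000.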
    current_lower_bound, current_upper_bound = -10, np.inf
    value = compute_node_value(
        sum_gradients,
        sum_hessians,
        current_lower_bound,
        current_upper_bound,
        l2_regularization,
    )
    assert value == -10
    split_info = splitter.find_node_split(
        n_samples,
        histograms,
        sum_gradients,
        sum_hessians,
        value,
        lower_bound=children_lower_bound,
        upper_bound=children_upper_bound,
    )
    assert split_info.gain > min_gain_to_split