"""Tests for the histogram-based splitting routines of
sklearn's hist gradient boosting (Splitter / HistogramBuilder).

NOTE(review): this file had been whitespace-mangled (all statements fused onto
a handful of physical lines, which is not valid Python). The code below is the
same token stream restored to conventional formatting; no logic was changed.
"""
import numpy as np
import pytest
from numpy.testing import assert_array_equal

from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
from sklearn.ensemble._hist_gradient_boosting.splitting import (
    Splitter,
    compute_node_value,
)
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
from sklearn.utils._testing import skip_if_32bit
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads

n_threads = _openmp_effective_n_threads()


@pytest.mark.parametrize("n_bins", [3, 32, 256])
def test_histogram_split(n_bins):
    # For each candidate bin, build gradients that flip sign exactly at that
    # bin and check that the splitter recovers it as the best split point.
    rng = np.random.RandomState(42)
    feature_idx = 0
    l2_regularization = 0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0
    X_binned = np.asfortranarray(
        rng.randint(0, n_bins - 1, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE
    )
    binned_feature = X_binned.T[feature_idx]
    sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32)
    ordered_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE)
    all_hessians = ordered_hessians
    sum_hessians = all_hessians.sum()
    hessians_are_constant = False

    for true_bin in range(1, n_bins - 2):
        for sign in [-1, 1]:
            # Gradients are -sign for bins <= true_bin, +sign above, so the
            # optimal cut is exactly at true_bin.
            ordered_gradients = np.full_like(binned_feature, sign, dtype=G_H_DTYPE)
            ordered_gradients[binned_feature <= true_bin] *= -1
            all_gradients = ordered_gradients
            sum_gradients = all_gradients.sum()

            builder = HistogramBuilder(
                X_binned,
                n_bins,
                all_gradients,
                all_hessians,
                hessians_are_constant,
                n_threads,
            )
            n_bins_non_missing = np.array(
                [n_bins - 1] * X_binned.shape[1], dtype=np.uint32
            )
            has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
            monotonic_cst = np.array(
                [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
            )
            is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
            missing_values_bin_idx = n_bins - 1
            splitter = Splitter(
                X_binned,
                n_bins_non_missing,
                missing_values_bin_idx,
                has_missing_values,
                is_categorical,
                monotonic_cst,
                l2_regularization,
                min_hessian_to_split,
                min_samples_leaf,
                min_gain_to_split,
                hessians_are_constant,
            )

            histograms = builder.compute_histograms_brute(sample_indices)
            value = compute_node_value(
                sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
            )
            split_info = splitter.find_node_split(
                sample_indices.shape[0], histograms, sum_gradients, sum_hessians, value
            )

            assert split_info.bin_idx == true_bin
            assert split_info.gain >= 0
            assert split_info.feature_idx == feature_idx
            assert (
                split_info.n_samples_left + split_info.n_samples_right
                == sample_indices.shape[0]
            )
            # Constant hessian: 1. per sample.
            assert split_info.n_samples_left == split_info.sum_hessian_left


@skip_if_32bit
@pytest.mark.parametrize("constant_hessian", [True, False])
def test_gradient_and_hessian_sanity(constant_hessian):
    # This test checks that the values of gradients and hessians are
    # consistent in different places:
    # - in split_info: si.sum_gradient_left + si.sum_gradient_right must be
    #   equal to the gradient at the node. Same for hessians.
    # - in the histograms: summing 'sum_gradients' over the bins must be
    #   constant across all features, and those sums must be equal to the
    #   node's gradient. Same for hessians.
    rng = np.random.RandomState(42)

    n_bins = 10
    n_features = 20
    n_samples = 500
    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0

    X_binned = rng.randint(
        0, n_bins, size=(n_samples, n_features), dtype=X_BINNED_DTYPE
    )
    X_binned = np.asfortranarray(X_binned)
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    if constant_hessian:
        # A single-element hessian array signals the constant-hessian path.
        all_hessians = np.ones(1, dtype=G_H_DTYPE)
        sum_hessians = 1 * n_samples
    else:
        all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
        sum_hessians = all_hessians.sum()

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, constant_hessian, n_threads
    )
    n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32)
    has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        constant_hessian,
    )

    hists_parent = builder.compute_histograms_brute(sample_indices)
    value_parent = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    si_parent = splitter.find_node_split(
        n_samples, hists_parent, sum_gradients, sum_hessians, value_parent
    )
    sample_indices_left, sample_indices_right, _ = splitter.split_indices(
        si_parent, sample_indices
    )

    hists_left = builder.compute_histograms_brute(sample_indices_left)
    value_left = compute_node_value(
        si_parent.sum_gradient_left,
        si_parent.sum_hessian_left,
        -np.inf,
        np.inf,
        l2_regularization,
    )
    hists_right = builder.compute_histograms_brute(sample_indices_right)
    value_right = compute_node_value(
        si_parent.sum_gradient_right,
        si_parent.sum_hessian_right,
        -np.inf,
        np.inf,
        l2_regularization,
    )
    si_left = splitter.find_node_split(
        n_samples,
        hists_left,
        si_parent.sum_gradient_left,
        si_parent.sum_hessian_left,
        value_left,
    )
    si_right = splitter.find_node_split(
        n_samples,
        hists_right,
        si_parent.sum_gradient_right,
        si_parent.sum_hessian_right,
        value_right,
    )

    # make sure that si.sum_gradient_left + si.sum_gradient_right have their
    # expected value, same for hessians
    for si, indices in (
        (si_parent, sample_indices),
        (si_left, sample_indices_left),
        (si_right, sample_indices_right),
    ):
        gradient = si.sum_gradient_right + si.sum_gradient_left
        expected_gradient = all_gradients[indices].sum()
        hessian = si.sum_hessian_right + si.sum_hessian_left
        if constant_hessian:
            expected_hessian = indices.shape[0] * all_hessians[0]
        else:
            expected_hessian = all_hessians[indices].sum()

        assert np.isclose(gradient, expected_gradient)
        assert np.isclose(hessian, expected_hessian)

    # make sure sum of gradients in histograms are the same for all features,
    # and make sure they're equal to their expected value
    hists_parent = np.asarray(hists_parent, dtype=HISTOGRAM_DTYPE)
    hists_left = np.asarray(hists_left, dtype=HISTOGRAM_DTYPE)
    hists_right = np.asarray(hists_right, dtype=HISTOGRAM_DTYPE)
    for hists, indices in (
        (hists_parent, sample_indices),
        (hists_left, sample_indices_left),
        (hists_right, sample_indices_right),
    ):
        # note: gradients and hessians have shape (n_features,),
        # we're comparing them to *scalars*. This has the benefit of also
        # making sure that all the entries are equal across features.
        gradients = hists["sum_gradients"].sum(axis=1)  # shape = (n_features,)
        expected_gradient = all_gradients[indices].sum()  # scalar
        hessians = hists["sum_hessians"].sum(axis=1)
        if constant_hessian:
            # 0 is not the actual hessian, but it's not computed in this case
            expected_hessian = 0.0
        else:
            expected_hessian = all_hessians[indices].sum()

        assert np.allclose(gradients, expected_gradient)
        assert np.allclose(hessians, expected_hessian)


def test_split_indices():
    # Check that split_indices returns the correct splits and that
    # splitter.partition is consistent with what is returned.
    rng = np.random.RandomState(421)

    n_bins = 5
    n_samples = 10
    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0

    # split will happen on feature 1 and on bin 3
    X_binned = [
        [0, 0],
        [0, 3],
        [0, 4],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 4],
        [0, 0],
        [0, 4],
    ]
    X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE)
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = 1 * n_samples
    hessians_are_constant = True

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )
    n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32)
    has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    assert np.all(sample_indices == splitter.partition)

    histograms = builder.compute_histograms_brute(sample_indices)
    value = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    si_root = splitter.find_node_split(
        n_samples, histograms, sum_gradients, sum_hessians, value
    )

    # sanity checks for best split
    assert si_root.feature_idx == 1
    assert si_root.bin_idx == 3

    samples_left, samples_right, position_right = splitter.split_indices(
        si_root, splitter.partition
    )
    assert set(samples_left) == set([0, 1, 3, 4, 5, 6, 8])
    assert set(samples_right) == set([2, 7, 9])

    assert list(samples_left) == list(splitter.partition[:position_right])
    assert list(samples_right) == list(splitter.partition[position_right:])

    # Check that the resulting split indices sizes are consistent with the
    # count statistics anticipated when looking for the best split.
    assert samples_left.shape[0] == si_root.n_samples_left
    assert samples_right.shape[0] == si_root.n_samples_right


def test_min_gain_to_split():
    # Try to split a pure node (all gradients are equal, same for hessians)
    # with min_gain_to_split = 0 and make sure that the node is not split (best
    # possible gain = -1). Note: before the strict inequality comparison, this
    # test would fail because the node would be split with a gain of 0.
    rng = np.random.RandomState(42)
    l2_regularization = 0
    min_hessian_to_split = 0
    min_samples_leaf = 1
    min_gain_to_split = 0.0
    n_bins = 255
    n_samples = 100
    X_binned = np.asfortranarray(
        rng.randint(0, n_bins, size=(n_samples, 1)), dtype=X_BINNED_DTYPE
    )
    binned_feature = X_binned[:, 0]
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE)
    all_gradients = np.ones_like(binned_feature, dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = all_hessians.sum()
    hessians_are_constant = False

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )
    n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32)
    has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    histograms = builder.compute_histograms_brute(sample_indices)
    value = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    split_info = splitter.find_node_split(
        n_samples, histograms, sum_gradients, sum_hessians, value
    )
    assert split_info.gain == -1


@pytest.mark.parametrize(
    "X_binned, all_gradients, has_missing_values, n_bins_non_missing, "
    " expected_split_on_nan, expected_bin_idx, expected_go_to_left",
    [
        # basic sanity check with no missing values: given the gradient
        # values, the split must occur on bin_idx=3
        (
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],  # X_binned
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],  # gradients
            False,  # no missing values
            10,  # n_bins_non_missing
            False,  # don't split on nans
            3,  # expected_bin_idx
            "not_applicable",
        ),
        # We replace 2 samples by NaNs (bin_idx=8)
        # These 2 samples were mapped to the left node before, so they should
        # be mapped to left node again
        # Notice how the bin_idx threshold changes from 3 to 1.
        (
            [8, 0, 1, 8, 2, 3, 4, 5, 6, 7],  # 8 <=> missing
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
            True,  # missing values
            8,  # n_bins_non_missing
            False,  # don't split on nans
            1,  # cut on bin_idx=1
            True,
        ),  # missing values go to left
        # same as above, but with non-consecutive missing_values_bin
        (
            [9, 0, 1, 9, 2, 3, 4, 5, 6, 7],  # 9 <=> missing
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
            True,  # missing values
            8,  # n_bins_non_missing
            False,  # don't split on nans
            1,  # cut on bin_idx=1
            True,
        ),  # missing values go to left
        # this time replacing 2 samples that were on the right.
        (
            [0, 1, 2, 3, 8, 4, 8, 5, 6, 7],  # 8 <=> missing
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
            True,  # missing values
            8,  # n_bins_non_missing
            False,  # don't split on nans
            3,  # cut on bin_idx=3 (like in first case)
            False,
        ),  # missing values go to right
        # same as above, but with non-consecutive missing_values_bin
        (
            [0, 1, 2, 3, 9, 4, 9, 5, 6, 7],  # 9 <=> missing
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
            True,  # missing values
            8,  # n_bins_non_missing
            False,  # don't split on nans
            3,  # cut on bin_idx=3 (like in first case)
            False,
        ),  # missing values go to right
        # For the following cases, split_on_nans is True (we replace all of
        # the samples with nans, instead of just 2).
        (
            [0, 1, 2, 3, 4, 4, 4, 4, 4, 4],  # 4 <=> missing
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
            True,  # missing values
            4,  # n_bins_non_missing
            True,  # split on nans
            3,  # cut on bin_idx=3
            False,
        ),  # missing values go to right
        # same as above, but with non-consecutive missing_values_bin
        (
            [0, 1, 2, 3, 9, 9, 9, 9, 9, 9],  # 9 <=> missing
            [1, 1, 1, 1, 1, 1, 5, 5, 5, 5],
            True,  # missing values
            4,  # n_bins_non_missing
            True,  # split on nans
            3,  # cut on bin_idx=3
            False,
        ),  # missing values go to right
        (
            [6, 6, 6, 6, 0, 1, 2, 3, 4, 5],  # 6 <=> missing
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
            True,  # missing values
            6,  # n_bins_non_missing
            True,  # split on nans
            5,  # cut on bin_idx=5
            False,
        ),  # missing values go to right
        # same as above, but with non-consecutive missing_values_bin
        (
            [9, 9, 9, 9, 0, 1, 2, 3, 4, 5],  # 9 <=> missing
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
            True,  # missing values
            6,  # n_bins_non_missing
            True,  # split on nans
            5,  # cut on bin_idx=5
            False,
        ),  # missing values go to right
    ],
)
def test_splitting_missing_values(
    X_binned,
    all_gradients,
    has_missing_values,
    n_bins_non_missing,
    expected_split_on_nan,
    expected_bin_idx,
    expected_go_to_left,
):
    # Make sure missing values are properly supported.
    # we build an artificial example with gradients such that the best split
    # is on bin_idx=3, when there are no missing values.
    # Then we introduce missing values and:
    #   - make sure the chosen bin is correct (find_best_bin()): it's
    #     still the same split, even though the index of the bin may change
    #   - make sure the missing values are mapped to the correct child
    #     (split_indices())

    n_bins = max(X_binned) + 1
    n_samples = len(X_binned)
    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0

    sample_indices = np.arange(n_samples, dtype=np.uint32)
    X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1)
    X_binned = np.asfortranarray(X_binned)
    all_gradients = np.array(all_gradients, dtype=G_H_DTYPE)
    has_missing_values = np.array([has_missing_values], dtype=np.uint8)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = 1 * n_samples
    hessians_are_constant = True

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )

    n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    histograms = builder.compute_histograms_brute(sample_indices)
    value = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    split_info = splitter.find_node_split(
        n_samples, histograms, sum_gradients, sum_hessians, value
    )

    assert split_info.bin_idx == expected_bin_idx
    if has_missing_values:
        assert split_info.missing_go_to_left == expected_go_to_left

    split_on_nan = split_info.bin_idx == n_bins_non_missing[0] - 1
    assert split_on_nan == expected_split_on_nan

    # Make sure the split is properly computed.
    # This also make sure missing values are properly assigned to the correct
    # child in split_indices()
    samples_left, samples_right, _ = splitter.split_indices(
        split_info, splitter.partition
    )

    if not expected_split_on_nan:
        # When we don't split on nans, the split should always be the same.
        assert set(samples_left) == set([0, 1, 2, 3])
        assert set(samples_right) == set([4, 5, 6, 7, 8, 9])
    else:
        # When we split on nans, samples with missing values are always mapped
        # to the right child.
        missing_samples_indices = np.flatnonzero(
            np.array(X_binned) == missing_values_bin_idx
        )
        non_missing_samples_indices = np.flatnonzero(
            np.array(X_binned) != missing_values_bin_idx
        )

        assert set(samples_right) == set(missing_samples_indices)
        assert set(samples_left) == set(non_missing_samples_indices)


@pytest.mark.parametrize(
    "X_binned, has_missing_values, n_bins_non_missing, ",
    [
        # one category
        ([0] * 20, False, 1),
        # all categories appear less than MIN_CAT_SUPPORT (hardcoded to 10)
        ([0] * 9 + [1] * 8, False, 2),
        # only one category appears more than MIN_CAT_SUPPORT
        ([0] * 12 + [1] * 8, False, 2),
        # missing values + category appear less than MIN_CAT_SUPPORT
        # 9 is missing
        ([0] * 9 + [1] * 8 + [9] * 4, True, 2),
        # no non-missing category
        ([9] * 11, True, 0),
    ],
)
def test_splitting_categorical_cat_smooth(
    X_binned, has_missing_values, n_bins_non_missing
):
    # Checks categorical splits are correct when the MIN_CAT_SUPPORT constraint
    # isn't respected: there are no splits

    n_bins = max(X_binned) + 1
    n_samples = len(X_binned)
    X_binned = np.array([X_binned], dtype=X_BINNED_DTYPE).T
    X_binned = np.asfortranarray(X_binned)

    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0

    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_gradients = np.ones(n_samples, dtype=G_H_DTYPE)
    has_missing_values = np.array([has_missing_values], dtype=np.uint8)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = n_samples
    hessians_are_constant = True

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )

    n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
    )
    is_categorical = np.ones_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    histograms = builder.compute_histograms_brute(sample_indices)
    value = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    split_info = splitter.find_node_split(
        n_samples, histograms, sum_gradients, sum_hessians, value
    )

    # no split found
    assert split_info.gain == -1


def _assert_categories_equals_bitset(categories, bitset):
    # assert that the bitset exactly corresponds to the categories
    # bitset is assumed to be an array of 8 uint32 elements

    # form bitset from threshold
    expected_bitset = np.zeros(8, dtype=np.uint32)
    for cat in categories:
        idx = cat // 32
        shift = cat % 32
        expected_bitset[idx] |= 1 << shift

    # check for equality
    assert_array_equal(expected_bitset, bitset)


@pytest.mark.parametrize(
    "X_binned, all_gradients, expected_categories_left, n_bins_non_missing,"
    "missing_values_bin_idx, has_missing_values, expected_missing_go_to_left",
    [
        # 4 categories
        (
            [0, 1, 2, 3] * 11,  # X_binned
            [10, 1, 10, 10] * 11,  # all_gradients
            [1],  # expected_categories_left
            4,  # n_bins_non_missing
            4,  # missing_values_bin_idx
            False,  # has_missing_values
            None,
        ),  # expected_missing_go_to_left, unchecked
        # Make sure that the categories that are on the right (second half) of
        # the sorted categories array can still go in the left child. In this
        # case, the best split was found when scanning from right to left.
        (
            [0, 1, 2, 3] * 11,  # X_binned
            [10, 10, 10, 1] * 11,  # all_gradients
            [3],  # expected_categories_left
            4,  # n_bins_non_missing
            4,  # missing_values_bin_idx
            False,  # has_missing_values
            None,
        ),  # expected_missing_go_to_left, unchecked
        # categories that don't respect MIN_CAT_SUPPORT (cat 4) are always
        # mapped to the right child
        (
            [0, 1, 2, 3] * 11 + [4] * 5,  # X_binned
            [10, 10, 10, 1] * 11 + [10] * 5,  # all_gradients
            [3],  # expected_categories_left
            4,  # n_bins_non_missing
            4,  # missing_values_bin_idx
            False,  # has_missing_values
            None,
        ),  # expected_missing_go_to_left, unchecked
        # categories that don't respect MIN_CAT_SUPPORT are always mapped to
        # the right child: in this case a more sensible split could have been
        # 3, 4 - 0, 1, 2
        # But the split is still 3 - 0, 1, 2, 4. this is because we only scan
        # up to the middle of the sorted category array (0, 1, 2, 3), and
        # because we exclude cat 4 in this array.
        (
            [0, 1, 2, 3] * 11 + [4] * 5,  # X_binned
            [10, 10, 10, 1] * 11 + [1] * 5,  # all_gradients
            [3],  # expected_categories_left
            4,  # n_bins_non_missing
            4,  # missing_values_bin_idx
            False,  # has_missing_values
            None,
        ),  # expected_missing_go_to_left, unchecked
        # 4 categories with missing values that go to the right
        (
            [0, 1, 2] * 11 + [9] * 11,  # X_binned
            [10, 1, 10] * 11 + [10] * 11,  # all_gradients
            [1],  # expected_categories_left
            3,  # n_bins_non_missing
            9,  # missing_values_bin_idx
            True,  # has_missing_values
            False,
        ),  # expected_missing_go_to_left
        # 4 categories with missing values that go to the left
        (
            [0, 1, 2] * 11 + [9] * 11,  # X_binned
            [10, 1, 10] * 11 + [1] * 11,  # all_gradients
            [1, 9],  # expected_categories_left
            3,  # n_bins_non_missing
            9,  # missing_values_bin_idx
            True,  # has_missing_values
            True,
        ),  # expected_missing_go_to_left
        # split is on the missing value
        (
            [0, 1, 2, 3, 4] * 11 + [255] * 12,  # X_binned
            [10, 10, 10, 10, 10] * 11 + [1] * 12,  # all_gradients
            [255],  # expected_categories_left
            5,  # n_bins_non_missing
            255,  # missing_values_bin_idx
            True,  # has_missing_values
            True,
        ),  # expected_missing_go_to_left
        # split on even categories
        (
            list(range(60)) * 12,  # X_binned
            [10, 1] * 360,  # all_gradients
            list(range(1, 60, 2)),  # expected_categories_left
            59,  # n_bins_non_missing
            59,  # missing_values_bin_idx
            True,  # has_missing_values
            True,
        ),  # expected_missing_go_to_left
        # split on every 8 categories
        (
            list(range(256)) * 12,  # X_binned
            [10, 10, 10, 10, 10, 10, 10, 1] * 384,  # all_gradients
            list(range(7, 256, 8)),  # expected_categories_left
            255,  # n_bins_non_missing
            255,  # missing_values_bin_idx
            True,  # has_missing_values
            True,
        ),  # expected_missing_go_to_left
    ],
)
def test_splitting_categorical_sanity(
    X_binned,
    all_gradients,
    expected_categories_left,
    n_bins_non_missing,
    missing_values_bin_idx,
    has_missing_values,
    expected_missing_go_to_left,
):
    # Tests various combinations of categorical splits

    n_samples = len(X_binned)
    n_bins = max(X_binned) + 1

    X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1)
    X_binned = np.asfortranarray(X_binned)

    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0

    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_gradients = np.array(all_gradients, dtype=G_H_DTYPE)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    has_missing_values = np.array([has_missing_values], dtype=np.uint8)
    sum_gradients = all_gradients.sum()
    sum_hessians = n_samples
    hessians_are_constant = True

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )

    n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
    )
    is_categorical = np.ones_like(monotonic_cst, dtype=np.uint8)

    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    histograms = builder.compute_histograms_brute(sample_indices)

    value = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    split_info = splitter.find_node_split(
        n_samples, histograms, sum_gradients, sum_hessians, value
    )

    assert split_info.is_categorical
    assert split_info.gain > 0
    _assert_categories_equals_bitset(
        expected_categories_left, split_info.left_cat_bitset
    )
    if has_missing_values:
        assert split_info.missing_go_to_left == expected_missing_go_to_left
    # If there is no missing value during training, the flag missing_go_to_left
    # is set later in the grower.

    # make sure samples are split correctly
    samples_left, samples_right, _ = splitter.split_indices(
        split_info, splitter.partition
    )

    left_mask = np.isin(X_binned.ravel(), expected_categories_left)
    assert_array_equal(sample_indices[left_mask], samples_left)
    assert_array_equal(sample_indices[~left_mask], samples_right)