import numpy as np
from scipy import sparse as sp
from scipy import stats

import pytest

from sklearn.svm._bounds import l1_min_c
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap


dense_X = [[-1, 0], [0, 1], [1, 1], [1, 1]]
sparse_X = sp.csr_matrix(dense_X)

Y1 = [0, 1, 1, 1]
Y2 = [2, 1, 0, 0]


@pytest.mark.parametrize("loss", ["squared_hinge", "log"])
@pytest.mark.parametrize("X_label", ["sparse", "dense"])
@pytest.mark.parametrize("Y_label", ["two-classes", "multi-class"])
@pytest.mark.parametrize("intercept_label", ["no-intercept", "fit-intercept"])
def test_l1_min_c(loss, X_label, Y_label, intercept_label):
    Xs = {"sparse": sparse_X, "dense": dense_X}
    Ys = {"two-classes": Y1, "multi-class": Y2}
    intercepts = {
        "no-intercept": {"fit_intercept": False},
        "fit-intercept": {"fit_intercept": True, "intercept_scaling": 10},
    }

    X = Xs[X_label]
    Y = Ys[Y_label]
    intercept_params = intercepts[intercept_label]
    check_l1_min_c(X, Y, loss, **intercept_params)


def test_l1_min_c_l2_loss():
    # loss='l2' should raise ValueError
    msg = "loss type not in"
    with pytest.raises(ValueError, match=msg):
        l1_min_c(dense_X, Y1, loss="l2")


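# Helper: l1_min_c is expected to return the threshold C below which an
# L1-penalized linear model fitted on (X, y) stays entirely zero.  The check
# fits at C = min_c (coefficients and intercept should all be zero) and at
# C = 1.01 * min_c (at least one of them should become non-zero).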
def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=None):
    min_c = l1_min_c(
        X,
        y,
        loss=loss,
        fit_intercept=fit_intercept,
        intercept_scaling=intercept_scaling,
    )

    clf = {
        "log": LogisticRegression(penalty="l1", solver="liblinear"),
        "squared_hinge": LinearSVC(loss="squared_hinge", penalty="l1", dual=False),
    }[loss]

    clf.fit_intercept = fit_intercept
    clf.intercept_scaling = intercept_scaling

    clf.C = min_c
    clf.fit(X, y)
    assert (np.asarray(clf.coef_) == 0).all()
    assert (np.asarray(clf.intercept_) == 0).all()

    clf.C = min_c * 1.01
    clf.fit(X, y)
    assert (np.asarray(clf.coef_) != 0).any() or (np.asarray(clf.intercept_) != 0).any()


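# An all-zero design matrix carries no signal, so no finite lower bound on C
# exists and l1_min_c should raise ValueError.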
def test_ill_posed_min_c():
    X = [[0, 0], [0, 0]]
    y = [0, 1]
    with pytest.raises(ValueError):
        l1_min_c(X, y)


def test_unsupported_loss():
    with pytest.raises(ValueError):
        l1_min_c(dense_X, Y1, loss="l1")


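# Largest value representable by an unsigned 32-bit integer (2**32 - 1); the
# newrand wrappers are only defined on this range (see the overflow tests).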
_MAX_UNSIGNED_INT = 4294967295


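# The expected values are specific to the wrapped newrand generator; seed=None
# exercises the default seeding path (set_seed_wrap is not called).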
@pytest.mark.parametrize("seed, val", [(None, 81), (0, 54), (_MAX_UNSIGNED_INT, 9)])
def test_newrand_set_seed(seed, val):
    """Test that `set_seed` produces deterministic results."""
    if seed is not None:
        set_seed_wrap(seed)
    x = bounded_rand_int_wrap(100)
    assert x == val, f"Expected {val} but got {x} instead"


@pytest.mark.parametrize("seed", [-1, _MAX_UNSIGNED_INT + 1])
def test_newrand_set_seed_overflow(seed):
    """Test that `set_seed_wrap` is only defined for unsigned 32-bit ints."""
    with pytest.raises(OverflowError):
        set_seed_wrap(seed)


@pytest.mark.parametrize("range_, n_pts", [(_MAX_UNSIGNED_INT, 10000), (100, 25)])
def test_newrand_bounded_rand_int(range_, n_pts):
    """Test that `bounded_rand_int` follows a uniform distribution."""
    n_iter = 100
    ks_pvals = []
    uniform_dist = stats.uniform(loc=0, scale=range_)
    # perform multiple samplings to make chance of outlier sampling negligible
    for _ in range(n_iter):
        # Deterministic random sampling
        sample = [bounded_rand_int_wrap(range_) for _ in range(n_pts)]
        res = stats.kstest(sample, uniform_dist.cdf)
        ks_pvals.append(res.pvalue)
    # Null hypothesis = samples come from a uniform distribution.
    # Under the null hypothesis, p-values should be uniformly distributed
    # and not concentrated on low values
    # (this may seem counter-intuitive but is backed by multiple refs)
    # So we can do two checks:

    # (1) check uniformity of p-values
    uniform_p_vals_dist = stats.uniform(loc=0, scale=1)
    res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf)
    assert res_pvals.pvalue > 0.05, (
        "Null hypothesis rejected: generated random numbers are not uniform."
        " Details: the (meta) p-value of the test of uniform distribution"
        f" of p-values is {res_pvals.pvalue} which is not > 0.05"
    )

    # (2) (safety belt) check that 90% of p-values are above 0.05
    min_10pct_pval = np.percentile(ks_pvals, q=10)
    # lower 10th quantile pvalue <= 0.05 means that the test rejects the
    # null hypothesis that the sample came from the uniform distribution
    assert min_10pct_pval > 0.05, (
        "Null hypothesis rejected: generated random numbers are not uniform. "
        f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05."
    )


@pytest.mark.parametrize("range_", [-1, _MAX_UNSIGNED_INT + 1])
def test_newrand_bounded_rand_int_limits(range_):
    """Test that `bounded_rand_int_wrap` is only defined for unsigned 32-bit ints."""
    with pytest.raises(OverflowError):
        bounded_rand_int_wrap(range_)