import numpy as np
from scipy import sparse as sp
from scipy import stats

import pytest

from sklearn.svm._bounds import l1_min_c
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap


dense_X = [[-1, 0], [0, 1], [1, 1], [1, 1]]
sparse_X = sp.csr_matrix(dense_X)

Y1 = [0, 1, 1, 1]
Y2 = [2, 1, 0, 0]


@pytest.mark.parametrize("loss", ["squared_hinge", "log"])
@pytest.mark.parametrize("X_label", ["sparse", "dense"])
@pytest.mark.parametrize("Y_label", ["two-classes", "multi-class"])
@pytest.mark.parametrize("intercept_label", ["no-intercept", "fit-intercept"])
def test_l1_min_c(loss, X_label, Y_label, intercept_label):
    Xs = {"sparse": sparse_X, "dense": dense_X}
    Ys = {"two-classes": Y1, "multi-class": Y2}
    intercepts = {
        "no-intercept": {"fit_intercept": False},
        "fit-intercept": {"fit_intercept": True, "intercept_scaling": 10},
    }

    X = Xs[X_label]
    Y = Ys[Y_label]
    intercept_params = intercepts[intercept_label]
    check_l1_min_c(X, Y, loss, **intercept_params)


def test_l1_min_c_l2_loss():
    # loss='l2' should raise ValueError
    msg = "loss type not in"
    with pytest.raises(ValueError, match=msg):
        l1_min_c(dense_X, Y1, loss="l2")


def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=None):
    min_c = l1_min_c(
        X,
        y,
        loss=loss,
        fit_intercept=fit_intercept,
        intercept_scaling=intercept_scaling,
    )

    clf = {
        "log": LogisticRegression(penalty="l1", solver="liblinear"),
        "squared_hinge": LinearSVC(loss="squared_hinge", penalty="l1", dual=False),
    }[loss]

    clf.fit_intercept = fit_intercept
    clf.intercept_scaling = intercept_scaling

    # At the computed lower bound, the L1-penalized model should be empty
    # (all coefficients and intercepts exactly zero).
    clf.C = min_c
    clf.fit(X, y)
    assert (np.asarray(clf.coef_) == 0).all()
    assert (np.asarray(clf.intercept_) == 0).all()

    # Slightly above the bound, at least one coefficient or intercept
    # should become non-zero.
    clf.C = min_c * 1.01
    clf.fit(X, y)
    assert (np.asarray(clf.coef_) != 0).any() or (np.asarray(clf.intercept_) != 0).any()


def test_ill_posed_min_c():
    X = [[0, 0], [0, 0]]
    y = [0, 1]
    with pytest.raises(ValueError):
        l1_min_c(X, y)


def test_unsupported_loss():
    with pytest.raises(ValueError):
        l1_min_c(dense_X, Y1, loss="l1")


_MAX_UNSIGNED_INT = 4294967295


@pytest.mark.parametrize("seed, val", [(None, 81), (0, 54), (_MAX_UNSIGNED_INT, 9)])
def test_newrand_set_seed(seed, val):
    """Test that `set_seed` produces deterministic results"""
    if seed is not None:
        set_seed_wrap(seed)
    x = bounded_rand_int_wrap(100)
    assert x == val, f"Expected {val} but got {x} instead"


@pytest.mark.parametrize("seed", [-1, _MAX_UNSIGNED_INT + 1])
def test_newrand_set_seed_overflow(seed):
    """Test that `set_seed_wrap` is only defined for unsigned 32-bit ints"""
    with pytest.raises(OverflowError):
        set_seed_wrap(seed)


@pytest.mark.parametrize("range_, n_pts", [(_MAX_UNSIGNED_INT, 10000), (100, 25)])
def test_newrand_bounded_rand_int(range_, n_pts):
    """Test that `bounded_rand_int` follows a uniform distribution"""
    n_iter = 100
    ks_pvals = []
    uniform_dist = stats.uniform(loc=0, scale=range_)
    # perform multiple samplings to make the chance of an outlier sampling negligible
    for _ in range(n_iter):
        # Deterministic random sampling
        sample = [bounded_rand_int_wrap(range_) for _ in range(n_pts)]
        res = stats.kstest(sample, uniform_dist.cdf)
        ks_pvals.append(res.pvalue)
    # Null hypothesis = samples come from a uniform distribution.
    # Under the null hypothesis, p-values should be uniformly distributed
    # and not concentrated on low values
    # (this may seem counter-intuitive but is backed by multiple refs).
    # So we can do two checks:

    # (1) check uniformity of p-values
    uniform_p_vals_dist = stats.uniform(loc=0, scale=1)
    res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf)
    assert res_pvals.pvalue > 0.05, (
        "Null hypothesis rejected: generated random numbers are not uniform."
        " Details: the (meta) p-value of the test of uniform distribution"
        f" of p-values is {res_pvals.pvalue} which is not > 0.05"
    )

    # (2) (safety belt) check that 90% of p-values are above 0.05
    min_10pct_pval = np.percentile(ks_pvals, q=10)
    # lower 10th quantile p-value <= 0.05 means that the test rejects the
    # null hypothesis that the sample came from the uniform distribution
    assert min_10pct_pval > 0.05, (
        "Null hypothesis rejected: generated random numbers are not uniform. "
        f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05."
    )


@pytest.mark.parametrize("range_", [-1, _MAX_UNSIGNED_INT + 1])
def test_newrand_bounded_rand_int_limits(range_):
    """Test that `bounded_rand_int_wrap` is only defined for unsigned 32-bit ints"""
    with pytest.raises(OverflowError):
        bounded_rand_int_wrap(range_)