import numpy as np
import pytest
from scipy.sparse import csr_matrix

from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_array_equal, assert_almost_equal
from sklearn.feature_selection._mutual_info import _compute_mi
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif


def test_compute_mi_dd():
    # In the discrete case the computations are straightforward and can be
    # done by hand on the given vectors.
    x = np.array([0, 1, 1, 0, 0])
    y = np.array([1, 0, 0, 0, 1])

    H_x = H_y = -(3 / 5) * np.log(3 / 5) - (2 / 5) * np.log(2 / 5)
    H_xy = -1 / 5 * np.log(1 / 5) - 2 / 5 * np.log(2 / 5) - 2 / 5 * np.log(2 / 5)
    I_xy = H_x + H_y - H_xy

    assert_almost_equal(_compute_mi(x, y, True, True), I_xy)


def test_compute_mi_cc():
    # For two continuous variables a good approach is to test on a bivariate
    # normal distribution, where the mutual information is known in closed
    # form.

    # Mean of the distribution, irrelevant for mutual information.
    mean = np.zeros(2)

    # Set up the covariance matrix with a correlation coefficient of 0.5.
    sigma_1 = 1
    sigma_2 = 10
    corr = 0.5
    cov = np.array(
        [
            [sigma_1**2, corr * sigma_1 * sigma_2],
            [corr * sigma_1 * sigma_2, sigma_2**2],
        ]
    )

    # True theoretical mutual information.
    I_theory = np.log(sigma_1) + np.log(sigma_2) - 0.5 * np.log(np.linalg.det(cov))

    rng = check_random_state(0)
    Z = rng.multivariate_normal(mean, cov, size=1000)

    x, y = Z[:, 0], Z[:, 1]

    # Theory and computed values won't be very close; assert that the first
    # figure after the decimal point matches.
    for n_neighbors in [3, 5, 7]:
        I_computed = _compute_mi(x, y, False, False, n_neighbors)
        assert_almost_equal(I_computed, I_theory, 1)


def test_compute_mi_cd():
    # To test, define a joint distribution as follows:
    # p(x, y) = p(x) p(y | x)
    # X ~ Bernoulli(p)
    # (Y | x = 0) ~ Uniform(-1, 1)
    # (Y | x = 1) ~ Uniform(0, 2)

    # Use the following formula for mutual information:
    # I(X; Y) = H(Y) - H(Y | X)
    # Both entropies can be computed by hand:
    # H(Y) = -(1 - p) / 2 * ln((1 - p) / 2) - p / 2 * ln(p / 2) - 1 / 2 * ln(1 / 2)
    # H(Y | X) = ln(2)

    # Now we need to implement sampling from our distribution, which is done
    # easily using conditional distribution logic.

    n_samples = 1000
    rng = check_random_state(0)

    for p in [0.3, 0.5, 0.7]:
        x = rng.uniform(size=n_samples) > p

        y = np.empty(n_samples)
        mask = x == 0
        y[mask] = rng.uniform(-1, 1, size=np.sum(mask))
        y[~mask] = rng.uniform(0, 2, size=np.sum(~mask))

        I_theory = -0.5 * (
            (1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5)
        ) - np.log(2)

        # Assert the same tolerance as in the continuous case.
        for n_neighbors in [3, 5, 7]:
            I_computed = _compute_mi(x, y, True, False, n_neighbors)
            assert_almost_equal(I_computed, I_theory, 1)


def test_compute_mi_cd_unique_label():
    # Test that adding a unique label doesn't change MI.
    n_samples = 100
    rng = check_random_state(0)
    x = rng.uniform(size=n_samples) > 0.5

    y = np.empty(n_samples)
    mask = x == 0
    y[mask] = rng.uniform(-1, 1, size=np.sum(mask))
    y[~mask] = rng.uniform(0, 2, size=np.sum(~mask))

    mi_1 = _compute_mi(x, y, True, False)

    x = np.hstack((x, 2))
    y = np.hstack((y, 10))
    mi_2 = _compute_mi(x, y, True, False)

    assert mi_1 == mi_2
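

# Sanity check added for illustration: the theoretical value used in
# test_compute_mi_cc can be cross-checked against the equivalent correlation
# form of the bivariate normal mutual information,
# I(X; Y) = -0.5 * ln(1 - corr**2), which follows from
# det(cov) = sigma_1**2 * sigma_2**2 * (1 - corr**2).
def test_bivariate_normal_mi_closed_form():
    sigma_1, sigma_2, corr = 1, 10, 0.5
    cov = np.array(
        [
            [sigma_1**2, corr * sigma_1 * sigma_2],
            [corr * sigma_1 * sigma_2, sigma_2**2],
        ]
    )
    # Determinant form, as used in test_compute_mi_cc.
    I_det_form = np.log(sigma_1) + np.log(sigma_2) - 0.5 * np.log(np.linalg.det(cov))
    # Equivalent correlation form.
    I_corr_form = -0.5 * np.log(1 - corr**2)
    assert_almost_equal(I_det_form, I_corr_form)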


# We are going to test that feature ordering by MI matches our expectations.
def test_mutual_info_classif_discrete():
    X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]])
    y = np.array([0, 1, 2, 2, 1])

    # Here X[:, 0] is the most informative feature, and X[:, 1] is weakly
    # informative.
    mi = mutual_info_classif(X, y, discrete_features=True)
    assert_array_equal(np.argsort(-mi), np.array([0, 2, 1]))


def test_mutual_info_regression():
    # We generate a sample from a multivariate normal distribution by
    # transforming initially uncorrelated variables. The zeroth variable
    # after the transformation is selected as the target vector; it has the
    # strongest correlation with variable 2 and the weakest correlation with
    # variable 1.
    T = np.array(
        [
            [1, 0.5, 2, 1],
            [0, 1, 0.1, 0.0],
            [0, 0.1, 1, 0.1],
            [0, 0.1, 0.1, 1],
        ]
    )
    cov = T.dot(T.T)
    mean = np.zeros(4)

    rng = check_random_state(0)
    Z = rng.multivariate_normal(mean, cov, size=1000)
    X = Z[:, 1:]
    y = Z[:, 0]

    mi = mutual_info_regression(X, y, random_state=0)
    assert_array_equal(np.argsort(-mi), np.array([1, 2, 0]))


def test_mutual_info_classif_mixed():
    # Here the target is discrete and there are two continuous features and
    # one discrete feature. The idea of this test is clear from the code.
    rng = check_random_state(0)
    X = rng.rand(1000, 3)
    X[:, 1] += X[:, 0]
    y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int)
    X[:, 2] = X[:, 2] > 0.5

    mi = mutual_info_classif(
        X, y, discrete_features=[2], n_neighbors=3, random_state=0
    )
    assert_array_equal(np.argsort(-mi), [2, 0, 1])
    for n_neighbors in [5, 7, 9]:
        mi_nn = mutual_info_classif(
            X, y, discrete_features=[2], n_neighbors=n_neighbors, random_state=0
        )
        # Check that the continuous features have a higher MI with greater
        # n_neighbors.
        assert mi_nn[0] > mi[0]
        assert mi_nn[1] > mi[1]
        # n_neighbors should have no effect on the discrete feature:
        # the MI should be the same.
        assert mi_nn[2] == mi[2]


def test_mutual_info_options():
    X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=float)
    y = np.array([0, 1, 2, 2, 1], dtype=float)
    X_csr = csr_matrix(X)

    for mutual_info in (mutual_info_regression, mutual_info_classif):
        with pytest.raises(ValueError):
            mutual_info(X_csr, y, discrete_features=False)
        with pytest.raises(ValueError):
            mutual_info(X, y, discrete_features="manual")
        with pytest.raises(ValueError):
            mutual_info(X_csr, y, discrete_features=[True, False, True])
        with pytest.raises(IndexError):
            mutual_info(X, y, discrete_features=[True, False, True, False])
        with pytest.raises(IndexError):
            mutual_info(X, y, discrete_features=[1, 4])

        mi_1 = mutual_info(X, y, discrete_features="auto", random_state=0)
        mi_2 = mutual_info(X, y, discrete_features=False, random_state=0)
        mi_3 = mutual_info(X_csr, y, discrete_features="auto", random_state=0)
        mi_4 = mutual_info(X_csr, y, discrete_features=True, random_state=0)
        mi_5 = mutual_info(X, y, discrete_features=[True, False, True], random_state=0)
        mi_6 = mutual_info(X, y, discrete_features=[0, 2], random_state=0)

        assert_array_equal(mi_1, mi_2)
        assert_array_equal(mi_3, mi_4)
        assert_array_equal(mi_5, mi_6)

        assert not np.allclose(mi_1, mi_3)
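

# Illustrative cross-check, assuming the discrete estimator is the standard
# plug-in estimate in nats (as the hand computation in test_compute_mi_dd
# suggests): for purely discrete features, mutual_info_classif should agree
# with MI computed directly from the empirical joint distribution. The
# plugin_mi helper below is a local sketch, not a scikit-learn API.
def test_mutual_info_classif_matches_plugin_estimate():
    def plugin_mi(x, y):
        # Build the empirical joint distribution as a contingency table.
        _, ix = np.unique(x, return_inverse=True)
        _, iy = np.unique(y, return_inverse=True)
        joint = np.zeros((ix.max() + 1, iy.max() + 1))
        np.add.at(joint, (ix, iy), 1)
        joint /= joint.sum()
        # Marginals, broadcast so px * py has the shape of joint.
        px = joint.sum(axis=1, keepdims=True)
        py = joint.sum(axis=0, keepdims=True)
        # Plug-in MI in nats, summing only over nonzero joint cells.
        nz = joint > 0
        return np.sum(joint[nz] * np.log(joint[nz] / (px * py)[nz]))

    X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]])
    y = np.array([0, 1, 2, 2, 1])

    mi = mutual_info_classif(X, y, discrete_features=True)
    expected = np.array([plugin_mi(X[:, j], y) for j in range(X.shape[1])])
    assert_almost_equal(mi, expected)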