import numpy as np
import pytest
from scipy.sparse import csr_matrix

from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_array_equal, assert_almost_equal
from sklearn.feature_selection._mutual_info import _compute_mi
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif


def test_compute_mi_dd():
    # In the discrete case the computations are straightforward and can be
    # done by hand on the given vectors.
    x = np.array([0, 1, 1, 0, 0])
    y = np.array([1, 0, 0, 0, 1])

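    # Empirical marginals: P(x=0) = P(y=0) = 3/5 and P(x=1) = P(y=1) = 2/5,
    # hence H_x = H_y. The joint pairs (x, y) are (0, 1) twice, (1, 0) twice
    # and (0, 0) once, which gives H_xy below.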
    H_x = H_y = -(3 / 5) * np.log(3 / 5) - (2 / 5) * np.log(2 / 5)
    H_xy = -1 / 5 * np.log(1 / 5) - 2 / 5 * np.log(2 / 5) - 2 / 5 * np.log(2 / 5)
    I_xy = H_x + H_y - H_xy

    assert_almost_equal(_compute_mi(x, y, True, True), I_xy)


def test_compute_mi_cc():
    # For two continuous variables a good approach is to test on a bivariate
    # normal distribution, where the mutual information is known analytically.

    # Mean of the distribution, irrelevant for mutual information.
    mean = np.zeros(2)

    # Set up the covariance matrix with a correlation coefficient of 0.5.
    sigma_1 = 1
    sigma_2 = 10
    corr = 0.5
    cov = np.array(
        [
            [sigma_1 ** 2, corr * sigma_1 * sigma_2],
            [corr * sigma_1 * sigma_2, sigma_2 ** 2],
        ]
    )

    # True theoretical mutual information.
    I_theory = np.log(sigma_1) + np.log(sigma_2) - 0.5 * np.log(np.linalg.det(cov))
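    # This is equivalent to -0.5 * np.log(1 - corr ** 2), the closed form for
    # the mutual information of a bivariate normal.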

    rng = check_random_state(0)
    Z = rng.multivariate_normal(mean, cov, size=1000)

    x, y = Z[:, 0], Z[:, 1]

    # Theory and computed values won't be very close; assert that the first
    # figure after the decimal point matches.
    for n_neighbors in [3, 5, 7]:
        I_computed = _compute_mi(x, y, False, False, n_neighbors)
        assert_almost_equal(I_computed, I_theory, 1)


def test_compute_mi_cd():
    # To test, define a joint distribution as follows:
    # p(x, y) = p(x) p(y | x)
    # X ~ Bernoulli(p)
    # (Y | x = 0) ~ Uniform(-1, 1)
    # (Y | x = 1) ~ Uniform(0, 2)

    # Use the following formula for mutual information:
    # I(X; Y) = H(Y) - H(Y | X)
    # Both entropies can be computed by hand:
    # H(Y) = -(1-p)/2 * ln((1-p)/2) - p/2 * ln(p/2) - 1/2 * ln(1/2)
    # H(Y | X) = ln(2)
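    # Indeed, the marginal density of Y is (1-p)/2 on (-1, 0), 1/2 on (0, 1)
    # and p/2 on (1, 2), while each conditional distribution is uniform on an
    # interval of length 2.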

    # Now we need to implement sampling from our distribution, which is
    # done easily using conditional distribution logic.

    n_samples = 1000
    rng = check_random_state(0)

    for p in [0.3, 0.5, 0.7]:
        x = rng.uniform(size=n_samples) > p

        y = np.empty(n_samples)
        mask = x == 0
        y[mask] = rng.uniform(-1, 1, size=np.sum(mask))
        y[~mask] = rng.uniform(0, 2, size=np.sum(~mask))

        I_theory = -0.5 * (
            (1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5)
        ) - np.log(2)

        # Assert the same tolerance as in the continuous case.
        for n_neighbors in [3, 5, 7]:
            I_computed = _compute_mi(x, y, True, False, n_neighbors)
            assert_almost_equal(I_computed, I_theory, 1)


def test_compute_mi_cd_unique_label():
    # Test that adding a unique label doesn't change MI.
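    # The estimator ignores samples whose label occurs only once, so appending
    # a single point with a new label and an outlying value should leave the
    # estimate unchanged.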
    n_samples = 100
    rng = check_random_state(0)
    x = rng.uniform(size=n_samples) > 0.5

    y = np.empty(n_samples)
    mask = x == 0
    y[mask] = rng.uniform(-1, 1, size=np.sum(mask))
    y[~mask] = rng.uniform(0, 2, size=np.sum(~mask))

    mi_1 = _compute_mi(x, y, True, False)

    x = np.hstack((x, 2))
    y = np.hstack((y, 10))
    mi_2 = _compute_mi(x, y, True, False)

    assert mi_1 == mi_2


# We are going to test that feature ordering by MI matches our expectations.
def test_mutual_info_classif_discrete():
    X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]])
    y = np.array([0, 1, 2, 2, 1])

    # Here X[:, 0] is the most informative feature, and X[:, 1] is weakly
    # informative.
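    # X[:, 0] predicts y for all but the last sample, X[:, 2] only separates
    # {0, 1} from {2, 2, 1}, and X[:, 1] only singles out the second sample.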
    mi = mutual_info_classif(X, y, discrete_features=True)
    assert_array_equal(np.argsort(-mi), np.array([0, 2, 1]))


def test_mutual_info_regression():
    # We generate a sample from a multivariate normal distribution by applying
    # a linear transformation to initially uncorrelated variables. The zeroth
    # variable after the transformation is taken as the target vector; it has
    # the strongest correlation with variable 2 and the weakest correlation
    # with variable 1.
    T = np.array([[1, 0.5, 2, 1], [0, 1, 0.1, 0.0], [0, 0.1, 1, 0.1], [0, 0.1, 0.1, 1]])
    cov = T.dot(T.T)
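    # The resulting correlations of Z[:, 0] with Z[:, 1], Z[:, 2] and Z[:, 3]
    # are roughly 0.28, 0.85 and 0.50, respectively.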
    mean = np.zeros(4)

    rng = check_random_state(0)
    Z = rng.multivariate_normal(mean, cov, size=1000)
    X = Z[:, 1:]
    y = Z[:, 0]

    mi = mutual_info_regression(X, y, random_state=0)
    assert_array_equal(np.argsort(-mi), np.array([1, 2, 0]))


def test_mutual_info_classif_mixed():
    # Here the target is discrete and there are two continuous and one
    # discrete feature. The target depends on X[:, 0] and X[:, 2], while
    # X[:, 1] is informative only through its correlation with X[:, 0].
    rng = check_random_state(0)
    X = rng.rand(1000, 3)
    X[:, 1] += X[:, 0]
    y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int)
    X[:, 2] = X[:, 2] > 0.5

    mi = mutual_info_classif(
        X, y, discrete_features=[2], n_neighbors=3, random_state=0
    )
    assert_array_equal(np.argsort(-mi), [2, 0, 1])
    for n_neighbors in [5, 7, 9]:
        mi_nn = mutual_info_classif(
            X, y, discrete_features=[2], n_neighbors=n_neighbors, random_state=0
        )
        # Check that the continuous features have a higher MI with greater
        # n_neighbors.
        assert mi_nn[0] > mi[0]
        assert mi_nn[1] > mi[1]
        # n_neighbors should not have any effect on the discrete feature:
        # the MI should be exactly the same.
        assert mi_nn[2] == mi[2]


def test_mutual_info_options():
    X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=float)
    y = np.array([0, 1, 2, 2, 1], dtype=float)
    X_csr = csr_matrix(X)

    for mutual_info in (mutual_info_regression, mutual_info_classif):
        with pytest.raises(ValueError):
            mutual_info(X_csr, y, discrete_features=False)
        with pytest.raises(ValueError):
            mutual_info(X, y, discrete_features="manual")
        with pytest.raises(ValueError):
            mutual_info(X_csr, y, discrete_features=[True, False, True])
        with pytest.raises(IndexError):
            mutual_info(X, y, discrete_features=[True, False, True, False])
        with pytest.raises(IndexError):
            mutual_info(X, y, discrete_features=[1, 4])

        mi_1 = mutual_info(X, y, discrete_features="auto", random_state=0)
        mi_2 = mutual_info(X, y, discrete_features=False, random_state=0)
        mi_3 = mutual_info(X_csr, y, discrete_features="auto", random_state=0)
        mi_4 = mutual_info(X_csr, y, discrete_features=True, random_state=0)
        mi_5 = mutual_info(X, y, discrete_features=[True, False, True], random_state=0)
        mi_6 = mutual_info(X, y, discrete_features=[0, 2], random_state=0)

        assert_array_equal(mi_1, mi_2)
        assert_array_equal(mi_3, mi_4)
        assert_array_equal(mi_5, mi_6)

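    # With discrete_features="auto", a dense X is treated as continuous and a
    # sparse X as discrete, so the two "auto" estimates are expected to differ.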
    assert not np.allclose(mi_1, mi_3)