1# Author: Nikolay Mayorov <n59_ru@hotmail.com>
2# License: 3-clause BSD
3
4import numpy as np
5from scipy.sparse import issparse
6from scipy.special import digamma
7
8from ..metrics.cluster import mutual_info_score
9from ..neighbors import NearestNeighbors, KDTree
10from ..preprocessing import scale
11from ..utils import check_random_state
12from ..utils.fixes import _astype_copy_false
13from ..utils.validation import check_array, check_X_y
14from ..utils.multiclass import check_classification_targets
15
16
17def _compute_mi_cc(x, y, n_neighbors):
18    """Compute mutual information between two continuous variables.
19
20    Parameters
21    ----------
22    x, y : ndarray, shape (n_samples,)
23        Samples of two continuous random variables, must have an identical
24        shape.
25
26    n_neighbors : int
27        Number of nearest neighbors to search for each point, see [1]_.
28
29    Returns
30    -------
31    mi : float
32        Estimated mutual information. If it turned out to be negative it is
33        replace by 0.
34
35    Notes
36    -----
37    True mutual information can't be negative. If its estimate by a numerical
38    method is negative, it means (providing the method is adequate) that the
39    mutual information is close to 0 and replacing it by 0 is a reasonable
40    strategy.
41
42    References
43    ----------
44    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
45           information". Phys. Rev. E 69, 2004.
46    """
47    n_samples = x.size
48
49    x = x.reshape((-1, 1))
50    y = y.reshape((-1, 1))
51    xy = np.hstack((x, y))
52
53    # Here we rely on NearestNeighbors to select the fastest algorithm.
54    nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors)
55
56    nn.fit(xy)
57    radius = nn.kneighbors()[0]
58    radius = np.nextafter(radius[:, -1], 0)
59
60    # KDTree is explicitly fit to allow for the querying of number of
61    # neighbors within a specified radius
62    kd = KDTree(x, metric="chebyshev")
63    nx = kd.query_radius(x, radius, count_only=True, return_distance=False)
64    nx = np.array(nx) - 1.0
65
66    kd = KDTree(y, metric="chebyshev")
67    ny = kd.query_radius(y, radius, count_only=True, return_distance=False)
68    ny = np.array(ny) - 1.0
69
70    mi = (
71        digamma(n_samples)
72        + digamma(n_neighbors)
73        - np.mean(digamma(nx + 1))
74        - np.mean(digamma(ny + 1))
75    )
76
77    return max(0, mi)
78
79
80def _compute_mi_cd(c, d, n_neighbors):
81    """Compute mutual information between continuous and discrete variables.
82
83    Parameters
84    ----------
85    c : ndarray, shape (n_samples,)
86        Samples of a continuous random variable.
87
88    d : ndarray, shape (n_samples,)
89        Samples of a discrete random variable.
90
91    n_neighbors : int
92        Number of nearest neighbors to search for each point, see [1]_.
93
94    Returns
95    -------
96    mi : float
97        Estimated mutual information. If it turned out to be negative it is
98        replace by 0.
99
100    Notes
101    -----
102    True mutual information can't be negative. If its estimate by a numerical
103    method is negative, it means (providing the method is adequate) that the
104    mutual information is close to 0 and replacing it by 0 is a reasonable
105    strategy.
106
107    References
108    ----------
109    .. [1] B. C. Ross "Mutual Information between Discrete and Continuous
110       Data Sets". PLoS ONE 9(2), 2014.
111    """
112    n_samples = c.shape[0]
113    c = c.reshape((-1, 1))
114
115    radius = np.empty(n_samples)
116    label_counts = np.empty(n_samples)
117    k_all = np.empty(n_samples)
118    nn = NearestNeighbors()
119    for label in np.unique(d):
120        mask = d == label
121        count = np.sum(mask)
122        if count > 1:
123            k = min(n_neighbors, count - 1)
124            nn.set_params(n_neighbors=k)
125            nn.fit(c[mask])
126            r = nn.kneighbors()[0]
127            radius[mask] = np.nextafter(r[:, -1], 0)
128            k_all[mask] = k
129        label_counts[mask] = count
130
131    # Ignore points with unique labels.
132    mask = label_counts > 1
133    n_samples = np.sum(mask)
134    label_counts = label_counts[mask]
135    k_all = k_all[mask]
136    c = c[mask]
137    radius = radius[mask]
138
139    kd = KDTree(c)
140    m_all = kd.query_radius(c, radius, count_only=True, return_distance=False)
141    m_all = np.array(m_all) - 1.0
142
143    mi = (
144        digamma(n_samples)
145        + np.mean(digamma(k_all))
146        - np.mean(digamma(label_counts))
147        - np.mean(digamma(m_all + 1))
148    )
149
150    return max(0, mi)
151
152
153def _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3):
154    """Compute mutual information between two variables.
155
156    This is a simple wrapper which selects a proper function to call based on
157    whether `x` and `y` are discrete or not.
158    """
159    if x_discrete and y_discrete:
160        return mutual_info_score(x, y)
161    elif x_discrete and not y_discrete:
162        return _compute_mi_cd(y, x, n_neighbors)
163    elif not x_discrete and y_discrete:
164        return _compute_mi_cd(x, y, n_neighbors)
165    else:
166        return _compute_mi_cc(x, y, n_neighbors)
167
168
169def _iterate_columns(X, columns=None):
170    """Iterate over columns of a matrix.
171
172    Parameters
173    ----------
174    X : ndarray or csc_matrix, shape (n_samples, n_features)
175        Matrix over which to iterate.
176
177    columns : iterable or None, default=None
178        Indices of columns to iterate over. If None, iterate over all columns.
179
180    Yields
181    ------
182    x : ndarray, shape (n_samples,)
183        Columns of `X` in dense format.
184    """
185    if columns is None:
186        columns = range(X.shape[1])
187
188    if issparse(X):
189        for i in columns:
190            x = np.zeros(X.shape[0])
191            start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1]
192            x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr]
193            yield x
194    else:
195        for i in columns:
196            yield X[:, i]
197
198
199def _estimate_mi(
200    X,
201    y,
202    discrete_features="auto",
203    discrete_target=False,
204    n_neighbors=3,
205    copy=True,
206    random_state=None,
207):
208    """Estimate mutual information between the features and the target.
209
210    Parameters
211    ----------
212    X : array-like or sparse matrix, shape (n_samples, n_features)
213        Feature matrix.
214
215    y : array-like of shape (n_samples,)
216        Target vector.
217
218    discrete_features : {'auto', bool, array-like}, default='auto'
219        If bool, then determines whether to consider all features discrete
220        or continuous. If array, then it should be either a boolean mask
221        with shape (n_features,) or array with indices of discrete features.
222        If 'auto', it is assigned to False for dense `X` and to True for
223        sparse `X`.
224
225    discrete_target : bool, default=False
226        Whether to consider `y` as a discrete variable.
227
228    n_neighbors : int, default=3
229        Number of neighbors to use for MI estimation for continuous variables,
230        see [1]_ and [2]_. Higher values reduce variance of the estimation, but
231        could introduce a bias.
232
233    copy : bool, default=True
234        Whether to make a copy of the given data. If set to False, the initial
235        data will be overwritten.
236
237    random_state : int, RandomState instance or None, default=None
238        Determines random number generation for adding small noise to
239        continuous variables in order to remove repeated values.
240        Pass an int for reproducible results across multiple function calls.
241        See :term:`Glossary <random_state>`.
242
243    Returns
244    -------
245    mi : ndarray, shape (n_features,)
246        Estimated mutual information between each feature and the target.
247        A negative value will be replaced by 0.
248
249    References
250    ----------
251    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
252           information". Phys. Rev. E 69, 2004.
253    .. [2] B. C. Ross "Mutual Information between Discrete and Continuous
254           Data Sets". PLoS ONE 9(2), 2014.
255    """
256    X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target)
257    n_samples, n_features = X.shape
258
259    if isinstance(discrete_features, (str, bool)):
260        if isinstance(discrete_features, str):
261            if discrete_features == "auto":
262                discrete_features = issparse(X)
263            else:
264                raise ValueError("Invalid string value for discrete_features.")
265        discrete_mask = np.empty(n_features, dtype=bool)
266        discrete_mask.fill(discrete_features)
267    else:
268        discrete_features = check_array(discrete_features, ensure_2d=False)
269        if discrete_features.dtype != "bool":
270            discrete_mask = np.zeros(n_features, dtype=bool)
271            discrete_mask[discrete_features] = True
272        else:
273            discrete_mask = discrete_features
274
275    continuous_mask = ~discrete_mask
276    if np.any(continuous_mask) and issparse(X):
277        raise ValueError("Sparse matrix `X` can't have continuous features.")
278
279    rng = check_random_state(random_state)
280    if np.any(continuous_mask):
281        if copy:
282            X = X.copy()
283
284        if not discrete_target:
285            X[:, continuous_mask] = scale(
286                X[:, continuous_mask], with_mean=False, copy=False
287            )
288
289        # Add small noise to continuous features as advised in Kraskov et. al.
290        X = X.astype(float, **_astype_copy_false(X))
291        means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0))
292        X[:, continuous_mask] += (
293            1e-10 * means * rng.randn(n_samples, np.sum(continuous_mask))
294        )
295
296    if not discrete_target:
297        y = scale(y, with_mean=False)
298        y += 1e-10 * np.maximum(1, np.mean(np.abs(y))) * rng.randn(n_samples)
299
300    mi = [
301        _compute_mi(x, y, discrete_feature, discrete_target, n_neighbors)
302        for x, discrete_feature in zip(_iterate_columns(X), discrete_mask)
303    ]
304
305    return np.array(mi)
306
307
308def mutual_info_regression(
309    X, y, *, discrete_features="auto", n_neighbors=3, copy=True, random_state=None
310):
311    """Estimate mutual information for a continuous target variable.
312
313    Mutual information (MI) [1]_ between two random variables is a non-negative
314    value, which measures the dependency between the variables. It is equal
315    to zero if and only if two random variables are independent, and higher
316    values mean higher dependency.
317
318    The function relies on nonparametric methods based on entropy estimation
319    from k-nearest neighbors distances as described in [2]_ and [3]_. Both
320    methods are based on the idea originally proposed in [4]_.
321
322    It can be used for univariate features selection, read more in the
323    :ref:`User Guide <univariate_feature_selection>`.
324
325    Parameters
326    ----------
327    X : array-like or sparse matrix, shape (n_samples, n_features)
328        Feature matrix.
329
330    y : array-like of shape (n_samples,)
331        Target vector.
332
333    discrete_features : {'auto', bool, array-like}, default='auto'
334        If bool, then determines whether to consider all features discrete
335        or continuous. If array, then it should be either a boolean mask
336        with shape (n_features,) or array with indices of discrete features.
337        If 'auto', it is assigned to False for dense `X` and to True for
338        sparse `X`.
339
340    n_neighbors : int, default=3
341        Number of neighbors to use for MI estimation for continuous variables,
342        see [2]_ and [3]_. Higher values reduce variance of the estimation, but
343        could introduce a bias.
344
345    copy : bool, default=True
346        Whether to make a copy of the given data. If set to False, the initial
347        data will be overwritten.
348
349    random_state : int, RandomState instance or None, default=None
350        Determines random number generation for adding small noise to
351        continuous variables in order to remove repeated values.
352        Pass an int for reproducible results across multiple function calls.
353        See :term:`Glossary <random_state>`.
354
355    Returns
356    -------
357    mi : ndarray, shape (n_features,)
358        Estimated mutual information between each feature and the target.
359
360    Notes
361    -----
362    1. The term "discrete features" is used instead of naming them
363       "categorical", because it describes the essence more accurately.
364       For example, pixel intensities of an image are discrete features
365       (but hardly categorical) and you will get better results if mark them
366       as such. Also note, that treating a continuous variable as discrete and
367       vice versa will usually give incorrect results, so be attentive about
368       that.
369    2. True mutual information can't be negative. If its estimate turns out
370       to be negative, it is replaced by zero.
371
372    References
373    ----------
374    .. [1] `Mutual Information
375           <https://en.wikipedia.org/wiki/Mutual_information>`_
376           on Wikipedia.
377    .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
378           information". Phys. Rev. E 69, 2004.
379    .. [3] B. C. Ross "Mutual Information between Discrete and Continuous
380           Data Sets". PLoS ONE 9(2), 2014.
381    .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy
382           of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16
383    """
384    return _estimate_mi(X, y, discrete_features, False, n_neighbors, copy, random_state)
385
386
387def mutual_info_classif(
388    X, y, *, discrete_features="auto", n_neighbors=3, copy=True, random_state=None
389):
390    """Estimate mutual information for a discrete target variable.
391
392    Mutual information (MI) [1]_ between two random variables is a non-negative
393    value, which measures the dependency between the variables. It is equal
394    to zero if and only if two random variables are independent, and higher
395    values mean higher dependency.
396
397    The function relies on nonparametric methods based on entropy estimation
398    from k-nearest neighbors distances as described in [2]_ and [3]_. Both
399    methods are based on the idea originally proposed in [4]_.
400
401    It can be used for univariate features selection, read more in the
402    :ref:`User Guide <univariate_feature_selection>`.
403
404    Parameters
405    ----------
406    X : array-like or sparse matrix, shape (n_samples, n_features)
407        Feature matrix.
408
409    y : array-like of shape (n_samples,)
410        Target vector.
411
412    discrete_features : {'auto', bool, array-like}, default='auto'
413        If bool, then determines whether to consider all features discrete
414        or continuous. If array, then it should be either a boolean mask
415        with shape (n_features,) or array with indices of discrete features.
416        If 'auto', it is assigned to False for dense `X` and to True for
417        sparse `X`.
418
419    n_neighbors : int, default=3
420        Number of neighbors to use for MI estimation for continuous variables,
421        see [2]_ and [3]_. Higher values reduce variance of the estimation, but
422        could introduce a bias.
423
424    copy : bool, default=True
425        Whether to make a copy of the given data. If set to False, the initial
426        data will be overwritten.
427
428    random_state : int, RandomState instance or None, default=None
429        Determines random number generation for adding small noise to
430        continuous variables in order to remove repeated values.
431        Pass an int for reproducible results across multiple function calls.
432        See :term:`Glossary <random_state>`.
433
434    Returns
435    -------
436    mi : ndarray, shape (n_features,)
437        Estimated mutual information between each feature and the target.
438
439    Notes
440    -----
441    1. The term "discrete features" is used instead of naming them
442       "categorical", because it describes the essence more accurately.
443       For example, pixel intensities of an image are discrete features
444       (but hardly categorical) and you will get better results if mark them
445       as such. Also note, that treating a continuous variable as discrete and
446       vice versa will usually give incorrect results, so be attentive about
447       that.
448    2. True mutual information can't be negative. If its estimate turns out
449       to be negative, it is replaced by zero.
450
451    References
452    ----------
453    .. [1] `Mutual Information
454           <https://en.wikipedia.org/wiki/Mutual_information>`_
455           on Wikipedia.
456    .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
457           information". Phys. Rev. E 69, 2004.
458    .. [3] B. C. Ross "Mutual Information between Discrete and Continuous
459           Data Sets". PLoS ONE 9(2), 2014.
460    .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy
461           of a Random Vector:, Probl. Peredachi Inf., 23:2 (1987), 9-16
462    """
463    check_classification_targets(y)
464    return _estimate_mi(X, y, discrete_features, True, n_neighbors, copy, random_state)
465