1# Author: Nikolay Mayorov <n59_ru@hotmail.com> 2# License: 3-clause BSD 3 4import numpy as np 5from scipy.sparse import issparse 6from scipy.special import digamma 7 8from ..metrics.cluster import mutual_info_score 9from ..neighbors import NearestNeighbors, KDTree 10from ..preprocessing import scale 11from ..utils import check_random_state 12from ..utils.fixes import _astype_copy_false 13from ..utils.validation import check_array, check_X_y 14from ..utils.multiclass import check_classification_targets 15 16 17def _compute_mi_cc(x, y, n_neighbors): 18 """Compute mutual information between two continuous variables. 19 20 Parameters 21 ---------- 22 x, y : ndarray, shape (n_samples,) 23 Samples of two continuous random variables, must have an identical 24 shape. 25 26 n_neighbors : int 27 Number of nearest neighbors to search for each point, see [1]_. 28 29 Returns 30 ------- 31 mi : float 32 Estimated mutual information. If it turned out to be negative it is 33 replace by 0. 34 35 Notes 36 ----- 37 True mutual information can't be negative. If its estimate by a numerical 38 method is negative, it means (providing the method is adequate) that the 39 mutual information is close to 0 and replacing it by 0 is a reasonable 40 strategy. 41 42 References 43 ---------- 44 .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual 45 information". Phys. Rev. E 69, 2004. 46 """ 47 n_samples = x.size 48 49 x = x.reshape((-1, 1)) 50 y = y.reshape((-1, 1)) 51 xy = np.hstack((x, y)) 52 53 # Here we rely on NearestNeighbors to select the fastest algorithm. 54 nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors) 55 56 nn.fit(xy) 57 radius = nn.kneighbors()[0] 58 radius = np.nextafter(radius[:, -1], 0) 59 60 # KDTree is explicitly fit to allow for the querying of number of 61 # neighbors within a specified radius 62 kd = KDTree(x, metric="chebyshev") 63 nx = kd.query_radius(x, radius, count_only=True, return_distance=False) 64 nx = np.array(nx) - 1.0 65 66 kd = KDTree(y, metric="chebyshev") 67 ny = kd.query_radius(y, radius, count_only=True, return_distance=False) 68 ny = np.array(ny) - 1.0 69 70 mi = ( 71 digamma(n_samples) 72 + digamma(n_neighbors) 73 - np.mean(digamma(nx + 1)) 74 - np.mean(digamma(ny + 1)) 75 ) 76 77 return max(0, mi) 78 79 80def _compute_mi_cd(c, d, n_neighbors): 81 """Compute mutual information between continuous and discrete variables. 82 83 Parameters 84 ---------- 85 c : ndarray, shape (n_samples,) 86 Samples of a continuous random variable. 87 88 d : ndarray, shape (n_samples,) 89 Samples of a discrete random variable. 90 91 n_neighbors : int 92 Number of nearest neighbors to search for each point, see [1]_. 93 94 Returns 95 ------- 96 mi : float 97 Estimated mutual information. If it turned out to be negative it is 98 replace by 0. 99 100 Notes 101 ----- 102 True mutual information can't be negative. If its estimate by a numerical 103 method is negative, it means (providing the method is adequate) that the 104 mutual information is close to 0 and replacing it by 0 is a reasonable 105 strategy. 106 107 References 108 ---------- 109 .. [1] B. C. Ross "Mutual Information between Discrete and Continuous 110 Data Sets". PLoS ONE 9(2), 2014. 111 """ 112 n_samples = c.shape[0] 113 c = c.reshape((-1, 1)) 114 115 radius = np.empty(n_samples) 116 label_counts = np.empty(n_samples) 117 k_all = np.empty(n_samples) 118 nn = NearestNeighbors() 119 for label in np.unique(d): 120 mask = d == label 121 count = np.sum(mask) 122 if count > 1: 123 k = min(n_neighbors, count - 1) 124 nn.set_params(n_neighbors=k) 125 nn.fit(c[mask]) 126 r = nn.kneighbors()[0] 127 radius[mask] = np.nextafter(r[:, -1], 0) 128 k_all[mask] = k 129 label_counts[mask] = count 130 131 # Ignore points with unique labels. 132 mask = label_counts > 1 133 n_samples = np.sum(mask) 134 label_counts = label_counts[mask] 135 k_all = k_all[mask] 136 c = c[mask] 137 radius = radius[mask] 138 139 kd = KDTree(c) 140 m_all = kd.query_radius(c, radius, count_only=True, return_distance=False) 141 m_all = np.array(m_all) - 1.0 142 143 mi = ( 144 digamma(n_samples) 145 + np.mean(digamma(k_all)) 146 - np.mean(digamma(label_counts)) 147 - np.mean(digamma(m_all + 1)) 148 ) 149 150 return max(0, mi) 151 152 153def _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3): 154 """Compute mutual information between two variables. 155 156 This is a simple wrapper which selects a proper function to call based on 157 whether `x` and `y` are discrete or not. 158 """ 159 if x_discrete and y_discrete: 160 return mutual_info_score(x, y) 161 elif x_discrete and not y_discrete: 162 return _compute_mi_cd(y, x, n_neighbors) 163 elif not x_discrete and y_discrete: 164 return _compute_mi_cd(x, y, n_neighbors) 165 else: 166 return _compute_mi_cc(x, y, n_neighbors) 167 168 169def _iterate_columns(X, columns=None): 170 """Iterate over columns of a matrix. 171 172 Parameters 173 ---------- 174 X : ndarray or csc_matrix, shape (n_samples, n_features) 175 Matrix over which to iterate. 176 177 columns : iterable or None, default=None 178 Indices of columns to iterate over. If None, iterate over all columns. 179 180 Yields 181 ------ 182 x : ndarray, shape (n_samples,) 183 Columns of `X` in dense format. 184 """ 185 if columns is None: 186 columns = range(X.shape[1]) 187 188 if issparse(X): 189 for i in columns: 190 x = np.zeros(X.shape[0]) 191 start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1] 192 x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr] 193 yield x 194 else: 195 for i in columns: 196 yield X[:, i] 197 198 199def _estimate_mi( 200 X, 201 y, 202 discrete_features="auto", 203 discrete_target=False, 204 n_neighbors=3, 205 copy=True, 206 random_state=None, 207): 208 """Estimate mutual information between the features and the target. 209 210 Parameters 211 ---------- 212 X : array-like or sparse matrix, shape (n_samples, n_features) 213 Feature matrix. 214 215 y : array-like of shape (n_samples,) 216 Target vector. 217 218 discrete_features : {'auto', bool, array-like}, default='auto' 219 If bool, then determines whether to consider all features discrete 220 or continuous. If array, then it should be either a boolean mask 221 with shape (n_features,) or array with indices of discrete features. 222 If 'auto', it is assigned to False for dense `X` and to True for 223 sparse `X`. 224 225 discrete_target : bool, default=False 226 Whether to consider `y` as a discrete variable. 227 228 n_neighbors : int, default=3 229 Number of neighbors to use for MI estimation for continuous variables, 230 see [1]_ and [2]_. Higher values reduce variance of the estimation, but 231 could introduce a bias. 232 233 copy : bool, default=True 234 Whether to make a copy of the given data. If set to False, the initial 235 data will be overwritten. 236 237 random_state : int, RandomState instance or None, default=None 238 Determines random number generation for adding small noise to 239 continuous variables in order to remove repeated values. 240 Pass an int for reproducible results across multiple function calls. 241 See :term:`Glossary <random_state>`. 242 243 Returns 244 ------- 245 mi : ndarray, shape (n_features,) 246 Estimated mutual information between each feature and the target. 247 A negative value will be replaced by 0. 248 249 References 250 ---------- 251 .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual 252 information". Phys. Rev. E 69, 2004. 253 .. [2] B. C. Ross "Mutual Information between Discrete and Continuous 254 Data Sets". PLoS ONE 9(2), 2014. 255 """ 256 X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) 257 n_samples, n_features = X.shape 258 259 if isinstance(discrete_features, (str, bool)): 260 if isinstance(discrete_features, str): 261 if discrete_features == "auto": 262 discrete_features = issparse(X) 263 else: 264 raise ValueError("Invalid string value for discrete_features.") 265 discrete_mask = np.empty(n_features, dtype=bool) 266 discrete_mask.fill(discrete_features) 267 else: 268 discrete_features = check_array(discrete_features, ensure_2d=False) 269 if discrete_features.dtype != "bool": 270 discrete_mask = np.zeros(n_features, dtype=bool) 271 discrete_mask[discrete_features] = True 272 else: 273 discrete_mask = discrete_features 274 275 continuous_mask = ~discrete_mask 276 if np.any(continuous_mask) and issparse(X): 277 raise ValueError("Sparse matrix `X` can't have continuous features.") 278 279 rng = check_random_state(random_state) 280 if np.any(continuous_mask): 281 if copy: 282 X = X.copy() 283 284 if not discrete_target: 285 X[:, continuous_mask] = scale( 286 X[:, continuous_mask], with_mean=False, copy=False 287 ) 288 289 # Add small noise to continuous features as advised in Kraskov et. al. 290 X = X.astype(float, **_astype_copy_false(X)) 291 means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) 292 X[:, continuous_mask] += ( 293 1e-10 * means * rng.randn(n_samples, np.sum(continuous_mask)) 294 ) 295 296 if not discrete_target: 297 y = scale(y, with_mean=False) 298 y += 1e-10 * np.maximum(1, np.mean(np.abs(y))) * rng.randn(n_samples) 299 300 mi = [ 301 _compute_mi(x, y, discrete_feature, discrete_target, n_neighbors) 302 for x, discrete_feature in zip(_iterate_columns(X), discrete_mask) 303 ] 304 305 return np.array(mi) 306 307 308def mutual_info_regression( 309 X, y, *, discrete_features="auto", n_neighbors=3, copy=True, random_state=None 310): 311 """Estimate mutual information for a continuous target variable. 312 313 Mutual information (MI) [1]_ between two random variables is a non-negative 314 value, which measures the dependency between the variables. It is equal 315 to zero if and only if two random variables are independent, and higher 316 values mean higher dependency. 317 318 The function relies on nonparametric methods based on entropy estimation 319 from k-nearest neighbors distances as described in [2]_ and [3]_. Both 320 methods are based on the idea originally proposed in [4]_. 321 322 It can be used for univariate features selection, read more in the 323 :ref:`User Guide <univariate_feature_selection>`. 324 325 Parameters 326 ---------- 327 X : array-like or sparse matrix, shape (n_samples, n_features) 328 Feature matrix. 329 330 y : array-like of shape (n_samples,) 331 Target vector. 332 333 discrete_features : {'auto', bool, array-like}, default='auto' 334 If bool, then determines whether to consider all features discrete 335 or continuous. If array, then it should be either a boolean mask 336 with shape (n_features,) or array with indices of discrete features. 337 If 'auto', it is assigned to False for dense `X` and to True for 338 sparse `X`. 339 340 n_neighbors : int, default=3 341 Number of neighbors to use for MI estimation for continuous variables, 342 see [2]_ and [3]_. Higher values reduce variance of the estimation, but 343 could introduce a bias. 344 345 copy : bool, default=True 346 Whether to make a copy of the given data. If set to False, the initial 347 data will be overwritten. 348 349 random_state : int, RandomState instance or None, default=None 350 Determines random number generation for adding small noise to 351 continuous variables in order to remove repeated values. 352 Pass an int for reproducible results across multiple function calls. 353 See :term:`Glossary <random_state>`. 354 355 Returns 356 ------- 357 mi : ndarray, shape (n_features,) 358 Estimated mutual information between each feature and the target. 359 360 Notes 361 ----- 362 1. The term "discrete features" is used instead of naming them 363 "categorical", because it describes the essence more accurately. 364 For example, pixel intensities of an image are discrete features 365 (but hardly categorical) and you will get better results if mark them 366 as such. Also note, that treating a continuous variable as discrete and 367 vice versa will usually give incorrect results, so be attentive about 368 that. 369 2. True mutual information can't be negative. If its estimate turns out 370 to be negative, it is replaced by zero. 371 372 References 373 ---------- 374 .. [1] `Mutual Information 375 <https://en.wikipedia.org/wiki/Mutual_information>`_ 376 on Wikipedia. 377 .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual 378 information". Phys. Rev. E 69, 2004. 379 .. [3] B. C. Ross "Mutual Information between Discrete and Continuous 380 Data Sets". PLoS ONE 9(2), 2014. 381 .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy 382 of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16 383 """ 384 return _estimate_mi(X, y, discrete_features, False, n_neighbors, copy, random_state) 385 386 387def mutual_info_classif( 388 X, y, *, discrete_features="auto", n_neighbors=3, copy=True, random_state=None 389): 390 """Estimate mutual information for a discrete target variable. 391 392 Mutual information (MI) [1]_ between two random variables is a non-negative 393 value, which measures the dependency between the variables. It is equal 394 to zero if and only if two random variables are independent, and higher 395 values mean higher dependency. 396 397 The function relies on nonparametric methods based on entropy estimation 398 from k-nearest neighbors distances as described in [2]_ and [3]_. Both 399 methods are based on the idea originally proposed in [4]_. 400 401 It can be used for univariate features selection, read more in the 402 :ref:`User Guide <univariate_feature_selection>`. 403 404 Parameters 405 ---------- 406 X : array-like or sparse matrix, shape (n_samples, n_features) 407 Feature matrix. 408 409 y : array-like of shape (n_samples,) 410 Target vector. 411 412 discrete_features : {'auto', bool, array-like}, default='auto' 413 If bool, then determines whether to consider all features discrete 414 or continuous. If array, then it should be either a boolean mask 415 with shape (n_features,) or array with indices of discrete features. 416 If 'auto', it is assigned to False for dense `X` and to True for 417 sparse `X`. 418 419 n_neighbors : int, default=3 420 Number of neighbors to use for MI estimation for continuous variables, 421 see [2]_ and [3]_. Higher values reduce variance of the estimation, but 422 could introduce a bias. 423 424 copy : bool, default=True 425 Whether to make a copy of the given data. If set to False, the initial 426 data will be overwritten. 427 428 random_state : int, RandomState instance or None, default=None 429 Determines random number generation for adding small noise to 430 continuous variables in order to remove repeated values. 431 Pass an int for reproducible results across multiple function calls. 432 See :term:`Glossary <random_state>`. 433 434 Returns 435 ------- 436 mi : ndarray, shape (n_features,) 437 Estimated mutual information between each feature and the target. 438 439 Notes 440 ----- 441 1. The term "discrete features" is used instead of naming them 442 "categorical", because it describes the essence more accurately. 443 For example, pixel intensities of an image are discrete features 444 (but hardly categorical) and you will get better results if mark them 445 as such. Also note, that treating a continuous variable as discrete and 446 vice versa will usually give incorrect results, so be attentive about 447 that. 448 2. True mutual information can't be negative. If its estimate turns out 449 to be negative, it is replaced by zero. 450 451 References 452 ---------- 453 .. [1] `Mutual Information 454 <https://en.wikipedia.org/wiki/Mutual_information>`_ 455 on Wikipedia. 456 .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual 457 information". Phys. Rev. E 69, 2004. 458 .. [3] B. C. Ross "Mutual Information between Discrete and Continuous 459 Data Sets". PLoS ONE 9(2), 2014. 460 .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy 461 of a Random Vector:, Probl. Peredachi Inf., 23:2 (1987), 9-16 462 """ 463 check_classification_targets(y) 464 return _estimate_mi(X, y, discrete_features, True, n_neighbors, copy, random_state) 465