# -*- coding: utf-8 -*-
"""Distance dependence measure and the dCov test.

Implementation of Székely et al. (2007) calculation of distance
dependence statistics, including the Distance covariance (dCov) test
for independence of random vectors of arbitrary length.

Author: Ron Itzikovitch

References
----------
.. Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
   "Measuring and testing dependence by correlation of distances".
   Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.

"""
from collections import namedtuple
import warnings

import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.stats import norm

try:
    from statsmodels.tools.sm_exceptions import HypothesisTestWarning
except ImportError:  # NOTE(review): fallback so the module is importable
    # standalone (e.g. for isolated testing); inside the statsmodels tree
    # the import above always succeeds and this branch is never taken.
    class HypothesisTestWarning(UserWarning):
        """Warning issued when a hypothesis test falls back to an
        alternative method of computing the p-value."""


# Container for the statistics computed by ``distance_statistics``.
DistDependStat = namedtuple(
    "DistDependStat",
    ["test_statistic", "distance_correlation",
     "distance_covariance", "dvar_x", "dvar_y", "S"],
)


def distance_covariance_test(x, y, B=None, method="auto"):
    r"""The Distance Covariance (dCov) test

    Apply the Distance Covariance (dCov) test of independence to `x` and `y`.
    This test was introduced in [1]_, and is based on the distance covariance
    statistic. The test is applicable to random vectors of arbitrary length
    (see the notes section for more details).

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.
    y : array_like, 1-D or 2-D
        Same as `x`, but only the number of observation has to match that of
        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
        number of components in the random vector) does not need to match
        the number of columns in `x`.
    B : int, optional, default=`None`
        The number of iterations to perform when evaluating the null
        distribution of the test statistic when the `emp` method is
        applied (see below). if `B` is `None` than as in [1]_ we set
        `B` to be ``B = 200 + 5000/n``, where `n` is the number of
        observations.
    method : {'auto', 'emp', 'asym'}, optional, default=auto
        The method by which to obtain the p-value for the test.

        - `auto` : Default method. The number of observations will be used to
          determine the method.
        - `emp` : Empirical evaluation of the p-value using permutations of
          the rows of `y` to obtain the null distribution.
        - `asym` : An asymptotic approximation of the distribution of the test
          statistic is used to find the p-value.

    Returns
    -------
    test_statistic : float
        The value of the test statistic used in the test.
    pval : float
        The p-value.
    chosen_method : str
        The method that was used to obtain the p-value. Mostly relevant when
        the function is called with `method='auto'`. Note that when the
        empirical p-value is exactly 0 or 1 the p-value is recomputed with
        the asymptotic approximation (a ``HypothesisTestWarning`` is issued)
        while ``chosen_method`` still reports ``'emp'`` and the returned
        test statistic remains the empirical one.

    Notes
    -----
    The test applies to random vectors of arbitrary dimensions, i.e., `x`
    can be a 1-D vector of observations for a single random variable while
    `y` can be a `k` by `n` 2-D array (where `k > 1`). In other words, it
    is also possible for `x` and `y` to both be 2-D arrays and have the
    same number of rows (observations) while differing in the number of
    columns.

    As noted in [1]_ the statistics are sensitive to all types of departures
    from independence, including nonlinear or nonmonotone dependence
    structure.

    References
    ----------
    .. [1] Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
       "Measuring and testing dependence by correlation of distances".
       Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.

    Examples
    --------
    >>> from statsmodels.stats.dist_dependence_measures import
    ... distance_covariance_test
    >>> data = np.random.rand(1000, 10)
    >>> x, y = data[:, :3], data[:, 3:]
    >>> x.shape
    (1000, 3)
    >>> y.shape
    (1000, 7)
    >>> distance_covariance_test(x, y)
    (1.0426404792714983, 0.2971148340813543, 'asym')
    # (test_statistic, pval, chosen_method)

    """
    x, y = _validate_and_transform_x_and_y(x, y)

    n = x.shape[0]
    stats = distance_statistics(x, y)

    # Parentheses make the precedence explicit: an explicit method request
    # always wins; 'auto' picks 'emp' for small samples (permutations are
    # affordable) and 'asym' for large ones.
    if (method == "auto" and n <= 500) or method == "emp":
        chosen_method = "emp"
        test_statistic, pval = _empirical_pvalue(x, y, B, n, stats)

    elif (method == "auto" and n > 500) or method == "asym":
        chosen_method = "asym"
        test_statistic, pval = _asymptotic_pvalue(stats)

    else:
        raise ValueError("Unknown 'method' parameter: {}".format(method))

    # In case we got an extreme p-value (0 or 1) when using the empirical
    # distribution of the test statistic under the null, we fall back
    # to the asymptotic approximation. The empirical test statistic is
    # kept; only the p-value is replaced.
    if chosen_method == "emp" and pval in [0, 1]:
        msg = (
            f"p-value was {pval} when using the empirical method. "
            "The asymptotic approximation will be used instead"
        )
        warnings.warn(msg, HypothesisTestWarning)
        _, pval = _asymptotic_pvalue(stats)

    return test_statistic, pval, chosen_method


def _validate_and_transform_x_and_y(x, y):
    r"""Ensure `x` and `y` have proper shape and transform/reshape them if
    required.

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.
    y : array_like, 1-D or 2-D
        Same as `x`, but only the number of observation has to match that of
        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
        number of components in the random vector) does not need to match
        the number of columns in `x`.

    Returns
    -------
    x : array_like, 1-D or 2-D
    y : array_like, 1-D or 2-D

    Raises
    ------
    ValueError
        If `x` and `y` have a different number of observations.

    """
    x = np.asanyarray(x)
    y = np.asanyarray(y)

    if x.shape[0] != y.shape[0]:
        raise ValueError(
            "x and y must have the same number of observations (rows)."
        )

    # Promote 1-D vectors to single-column 2-D arrays so that downstream
    # code can uniformly treat rows as observations.
    if len(x.shape) == 1:
        x = x.reshape((x.shape[0], 1))

    if len(y.shape) == 1:
        y = y.reshape((y.shape[0], 1))

    return x, y


# Backward-compatible alias preserving the historical (misspelled) name.
_validate_and_tranform_x_and_y = _validate_and_transform_x_and_y


def _empirical_pvalue(x, y, B, n, stats):
    r"""Calculate the empirical p-value based on permutations of `y`'s rows

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.
    y : array_like, 1-D or 2-D
        Same as `x`, but only the number of observation has to match that of
        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
        number of components in the random vector) does not need to match
        the number of columns in `x`.
    B : int
        The number of iterations when evaluating the null distribution.
    n : Number of observations found in each of `x` and `y`.
    stats: namedtuple
        The result obtained from calling ``distance_statistics(x, y)``.

    Returns
    -------
    test_statistic : float
        The empirical test statistic.
    pval : float
        The empirical p-value.

    """
    # Default permutation count from [1]_: B = 200 + 5000/n.
    B = int(B) if B else int(np.floor(200 + 5000 / n))
    empirical_dist = _get_test_statistic_distribution(x, y, B)
    # searchsorted counts permuted statistics strictly below the observed
    # one, so pval is the fraction of the null distribution >= observed.
    pval = 1 - np.searchsorted(
        np.sort(empirical_dist), stats.test_statistic
    ) / len(empirical_dist)
    test_statistic = stats.test_statistic

    return test_statistic, pval


def _asymptotic_pvalue(stats):
    r"""Calculate the p-value based on an approximation of the distribution of
    the test statistic under the null.

    Parameters
    ----------
    stats: namedtuple
        The result obtained from calling ``distance_statistics(x, y)``.

    Returns
    -------
    test_statistic : float
        The test statistic.
    pval : float
        The asymptotic p-value.

    """
    # Two-sided p-value from a standard normal approximation of the
    # normalized statistic sqrt(n * dcov^2 / S).
    test_statistic = np.sqrt(stats.test_statistic / stats.S)
    pval = (1 - norm.cdf(test_statistic)) * 2

    return test_statistic, pval


def _get_test_statistic_distribution(x, y, B):
    r"""Evaluate the null distribution of the test statistic by permutation.

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.
    y : array_like, 1-D or 2-D
        Same as `x`, but only the number of observation has to match that of
        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
        number of components in the random vector) does not need to match
        the number of columns in `x`.
    B : int
        The number of iterations to perform when evaluating the null
        distribution.

    Returns
    -------
    emp_dist : array_like
        The empirical distribution of the test statistic.

    """
    # Work on a copy: np.random.shuffle permutes rows in place and the
    # caller's `y` must not be modified.
    y = y.copy()
    emp_dist = np.zeros(B)
    # x's distance matrix is permutation-invariant, so compute it once and
    # reuse it for every permutation iteration.
    x_dist = squareform(pdist(x, "euclidean"))

    for i in range(B):
        np.random.shuffle(y)
        emp_dist[i] = distance_statistics(x, y, x_dist=x_dist).test_statistic

    return emp_dist


def distance_statistics(x, y, x_dist=None, y_dist=None):
    r"""Calculate various distance dependence statistics.

    Calculate several distance dependence statistics as described in [1]_.

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.
    y : array_like, 1-D or 2-D
        Same as `x`, but only the number of observation has to match that of
        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
        number of components in the random vector) does not need to match
        the number of columns in `x`.
    x_dist : array_like, 2-D, optional
        A square 2-D array_like object whose values are the euclidean
        distances between `x`'s rows.
    y_dist : array_like, 2-D, optional
        A square 2-D array_like object whose values are the euclidean
        distances between `y`'s rows.

    Returns
    -------
    namedtuple
        A named tuple of distance dependence statistics (DistDependStat) with
        the following values:

        - test_statistic : float - The "basic" test statistic (i.e., the one
          used when the `emp` method is chosen when calling
          ``distance_covariance_test()``
        - distance_correlation : float - The distance correlation
          between `x` and `y`.
        - distance_covariance : float - The distance covariance of
          `x` and `y`.
        - dvar_x : float - The distance variance of `x`.
        - dvar_y : float - The distance variance of `y`.
        - S : float - The mean of the euclidean distances in `x` multiplied
          by those of `y`. Mostly used internally.

    References
    ----------
    .. [1] Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
       "Measuring and testing dependence by correlation of distances".
       Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.

    Examples
    --------

    >>> from statsmodels.stats.dist_dependence_measures import
    ... distance_statistics
    >>> distance_statistics(np.random.random(1000), np.random.random(1000))
    DistDependStat(test_statistic=0.07948284320205831,
    distance_correlation=0.04269511890990793,
    distance_covariance=0.008915315092696293,
    dvar_x=0.20719027438266704, dvar_y=0.21044934264957588,
    S=0.10892061635588891)

    """
    x, y = _validate_and_transform_x_and_y(x, y)

    n = x.shape[0]

    # Pairwise euclidean distance matrices; callers may pass precomputed
    # matrices (e.g. the permutation loop reuses x's matrix).
    a = x_dist if x_dist is not None else squareform(pdist(x, "euclidean"))
    b = y_dist if y_dist is not None else squareform(pdist(y, "euclidean"))

    a_row_means = a.mean(axis=0, keepdims=True)
    b_row_means = b.mean(axis=0, keepdims=True)
    a_col_means = a.mean(axis=1, keepdims=True)
    b_col_means = b.mean(axis=1, keepdims=True)
    a_mean = a.mean()
    b_mean = b.mean()

    # Double-centered distance matrices (subtract row and column means,
    # add back the grand mean), as in Szekely et al. (2007).
    A = a - a_row_means - a_col_means + a_mean
    B = b - b_row_means - b_col_means + b_mean

    S = a_mean * b_mean
    dcov = np.sqrt(np.multiply(A, B).mean())
    dvar_x = np.sqrt(np.multiply(A, A).mean())
    dvar_y = np.sqrt(np.multiply(B, B).mean())
    dcor = dcov / np.sqrt(dvar_x * dvar_y)

    # n * dcov^2 is the statistic used by the empirical (permutation) test.
    test_statistic = n * dcov ** 2

    return DistDependStat(
        test_statistic=test_statistic,
        distance_correlation=dcor,
        distance_covariance=dcov,
        dvar_x=dvar_x,
        dvar_y=dvar_y,
        S=S,
    )


def distance_covariance(x, y):
    r"""Distance covariance.

    Calculate the empirical distance covariance as described in [1]_.

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.
    y : array_like, 1-D or 2-D
        Same as `x`, but only the number of observation has to match that of
        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
        number of components in the random vector) does not need to match
        the number of columns in `x`.

    Returns
    -------
    float
        The empirical distance covariance between `x` and `y`.

    References
    ----------
    .. [1] Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
       "Measuring and testing dependence by correlation of distances".
       Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.

    Examples
    --------

    >>> from statsmodels.stats.dist_dependence_measures import
    ... distance_covariance
    >>> distance_covariance(np.random.random(1000), np.random.random(1000))
    0.007575063951951362

    """
    return distance_statistics(x, y).distance_covariance


def distance_variance(x):
    r"""Distance variance.

    Calculate the empirical distance variance as described in [1]_.

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.

    Returns
    -------
    float
        The empirical distance variance of `x`.

    References
    ----------
    .. [1] Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
       "Measuring and testing dependence by correlation of distances".
       Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.

    Examples
    --------

    >>> from statsmodels.stats.dist_dependence_measures import
    ... distance_variance
    >>> distance_variance(np.random.random(1000))
    0.21732609190659702

    """
    # The distance variance is the distance covariance of x with itself.
    return distance_covariance(x, x)


def distance_correlation(x, y):
    r"""Distance correlation.

    Calculate the empirical distance correlation as described in [1]_.
    This statistic is analogous to product-moment correlation and describes
    the dependence between `x` and `y`, which are random vectors of
    arbitrary length. The statistics' values range between 0 (implies
    independence) and 1 (implies complete dependence).

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.
    y : array_like, 1-D or 2-D
        Same as `x`, but only the number of observation has to match that of
        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
        number of components in the random vector) does not need to match
        the number of columns in `x`.

    Returns
    -------
    float
        The empirical distance correlation between `x` and `y`.

    References
    ----------
    .. [1] Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
       "Measuring and testing dependence by correlation of distances".
       Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.

    Examples
    --------

    >>> from statsmodels.stats.dist_dependence_measures import
    ... distance_correlation
    >>> distance_correlation(np.random.random(1000), np.random.random(1000))
    0.04060497840149489

    """
    return distance_statistics(x, y).distance_correlation