1# -*- coding: utf-8 -*-
2""" Distance dependence measure and the dCov test.
3
4Implementation of Székely et al. (2007) calculation of distance
5dependence statistics, including the Distance covariance (dCov) test
6for independence of random vectors of arbitrary length.
7
8Author: Ron Itzikovitch
9
10References
11----------
12.. Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
13   "Measuring and testing dependence by correlation of distances".
14   Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.
15
16"""
17from collections import namedtuple
18import warnings
19
20import numpy as np
21from scipy.spatial.distance import pdist, squareform
22from scipy.stats import norm
23
24from statsmodels.tools.sm_exceptions import HypothesisTestWarning
25
26DistDependStat = namedtuple(
27    "DistDependStat",
28    ["test_statistic", "distance_correlation",
29     "distance_covariance", "dvar_x", "dvar_y", "S"],
30)
31
32
33def distance_covariance_test(x, y, B=None, method="auto"):
34    r"""The Distance Covariance (dCov) test
35
36    Apply the Distance Covariance (dCov) test of independence to `x` and `y`.
37    This test was introduced in [1]_, and is based on the distance covariance
38    statistic. The test is applicable to random vectors of arbitrary length
39    (see the notes section for more details).
40
41    Parameters
42    ----------
43    x : array_like, 1-D or 2-D
44        If `x` is 1-D than it is assumed to be a vector of observations of a
45        single random variable. If `x` is 2-D than the rows should be
46        observations and the columns are treated as the components of a
47        random vector, i.e., each column represents a different component of
48        the random vector `x`.
49    y : array_like, 1-D or 2-D
50        Same as `x`, but only the number of observation has to match that of
51        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
52        number of components in the random vector) does not need to match
53        the number of columns in `x`.
54    B : int, optional, default=`None`
55        The number of iterations to perform when evaluating the null
56        distribution of the test statistic when the `emp` method is
57        applied (see below). if `B` is `None` than as in [1]_ we set
58        `B` to be ``B = 200 + 5000/n``, where `n` is the number of
59        observations.
60    method : {'auto', 'emp', 'asym'}, optional, default=auto
61        The method by which to obtain the p-value for the test.
62
63        - `auto` : Default method. The number of observations will be used to
64          determine the method.
65        - `emp` : Empirical evaluation of the p-value using permutations of
66          the rows of `y` to obtain the null distribution.
67        - `asym` : An asymptotic approximation of the distribution of the test
68          statistic is used to find the p-value.
69
70    Returns
71    -------
72    test_statistic : float
73        The value of the test statistic used in the test.
74    pval : float
75        The p-value.
76    chosen_method : str
77        The method that was used to obtain the p-value. Mostly relevant when
78        the function is called with `method='auto'`.
79
80    Notes
81    -----
82    The test applies to random vectors of arbitrary dimensions, i.e., `x`
83    can be a 1-D vector of observations for a single random variable while
84    `y` can be a `k` by `n` 2-D array (where `k > 1`). In other words, it
85    is also possible for `x` and `y` to both be 2-D arrays and have the
86    same number of rows (observations) while differing in the number of
87    columns.
88
89    As noted in [1]_ the statistics are sensitive to all types of departures
90    from independence, including nonlinear or nonmonotone dependence
91    structure.
92
93    References
94    ----------
95    .. [1] Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
96       "Measuring and testing by correlation of distances".
97       Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.
98
99    Examples
100    --------
101    >>> from statsmodels.stats.dist_dependence_measures import
102    ... distance_covariance_test
103    >>> data = np.random.rand(1000, 10)
104    >>> x, y = data[:, :3], data[:, 3:]
105    >>> x.shape
106    (1000, 3)
107    >>> y.shape
108    (1000, 7)
109    >>> distance_covariance_test(x, y)
110    (1.0426404792714983, 0.2971148340813543, 'asym')
111    # (test_statistic, pval, chosen_method)
112
113    """
114    x, y = _validate_and_tranform_x_and_y(x, y)
115
116    n = x.shape[0]
117    stats = distance_statistics(x, y)
118
119    if method == "auto" and n <= 500 or method == "emp":
120        chosen_method = "emp"
121        test_statistic, pval = _empirical_pvalue(x, y, B, n, stats)
122
123    elif method == "auto" and n > 500 or method == "asym":
124        chosen_method = "asym"
125        test_statistic, pval = _asymptotic_pvalue(stats)
126
127    else:
128        raise ValueError("Unknown 'method' parameter: {}".format(method))
129
130    # In case we got an extreme p-value (0 or 1) when using the empirical
131    # distribution of the test statistic under the null, we fall back
132    # to the asymptotic approximation.
133    if chosen_method == "emp" and pval in [0, 1]:
134        msg = (
135            f"p-value was {pval} when using the empirical method. "
136            "The asymptotic approximation will be used instead"
137        )
138        warnings.warn(msg, HypothesisTestWarning)
139        _, pval = _asymptotic_pvalue(stats)
140
141    return test_statistic, pval, chosen_method
142
143
144def _validate_and_tranform_x_and_y(x, y):
145    r"""Ensure `x` and `y` have proper shape and transform/reshape them if
146    required.
147
148    Parameters
149    ----------
150    x : array_like, 1-D or 2-D
151        If `x` is 1-D than it is assumed to be a vector of observations of a
152        single random variable. If `x` is 2-D than the rows should be
153        observations and the columns are treated as the components of a
154        random vector, i.e., each column represents a different component of
155        the random vector `x`.
156    y : array_like, 1-D or 2-D
157        Same as `x`, but only the number of observation has to match that of
158        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
159        number of components in the random vector) does not need to match
160        the number of columns in `x`.
161
162    Returns
163    -------
164    x : array_like, 1-D or 2-D
165    y : array_like, 1-D or 2-D
166
167    Raises
168    ------
169    ValueError
170        If `x` and `y` have a different number of observations.
171
172    """
173    x = np.asanyarray(x)
174    y = np.asanyarray(y)
175
176    if x.shape[0] != y.shape[0]:
177        raise ValueError(
178            "x and y must have the same number of observations (rows)."
179        )
180
181    if len(x.shape) == 1:
182        x = x.reshape((x.shape[0], 1))
183
184    if len(y.shape) == 1:
185        y = y.reshape((y.shape[0], 1))
186
187    return x, y
188
189
190def _empirical_pvalue(x, y, B, n, stats):
191    r"""Calculate the empirical p-value based on permutations of `y`'s rows
192
193    Parameters
194    ----------
195    x : array_like, 1-D or 2-D
196        If `x` is 1-D than it is assumed to be a vector of observations of a
197        single random variable. If `x` is 2-D than the rows should be
198        observations and the columns are treated as the components of a
199        random vector, i.e., each column represents a different component of
200        the random vector `x`.
201    y : array_like, 1-D or 2-D
202        Same as `x`, but only the number of observation has to match that of
203        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
204        number of components in the random vector) does not need to match
205        the number of columns in `x`.
206    B : int
207        The number of iterations when evaluating the null distribution.
208    n : Number of observations found in each of `x` and `y`.
209    stats: namedtuple
210        The result obtained from calling ``distance_statistics(x, y)``.
211
212    Returns
213    -------
214    test_statistic : float
215        The empirical test statistic.
216    pval : float
217        The empirical p-value.
218
219    """
220    B = int(B) if B else int(np.floor(200 + 5000 / n))
221    empirical_dist = _get_test_statistic_distribution(x, y, B)
222    pval = 1 - np.searchsorted(
223        sorted(empirical_dist), stats.test_statistic
224    ) / len(empirical_dist)
225    test_statistic = stats.test_statistic
226
227    return test_statistic, pval
228
229
230def _asymptotic_pvalue(stats):
231    r"""Calculate the p-value based on an approximation of the distribution of
232    the test statistic under the null.
233
234    Parameters
235    ----------
236    stats: namedtuple
237        The result obtained from calling ``distance_statistics(x, y)``.
238
239    Returns
240    -------
241    test_statistic : float
242        The test statistic.
243    pval : float
244        The asymptotic p-value.
245
246    """
247    test_statistic = np.sqrt(stats.test_statistic / stats.S)
248    pval = (1 - norm.cdf(test_statistic)) * 2
249
250    return test_statistic, pval
251
252
253def _get_test_statistic_distribution(x, y, B):
254    r"""
255    Parameters
256    ----------
257    x : array_like, 1-D or 2-D
258        If `x` is 1-D than it is assumed to be a vector of observations of a
259        single random variable. If `x` is 2-D than the rows should be
260        observations and the columns are treated as the components of a
261        random vector, i.e., each column represents a different component of
262        the random vector `x`.
263    y : array_like, 1-D or 2-D
264        Same as `x`, but only the number of observation has to match that of
265        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
266        number of components in the random vector) does not need to match
267        the number of columns in `x`.
268    B : int
269        The number of iterations to perform when evaluating the null
270        distribution.
271
272    Returns
273    -------
274    emp_dist : array_like
275        The empirical distribution of the test statistic.
276
277    """
278    y = y.copy()
279    emp_dist = np.zeros(B)
280    x_dist = squareform(pdist(x, "euclidean"))
281
282    for i in range(B):
283        np.random.shuffle(y)
284        emp_dist[i] = distance_statistics(x, y, x_dist=x_dist).test_statistic
285
286    return emp_dist
287
288
289def distance_statistics(x, y, x_dist=None, y_dist=None):
290    r"""Calculate various distance dependence statistics.
291
292    Calculate several distance dependence statistics as described in [1]_.
293
294    Parameters
295    ----------
296    x : array_like, 1-D or 2-D
297        If `x` is 1-D than it is assumed to be a vector of observations of a
298        single random variable. If `x` is 2-D than the rows should be
299        observations and the columns are treated as the components of a
300        random vector, i.e., each column represents a different component of
301        the random vector `x`.
302    y : array_like, 1-D or 2-D
303        Same as `x`, but only the number of observation has to match that of
304        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
305        number of components in the random vector) does not need to match
306        the number of columns in `x`.
307    x_dist : array_like, 2-D, optional
308        A square 2-D array_like object whose values are the euclidean
309        distances between `x`'s rows.
310    y_dist : array_like, 2-D, optional
311        A square 2-D array_like object whose values are the euclidean
312        distances between `y`'s rows.
313
314    Returns
315    -------
316    namedtuple
317        A named tuple of distance dependence statistics (DistDependStat) with
318        the following values:
319
320        - test_statistic : float - The "basic" test statistic (i.e., the one
321          used when the `emp` method is chosen when calling
322          ``distance_covariance_test()``
323        - distance_correlation : float - The distance correlation
324          between `x` and `y`.
325        - distance_covariance : float - The distance covariance of
326          `x` and `y`.
327        - dvar_x : float - The distance variance of `x`.
328        - dvar_y : float - The distance variance of `y`.
329        - S : float - The mean of the euclidean distances in `x` multiplied
330          by those of `y`. Mostly used internally.
331
332    References
333    ----------
334    .. [1] Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
335       "Measuring and testing dependence by correlation of distances".
336       Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.
337
338    Examples
339    --------
340
341    >>> from statsmodels.stats.dist_dependence_measures import
342    ... distance_statistics
343    >>> distance_statistics(np.random.random(1000), np.random.random(1000))
344    DistDependStat(test_statistic=0.07948284320205831,
345    distance_correlation=0.04269511890990793,
346    distance_covariance=0.008915315092696293,
347    dvar_x=0.20719027438266704, dvar_y=0.21044934264957588,
348    S=0.10892061635588891)
349
350    """
351    x, y = _validate_and_tranform_x_and_y(x, y)
352
353    n = x.shape[0]
354
355    a = x_dist if x_dist is not None else squareform(pdist(x, "euclidean"))
356    b = y_dist if y_dist is not None else squareform(pdist(y, "euclidean"))
357
358    a_row_means = a.mean(axis=0, keepdims=True)
359    b_row_means = b.mean(axis=0, keepdims=True)
360    a_col_means = a.mean(axis=1, keepdims=True)
361    b_col_means = b.mean(axis=1, keepdims=True)
362    a_mean = a.mean()
363    b_mean = b.mean()
364
365    A = a - a_row_means - a_col_means + a_mean
366    B = b - b_row_means - b_col_means + b_mean
367
368    S = a_mean * b_mean
369    dcov = np.sqrt(np.multiply(A, B).mean())
370    dvar_x = np.sqrt(np.multiply(A, A).mean())
371    dvar_y = np.sqrt(np.multiply(B, B).mean())
372    dcor = dcov / np.sqrt(dvar_x * dvar_y)
373
374    test_statistic = n * dcov ** 2
375
376    return DistDependStat(
377        test_statistic=test_statistic,
378        distance_correlation=dcor,
379        distance_covariance=dcov,
380        dvar_x=dvar_x,
381        dvar_y=dvar_y,
382        S=S,
383    )
384
385
386def distance_covariance(x, y):
387    r"""Distance covariance.
388
389    Calculate the empirical distance covariance as described in [1]_.
390
391    Parameters
392    ----------
393    x : array_like, 1-D or 2-D
394        If `x` is 1-D than it is assumed to be a vector of observations of a
395        single random variable. If `x` is 2-D than the rows should be
396        observations and the columns are treated as the components of a
397        random vector, i.e., each column represents a different component of
398        the random vector `x`.
399    y : array_like, 1-D or 2-D
400        Same as `x`, but only the number of observation has to match that of
401        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
402        number of components in the random vector) does not need to match
403        the number of columns in `x`.
404
405    Returns
406    -------
407    float
408        The empirical distance covariance between `x` and `y`.
409
410    References
411    ----------
412    .. [1] Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
413       "Measuring and testing dependence by correlation of distances".
414       Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.
415
416    Examples
417    --------
418
419    >>> from statsmodels.stats.dist_dependence_measures import
420    ... distance_covariance
421    >>> distance_covariance(np.random.random(1000), np.random.random(1000))
422    0.007575063951951362
423
424    """
425    return distance_statistics(x, y).distance_covariance
426
427
428def distance_variance(x):
429    r"""Distance variance.
430
431    Calculate the empirical distance variance as described in [1]_.
432
433    Parameters
434    ----------
435    x : array_like, 1-D or 2-D
436        If `x` is 1-D than it is assumed to be a vector of observations of a
437        single random variable. If `x` is 2-D than the rows should be
438        observations and the columns are treated as the components of a
439        random vector, i.e., each column represents a different component of
440        the random vector `x`.
441
442    Returns
443    -------
444    float
445        The empirical distance variance of `x`.
446
447    References
448    ----------
449    .. [1] Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
450       "Measuring and testing dependence by correlation of distances".
451       Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.
452
453    Examples
454    --------
455
456    >>> from statsmodels.stats.dist_dependence_measures import
457    ... distance_variance
458    >>> distance_variance(np.random.random(1000))
459    0.21732609190659702
460
461    """
462    return distance_covariance(x, x)
463
464
465def distance_correlation(x, y):
466    r"""Distance correlation.
467
468    Calculate the empirical distance correlation as described in [1]_.
469    This statistic is analogous to product-moment correlation and describes
470    the dependence between `x` and `y`, which are random vectors of
471    arbitrary length. The statistics' values range between 0 (implies
472    independence) and 1 (implies complete dependence).
473
474    Parameters
475    ----------
476    x : array_like, 1-D or 2-D
477        If `x` is 1-D than it is assumed to be a vector of observations of a
478        single random variable. If `x` is 2-D than the rows should be
479        observations and the columns are treated as the components of a
480        random vector, i.e., each column represents a different component of
481        the random vector `x`.
482    y : array_like, 1-D or 2-D
483        Same as `x`, but only the number of observation has to match that of
484        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
485        number of components in the random vector) does not need to match
486        the number of columns in `x`.
487
488    Returns
489    -------
490    float
491        The empirical distance correlation between `x` and `y`.
492
493    References
494    ----------
495    .. [1] Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
496       "Measuring and testing dependence by correlation of distances".
497       Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.
498
499    Examples
500    --------
501
502    >>> from statsmodels.stats.dist_dependence_measures import
503    ... distance_correlation
504    >>> distance_correlation(np.random.random(1000), np.random.random(1000))
505    0.04060497840149489
506
507    """
508    return distance_statistics(x, y).distance_correlation
509