1"""
2Implementation of algorithm for sparse multi-subjects learning of Gaussian
3graphical models.
4"""
5# Authors: Philippe Gervais
6# License: simplified BSD
7
8import warnings
9import collections.abc
10import operator
11import itertools
12
13import numpy as np
14import scipy.linalg
15
16from joblib import Memory, delayed, Parallel
17from sklearn.base import BaseEstimator
18from sklearn.covariance import empirical_covariance
19from sklearn.model_selection import check_cv
20from sklearn.utils.extmath import fast_logdet
21
22from .._utils import CacheMixin
23from .._utils import logger
24from .._utils.extmath import is_spd
25
26
27def compute_alpha_max(emp_covs, n_samples):
28    """Compute the critical value of the regularization parameter.
29
30    Above this value, the precisions matrices computed by
31    group_sparse_covariance are diagonal (complete sparsity)
32
33    This function also returns the value below which the precision
34    matrices are fully dense (i.e. minimal number of zero coefficients).
35
36    The formula used in this function was derived using the same method
37    as in :footcite:`Duchi2012`.
38
39    Parameters
40    ----------
41    emp_covs : array-like, shape (n_features, n_features, n_subjects)
42        covariance matrix for each subject.
43
44    n_samples : array-like, shape (n_subjects,)
45        number of samples used in the computation of every covariance matrix.
46        n_samples.sum() can be arbitrary.
47
48    Returns
49    -------
50    alpha_max : float
51        minimal value for the regularization parameter that gives a
52        fully sparse matrix.
53
54    alpha_min : float
55        minimal value for the regularization parameter that gives a fully
56        dense matrix.
57
58    References
59    ---------
60    .. footbibliography::
61
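    Examples
    --------
    A minimal usage sketch on synthetic signals (shapes and values are
    illustrative only):

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> signals = [rng.randn(40, 5) for _ in range(3)]
    >>> emp_covs, n_samples = empirical_covariances(signals)
    >>> alpha_max, alpha_min = compute_alpha_max(emp_covs, n_samples)
    >>> bool(0. < alpha_min <= alpha_max)
    True
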
62    """
63    A = np.copy(emp_covs)
64    n_samples = np.asarray(n_samples).copy()
65    n_samples /= n_samples.sum()
66
67    for k in range(emp_covs.shape[-1]):
68        # Set diagonal to zero
69        A[..., k].flat[::A.shape[0] + 1] = 0
70        A[..., k] *= n_samples[k]
71
72    norms = np.sqrt((A ** 2).sum(axis=-1))
73
74    return np.max(norms), np.min(norms[norms > 0])
75
76
77def _update_submatrix(full, sub, sub_inv, p, h, v):
78    """Update submatrix and its inverse.
79
80    sub_inv is the inverse of the submatrix of "full" obtained by removing
81    the p-th row and column.
82
83    sub_inv is modified in-place. After execution of this function, it contains
84    the inverse of the submatrix of "full" obtained by removing the n+1-th row
85    and column.
86
87    This computation is based on the Sherman-Woodbury-Morrison identity.
88
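    Examples
    --------
    A small sketch of the update invariant on a hypothetical SPD matrix (the
    helper buffers h and v are normally preallocated by the caller):

    >>> import numpy as np
    >>> import scipy.linalg
    >>> rng = np.random.RandomState(0)
    >>> full = rng.randn(5, 5)
    >>> full = np.dot(full, full.T) + 5. * np.eye(5)  # make it SPD
    >>> sub = full[1:, 1:].copy()        # row/column 0 removed
    >>> sub_inv = scipy.linalg.inv(sub)
    >>> h = np.empty(4)
    >>> v = np.empty(4)
    >>> _update_submatrix(full, sub, sub_inv, 1, h, v)
    >>> _assert_submatrix(full, sub, 1)  # sub now has row/column 1 removed
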
89    """
90    n = p - 1
91    v[:n + 1] = full[:n + 1, n]
92    v[n + 1:] = full[n + 2:, n]
93    h[:n + 1] = full[n, :n + 1]
94    h[n + 1:] = full[n, n + 2:]
95
96    # change row: first usage of SWM identity
97    coln = sub_inv[:, n:n + 1]  # 2d array, useful for sub_inv below
98    V = h - sub[n, :]
99    coln = coln / (1. + np.dot(V, coln))
100    # The following line is equivalent to
101    # sub_inv -= np.outer(coln, np.dot(V, sub_inv))
102    sub_inv -= np.dot(coln, np.dot(V, sub_inv)[np.newaxis, :])
103    sub[n, :] = h
104
105    # change column: second usage of SWM identity
106    rown = sub_inv[n:n + 1, :]  # 2d array, useful for sub_inv below
107    U = v - sub[:, n]
108    rown = rown / (1. + np.dot(rown, U))
109    # The following line is equivalent to (but faster)
110    # sub_inv -= np.outer(np.dot(sub_inv, U), rown)
111    sub_inv -= np.dot(np.dot(sub_inv, U)[:, np.newaxis], rown)
112    sub[:, n] = v   # equivalent to sub[n, :] += U
113
114    # Make sub_inv symmetric (overcome some numerical limitations)
115    sub_inv += sub_inv.T.copy()
116    sub_inv /= 2.
117
118
119def _assert_submatrix(full, sub, n):
120    """Check that "sub" is the matrix obtained by removing the p-th col and row
121    in "full". Used only for debugging.
122
123    """
124    true_sub = np.empty_like(sub)
125    true_sub[:n, :n] = full[:n, :n]
126    true_sub[n:, n:] = full[n + 1:, n + 1:]
127    true_sub[:n, n:] = full[:n, n + 1:]
128    true_sub[n:, :n] = full[n + 1:, :n]
129
130    np.testing.assert_almost_equal(true_sub, sub)
131
132
133def group_sparse_covariance(subjects, alpha, max_iter=50, tol=1e-3, verbose=0,
134                            probe_function=None, precisions_init=None,
135                            debug=False):
136    """Compute sparse precision matrices and covariance matrices.
137
138    The precision matrices returned by this function are sparse, and share a
139    common sparsity pattern: all have zeros at the same location. This is
140    achieved by simultaneous computation of all precision matrices at the
141    same time.
142
143    Running time is linear on max_iter, and number of subjects (len(subjects)),
144    but cubic on number of features (subjects[0].shape[1]).
145
146    The present algorithm is based on :footcite:`Honorio2015`.
147
148    Parameters
149    ----------
150    subjects : list of numpy.ndarray
151        input subjects. Each subject is a 2D array, whose columns contain
152        signals. Each array shape must be (sample number, feature number).
153        The sample number can vary from subject to subject, but all subjects
154        must have the same number of features (i.e. of columns).
155
156    alpha : float
157        regularization parameter. With normalized covariances matrices and
158        number of samples, sensible values lie in the [0, 1] range(zero is
159        no regularization: output is not sparse)
160
161    max_iter : int, optional
162        maximum number of iterations. Default=50.
163
164    tol : positive float or None, optional
165        The tolerance to declare convergence: if the duality gap goes below
166        this value, optimization is stopped. If None, no check is performed.
167        Default=1e-3.
168
169    verbose : int, optional
170        verbosity level. Zero means "no message". Default=0.
171
172    probe_function : callable or None, optional
173        This value is called before the first iteration and after each
174        iteration. If it returns True, then optimization is stopped
175        prematurely.
176        The function is given as arguments (in that order):
177
178        - empirical covariances (ndarray),
179        - number of samples for each subject (ndarray),
180        - regularization parameter (float)
181        - maximum iteration number (integer)
182        - tolerance (float)
183        - current iteration number (integer). -1 means "before first iteration"
184        - current value of precisions (ndarray).
185        - previous value of precisions (ndarray). None before first iteration.
186
187    precisions_init : numpy.ndarray, optional
188        initial value of the precision matrices. If not provided, a diagonal
189        matrix with the variances of each input signal is used.
190
191    debug : bool, optional
192        if True, perform checks during computation. It can help find
193        numerical problems, but increases computation time a lot.
194        Default=False.
195
196    Returns
197    -------
198    emp_covs : numpy.ndarray, shape (n_features, n_features, n_subjects)
199        empirical covariances matrices
200
201    precisions : numpy.ndarray, shape (n_features, n_features, n_subjects)
202        estimated precision matrices
203
204    References
205    ----------
206    .. footbibliography::
207
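    Examples
    --------
    A minimal sketch on synthetic signals (shapes and values are illustrative
    only; a few iterations are enough for the example):

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> subjects = [rng.randn(100, 5) for _ in range(3)]
    >>> emp_covs, precisions = group_sparse_covariance(
    ...     subjects, alpha=0.1, max_iter=5)
    >>> emp_covs.shape, precisions.shape
    ((5, 5, 3), (5, 5, 3))
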
208    """
209
210    emp_covs, n_samples = empirical_covariances(
211        subjects, assume_centered=False)
212
213    precisions = _group_sparse_covariance(
214        emp_covs, n_samples, alpha, max_iter=max_iter, tol=tol,
215        verbose=verbose, precisions_init=precisions_init,
216        probe_function=probe_function, debug=debug)
217
218    return emp_covs, precisions
219
220
221def _group_sparse_covariance(emp_covs, n_samples, alpha, max_iter=10, tol=1e-3,
222                             precisions_init=None, probe_function=None,
223                             verbose=0, debug=False):
224    """Internal version of group_sparse_covariance.
225    See its docstring for details.
226
227    """
228    if tol == -1:
229        tol = None
230    if not isinstance(alpha, (int, float)) or alpha < 0:
231        raise ValueError("Regularization parameter alpha must be a "
232                         "positive number.\n"
233                         "You provided: {0}".format(str(alpha)))
234
235    n_subjects = emp_covs.shape[-1]
236    n_features = emp_covs[0].shape[0]
237    n_samples = np.asarray(n_samples)
238    n_samples /= n_samples.sum()  # essential for numerical stability
239
240    # Check diagonal normalization.
241    ones = np.ones(emp_covs.shape[0])
242    for k in range(n_subjects):
243        if (abs(emp_covs[..., k].flat[::emp_covs.shape[0] + 1] - ones)
244                > 0.1).any():
245            warnings.warn("input signals do not all have unit variance. This "
246                          "can lead to numerical instability.")
247            break
248
    if precisions_init is None:
        # Fortran order makes omega[..., k] contiguous, which is often useful.
        omega = np.ndarray(shape=emp_covs.shape, dtype=np.float64,
                           order="F")
        for k in range(n_subjects):
            # Values on the main diagonal are far from zero, because they
            # are time series energies.
            omega[..., k] = np.diag(1. / np.diag(emp_covs[..., k]))
    else:
        omega = precisions_init.copy()

    # Preallocate arrays
    y = np.ndarray(shape=(n_subjects, n_features - 1), dtype=np.float64)
    u = np.ndarray(shape=(n_subjects, n_features - 1), dtype=np.float64)
    y_1 = np.ndarray(shape=(n_subjects, n_features - 2), dtype=np.float64)
    h_12 = np.ndarray(shape=(n_subjects, n_features - 2), dtype=np.float64)
    q = np.ndarray(shape=(n_subjects,), dtype=np.float64)
    aq = np.ndarray(shape=(n_subjects,), dtype=np.float64)  # temp. array
    c = np.ndarray(shape=(n_subjects,), dtype=np.float64)
    W = np.ndarray(shape=(omega.shape[0] - 1, omega.shape[1] - 1,
                          omega.shape[2]),
                   dtype=np.float64, order="F")
    W_inv = np.ndarray(shape=W.shape, dtype=np.float64, order="F")

    # Auxiliary arrays.
    v = np.ndarray((omega.shape[0] - 1,), dtype=np.float64)
    h = np.ndarray((omega.shape[1] - 1,), dtype=np.float64)

    # Optional.
    tolerance_reached = False
    max_norm = None

    omega_old = np.empty_like(omega)
    if probe_function is not None:
        # iteration number -1 means called before iteration loop.
        probe_function(emp_covs, n_samples, alpha, max_iter, tol,
                       -1, omega, None)
    probe_interrupted = False

    # Start optimization loop. Variables are named following (mostly) the
    # Honorio-Samaras paper notations.

    # Used in the innermost loop. Computed here to save some computation.
    alpha2 = alpha ** 2

    for n in range(max_iter):
        if max_norm is not None:
            suffix = (" variation (max norm): {max_norm:.3e} ".format(
                max_norm=max_norm))
        else:
            suffix = ""
        if verbose > 1:
            logger.log("* iteration {iter_n:d} ({percentage:.0f} %){suffix}"
                    " ...".format(iter_n=n, percentage=100. * n / max_iter,
                                  suffix=suffix), verbose=verbose)

        omega_old[...] = omega
        for p in range(n_features):

            if p == 0:
                # Initial state: remove first col/row
                W = omega[1:, 1:, :].copy()   # stack of W(k)
                W_inv = np.ndarray(shape=W.shape, dtype=np.float64)
                for k in range(W.shape[2]):
                    # stack of W^-1(k)
                    W_inv[..., k] = scipy.linalg.inv(W[..., k])
                    if debug:
                        np.testing.assert_almost_equal(
                            np.dot(W_inv[..., k], W[..., k]),
                            np.eye(W_inv[..., k].shape[0]), decimal=10)
                        _assert_submatrix(omega[..., k], W[..., k], p)
                        assert(is_spd(W_inv[..., k]))
            else:
                # Update W and W_inv
                if debug:
                    omega_orig = omega.copy()

                for k in range(n_subjects):
                    _update_submatrix(omega[..., k],
                                      W[..., k], W_inv[..., k], p, h, v)

                    if debug:
                        _assert_submatrix(omega[..., k], W[..., k], p)
                        assert(is_spd(W_inv[..., k], decimal=14))
                        np.testing.assert_almost_equal(
                            np.dot(W[..., k], W_inv[..., k]),
                            np.eye(W_inv[..., k].shape[0]), decimal=10)
                if debug:
                    # Check that omega has not been modified.
                    np.testing.assert_almost_equal(omega_orig, omega)

            # In the following lines, implicit loop on k (subjects)
            # Extract y and u
            y[:, :p] = omega[:p, p, :].T
            y[:, p:] = omega[p + 1:, p, :].T

            u[:, :p] = emp_covs[:p, p, :].T
            u[:, p:] = emp_covs[p + 1:, p, :].T

            for m in range(n_features - 1):
                # Coordinate descent on y

                # T(k) -> n_samples[k]
                # v(k) -> emp_covs[p, p, k]
                # h_22(k) -> W_inv[m, m, k]
                # h_12(k) -> W_inv[:m, m, k],  W_inv[m+1:, m, k]
                # y_1(k) -> y[k, :m], y[k, m+1:]
                # u_2(k) -> u[k, m]
                h_12[:, :m] = W_inv[:m, m, :].T
                h_12[:, m:] = W_inv[m + 1:, m, :].T
                y_1[:, :m] = y[:, :m]
                y_1[:, m:] = y[:, m + 1:]

                c[:] = - n_samples * (
                    emp_covs[p, p, :] * (h_12 * y_1).sum(axis=1) + u[:, m]
                    )
                c2 = np.sqrt(np.dot(c, c))

                # x -> y[:, m]
                if c2 <= alpha:
                    y[:, m] = 0  # x* = 0
                else:
                    # q(k) -> T(k) * v(k) * h_22(k)
                    # \lambda -> gamma   (lambda is a Python keyword)
                    q[:] = n_samples * emp_covs[p, p, :] * W_inv[m, m, :]
                    if debug:
                        assert(np.all(q > 0))
                    # x* = \lambda* diag(1 + \lambda q)^{-1} c

                    # Newton-Raphson loop. Loosely based on Scipy's.
                    # Tolerance does not seem to be important for numerical
                    # stability (tolerance of 1e-2 works) but has an effect on
                    # overall convergence rate (the tighter the better.)

                    gamma = 0.  # initial value
                    # Precompute some quantities
                    cc = c * c
                    two_ccq = 2. * cc * q
                    for _ in itertools.repeat(None, 100):
                        # Function whose zero must be determined (fval) and
                        # its derivative (fder).
                        # Written inplace to save some function calls.
                        aq = 1. + gamma * q
                        aq2 = aq * aq
                        fder = (two_ccq / (aq2 * aq)).sum()

                        if fder == 0:
                            msg = "derivative was zero."
                            warnings.warn(msg, RuntimeWarning)
                            break
                        fval = - (alpha2 - (cc / aq2).sum()) / fder
                        gamma = fval + gamma
                        if abs(fval) < 1.5e-8:
                            break

                    if abs(fval) > 0.1:
                        warnings.warn("Newton-Raphson step did not converge.\n"
                                      "This may indicate a badly conditioned "
                                      "system.")

                    if debug:
                        assert gamma >= 0., gamma
                    y[:, m] = (gamma * c) / aq  # x*

            # Copy back y in omega (column and row)
            omega[:p, p, :] = y[:, :p].T
            omega[p + 1:, p, :] = y[:, p:].T
            omega[p, :p, :] = y[:, :p].T
            omega[p, p + 1:, :] = y[:, p:].T

            for k in range(n_subjects):
                omega[p, p, k] = 1. / emp_covs[p, p, k] + np.dot(
                    np.dot(y[k, :], W_inv[..., k]), y[k, :])

                if debug:
                    assert(is_spd(omega[..., k]))

        if probe_function is not None:
            if probe_function(emp_covs, n_samples, alpha, max_iter, tol,
                              n, omega, omega_old) is True:
                probe_interrupted = True
                logger.log("probe_function interrupted loop", verbose=verbose,
                           msg_level=2)
                break

        # Compute max of variation
        omega_old -= omega
        omega_old = abs(omega_old)
        max_norm = omega_old.max()

        if tol is not None and max_norm < tol:
            logger.log("tolerance reached at iteration number {0:d}: {1:.3e}"
                       "".format(n + 1, max_norm), verbose=verbose)
            tolerance_reached = True
            break

    if tol is not None and not tolerance_reached and not probe_interrupted:
        warnings.warn("Maximum number of iterations reached without getting "
                      "to the requested tolerance level.")

    return omega


class GroupSparseCovariance(BaseEstimator, CacheMixin):
    """Covariance and precision matrix estimator.

    The model used has been introduced in :footcite:`Varoquaux2010a`, and the
    algorithm used is based on what is described in :footcite:`Honorio2015`.

    Parameters
    ----------
    alpha : float, optional
        regularization parameter. With normalized covariance matrices and
        number of samples, sensible values lie in the [0, 1] range (zero is
        no regularization: output is not sparse).
        Default=0.1.

    tol : positive float, optional
        The tolerance to declare convergence: if the dual gap goes below
        this value, iterations are stopped. Default=1e-3.

    max_iter : int, optional
        maximum number of iterations. The default value is rather
        conservative. Default=10.

    verbose : int, optional
        verbosity level. Zero means "no message". Default=0.

    memory : instance of joblib.Memory or string, optional
        Used to cache the masking process.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    memory_level : int, optional
        Caching aggressiveness. Higher values mean more caching.
        Default=0.

    Attributes
    ----------
    `covariances_` : numpy.ndarray, shape (n_features, n_features, n_subjects)
        empirical covariance matrices.

    `precisions_` : numpy.ndarray, shape (n_features, n_features, n_subjects)
        precision matrices estimated using the group-sparse algorithm.

    References
    ----------
    .. footbibliography::

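    Examples
    --------
    A minimal usage sketch on synthetic signals (values are illustrative
    only):

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> subjects = [rng.randn(100, 5) for _ in range(3)]
    >>> gsc = GroupSparseCovariance(alpha=0.1, max_iter=5)
    >>> gsc = gsc.fit(subjects)
    >>> gsc.precisions_.shape
    (5, 5, 3)
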
498    """
499
500    def __init__(self, alpha=0.1, tol=1e-3, max_iter=10, verbose=0,
501                 memory=Memory(location=None), memory_level=0):
502        self.alpha = alpha
503        self.tol = tol
504        self.max_iter = max_iter
505
506        self.memory = memory
507        self.memory_level = memory_level
508        self.verbose = verbose
509
510    def fit(self, subjects, y=None):
511        """Fits the group sparse precision model according to the given
512        training data and parameters.
513
514        Parameters
515        ----------
516        subjects : list of numpy.ndarray with shapes (n_samples, n_features)
517            input subjects. Each subject is a 2D array, whose columns contain
518            signals. Sample number can vary from subject to subject, but all
519            subjects must have the same number of features (i.e. of columns).
520
521        Returns
522        -------
523        self : GroupSparseCovariance instance
524            the object itself. Useful for chaining operations.
525
526        """
527        logger.log("Computing covariance matrices", verbose=self.verbose)
528        self.covariances_, n_samples = empirical_covariances(
529            subjects, assume_centered=False)
530
531        logger.log("Computing precision matrices", verbose=self.verbose)
532        ret = self._cache(_group_sparse_covariance)(
533                self.covariances_, n_samples, self.alpha,
534                tol=self.tol, max_iter=self.max_iter,
535                verbose=max(0, self.verbose - 1), debug=False)
536
537        self.precisions_ = ret
538        return self
539
540
541def empirical_covariances(subjects, assume_centered=False, standardize=False):
542    """Compute empirical covariances for several signals.
543
544    Parameters
545    ----------
546    subjects : list of numpy.ndarray, shape for each (n_samples, n_features)
547        input subjects. Each subject is a 2D array, whose columns contain
548        signals. Sample number can vary from subject to subject, but all
549        subjects must have the same number of features (i.e. of columns).
550
551    assume_centered : bool, optional
552        if True, assume that all input signals are centered. This slightly
553        decreases computation time by avoiding useless computation.
554        Default=False.
555
556    standardize : bool, optional
557        if True, set every signal variance to one before computing their
558        covariance matrix (i.e. compute a correlation matrix).
559        Default=False.
560
561    Returns
562    -------
563    emp_covs : numpy.ndarray, shape : (feature number, feature number, subject number)
564        empirical covariances.
565
566    n_samples : numpy.ndarray, shape: (subject number,)
567        number of samples for each subject. dtype is np.float64.
568
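    Examples
    --------
    A minimal sketch (shapes are illustrative):

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> subjects = [rng.randn(30, 4), rng.randn(40, 4)]
    >>> emp_covs, n_samples = empirical_covariances(subjects)
    >>> emp_covs.shape
    (4, 4, 2)
    >>> n_samples.tolist()
    [30.0, 40.0]
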
569    """
570    if not hasattr(subjects, "__iter__"):
571        raise ValueError("'subjects' input argument must be an iterable. "
572                         "You provided {0}".format(subjects.__class__))
573
574    n_subjects = [s.shape[1] for s in subjects]
575    if len(set(n_subjects)) > 1:
576        raise ValueError("All subjects must have the same number of "
577                         "features.\nYou provided: {0}".format(str(n_subjects))
578                         )
579    n_subjects = len(subjects)
580    n_features = subjects[0].shape[1]
581
582    # Enable to change dtype here because depending on user, conversion from
583    # single precision to double will be required or not.
584    emp_covs = np.empty((n_features, n_features, n_subjects), order="F")
585    for k, s in enumerate(subjects):
586        if standardize:
587            s = s / s.std(axis=0)  # copy on purpose
588        M = empirical_covariance(s, assume_centered=assume_centered)
589
590        # Force matrix symmetry, for numerical stability
591        # of _group_sparse_covariance
592        emp_covs[..., k] = M + M.T
593    emp_covs /= 2
594
595    n_samples = np.asarray([s.shape[0] for s in subjects], dtype=np.float64)
596
597    return emp_covs, n_samples
598
599
600def group_sparse_scores(precisions, n_samples, emp_covs, alpha,
601                        duality_gap=False, debug=False):
602    """Compute scores used by group_sparse_covariance.
603
604    The log-likelihood of a given list of empirical covariances /
605    precisions.
606
607    Parameters
608    ----------
609    precisions : numpy.ndarray, shape (n_features, n_features, n_subjects)
610        estimated precisions.
611
612    n_samples : array-like, shape (n_subjects,)
613        number of samples used in estimating each subject in "precisions".
614        n_samples.sum() must be equal to 1.
615
616    emp_covs : numpy.ndarray, shape (n_features, n_features, n_subjects)
617        empirical covariance matrix
618
619    alpha : float
620        regularization parameter
621
622    duality_gap : bool, optional
623        if True, also returns a duality gap upper bound. Default=False.
624
625    debug : bool, optional
626        if True, some consistency checks are performed to help solving
627        numerical problems. Default=False.
628
629    Returns
630    -------
631    log_lik : float
632        log-likelihood of precisions on the given covariances. This is the
633        opposite of the loss function, without the regularization term
634
635    objective : float
636        value of objective function. This is the value minimized by
637        group_sparse_covariance()
638
639    duality_gap : float
640        duality gap upper bound. The returned bound is tight: it vanishes for
641        the optimal precision matrices
642
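    Examples
    --------
    A minimal sketch using identity precision matrices (values are
    illustrative only; n_samples must be normalized to sum to one):

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> emp_covs, n_samples = empirical_covariances(
    ...     [rng.randn(50, 4) for _ in range(2)])
    >>> n_samples /= n_samples.sum()
    >>> precisions = np.dstack([np.eye(4)] * 2)
    >>> log_lik, objective = group_sparse_scores(
    ...     precisions, n_samples, emp_covs, alpha=0.1)
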
643    """
644    n_features, _, n_subjects = emp_covs.shape
645
646    log_lik = 0
647    for k in range(n_subjects):
648        log_lik_k = - np.sum(emp_covs[...,  k] * precisions[..., k])
649        log_lik_k += fast_logdet(precisions[..., k])
650        log_lik += n_samples[k] * log_lik_k
651
652    l2 = np.sqrt((precisions ** 2).sum(axis=-1))
653    l12 = l2.sum() - np.diag(l2).sum()  # Do not count diagonal terms
654    objective = alpha * l12 - log_lik
655    ret = (log_lik, objective)
656
657    # Compute duality gap if requested
658    if duality_gap is True:
659        A = np.empty(precisions.shape, dtype=np.float64, order="F")
660        for k in range(n_subjects):
661            # TODO: can be computed more efficiently using W_inv. See
662            # Friedman, Jerome, Trevor Hastie, and Robert Tibshirani.
663            # 'Sparse Inverse Covariance Estimation with the Graphical Lasso'.
664            # Biostatistics 9, no. 3 (1 July 2008): 432-441.
665            precisions_inv = scipy.linalg.inv(precisions[..., k])
666            if debug:
667                assert is_spd(precisions_inv)
668
669            A[..., k] = n_samples[k] * (precisions_inv - emp_covs[..., k])
670
671            if debug:
672                np.testing.assert_almost_equal(A[..., k], A[..., k].T)
673
674        # Project A on the set of feasible points
675        alpha_max = np.sqrt((A ** 2).sum(axis=-1))
676        mask = alpha_max > alpha
677        for k in range(A.shape[-1]):
678            A[mask, k] *= alpha / alpha_max[mask]
679            # Set zeros on diagonals. Essential to get an always positive
680            # duality gap.
681            A[..., k].flat[::A.shape[0] + 1] = 0
682
683        dual_obj = 0  # dual objective
684        for k in range(n_subjects):
685            B = emp_covs[..., k] + A[..., k] / n_samples[k]
686            dual_obj += n_samples[k] * (n_features + fast_logdet(B))
687
688        # The previous computation can lead to a non-feasible point, because
689        # one of the Bs may not be positive definite.
690        # Use another value in this case, that ensure positive definiteness
691        # of B. The upper bound on the duality gap is not tight in the
692        # following, but is smaller than infinity, which is better in any case.
693        if not np.isfinite(dual_obj):
694            for k in range(n_subjects):
695                A[..., k] = - n_samples[k] * emp_covs[..., k]
696                A[..., k].flat[::A.shape[0] + 1] = 0
697            alpha_max = np.sqrt((A ** 2).sum(axis=-1)).max()
698            # the second value (0.05 is arbitrary: positive in ]0,1[)
699            gamma = min((alpha / alpha_max, 0.05))
700            dual_obj = 0
701            for k in range(n_subjects):
702                # add gamma on the diagonal
703                B = ((1. - gamma) * emp_covs[..., k]
704                     + gamma * np.eye(emp_covs.shape[0]))
705                dual_obj += n_samples[k] * (n_features + fast_logdet(B))
706
707        gap = objective - dual_obj
708        ret = ret + (gap,)
709    return ret
710
711
712def group_sparse_covariance_path(train_subjs, alphas, test_subjs=None,
713                                 tol=1e-3, max_iter=10, precisions_init=None,
714                                 verbose=0, debug=False, probe_function=None):
715    """Get estimated precision matrices for different values of alpha.
716
717    Calling this function is faster than calling group_sparse_covariance()
718    repeatedly, because it makes use of the first result to initialize the
719    next computation.
720
721    Parameters
722    ----------
723    train_subjs : list of numpy.ndarray
724        list of signals.
725
726    alphas : list of float
727         values of alpha to use. Best results for sorted values (decreasing)
728
729    test_subjs : list of numpy.ndarray, optional
730        list of signals, independent from those in train_subjs, on which to
731        compute a score. If None, no score is computed.
732
733    verbose : int, optional
734        verbosity level. Default=0.
735
736    tol, max_iter, debug, precisions_init :
737        Passed to group_sparse_covariance(). See the corresponding docstring
738        for details.
739
740    probe_function : callable, optional
741        This value is called before the first iteration and after each
742        iteration. If it returns True, then optimization is stopped
743        prematurely.
744        The function is given as arguments (in that order):
745
746        - empirical covariances (ndarray),
747        - number of samples for each subject (ndarray),
748        - regularization parameter (float)
749        - maximum iteration number (integer)
750        - tolerance (float)
751        - current iteration number (integer). -1 means "before first iteration"
752        - current value of precisions (ndarray).
753        - previous value of precisions (ndarray). None before first iteration.
754
755    Returns
756    -------
757    precisions_list : list of numpy.ndarray
758        estimated precisions for each value of alpha provided. The length of
759        this list is the same as that of parameter "alphas".
760
761    scores : list of float
762        for each estimated precision, score obtained on the test set. Output
763        only if test_subjs is not None.
764
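    Examples
    --------
    A minimal sketch on synthetic train/test signals (shapes and values are
    illustrative only):

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> train = [rng.randn(60, 4) for _ in range(3)]
    >>> test = [rng.randn(20, 4) for _ in range(3)]
    >>> precisions_list, scores = group_sparse_covariance_path(
    ...     train, alphas=[0.2, 0.1], test_subjs=test, max_iter=5)
    >>> len(precisions_list), len(scores)
    (2, 2)
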
765    """
766    train_covs, train_n_samples = empirical_covariances(
767        train_subjs, assume_centered=False, standardize=True)
768
769    scores = []
770    precisions_list = []
771    for alpha in alphas:
772        precisions = _group_sparse_covariance(
773            train_covs, train_n_samples, alpha, tol=tol, max_iter=max_iter,
774            precisions_init=precisions_init, verbose=max(0, verbose - 1),
775            debug=debug, probe_function=probe_function)
776
777        # Compute log-likelihood
778        if test_subjs is not None:
779            test_covs, _ = empirical_covariances(
780                test_subjs, assume_centered=False, standardize=True)
781            scores.append(group_sparse_scores(precisions, train_n_samples,
782                                              test_covs, 0)[0])
783        precisions_list.append(precisions)
784        precisions_init = precisions
785
786    if test_subjs is not None:
787        return precisions_list, scores
788    else:
789        return precisions_list
790
791
792class EarlyStopProbe(object):
793    """Callable probe for early stopping in GroupSparseCovarianceCV.
794
795    Stop optimizing as soon as the score on the test set starts decreasing.
796    An instance of this class is supposed to be passed in the probe_function
797    argument of group_sparse_covariance().
798
799    """
800    def __init__(self, test_subjs, verbose=0):
801        self.test_emp_covs, _ = empirical_covariances(test_subjs)
802        self.verbose = verbose
803
804    def __call__(self, emp_covs, n_samples, alpha, max_iter, tol,
805                 iter_n, omega, prev_omega):
806        log_lik, _ = group_sparse_scores(
807            omega, n_samples, self.test_emp_covs, alpha)
808        if iter_n > -1 and self.last_log_lik > log_lik:
809            logger.log("Log-likelihood on test set is decreasing. "
810                       "Stopping at iteration %d"
811                       % iter_n, verbose=self.verbose)
812            return True
813        self.last_log_lik = log_lik
814
815
816class GroupSparseCovarianceCV(BaseEstimator, CacheMixin):
817    """Sparse inverse covariance w/ cross-validated choice of the parameter.
818
819    A cross-validated value for the regularization parameter is first
820    determined using several calls to group_sparse_covariance. Then a final
821    optimization is run to get a value for the precision matrices, using the
822    selected value of the parameter. Different values of tolerance and of
823    maximum iteration number can be used in these two phases (see the tol
824    and tol_cv keyword below for example).
825
826    Parameters
827    ----------
828    alphas : integer, optional
829        initial number of points in the grid of regularization parameter
830        values. Each step of grid refinement adds that many points as well.
831        Default=4.
832
833    n_refinements : integer, optional
834        number of times the initial grid should be refined. Default=4.
835
836    cv : integer, optional
837        number of folds in a K-fold cross-validation scheme. If None is passed,
838        defaults to 3.
839
840    tol_cv : float, optional
841        tolerance used to get the optimal alpha value. It has the same meaning
842        as the `tol` parameter in :func:`group_sparse_covariance`.
843        Default=1e-2.
844
845    max_iter_cv : integer, optional
846        maximum number of iterations for each optimization, during the alpha-
847        selection phase. Default=50.
848
849    tol : float, optional
850        tolerance used during the final optimization for determining precision
851        matrices value. Default=1e-3.
852
853    max_iter : integer, optional
854        maximum number of iterations in the final optimization. Default=100.
855
856    verbose : integer, optional
857        verbosity level. 0 means nothing is printed to the user. Default=0.
858
859    n_jobs : integer, optional
860        maximum number of cpu cores to use. The number of cores actually used
861        at the same time cannot exceed the number of folds in folding strategy
862        (that is, the value of cv). Default=1.
863
864    debug : bool, optional
865        if True, activates some internal checks for consistency. Only useful
866        for nilearn developers, not users. Default=False.
867
868    early_stopping : bool, optional
869        if True, reduce computation time by using a heuristic to reduce the
870        number of iterations required to get the optimal value for alpha. Be
871        aware that this can lead to slightly different values for the optimal
872        alpha compared to early_stopping=False. Default=True.
873
874    Attributes
875    ----------
876    `covariances_` : numpy.ndarray, shape (n_features, n_features, n_subjects)
877        covariance matrices, one per subject.
878
879    `precisions_` : numpy.ndarray, shape (n_features, n_features, n_subjects)
880        precision matrices, one per subject. All matrices have the same
881        sparsity pattern (if a coefficient is zero for a given matrix, it
882        is also zero for every other.)
883
884    `alpha_` : float
885        penalization parameter value selected.
886
887    `cv_alphas_` : list of floats
888        all values of the penalization parameter explored.
889
890    `cv_scores_` : numpy.ndarray, shape (n_alphas, n_folds)
891        scores obtained on test set for each value of the penalization
892        parameter explored.
893
894    See also
895    --------
896    GroupSparseCovariance,
897    sklearn.covariance.GraphicalLassoCV
898
899    Notes
900    -----
901    The search for the optimal penalization parameter (alpha) is done on an
902    iteratively refined grid: first the cross-validated scores on a grid are
903    computed, then a new refined grid is centered around the maximum, and so
904    on.
905
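    Examples
    --------
    A minimal usage sketch on synthetic signals (a deliberately small grid
    keeps the example fast; values are illustrative only):

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> subjects = [rng.randn(80, 5) for _ in range(4)]
    >>> gsc = GroupSparseCovarianceCV(alphas=2, n_refinements=1, cv=2,
    ...                               max_iter_cv=5, max_iter=5)
    >>> gsc = gsc.fit(subjects)
    >>> gsc.precisions_.shape
    (5, 5, 4)
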
906    """
907    def __init__(self, alphas=4, n_refinements=4, cv=None,
908                 tol_cv=1e-2, max_iter_cv=50,
909                 tol=1e-3, max_iter=100, verbose=0,
910                 n_jobs=1, debug=False, early_stopping=True):
911        self.alphas = alphas
912        self.n_refinements = n_refinements
913        self.tol_cv = tol_cv
914        self.max_iter_cv = max_iter_cv
915        self.cv = cv
916        self.tol = tol
917        self.max_iter = max_iter
918
919        self.verbose = verbose
920        self.n_jobs = n_jobs
921        self.debug = debug
922        self.early_stopping = early_stopping
923
924    def fit(self, subjects, y=None):
925        """Compute cross-validated group-sparse precisions.
926
927        Parameters
928        ----------
929        subjects : list of numpy.ndarray with shapes (n_samples, n_features)
930            input subjects. Each subject is a 2D array, whose columns contain
931            signals. Sample number can vary from subject to subject, but all
932            subjects must have the same number of features (i.e. of columns.)
933
934        Returns
935        -------
936        self : GroupSparseCovarianceCV
937            the object instance itself.
938
939        """
940        # Empirical covariances
941        emp_covs, n_samples = \
942                  empirical_covariances(subjects, assume_centered=False)
943        n_subjects = emp_covs.shape[2]
944
945        # One cv generator per subject must be created, because each subject
946        # can have a different number of samples from the others.
947        cv = []
948        for k in range(n_subjects):
949            cv.append(check_cv(
950                    self.cv, np.ones(subjects[k].shape[0]),
951                    classifier=False
952                    ).split(subjects[k])
953                      )
954        path = list()  # List of (alpha, scores, covs)
955        n_alphas = self.alphas
956
957        if isinstance(n_alphas, collections.abc.Sequence):
958            alphas = list(self.alphas)
959            n_refinements = 1
960        else:
961            n_refinements = self.n_refinements
962            alpha_1, _ = compute_alpha_max(emp_covs, n_samples)
963            alpha_0 = 1e-2 * alpha_1
964            alphas = np.logspace(np.log10(alpha_0), np.log10(alpha_1),
965                                 n_alphas)[::-1]
966
967        covs_init = itertools.repeat(None)
968
969        # Copying the cv generators to use them n_refinements times.
970        cv_ = zip(*cv)
971
972        for i, (this_cv) in enumerate(itertools.tee(cv_, n_refinements)):
973            # Compute the cross-validated loss on the current grid
974            train_test_subjs = []
975            for train_test in this_cv:
976                assert(len(train_test) == n_subjects)
977                train_test_subjs.append(list(zip(*[(subject[train, :],
978                                                    subject[test, :])
979                                             for subject, (train, test)
980                                             in zip(subjects, train_test)])))
981            if self.early_stopping:
982                probes = [EarlyStopProbe(test_subjs,
983                                         verbose=max(0, self.verbose - 1))
984                          for _, test_subjs in train_test_subjs]
985            else:
986                probes = itertools.repeat(None)
987
988            this_path = Parallel(n_jobs=self.n_jobs,
989                                 verbose=self.verbose)(
990                delayed(group_sparse_covariance_path)(
991                    train_subjs, alphas, test_subjs=test_subjs,
992                    max_iter=self.max_iter_cv, tol=self.tol_cv,
993                    verbose=max(0, self.verbose - 1), debug=self.debug,
994                    # Warm restart is useless with early stopping.
995                    precisions_init=None if self.early_stopping else prec_init,
996                    probe_function=probe)
997                for (train_subjs, test_subjs), prec_init, probe
998                in zip(train_test_subjs, covs_init, probes))
999
1000            # this_path[i] is a tuple (precisions_list, scores)
1001            # - scores: scores obtained with the i-th folding, for each value
1002            #   of alpha.
1003            # - precisions_list: corresponding precisions matrices, for each
1004            #   value of alpha.
1005            precisions_list, scores = list(zip(*this_path))
1006            # now scores[i][j] is the score for the i-th folding, j-th value of
1007            # alpha (analogous for precisions_list)
1008            precisions_list = list(zip(*precisions_list))
1009            scores = [np.mean(sc) for sc in zip(*scores)]
1010            # scores[i] is the mean score obtained for the i-th value of alpha.
1011
1012            path.extend(list(zip(alphas, scores, precisions_list)))
1013            path = sorted(path, key=operator.itemgetter(0), reverse=True)
1014
1015            # Find the maximum score (avoid using the built-in 'max' function
1016            # to have a fully-reproducible selection of the smallest alpha in
1017            # case of equality)
1018            best_score = -np.inf
1019            last_finite_idx = 0
1020            for index, (alpha, this_score, _) in enumerate(path):
1021                if this_score >= .1 / np.finfo(np.float64).eps:
1022                    this_score = np.nan
1023                if np.isfinite(this_score):
1024                    last_finite_idx = index
1025                if this_score >= best_score:
1026                    best_score = this_score
1027                    best_index = index
1028
1029            # Refine the grid
1030            if best_index == 0:
1031                # We do not need to go back: we have chosen
1032                # the highest value of alpha for which there are
1033                # non-zero coefficients
1034                alpha_1 = path[0][0]
1035                alpha_0 = path[1][0]
1036                covs_init = path[0][2]
1037            elif (best_index == last_finite_idx
1038                    and not best_index == len(path) - 1):
1039                # We have non-converged models on the upper bound of the
1040                # grid, we need to refine the grid there
1041                alpha_1 = path[best_index][0]
1042                alpha_0 = path[best_index + 1][0]
1043                covs_init = path[best_index][2]
1044            elif best_index == len(path) - 1:
1045                alpha_1 = path[best_index][0]
1046                alpha_0 = 0.01 * path[best_index][0]
1047                covs_init = path[best_index][2]
1048            else:
1049                alpha_1 = path[best_index - 1][0]
1050                alpha_0 = path[best_index + 1][0]
1051                covs_init = path[best_index - 1][2]
1052            alphas = np.logspace(np.log10(alpha_1), np.log10(alpha_0),
1053                                 len(alphas) + 2)
1054            alphas = alphas[1:-1]
1055            if n_refinements > 1:
1056                logger.log("[GroupSparseCovarianceCV] Done refinement "
1057                           "% 2i out of %i" % (i + 1, n_refinements),
1058                           verbose=self.verbose)
1059
1060        path = list(zip(*path))
1061        cv_scores_ = list(path[1])
1062        alphas = list(path[0])
1063
1064        self.cv_scores_ = np.array(cv_scores_)
1065        self.alpha_ = alphas[best_index]
1066        self.cv_alphas_ = alphas
1067
1068        # Finally, fit the model with the selected alpha
1069        logger.log("Final optimization", verbose=self.verbose)
1070        self.covariances_ = emp_covs
1071        self.precisions_ = _group_sparse_covariance(
1072            emp_covs, n_samples, self.alpha_, tol=self.tol,
1073            max_iter=self.max_iter,
1074            verbose=max(0, self.verbose - 1), debug=self.debug)
1075        return self
1076