"""
Extended math utilities.
"""
# Authors: Gael Varoquaux
#          Alexandre Gramfort
#          Alexandre T. Passos
#          Olivier Grisel
#          Lars Buitinck
#          Stefan van der Walt
#          Kyle Kastner
#          Giorgio Patrini
# License: BSD 3 clause

import warnings

import numpy as np
from scipy import linalg, sparse

from . import check_random_state
from ._logistic_sigmoid import _log_logistic_sigmoid
from .fixes import np_version, parse_version
from .sparsefuncs_fast import csr_row_norms
from .validation import check_array


def squared_norm(x):
    """Squared Euclidean or Frobenius norm of x.

    Faster than norm(x) ** 2.

    Parameters
    ----------
    x : array-like

    Returns
    -------
    float
        The Euclidean norm when x is a vector, the Frobenius norm when x
        is a matrix (2-d array).
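
    Examples
    --------
    A small illustrative example with a dense float vector:

    >>> import numpy as np
    >>> from sklearn.utils.extmath import squared_norm
    >>> squared_norm(np.array([2.0, 3.0]))
    13.0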
    """
    x = np.ravel(x, order="K")
    if np.issubdtype(x.dtype, np.integer):
        warnings.warn(
            "Array type is integer, np.dot may overflow. "
            "Data should be float type to avoid this issue",
            UserWarning,
        )
    return np.dot(x, x)


def row_norms(X, squared=False):
    """Row-wise (squared) Euclidean norm of X.

    Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse
    matrices and does not create an X.shape-sized temporary.

    Performs no input validation.

    Parameters
    ----------
    X : array-like
        The input array.
    squared : bool, default=False
        If True, return squared norms.

    Returns
    -------
    array-like
        The row-wise (squared) Euclidean norm of X.
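
    Examples
    --------
    A small sketch with a dense array (sparse inputs are handled without
    densifying):

    >>> import numpy as np
    >>> from sklearn.utils.extmath import row_norms
    >>> X = np.array([[3.0, 4.0], [0.0, 4.0]])
    >>> row_norms(X)
    array([5., 4.])
    >>> row_norms(X, squared=True)
    array([25., 16.])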
    """
    if sparse.issparse(X):
        if not isinstance(X, sparse.csr_matrix):
            X = sparse.csr_matrix(X)
        norms = csr_row_norms(X)
    else:
        norms = np.einsum("ij,ij->i", X, X)

    if not squared:
        np.sqrt(norms, norms)
    return norms


def fast_logdet(A):
    """Compute log(det(A)) for A symmetric.

    Equivalent to np.log(np.linalg.det(A)) but more robust.
    It returns -Inf if det(A) is non-positive or not defined.

    Parameters
    ----------
    A : array-like
        The matrix.

    Returns
    -------
    float
        The log of the determinant of A, or -Inf if the determinant is
        non-positive or not defined.
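
    Examples
    --------
    A small example on diagonal matrices:

    >>> import numpy as np
    >>> from sklearn.utils.extmath import fast_logdet
    >>> A = np.diag([1.0, 2.0, 4.0])
    >>> np.allclose(fast_logdet(A), np.log(8.0))
    True
    >>> fast_logdet(-np.eye(3))
    -inf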
    """
    sign, ld = np.linalg.slogdet(A)
    if not sign > 0:
        return -np.inf
    return ld


def density(w, **kwargs):
    """Compute density of a sparse vector.

    Parameters
    ----------
    w : array-like
        The sparse vector.

    Returns
    -------
    float
        The density of w, between 0 and 1.
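
    Examples
    --------
    A small example with a dense vector:

    >>> import numpy as np
    >>> from sklearn.utils.extmath import density
    >>> w = np.array([0.0, 2.0, 0.0, 0.0])
    >>> density(w)
    0.25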
    """
    if hasattr(w, "toarray"):
        d = float(w.nnz) / (w.shape[0] * w.shape[1])
    else:
        d = 0 if w is None else float((w != 0).sum()) / w.size
    return d


def safe_sparse_dot(a, b, *, dense_output=False):
    """Dot product that handles the sparse matrix case correctly.

    Parameters
    ----------
    a : {ndarray, sparse matrix}
    b : {ndarray, sparse matrix}
    dense_output : bool, default=False
        When False, ``a`` and ``b`` both being sparse will yield sparse output.
        When True, output will always be a dense array.

    Returns
    -------
    dot_product : {ndarray, sparse matrix}
        Sparse if ``a`` and ``b`` are sparse and ``dense_output=False``.
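
    Examples
    --------
    A small sketch with two sparse operands (SciPy CSR matrices):

    >>> import numpy as np
    >>> from scipy import sparse
    >>> from sklearn.utils.extmath import safe_sparse_dot
    >>> X = sparse.csr_matrix([[1.0, 2.0], [0.0, 3.0]])
    >>> Y = sparse.csr_matrix([[4.0, 0.0], [0.0, 5.0]])
    >>> sparse.issparse(safe_sparse_dot(X, Y))
    True
    >>> safe_sparse_dot(X, Y, dense_output=True)
    array([[ 4., 10.],
           [ 0., 15.]])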
    """
    if a.ndim > 2 or b.ndim > 2:
        if sparse.issparse(a):
            # sparse is always 2D. Implies b is 3D+
            # [i, j] @ [k, ..., l, m, n] -> [i, k, ..., l, n]
            b_ = np.rollaxis(b, -2)
            b_2d = b_.reshape((b.shape[-2], -1))
            ret = a @ b_2d
            ret = ret.reshape(a.shape[0], *b_.shape[1:])
        elif sparse.issparse(b):
            # sparse is always 2D. Implies a is 3D+
            # [k, ..., l, m] @ [i, j] -> [k, ..., l, j]
            a_2d = a.reshape(-1, a.shape[-1])
            ret = a_2d @ b
            ret = ret.reshape(*a.shape[:-1], b.shape[1])
        else:
            ret = np.dot(a, b)
    else:
        ret = a @ b

    if (
        sparse.issparse(a)
        and sparse.issparse(b)
        and dense_output
        and hasattr(ret, "toarray")
    ):
        return ret.toarray()
    return ret


def randomized_range_finder(
    A, *, size, n_iter, power_iteration_normalizer="auto", random_state=None
):
    """Compute an orthonormal matrix whose range approximates the range of A.

    Parameters
    ----------
    A : 2D array
        The input data matrix.

    size : int
        Size of the return array (number of columns).

    n_iter : int
        Number of power iterations used to stabilize the result.

    power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'
        Whether the power iterations are normalized with step-by-step
        QR factorization (the slowest but most accurate), 'none'
        (the fastest but numerically unstable when `n_iter` is large, e.g.
        typically 5 or larger), or 'LU' factorization (numerically stable
        but can lose slightly in accuracy). The 'auto' mode applies no
        normalization if `n_iter` <= 2 and switches to LU otherwise.

        .. versionadded:: 0.18

    random_state : int, RandomState instance or None, default=None
        The seed of the pseudo random number generator to use when shuffling
        the data, i.e. getting the random vectors to initialize the algorithm.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    Q : ndarray
        A (A.shape[0] x size) projection matrix, the range of which
        approximates well the range of the input matrix A.

    Notes
    -----

    Follows Algorithm 4.3 of
    Finding structure with randomness: Stochastic algorithms for constructing
    approximate matrix decompositions
    Halko, et al., 2009 (arXiv:0909.4061) https://arxiv.org/pdf/0909.4061.pdf

    An implementation of a randomized algorithm for principal component
    analysis
    A. Szlam et al. 2014
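
    Examples
    --------
    A small sketch on a dense matrix; the returned basis has orthonormal
    columns:

    >>> import numpy as np
    >>> from sklearn.utils.extmath import randomized_range_finder
    >>> A = np.arange(25.0).reshape(5, 5)
    >>> Q = randomized_range_finder(A, size=2, n_iter=2, random_state=42)
    >>> Q.shape
    (5, 2)
    >>> np.allclose(Q.T @ Q, np.eye(2))
    True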
    """
    random_state = check_random_state(random_state)

    # Generating normal random vectors with shape: (A.shape[1], size)
    Q = random_state.normal(size=(A.shape[1], size))
    if A.dtype.kind == "f":
        # Ensure f32 is preserved as f32
        Q = Q.astype(A.dtype, copy=False)

    # Deal with "auto" mode
    if power_iteration_normalizer == "auto":
        if n_iter <= 2:
            power_iteration_normalizer = "none"
        else:
            power_iteration_normalizer = "LU"

    # Perform power iterations with Q to further 'imprint' the top
    # singular vectors of A in Q
    for i in range(n_iter):
        if power_iteration_normalizer == "none":
            Q = safe_sparse_dot(A, Q)
            Q = safe_sparse_dot(A.T, Q)
        elif power_iteration_normalizer == "LU":
            Q, _ = linalg.lu(safe_sparse_dot(A, Q), permute_l=True)
            Q, _ = linalg.lu(safe_sparse_dot(A.T, Q), permute_l=True)
        elif power_iteration_normalizer == "QR":
            Q, _ = linalg.qr(safe_sparse_dot(A, Q), mode="economic")
            Q, _ = linalg.qr(safe_sparse_dot(A.T, Q), mode="economic")

    # Sample the range of A by a linear projection of Q
    # and extract an orthonormal basis
    Q, _ = linalg.qr(safe_sparse_dot(A, Q), mode="economic")
    return Q


def randomized_svd(
    M,
    n_components,
    *,
    n_oversamples=10,
    n_iter="auto",
    power_iteration_normalizer="auto",
    transpose="auto",
    flip_sign=True,
    random_state="warn",
):
    """Computes a truncated randomized SVD.

    This method solves the fixed-rank approximation problem described in the
    Halko et al paper (problem (1.5), p5).

    Parameters
    ----------
    M : {ndarray, sparse matrix}
        Matrix to decompose.

    n_components : int
        Number of singular values and vectors to extract.

    n_oversamples : int, default=10
        Additional number of random vectors to sample the range of M so as
        to ensure proper conditioning. The total number of random vectors
        used to find the range of M is n_components + n_oversamples. Smaller
        number can improve speed but can negatively impact the quality of
        approximation of singular vectors and singular values. Users might wish
        to increase this parameter up to `2*k - n_components` where k is the
        effective rank, for large matrices, noisy problems, matrices with
        slowly decaying spectrums, or to increase precision accuracy. See Halko
        et al (pages 5, 23 and 26).

    n_iter : int or 'auto', default='auto'
        Number of power iterations. It can be used to deal with very noisy
        problems. When 'auto', it is set to 4, unless `n_components` is small
        (< .1 * min(M.shape)) in which case `n_iter` is set to 7.
        This improves precision with few components. Note that in general
        users should rather increase `n_oversamples` before increasing `n_iter`
        as the principle of the randomized method is to avoid usage of these
        more costly power iterations steps. When `n_components` is equal
        to or greater than the effective matrix rank and the spectrum does not
        present a slow decay, `n_iter=0` or `1` should even work fine in theory
        (see Halko et al paper, page 9).

        .. versionchanged:: 0.18

    power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'
        Whether the power iterations are normalized with step-by-step
        QR factorization (the slowest but most accurate), 'none'
        (the fastest but numerically unstable when `n_iter` is large, e.g.
        typically 5 or larger), or 'LU' factorization (numerically stable
        but can lose slightly in accuracy). The 'auto' mode applies no
        normalization if `n_iter` <= 2 and switches to LU otherwise.

        .. versionadded:: 0.18

    transpose : bool or 'auto', default='auto'
        Whether the algorithm should be applied to M.T instead of M. The
        result should approximately be the same. The 'auto' mode will
        trigger the transposition if M.shape[1] > M.shape[0] since this
        implementation of randomized SVD tends to be a little faster in that
        case.

        .. versionchanged:: 0.18

    flip_sign : bool, default=True
        The output of a singular value decomposition is only unique up to a
        permutation of the signs of the singular vectors. If `flip_sign` is
        set to `True`, the sign ambiguity is resolved by making the largest
        loadings for each component in the left singular vectors positive.

    random_state : int, RandomState instance or None, default='warn'
        The seed of the pseudo random number generator to use when
        shuffling the data, i.e. getting the random vectors to initialize
        the algorithm. Pass an int for reproducible results across multiple
        function calls. See :term:`Glossary <random_state>`.

        .. versionchanged:: 1.2
            The previous behavior (`random_state=0`) is deprecated, and
            from v1.2 the default value will be `random_state=None`. Set
            the value of `random_state` explicitly to suppress the deprecation
            warning.

    Notes
    -----
    This algorithm finds a (usually very good) approximate truncated
    singular value decomposition using randomization to speed up the
    computations. It is particularly fast on large matrices on which
    you wish to extract only a small number of components. In order to
    obtain further speed up, `n_iter` can be set <=2 (at the cost of
    loss of precision). To increase the precision it is recommended to
    increase `n_oversamples`, up to `2*k-n_components` where k is the
    effective rank. Usually, `n_components` is chosen to be greater than k
    so increasing `n_oversamples` up to `n_components` should be enough.

    References
    ----------
    * Finding structure with randomness: Stochastic algorithms for constructing
      approximate matrix decompositions (Algorithm 4.3)
      Halko, et al., 2009 https://arxiv.org/abs/0909.4061

    * A randomized algorithm for the decomposition of matrices
      Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert

    * An implementation of a randomized algorithm for principal component
      analysis
      A. Szlam et al. 2014
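
    Examples
    --------
    A small sketch on a dense low-rank matrix (an explicit `random_state` is
    passed to keep the run reproducible and avoid the deprecation warning):

    >>> import numpy as np
    >>> from sklearn.utils.extmath import randomized_svd
    >>> rng = np.random.RandomState(0)
    >>> M = rng.randn(30, 10) @ rng.randn(10, 20)
    >>> U, s, Vt = randomized_svd(M, n_components=5, random_state=0)
    >>> U.shape, s.shape, Vt.shape
    ((30, 5), (5,), (5, 20))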
    """
    if isinstance(M, (sparse.lil_matrix, sparse.dok_matrix)):
        warnings.warn(
            "Calculating SVD of a {} is expensive. "
            "csr_matrix is more efficient.".format(type(M).__name__),
            sparse.SparseEfficiencyWarning,
        )

    if random_state == "warn":
        warnings.warn(
            "If 'random_state' is not supplied, the current default "
            "is to use 0 as a fixed seed. This will change to "
            "None in version 1.2 leading to non-deterministic results "
            "that better reflect the nature of the randomized_svd solver. "
            "If you want to silence this warning, set 'random_state' "
            "to an integer seed or to None explicitly depending "
            "if you want your code to be deterministic or not.",
            FutureWarning,
        )
        random_state = 0

    random_state = check_random_state(random_state)
    n_random = n_components + n_oversamples
    n_samples, n_features = M.shape

    if n_iter == "auto":
        # Checks if the number of iterations is explicitly specified
        # Adjust n_iter. 7 was found a good compromise for PCA. See #5299
        n_iter = 7 if n_components < 0.1 * min(M.shape) else 4

    if transpose == "auto":
        transpose = n_samples < n_features
    if transpose:
        # this implementation is a bit faster with smaller shape[1]
        M = M.T

    Q = randomized_range_finder(
        M,
        size=n_random,
        n_iter=n_iter,
        power_iteration_normalizer=power_iteration_normalizer,
        random_state=random_state,
    )

    # project M to the (k + p) dimensional space using the basis vectors
    B = safe_sparse_dot(Q.T, M)

    # compute the SVD on the thin matrix: (k + p) wide
    Uhat, s, Vt = linalg.svd(B, full_matrices=False)

    del B
    U = np.dot(Q, Uhat)

    if flip_sign:
        if not transpose:
            U, Vt = svd_flip(U, Vt)
        else:
            # In case of transpose u_based_decision=False
            # to actually flip based on u and not v.
            U, Vt = svd_flip(U, Vt, u_based_decision=False)

    if transpose:
        # transpose back the results according to the input convention
        return Vt[:n_components, :].T, s[:n_components], U[:, :n_components].T
    else:
        return U[:, :n_components], s[:n_components], Vt[:n_components, :]


def _randomized_eigsh(
    M,
    n_components,
    *,
    n_oversamples=10,
    n_iter="auto",
    power_iteration_normalizer="auto",
    selection="module",
    random_state=None,
):
    """Computes a truncated eigendecomposition using randomized methods.

    This method solves the fixed-rank approximation problem described in the
    Halko et al paper.

    The choice of which components to select can be tuned with the `selection`
    parameter.

    .. versionadded:: 0.24

    Parameters
    ----------
    M : ndarray or sparse matrix
        Matrix to decompose. It should be real symmetric square or complex
        Hermitian.

    n_components : int
        Number of eigenvalues and vectors to extract.

    n_oversamples : int, default=10
        Additional number of random vectors to sample the range of M so as
        to ensure proper conditioning. The total number of random vectors
        used to find the range of M is n_components + n_oversamples. Smaller
        number can improve speed but can negatively impact the quality of
        approximation of eigenvectors and eigenvalues. Users might wish
        to increase this parameter up to `2*k - n_components` where k is the
        effective rank, for large matrices, noisy problems, matrices with
        slowly decaying spectrums, or to increase precision accuracy. See Halko
        et al (pages 5, 23 and 26).

    n_iter : int or 'auto', default='auto'
        Number of power iterations. It can be used to deal with very noisy
        problems. When 'auto', it is set to 4, unless `n_components` is small
        (< .1 * min(M.shape)) in which case `n_iter` is set to 7.
        This improves precision with few components. Note that in general
        users should rather increase `n_oversamples` before increasing `n_iter`
        as the principle of the randomized method is to avoid usage of these
        more costly power iterations steps. When `n_components` is equal
        to or greater than the effective matrix rank and the spectrum does not
        present a slow decay, `n_iter=0` or `1` should even work fine in theory
        (see Halko et al paper, page 9).

    power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'
        Whether the power iterations are normalized with step-by-step
        QR factorization (the slowest but most accurate), 'none'
        (the fastest but numerically unstable when `n_iter` is large, e.g.
        typically 5 or larger), or 'LU' factorization (numerically stable
        but can lose slightly in accuracy). The 'auto' mode applies no
        normalization if `n_iter` <= 2 and switches to LU otherwise.

    selection : {'value', 'module'}, default='module'
        Strategy used to select the n components. When `selection` is `'value'`
        (not yet implemented, will become the default when implemented), the
        components corresponding to the n largest eigenvalues are returned.
        When `selection` is `'module'`, the components corresponding to the n
        eigenvalues with the largest moduli (absolute values) are returned.

    random_state : int, RandomState instance, default=None
        The seed of the pseudo random number generator to use when shuffling
        the data, i.e. getting the random vectors to initialize the algorithm.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Notes
    -----
    This algorithm finds a (usually very good) approximate truncated
    eigendecomposition using randomized methods to speed up the computations.

    This method is particularly fast on large matrices on which
    you wish to extract only a small number of components. In order to
    obtain further speed up, `n_iter` can be set <=2 (at the cost of
    loss of precision). To increase the precision it is recommended to
    increase `n_oversamples`, up to `2*k-n_components` where k is the
    effective rank. Usually, `n_components` is chosen to be greater than k
    so increasing `n_oversamples` up to `n_components` should be enough.

    Strategy 'value': not implemented yet.
    Algorithms 5.3, 5.4 and 5.5 in the Halko et al paper should provide good
    candidates for a future implementation.

    Strategy 'module':
    The principle is that for diagonalizable matrices, the singular values and
    eigenvalues are related: if t is an eigenvalue of A, then :math:`|t|` is a
    singular value of A. This method relies on a randomized SVD to find the n
    singular components corresponding to the n singular values with the largest
    moduli, and then uses the signs of the singular vectors to find the true
    sign of t: if the signs of the left and right singular vectors differ,
    then the corresponding eigenvalue is negative.

    Returns
    -------
    eigvals : 1D array of shape (n_components,)
        The `n_components` eigenvalues selected (see the ``selection``
        parameter).
    eigvecs : 2D array of shape (M.shape[0], n_components)
        The `n_components` eigenvectors corresponding to `eigvals`, in the
        matching order. Note that this follows the `scipy.linalg.eigh`
        convention.

    See Also
    --------
    :func:`randomized_svd`

    References
    ----------
    * Finding structure with randomness: Stochastic algorithms for constructing
      approximate matrix decompositions (Algorithm 4.3 for strategy 'module')
      Halko, et al., 2009 https://arxiv.org/abs/0909.4061

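    Examples
    --------
    A small sketch on a symmetric matrix with one negative eigenvalue; note
    that this is a private helper and the result is a randomized approximation:

    >>> import numpy as np
    >>> from sklearn.utils.extmath import _randomized_eigsh
    >>> S = np.diag([5.0, -4.0, 3.0, 0.1, 0.1])
    >>> eigvals, eigvecs = _randomized_eigsh(S, n_components=3, random_state=0)
    >>> eigvecs.shape
    (5, 3)
    >>> np.round(eigvals, 2)
    array([ 5., -4.,  3.])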
    """
    if selection == "value":  # pragma: no cover
        # TODO: an algorithm can be found in the Halko et al reference
        raise NotImplementedError()

    elif selection == "module":
        # Note: no need for deterministic U and Vt (flip_sign=True),
        # as we only use the dot product UVt afterwards
        U, S, Vt = randomized_svd(
            M,
            n_components=n_components,
            n_oversamples=n_oversamples,
            n_iter=n_iter,
            power_iteration_normalizer=power_iteration_normalizer,
            flip_sign=False,
            random_state=random_state,
        )

        eigvecs = U[:, :n_components]
        eigvals = S[:n_components]

        # Conversion of Singular values into Eigenvalues:
        # For any eigenvalue t, the corresponding singular value is |t|.
        # So if there is a negative eigenvalue t, the corresponding singular
        # value will be -t, and the left (U) and right (V) singular vectors
        # will have opposite signs.
        # Fastest way: see <https://stackoverflow.com/a/61974002/7262247>
        diag_VtU = np.einsum("ji,ij->j", Vt[:n_components, :], U[:, :n_components])
        signs = np.sign(diag_VtU)
        eigvals = eigvals * signs

    else:  # pragma: no cover
        raise ValueError("Invalid `selection`: %r" % selection)

    return eigvals, eigvecs


def weighted_mode(a, w, *, axis=0):
    """Returns an array of the weighted modal (most common) value in a.

    If there is more than one such value, only the first is returned.
    The bin-count for the modal bins is also returned.

    This is an extension of the algorithm in scipy.stats.mode.

    Parameters
    ----------
    a : array-like
        n-dimensional array of which to find mode(s).
    w : array-like
        n-dimensional array of weights for each value.
    axis : int, default=0
        Axis along which to operate. Default is 0, i.e. the first axis.

    Returns
    -------
    vals : ndarray
        Array of modal values.
    score : ndarray
        Array of weighted counts for each mode.

    Examples
    --------
    >>> from sklearn.utils.extmath import weighted_mode
    >>> x = [4, 1, 4, 2, 4, 2]
    >>> weights = [1, 1, 1, 1, 1, 1]
    >>> weighted_mode(x, weights)
    (array([4.]), array([3.]))

    The value 4 appears three times: with uniform weights, the result is
    simply the mode of the distribution.

    >>> weights = [1, 3, 0.5, 1.5, 1, 2]  # deweight the 4's
    >>> weighted_mode(x, weights)
    (array([2.]), array([3.5]))

    The value 2 has the highest score: it appears twice with weights of
    1.5 and 2: the sum of these is 3.5.

    See Also
    --------
    scipy.stats.mode
    """
    if axis is None:
        a = np.ravel(a)
        w = np.ravel(w)
        axis = 0
    else:
        a = np.asarray(a)
        w = np.asarray(w)

    if a.shape != w.shape:
        w = np.full(a.shape, w, dtype=w.dtype)

    scores = np.unique(np.ravel(a))  # get ALL unique values
    testshape = list(a.shape)
    testshape[axis] = 1
    oldmostfreq = np.zeros(testshape)
    oldcounts = np.zeros(testshape)
    for score in scores:
        template = np.zeros(a.shape)
        ind = a == score
        template[ind] = w[ind]
        counts = np.expand_dims(np.sum(template, axis), axis)
        mostfrequent = np.where(counts > oldcounts, score, oldmostfreq)
        oldcounts = np.maximum(counts, oldcounts)
        oldmostfreq = mostfrequent
    return mostfrequent, oldcounts


def cartesian(arrays, out=None):
    """Generate a cartesian product of input arrays.

    Parameters
    ----------
    arrays : list of array-like
        1-D arrays to form the cartesian product of.
    out : ndarray of shape (M, len(arrays)), default=None
        Array to place the cartesian product in.

    Returns
    -------
    out : ndarray of shape (M, len(arrays))
        Array containing the cartesian products formed of input arrays.

    Notes
    -----
    This function may not be used on more than 32 arrays
    because the underlying numpy functions do not support it.

    Examples
    --------
    >>> from sklearn.utils.extmath import cartesian
    >>> cartesian(([1, 2, 3], [4, 5], [6, 7]))
    array([[1, 4, 6],
           [1, 4, 7],
           [1, 5, 6],
           [1, 5, 7],
           [2, 4, 6],
           [2, 4, 7],
           [2, 5, 6],
           [2, 5, 7],
           [3, 4, 6],
           [3, 4, 7],
           [3, 5, 6],
           [3, 5, 7]])
    """
    arrays = [np.asarray(x) for x in arrays]
    shape = (len(x) for x in arrays)
    dtype = arrays[0].dtype

    ix = np.indices(shape)
    ix = ix.reshape(len(arrays), -1).T

    if out is None:
        out = np.empty_like(ix, dtype=dtype)

    for n, arr in enumerate(arrays):
        out[:, n] = arr[ix[:, n]]

    return out


def svd_flip(u, v, u_based_decision=True):
    """Sign correction to ensure deterministic output from SVD.

    Adjusts the columns of u and the rows of v such that the loadings in the
    columns in u that are largest in absolute value are always positive.

    Parameters
    ----------
    u : ndarray
        u and v are the output of `linalg.svd` or
        :func:`~sklearn.utils.extmath.randomized_svd`, with matching inner
        dimensions so one can compute `np.dot(u * s, v)`.

    v : ndarray
        u and v are the output of `linalg.svd` or
        :func:`~sklearn.utils.extmath.randomized_svd`, with matching inner
        dimensions so one can compute `np.dot(u * s, v)`.
        The input v should really be called vt to be consistent with scipy's
        output.

    u_based_decision : bool, default=True
        If True, use the columns of u as the basis for sign flipping.
        Otherwise, use the rows of v. The choice of which variable to base the
        decision on is generally algorithm dependent.

    Returns
    -------
    u_adjusted, v_adjusted : arrays with the same dimensions as the input.

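    Examples
    --------
    A small sketch showing that the sign flip leaves the reconstructed
    matrix unchanged:

    >>> import numpy as np
    >>> from sklearn.utils.extmath import svd_flip
    >>> u, s, vt = np.linalg.svd(np.array([[1.0, 2.0], [3.0, 4.0]]))
    >>> reconstruction = (u * s) @ vt
    >>> u_flipped, vt_flipped = svd_flip(u, vt)
    >>> np.allclose((u_flipped * s) @ vt_flipped, reconstruction)
    True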
    """
    if u_based_decision:
        # columns of u, rows of v
        max_abs_cols = np.argmax(np.abs(u), axis=0)
        signs = np.sign(u[max_abs_cols, range(u.shape[1])])
        u *= signs
        v *= signs[:, np.newaxis]
    else:
        # rows of v, columns of u
        max_abs_rows = np.argmax(np.abs(v), axis=1)
        signs = np.sign(v[range(v.shape[0]), max_abs_rows])
        u *= signs
        v *= signs[:, np.newaxis]
    return u, v


def log_logistic(X, out=None):
    """Compute the log of the logistic function, ``log(1 / (1 + e ** -x))``.

    This implementation is numerically stable because it splits positive and
    negative values::

        -log(1 + exp(-x_i))     if x_i > 0
        x_i - log(1 + exp(x_i)) if x_i <= 0

    For the ordinary logistic function, use ``scipy.special.expit``.

    Parameters
    ----------
    X : array-like of shape (M, N) or (M,)
        Argument to the logistic function.

    out : array-like of shape (M, N) or (M,), default=None
        Preallocated output array.

    Returns
    -------
    out : ndarray of shape (M, N) or (M,)
        Log of the logistic function evaluated at every point in x.

    Notes
    -----
    See the blog post describing this implementation:
    http://fa.bianp.net/blog/2013/numerical-optimizers-for-logistic-regression/
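
    Examples
    --------
    A small sketch: for moderate values the result matches
    ``np.log(expit(X))``, and for large negative values it stays finite where
    the naive formula would underflow to ``-inf``:

    >>> import numpy as np
    >>> from scipy.special import expit
    >>> from sklearn.utils.extmath import log_logistic
    >>> x = np.array([-10.0, 0.0, 10.0])
    >>> np.allclose(log_logistic(x), np.log(expit(x)))
    True
    >>> x_big = np.array([-1000.0, -2000.0])
    >>> np.allclose(log_logistic(x_big), x_big)
    True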
    """
    is_1d = X.ndim == 1
    X = np.atleast_2d(X)
    X = check_array(X, dtype=np.float64)

    n_samples, n_features = X.shape

    if out is None:
        out = np.empty_like(X)

    _log_logistic_sigmoid(n_samples, n_features, X, out)

    if is_1d:
        return np.squeeze(out)
    return out


def softmax(X, copy=True):
    """
    Calculate the softmax function.

    The softmax function is calculated by
    np.exp(X) / np.sum(np.exp(X), axis=1)

    This will cause overflow when large values are exponentiated.
    Hence the largest value in each row is subtracted from each data
    point to prevent this.

    Parameters
    ----------
    X : array-like of float of shape (M, N)
        Argument to the softmax function.

    copy : bool, default=True
        Copy X or not.

    Returns
    -------
    out : ndarray of shape (M, N)
        Softmax function evaluated at every point in x.
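
    Examples
    --------
    A small sketch; each row of the output is a probability distribution:

    >>> import numpy as np
    >>> from sklearn.utils.extmath import softmax
    >>> out = softmax(np.array([[0.0, 1.0], [1.0, 1.0]]))
    >>> np.allclose(out.sum(axis=1), 1.0)
    True
    >>> bool(out[0, 1] > out[0, 0])
    True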
    """
    if copy:
        X = np.copy(X)
    max_prob = np.max(X, axis=1).reshape((-1, 1))
    X -= max_prob
    np.exp(X, X)
    sum_prob = np.sum(X, axis=1).reshape((-1, 1))
    X /= sum_prob
    return X


def make_nonnegative(X, min_value=0):
    """Ensure `X.min()` >= `min_value`.

    Parameters
    ----------
    X : array-like
        The matrix to make non-negative.
    min_value : float, default=0
        The threshold value.

    Returns
    -------
    array-like
        The thresholded array.

    Raises
    ------
    ValueError
        When X is sparse.
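
    Examples
    --------
    A small dense example (a constant is added so the minimum reaches
    `min_value`):

    >>> import numpy as np
    >>> from sklearn.utils.extmath import make_nonnegative
    >>> make_nonnegative(np.array([[-2.0, 1.0], [0.0, 3.0]]))
    array([[0., 3.],
           [2., 5.]])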
    """
    min_ = X.min()
    if min_ < min_value:
        if sparse.issparse(X):
            raise ValueError(
                "Cannot make the data matrix"
                " nonnegative because it is sparse."
                " Adding a value to every entry would"
                " make it no longer sparse."
            )
        X = X + (min_value - min_)
    return X


# Use at least float64 for the accumulating functions to avoid precision issues
# see https://github.com/numpy/numpy/issues/9393. The float64 is also retained
# as it is in case the float overflows
def _safe_accumulator_op(op, x, *args, **kwargs):
    """
    This function provides numpy accumulator functions with a float64 dtype
    when used on a floating point input. This prevents accumulator overflow on
    smaller floating point dtypes.

    Parameters
    ----------
    op : function
        A numpy accumulator function such as np.mean or np.sum.
    x : ndarray
        A numpy array to apply the accumulator function.
    *args : positional arguments
        Positional arguments passed to the accumulator function after the
        input x.
    **kwargs : keyword arguments
        Keyword arguments passed to the accumulator function.

    Returns
    -------
    result
        The output of the accumulator function passed to this function.
    """
    if np.issubdtype(x.dtype, np.floating) and x.dtype.itemsize < 8:
        result = op(x, *args, **kwargs, dtype=np.float64)
    else:
        result = op(x, *args, **kwargs)
    return result


def _incremental_mean_and_var(
    X, last_mean, last_variance, last_sample_count, sample_weight=None
):
    """Calculate mean update and a Youngs and Cramer variance update.

    If sample_weight is given, the weighted mean and variance are computed.

    Update a given mean and (possibly) variance according to new data given
    in X. last_mean is always required to compute the new mean.
    If last_variance is None, no variance is computed and None is returned
    for updated_variance.

    From the paper "Algorithms for computing the sample variance: analysis and
    recommendations", by Chan, Golub, and LeVeque.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to use for variance update.

    last_mean : array-like of shape (n_features,)

    last_variance : array-like of shape (n_features,)

    last_sample_count : array-like of shape (n_features,)
        The number of samples encountered until now if sample_weight is None.
        If sample_weight is not None, this is the sum of sample_weight
        encountered.

    sample_weight : array-like of shape (n_samples,) or None
        Sample weights. If None, compute the unweighted mean/variance.

    Returns
    -------
    updated_mean : ndarray of shape (n_features,)

    updated_variance : ndarray of shape (n_features,)
        None if last_variance was None.

    updated_sample_count : ndarray of shape (n_features,)

    Notes
    -----
    NaNs are ignored during the algorithm.

    References
    ----------
    T. Chan, G. Golub, R. LeVeque. Algorithms for computing the sample
        variance: analysis and recommendations, The American Statistician,
        Vol. 37, No. 3, pp. 242-247

    Also, see the sparse implementation of this in
    `utils.sparsefuncs.incr_mean_variance_axis` and
    `utils.sparsefuncs_fast.incr_mean_variance_axis0`
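
    Examples
    --------
    A minimal sketch with this private helper: updating the statistics over
    two batches matches the statistics of the full data (the variance here is
    the population variance, i.e. ``ddof=0``):

    >>> import numpy as np
    >>> from sklearn.utils.extmath import _incremental_mean_and_var
    >>> X = np.arange(12, dtype=np.float64).reshape(6, 2)
    >>> mean, var, count = np.zeros(2), np.zeros(2), np.zeros(2)
    >>> for batch in np.array_split(X, 2):
    ...     mean, var, count = _incremental_mean_and_var(batch, mean, var, count)
    >>> np.allclose(mean, X.mean(axis=0)), np.allclose(var, X.var(axis=0))
    (True, True)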
    """
    # old = stats until now
    # new = the current increment
    # updated = the aggregated stats
    last_sum = last_mean * last_sample_count
    X_nan_mask = np.isnan(X)
    if np.any(X_nan_mask):
        sum_op = np.nansum
    else:
        sum_op = np.sum
    if sample_weight is not None:
        if np_version >= parse_version("1.16.6"):
            # equivalent to np.nansum(X * sample_weight, axis=0)
            # safer because np.float64(X*W) != np.float64(X)*np.float64(W)
            # dtype arg of np.matmul only exists since version 1.16
            new_sum = _safe_accumulator_op(
                np.matmul, sample_weight, np.where(X_nan_mask, 0, X)
            )
        else:
            new_sum = _safe_accumulator_op(
                np.nansum, X * sample_weight[:, None], axis=0
            )
        new_sample_count = _safe_accumulator_op(
            np.sum, sample_weight[:, None] * (~X_nan_mask), axis=0
        )
    else:
        new_sum = _safe_accumulator_op(sum_op, X, axis=0)
        n_samples = X.shape[0]
        new_sample_count = n_samples - np.sum(X_nan_mask, axis=0)

    updated_sample_count = last_sample_count + new_sample_count

    updated_mean = (last_sum + new_sum) / updated_sample_count

    if last_variance is None:
        updated_variance = None
    else:
        T = new_sum / new_sample_count
        temp = X - T
        if sample_weight is not None:
            if np_version >= parse_version("1.16.6"):
                # equivalent to np.nansum((X-T)**2 * sample_weight, axis=0)
                # safer because np.float64(X*W) != np.float64(X)*np.float64(W)
                # dtype arg of np.matmul only exists since version 1.16
                correction = _safe_accumulator_op(
                    np.matmul, sample_weight, np.where(X_nan_mask, 0, temp)
                )
                temp **= 2
                new_unnormalized_variance = _safe_accumulator_op(
                    np.matmul, sample_weight, np.where(X_nan_mask, 0, temp)
                )
            else:
                correction = _safe_accumulator_op(
                    sum_op, temp * sample_weight[:, None], axis=0
                )
                temp *= temp
                new_unnormalized_variance = _safe_accumulator_op(
                    sum_op, temp * sample_weight[:, None], axis=0
                )
        else:
            correction = _safe_accumulator_op(sum_op, temp, axis=0)
            temp **= 2
            new_unnormalized_variance = _safe_accumulator_op(sum_op, temp, axis=0)

        # correction term of the corrected 2 pass algorithm.
        # See "Algorithms for computing the sample variance: analysis
        # and recommendations", by Chan, Golub, and LeVeque.
        new_unnormalized_variance -= correction ** 2 / new_sample_count

        last_unnormalized_variance = last_variance * last_sample_count

        with np.errstate(divide="ignore", invalid="ignore"):
            last_over_new_count = last_sample_count / new_sample_count
            updated_unnormalized_variance = (
                last_unnormalized_variance
                + new_unnormalized_variance
                + last_over_new_count
                / updated_sample_count
                * (last_sum / last_over_new_count - new_sum) ** 2
            )

        zeros = last_sample_count == 0
        updated_unnormalized_variance[zeros] = new_unnormalized_variance[zeros]
        updated_variance = updated_unnormalized_variance / updated_sample_count

    return updated_mean, updated_variance, updated_sample_count


def _deterministic_vector_sign_flip(u):
    """Modify the sign of vectors for reproducibility.

    Flips the sign of elements of all the vectors (rows of u) such that
    the absolute maximum element of each vector is positive.

    Parameters
    ----------
    u : ndarray
        Array with vectors as its rows.

    Returns
    -------
    u_flipped : ndarray with same shape as u
        Array with the sign flipped vectors as its rows.
    """
    max_abs_rows = np.argmax(np.abs(u), axis=1)
    signs = np.sign(u[range(u.shape[0]), max_abs_rows])
    u *= signs[:, np.newaxis]
    return u


def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
    """Use high precision for cumsum and check that final value matches sum.

    Parameters
    ----------
    arr : array-like
        Array to be cumulatively summed, as a flat array if `axis` is None.
    axis : int, default=None
        Axis along which the cumulative sum is computed.
        The default (None) is to compute the cumsum over the flattened array.
    rtol : float, default=1e-05
        Relative tolerance, see ``np.allclose``.
    atol : float, default=1e-08
        Absolute tolerance, see ``np.allclose``.
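
    Examples
    --------
    A small example:

    >>> import numpy as np
    >>> from sklearn.utils.extmath import stable_cumsum
    >>> stable_cumsum(np.array([1.0, 2.0, 3.0]))
    array([1., 3., 6.])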
    """
    out = np.cumsum(arr, axis=axis, dtype=np.float64)
    expected = np.sum(arr, axis=axis, dtype=np.float64)
    if not np.all(
        np.isclose(
            out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
        )
    ):
        warnings.warn(
            "cumsum was found to be unstable: "
            "its last element does not correspond to sum",
            RuntimeWarning,
        )
    return out