1""" 2Extended math utilities. 3""" 4# Authors: Gael Varoquaux 5# Alexandre Gramfort 6# Alexandre T. Passos 7# Olivier Grisel 8# Lars Buitinck 9# Stefan van der Walt 10# Kyle Kastner 11# Giorgio Patrini 12# License: BSD 3 clause 13 14import warnings 15 16import numpy as np 17from scipy import linalg, sparse 18 19from . import check_random_state 20from ._logistic_sigmoid import _log_logistic_sigmoid 21from .fixes import np_version, parse_version 22from .sparsefuncs_fast import csr_row_norms 23from .validation import check_array 24 25 26def squared_norm(x): 27 """Squared Euclidean or Frobenius norm of x. 28 29 Faster than norm(x) ** 2. 30 31 Parameters 32 ---------- 33 x : array-like 34 35 Returns 36 ------- 37 float 38 The Euclidean norm when x is a vector, the Frobenius norm when x 39 is a matrix (2-d array). 40 """ 41 x = np.ravel(x, order="K") 42 if np.issubdtype(x.dtype, np.integer): 43 warnings.warn( 44 "Array type is integer, np.dot may overflow. " 45 "Data should be float type to avoid this issue", 46 UserWarning, 47 ) 48 return np.dot(x, x) 49 50 51def row_norms(X, squared=False): 52 """Row-wise (squared) Euclidean norm of X. 53 54 Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse 55 matrices and does not create an X.shape-sized temporary. 56 57 Performs no input validation. 58 59 Parameters 60 ---------- 61 X : array-like 62 The input array. 63 squared : bool, default=False 64 If True, return squared norms. 65 66 Returns 67 ------- 68 array-like 69 The row-wise (squared) Euclidean norm of X. 70 """ 71 if sparse.issparse(X): 72 if not isinstance(X, sparse.csr_matrix): 73 X = sparse.csr_matrix(X) 74 norms = csr_row_norms(X) 75 else: 76 norms = np.einsum("ij,ij->i", X, X) 77 78 if not squared: 79 np.sqrt(norms, norms) 80 return norms 81 82 83def fast_logdet(A): 84 """Compute log(det(A)) for A symmetric. 85 86 Equivalent to : np.log(nl.det(A)) but more robust. 87 It returns -Inf if det(A) is non positive or is not defined. 88 89 Parameters 90 ---------- 91 A : array-like 92 The matrix. 93 """ 94 sign, ld = np.linalg.slogdet(A) 95 if not sign > 0: 96 return -np.inf 97 return ld 98 99 100def density(w, **kwargs): 101 """Compute density of a sparse vector. 102 103 Parameters 104 ---------- 105 w : array-like 106 The sparse vector. 107 108 Returns 109 ------- 110 float 111 The density of w, between 0 and 1. 112 """ 113 if hasattr(w, "toarray"): 114 d = float(w.nnz) / (w.shape[0] * w.shape[1]) 115 else: 116 d = 0 if w is None else float((w != 0).sum()) / w.size 117 return d 118 119 120def safe_sparse_dot(a, b, *, dense_output=False): 121 """Dot product that handle the sparse matrix case correctly. 122 123 Parameters 124 ---------- 125 a : {ndarray, sparse matrix} 126 b : {ndarray, sparse matrix} 127 dense_output : bool, default=False 128 When False, ``a`` and ``b`` both being sparse will yield sparse output. 129 When True, output will always be a dense array. 130 131 Returns 132 ------- 133 dot_product : {ndarray, sparse matrix} 134 Sparse if ``a`` and ``b`` are sparse and ``dense_output=False``. 135 """ 136 if a.ndim > 2 or b.ndim > 2: 137 if sparse.issparse(a): 138 # sparse is always 2D. Implies b is 3D+ 139 # [i, j] @ [k, ..., l, m, n] -> [i, k, ..., l, n] 140 b_ = np.rollaxis(b, -2) 141 b_2d = b_.reshape((b.shape[-2], -1)) 142 ret = a @ b_2d 143 ret = ret.reshape(a.shape[0], *b_.shape[1:]) 144 elif sparse.issparse(b): 145 # sparse is always 2D. 


def safe_sparse_dot(a, b, *, dense_output=False):
    """Dot product that handles the sparse matrix case correctly.

    Parameters
    ----------
    a : {ndarray, sparse matrix}
    b : {ndarray, sparse matrix}
    dense_output : bool, default=False
        When False, ``a`` and ``b`` both being sparse will yield sparse output.
        When True, output will always be a dense array.

    Returns
    -------
    dot_product : {ndarray, sparse matrix}
        Sparse if ``a`` and ``b`` are sparse and ``dense_output=False``.
    """
    if a.ndim > 2 or b.ndim > 2:
        if sparse.issparse(a):
            # sparse is always 2D. Implies b is 3D+.
            # [i, j] @ [k, ..., l, m, n] -> [i, k, ..., l, n]
            b_ = np.rollaxis(b, -2)
            b_2d = b_.reshape((b.shape[-2], -1))
            ret = a @ b_2d
            ret = ret.reshape(a.shape[0], *b_.shape[1:])
        elif sparse.issparse(b):
            # sparse is always 2D. Implies a is 3D+.
            # [k, ..., l, m] @ [i, j] -> [k, ..., l, j]
            a_2d = a.reshape(-1, a.shape[-1])
            ret = a_2d @ b
            ret = ret.reshape(*a.shape[:-1], b.shape[1])
        else:
            ret = np.dot(a, b)
    else:
        ret = a @ b

    if (
        sparse.issparse(a)
        and sparse.issparse(b)
        and dense_output
        and hasattr(ret, "toarray")
    ):
        return ret.toarray()
    return ret
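

# Illustrative usage of safe_sparse_dot (a doctest-style sketch, relying on
# this module's imports): a sparse-times-dense product yields a dense array,
# while a sparse-times-sparse product is densified only on request.
#
#   >>> A = sparse.csr_matrix(np.eye(2))
#   >>> print(safe_sparse_dot(A, np.array([[1.0], [2.0]])))
#   [[1.]
#    [2.]]
#   >>> print(type(safe_sparse_dot(A, A, dense_output=True)))
#   <class 'numpy.ndarray'>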


def randomized_range_finder(
    A, *, size, n_iter, power_iteration_normalizer="auto", random_state=None
):
    """Compute an orthonormal matrix whose range approximates the range of A.

    Parameters
    ----------
    A : 2D array
        The input data matrix.

    size : int
        Size of the return array.

    n_iter : int
        Number of power iterations used to stabilize the result.

    power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'
        Whether the power iterations are normalized with step-by-step
        QR factorization (the slowest but most accurate), 'none'
        (the fastest but numerically unstable when `n_iter` is large, e.g.
        typically 5 or larger), or 'LU' factorization (numerically stable
        but can lose slightly in accuracy). The 'auto' mode applies no
        normalization if `n_iter` <= 2 and switches to LU otherwise.

        .. versionadded:: 0.18

    random_state : int, RandomState instance or None, default=None
        The seed of the pseudo random number generator to use when shuffling
        the data, i.e. getting the random vectors to initialize the algorithm.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    Q : ndarray
        A (A.shape[0] x size) projection matrix, the range of which
        approximates well the range of the input matrix A.

    Notes
    -----
    Follows Algorithm 4.3 of
    "Finding structure with randomness: Stochastic algorithms for constructing
    approximate matrix decompositions", Halko, et al., 2009 (arXiv:0909.4061)
    https://arxiv.org/pdf/0909.4061.pdf

    "An implementation of a randomized algorithm for principal component
    analysis", A. Szlam et al., 2014
    """
    random_state = check_random_state(random_state)

    # Generating normal random vectors with shape: (A.shape[1], size)
    Q = random_state.normal(size=(A.shape[1], size))
    if A.dtype.kind == "f":
        # Ensure f32 is preserved as f32
        Q = Q.astype(A.dtype, copy=False)

    # Deal with "auto" mode
    if power_iteration_normalizer == "auto":
        if n_iter <= 2:
            power_iteration_normalizer = "none"
        else:
            power_iteration_normalizer = "LU"

    # Perform power iterations with Q to further 'imprint' the top
    # singular vectors of A in Q
    for i in range(n_iter):
        if power_iteration_normalizer == "none":
            Q = safe_sparse_dot(A, Q)
            Q = safe_sparse_dot(A.T, Q)
        elif power_iteration_normalizer == "LU":
            Q, _ = linalg.lu(safe_sparse_dot(A, Q), permute_l=True)
            Q, _ = linalg.lu(safe_sparse_dot(A.T, Q), permute_l=True)
        elif power_iteration_normalizer == "QR":
            Q, _ = linalg.qr(safe_sparse_dot(A, Q), mode="economic")
            Q, _ = linalg.qr(safe_sparse_dot(A.T, Q), mode="economic")

    # Sample the range of A by a linear projection of Q.
    # Extract an orthonormal basis.
    Q, _ = linalg.qr(safe_sparse_dot(A, Q), mode="economic")
    return Q
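

# Illustrative usage of randomized_range_finder (a doctest-style sketch; the
# shapes and seed are arbitrary): the returned basis has orthonormal columns.
#
#   >>> A = np.random.RandomState(0).normal(size=(20, 10))
#   >>> Q = randomized_range_finder(A, size=5, n_iter=3, random_state=0)
#   >>> print(Q.shape)
#   (20, 5)
#   >>> print(np.allclose(Q.T @ Q, np.eye(5)))
#   True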


def randomized_svd(
    M,
    n_components,
    *,
    n_oversamples=10,
    n_iter="auto",
    power_iteration_normalizer="auto",
    transpose="auto",
    flip_sign=True,
    random_state="warn",
):
    """Compute a truncated randomized SVD.

    This method solves the fixed-rank approximation problem described in the
    Halko et al paper (problem (1.5), p5).

    Parameters
    ----------
    M : {ndarray, sparse matrix}
        Matrix to decompose.

    n_components : int
        Number of singular values and vectors to extract.

    n_oversamples : int, default=10
        Additional number of random vectors to sample the range of M so as
        to ensure proper conditioning. The total number of random vectors
        used to find the range of M is n_components + n_oversamples. A
        smaller number can improve speed but can negatively impact the
        quality of approximation of singular vectors and singular values.
        Users might wish to increase this parameter up to
        `2*k - n_components` where k is the effective rank, for large
        matrices, noisy problems, matrices with slowly decaying spectrums,
        or to increase precision accuracy. See Halko et al (pages 5, 23
        and 26).

    n_iter : int or 'auto', default='auto'
        Number of power iterations. It can be used to deal with very noisy
        problems. When 'auto', it is set to 4, unless `n_components` is small
        (< .1 * min(X.shape)) in which case `n_iter` is set to 7.
        This improves precision with few components. Note that in general
        users should rather increase `n_oversamples` before increasing
        `n_iter` as the principle of the randomized method is to avoid usage
        of these more costly power iterations steps. When `n_components` is
        equal to or greater than the effective matrix rank and the spectrum
        does not present a slow decay, `n_iter=0` or `1` should even work
        fine in theory (see Halko et al paper, page 9).

        .. versionchanged:: 0.18

    power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'
        Whether the power iterations are normalized with step-by-step
        QR factorization (the slowest but most accurate), 'none'
        (the fastest but numerically unstable when `n_iter` is large, e.g.
        typically 5 or larger), or 'LU' factorization (numerically stable
        but can lose slightly in accuracy). The 'auto' mode applies no
        normalization if `n_iter` <= 2 and switches to LU otherwise.

        .. versionadded:: 0.18

    transpose : bool or 'auto', default='auto'
        Whether the algorithm should be applied to M.T instead of M. The
        result should approximately be the same. The 'auto' mode will
        trigger the transposition if M.shape[1] > M.shape[0] since this
        implementation of randomized SVD tends to be a little faster in that
        case.

        .. versionchanged:: 0.18

    flip_sign : bool, default=True
        The output of a singular value decomposition is only unique up to a
        permutation of the signs of the singular vectors. If `flip_sign` is
        set to `True`, the sign ambiguity is resolved by making the largest
        loadings for each component in the left singular vectors positive.

    random_state : int, RandomState instance or None, default='warn'
        The seed of the pseudo random number generator to use when
        shuffling the data, i.e. getting the random vectors to initialize
        the algorithm. Pass an int for reproducible results across multiple
        function calls. See :term:`Glossary <random_state>`.

        .. versionchanged:: 1.2
            The previous behavior (`random_state=0`) is deprecated, and
            from v1.2 the default value will be `random_state=None`. Set
            the value of `random_state` explicitly to suppress the
            deprecation warning.

    Returns
    -------
    u : ndarray of shape (n_samples, n_components)
        Left singular vectors.
    s : ndarray of shape (n_components,)
        Singular values, sorted in descending order.
    vt : ndarray of shape (n_components, n_features)
        Transposed right singular vectors.

    Notes
    -----
    This algorithm finds a (usually very good) approximate truncated
    singular value decomposition using randomization to speed up the
    computations. It is particularly fast on large matrices on which
    you wish to extract only a small number of components. In order to
    obtain further speed up, `n_iter` can be set <=2 (at the cost of
    loss of precision). To increase the precision it is recommended to
    increase `n_oversamples`, up to `2*k - n_components` where k is the
    effective rank. Usually, `n_components` is chosen to be greater than k
    so increasing `n_oversamples` up to `n_components` should be enough.

    References
    ----------
    * Finding structure with randomness: Stochastic algorithms for
      constructing approximate matrix decompositions (Algorithm 4.3)
      Halko, et al., 2009 https://arxiv.org/abs/0909.4061

    * A randomized algorithm for the decomposition of matrices
      Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert

    * An implementation of a randomized algorithm for principal component
      analysis
      A. Szlam et al. 2014
    """
    if isinstance(M, (sparse.lil_matrix, sparse.dok_matrix)):
        warnings.warn(
            "Calculating SVD of a {} is expensive. "
            "csr_matrix is more efficient.".format(type(M).__name__),
            sparse.SparseEfficiencyWarning,
        )

    if random_state == "warn":
        warnings.warn(
            "If 'random_state' is not supplied, the current default "
            "is to use 0 as a fixed seed. This will change to "
            "None in version 1.2 leading to non-deterministic results "
            "that better reflect the nature of the randomized_svd solver. "
            "If you want to silence this warning, set 'random_state' "
            "to an integer seed or to None explicitly depending "
            "on whether you want your code to be deterministic or not.",
            FutureWarning,
        )
        random_state = 0

    random_state = check_random_state(random_state)
    n_random = n_components + n_oversamples
    n_samples, n_features = M.shape

    if n_iter == "auto":
        # Checks if the number of iterations is explicitly specified
        # Adjust n_iter. 7 was found a good compromise for PCA. See #5299
        n_iter = 7 if n_components < 0.1 * min(M.shape) else 4

    if transpose == "auto":
        transpose = n_samples < n_features
    if transpose:
        # this implementation is a bit faster with smaller shape[1]
        M = M.T

    Q = randomized_range_finder(
        M,
        size=n_random,
        n_iter=n_iter,
        power_iteration_normalizer=power_iteration_normalizer,
        random_state=random_state,
    )

    # project M to the (k + p) dimensional space using the basis vectors
    B = safe_sparse_dot(Q.T, M)

    # compute the SVD on the thin matrix: (k + p) wide
    Uhat, s, Vt = linalg.svd(B, full_matrices=False)

    del B
    U = np.dot(Q, Uhat)

    if flip_sign:
        if not transpose:
            U, Vt = svd_flip(U, Vt)
        else:
            # In case of transpose u_based_decision=false
            # to actually flip based on u and not v.
            U, Vt = svd_flip(U, Vt, u_based_decision=False)

    if transpose:
        # transpose back the results according to the input convention
        return Vt[:n_components, :].T, s[:n_components], U[:, :n_components].T
    else:
        return U[:, :n_components], s[:n_components], Vt[:n_components, :]


def _randomized_eigsh(
    M,
    n_components,
    *,
    n_oversamples=10,
    n_iter="auto",
    power_iteration_normalizer="auto",
    selection="module",
    random_state=None,
):
    """Compute a truncated eigendecomposition using randomized methods.

    This method solves the fixed-rank approximation problem described in the
    Halko et al paper.

    The choice of which components to select can be tuned with the `selection`
    parameter.

    .. versionadded:: 0.24

    Parameters
    ----------
    M : ndarray or sparse matrix
        Matrix to decompose, it should be real symmetric square or complex
        hermitian.

    n_components : int
        Number of eigenvalues and vectors to extract.

    n_oversamples : int, default=10
        Additional number of random vectors to sample the range of M so as
        to ensure proper conditioning. The total number of random vectors
        used to find the range of M is n_components + n_oversamples. A
        smaller number can improve speed but can negatively impact the
        quality of approximation of eigenvectors and eigenvalues. Users
        might wish to increase this parameter up to `2*k - n_components`
        where k is the effective rank, for large matrices, noisy problems,
        matrices with slowly decaying spectrums, or to increase precision
        accuracy. See Halko et al (pages 5, 23 and 26).

    n_iter : int or 'auto', default='auto'
        Number of power iterations. It can be used to deal with very noisy
        problems. When 'auto', it is set to 4, unless `n_components` is small
        (< .1 * min(X.shape)) in which case `n_iter` is set to 7.
        This improves precision with few components. Note that in general
        users should rather increase `n_oversamples` before increasing
        `n_iter` as the principle of the randomized method is to avoid usage
        of these more costly power iterations steps. When `n_components` is
        equal to or greater than the effective matrix rank and the spectrum
        does not present a slow decay, `n_iter=0` or `1` should even work
        fine in theory (see Halko et al paper, page 9).

    power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'
        Whether the power iterations are normalized with step-by-step
        QR factorization (the slowest but most accurate), 'none'
        (the fastest but numerically unstable when `n_iter` is large, e.g.
        typically 5 or larger), or 'LU' factorization (numerically stable
        but can lose slightly in accuracy). The 'auto' mode applies no
        normalization if `n_iter` <= 2 and switches to LU otherwise.

    selection : {'value', 'module'}, default='module'
        Strategy used to select the n components. When `selection` is
        `'value'` (not yet implemented, will become the default when
        implemented), the components corresponding to the n largest
        eigenvalues are returned. When `selection` is `'module'`, the
        components corresponding to the n eigenvalues with largest moduli
        are returned.

    random_state : int, RandomState instance, default=None
        The seed of the pseudo random number generator to use when shuffling
        the data, i.e. getting the random vectors to initialize the algorithm.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    eigvals : 1D array of shape (n_components,) containing the `n_components`
        eigenvalues selected (see ``selection`` parameter).
    eigvecs : 2D array of shape (M.shape[0], n_components) containing the
        `n_components` eigenvectors corresponding to the `eigvals`, in the
        corresponding order. Note that this follows the `scipy.linalg.eigh`
        convention.

    Notes
    -----
    This algorithm finds a (usually very good) approximate truncated
    eigendecomposition using randomized methods to speed up the computations.

    This method is particularly fast on large matrices on which
    you wish to extract only a small number of components. In order to
    obtain further speed up, `n_iter` can be set <=2 (at the cost of
    loss of precision). To increase the precision it is recommended to
    increase `n_oversamples`, up to `2*k - n_components` where k is the
    effective rank. Usually, `n_components` is chosen to be greater than k
    so increasing `n_oversamples` up to `n_components` should be enough.

    Strategy 'value': not implemented yet.
    Algorithms 5.3, 5.4 and 5.5 in the Halko et al paper should provide good
    candidates for a future implementation.

    Strategy 'module':
    The principle is that for diagonalizable matrices, the singular values and
    eigenvalues are related: if t is an eigenvalue of A, then :math:`|t|` is a
    singular value of A. This method relies on a randomized SVD to find the n
    singular components corresponding to the n singular values with largest
    moduli, and then uses the signs of the singular vectors to find the true
    sign of t: if the signs of the left and right singular vectors differ
    then the corresponding eigenvalue is negative.

    See Also
    --------
    :func:`randomized_svd`

    References
    ----------
    * Finding structure with randomness: Stochastic algorithms for
      constructing approximate matrix decompositions (Algorithm 4.3 for
      strategy 'module')
      Halko, et al., 2009 https://arxiv.org/abs/0909.4061
    """
    if selection == "value":  # pragma: no cover
        # to do: an algorithm can be found in the Halko et al reference
        raise NotImplementedError()

    elif selection == "module":
        # Note: no need for deterministic U and Vt (flip_sign=True),
        # as we only use the dot product UVt afterwards
        U, S, Vt = randomized_svd(
            M,
            n_components=n_components,
            n_oversamples=n_oversamples,
            n_iter=n_iter,
            power_iteration_normalizer=power_iteration_normalizer,
            flip_sign=False,
            random_state=random_state,
        )

        eigvecs = U[:, :n_components]
        eigvals = S[:n_components]

        # Conversion of singular values into eigenvalues:
        # For any eigenvalue t, the corresponding singular value is |t|.
        # So if there is a negative eigenvalue t, the corresponding singular
        # value will be -t, and the left (U) and right (V) singular vectors
        # will have opposite signs.
        # Fastest way: see <https://stackoverflow.com/a/61974002/7262247>
        diag_VtU = np.einsum(
            "ji,ij->j", Vt[:n_components, :], U[:, :n_components]
        )
        signs = np.sign(diag_VtU)
        eigvals = eigvals * signs

    else:  # pragma: no cover
        raise ValueError("Invalid `selection`: %r" % selection)

    return eigvals, eigvecs


def weighted_mode(a, w, *, axis=0):
    """Return an array of the weighted modal (most common) value in a.

    If there is more than one such value, only the first is returned.
    The bin-count for the modal bins is also returned.

    This is an extension of the algorithm in scipy.stats.mode.

    Parameters
    ----------
    a : array-like
        n-dimensional array of which to find mode(s).
    w : array-like
        n-dimensional array of weights for each value.
    axis : int, default=0
        Axis along which to operate. Default is 0, i.e. the first axis.

    Returns
    -------
    vals : ndarray
        Array of modal values.
    score : ndarray
        Array of weighted counts for each mode.

    Examples
    --------
    >>> from sklearn.utils.extmath import weighted_mode
    >>> x = [4, 1, 4, 2, 4, 2]
    >>> weights = [1, 1, 1, 1, 1, 1]
    >>> weighted_mode(x, weights)
    (array([4.]), array([3.]))

    The value 4 appears three times: with uniform weights, the result is
    simply the mode of the distribution.

    >>> weights = [1, 3, 0.5, 1.5, 1, 2]  # deweight the 4's
    >>> weighted_mode(x, weights)
    (array([2.]), array([3.5]))

    The value 2 has the highest score: it appears twice with weights of
    1.5 and 2: the sum of these is 3.5.

    See Also
    --------
    scipy.stats.mode
    """
    if axis is None:
        a = np.ravel(a)
        w = np.ravel(w)
        axis = 0
    else:
        a = np.asarray(a)
        w = np.asarray(w)

    if a.shape != w.shape:
        w = np.full(a.shape, w, dtype=w.dtype)

    scores = np.unique(np.ravel(a))  # get ALL unique values
    testshape = list(a.shape)
    testshape[axis] = 1
    oldmostfreq = np.zeros(testshape)
    oldcounts = np.zeros(testshape)
    for score in scores:
        template = np.zeros(a.shape)
        ind = a == score
        template[ind] = w[ind]
        counts = np.expand_dims(np.sum(template, axis), axis)
        mostfrequent = np.where(counts > oldcounts, score, oldmostfreq)
        oldcounts = np.maximum(counts, oldcounts)
        oldmostfreq = mostfrequent
    return mostfrequent, oldcounts


def cartesian(arrays, out=None):
    """Generate a cartesian product of input arrays.

    Parameters
    ----------
    arrays : list of array-like
        1-D arrays to form the cartesian product of.
    out : ndarray of shape (M, len(arrays)), default=None
        Array to place the cartesian product in.

    Returns
    -------
    out : ndarray of shape (M, len(arrays))
        Array containing the cartesian products formed of input arrays.

    Notes
    -----
    This function may not be used on more than 32 arrays
    because the underlying numpy functions do not support it.

    Examples
    --------
    >>> from sklearn.utils.extmath import cartesian
    >>> cartesian(([1, 2, 3], [4, 5], [6, 7]))
    array([[1, 4, 6],
           [1, 4, 7],
           [1, 5, 6],
           [1, 5, 7],
           [2, 4, 6],
           [2, 4, 7],
           [2, 5, 6],
           [2, 5, 7],
           [3, 4, 6],
           [3, 4, 7],
           [3, 5, 6],
           [3, 5, 7]])
    """
    arrays = [np.asarray(x) for x in arrays]
    shape = (len(x) for x in arrays)
    dtype = arrays[0].dtype

    ix = np.indices(shape)
    ix = ix.reshape(len(arrays), -1).T

    if out is None:
        out = np.empty_like(ix, dtype=dtype)

    for n, arr in enumerate(arrays):
        out[:, n] = arr[ix[:, n]]

    return out


def svd_flip(u, v, u_based_decision=True):
    """Sign correction to ensure deterministic output from SVD.

    Adjusts the columns of u and the rows of v such that the loadings in the
    columns in u that are largest in absolute value are always positive.

    Parameters
    ----------
    u : ndarray
        u and v are the output of `linalg.svd` or
        :func:`~sklearn.utils.extmath.randomized_svd`, with matching inner
        dimensions so one can compute `np.dot(u * s, v)`.

    v : ndarray
        u and v are the output of `linalg.svd` or
        :func:`~sklearn.utils.extmath.randomized_svd`, with matching inner
        dimensions so one can compute `np.dot(u * s, v)`.
        The input v should really be called vt to be consistent with scipy's
        output.

    u_based_decision : bool, default=True
        If True, use the columns of u as the basis for sign flipping.
        Otherwise, use the rows of v. The choice of which variable to base the
        decision on is generally algorithm dependent.

    Returns
    -------
    u_adjusted, v_adjusted : arrays with the same dimensions as the input.
    """
    if u_based_decision:
        # columns of u, rows of v
        max_abs_cols = np.argmax(np.abs(u), axis=0)
        signs = np.sign(u[max_abs_cols, range(u.shape[1])])
        u *= signs
        v *= signs[:, np.newaxis]
    else:
        # rows of v, columns of u
        max_abs_rows = np.argmax(np.abs(v), axis=1)
        signs = np.sign(v[range(v.shape[0]), max_abs_rows])
        u *= signs
        v *= signs[:, np.newaxis]
    return u, v
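

# Illustrative usage of svd_flip (a doctest-style sketch): the sign flip
# leaves the product u @ v unchanged while fixing a deterministic sign
# convention. Note that svd_flip modifies its inputs in place, hence the
# copies below.
#
#   >>> u = np.array([[-0.6, 0.8], [-0.8, -0.6]])
#   >>> v = np.array([[-1.0, 0.0], [0.0, 1.0]])
#   >>> u_f, v_f = svd_flip(u.copy(), v.copy())
#   >>> print(np.allclose(u @ v, u_f @ v_f))
#   True
#   >>> print(np.all(u_f[np.abs(u_f).argmax(axis=0), range(2)] > 0))
#   True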


def log_logistic(X, out=None):
    """Compute the log of the logistic function, ``log(1 / (1 + e ** -x))``.

    This implementation is numerically stable because it splits positive and
    negative values::

        -log(1 + exp(-x_i))     if x_i > 0
        x_i - log(1 + exp(x_i)) if x_i <= 0

    For the ordinary logistic function, use ``scipy.special.expit``.

    Parameters
    ----------
    X : array-like of shape (M, N) or (M,)
        Argument to the logistic function.

    out : array-like of shape (M, N) or (M,), default=None
        Preallocated output array.

    Returns
    -------
    out : ndarray of shape (M, N) or (M,)
        Log of the logistic function evaluated at every point in x.

    Notes
    -----
    See the blog post describing this implementation:
    http://fa.bianp.net/blog/2013/numerical-optimizers-for-logistic-regression/
    """
    is_1d = X.ndim == 1
    X = np.atleast_2d(X)
    X = check_array(X, dtype=np.float64)

    n_samples, n_features = X.shape

    if out is None:
        out = np.empty_like(X)

    _log_logistic_sigmoid(n_samples, n_features, X, out)

    if is_1d:
        return np.squeeze(out)
    return out


def softmax(X, copy=True):
    """
    Calculate the softmax function.

    The softmax function is calculated by::

        np.exp(X) / np.sum(np.exp(X), axis=1)

    This will cause overflow when large values are exponentiated.
    Hence the largest value in each row is subtracted from each data
    point to prevent this.

    Parameters
    ----------
    X : array-like of float of shape (M, N)
        Argument to the logistic function.

    copy : bool, default=True
        Copy X or not.

    Returns
    -------
    out : ndarray of shape (M, N)
        Softmax function evaluated at every point in x.
    """
    if copy:
        X = np.copy(X)
    max_prob = np.max(X, axis=1).reshape((-1, 1))
    X -= max_prob
    np.exp(X, X)
    sum_prob = np.sum(X, axis=1).reshape((-1, 1))
    X /= sum_prob
    return X
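

# Illustrative usage of softmax (a doctest-style sketch): rows sum to one,
# and the row-wise max subtraction keeps exp from overflowing on large
# inputs.
#
#   >>> P = softmax(np.array([[1000.0, 1001.0], [0.0, 0.0]]))
#   >>> print(np.allclose(P.sum(axis=1), 1.0))
#   True
#   >>> print(np.allclose(P[1], [0.5, 0.5]))
#   True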


def make_nonnegative(X, min_value=0):
    """Ensure `X.min()` >= `min_value`.

    Parameters
    ----------
    X : array-like
        The matrix to make non-negative.
    min_value : float, default=0
        The threshold value.

    Returns
    -------
    array-like
        The thresholded array.

    Raises
    ------
    ValueError
        When X is sparse and its minimum is below `min_value`.
    """
    min_ = X.min()
    if min_ < min_value:
        if sparse.issparse(X):
            raise ValueError(
                "Cannot make the data matrix"
                " nonnegative because it is sparse."
                " Adding a value to every entry would"
                " make it no longer sparse."
            )
        X = X + (min_value - min_)
    return X


# Use at least float64 for the accumulating functions to avoid precision
# issues; see https://github.com/numpy/numpy/issues/9393. The float64 is
# also retained as it is in case the float overflows.
def _safe_accumulator_op(op, x, *args, **kwargs):
    """Run a numpy accumulator function, upcasting to float64 when needed.

    This provides numpy accumulator functions with a float64 dtype when used
    on a floating point input. This prevents accumulator overflow on smaller
    floating point dtypes.

    Parameters
    ----------
    op : function
        A numpy accumulator function such as np.mean or np.sum.
    x : ndarray
        A numpy array to apply the accumulator function.
    *args : positional arguments
        Positional arguments passed to the accumulator function after the
        input x.
    **kwargs : keyword arguments
        Keyword arguments passed to the accumulator function.

    Returns
    -------
    result
        The output of the accumulator function passed to this function.
    """
    if np.issubdtype(x.dtype, np.floating) and x.dtype.itemsize < 8:
        result = op(x, *args, **kwargs, dtype=np.float64)
    else:
        result = op(x, *args, **kwargs)
    return result
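

# Illustrative sketch of why _safe_accumulator_op exists: a float32
# accumulator can overflow where a float64 one does not (values chosen
# arbitrarily near the float32 maximum of ~3.4e38).
#
#   >>> x = np.full(10, 3e38, dtype=np.float32)
#   >>> print(np.isinf(np.sum(x)))
#   True
#   >>> print(np.isfinite(_safe_accumulator_op(np.sum, x)))
#   True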


def _incremental_mean_and_var(
    X, last_mean, last_variance, last_sample_count, sample_weight=None
):
    """Calculate mean update and a Youngs and Cramer variance update.

    If sample_weight is given, the weighted mean and variance is computed.

    Update a given mean and (possibly) variance according to new data given
    in X. last_mean is always required to compute the new mean.
    If last_variance is None, no variance is computed and None is returned
    for updated_variance.

    From the paper "Algorithms for computing the sample variance: analysis and
    recommendations", by Chan, Golub, and LeVeque.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to use for variance update.

    last_mean : array-like of shape (n_features,)

    last_variance : array-like of shape (n_features,)

    last_sample_count : array-like of shape (n_features,)
        The number of samples encountered until now if sample_weight is None.
        If sample_weight is not None, this is the sum of sample_weight
        encountered.

    sample_weight : array-like of shape (n_samples,) or None
        Sample weights. If None, compute the unweighted mean/variance.

    Returns
    -------
    updated_mean : ndarray of shape (n_features,)

    updated_variance : ndarray of shape (n_features,)
        None if last_variance was None.

    updated_sample_count : ndarray of shape (n_features,)

    Notes
    -----
    NaNs are ignored during the algorithm.

    References
    ----------
    T. Chan, G. Golub, R. LeVeque. Algorithms for computing the sample
    variance: analysis and recommendations, The American Statistician,
    Vol. 37, No. 3, pp. 242-247

    Also, see the sparse implementation of this in
    `utils.sparsefuncs.incr_mean_variance_axis` and
    `utils.sparsefuncs_fast.incr_mean_variance_axis0`
    """
    # old = stats until now
    # new = the current increment
    # updated = the aggregated stats
    last_sum = last_mean * last_sample_count
    X_nan_mask = np.isnan(X)
    if np.any(X_nan_mask):
        sum_op = np.nansum
    else:
        sum_op = np.sum
    if sample_weight is not None:
        if np_version >= parse_version("1.16.6"):
            # equivalent to np.nansum(X * sample_weight, axis=0)
            # safer because np.float64(X*W) != np.float64(X)*np.float64(W)
            # dtype arg of np.matmul only exists since version 1.16
            new_sum = _safe_accumulator_op(
                np.matmul, sample_weight, np.where(X_nan_mask, 0, X)
            )
        else:
            new_sum = _safe_accumulator_op(
                np.nansum, X * sample_weight[:, None], axis=0
            )
        new_sample_count = _safe_accumulator_op(
            np.sum, sample_weight[:, None] * (~X_nan_mask), axis=0
        )
    else:
        new_sum = _safe_accumulator_op(sum_op, X, axis=0)
        n_samples = X.shape[0]
        new_sample_count = n_samples - np.sum(X_nan_mask, axis=0)

    updated_sample_count = last_sample_count + new_sample_count

    updated_mean = (last_sum + new_sum) / updated_sample_count

    if last_variance is None:
        updated_variance = None
    else:
        T = new_sum / new_sample_count
        temp = X - T
        if sample_weight is not None:
            if np_version >= parse_version("1.16.6"):
                # equivalent to np.nansum((X-T)**2 * sample_weight, axis=0)
                # safer because np.float64(X*W) != np.float64(X)*np.float64(W)
                # dtype arg of np.matmul only exists since version 1.16
                correction = _safe_accumulator_op(
                    np.matmul, sample_weight, np.where(X_nan_mask, 0, temp)
                )
                temp **= 2
                new_unnormalized_variance = _safe_accumulator_op(
                    np.matmul, sample_weight, np.where(X_nan_mask, 0, temp)
                )
            else:
                correction = _safe_accumulator_op(
                    sum_op, temp * sample_weight[:, None], axis=0
                )
                temp *= temp
                new_unnormalized_variance = _safe_accumulator_op(
                    sum_op, temp * sample_weight[:, None], axis=0
                )
        else:
            correction = _safe_accumulator_op(sum_op, temp, axis=0)
            temp **= 2
            new_unnormalized_variance = _safe_accumulator_op(
                sum_op, temp, axis=0
            )

        # correction term of the corrected 2 pass algorithm.
        # See "Algorithms for computing the sample variance: analysis
        # and recommendations", by Chan, Golub, and LeVeque.
        new_unnormalized_variance -= correction ** 2 / new_sample_count

        last_unnormalized_variance = last_variance * last_sample_count

        with np.errstate(divide="ignore", invalid="ignore"):
            last_over_new_count = last_sample_count / new_sample_count
            updated_unnormalized_variance = (
                last_unnormalized_variance
                + new_unnormalized_variance
                + last_over_new_count
                / updated_sample_count
                * (last_sum / last_over_new_count - new_sum) ** 2
            )

        zeros = last_sample_count == 0
        updated_unnormalized_variance[zeros] = new_unnormalized_variance[zeros]
        updated_variance = updated_unnormalized_variance / updated_sample_count

    return updated_mean, updated_variance, updated_sample_count
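

# Illustrative usage of _incremental_mean_and_var (a doctest-style sketch):
# processing two batches incrementally reproduces the mean and population
# variance of the concatenated data [1, 2, 3, 4].
#
#   >>> X1 = np.array([[1.0], [2.0]])
#   >>> X2 = np.array([[3.0], [4.0]])
#   >>> mean, var, count = _incremental_mean_and_var(
#   ...     X1, last_mean=0.0, last_variance=0.0,
#   ...     last_sample_count=np.array([0.0]))
#   >>> mean, var, count = _incremental_mean_and_var(
#   ...     X2, last_mean=mean, last_variance=var, last_sample_count=count)
#   >>> print(np.allclose(mean, 2.5), np.allclose(var, 1.25))
#   True True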


def _deterministic_vector_sign_flip(u):
    """Modify the sign of vectors for reproducibility.

    Flips the sign of elements of all the vectors (rows of u) such that
    the absolute maximum element of each vector is positive.

    Parameters
    ----------
    u : ndarray
        Array with vectors as its rows.

    Returns
    -------
    u_flipped : ndarray with same shape as u
        Array with the sign flipped vectors as its rows.
    """
    max_abs_rows = np.argmax(np.abs(u), axis=1)
    signs = np.sign(u[range(u.shape[0]), max_abs_rows])
    u *= signs[:, np.newaxis]
    return u


def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
    """Use high precision for cumsum and check that final value matches sum.

    Parameters
    ----------
    arr : array-like
        To be cumulatively summed as flat.
    axis : int, default=None
        Axis along which the cumulative sum is computed.
        The default (None) is to compute the cumsum over the flattened array.
    rtol : float, default=1e-05
        Relative tolerance, see ``np.allclose``.
    atol : float, default=1e-08
        Absolute tolerance, see ``np.allclose``.

    Returns
    -------
    out : ndarray
        Array with the cumulative sums along the chosen axis.
    """
    out = np.cumsum(arr, axis=axis, dtype=np.float64)
    expected = np.sum(arr, axis=axis, dtype=np.float64)
    if not np.all(
        np.isclose(
            out.take(-1, axis=axis), expected, rtol=rtol, atol=atol,
            equal_nan=True,
        )
    ):
        warnings.warn(
            "cumsum was found to be unstable: "
            "its last element does not correspond to sum",
            RuntimeWarning,
        )
    return out
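

# Illustrative usage of stable_cumsum (a doctest-style sketch): the sum is
# accumulated in float64 even for float32 input, and the last element is
# checked against the direct sum.
#
#   >>> x = np.ones(4, dtype=np.float32)
#   >>> out = stable_cumsum(x)
#   >>> print(out, out.dtype)
#   [1. 2. 3. 4.] float64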