1""" 2Least Angle Regression algorithm. See the documentation on the 3Generalized Linear Model for a complete discussion. 4""" 5# Author: Fabian Pedregosa <fabian.pedregosa@inria.fr> 6# Alexandre Gramfort <alexandre.gramfort@inria.fr> 7# Gael Varoquaux 8# 9# License: BSD 3 clause 10 11from math import log 12import sys 13import warnings 14 15import numpy as np 16from scipy import linalg, interpolate 17from scipy.linalg.lapack import get_lapack_funcs 18from joblib import Parallel 19 20from ._base import LinearModel 21from ._base import _deprecate_normalize 22from ._base import LinearRegression 23from ..base import RegressorMixin, MultiOutputMixin 24 25# mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs' 26from ..utils import arrayfuncs, as_float_array # type: ignore 27from ..utils import check_random_state 28from ..model_selection import check_cv 29from ..exceptions import ConvergenceWarning 30from ..utils.fixes import delayed 31 32SOLVE_TRIANGULAR_ARGS = {"check_finite": False} 33 34 35def lars_path( 36 X, 37 y, 38 Xy=None, 39 *, 40 Gram=None, 41 max_iter=500, 42 alpha_min=0, 43 method="lar", 44 copy_X=True, 45 eps=np.finfo(float).eps, 46 copy_Gram=True, 47 verbose=0, 48 return_path=True, 49 return_n_iter=False, 50 positive=False, 51): 52 """Compute Least Angle Regression or Lasso path using LARS algorithm [1] 53 54 The optimization objective for the case method='lasso' is:: 55 56 (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 57 58 in the case of method='lars', the objective function is only known in 59 the form of an implicit equation (see discussion in [1]) 60 61 Read more in the :ref:`User Guide <least_angle_regression>`. 62 63 Parameters 64 ---------- 65 X : None or array-like of shape (n_samples, n_features) 66 Input data. Note that if X is None then the Gram matrix must be 67 specified, i.e., cannot be None or False. 68 69 y : None or array-like of shape (n_samples,) 70 Input targets. 71 72 Xy : array-like of shape (n_samples,) or (n_samples, n_targets), \ 73 default=None 74 Xy = np.dot(X.T, y) that can be precomputed. It is useful 75 only when the Gram matrix is precomputed. 76 77 Gram : None, 'auto', array-like of shape (n_features, n_features), \ 78 default=None 79 Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram 80 matrix is precomputed from the given X, if there are more samples 81 than features. 82 83 max_iter : int, default=500 84 Maximum number of iterations to perform, set to infinity for no limit. 85 86 alpha_min : float, default=0 87 Minimum correlation along the path. It corresponds to the 88 regularization parameter alpha parameter in the Lasso. 89 90 method : {'lar', 'lasso'}, default='lar' 91 Specifies the returned model. Select ``'lar'`` for Least Angle 92 Regression, ``'lasso'`` for the Lasso. 93 94 copy_X : bool, default=True 95 If ``False``, ``X`` is overwritten. 96 97 eps : float, default=np.finfo(float).eps 98 The machine-precision regularization in the computation of the 99 Cholesky diagonal factors. Increase this for very ill-conditioned 100 systems. Unlike the ``tol`` parameter in some iterative 101 optimization-based algorithms, this parameter does not control 102 the tolerance of the optimization. 103 104 copy_Gram : bool, default=True 105 If ``False``, ``Gram`` is overwritten. 106 107 verbose : int, default=0 108 Controls output verbosity. 109 110 return_path : bool, default=True 111 If ``return_path==True`` returns the entire path, else returns only the 112 last point of the path. 113 114 return_n_iter : bool, default=False 115 Whether to return the number of iterations. 116 117 positive : bool, default=False 118 Restrict coefficients to be >= 0. 119 This option is only allowed with method 'lasso'. Note that the model 120 coefficients will not converge to the ordinary-least-squares solution 121 for small values of alpha. Only coefficients up to the smallest alpha 122 value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by 123 the stepwise Lars-Lasso algorithm are typically in congruence with the 124 solution of the coordinate descent lasso_path function. 125 126 Returns 127 ------- 128 alphas : array-like of shape (n_alphas + 1,) 129 Maximum of covariances (in absolute value) at each iteration. 130 ``n_alphas`` is either ``max_iter``, ``n_features`` or the 131 number of nodes in the path with ``alpha >= alpha_min``, whichever 132 is smaller. 133 134 active : array-like of shape (n_alphas,) 135 Indices of active variables at the end of the path. 136 137 coefs : array-like of shape (n_features, n_alphas + 1) 138 Coefficients along the path 139 140 n_iter : int 141 Number of iterations run. Returned only if return_n_iter is set 142 to True. 143 144 See Also 145 -------- 146 lars_path_gram 147 lasso_path 148 lasso_path_gram 149 LassoLars 150 Lars 151 LassoLarsCV 152 LarsCV 153 sklearn.decomposition.sparse_encode 154 155 References 156 ---------- 157 .. [1] "Least Angle Regression", Efron et al. 158 http://statweb.stanford.edu/~tibs/ftp/lars.pdf 159 160 .. [2] `Wikipedia entry on the Least-angle regression 161 <https://en.wikipedia.org/wiki/Least-angle_regression>`_ 162 163 .. [3] `Wikipedia entry on the Lasso 164 <https://en.wikipedia.org/wiki/Lasso_(statistics)>`_ 165 166 """ 167 if X is None and Gram is not None: 168 raise ValueError( 169 "X cannot be None if Gram is not None" 170 "Use lars_path_gram to avoid passing X and y." 171 ) 172 return _lars_path_solver( 173 X=X, 174 y=y, 175 Xy=Xy, 176 Gram=Gram, 177 n_samples=None, 178 max_iter=max_iter, 179 alpha_min=alpha_min, 180 method=method, 181 copy_X=copy_X, 182 eps=eps, 183 copy_Gram=copy_Gram, 184 verbose=verbose, 185 return_path=return_path, 186 return_n_iter=return_n_iter, 187 positive=positive, 188 ) 189 190 191def lars_path_gram( 192 Xy, 193 Gram, 194 *, 195 n_samples, 196 max_iter=500, 197 alpha_min=0, 198 method="lar", 199 copy_X=True, 200 eps=np.finfo(float).eps, 201 copy_Gram=True, 202 verbose=0, 203 return_path=True, 204 return_n_iter=False, 205 positive=False, 206): 207 """lars_path in the sufficient stats mode [1] 208 209 The optimization objective for the case method='lasso' is:: 210 211 (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 212 213 in the case of method='lars', the objective function is only known in 214 the form of an implicit equation (see discussion in [1]) 215 216 Read more in the :ref:`User Guide <least_angle_regression>`. 217 218 Parameters 219 ---------- 220 Xy : array-like of shape (n_samples,) or (n_samples, n_targets) 221 Xy = np.dot(X.T, y). 222 223 Gram : array-like of shape (n_features, n_features) 224 Gram = np.dot(X.T * X). 225 226 n_samples : int or float 227 Equivalent size of sample. 228 229 max_iter : int, default=500 230 Maximum number of iterations to perform, set to infinity for no limit. 231 232 alpha_min : float, default=0 233 Minimum correlation along the path. It corresponds to the 234 regularization parameter alpha parameter in the Lasso. 235 236 method : {'lar', 'lasso'}, default='lar' 237 Specifies the returned model. Select ``'lar'`` for Least Angle 238 Regression, ``'lasso'`` for the Lasso. 239 240 copy_X : bool, default=True 241 If ``False``, ``X`` is overwritten. 242 243 eps : float, default=np.finfo(float).eps 244 The machine-precision regularization in the computation of the 245 Cholesky diagonal factors. Increase this for very ill-conditioned 246 systems. Unlike the ``tol`` parameter in some iterative 247 optimization-based algorithms, this parameter does not control 248 the tolerance of the optimization. 249 250 copy_Gram : bool, default=True 251 If ``False``, ``Gram`` is overwritten. 252 253 verbose : int, default=0 254 Controls output verbosity. 255 256 return_path : bool, default=True 257 If ``return_path==True`` returns the entire path, else returns only the 258 last point of the path. 259 260 return_n_iter : bool, default=False 261 Whether to return the number of iterations. 262 263 positive : bool, default=False 264 Restrict coefficients to be >= 0. 265 This option is only allowed with method 'lasso'. Note that the model 266 coefficients will not converge to the ordinary-least-squares solution 267 for small values of alpha. Only coefficients up to the smallest alpha 268 value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by 269 the stepwise Lars-Lasso algorithm are typically in congruence with the 270 solution of the coordinate descent lasso_path function. 271 272 Returns 273 ------- 274 alphas : array-like of shape (n_alphas + 1,) 275 Maximum of covariances (in absolute value) at each iteration. 276 ``n_alphas`` is either ``max_iter``, ``n_features`` or the 277 number of nodes in the path with ``alpha >= alpha_min``, whichever 278 is smaller. 279 280 active : array-like of shape (n_alphas,) 281 Indices of active variables at the end of the path. 282 283 coefs : array-like of shape (n_features, n_alphas + 1) 284 Coefficients along the path 285 286 n_iter : int 287 Number of iterations run. Returned only if return_n_iter is set 288 to True. 289 290 See Also 291 -------- 292 lars_path 293 lasso_path 294 lasso_path_gram 295 LassoLars 296 Lars 297 LassoLarsCV 298 LarsCV 299 sklearn.decomposition.sparse_encode 300 301 References 302 ---------- 303 .. [1] "Least Angle Regression", Efron et al. 304 http://statweb.stanford.edu/~tibs/ftp/lars.pdf 305 306 .. [2] `Wikipedia entry on the Least-angle regression 307 <https://en.wikipedia.org/wiki/Least-angle_regression>`_ 308 309 .. [3] `Wikipedia entry on the Lasso 310 <https://en.wikipedia.org/wiki/Lasso_(statistics)>`_ 311 312 """ 313 return _lars_path_solver( 314 X=None, 315 y=None, 316 Xy=Xy, 317 Gram=Gram, 318 n_samples=n_samples, 319 max_iter=max_iter, 320 alpha_min=alpha_min, 321 method=method, 322 copy_X=copy_X, 323 eps=eps, 324 copy_Gram=copy_Gram, 325 verbose=verbose, 326 return_path=return_path, 327 return_n_iter=return_n_iter, 328 positive=positive, 329 ) 330 331 332def _lars_path_solver( 333 X, 334 y, 335 Xy=None, 336 Gram=None, 337 n_samples=None, 338 max_iter=500, 339 alpha_min=0, 340 method="lar", 341 copy_X=True, 342 eps=np.finfo(float).eps, 343 copy_Gram=True, 344 verbose=0, 345 return_path=True, 346 return_n_iter=False, 347 positive=False, 348): 349 """Compute Least Angle Regression or Lasso path using LARS algorithm [1] 350 351 The optimization objective for the case method='lasso' is:: 352 353 (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 354 355 in the case of method='lars', the objective function is only known in 356 the form of an implicit equation (see discussion in [1]) 357 358 Read more in the :ref:`User Guide <least_angle_regression>`. 359 360 Parameters 361 ---------- 362 X : None or ndarray of shape (n_samples, n_features) 363 Input data. Note that if X is None then Gram must be specified, 364 i.e., cannot be None or False. 365 366 y : None or ndarray of shape (n_samples,) 367 Input targets. 368 369 Xy : array-like of shape (n_samples,) or (n_samples, n_targets), \ 370 default=None 371 `Xy = np.dot(X.T, y)` that can be precomputed. It is useful 372 only when the Gram matrix is precomputed. 373 374 Gram : None, 'auto' or array-like of shape (n_features, n_features), \ 375 default=None 376 Precomputed Gram matrix `(X' * X)`, if ``'auto'``, the Gram 377 matrix is precomputed from the given X, if there are more samples 378 than features. 379 380 n_samples : int or float, default=None 381 Equivalent size of sample. If `None`, it will be `n_samples`. 382 383 max_iter : int, default=500 384 Maximum number of iterations to perform, set to infinity for no limit. 385 386 alpha_min : float, default=0 387 Minimum correlation along the path. It corresponds to the 388 regularization parameter alpha parameter in the Lasso. 389 390 method : {'lar', 'lasso'}, default='lar' 391 Specifies the returned model. Select ``'lar'`` for Least Angle 392 Regression, ``'lasso'`` for the Lasso. 393 394 copy_X : bool, default=True 395 If ``False``, ``X`` is overwritten. 396 397 eps : float, default=np.finfo(float).eps 398 The machine-precision regularization in the computation of the 399 Cholesky diagonal factors. Increase this for very ill-conditioned 400 systems. Unlike the ``tol`` parameter in some iterative 401 optimization-based algorithms, this parameter does not control 402 the tolerance of the optimization. 403 404 copy_Gram : bool, default=True 405 If ``False``, ``Gram`` is overwritten. 406 407 verbose : int, default=0 408 Controls output verbosity. 409 410 return_path : bool, default=True 411 If ``return_path==True`` returns the entire path, else returns only the 412 last point of the path. 413 414 return_n_iter : bool, default=False 415 Whether to return the number of iterations. 416 417 positive : bool, default=False 418 Restrict coefficients to be >= 0. 419 This option is only allowed with method 'lasso'. Note that the model 420 coefficients will not converge to the ordinary-least-squares solution 421 for small values of alpha. Only coefficients up to the smallest alpha 422 value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by 423 the stepwise Lars-Lasso algorithm are typically in congruence with the 424 solution of the coordinate descent lasso_path function. 425 426 Returns 427 ------- 428 alphas : array-like of shape (n_alphas + 1,) 429 Maximum of covariances (in absolute value) at each iteration. 430 ``n_alphas`` is either ``max_iter``, ``n_features`` or the 431 number of nodes in the path with ``alpha >= alpha_min``, whichever 432 is smaller. 433 434 active : array-like of shape (n_alphas,) 435 Indices of active variables at the end of the path. 436 437 coefs : array-like of shape (n_features, n_alphas + 1) 438 Coefficients along the path 439 440 n_iter : int 441 Number of iterations run. Returned only if return_n_iter is set 442 to True. 443 444 See Also 445 -------- 446 lasso_path 447 LassoLars 448 Lars 449 LassoLarsCV 450 LarsCV 451 sklearn.decomposition.sparse_encode 452 453 References 454 ---------- 455 .. [1] "Least Angle Regression", Efron et al. 456 http://statweb.stanford.edu/~tibs/ftp/lars.pdf 457 458 .. [2] `Wikipedia entry on the Least-angle regression 459 <https://en.wikipedia.org/wiki/Least-angle_regression>`_ 460 461 .. [3] `Wikipedia entry on the Lasso 462 <https://en.wikipedia.org/wiki/Lasso_(statistics)>`_ 463 464 """ 465 if method == "lar" and positive: 466 raise ValueError("Positive constraint not supported for 'lar' coding method.") 467 468 n_samples = n_samples if n_samples is not None else y.size 469 470 if Xy is None: 471 Cov = np.dot(X.T, y) 472 else: 473 Cov = Xy.copy() 474 475 if Gram is None or Gram is False: 476 Gram = None 477 if X is None: 478 raise ValueError("X and Gram cannot both be unspecified.") 479 elif isinstance(Gram, str) and Gram == "auto" or Gram is True: 480 if Gram is True or X.shape[0] > X.shape[1]: 481 Gram = np.dot(X.T, X) 482 else: 483 Gram = None 484 elif copy_Gram: 485 Gram = Gram.copy() 486 487 if Gram is None: 488 n_features = X.shape[1] 489 else: 490 n_features = Cov.shape[0] 491 if Gram.shape != (n_features, n_features): 492 raise ValueError("The shapes of the inputs Gram and Xy do not match.") 493 494 if copy_X and X is not None and Gram is None: 495 # force copy. setting the array to be fortran-ordered 496 # speeds up the calculation of the (partial) Gram matrix 497 # and allows to easily swap columns 498 X = X.copy("F") 499 500 max_features = min(max_iter, n_features) 501 502 dtypes = set(a.dtype for a in (X, y, Xy, Gram) if a is not None) 503 if len(dtypes) == 1: 504 # use the precision level of input data if it is consistent 505 return_dtype = next(iter(dtypes)) 506 else: 507 # fallback to double precision otherwise 508 return_dtype = np.float64 509 510 if return_path: 511 coefs = np.zeros((max_features + 1, n_features), dtype=return_dtype) 512 alphas = np.zeros(max_features + 1, dtype=return_dtype) 513 else: 514 coef, prev_coef = ( 515 np.zeros(n_features, dtype=return_dtype), 516 np.zeros(n_features, dtype=return_dtype), 517 ) 518 alpha, prev_alpha = ( 519 np.array([0.0], dtype=return_dtype), 520 np.array([0.0], dtype=return_dtype), 521 ) 522 # above better ideas? 523 524 n_iter, n_active = 0, 0 525 active, indices = list(), np.arange(n_features) 526 # holds the sign of covariance 527 sign_active = np.empty(max_features, dtype=np.int8) 528 drop = False 529 530 # will hold the cholesky factorization. Only lower part is 531 # referenced. 532 if Gram is None: 533 L = np.empty((max_features, max_features), dtype=X.dtype) 534 swap, nrm2 = linalg.get_blas_funcs(("swap", "nrm2"), (X,)) 535 else: 536 L = np.empty((max_features, max_features), dtype=Gram.dtype) 537 swap, nrm2 = linalg.get_blas_funcs(("swap", "nrm2"), (Cov,)) 538 (solve_cholesky,) = get_lapack_funcs(("potrs",), (L,)) 539 540 if verbose: 541 if verbose > 1: 542 print("Step\t\tAdded\t\tDropped\t\tActive set size\t\tC") 543 else: 544 sys.stdout.write(".") 545 sys.stdout.flush() 546 547 tiny32 = np.finfo(np.float32).tiny # to avoid division by 0 warning 548 cov_precision = np.finfo(Cov.dtype).precision 549 equality_tolerance = np.finfo(np.float32).eps 550 551 if Gram is not None: 552 Gram_copy = Gram.copy() 553 Cov_copy = Cov.copy() 554 555 while True: 556 if Cov.size: 557 if positive: 558 C_idx = np.argmax(Cov) 559 else: 560 C_idx = np.argmax(np.abs(Cov)) 561 562 C_ = Cov[C_idx] 563 564 if positive: 565 C = C_ 566 else: 567 C = np.fabs(C_) 568 else: 569 C = 0.0 570 571 if return_path: 572 alpha = alphas[n_iter, np.newaxis] 573 coef = coefs[n_iter] 574 prev_alpha = alphas[n_iter - 1, np.newaxis] 575 prev_coef = coefs[n_iter - 1] 576 577 alpha[0] = C / n_samples 578 if alpha[0] <= alpha_min + equality_tolerance: # early stopping 579 if abs(alpha[0] - alpha_min) > equality_tolerance: 580 # interpolation factor 0 <= ss < 1 581 if n_iter > 0: 582 # In the first iteration, all alphas are zero, the formula 583 # below would make ss a NaN 584 ss = (prev_alpha[0] - alpha_min) / (prev_alpha[0] - alpha[0]) 585 coef[:] = prev_coef + ss * (coef - prev_coef) 586 alpha[0] = alpha_min 587 if return_path: 588 coefs[n_iter] = coef 589 break 590 591 if n_iter >= max_iter or n_active >= n_features: 592 break 593 if not drop: 594 595 ########################################################## 596 # Append x_j to the Cholesky factorization of (Xa * Xa') # 597 # # 598 # ( L 0 ) # 599 # L -> ( ) , where L * w = Xa' x_j # 600 # ( w z ) and z = ||x_j|| # 601 # # 602 ########################################################## 603 604 if positive: 605 sign_active[n_active] = np.ones_like(C_) 606 else: 607 sign_active[n_active] = np.sign(C_) 608 m, n = n_active, C_idx + n_active 609 610 Cov[C_idx], Cov[0] = swap(Cov[C_idx], Cov[0]) 611 indices[n], indices[m] = indices[m], indices[n] 612 Cov_not_shortened = Cov 613 Cov = Cov[1:] # remove Cov[0] 614 615 if Gram is None: 616 X.T[n], X.T[m] = swap(X.T[n], X.T[m]) 617 c = nrm2(X.T[n_active]) ** 2 618 L[n_active, :n_active] = np.dot(X.T[n_active], X.T[:n_active].T) 619 else: 620 # swap does only work inplace if matrix is fortran 621 # contiguous ... 622 Gram[m], Gram[n] = swap(Gram[m], Gram[n]) 623 Gram[:, m], Gram[:, n] = swap(Gram[:, m], Gram[:, n]) 624 c = Gram[n_active, n_active] 625 L[n_active, :n_active] = Gram[n_active, :n_active] 626 627 # Update the cholesky decomposition for the Gram matrix 628 if n_active: 629 linalg.solve_triangular( 630 L[:n_active, :n_active], 631 L[n_active, :n_active], 632 trans=0, 633 lower=1, 634 overwrite_b=True, 635 **SOLVE_TRIANGULAR_ARGS, 636 ) 637 638 v = np.dot(L[n_active, :n_active], L[n_active, :n_active]) 639 diag = max(np.sqrt(np.abs(c - v)), eps) 640 L[n_active, n_active] = diag 641 642 if diag < 1e-7: 643 # The system is becoming too ill-conditioned. 644 # We have degenerate vectors in our active set. 645 # We'll 'drop for good' the last regressor added. 646 647 # Note: this case is very rare. It is no longer triggered by 648 # the test suite. The `equality_tolerance` margin added in 0.16 649 # to get early stopping to work consistently on all versions of 650 # Python including 32 bit Python under Windows seems to make it 651 # very difficult to trigger the 'drop for good' strategy. 652 warnings.warn( 653 "Regressors in active set degenerate. " 654 "Dropping a regressor, after %i iterations, " 655 "i.e. alpha=%.3e, " 656 "with an active set of %i regressors, and " 657 "the smallest cholesky pivot element being %.3e." 658 " Reduce max_iter or increase eps parameters." 659 % (n_iter, alpha, n_active, diag), 660 ConvergenceWarning, 661 ) 662 663 # XXX: need to figure a 'drop for good' way 664 Cov = Cov_not_shortened 665 Cov[0] = 0 666 Cov[C_idx], Cov[0] = swap(Cov[C_idx], Cov[0]) 667 continue 668 669 active.append(indices[n_active]) 670 n_active += 1 671 672 if verbose > 1: 673 print( 674 "%s\t\t%s\t\t%s\t\t%s\t\t%s" % (n_iter, active[-1], "", n_active, C) 675 ) 676 677 if method == "lasso" and n_iter > 0 and prev_alpha[0] < alpha[0]: 678 # alpha is increasing. This is because the updates of Cov are 679 # bringing in too much numerical error that is greater than 680 # than the remaining correlation with the 681 # regressors. Time to bail out 682 warnings.warn( 683 "Early stopping the lars path, as the residues " 684 "are small and the current value of alpha is no " 685 "longer well controlled. %i iterations, alpha=%.3e, " 686 "previous alpha=%.3e, with an active set of %i " 687 "regressors." % (n_iter, alpha, prev_alpha, n_active), 688 ConvergenceWarning, 689 ) 690 break 691 692 # least squares solution 693 least_squares, _ = solve_cholesky( 694 L[:n_active, :n_active], sign_active[:n_active], lower=True 695 ) 696 697 if least_squares.size == 1 and least_squares == 0: 698 # This happens because sign_active[:n_active] = 0 699 least_squares[...] = 1 700 AA = 1.0 701 else: 702 # is this really needed ? 703 AA = 1.0 / np.sqrt(np.sum(least_squares * sign_active[:n_active])) 704 705 if not np.isfinite(AA): 706 # L is too ill-conditioned 707 i = 0 708 L_ = L[:n_active, :n_active].copy() 709 while not np.isfinite(AA): 710 L_.flat[:: n_active + 1] += (2 ** i) * eps 711 least_squares, _ = solve_cholesky( 712 L_, sign_active[:n_active], lower=True 713 ) 714 tmp = max(np.sum(least_squares * sign_active[:n_active]), eps) 715 AA = 1.0 / np.sqrt(tmp) 716 i += 1 717 least_squares *= AA 718 719 if Gram is None: 720 # equiangular direction of variables in the active set 721 eq_dir = np.dot(X.T[:n_active].T, least_squares) 722 # correlation between each unactive variables and 723 # eqiangular vector 724 corr_eq_dir = np.dot(X.T[n_active:], eq_dir) 725 else: 726 # if huge number of features, this takes 50% of time, I 727 # think could be avoided if we just update it using an 728 # orthogonal (QR) decomposition of X 729 corr_eq_dir = np.dot(Gram[:n_active, n_active:].T, least_squares) 730 731 # Explicit rounding can be necessary to avoid `np.argmax(Cov)` yielding 732 # unstable results because of rounding errors. 733 np.around(corr_eq_dir, decimals=cov_precision, out=corr_eq_dir) 734 735 g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny32)) 736 if positive: 737 gamma_ = min(g1, C / AA) 738 else: 739 g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny32)) 740 gamma_ = min(g1, g2, C / AA) 741 742 # TODO: better names for these variables: z 743 drop = False 744 z = -coef[active] / (least_squares + tiny32) 745 z_pos = arrayfuncs.min_pos(z) 746 if z_pos < gamma_: 747 # some coefficients have changed sign 748 idx = np.where(z == z_pos)[0][::-1] 749 750 # update the sign, important for LAR 751 sign_active[idx] = -sign_active[idx] 752 753 if method == "lasso": 754 gamma_ = z_pos 755 drop = True 756 757 n_iter += 1 758 759 if return_path: 760 if n_iter >= coefs.shape[0]: 761 del coef, alpha, prev_alpha, prev_coef 762 # resize the coefs and alphas array 763 add_features = 2 * max(1, (max_features - n_active)) 764 coefs = np.resize(coefs, (n_iter + add_features, n_features)) 765 coefs[-add_features:] = 0 766 alphas = np.resize(alphas, n_iter + add_features) 767 alphas[-add_features:] = 0 768 coef = coefs[n_iter] 769 prev_coef = coefs[n_iter - 1] 770 else: 771 # mimic the effect of incrementing n_iter on the array references 772 prev_coef = coef 773 prev_alpha[0] = alpha[0] 774 coef = np.zeros_like(coef) 775 776 coef[active] = prev_coef[active] + gamma_ * least_squares 777 778 # update correlations 779 Cov -= gamma_ * corr_eq_dir 780 781 # See if any coefficient has changed sign 782 if drop and method == "lasso": 783 784 # handle the case when idx is not length of 1 785 for ii in idx: 786 arrayfuncs.cholesky_delete(L[:n_active, :n_active], ii) 787 788 n_active -= 1 789 # handle the case when idx is not length of 1 790 drop_idx = [active.pop(ii) for ii in idx] 791 792 if Gram is None: 793 # propagate dropped variable 794 for ii in idx: 795 for i in range(ii, n_active): 796 X.T[i], X.T[i + 1] = swap(X.T[i], X.T[i + 1]) 797 # yeah this is stupid 798 indices[i], indices[i + 1] = indices[i + 1], indices[i] 799 800 # TODO: this could be updated 801 residual = y - np.dot(X[:, :n_active], coef[active]) 802 temp = np.dot(X.T[n_active], residual) 803 804 Cov = np.r_[temp, Cov] 805 else: 806 for ii in idx: 807 for i in range(ii, n_active): 808 indices[i], indices[i + 1] = indices[i + 1], indices[i] 809 Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i + 1]) 810 Gram[:, i], Gram[:, i + 1] = swap(Gram[:, i], Gram[:, i + 1]) 811 812 # Cov_n = Cov_j + x_j * X + increment(betas) TODO: 813 # will this still work with multiple drops ? 814 815 # recompute covariance. Probably could be done better 816 # wrong as Xy is not swapped with the rest of variables 817 818 # TODO: this could be updated 819 temp = Cov_copy[drop_idx] - np.dot(Gram_copy[drop_idx], coef) 820 Cov = np.r_[temp, Cov] 821 822 sign_active = np.delete(sign_active, idx) 823 sign_active = np.append(sign_active, 0.0) # just to maintain size 824 if verbose > 1: 825 print( 826 "%s\t\t%s\t\t%s\t\t%s\t\t%s" 827 % (n_iter, "", drop_idx, n_active, abs(temp)) 828 ) 829 830 if return_path: 831 # resize coefs in case of early stop 832 alphas = alphas[: n_iter + 1] 833 coefs = coefs[: n_iter + 1] 834 835 if return_n_iter: 836 return alphas, active, coefs.T, n_iter 837 else: 838 return alphas, active, coefs.T 839 else: 840 if return_n_iter: 841 return alpha, active, coef, n_iter 842 else: 843 return alpha, active, coef 844 845 846############################################################################### 847# Estimator classes 848 849 850class Lars(MultiOutputMixin, RegressorMixin, LinearModel): 851 """Least Angle Regression model a.k.a. LAR. 852 853 Read more in the :ref:`User Guide <least_angle_regression>`. 854 855 Parameters 856 ---------- 857 fit_intercept : bool, default=True 858 Whether to calculate the intercept for this model. If set 859 to false, no intercept will be used in calculations 860 (i.e. data is expected to be centered). 861 862 verbose : bool or int, default=False 863 Sets the verbosity amount. 864 865 normalize : bool, default=True 866 This parameter is ignored when ``fit_intercept`` is set to False. 867 If True, the regressors X will be normalized before regression by 868 subtracting the mean and dividing by the l2-norm. 869 If you wish to standardize, please use 870 :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` 871 on an estimator with ``normalize=False``. 872 873 .. deprecated:: 1.0 874 ``normalize`` was deprecated in version 1.0. It will default 875 to False in 1.2 and be removed in 1.4. 876 877 precompute : bool, 'auto' or array-like , default='auto' 878 Whether to use a precomputed Gram matrix to speed up 879 calculations. If set to ``'auto'`` let us decide. The Gram 880 matrix can also be passed as argument. 881 882 n_nonzero_coefs : int, default=500 883 Target number of non-zero coefficients. Use ``np.inf`` for no limit. 884 885 eps : float, default=np.finfo(float).eps 886 The machine-precision regularization in the computation of the 887 Cholesky diagonal factors. Increase this for very ill-conditioned 888 systems. Unlike the ``tol`` parameter in some iterative 889 optimization-based algorithms, this parameter does not control 890 the tolerance of the optimization. 891 892 copy_X : bool, default=True 893 If ``True``, X will be copied; else, it may be overwritten. 894 895 fit_path : bool, default=True 896 If True the full path is stored in the ``coef_path_`` attribute. 897 If you compute the solution for a large problem or many targets, 898 setting ``fit_path`` to ``False`` will lead to a speedup, especially 899 with a small alpha. 900 901 jitter : float, default=None 902 Upper bound on a uniform noise parameter to be added to the 903 `y` values, to satisfy the model's assumption of 904 one-at-a-time computations. Might help with stability. 905 906 .. versionadded:: 0.23 907 908 random_state : int, RandomState instance or None, default=None 909 Determines random number generation for jittering. Pass an int 910 for reproducible output across multiple function calls. 911 See :term:`Glossary <random_state>`. Ignored if `jitter` is None. 912 913 .. versionadded:: 0.23 914 915 Attributes 916 ---------- 917 alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays 918 Maximum of covariances (in absolute value) at each iteration. 919 ``n_alphas`` is either ``max_iter``, ``n_features`` or the 920 number of nodes in the path with ``alpha >= alpha_min``, whichever 921 is smaller. If this is a list of array-like, the length of the outer 922 list is `n_targets`. 923 924 active_ : list of shape (n_alphas,) or list of such lists 925 Indices of active variables at the end of the path. 926 If this is a list of list, the length of the outer list is `n_targets`. 927 928 coef_path_ : array-like of shape (n_features, n_alphas + 1) or list \ 929 of such arrays 930 The varying values of the coefficients along the path. It is not 931 present if the ``fit_path`` parameter is ``False``. If this is a list 932 of array-like, the length of the outer list is `n_targets`. 933 934 coef_ : array-like of shape (n_features,) or (n_targets, n_features) 935 Parameter vector (w in the formulation formula). 936 937 intercept_ : float or array-like of shape (n_targets,) 938 Independent term in decision function. 939 940 n_iter_ : array-like or int 941 The number of iterations taken by lars_path to find the 942 grid of alphas for each target. 943 944 n_features_in_ : int 945 Number of features seen during :term:`fit`. 946 947 .. versionadded:: 0.24 948 949 feature_names_in_ : ndarray of shape (`n_features_in_`,) 950 Names of features seen during :term:`fit`. Defined only when `X` 951 has feature names that are all strings. 952 953 .. versionadded:: 1.0 954 955 See Also 956 -------- 957 lars_path: Compute Least Angle Regression or Lasso 958 path using LARS algorithm. 959 LarsCV : Cross-validated Least Angle Regression model. 960 sklearn.decomposition.sparse_encode : Sparse coding. 961 962 Examples 963 -------- 964 >>> from sklearn import linear_model 965 >>> reg = linear_model.Lars(n_nonzero_coefs=1, normalize=False) 966 >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1.1111, 0, -1.1111]) 967 Lars(n_nonzero_coefs=1, normalize=False) 968 >>> print(reg.coef_) 969 [ 0. -1.11...] 970 """ 971 972 method = "lar" 973 positive = False 974 975 def __init__( 976 self, 977 *, 978 fit_intercept=True, 979 verbose=False, 980 normalize="deprecated", 981 precompute="auto", 982 n_nonzero_coefs=500, 983 eps=np.finfo(float).eps, 984 copy_X=True, 985 fit_path=True, 986 jitter=None, 987 random_state=None, 988 ): 989 self.fit_intercept = fit_intercept 990 self.verbose = verbose 991 self.normalize = normalize 992 self.precompute = precompute 993 self.n_nonzero_coefs = n_nonzero_coefs 994 self.eps = eps 995 self.copy_X = copy_X 996 self.fit_path = fit_path 997 self.jitter = jitter 998 self.random_state = random_state 999 1000 @staticmethod 1001 def _get_gram(precompute, X, y): 1002 if (not hasattr(precompute, "__array__")) and ( 1003 (precompute is True) 1004 or (precompute == "auto" and X.shape[0] > X.shape[1]) 1005 or (precompute == "auto" and y.shape[1] > 1) 1006 ): 1007 precompute = np.dot(X.T, X) 1008 1009 return precompute 1010 1011 def _fit(self, X, y, max_iter, alpha, fit_path, normalize, Xy=None): 1012 """Auxiliary method to fit the model using X, y as training data""" 1013 n_features = X.shape[1] 1014 1015 X, y, X_offset, y_offset, X_scale = self._preprocess_data( 1016 X, y, self.fit_intercept, normalize, self.copy_X 1017 ) 1018 1019 if y.ndim == 1: 1020 y = y[:, np.newaxis] 1021 1022 n_targets = y.shape[1] 1023 1024 Gram = self._get_gram(self.precompute, X, y) 1025 1026 self.alphas_ = [] 1027 self.n_iter_ = [] 1028 self.coef_ = np.empty((n_targets, n_features), dtype=X.dtype) 1029 1030 if fit_path: 1031 self.active_ = [] 1032 self.coef_path_ = [] 1033 for k in range(n_targets): 1034 this_Xy = None if Xy is None else Xy[:, k] 1035 alphas, active, coef_path, n_iter_ = lars_path( 1036 X, 1037 y[:, k], 1038 Gram=Gram, 1039 Xy=this_Xy, 1040 copy_X=self.copy_X, 1041 copy_Gram=True, 1042 alpha_min=alpha, 1043 method=self.method, 1044 verbose=max(0, self.verbose - 1), 1045 max_iter=max_iter, 1046 eps=self.eps, 1047 return_path=True, 1048 return_n_iter=True, 1049 positive=self.positive, 1050 ) 1051 self.alphas_.append(alphas) 1052 self.active_.append(active) 1053 self.n_iter_.append(n_iter_) 1054 self.coef_path_.append(coef_path) 1055 self.coef_[k] = coef_path[:, -1] 1056 1057 if n_targets == 1: 1058 self.alphas_, self.active_, self.coef_path_, self.coef_ = [ 1059 a[0] 1060 for a in (self.alphas_, self.active_, self.coef_path_, self.coef_) 1061 ] 1062 self.n_iter_ = self.n_iter_[0] 1063 else: 1064 for k in range(n_targets): 1065 this_Xy = None if Xy is None else Xy[:, k] 1066 alphas, _, self.coef_[k], n_iter_ = lars_path( 1067 X, 1068 y[:, k], 1069 Gram=Gram, 1070 Xy=this_Xy, 1071 copy_X=self.copy_X, 1072 copy_Gram=True, 1073 alpha_min=alpha, 1074 method=self.method, 1075 verbose=max(0, self.verbose - 1), 1076 max_iter=max_iter, 1077 eps=self.eps, 1078 return_path=False, 1079 return_n_iter=True, 1080 positive=self.positive, 1081 ) 1082 self.alphas_.append(alphas) 1083 self.n_iter_.append(n_iter_) 1084 if n_targets == 1: 1085 self.alphas_ = self.alphas_[0] 1086 self.n_iter_ = self.n_iter_[0] 1087 1088 self._set_intercept(X_offset, y_offset, X_scale) 1089 return self 1090 1091 def fit(self, X, y, Xy=None): 1092 """Fit the model using X, y as training data. 1093 1094 Parameters 1095 ---------- 1096 X : array-like of shape (n_samples, n_features) 1097 Training data. 1098 1099 y : array-like of shape (n_samples,) or (n_samples, n_targets) 1100 Target values. 1101 1102 Xy : array-like of shape (n_samples,) or (n_samples, n_targets), \ 1103 default=None 1104 Xy = np.dot(X.T, y) that can be precomputed. It is useful 1105 only when the Gram matrix is precomputed. 1106 1107 Returns 1108 ------- 1109 self : object 1110 Returns an instance of self. 1111 """ 1112 X, y = self._validate_data(X, y, y_numeric=True, multi_output=True) 1113 1114 _normalize = _deprecate_normalize( 1115 self.normalize, default=True, estimator_name=self.__class__.__name__ 1116 ) 1117 1118 alpha = getattr(self, "alpha", 0.0) 1119 if hasattr(self, "n_nonzero_coefs"): 1120 alpha = 0.0 # n_nonzero_coefs parametrization takes priority 1121 max_iter = self.n_nonzero_coefs 1122 else: 1123 max_iter = self.max_iter 1124 1125 if self.jitter is not None: 1126 rng = check_random_state(self.random_state) 1127 1128 noise = rng.uniform(high=self.jitter, size=len(y)) 1129 y = y + noise 1130 1131 self._fit( 1132 X, 1133 y, 1134 max_iter=max_iter, 1135 alpha=alpha, 1136 fit_path=self.fit_path, 1137 normalize=_normalize, 1138 Xy=Xy, 1139 ) 1140 1141 return self 1142 1143 1144class LassoLars(Lars): 1145 """Lasso model fit with Least Angle Regression a.k.a. Lars. 1146 1147 It is a Linear Model trained with an L1 prior as regularizer. 1148 1149 The optimization objective for Lasso is:: 1150 1151 (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 1152 1153 Read more in the :ref:`User Guide <least_angle_regression>`. 1154 1155 Parameters 1156 ---------- 1157 alpha : float, default=1.0 1158 Constant that multiplies the penalty term. Defaults to 1.0. 1159 ``alpha = 0`` is equivalent to an ordinary least square, solved 1160 by :class:`LinearRegression`. For numerical reasons, using 1161 ``alpha = 0`` with the LassoLars object is not advised and you 1162 should prefer the LinearRegression object. 1163 1164 fit_intercept : bool, default=True 1165 Whether to calculate the intercept for this model. If set 1166 to false, no intercept will be used in calculations 1167 (i.e. data is expected to be centered). 1168 1169 verbose : bool or int, default=False 1170 Sets the verbosity amount. 1171 1172 normalize : bool, default=True 1173 This parameter is ignored when ``fit_intercept`` is set to False. 1174 If True, the regressors X will be normalized before regression by 1175 subtracting the mean and dividing by the l2-norm. 1176 If you wish to standardize, please use 1177 :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` 1178 on an estimator with ``normalize=False``. 1179 1180 .. deprecated:: 1.0 1181 ``normalize`` was deprecated in version 1.0. It will default 1182 to False in 1.2 and be removed in 1.4. 1183 1184 precompute : bool, 'auto' or array-like, default='auto' 1185 Whether to use a precomputed Gram matrix to speed up 1186 calculations. If set to ``'auto'`` let us decide. The Gram 1187 matrix can also be passed as argument. 1188 1189 max_iter : int, default=500 1190 Maximum number of iterations to perform. 1191 1192 eps : float, default=np.finfo(float).eps 1193 The machine-precision regularization in the computation of the 1194 Cholesky diagonal factors. Increase this for very ill-conditioned 1195 systems. Unlike the ``tol`` parameter in some iterative 1196 optimization-based algorithms, this parameter does not control 1197 the tolerance of the optimization. 1198 1199 copy_X : bool, default=True 1200 If True, X will be copied; else, it may be overwritten. 1201 1202 fit_path : bool, default=True 1203 If ``True`` the full path is stored in the ``coef_path_`` attribute. 1204 If you compute the solution for a large problem or many targets, 1205 setting ``fit_path`` to ``False`` will lead to a speedup, especially 1206 with a small alpha. 1207 1208 positive : bool, default=False 1209 Restrict coefficients to be >= 0. Be aware that you might want to 1210 remove fit_intercept which is set True by default. 1211 Under the positive restriction the model coefficients will not converge 1212 to the ordinary-least-squares solution for small values of alpha. 1213 Only coefficients up to the smallest alpha value (``alphas_[alphas_ > 1214 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso 1215 algorithm are typically in congruence with the solution of the 1216 coordinate descent Lasso estimator. 1217 1218 jitter : float, default=None 1219 Upper bound on a uniform noise parameter to be added to the 1220 `y` values, to satisfy the model's assumption of 1221 one-at-a-time computations. Might help with stability. 1222 1223 .. versionadded:: 0.23 1224 1225 random_state : int, RandomState instance or None, default=None 1226 Determines random number generation for jittering. Pass an int 1227 for reproducible output across multiple function calls. 1228 See :term:`Glossary <random_state>`. Ignored if `jitter` is None. 1229 1230 .. versionadded:: 0.23 1231 1232 Attributes 1233 ---------- 1234 alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays 1235 Maximum of covariances (in absolute value) at each iteration. 1236 ``n_alphas`` is either ``max_iter``, ``n_features`` or the 1237 number of nodes in the path with ``alpha >= alpha_min``, whichever 1238 is smaller. If this is a list of array-like, the length of the outer 1239 list is `n_targets`. 1240 1241 active_ : list of length n_alphas or list of such lists 1242 Indices of active variables at the end of the path. 1243 If this is a list of list, the length of the outer list is `n_targets`. 1244 1245 coef_path_ : array-like of shape (n_features, n_alphas + 1) or list \ 1246 of such arrays 1247 If a list is passed it's expected to be one of n_targets such arrays. 1248 The varying values of the coefficients along the path. It is not 1249 present if the ``fit_path`` parameter is ``False``. If this is a list 1250 of array-like, the length of the outer list is `n_targets`. 1251 1252 coef_ : array-like of shape (n_features,) or (n_targets, n_features) 1253 Parameter vector (w in the formulation formula). 1254 1255 intercept_ : float or array-like of shape (n_targets,) 1256 Independent term in decision function. 1257 1258 n_iter_ : array-like or int 1259 The number of iterations taken by lars_path to find the 1260 grid of alphas for each target. 1261 1262 n_features_in_ : int 1263 Number of features seen during :term:`fit`. 1264 1265 .. versionadded:: 0.24 1266 1267 feature_names_in_ : ndarray of shape (`n_features_in_`,) 1268 Names of features seen during :term:`fit`. Defined only when `X` 1269 has feature names that are all strings. 1270 1271 .. versionadded:: 1.0 1272 1273 See Also 1274 -------- 1275 lars_path : Compute Least Angle Regression or Lasso 1276 path using LARS algorithm. 1277 lasso_path : Compute Lasso path with coordinate descent. 1278 Lasso : Linear Model trained with L1 prior as 1279 regularizer (aka the Lasso). 1280 LassoCV : Lasso linear model with iterative fitting 1281 along a regularization path. 1282 LassoLarsCV: Cross-validated Lasso, using the LARS algorithm. 1283 LassoLarsIC : Lasso model fit with Lars using BIC 1284 or AIC for model selection. 1285 sklearn.decomposition.sparse_encode : Sparse coding. 1286 1287 Examples 1288 -------- 1289 >>> from sklearn import linear_model 1290 >>> reg = linear_model.LassoLars(alpha=0.01, normalize=False) 1291 >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1]) 1292 LassoLars(alpha=0.01, normalize=False) 1293 >>> print(reg.coef_) 1294 [ 0. -0.955...] 1295 """ 1296 1297 method = "lasso" 1298 1299 def __init__( 1300 self, 1301 alpha=1.0, 1302 *, 1303 fit_intercept=True, 1304 verbose=False, 1305 normalize="deprecated", 1306 precompute="auto", 1307 max_iter=500, 1308 eps=np.finfo(float).eps, 1309 copy_X=True, 1310 fit_path=True, 1311 positive=False, 1312 jitter=None, 1313 random_state=None, 1314 ): 1315 self.alpha = alpha 1316 self.fit_intercept = fit_intercept 1317 self.max_iter = max_iter 1318 self.verbose = verbose 1319 self.normalize = normalize 1320 self.positive = positive 1321 self.precompute = precompute 1322 self.copy_X = copy_X 1323 self.eps = eps 1324 self.fit_path = fit_path 1325 self.jitter = jitter 1326 self.random_state = random_state 1327 1328 1329############################################################################### 1330# Cross-validated estimator classes 1331 1332 1333def _check_copy_and_writeable(array, copy=False): 1334 if copy or not array.flags.writeable: 1335 return array.copy() 1336 return array 1337 1338 1339def _lars_path_residues( 1340 X_train, 1341 y_train, 1342 X_test, 1343 y_test, 1344 Gram=None, 1345 copy=True, 1346 method="lars", 1347 verbose=False, 1348 fit_intercept=True, 1349 normalize=True, 1350 max_iter=500, 1351 eps=np.finfo(float).eps, 1352 positive=False, 1353): 1354 """Compute the residues on left-out data for a full LARS path 1355 1356 Parameters 1357 ----------- 1358 X_train : array-like of shape (n_samples, n_features) 1359 The data to fit the LARS on 1360 1361 y_train : array-like of shape (n_samples,) 1362 The target variable to fit LARS on 1363 1364 X_test : array-like of shape (n_samples, n_features) 1365 The data to compute the residues on 1366 1367 y_test : array-like of shape (n_samples,) 1368 The target variable to compute the residues on 1369 1370 Gram : None, 'auto' or array-like of shape (n_features, n_features), \ 1371 default=None 1372 Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram 1373 matrix is precomputed from the given X, if there are more samples 1374 than features 1375 1376 copy : bool, default=True 1377 Whether X_train, X_test, y_train and y_test should be copied; 1378 if False, they may be overwritten. 1379 1380 method : {'lar' , 'lasso'}, default='lar' 1381 Specifies the returned model. Select ``'lar'`` for Least Angle 1382 Regression, ``'lasso'`` for the Lasso. 1383 1384 verbose : bool or int, default=False 1385 Sets the amount of verbosity 1386 1387 fit_intercept : bool, default=True 1388 whether to calculate the intercept for this model. If set 1389 to false, no intercept will be used in calculations 1390 (i.e. data is expected to be centered). 1391 1392 positive : bool, default=False 1393 Restrict coefficients to be >= 0. Be aware that you might want to 1394 remove fit_intercept which is set True by default. 1395 See reservations for using this option in combination with method 1396 'lasso' for expected small values of alpha in the doc of LassoLarsCV 1397 and LassoLarsIC. 1398 1399 normalize : bool, default=True 1400 This parameter is ignored when ``fit_intercept`` is set to False. 1401 If True, the regressors X will be normalized before regression by 1402 subtracting the mean and dividing by the l2-norm. 1403 If you wish to standardize, please use 1404 :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` 1405 on an estimator with ``normalize=False``. 1406 1407 .. deprecated:: 1.0 1408 ``normalize`` was deprecated in version 1.0. It will default 1409 to False in 1.2 and be removed in 1.4. 1410 1411 max_iter : int, default=500 1412 Maximum number of iterations to perform. 1413 1414 eps : float, default=np.finfo(float).eps 1415 The machine-precision regularization in the computation of the 1416 Cholesky diagonal factors. Increase this for very ill-conditioned 1417 systems. Unlike the ``tol`` parameter in some iterative 1418 optimization-based algorithms, this parameter does not control 1419 the tolerance of the optimization. 1420 1421 Returns 1422 -------- 1423 alphas : array-like of shape (n_alphas,) 1424 Maximum of covariances (in absolute value) at each iteration. 1425 ``n_alphas`` is either ``max_iter`` or ``n_features``, whichever 1426 is smaller. 1427 1428 active : list 1429 Indices of active variables at the end of the path. 1430 1431 coefs : array-like of shape (n_features, n_alphas) 1432 Coefficients along the path 1433 1434 residues : array-like of shape (n_alphas, n_samples) 1435 Residues of the prediction on the test data 1436 """ 1437 X_train = _check_copy_and_writeable(X_train, copy) 1438 y_train = _check_copy_and_writeable(y_train, copy) 1439 X_test = _check_copy_and_writeable(X_test, copy) 1440 y_test = _check_copy_and_writeable(y_test, copy) 1441 1442 if fit_intercept: 1443 X_mean = X_train.mean(axis=0) 1444 X_train -= X_mean 1445 X_test -= X_mean 1446 y_mean = y_train.mean(axis=0) 1447 y_train = as_float_array(y_train, copy=False) 1448 y_train -= y_mean 1449 y_test = as_float_array(y_test, copy=False) 1450 y_test -= y_mean 1451 1452 if normalize: 1453 norms = np.sqrt(np.sum(X_train ** 2, axis=0)) 1454 nonzeros = np.flatnonzero(norms) 1455 X_train[:, nonzeros] /= norms[nonzeros] 1456 1457 alphas, active, coefs = lars_path( 1458 X_train, 1459 y_train, 1460 Gram=Gram, 1461 copy_X=False, 1462 copy_Gram=False, 1463 method=method, 1464 verbose=max(0, verbose - 1), 1465 max_iter=max_iter, 1466 eps=eps, 1467 positive=positive, 1468 ) 1469 if normalize: 1470 coefs[nonzeros] /= norms[nonzeros][:, np.newaxis] 1471 residues = np.dot(X_test, coefs) - y_test[:, np.newaxis] 1472 return alphas, active, coefs, residues.T 1473 1474 1475class LarsCV(Lars): 1476 """Cross-validated Least Angle Regression model. 1477 1478 See glossary entry for :term:`cross-validation estimator`. 1479 1480 Read more in the :ref:`User Guide <least_angle_regression>`. 1481 1482 Parameters 1483 ---------- 1484 fit_intercept : bool, default=True 1485 Whether to calculate the intercept for this model. If set 1486 to false, no intercept will be used in calculations 1487 (i.e. data is expected to be centered). 1488 1489 verbose : bool or int, default=False 1490 Sets the verbosity amount. 1491 1492 max_iter : int, default=500 1493 Maximum number of iterations to perform. 1494 1495 normalize : bool, default=True 1496 This parameter is ignored when ``fit_intercept`` is set to False. 1497 If True, the regressors X will be normalized before regression by 1498 subtracting the mean and dividing by the l2-norm. 1499 If you wish to standardize, please use 1500 :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` 1501 on an estimator with ``normalize=False``. 1502 1503 .. deprecated:: 1.0 1504 ``normalize`` was deprecated in version 1.0. It will default 1505 to False in 1.2 and be removed in 1.4. 1506 1507 precompute : bool, 'auto' or array-like , default='auto' 1508 Whether to use a precomputed Gram matrix to speed up 1509 calculations. If set to ``'auto'`` let us decide. The Gram matrix 1510 cannot be passed as argument since we will use only subsets of X. 1511 1512 cv : int, cross-validation generator or an iterable, default=None 1513 Determines the cross-validation splitting strategy. 1514 Possible inputs for cv are: 1515 1516 - None, to use the default 5-fold cross-validation, 1517 - integer, to specify the number of folds. 1518 - :term:`CV splitter`, 1519 - An iterable yielding (train, test) splits as arrays of indices. 1520 1521 For integer/None inputs, :class:`KFold` is used. 1522 1523 Refer :ref:`User Guide <cross_validation>` for the various 1524 cross-validation strategies that can be used here. 1525 1526 .. versionchanged:: 0.22 1527 ``cv`` default value if None changed from 3-fold to 5-fold. 1528 1529 max_n_alphas : int, default=1000 1530 The maximum number of points on the path used to compute the 1531 residuals in the cross-validation. 1532 1533 n_jobs : int or None, default=None 1534 Number of CPUs to use during the cross validation. 1535 ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 1536 ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 1537 for more details. 1538 1539 eps : float, default=np.finfo(float).eps 1540 The machine-precision regularization in the computation of the 1541 Cholesky diagonal factors. Increase this for very ill-conditioned 1542 systems. Unlike the ``tol`` parameter in some iterative 1543 optimization-based algorithms, this parameter does not control 1544 the tolerance of the optimization. 1545 1546 copy_X : bool, default=True 1547 If ``True``, X will be copied; else, it may be overwritten. 1548 1549 Attributes 1550 ---------- 1551 active_ : list of length n_alphas or list of such lists 1552 Indices of active variables at the end of the path. 1553 If this is a list of lists, the outer list length is `n_targets`. 1554 1555 coef_ : array-like of shape (n_features,) 1556 parameter vector (w in the formulation formula) 1557 1558 intercept_ : float 1559 independent term in decision function 1560 1561 coef_path_ : array-like of shape (n_features, n_alphas) 1562 the varying values of the coefficients along the path 1563 1564 alpha_ : float 1565 the estimated regularization parameter alpha 1566 1567 alphas_ : array-like of shape (n_alphas,) 1568 the different values of alpha along the path 1569 1570 cv_alphas_ : array-like of shape (n_cv_alphas,) 1571 all the values of alpha along the path for the different folds 1572 1573 mse_path_ : array-like of shape (n_folds, n_cv_alphas) 1574 the mean square error on left-out for each fold along the path 1575 (alpha values given by ``cv_alphas``) 1576 1577 n_iter_ : array-like or int 1578 the number of iterations run by Lars with the optimal alpha. 1579 1580 n_features_in_ : int 1581 Number of features seen during :term:`fit`. 1582 1583 .. versionadded:: 0.24 1584 1585 feature_names_in_ : ndarray of shape (`n_features_in_`,) 1586 Names of features seen during :term:`fit`. Defined only when `X` 1587 has feature names that are all strings. 1588 1589 .. versionadded:: 1.0 1590 1591 See Also 1592 -------- 1593 lars_path : Compute Least Angle Regression or Lasso 1594 path using LARS algorithm. 1595 lasso_path : Compute Lasso path with coordinate descent. 1596 Lasso : Linear Model trained with L1 prior as 1597 regularizer (aka the Lasso). 1598 LassoCV : Lasso linear model with iterative fitting 1599 along a regularization path. 1600 LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. 1601 LassoLarsIC : Lasso model fit with Lars using BIC 1602 or AIC for model selection. 1603 sklearn.decomposition.sparse_encode : Sparse coding. 1604 1605 Examples 1606 -------- 1607 >>> from sklearn.linear_model import LarsCV 1608 >>> from sklearn.datasets import make_regression 1609 >>> X, y = make_regression(n_samples=200, noise=4.0, random_state=0) 1610 >>> reg = LarsCV(cv=5, normalize=False).fit(X, y) 1611 >>> reg.score(X, y) 1612 0.9996... 1613 >>> reg.alpha_ 1614 0.2961... 1615 >>> reg.predict(X[:1,]) 1616 array([154.3996...]) 1617 """ 1618 1619 method = "lar" 1620 1621 def __init__( 1622 self, 1623 *, 1624 fit_intercept=True, 1625 verbose=False, 1626 max_iter=500, 1627 normalize="deprecated", 1628 precompute="auto", 1629 cv=None, 1630 max_n_alphas=1000, 1631 n_jobs=None, 1632 eps=np.finfo(float).eps, 1633 copy_X=True, 1634 ): 1635 self.max_iter = max_iter 1636 self.cv = cv 1637 self.max_n_alphas = max_n_alphas 1638 self.n_jobs = n_jobs 1639 super().__init__( 1640 fit_intercept=fit_intercept, 1641 verbose=verbose, 1642 normalize=normalize, 1643 precompute=precompute, 1644 n_nonzero_coefs=500, 1645 eps=eps, 1646 copy_X=copy_X, 1647 fit_path=True, 1648 ) 1649 1650 def _more_tags(self): 1651 return {"multioutput": False} 1652 1653 def fit(self, X, y): 1654 """Fit the model using X, y as training data. 1655 1656 Parameters 1657 ---------- 1658 X : array-like of shape (n_samples, n_features) 1659 Training data. 1660 1661 y : array-like of shape (n_samples,) 1662 Target values. 1663 1664 Returns 1665 ------- 1666 self : object 1667 Returns an instance of self. 1668 """ 1669 _normalize = _deprecate_normalize( 1670 self.normalize, default=True, estimator_name=self.__class__.__name__ 1671 ) 1672 1673 X, y = self._validate_data(X, y, y_numeric=True) 1674 X = as_float_array(X, copy=self.copy_X) 1675 y = as_float_array(y, copy=self.copy_X) 1676 1677 # init cross-validation generator 1678 cv = check_cv(self.cv, classifier=False) 1679 1680 # As we use cross-validation, the Gram matrix is not precomputed here 1681 Gram = self.precompute 1682 if hasattr(Gram, "__array__"): 1683 warnings.warn( 1684 'Parameter "precompute" cannot be an array in ' 1685 '%s. Automatically switch to "auto" instead.' 1686 % self.__class__.__name__ 1687 ) 1688 Gram = "auto" 1689 1690 cv_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)( 1691 delayed(_lars_path_residues)( 1692 X[train], 1693 y[train], 1694 X[test], 1695 y[test], 1696 Gram=Gram, 1697 copy=False, 1698 method=self.method, 1699 verbose=max(0, self.verbose - 1), 1700 normalize=_normalize, 1701 fit_intercept=self.fit_intercept, 1702 max_iter=self.max_iter, 1703 eps=self.eps, 1704 positive=self.positive, 1705 ) 1706 for train, test in cv.split(X, y) 1707 ) 1708 all_alphas = np.concatenate(list(zip(*cv_paths))[0]) 1709 # Unique also sorts 1710 all_alphas = np.unique(all_alphas) 1711 # Take at most max_n_alphas values 1712 stride = int(max(1, int(len(all_alphas) / float(self.max_n_alphas)))) 1713 all_alphas = all_alphas[::stride] 1714 1715 mse_path = np.empty((len(all_alphas), len(cv_paths))) 1716 for index, (alphas, _, _, residues) in enumerate(cv_paths): 1717 alphas = alphas[::-1] 1718 residues = residues[::-1] 1719 if alphas[0] != 0: 1720 alphas = np.r_[0, alphas] 1721 residues = np.r_[residues[0, np.newaxis], residues] 1722 if alphas[-1] != all_alphas[-1]: 1723 alphas = np.r_[alphas, all_alphas[-1]] 1724 residues = np.r_[residues, residues[-1, np.newaxis]] 1725 this_residues = interpolate.interp1d(alphas, residues, axis=0)(all_alphas) 1726 this_residues **= 2 1727 mse_path[:, index] = np.mean(this_residues, axis=-1) 1728 1729 mask = np.all(np.isfinite(mse_path), axis=-1) 1730 all_alphas = all_alphas[mask] 1731 mse_path = mse_path[mask] 1732 # Select the alpha that minimizes left-out error 1733 i_best_alpha = np.argmin(mse_path.mean(axis=-1)) 1734 best_alpha = all_alphas[i_best_alpha] 1735 1736 # Store our parameters 1737 self.alpha_ = best_alpha 1738 self.cv_alphas_ = all_alphas 1739 self.mse_path_ = mse_path 1740 1741 # Now compute the full model 1742 # it will call a lasso internally when self if LassoLarsCV 1743 # as self.method == 'lasso' 1744 self._fit( 1745 X, 1746 y, 1747 max_iter=self.max_iter, 1748 alpha=best_alpha, 1749 Xy=None, 1750 fit_path=True, 1751 normalize=_normalize, 1752 ) 1753 return self 1754 1755 1756class LassoLarsCV(LarsCV): 1757 """Cross-validated Lasso, using the LARS algorithm. 1758 1759 See glossary entry for :term:`cross-validation estimator`. 1760 1761 The optimization objective for Lasso is:: 1762 1763 (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 1764 1765 Read more in the :ref:`User Guide <least_angle_regression>`. 1766 1767 Parameters 1768 ---------- 1769 fit_intercept : bool, default=True 1770 Whether to calculate the intercept for this model. If set 1771 to false, no intercept will be used in calculations 1772 (i.e. data is expected to be centered). 1773 1774 verbose : bool or int, default=False 1775 Sets the verbosity amount. 1776 1777 max_iter : int, default=500 1778 Maximum number of iterations to perform. 1779 1780 normalize : bool, default=True 1781 This parameter is ignored when ``fit_intercept`` is set to False. 1782 If True, the regressors X will be normalized before regression by 1783 subtracting the mean and dividing by the l2-norm. 1784 If you wish to standardize, please use 1785 :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` 1786 on an estimator with ``normalize=False``. 1787 1788 .. deprecated:: 1.0 1789 ``normalize`` was deprecated in version 1.0. It will default 1790 to False in 1.2 and be removed in 1.4. 1791 1792 precompute : bool or 'auto' , default='auto' 1793 Whether to use a precomputed Gram matrix to speed up 1794 calculations. If set to ``'auto'`` let us decide. The Gram matrix 1795 cannot be passed as argument since we will use only subsets of X. 1796 1797 cv : int, cross-validation generator or an iterable, default=None 1798 Determines the cross-validation splitting strategy. 1799 Possible inputs for cv are: 1800 1801 - None, to use the default 5-fold cross-validation, 1802 - integer, to specify the number of folds. 1803 - :term:`CV splitter`, 1804 - An iterable yielding (train, test) splits as arrays of indices. 1805 1806 For integer/None inputs, :class:`KFold` is used. 1807 1808 Refer :ref:`User Guide <cross_validation>` for the various 1809 cross-validation strategies that can be used here. 1810 1811 .. versionchanged:: 0.22 1812 ``cv`` default value if None changed from 3-fold to 5-fold. 1813 1814 max_n_alphas : int, default=1000 1815 The maximum number of points on the path used to compute the 1816 residuals in the cross-validation. 1817 1818 n_jobs : int or None, default=None 1819 Number of CPUs to use during the cross validation. 1820 ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 1821 ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 1822 for more details. 1823 1824 eps : float, default=np.finfo(float).eps 1825 The machine-precision regularization in the computation of the 1826 Cholesky diagonal factors. Increase this for very ill-conditioned 1827 systems. Unlike the ``tol`` parameter in some iterative 1828 optimization-based algorithms, this parameter does not control 1829 the tolerance of the optimization. 1830 1831 copy_X : bool, default=True 1832 If True, X will be copied; else, it may be overwritten. 1833 1834 positive : bool, default=False 1835 Restrict coefficients to be >= 0. Be aware that you might want to 1836 remove fit_intercept which is set True by default. 1837 Under the positive restriction the model coefficients do not converge 1838 to the ordinary-least-squares solution for small values of alpha. 1839 Only coefficients up to the smallest alpha value (``alphas_[alphas_ > 1840 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso 1841 algorithm are typically in congruence with the solution of the 1842 coordinate descent Lasso estimator. 1843 As a consequence using LassoLarsCV only makes sense for problems where 1844 a sparse solution is expected and/or reached. 1845 1846 Attributes 1847 ---------- 1848 coef_ : array-like of shape (n_features,) 1849 parameter vector (w in the formulation formula) 1850 1851 intercept_ : float 1852 independent term in decision function. 1853 1854 coef_path_ : array-like of shape (n_features, n_alphas) 1855 the varying values of the coefficients along the path 1856 1857 alpha_ : float 1858 the estimated regularization parameter alpha 1859 1860 alphas_ : array-like of shape (n_alphas,) 1861 the different values of alpha along the path 1862 1863 cv_alphas_ : array-like of shape (n_cv_alphas,) 1864 all the values of alpha along the path for the different folds 1865 1866 mse_path_ : array-like of shape (n_folds, n_cv_alphas) 1867 the mean square error on left-out for each fold along the path 1868 (alpha values given by ``cv_alphas``) 1869 1870 n_iter_ : array-like or int 1871 the number of iterations run by Lars with the optimal alpha. 1872 1873 active_ : list of int 1874 Indices of active variables at the end of the path. 1875 1876 n_features_in_ : int 1877 Number of features seen during :term:`fit`. 1878 1879 .. versionadded:: 0.24 1880 1881 feature_names_in_ : ndarray of shape (`n_features_in_`,) 1882 Names of features seen during :term:`fit`. Defined only when `X` 1883 has feature names that are all strings. 1884 1885 .. versionadded:: 1.0 1886 1887 See Also 1888 -------- 1889 lars_path : Compute Least Angle Regression or Lasso 1890 path using LARS algorithm. 1891 lasso_path : Compute Lasso path with coordinate descent. 1892 Lasso : Linear Model trained with L1 prior as 1893 regularizer (aka the Lasso). 1894 LassoCV : Lasso linear model with iterative fitting 1895 along a regularization path. 1896 LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. 1897 LassoLarsIC : Lasso model fit with Lars using BIC 1898 or AIC for model selection. 1899 sklearn.decomposition.sparse_encode : Sparse coding. 1900 1901 Notes 1902 ----- 1903 The object solves the same problem as the LassoCV object. However, 1904 unlike the LassoCV, it find the relevant alphas values by itself. 1905 In general, because of this property, it will be more stable. 1906 However, it is more fragile to heavily multicollinear datasets. 1907 1908 It is more efficient than the LassoCV if only a small number of 1909 features are selected compared to the total number, for instance if 1910 there are very few samples compared to the number of features. 1911 1912 Examples 1913 -------- 1914 >>> from sklearn.linear_model import LassoLarsCV 1915 >>> from sklearn.datasets import make_regression 1916 >>> X, y = make_regression(noise=4.0, random_state=0) 1917 >>> reg = LassoLarsCV(cv=5, normalize=False).fit(X, y) 1918 >>> reg.score(X, y) 1919 0.9993... 1920 >>> reg.alpha_ 1921 0.3972... 1922 >>> reg.predict(X[:1,]) 1923 array([-78.4831...]) 1924 """ 1925 1926 method = "lasso" 1927 1928 def __init__( 1929 self, 1930 *, 1931 fit_intercept=True, 1932 verbose=False, 1933 max_iter=500, 1934 normalize="deprecated", 1935 precompute="auto", 1936 cv=None, 1937 max_n_alphas=1000, 1938 n_jobs=None, 1939 eps=np.finfo(float).eps, 1940 copy_X=True, 1941 positive=False, 1942 ): 1943 self.fit_intercept = fit_intercept 1944 self.verbose = verbose 1945 self.max_iter = max_iter 1946 self.normalize = normalize 1947 self.precompute = precompute 1948 self.cv = cv 1949 self.max_n_alphas = max_n_alphas 1950 self.n_jobs = n_jobs 1951 self.eps = eps 1952 self.copy_X = copy_X 1953 self.positive = positive 1954 # XXX : we don't use super().__init__ 1955 # to avoid setting n_nonzero_coefs 1956 1957 1958class LassoLarsIC(LassoLars): 1959 """Lasso model fit with Lars using BIC or AIC for model selection. 1960 1961 The optimization objective for Lasso is:: 1962 1963 (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 1964 1965 AIC is the Akaike information criterion [2]_ and BIC is the Bayes 1966 Information criterion [3]_. Such criteria are useful to select the value 1967 of the regularization parameter by making a trade-off between the 1968 goodness of fit and the complexity of the model. A good model should 1969 explain well the data while being simple. 1970 1971 Read more in the :ref:`User Guide <lasso_lars_ic>`. 1972 1973 Parameters 1974 ---------- 1975 criterion : {'aic', 'bic'}, default='aic' 1976 The type of criterion to use. 1977 1978 fit_intercept : bool, default=True 1979 Whether to calculate the intercept for this model. If set 1980 to false, no intercept will be used in calculations 1981 (i.e. data is expected to be centered). 1982 1983 verbose : bool or int, default=False 1984 Sets the verbosity amount. 1985 1986 normalize : bool, default=True 1987 This parameter is ignored when ``fit_intercept`` is set to False. 1988 If True, the regressors X will be normalized before regression by 1989 subtracting the mean and dividing by the l2-norm. 1990 If you wish to standardize, please use 1991 :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` 1992 on an estimator with ``normalize=False``. 1993 1994 .. deprecated:: 1.0 1995 ``normalize`` was deprecated in version 1.0. It will default 1996 to False in 1.2 and be removed in 1.4. 1997 1998 precompute : bool, 'auto' or array-like, default='auto' 1999 Whether to use a precomputed Gram matrix to speed up 2000 calculations. If set to ``'auto'`` let us decide. The Gram 2001 matrix can also be passed as argument. 2002 2003 max_iter : int, default=500 2004 Maximum number of iterations to perform. Can be used for 2005 early stopping. 2006 2007 eps : float, default=np.finfo(float).eps 2008 The machine-precision regularization in the computation of the 2009 Cholesky diagonal factors. Increase this for very ill-conditioned 2010 systems. Unlike the ``tol`` parameter in some iterative 2011 optimization-based algorithms, this parameter does not control 2012 the tolerance of the optimization. 2013 2014 copy_X : bool, default=True 2015 If True, X will be copied; else, it may be overwritten. 2016 2017 positive : bool, default=False 2018 Restrict coefficients to be >= 0. Be aware that you might want to 2019 remove fit_intercept which is set True by default. 2020 Under the positive restriction the model coefficients do not converge 2021 to the ordinary-least-squares solution for small values of alpha. 2022 Only coefficients up to the smallest alpha value (``alphas_[alphas_ > 2023 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso 2024 algorithm are typically in congruence with the solution of the 2025 coordinate descent Lasso estimator. 2026 As a consequence using LassoLarsIC only makes sense for problems where 2027 a sparse solution is expected and/or reached. 2028 2029 noise_variance : float, default=None 2030 The estimated noise variance of the data. If `None`, an unbiased 2031 estimate is computed by an OLS model. However, it is only possible 2032 in the case where `n_samples > n_features + fit_intercept`. 2033 2034 .. versionadded:: 1.1 2035 2036 Attributes 2037 ---------- 2038 coef_ : array-like of shape (n_features,) 2039 parameter vector (w in the formulation formula) 2040 2041 intercept_ : float 2042 independent term in decision function. 2043 2044 alpha_ : float 2045 the alpha parameter chosen by the information criterion 2046 2047 alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays 2048 Maximum of covariances (in absolute value) at each iteration. 2049 ``n_alphas`` is either ``max_iter``, ``n_features`` or the 2050 number of nodes in the path with ``alpha >= alpha_min``, whichever 2051 is smaller. If a list, it will be of length `n_targets`. 2052 2053 n_iter_ : int 2054 number of iterations run by lars_path to find the grid of 2055 alphas. 2056 2057 criterion_ : array-like of shape (n_alphas,) 2058 The value of the information criteria ('aic', 'bic') across all 2059 alphas. The alpha which has the smallest information criterion is 2060 chosen, as specified in [1]_. 2061 2062 noise_variance_ : float 2063 The estimated noise variance from the data used to compute the 2064 criterion. 2065 2066 .. versionadded:: 1.1 2067 2068 n_features_in_ : int 2069 Number of features seen during :term:`fit`. 2070 2071 .. versionadded:: 0.24 2072 2073 feature_names_in_ : ndarray of shape (`n_features_in_`,) 2074 Names of features seen during :term:`fit`. Defined only when `X` 2075 has feature names that are all strings. 2076 2077 .. versionadded:: 1.0 2078 2079 See Also 2080 -------- 2081 lars_path : Compute Least Angle Regression or Lasso 2082 path using LARS algorithm. 2083 lasso_path : Compute Lasso path with coordinate descent. 2084 Lasso : Linear Model trained with L1 prior as 2085 regularizer (aka the Lasso). 2086 LassoCV : Lasso linear model with iterative fitting 2087 along a regularization path. 2088 LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. 2089 LassoLarsCV: Cross-validated Lasso, using the LARS algorithm. 2090 sklearn.decomposition.sparse_encode : Sparse coding. 2091 2092 Notes 2093 ----- 2094 The number of degrees of freedom is computed as in [1]_. 2095 2096 To have more details regarding the mathematical formulation of the 2097 AIC and BIC criteria, please refer to :ref:`User Guide <lasso_lars_ic>`. 2098 2099 References 2100 ---------- 2101 .. [1] :arxiv:`Zou, Hui, Trevor Hastie, and Robert Tibshirani. 2102 "On the degrees of freedom of the lasso." 2103 The Annals of Statistics 35.5 (2007): 2173-2192. 2104 <0712.0881>` 2105 2106 .. [2] `Wikipedia entry on the Akaike information criterion 2107 <https://en.wikipedia.org/wiki/Akaike_information_criterion>`_ 2108 2109 .. [3] `Wikipedia entry on the Bayesian information criterion 2110 <https://en.wikipedia.org/wiki/Bayesian_information_criterion>`_ 2111 2112 Examples 2113 -------- 2114 >>> from sklearn import linear_model 2115 >>> reg = linear_model.LassoLarsIC(criterion='bic', normalize=False) 2116 >>> X = [[-2, 2], [-1, 1], [0, 0], [1, 1], [2, 2]] 2117 >>> y = [-2.2222, -1.1111, 0, -1.1111, -2.2222] 2118 >>> reg.fit(X, y) 2119 LassoLarsIC(criterion='bic', normalize=False) 2120 >>> print(reg.coef_) 2121 [ 0. -1.11...] 2122 """ 2123 2124 def __init__( 2125 self, 2126 criterion="aic", 2127 *, 2128 fit_intercept=True, 2129 verbose=False, 2130 normalize="deprecated", 2131 precompute="auto", 2132 max_iter=500, 2133 eps=np.finfo(float).eps, 2134 copy_X=True, 2135 positive=False, 2136 noise_variance=None, 2137 ): 2138 self.criterion = criterion 2139 self.fit_intercept = fit_intercept 2140 self.positive = positive 2141 self.max_iter = max_iter 2142 self.verbose = verbose 2143 self.normalize = normalize 2144 self.copy_X = copy_X 2145 self.precompute = precompute 2146 self.eps = eps 2147 self.fit_path = True 2148 self.noise_variance = noise_variance 2149 2150 def _more_tags(self): 2151 return {"multioutput": False} 2152 2153 def fit(self, X, y, copy_X=None): 2154 """Fit the model using X, y as training data. 2155 2156 Parameters 2157 ---------- 2158 X : array-like of shape (n_samples, n_features) 2159 Training data. 2160 2161 y : array-like of shape (n_samples,) 2162 Target values. Will be cast to X's dtype if necessary. 2163 2164 copy_X : bool, default=None 2165 If provided, this parameter will override the choice 2166 of copy_X made at instance creation. 2167 If ``True``, X will be copied; else, it may be overwritten. 2168 2169 Returns 2170 ------- 2171 self : object 2172 Returns an instance of self. 2173 """ 2174 _normalize = _deprecate_normalize( 2175 self.normalize, default=True, estimator_name=self.__class__.__name__ 2176 ) 2177 2178 if copy_X is None: 2179 copy_X = self.copy_X 2180 X, y = self._validate_data(X, y, y_numeric=True) 2181 2182 X, y, Xmean, ymean, Xstd = LinearModel._preprocess_data( 2183 X, y, self.fit_intercept, _normalize, copy_X 2184 ) 2185 2186 Gram = self.precompute 2187 2188 alphas_, _, coef_path_, self.n_iter_ = lars_path( 2189 X, 2190 y, 2191 Gram=Gram, 2192 copy_X=copy_X, 2193 copy_Gram=True, 2194 alpha_min=0.0, 2195 method="lasso", 2196 verbose=self.verbose, 2197 max_iter=self.max_iter, 2198 eps=self.eps, 2199 return_n_iter=True, 2200 positive=self.positive, 2201 ) 2202 2203 n_samples = X.shape[0] 2204 2205 if self.criterion == "aic": 2206 criterion_factor = 2 2207 elif self.criterion == "bic": 2208 criterion_factor = log(n_samples) 2209 else: 2210 raise ValueError( 2211 f"criterion should be either bic or aic, got {self.criterion!r}" 2212 ) 2213 2214 residuals = y[:, np.newaxis] - np.dot(X, coef_path_) 2215 residuals_sum_squares = np.sum(residuals ** 2, axis=0) 2216 degrees_of_freedom = np.zeros(coef_path_.shape[1], dtype=int) 2217 for k, coef in enumerate(coef_path_.T): 2218 mask = np.abs(coef) > np.finfo(coef.dtype).eps 2219 if not np.any(mask): 2220 continue 2221 # get the number of degrees of freedom equal to: 2222 # Xc = X[:, mask] 2223 # Trace(Xc * inv(Xc.T, Xc) * Xc.T) ie the number of non-zero coefs 2224 degrees_of_freedom[k] = np.sum(mask) 2225 2226 self.alphas_ = alphas_ 2227 2228 if self.noise_variance is None: 2229 self.noise_variance_ = self._estimate_noise_variance( 2230 X, y, positive=self.positive 2231 ) 2232 else: 2233 self.noise_variance_ = self.noise_variance 2234 2235 self.criterion_ = ( 2236 n_samples * np.log(2 * np.pi * self.noise_variance_) 2237 + residuals_sum_squares / self.noise_variance_ 2238 + criterion_factor * degrees_of_freedom 2239 ) 2240 n_best = np.argmin(self.criterion_) 2241 2242 self.alpha_ = alphas_[n_best] 2243 self.coef_ = coef_path_[:, n_best] 2244 self._set_intercept(Xmean, ymean, Xstd) 2245 return self 2246 2247 def _estimate_noise_variance(self, X, y, positive): 2248 """Compute an estimate of the variance with an OLS model. 2249 2250 Parameters 2251 ---------- 2252 X : ndarray of shape (n_samples, n_features) 2253 Data to be fitted by the OLS model. We expect the data to be 2254 centered. 2255 2256 y : ndarray of shape (n_samples,) 2257 Associated target. 2258 2259 positive : bool, default=False 2260 Restrict coefficients to be >= 0. This should be inline with 2261 the `positive` parameter from `LassoLarsIC`. 2262 2263 Returns 2264 ------- 2265 noise_variance : float 2266 An estimator of the noise variance of an OLS model. 2267 """ 2268 if X.shape[0] <= X.shape[1] + self.fit_intercept: 2269 raise ValueError( 2270 f"You are using {self.__class__.__name__} in the case where the number " 2271 "of samples is smaller than the number of features. In this setting, " 2272 "getting a good estimate for the variance of the noise is not " 2273 "possible. Provide an estimate of the noise variance in the " 2274 "constructor." 2275 ) 2276 # X and y are already centered and we don't need to fit with an intercept 2277 ols_model = LinearRegression(positive=positive, fit_intercept=False) 2278 y_pred = ols_model.fit(X, y).predict(X) 2279 return np.sum((y - y_pred) ** 2) / ( 2280 X.shape[0] - X.shape[1] - self.fit_intercept 2281 ) 2282