1"""
2Least Angle Regression algorithm. See the documentation on the
3Generalized Linear Model for a complete discussion.
4"""
5# Author: Fabian Pedregosa <fabian.pedregosa@inria.fr>
6#         Alexandre Gramfort <alexandre.gramfort@inria.fr>
7#         Gael Varoquaux
8#
9# License: BSD 3 clause
10
11from math import log
12import sys
13import warnings
14
15import numpy as np
16from scipy import linalg, interpolate
17from scipy.linalg.lapack import get_lapack_funcs
18from joblib import Parallel
19
20from ._base import LinearModel
21from ._base import _deprecate_normalize
22from ._base import LinearRegression
23from ..base import RegressorMixin, MultiOutputMixin
24
25# mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs'
26from ..utils import arrayfuncs, as_float_array  # type: ignore
27from ..utils import check_random_state
28from ..model_selection import check_cv
29from ..exceptions import ConvergenceWarning
30from ..utils.fixes import delayed
31
32SOLVE_TRIANGULAR_ARGS = {"check_finite": False}
33
34
35def lars_path(
36    X,
37    y,
38    Xy=None,
39    *,
40    Gram=None,
41    max_iter=500,
42    alpha_min=0,
43    method="lar",
44    copy_X=True,
45    eps=np.finfo(float).eps,
46    copy_Gram=True,
47    verbose=0,
48    return_path=True,
49    return_n_iter=False,
50    positive=False,
51):
52    """Compute Least Angle Regression or Lasso path using LARS algorithm [1]
53
    The optimization objective for the case method='lasso' is::

        (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1

    In the case of method='lar', the objective function is only known in
    the form of an implicit equation (see discussion in [1]).
60
61    Read more in the :ref:`User Guide <least_angle_regression>`.
62
63    Parameters
64    ----------
65    X : None or array-like of shape (n_samples, n_features)
66        Input data. Note that if X is None then the Gram matrix must be
67        specified, i.e., cannot be None or False.
68
69    y : None or array-like of shape (n_samples,)
70        Input targets.
71
    Xy : array-like of shape (n_features,) or (n_features, n_targets), \
            default=None
        Xy = np.dot(X.T, y) that can be precomputed. It is useful
        only when the Gram matrix is precomputed.
76
    Gram : None, 'auto', array-like of shape (n_features, n_features), \
            default=None
        Precomputed Gram matrix (X' * X). If ``'auto'``, the Gram
        matrix is precomputed from the given X when there are more samples
        than features.
82
83    max_iter : int, default=500
84        Maximum number of iterations to perform, set to infinity for no limit.
85
    alpha_min : float, default=0
        Minimum correlation along the path. It corresponds to the
        regularization parameter alpha in the Lasso.
89
90    method : {'lar', 'lasso'}, default='lar'
91        Specifies the returned model. Select ``'lar'`` for Least Angle
92        Regression, ``'lasso'`` for the Lasso.
93
94    copy_X : bool, default=True
95        If ``False``, ``X`` is overwritten.
96
97    eps : float, default=np.finfo(float).eps
98        The machine-precision regularization in the computation of the
99        Cholesky diagonal factors. Increase this for very ill-conditioned
100        systems. Unlike the ``tol`` parameter in some iterative
101        optimization-based algorithms, this parameter does not control
102        the tolerance of the optimization.
103
104    copy_Gram : bool, default=True
105        If ``False``, ``Gram`` is overwritten.
106
107    verbose : int, default=0
108        Controls output verbosity.
109
    return_path : bool, default=True
        If ``True``, returns the entire path; otherwise returns only the
        last point of the path.
113
114    return_n_iter : bool, default=False
115        Whether to return the number of iterations.
116
117    positive : bool, default=False
118        Restrict coefficients to be >= 0.
119        This option is only allowed with method 'lasso'. Note that the model
120        coefficients will not converge to the ordinary-least-squares solution
121        for small values of alpha. Only coefficients up to the smallest alpha
122        value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by
123        the stepwise Lars-Lasso algorithm are typically in congruence with the
124        solution of the coordinate descent lasso_path function.
125
126    Returns
127    -------
128    alphas : array-like of shape (n_alphas + 1,)
129        Maximum of covariances (in absolute value) at each iteration.
130        ``n_alphas`` is either ``max_iter``, ``n_features`` or the
131        number of nodes in the path with ``alpha >= alpha_min``, whichever
132        is smaller.
133
134    active : array-like of shape (n_alphas,)
135        Indices of active variables at the end of the path.
136
137    coefs : array-like of shape (n_features, n_alphas + 1)
138        Coefficients along the path
139
140    n_iter : int
141        Number of iterations run. Returned only if return_n_iter is set
142        to True.
143
144    See Also
145    --------
146    lars_path_gram
147    lasso_path
148    lasso_path_gram
149    LassoLars
150    Lars
151    LassoLarsCV
152    LarsCV
153    sklearn.decomposition.sparse_encode
154
155    References
156    ----------
157    .. [1] "Least Angle Regression", Efron et al.
158           http://statweb.stanford.edu/~tibs/ftp/lars.pdf
159
160    .. [2] `Wikipedia entry on the Least-angle regression
161           <https://en.wikipedia.org/wiki/Least-angle_regression>`_
162
163    .. [3] `Wikipedia entry on the Lasso
164           <https://en.wikipedia.org/wiki/Lasso_(statistics)>`_
165
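    Examples
    --------
    A minimal usage sketch; the exact path values depend on the data, and
    the dataset below is only illustrative::

        from sklearn.datasets import make_regression
        from sklearn.linear_model import lars_path

        X, y = make_regression(n_samples=50, n_features=10, random_state=0)
        alphas, active, coefs = lars_path(X, y, method="lasso")
        # ``coefs`` has one column of coefficients per step of the path;
        # the last column corresponds to the smallest alpha reached.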
166    """
    if X is None and Gram is not None:
        raise ValueError(
            "X cannot be None if Gram is not None. "
            "Use lars_path_gram to avoid passing X and y."
        )
172    return _lars_path_solver(
173        X=X,
174        y=y,
175        Xy=Xy,
176        Gram=Gram,
177        n_samples=None,
178        max_iter=max_iter,
179        alpha_min=alpha_min,
180        method=method,
181        copy_X=copy_X,
182        eps=eps,
183        copy_Gram=copy_Gram,
184        verbose=verbose,
185        return_path=return_path,
186        return_n_iter=return_n_iter,
187        positive=positive,
188    )
189
190
191def lars_path_gram(
192    Xy,
193    Gram,
194    *,
195    n_samples,
196    max_iter=500,
197    alpha_min=0,
198    method="lar",
199    copy_X=True,
200    eps=np.finfo(float).eps,
201    copy_Gram=True,
202    verbose=0,
203    return_path=True,
204    return_n_iter=False,
205    positive=False,
206):
    """Compute the LARS path in the sufficient stats mode [1]
208
    The optimization objective for the case method='lasso' is::

        (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1

    In the case of method='lar', the objective function is only known in
    the form of an implicit equation (see discussion in [1]).
215
216    Read more in the :ref:`User Guide <least_angle_regression>`.
217
218    Parameters
219    ----------
    Xy : array-like of shape (n_features,) or (n_features, n_targets)
        Xy = np.dot(X.T, y).

    Gram : array-like of shape (n_features, n_features)
        Gram = np.dot(X.T, X).
225
226    n_samples : int or float
227        Equivalent size of sample.
228
229    max_iter : int, default=500
230        Maximum number of iterations to perform, set to infinity for no limit.
231
    alpha_min : float, default=0
        Minimum correlation along the path. It corresponds to the
        regularization parameter alpha in the Lasso.
235
236    method : {'lar', 'lasso'}, default='lar'
237        Specifies the returned model. Select ``'lar'`` for Least Angle
238        Regression, ``'lasso'`` for the Lasso.
239
240    copy_X : bool, default=True
241        If ``False``, ``X`` is overwritten.
242
243    eps : float, default=np.finfo(float).eps
244        The machine-precision regularization in the computation of the
245        Cholesky diagonal factors. Increase this for very ill-conditioned
246        systems. Unlike the ``tol`` parameter in some iterative
247        optimization-based algorithms, this parameter does not control
248        the tolerance of the optimization.
249
250    copy_Gram : bool, default=True
251        If ``False``, ``Gram`` is overwritten.
252
253    verbose : int, default=0
254        Controls output verbosity.
255
    return_path : bool, default=True
        If ``True``, returns the entire path; otherwise returns only the
        last point of the path.
259
260    return_n_iter : bool, default=False
261        Whether to return the number of iterations.
262
263    positive : bool, default=False
264        Restrict coefficients to be >= 0.
265        This option is only allowed with method 'lasso'. Note that the model
266        coefficients will not converge to the ordinary-least-squares solution
267        for small values of alpha. Only coefficients up to the smallest alpha
268        value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by
269        the stepwise Lars-Lasso algorithm are typically in congruence with the
270        solution of the coordinate descent lasso_path function.
271
272    Returns
273    -------
274    alphas : array-like of shape (n_alphas + 1,)
275        Maximum of covariances (in absolute value) at each iteration.
276        ``n_alphas`` is either ``max_iter``, ``n_features`` or the
277        number of nodes in the path with ``alpha >= alpha_min``, whichever
278        is smaller.
279
280    active : array-like of shape (n_alphas,)
281        Indices of active variables at the end of the path.
282
283    coefs : array-like of shape (n_features, n_alphas + 1)
284        Coefficients along the path
285
286    n_iter : int
287        Number of iterations run. Returned only if return_n_iter is set
288        to True.
289
290    See Also
291    --------
292    lars_path
293    lasso_path
294    lasso_path_gram
295    LassoLars
296    Lars
297    LassoLarsCV
298    LarsCV
299    sklearn.decomposition.sparse_encode
300
301    References
302    ----------
303    .. [1] "Least Angle Regression", Efron et al.
304           http://statweb.stanford.edu/~tibs/ftp/lars.pdf
305
306    .. [2] `Wikipedia entry on the Least-angle regression
307           <https://en.wikipedia.org/wiki/Least-angle_regression>`_
308
309    .. [3] `Wikipedia entry on the Lasso
310           <https://en.wikipedia.org/wiki/Lasso_(statistics)>`_
311
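    Examples
    --------
    A minimal sketch using precomputed sufficient statistics; the dataset
    below is only illustrative::

        import numpy as np
        from sklearn.datasets import make_regression
        from sklearn.linear_model import lars_path_gram

        X, y = make_regression(n_samples=50, n_features=10, random_state=0)
        alphas, active, coefs = lars_path_gram(
            Xy=np.dot(X.T, y), Gram=np.dot(X.T, X),
            n_samples=X.shape[0], method="lasso"
        )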
312    """
313    return _lars_path_solver(
314        X=None,
315        y=None,
316        Xy=Xy,
317        Gram=Gram,
318        n_samples=n_samples,
319        max_iter=max_iter,
320        alpha_min=alpha_min,
321        method=method,
322        copy_X=copy_X,
323        eps=eps,
324        copy_Gram=copy_Gram,
325        verbose=verbose,
326        return_path=return_path,
327        return_n_iter=return_n_iter,
328        positive=positive,
329    )
330
331
332def _lars_path_solver(
333    X,
334    y,
335    Xy=None,
336    Gram=None,
337    n_samples=None,
338    max_iter=500,
339    alpha_min=0,
340    method="lar",
341    copy_X=True,
342    eps=np.finfo(float).eps,
343    copy_Gram=True,
344    verbose=0,
345    return_path=True,
346    return_n_iter=False,
347    positive=False,
348):
349    """Compute Least Angle Regression or Lasso path using LARS algorithm [1]
350
    The optimization objective for the case method='lasso' is::

        (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1

    In the case of method='lar', the objective function is only known in
    the form of an implicit equation (see discussion in [1]).
357
358    Read more in the :ref:`User Guide <least_angle_regression>`.
359
360    Parameters
361    ----------
362    X : None or ndarray of shape (n_samples, n_features)
363        Input data. Note that if X is None then Gram must be specified,
364        i.e., cannot be None or False.
365
366    y : None or ndarray of shape (n_samples,)
367        Input targets.
368
    Xy : array-like of shape (n_features,) or (n_features, n_targets), \
            default=None
        `Xy = np.dot(X.T, y)` that can be precomputed. It is useful
        only when the Gram matrix is precomputed.
373
    Gram : None, 'auto' or array-like of shape (n_features, n_features), \
            default=None
        Precomputed Gram matrix `(X' * X)`. If ``'auto'``, the Gram
        matrix is precomputed from the given X when there are more samples
        than features.
379
    n_samples : int or float, default=None
        Equivalent size of sample. If `None`, it is set to `y.size`.
382
383    max_iter : int, default=500
384        Maximum number of iterations to perform, set to infinity for no limit.
385
    alpha_min : float, default=0
        Minimum correlation along the path. It corresponds to the
        regularization parameter alpha in the Lasso.
389
390    method : {'lar', 'lasso'}, default='lar'
391        Specifies the returned model. Select ``'lar'`` for Least Angle
392        Regression, ``'lasso'`` for the Lasso.
393
394    copy_X : bool, default=True
395        If ``False``, ``X`` is overwritten.
396
397    eps : float, default=np.finfo(float).eps
398        The machine-precision regularization in the computation of the
399        Cholesky diagonal factors. Increase this for very ill-conditioned
400        systems. Unlike the ``tol`` parameter in some iterative
401        optimization-based algorithms, this parameter does not control
402        the tolerance of the optimization.
403
404    copy_Gram : bool, default=True
405        If ``False``, ``Gram`` is overwritten.
406
407    verbose : int, default=0
408        Controls output verbosity.
409
    return_path : bool, default=True
        If ``True``, returns the entire path; otherwise returns only the
        last point of the path.
413
414    return_n_iter : bool, default=False
415        Whether to return the number of iterations.
416
417    positive : bool, default=False
418        Restrict coefficients to be >= 0.
419        This option is only allowed with method 'lasso'. Note that the model
420        coefficients will not converge to the ordinary-least-squares solution
421        for small values of alpha. Only coefficients up to the smallest alpha
422        value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by
423        the stepwise Lars-Lasso algorithm are typically in congruence with the
424        solution of the coordinate descent lasso_path function.
425
426    Returns
427    -------
428    alphas : array-like of shape (n_alphas + 1,)
429        Maximum of covariances (in absolute value) at each iteration.
430        ``n_alphas`` is either ``max_iter``, ``n_features`` or the
431        number of nodes in the path with ``alpha >= alpha_min``, whichever
432        is smaller.
433
434    active : array-like of shape (n_alphas,)
435        Indices of active variables at the end of the path.
436
437    coefs : array-like of shape (n_features, n_alphas + 1)
438        Coefficients along the path
439
440    n_iter : int
441        Number of iterations run. Returned only if return_n_iter is set
442        to True.
443
444    See Also
445    --------
446    lasso_path
447    LassoLars
448    Lars
449    LassoLarsCV
450    LarsCV
451    sklearn.decomposition.sparse_encode
452
453    References
454    ----------
455    .. [1] "Least Angle Regression", Efron et al.
456           http://statweb.stanford.edu/~tibs/ftp/lars.pdf
457
458    .. [2] `Wikipedia entry on the Least-angle regression
459           <https://en.wikipedia.org/wiki/Least-angle_regression>`_
460
461    .. [3] `Wikipedia entry on the Lasso
462           <https://en.wikipedia.org/wiki/Lasso_(statistics)>`_
463
464    """
465    if method == "lar" and positive:
466        raise ValueError("Positive constraint not supported for 'lar' coding method.")
467
468    n_samples = n_samples if n_samples is not None else y.size
469
470    if Xy is None:
471        Cov = np.dot(X.T, y)
472    else:
473        Cov = Xy.copy()
474
475    if Gram is None or Gram is False:
476        Gram = None
477        if X is None:
478            raise ValueError("X and Gram cannot both be unspecified.")
    elif (isinstance(Gram, str) and Gram == "auto") or Gram is True:
        if Gram is True or X.shape[0] > X.shape[1]:
            Gram = np.dot(X.T, X)
        else:
            Gram = None
484    elif copy_Gram:
485        Gram = Gram.copy()
486
487    if Gram is None:
488        n_features = X.shape[1]
489    else:
490        n_features = Cov.shape[0]
491        if Gram.shape != (n_features, n_features):
492            raise ValueError("The shapes of the inputs Gram and Xy do not match.")
493
494    if copy_X and X is not None and Gram is None:
495        # force copy. setting the array to be fortran-ordered
496        # speeds up the calculation of the (partial) Gram matrix
497        # and allows to easily swap columns
498        X = X.copy("F")
499
500    max_features = min(max_iter, n_features)
501
502    dtypes = set(a.dtype for a in (X, y, Xy, Gram) if a is not None)
503    if len(dtypes) == 1:
504        # use the precision level of input data if it is consistent
505        return_dtype = next(iter(dtypes))
506    else:
507        # fallback to double precision otherwise
508        return_dtype = np.float64
509
510    if return_path:
511        coefs = np.zeros((max_features + 1, n_features), dtype=return_dtype)
512        alphas = np.zeros(max_features + 1, dtype=return_dtype)
513    else:
514        coef, prev_coef = (
515            np.zeros(n_features, dtype=return_dtype),
516            np.zeros(n_features, dtype=return_dtype),
517        )
518        alpha, prev_alpha = (
519            np.array([0.0], dtype=return_dtype),
520            np.array([0.0], dtype=return_dtype),
521        )
522        # above better ideas?
523
524    n_iter, n_active = 0, 0
525    active, indices = list(), np.arange(n_features)
526    # holds the sign of covariance
527    sign_active = np.empty(max_features, dtype=np.int8)
528    drop = False
529
530    # will hold the cholesky factorization. Only lower part is
531    # referenced.
532    if Gram is None:
533        L = np.empty((max_features, max_features), dtype=X.dtype)
534        swap, nrm2 = linalg.get_blas_funcs(("swap", "nrm2"), (X,))
535    else:
536        L = np.empty((max_features, max_features), dtype=Gram.dtype)
537        swap, nrm2 = linalg.get_blas_funcs(("swap", "nrm2"), (Cov,))
538    (solve_cholesky,) = get_lapack_funcs(("potrs",), (L,))
539
540    if verbose:
541        if verbose > 1:
542            print("Step\t\tAdded\t\tDropped\t\tActive set size\t\tC")
543        else:
544            sys.stdout.write(".")
545            sys.stdout.flush()
546
547    tiny32 = np.finfo(np.float32).tiny  # to avoid division by 0 warning
548    cov_precision = np.finfo(Cov.dtype).precision
549    equality_tolerance = np.finfo(np.float32).eps
550
551    if Gram is not None:
552        Gram_copy = Gram.copy()
553        Cov_copy = Cov.copy()
554
555    while True:
556        if Cov.size:
557            if positive:
558                C_idx = np.argmax(Cov)
559            else:
560                C_idx = np.argmax(np.abs(Cov))
561
562            C_ = Cov[C_idx]
563
564            if positive:
565                C = C_
566            else:
567                C = np.fabs(C_)
568        else:
569            C = 0.0
570
571        if return_path:
572            alpha = alphas[n_iter, np.newaxis]
573            coef = coefs[n_iter]
574            prev_alpha = alphas[n_iter - 1, np.newaxis]
575            prev_coef = coefs[n_iter - 1]
576
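        # C is the largest absolute correlation over the remaining
        # variables; dividing by n_samples puts it on the scale of the
        # regularization parameter alpha of the penalized objective.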
577        alpha[0] = C / n_samples
578        if alpha[0] <= alpha_min + equality_tolerance:  # early stopping
579            if abs(alpha[0] - alpha_min) > equality_tolerance:
580                # interpolation factor 0 <= ss < 1
581                if n_iter > 0:
582                    # In the first iteration, all alphas are zero, the formula
583                    # below would make ss a NaN
584                    ss = (prev_alpha[0] - alpha_min) / (prev_alpha[0] - alpha[0])
585                    coef[:] = prev_coef + ss * (coef - prev_coef)
586                alpha[0] = alpha_min
587            if return_path:
588                coefs[n_iter] = coef
589            break
590
591        if n_iter >= max_iter or n_active >= n_features:
592            break
593        if not drop:
594
            # Append x_j to the Cholesky factorization of the Gram matrix
            # of the active set:
            #
            #            ( L   0 )
            #     L  ->  (       )  , where L * w = Xa' x_j
            #            ( w   z )    and z = sqrt(||x_j||^2 - ||w||^2)
603
604            if positive:
605                sign_active[n_active] = np.ones_like(C_)
606            else:
607                sign_active[n_active] = np.sign(C_)
608            m, n = n_active, C_idx + n_active
609
610            Cov[C_idx], Cov[0] = swap(Cov[C_idx], Cov[0])
611            indices[n], indices[m] = indices[m], indices[n]
612            Cov_not_shortened = Cov
613            Cov = Cov[1:]  # remove Cov[0]
614
615            if Gram is None:
616                X.T[n], X.T[m] = swap(X.T[n], X.T[m])
617                c = nrm2(X.T[n_active]) ** 2
618                L[n_active, :n_active] = np.dot(X.T[n_active], X.T[:n_active].T)
619            else:
620                # swap does only work inplace if matrix is fortran
621                # contiguous ...
622                Gram[m], Gram[n] = swap(Gram[m], Gram[n])
623                Gram[:, m], Gram[:, n] = swap(Gram[:, m], Gram[:, n])
624                c = Gram[n_active, n_active]
625                L[n_active, :n_active] = Gram[n_active, :n_active]
626
627            # Update the cholesky decomposition for the Gram matrix
628            if n_active:
629                linalg.solve_triangular(
630                    L[:n_active, :n_active],
631                    L[n_active, :n_active],
632                    trans=0,
633                    lower=1,
634                    overwrite_b=True,
635                    **SOLVE_TRIANGULAR_ARGS,
636                )
637
638            v = np.dot(L[n_active, :n_active], L[n_active, :n_active])
639            diag = max(np.sqrt(np.abs(c - v)), eps)
640            L[n_active, n_active] = diag
641
642            if diag < 1e-7:
643                # The system is becoming too ill-conditioned.
644                # We have degenerate vectors in our active set.
645                # We'll 'drop for good' the last regressor added.
646
647                # Note: this case is very rare. It is no longer triggered by
648                # the test suite. The `equality_tolerance` margin added in 0.16
649                # to get early stopping to work consistently on all versions of
650                # Python including 32 bit Python under Windows seems to make it
651                # very difficult to trigger the 'drop for good' strategy.
652                warnings.warn(
653                    "Regressors in active set degenerate. "
654                    "Dropping a regressor, after %i iterations, "
655                    "i.e. alpha=%.3e, "
656                    "with an active set of %i regressors, and "
657                    "the smallest cholesky pivot element being %.3e."
658                    " Reduce max_iter or increase eps parameters."
659                    % (n_iter, alpha, n_active, diag),
660                    ConvergenceWarning,
661                )
662
663                # XXX: need to figure a 'drop for good' way
664                Cov = Cov_not_shortened
665                Cov[0] = 0
666                Cov[C_idx], Cov[0] = swap(Cov[C_idx], Cov[0])
667                continue
668
669            active.append(indices[n_active])
670            n_active += 1
671
672            if verbose > 1:
673                print(
674                    "%s\t\t%s\t\t%s\t\t%s\t\t%s" % (n_iter, active[-1], "", n_active, C)
675                )
676
677        if method == "lasso" and n_iter > 0 and prev_alpha[0] < alpha[0]:
            # alpha is increasing. This is because the updates of Cov are
            # bringing in too much numerical error, which is greater than
            # the remaining correlation with the regressors. Time to bail
            # out.
682            warnings.warn(
683                "Early stopping the lars path, as the residues "
684                "are small and the current value of alpha is no "
685                "longer well controlled. %i iterations, alpha=%.3e, "
686                "previous alpha=%.3e, with an active set of %i "
687                "regressors." % (n_iter, alpha, prev_alpha, n_active),
688                ConvergenceWarning,
689            )
690            break
691
692        # least squares solution
693        least_squares, _ = solve_cholesky(
694            L[:n_active, :n_active], sign_active[:n_active], lower=True
695        )
696
697        if least_squares.size == 1 and least_squares == 0:
698            # This happens because sign_active[:n_active] = 0
699            least_squares[...] = 1
700            AA = 1.0
701        else:
702            # is this really needed ?
703            AA = 1.0 / np.sqrt(np.sum(least_squares * sign_active[:n_active]))
704
705            if not np.isfinite(AA):
706                # L is too ill-conditioned
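                # Recovery: add progressively larger multiples of ``eps`` to
                # the diagonal of a copy of L until the triangular solve
                # yields a finite normalization constant AA.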
707                i = 0
708                L_ = L[:n_active, :n_active].copy()
709                while not np.isfinite(AA):
710                    L_.flat[:: n_active + 1] += (2 ** i) * eps
711                    least_squares, _ = solve_cholesky(
712                        L_, sign_active[:n_active], lower=True
713                    )
714                    tmp = max(np.sum(least_squares * sign_active[:n_active]), eps)
715                    AA = 1.0 / np.sqrt(tmp)
716                    i += 1
717            least_squares *= AA
718
719        if Gram is None:
720            # equiangular direction of variables in the active set
721            eq_dir = np.dot(X.T[:n_active].T, least_squares)
            # correlation between each inactive variable and the
            # equiangular vector
724            corr_eq_dir = np.dot(X.T[n_active:], eq_dir)
725        else:
            # if there is a huge number of features, this takes 50% of the
            # time; it could probably be avoided by updating it with an
            # orthogonal (QR) decomposition of X
729            corr_eq_dir = np.dot(Gram[:n_active, n_active:].T, least_squares)
730
731        # Explicit rounding can be necessary to avoid `np.argmax(Cov)` yielding
732        # unstable results because of rounding errors.
733        np.around(corr_eq_dir, decimals=cov_precision, out=corr_eq_dir)
734
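        # gamma_ is the step length along the equiangular direction: the
        # smallest positive step at which some inactive variable reaches the
        # same absolute correlation as the active set (Efron et al. 2004).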
735        g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny32))
736        if positive:
737            gamma_ = min(g1, C / AA)
738        else:
739            g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny32))
740            gamma_ = min(g1, g2, C / AA)
741
742        # TODO: better names for these variables: z
743        drop = False
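        # Lasso modification: ``z`` is the step at which each active
        # coefficient would cross zero; if that happens before ``gamma_``,
        # the corresponding variable has to leave the active set.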
744        z = -coef[active] / (least_squares + tiny32)
745        z_pos = arrayfuncs.min_pos(z)
746        if z_pos < gamma_:
747            # some coefficients have changed sign
748            idx = np.where(z == z_pos)[0][::-1]
749
750            # update the sign, important for LAR
751            sign_active[idx] = -sign_active[idx]
752
753            if method == "lasso":
754                gamma_ = z_pos
755            drop = True
756
757        n_iter += 1
758
759        if return_path:
760            if n_iter >= coefs.shape[0]:
761                del coef, alpha, prev_alpha, prev_coef
762                # resize the coefs and alphas array
763                add_features = 2 * max(1, (max_features - n_active))
764                coefs = np.resize(coefs, (n_iter + add_features, n_features))
765                coefs[-add_features:] = 0
766                alphas = np.resize(alphas, n_iter + add_features)
767                alphas[-add_features:] = 0
768            coef = coefs[n_iter]
769            prev_coef = coefs[n_iter - 1]
770        else:
771            # mimic the effect of incrementing n_iter on the array references
772            prev_coef = coef
773            prev_alpha[0] = alpha[0]
774            coef = np.zeros_like(coef)
775
776        coef[active] = prev_coef[active] + gamma_ * least_squares
777
778        # update correlations
779        Cov -= gamma_ * corr_eq_dir
780
781        # See if any coefficient has changed sign
782        if drop and method == "lasso":
783
            # handle the case when idx is not of length 1
785            for ii in idx:
786                arrayfuncs.cholesky_delete(L[:n_active, :n_active], ii)
787
788            n_active -= 1
            # handle the case when idx is not of length 1
790            drop_idx = [active.pop(ii) for ii in idx]
791
792            if Gram is None:
793                # propagate dropped variable
794                for ii in idx:
795                    for i in range(ii, n_active):
796                        X.T[i], X.T[i + 1] = swap(X.T[i], X.T[i + 1])
                        # keep indices in sync with the column swap above
798                        indices[i], indices[i + 1] = indices[i + 1], indices[i]
799
800                # TODO: this could be updated
801                residual = y - np.dot(X[:, :n_active], coef[active])
802                temp = np.dot(X.T[n_active], residual)
803
804                Cov = np.r_[temp, Cov]
805            else:
806                for ii in idx:
807                    for i in range(ii, n_active):
808                        indices[i], indices[i + 1] = indices[i + 1], indices[i]
809                        Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i + 1])
810                        Gram[:, i], Gram[:, i + 1] = swap(Gram[:, i], Gram[:, i + 1])
811
812                # Cov_n = Cov_j + x_j * X + increment(betas) TODO:
813                # will this still work with multiple drops ?
814
815                # recompute covariance. Probably could be done better
816                # wrong as Xy is not swapped with the rest of variables
817
818                # TODO: this could be updated
819                temp = Cov_copy[drop_idx] - np.dot(Gram_copy[drop_idx], coef)
820                Cov = np.r_[temp, Cov]
821
822            sign_active = np.delete(sign_active, idx)
823            sign_active = np.append(sign_active, 0.0)  # just to maintain size
824            if verbose > 1:
825                print(
826                    "%s\t\t%s\t\t%s\t\t%s\t\t%s"
827                    % (n_iter, "", drop_idx, n_active, abs(temp))
828                )
829
830    if return_path:
831        # resize coefs in case of early stop
832        alphas = alphas[: n_iter + 1]
833        coefs = coefs[: n_iter + 1]
834
835        if return_n_iter:
836            return alphas, active, coefs.T, n_iter
837        else:
838            return alphas, active, coefs.T
839    else:
840        if return_n_iter:
841            return alpha, active, coef, n_iter
842        else:
843            return alpha, active, coef
844
845
846###############################################################################
847# Estimator classes
848
849
850class Lars(MultiOutputMixin, RegressorMixin, LinearModel):
851    """Least Angle Regression model a.k.a. LAR.
852
853    Read more in the :ref:`User Guide <least_angle_regression>`.
854
855    Parameters
856    ----------
857    fit_intercept : bool, default=True
858        Whether to calculate the intercept for this model. If set
859        to false, no intercept will be used in calculations
860        (i.e. data is expected to be centered).
861
862    verbose : bool or int, default=False
863        Sets the verbosity amount.
864
865    normalize : bool, default=True
866        This parameter is ignored when ``fit_intercept`` is set to False.
867        If True, the regressors X will be normalized before regression by
868        subtracting the mean and dividing by the l2-norm.
869        If you wish to standardize, please use
870        :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``
871        on an estimator with ``normalize=False``.
872
873        .. deprecated:: 1.0
874            ``normalize`` was deprecated in version 1.0. It will default
875            to False in 1.2 and be removed in 1.4.
876
    precompute : bool, 'auto' or array-like, default='auto'
878        Whether to use a precomputed Gram matrix to speed up
879        calculations. If set to ``'auto'`` let us decide. The Gram
880        matrix can also be passed as argument.
881
882    n_nonzero_coefs : int, default=500
883        Target number of non-zero coefficients. Use ``np.inf`` for no limit.
884
885    eps : float, default=np.finfo(float).eps
886        The machine-precision regularization in the computation of the
887        Cholesky diagonal factors. Increase this for very ill-conditioned
888        systems. Unlike the ``tol`` parameter in some iterative
889        optimization-based algorithms, this parameter does not control
890        the tolerance of the optimization.
891
892    copy_X : bool, default=True
893        If ``True``, X will be copied; else, it may be overwritten.
894
895    fit_path : bool, default=True
896        If True the full path is stored in the ``coef_path_`` attribute.
897        If you compute the solution for a large problem or many targets,
898        setting ``fit_path`` to ``False`` will lead to a speedup, especially
899        with a small alpha.
900
901    jitter : float, default=None
902        Upper bound on a uniform noise parameter to be added to the
903        `y` values, to satisfy the model's assumption of
904        one-at-a-time computations. Might help with stability.
905
906        .. versionadded:: 0.23
907
908    random_state : int, RandomState instance or None, default=None
909        Determines random number generation for jittering. Pass an int
910        for reproducible output across multiple function calls.
911        See :term:`Glossary <random_state>`. Ignored if `jitter` is None.
912
913        .. versionadded:: 0.23
914
915    Attributes
916    ----------
917    alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays
918        Maximum of covariances (in absolute value) at each iteration.
919        ``n_alphas`` is either ``max_iter``, ``n_features`` or the
920        number of nodes in the path with ``alpha >= alpha_min``, whichever
921        is smaller. If this is a list of array-like, the length of the outer
922        list is `n_targets`.
923
924    active_ : list of shape (n_alphas,) or list of such lists
925        Indices of active variables at the end of the path.
926        If this is a list of list, the length of the outer list is `n_targets`.
927
928    coef_path_ : array-like of shape (n_features, n_alphas + 1) or list \
929            of such arrays
930        The varying values of the coefficients along the path. It is not
931        present if the ``fit_path`` parameter is ``False``. If this is a list
932        of array-like, the length of the outer list is `n_targets`.
933
    coef_ : array-like of shape (n_features,) or (n_targets, n_features)
        Parameter vector (w in the problem formulation).
936
937    intercept_ : float or array-like of shape (n_targets,)
938        Independent term in decision function.
939
940    n_iter_ : array-like or int
941        The number of iterations taken by lars_path to find the
942        grid of alphas for each target.
943
944    n_features_in_ : int
945        Number of features seen during :term:`fit`.
946
947        .. versionadded:: 0.24
948
949    feature_names_in_ : ndarray of shape (`n_features_in_`,)
950        Names of features seen during :term:`fit`. Defined only when `X`
951        has feature names that are all strings.
952
953        .. versionadded:: 1.0
954
955    See Also
956    --------
957    lars_path: Compute Least Angle Regression or Lasso
958        path using LARS algorithm.
959    LarsCV : Cross-validated Least Angle Regression model.
960    sklearn.decomposition.sparse_encode : Sparse coding.
961
962    Examples
963    --------
964    >>> from sklearn import linear_model
965    >>> reg = linear_model.Lars(n_nonzero_coefs=1, normalize=False)
966    >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1.1111, 0, -1.1111])
967    Lars(n_nonzero_coefs=1, normalize=False)
968    >>> print(reg.coef_)
969    [ 0. -1.11...]
970    """
971
972    method = "lar"
973    positive = False
974
975    def __init__(
976        self,
977        *,
978        fit_intercept=True,
979        verbose=False,
980        normalize="deprecated",
981        precompute="auto",
982        n_nonzero_coefs=500,
983        eps=np.finfo(float).eps,
984        copy_X=True,
985        fit_path=True,
986        jitter=None,
987        random_state=None,
988    ):
989        self.fit_intercept = fit_intercept
990        self.verbose = verbose
991        self.normalize = normalize
992        self.precompute = precompute
993        self.n_nonzero_coefs = n_nonzero_coefs
994        self.eps = eps
995        self.copy_X = copy_X
996        self.fit_path = fit_path
997        self.jitter = jitter
998        self.random_state = random_state
999
1000    @staticmethod
1001    def _get_gram(precompute, X, y):
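        # Precompute the Gram matrix X.T @ X when it is explicitly requested
        # or when 'auto' and it is expected to pay off (more samples than
        # features, or multiple targets), unless a Gram matrix was passed.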
1002        if (not hasattr(precompute, "__array__")) and (
1003            (precompute is True)
1004            or (precompute == "auto" and X.shape[0] > X.shape[1])
1005            or (precompute == "auto" and y.shape[1] > 1)
1006        ):
1007            precompute = np.dot(X.T, X)
1008
1009        return precompute
1010
1011    def _fit(self, X, y, max_iter, alpha, fit_path, normalize, Xy=None):
1012        """Auxiliary method to fit the model using X, y as training data"""
1013        n_features = X.shape[1]
1014
1015        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
1016            X, y, self.fit_intercept, normalize, self.copy_X
1017        )
1018
1019        if y.ndim == 1:
1020            y = y[:, np.newaxis]
1021
1022        n_targets = y.shape[1]
1023
1024        Gram = self._get_gram(self.precompute, X, y)
1025
1026        self.alphas_ = []
1027        self.n_iter_ = []
1028        self.coef_ = np.empty((n_targets, n_features), dtype=X.dtype)
1029
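        # A LARS path is computed independently for each target (each column
        # of y); the per-target results are collected in Python lists and
        # unwrapped below when there is a single target.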
1030        if fit_path:
1031            self.active_ = []
1032            self.coef_path_ = []
1033            for k in range(n_targets):
1034                this_Xy = None if Xy is None else Xy[:, k]
1035                alphas, active, coef_path, n_iter_ = lars_path(
1036                    X,
1037                    y[:, k],
1038                    Gram=Gram,
1039                    Xy=this_Xy,
1040                    copy_X=self.copy_X,
1041                    copy_Gram=True,
1042                    alpha_min=alpha,
1043                    method=self.method,
1044                    verbose=max(0, self.verbose - 1),
1045                    max_iter=max_iter,
1046                    eps=self.eps,
1047                    return_path=True,
1048                    return_n_iter=True,
1049                    positive=self.positive,
1050                )
1051                self.alphas_.append(alphas)
1052                self.active_.append(active)
1053                self.n_iter_.append(n_iter_)
1054                self.coef_path_.append(coef_path)
1055                self.coef_[k] = coef_path[:, -1]
1056
1057            if n_targets == 1:
1058                self.alphas_, self.active_, self.coef_path_, self.coef_ = [
1059                    a[0]
1060                    for a in (self.alphas_, self.active_, self.coef_path_, self.coef_)
1061                ]
1062                self.n_iter_ = self.n_iter_[0]
1063        else:
1064            for k in range(n_targets):
1065                this_Xy = None if Xy is None else Xy[:, k]
1066                alphas, _, self.coef_[k], n_iter_ = lars_path(
1067                    X,
1068                    y[:, k],
1069                    Gram=Gram,
1070                    Xy=this_Xy,
1071                    copy_X=self.copy_X,
1072                    copy_Gram=True,
1073                    alpha_min=alpha,
1074                    method=self.method,
1075                    verbose=max(0, self.verbose - 1),
1076                    max_iter=max_iter,
1077                    eps=self.eps,
1078                    return_path=False,
1079                    return_n_iter=True,
1080                    positive=self.positive,
1081                )
1082                self.alphas_.append(alphas)
1083                self.n_iter_.append(n_iter_)
1084            if n_targets == 1:
1085                self.alphas_ = self.alphas_[0]
1086                self.n_iter_ = self.n_iter_[0]
1087
1088        self._set_intercept(X_offset, y_offset, X_scale)
1089        return self
1090
1091    def fit(self, X, y, Xy=None):
1092        """Fit the model using X, y as training data.
1093
1094        Parameters
1095        ----------
1096        X : array-like of shape (n_samples, n_features)
1097            Training data.
1098
1099        y : array-like of shape (n_samples,) or (n_samples, n_targets)
1100            Target values.
1101
        Xy : array-like of shape (n_features,) or (n_features, n_targets), \
                default=None
            Xy = np.dot(X.T, y) that can be precomputed. It is useful
            only when the Gram matrix is precomputed.
1106
1107        Returns
1108        -------
1109        self : object
1110            Returns an instance of self.
1111        """
1112        X, y = self._validate_data(X, y, y_numeric=True, multi_output=True)
1113
1114        _normalize = _deprecate_normalize(
1115            self.normalize, default=True, estimator_name=self.__class__.__name__
1116        )
1117
1118        alpha = getattr(self, "alpha", 0.0)
1119        if hasattr(self, "n_nonzero_coefs"):
1120            alpha = 0.0  # n_nonzero_coefs parametrization takes priority
1121            max_iter = self.n_nonzero_coefs
1122        else:
1123            max_iter = self.max_iter
1124
1125        if self.jitter is not None:
1126            rng = check_random_state(self.random_state)
1127
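            # Add small uniform noise in [0, jitter) to the targets; this can
            # help satisfy the model's one-at-a-time assumption (see the
            # ``jitter`` parameter) and improve stability.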
1128            noise = rng.uniform(high=self.jitter, size=len(y))
1129            y = y + noise
1130
1131        self._fit(
1132            X,
1133            y,
1134            max_iter=max_iter,
1135            alpha=alpha,
1136            fit_path=self.fit_path,
1137            normalize=_normalize,
1138            Xy=Xy,
1139        )
1140
1141        return self
1142
1143
1144class LassoLars(Lars):
1145    """Lasso model fit with Least Angle Regression a.k.a. Lars.
1146
1147    It is a Linear Model trained with an L1 prior as regularizer.
1148
1149    The optimization objective for Lasso is::
1150
        (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
1152
1153    Read more in the :ref:`User Guide <least_angle_regression>`.
1154
1155    Parameters
1156    ----------
    alpha : float, default=1.0
        Constant that multiplies the penalty term. Defaults to 1.0.
        ``alpha = 0`` is equivalent to ordinary least squares, solved
        by :class:`LinearRegression`. For numerical reasons, using
        ``alpha = 0`` with the LassoLars object is not advised and you
        should prefer the LinearRegression object.
1163
1164    fit_intercept : bool, default=True
1165        Whether to calculate the intercept for this model. If set
1166        to false, no intercept will be used in calculations
1167        (i.e. data is expected to be centered).
1168
1169    verbose : bool or int, default=False
1170        Sets the verbosity amount.
1171
1172    normalize : bool, default=True
1173        This parameter is ignored when ``fit_intercept`` is set to False.
1174        If True, the regressors X will be normalized before regression by
1175        subtracting the mean and dividing by the l2-norm.
1176        If you wish to standardize, please use
1177        :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``
1178        on an estimator with ``normalize=False``.
1179
1180        .. deprecated:: 1.0
1181            ``normalize`` was deprecated in version 1.0. It will default
1182            to False in 1.2 and be removed in 1.4.
1183
1184    precompute : bool, 'auto' or array-like, default='auto'
1185        Whether to use a precomputed Gram matrix to speed up
1186        calculations. If set to ``'auto'`` let us decide. The Gram
1187        matrix can also be passed as argument.
1188
1189    max_iter : int, default=500
1190        Maximum number of iterations to perform.
1191
1192    eps : float, default=np.finfo(float).eps
1193        The machine-precision regularization in the computation of the
1194        Cholesky diagonal factors. Increase this for very ill-conditioned
1195        systems. Unlike the ``tol`` parameter in some iterative
1196        optimization-based algorithms, this parameter does not control
1197        the tolerance of the optimization.
1198
1199    copy_X : bool, default=True
1200        If True, X will be copied; else, it may be overwritten.
1201
1202    fit_path : bool, default=True
1203        If ``True`` the full path is stored in the ``coef_path_`` attribute.
1204        If you compute the solution for a large problem or many targets,
1205        setting ``fit_path`` to ``False`` will lead to a speedup, especially
1206        with a small alpha.
1207
    positive : bool, default=False
        Restrict coefficients to be >= 0. Be aware that you might want to
        remove fit_intercept, which is set to True by default.
1211        Under the positive restriction the model coefficients will not converge
1212        to the ordinary-least-squares solution for small values of alpha.
1213        Only coefficients up to the smallest alpha value (``alphas_[alphas_ >
1214        0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso
1215        algorithm are typically in congruence with the solution of the
1216        coordinate descent Lasso estimator.
1217
1218    jitter : float, default=None
1219        Upper bound on a uniform noise parameter to be added to the
1220        `y` values, to satisfy the model's assumption of
1221        one-at-a-time computations. Might help with stability.
1222
1223        .. versionadded:: 0.23
1224
1225    random_state : int, RandomState instance or None, default=None
1226        Determines random number generation for jittering. Pass an int
1227        for reproducible output across multiple function calls.
1228        See :term:`Glossary <random_state>`. Ignored if `jitter` is None.
1229
1230        .. versionadded:: 0.23
1231
1232    Attributes
1233    ----------
1234    alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays
1235        Maximum of covariances (in absolute value) at each iteration.
1236        ``n_alphas`` is either ``max_iter``, ``n_features`` or the
1237        number of nodes in the path with ``alpha >= alpha_min``, whichever
1238        is smaller. If this is a list of array-like, the length of the outer
1239        list is `n_targets`.
1240
1241    active_ : list of length n_alphas or list of such lists
1242        Indices of active variables at the end of the path.
1243        If this is a list of list, the length of the outer list is `n_targets`.
1244
    coef_path_ : array-like of shape (n_features, n_alphas + 1) or list \
            of such arrays
        The varying values of the coefficients along the path. It is not
        present if the ``fit_path`` parameter is ``False``. If this is a list
        of array-like, the length of the outer list is `n_targets`.
1251
    coef_ : array-like of shape (n_features,) or (n_targets, n_features)
        Parameter vector (w in the problem formulation).
1254
1255    intercept_ : float or array-like of shape (n_targets,)
1256        Independent term in decision function.
1257
1258    n_iter_ : array-like or int
1259        The number of iterations taken by lars_path to find the
1260        grid of alphas for each target.
1261
1262    n_features_in_ : int
1263        Number of features seen during :term:`fit`.
1264
1265        .. versionadded:: 0.24
1266
1267    feature_names_in_ : ndarray of shape (`n_features_in_`,)
1268        Names of features seen during :term:`fit`. Defined only when `X`
1269        has feature names that are all strings.
1270
1271        .. versionadded:: 1.0
1272
1273    See Also
1274    --------
1275    lars_path : Compute Least Angle Regression or Lasso
1276        path using LARS algorithm.
1277    lasso_path : Compute Lasso path with coordinate descent.
1278    Lasso : Linear Model trained with L1 prior as
1279        regularizer (aka the Lasso).
1280    LassoCV : Lasso linear model with iterative fitting
1281        along a regularization path.
1282    LassoLarsCV: Cross-validated Lasso, using the LARS algorithm.
1283    LassoLarsIC : Lasso model fit with Lars using BIC
1284        or AIC for model selection.
1285    sklearn.decomposition.sparse_encode : Sparse coding.
1286
1287    Examples
1288    --------
1289    >>> from sklearn import linear_model
1290    >>> reg = linear_model.LassoLars(alpha=0.01, normalize=False)
1291    >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1])
1292    LassoLars(alpha=0.01, normalize=False)
1293    >>> print(reg.coef_)
1294    [ 0.         -0.955...]
1295    """
1296
1297    method = "lasso"
1298
1299    def __init__(
1300        self,
1301        alpha=1.0,
1302        *,
1303        fit_intercept=True,
1304        verbose=False,
1305        normalize="deprecated",
1306        precompute="auto",
1307        max_iter=500,
1308        eps=np.finfo(float).eps,
1309        copy_X=True,
1310        fit_path=True,
1311        positive=False,
1312        jitter=None,
1313        random_state=None,
1314    ):
1315        self.alpha = alpha
1316        self.fit_intercept = fit_intercept
1317        self.max_iter = max_iter
1318        self.verbose = verbose
1319        self.normalize = normalize
1320        self.positive = positive
1321        self.precompute = precompute
1322        self.copy_X = copy_X
1323        self.eps = eps
1324        self.fit_path = fit_path
1325        self.jitter = jitter
1326        self.random_state = random_state
1327
1328
1329###############################################################################
1330# Cross-validated estimator classes
1331
1332
1333def _check_copy_and_writeable(array, copy=False):
1334    if copy or not array.flags.writeable:
1335        return array.copy()
1336    return array
1337
1338
1339def _lars_path_residues(
1340    X_train,
1341    y_train,
1342    X_test,
1343    y_test,
1344    Gram=None,
1345    copy=True,
    method="lar",
1347    verbose=False,
1348    fit_intercept=True,
1349    normalize=True,
1350    max_iter=500,
1351    eps=np.finfo(float).eps,
1352    positive=False,
1353):
1354    """Compute the residues on left-out data for a full LARS path
1355
1356    Parameters
    ----------
1358    X_train : array-like of shape (n_samples, n_features)
1359        The data to fit the LARS on
1360
1361    y_train : array-like of shape (n_samples,)
1362        The target variable to fit LARS on
1363
1364    X_test : array-like of shape (n_samples, n_features)
1365        The data to compute the residues on
1366
1367    y_test : array-like of shape (n_samples,)
1368        The target variable to compute the residues on
1369
    Gram : None, 'auto' or array-like of shape (n_features, n_features), \
            default=None
        Precomputed Gram matrix (X' * X). If ``'auto'``, the Gram
        matrix is precomputed from the given X when there are more samples
        than features.
1375
1376    copy : bool, default=True
1377        Whether X_train, X_test, y_train and y_test should be copied;
1378        if False, they may be overwritten.
1379
    method : {'lar', 'lasso'}, default='lar'
1381        Specifies the returned model. Select ``'lar'`` for Least Angle
1382        Regression, ``'lasso'`` for the Lasso.
1383
1384    verbose : bool or int, default=False
1385        Sets the amount of verbosity
1386
1387    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model. If set
1389        to false, no intercept will be used in calculations
1390        (i.e. data is expected to be centered).
1391
1392    positive : bool, default=False
1393        Restrict coefficients to be >= 0. Be aware that you might want to
1394        remove fit_intercept which is set True by default.
1395        See reservations for using this option in combination with method
1396        'lasso' for expected small values of alpha in the doc of LassoLarsCV
1397        and LassoLarsIC.
1398
1399    normalize : bool, default=True
1400        This parameter is ignored when ``fit_intercept`` is set to False.
1401        If True, the regressors X will be normalized before regression by
1402        subtracting the mean and dividing by the l2-norm.
1403        If you wish to standardize, please use
1404        :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``
1405        on an estimator with ``normalize=False``.
1406
1407        .. deprecated:: 1.0
1408            ``normalize`` was deprecated in version 1.0. It will default
1409            to False in 1.2 and be removed in 1.4.
1410
1411    max_iter : int, default=500
1412        Maximum number of iterations to perform.
1413
1414    eps : float, default=np.finfo(float).eps
1415        The machine-precision regularization in the computation of the
1416        Cholesky diagonal factors. Increase this for very ill-conditioned
1417        systems. Unlike the ``tol`` parameter in some iterative
1418        optimization-based algorithms, this parameter does not control
1419        the tolerance of the optimization.
1420
1421    Returns
    -------
1423    alphas : array-like of shape (n_alphas,)
1424        Maximum of covariances (in absolute value) at each iteration.
1425        ``n_alphas`` is either ``max_iter`` or ``n_features``, whichever
1426        is smaller.
1427
1428    active : list
1429        Indices of active variables at the end of the path.
1430
1431    coefs : array-like of shape (n_features, n_alphas)
1432        Coefficients along the path
1433
1434    residues : array-like of shape (n_alphas, n_samples)
1435        Residues of the prediction on the test data
1436    """
1437    X_train = _check_copy_and_writeable(X_train, copy)
1438    y_train = _check_copy_and_writeable(y_train, copy)
1439    X_test = _check_copy_and_writeable(X_test, copy)
1440    y_test = _check_copy_and_writeable(y_test, copy)
1441
1442    if fit_intercept:
1443        X_mean = X_train.mean(axis=0)
1444        X_train -= X_mean
1445        X_test -= X_mean
1446        y_mean = y_train.mean(axis=0)
1447        y_train = as_float_array(y_train, copy=False)
1448        y_train -= y_mean
1449        y_test = as_float_array(y_test, copy=False)
1450        y_test -= y_mean
1451
1452    if normalize:
1453        norms = np.sqrt(np.sum(X_train ** 2, axis=0))
1454        nonzeros = np.flatnonzero(norms)
1455        X_train[:, nonzeros] /= norms[nonzeros]
1456
1457    alphas, active, coefs = lars_path(
1458        X_train,
1459        y_train,
1460        Gram=Gram,
1461        copy_X=False,
1462        copy_Gram=False,
1463        method=method,
1464        verbose=max(0, verbose - 1),
1465        max_iter=max_iter,
1466        eps=eps,
1467        positive=positive,
1468    )
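    # If the training data was normalized above, rescale the coefficients
    # back to the original feature scale so that the predictions on the
    # un-normalized X_test below are valid.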
1469    if normalize:
1470        coefs[nonzeros] /= norms[nonzeros][:, np.newaxis]
1471    residues = np.dot(X_test, coefs) - y_test[:, np.newaxis]
1472    return alphas, active, coefs, residues.T
1473
1474
1475class LarsCV(Lars):
1476    """Cross-validated Least Angle Regression model.
1477
1478    See glossary entry for :term:`cross-validation estimator`.
1479
1480    Read more in the :ref:`User Guide <least_angle_regression>`.
1481
1482    Parameters
1483    ----------
1484    fit_intercept : bool, default=True
1485        Whether to calculate the intercept for this model. If set
1486        to false, no intercept will be used in calculations
1487        (i.e. data is expected to be centered).
1488
1489    verbose : bool or int, default=False
1490        Sets the verbosity amount.
1491
1492    max_iter : int, default=500
1493        Maximum number of iterations to perform.
1494
1495    normalize : bool, default=True
1496        This parameter is ignored when ``fit_intercept`` is set to False.
1497        If True, the regressors X will be normalized before regression by
1498        subtracting the mean and dividing by the l2-norm.
1499        If you wish to standardize, please use
1500        :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``
1501        on an estimator with ``normalize=False``.
1502
1503        .. deprecated:: 1.0
1504            ``normalize`` was deprecated in version 1.0. It will default
1505            to False in 1.2 and be removed in 1.4.
1506
    precompute : bool, 'auto' or array-like, default='auto'
1508        Whether to use a precomputed Gram matrix to speed up
1509        calculations. If set to ``'auto'`` let us decide. The Gram matrix
1510        cannot be passed as argument since we will use only subsets of X.
1511
1512    cv : int, cross-validation generator or an iterable, default=None
1513        Determines the cross-validation splitting strategy.
1514        Possible inputs for cv are:
1515
1516        - None, to use the default 5-fold cross-validation,
1517        - integer, to specify the number of folds.
1518        - :term:`CV splitter`,
1519        - An iterable yielding (train, test) splits as arrays of indices.
1520
1521        For integer/None inputs, :class:`KFold` is used.
1522
        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.
1525
1526        .. versionchanged:: 0.22
1527            ``cv`` default value if None changed from 3-fold to 5-fold.
1528
1529    max_n_alphas : int, default=1000
1530        The maximum number of points on the path used to compute the
1531        residuals in the cross-validation.
1532
1533    n_jobs : int or None, default=None
1534        Number of CPUs to use during the cross validation.
1535        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
1536        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
1537        for more details.
1538
1539    eps : float, default=np.finfo(float).eps
1540        The machine-precision regularization in the computation of the
1541        Cholesky diagonal factors. Increase this for very ill-conditioned
1542        systems. Unlike the ``tol`` parameter in some iterative
1543        optimization-based algorithms, this parameter does not control
1544        the tolerance of the optimization.
1545
1546    copy_X : bool, default=True
1547        If ``True``, X will be copied; else, it may be overwritten.
1548
1549    Attributes
1550    ----------
1551    active_ : list of length n_alphas or list of such lists
1552        Indices of active variables at the end of the path.
1553        If this is a list of lists, the outer list length is `n_targets`.
1554
    coef_ : array-like of shape (n_features,)
        Parameter vector (w in the problem formulation).

    intercept_ : float
        Independent term in the decision function.

    coef_path_ : array-like of shape (n_features, n_alphas)
        The varying values of the coefficients along the path.

    alpha_ : float
        The estimated regularization parameter alpha.

    alphas_ : array-like of shape (n_alphas,)
        The different values of alpha along the path.
1569
    cv_alphas_ : array-like of shape (n_cv_alphas,)
        All the values of alpha along the path for the different folds.

    mse_path_ : array-like of shape (n_cv_alphas, n_folds)
        The mean square error on left-out data for each fold along the path
        (alpha values given by ``cv_alphas_``).

    n_iter_ : array-like or int
        The number of iterations run by Lars with the optimal alpha.
1579
1580    n_features_in_ : int
1581        Number of features seen during :term:`fit`.
1582
1583        .. versionadded:: 0.24
1584
1585    feature_names_in_ : ndarray of shape (`n_features_in_`,)
1586        Names of features seen during :term:`fit`. Defined only when `X`
1587        has feature names that are all strings.
1588
1589        .. versionadded:: 1.0
1590
1591    See Also
1592    --------
1593    lars_path : Compute Least Angle Regression or Lasso
1594        path using LARS algorithm.
1595    lasso_path : Compute Lasso path with coordinate descent.
1596    Lasso : Linear Model trained with L1 prior as
1597        regularizer (aka the Lasso).
1598    LassoCV : Lasso linear model with iterative fitting
1599        along a regularization path.
1600    LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.
1601    LassoLarsIC : Lasso model fit with Lars using BIC
1602        or AIC for model selection.
1603    sklearn.decomposition.sparse_encode : Sparse coding.
1604
1605    Examples
1606    --------
1607    >>> from sklearn.linear_model import LarsCV
1608    >>> from sklearn.datasets import make_regression
1609    >>> X, y = make_regression(n_samples=200, noise=4.0, random_state=0)
1610    >>> reg = LarsCV(cv=5, normalize=False).fit(X, y)
1611    >>> reg.score(X, y)
1612    0.9996...
1613    >>> reg.alpha_
1614    0.2961...
1615    >>> reg.predict(X[:1,])
1616    array([154.3996...])
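
    One way to inspect the cross-validation curve is to average the stored
    per-fold errors (a sketch; the exact values depend on the CV splits):

    >>> import numpy as np
    >>> mean_mse = np.mean(reg.mse_path_, axis=-1)  # average over the folds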
1617    """
1618
1619    method = "lar"
1620
1621    def __init__(
1622        self,
1623        *,
1624        fit_intercept=True,
1625        verbose=False,
1626        max_iter=500,
1627        normalize="deprecated",
1628        precompute="auto",
1629        cv=None,
1630        max_n_alphas=1000,
1631        n_jobs=None,
1632        eps=np.finfo(float).eps,
1633        copy_X=True,
1634    ):
1635        self.max_iter = max_iter
1636        self.cv = cv
1637        self.max_n_alphas = max_n_alphas
1638        self.n_jobs = n_jobs
1639        super().__init__(
1640            fit_intercept=fit_intercept,
1641            verbose=verbose,
1642            normalize=normalize,
1643            precompute=precompute,
1644            n_nonzero_coefs=500,
1645            eps=eps,
1646            copy_X=copy_X,
1647            fit_path=True,
1648        )
1649
1650    def _more_tags(self):
1651        return {"multioutput": False}
1652
1653    def fit(self, X, y):
1654        """Fit the model using X, y as training data.
1655
1656        Parameters
1657        ----------
1658        X : array-like of shape (n_samples, n_features)
1659            Training data.
1660
1661        y : array-like of shape (n_samples,)
1662            Target values.
1663
1664        Returns
1665        -------
1666        self : object
1667            Returns an instance of self.
1668        """
1669        _normalize = _deprecate_normalize(
1670            self.normalize, default=True, estimator_name=self.__class__.__name__
1671        )
1672
1673        X, y = self._validate_data(X, y, y_numeric=True)
1674        X = as_float_array(X, copy=self.copy_X)
1675        y = as_float_array(y, copy=self.copy_X)
1676
1677        # init cross-validation generator
1678        cv = check_cv(self.cv, classifier=False)
1679
1680        # As we use cross-validation, the Gram matrix is not precomputed here
1681        Gram = self.precompute
1682        if hasattr(Gram, "__array__"):
1683            warnings.warn(
1684                'Parameter "precompute" cannot be an array in '
                '%s. Automatically switching to "auto" instead.'
1686                % self.__class__.__name__
1687            )
1688            Gram = "auto"
1689
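        # Fit one LARS path per cross-validation fold in parallel; each call
        # returns (alphas, active, coefs, residues) with the residues
        # evaluated on the held-out fold.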
1690        cv_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
1691            delayed(_lars_path_residues)(
1692                X[train],
1693                y[train],
1694                X[test],
1695                y[test],
1696                Gram=Gram,
1697                copy=False,
1698                method=self.method,
1699                verbose=max(0, self.verbose - 1),
1700                normalize=_normalize,
1701                fit_intercept=self.fit_intercept,
1702                max_iter=self.max_iter,
1703                eps=self.eps,
1704                positive=self.positive,
1705            )
1706            for train, test in cv.split(X, y)
1707        )
1708        all_alphas = np.concatenate(list(zip(*cv_paths))[0])
1709        # Unique also sorts
1710        all_alphas = np.unique(all_alphas)
1711        # Take at most max_n_alphas values
1712        stride = int(max(1, int(len(all_alphas) / float(self.max_n_alphas))))
1713        all_alphas = all_alphas[::stride]
1714
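        # Interpolate each fold's residues onto the common (sorted) alpha
        # grid so that squared errors can be averaged point-wise; the path
        # endpoints are padded so the interpolation covers all of all_alphas.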
1715        mse_path = np.empty((len(all_alphas), len(cv_paths)))
1716        for index, (alphas, _, _, residues) in enumerate(cv_paths):
1717            alphas = alphas[::-1]
1718            residues = residues[::-1]
1719            if alphas[0] != 0:
1720                alphas = np.r_[0, alphas]
1721                residues = np.r_[residues[0, np.newaxis], residues]
1722            if alphas[-1] != all_alphas[-1]:
1723                alphas = np.r_[alphas, all_alphas[-1]]
1724                residues = np.r_[residues, residues[-1, np.newaxis]]
1725            this_residues = interpolate.interp1d(alphas, residues, axis=0)(all_alphas)
1726            this_residues **= 2
1727            mse_path[:, index] = np.mean(this_residues, axis=-1)
1728
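        # Keep only the alpha values for which every fold produced a finite
        # error before averaging across folds.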
1729        mask = np.all(np.isfinite(mse_path), axis=-1)
1730        all_alphas = all_alphas[mask]
1731        mse_path = mse_path[mask]
1732        # Select the alpha that minimizes left-out error
1733        i_best_alpha = np.argmin(mse_path.mean(axis=-1))
1734        best_alpha = all_alphas[i_best_alpha]
1735
1736        # Store our parameters
1737        self.alpha_ = best_alpha
1738        self.cv_alphas_ = all_alphas
1739        self.mse_path_ = mse_path
1740
1741        # Now compute the full model
        # it will call a lasso internally when self is LassoLarsCV
1743        # as self.method == 'lasso'
1744        self._fit(
1745            X,
1746            y,
1747            max_iter=self.max_iter,
1748            alpha=best_alpha,
1749            Xy=None,
1750            fit_path=True,
1751            normalize=_normalize,
1752        )
1753        return self
1754
1755
1756class LassoLarsCV(LarsCV):
1757    """Cross-validated Lasso, using the LARS algorithm.
1758
1759    See glossary entry for :term:`cross-validation estimator`.
1760
1761    The optimization objective for Lasso is::
1762
1763    (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
1764
1765    Read more in the :ref:`User Guide <least_angle_regression>`.
1766
1767    Parameters
1768    ----------
1769    fit_intercept : bool, default=True
1770        Whether to calculate the intercept for this model. If set
1771        to false, no intercept will be used in calculations
1772        (i.e. data is expected to be centered).
1773
1774    verbose : bool or int, default=False
1775        Sets the verbosity amount.
1776
1777    max_iter : int, default=500
1778        Maximum number of iterations to perform.
1779
1780    normalize : bool, default=True
1781        This parameter is ignored when ``fit_intercept`` is set to False.
1782        If True, the regressors X will be normalized before regression by
1783        subtracting the mean and dividing by the l2-norm.
1784        If you wish to standardize, please use
1785        :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``
1786        on an estimator with ``normalize=False``.
1787
1788        .. deprecated:: 1.0
1789            ``normalize`` was deprecated in version 1.0. It will default
1790            to False in 1.2 and be removed in 1.4.
1791
    precompute : bool or 'auto', default='auto'
1793        Whether to use a precomputed Gram matrix to speed up
1794        calculations. If set to ``'auto'`` let us decide. The Gram matrix
1795        cannot be passed as argument since we will use only subsets of X.
1796
1797    cv : int, cross-validation generator or an iterable, default=None
1798        Determines the cross-validation splitting strategy.
1799        Possible inputs for cv are:
1800
1801        - None, to use the default 5-fold cross-validation,
1802        - integer, to specify the number of folds.
1803        - :term:`CV splitter`,
1804        - An iterable yielding (train, test) splits as arrays of indices.
1805
1806        For integer/None inputs, :class:`KFold` is used.
1807
        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.
1810
1811        .. versionchanged:: 0.22
1812            ``cv`` default value if None changed from 3-fold to 5-fold.
1813
1814    max_n_alphas : int, default=1000
1815        The maximum number of points on the path used to compute the
1816        residuals in the cross-validation.
1817
1818    n_jobs : int or None, default=None
1819        Number of CPUs to use during the cross validation.
1820        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
1821        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
1822        for more details.
1823
1824    eps : float, default=np.finfo(float).eps
1825        The machine-precision regularization in the computation of the
1826        Cholesky diagonal factors. Increase this for very ill-conditioned
1827        systems. Unlike the ``tol`` parameter in some iterative
1828        optimization-based algorithms, this parameter does not control
1829        the tolerance of the optimization.
1830
1831    copy_X : bool, default=True
1832        If True, X will be copied; else, it may be overwritten.
1833
    positive : bool, default=False
        Restrict coefficients to be >= 0. When enabled, you may also want
        to set ``fit_intercept=False`` (it is ``True`` by default).
        Under the positive restriction the model coefficients do not converge
        to the ordinary-least-squares solution for small values of alpha.
        Only coefficients up to the smallest alpha value (``alphas_[alphas_ >
        0.].min()`` when ``fit_path=True``) reached by the stepwise Lars-Lasso
        algorithm are typically consistent with the solution of the
        coordinate descent Lasso estimator.
        As a consequence, using LassoLarsCV only makes sense for problems
        where a sparse solution is expected and/or reached.
1845
1846    Attributes
1847    ----------
    coef_ : array-like of shape (n_features,)
        Parameter vector (w in the problem formulation).

    intercept_ : float
        Independent term in the decision function.

    coef_path_ : array-like of shape (n_features, n_alphas)
        The varying values of the coefficients along the path.

    alpha_ : float
        The estimated regularization parameter alpha.

    alphas_ : array-like of shape (n_alphas,)
        The different values of alpha along the path.
1862
    cv_alphas_ : array-like of shape (n_cv_alphas,)
        All the values of alpha along the path for the different folds.

    mse_path_ : array-like of shape (n_cv_alphas, n_folds)
        The mean square error on left-out data for each fold along the path
        (alpha values given by ``cv_alphas_``).

    n_iter_ : array-like or int
        The number of iterations run by Lars with the optimal alpha.
1872
1873    active_ : list of int
1874        Indices of active variables at the end of the path.
1875
1876    n_features_in_ : int
1877        Number of features seen during :term:`fit`.
1878
1879        .. versionadded:: 0.24
1880
1881    feature_names_in_ : ndarray of shape (`n_features_in_`,)
1882        Names of features seen during :term:`fit`. Defined only when `X`
1883        has feature names that are all strings.
1884
1885        .. versionadded:: 1.0
1886
1887    See Also
1888    --------
1889    lars_path : Compute Least Angle Regression or Lasso
1890        path using LARS algorithm.
1891    lasso_path : Compute Lasso path with coordinate descent.
1892    Lasso : Linear Model trained with L1 prior as
1893        regularizer (aka the Lasso).
1894    LassoCV : Lasso linear model with iterative fitting
1895        along a regularization path.
1896    LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.
1897    LassoLarsIC : Lasso model fit with Lars using BIC
1898        or AIC for model selection.
1899    sklearn.decomposition.sparse_encode : Sparse coding.
1900
1901    Notes
1902    -----
    The object solves the same problem as the LassoCV object. However,
    unlike the LassoCV, it finds the relevant alpha values by itself.
    In general, because of this property, it will be more stable.
    However, it is more fragile on heavily multicollinear datasets.
1907
1908    It is more efficient than the LassoCV if only a small number of
1909    features are selected compared to the total number, for instance if
1910    there are very few samples compared to the number of features.
1911
1912    Examples
1913    --------
1914    >>> from sklearn.linear_model import LassoLarsCV
1915    >>> from sklearn.datasets import make_regression
1916    >>> X, y = make_regression(noise=4.0, random_state=0)
1917    >>> reg = LassoLarsCV(cv=5, normalize=False).fit(X, y)
1918    >>> reg.score(X, y)
1919    0.9993...
1920    >>> reg.alpha_
1921    0.3972...
1922    >>> reg.predict(X[:1,])
1923    array([-78.4831...])
1924    """
1925
1926    method = "lasso"
1927
1928    def __init__(
1929        self,
1930        *,
1931        fit_intercept=True,
1932        verbose=False,
1933        max_iter=500,
1934        normalize="deprecated",
1935        precompute="auto",
1936        cv=None,
1937        max_n_alphas=1000,
1938        n_jobs=None,
1939        eps=np.finfo(float).eps,
1940        copy_X=True,
1941        positive=False,
1942    ):
1943        self.fit_intercept = fit_intercept
1944        self.verbose = verbose
1945        self.max_iter = max_iter
1946        self.normalize = normalize
1947        self.precompute = precompute
1948        self.cv = cv
1949        self.max_n_alphas = max_n_alphas
1950        self.n_jobs = n_jobs
1951        self.eps = eps
1952        self.copy_X = copy_X
1953        self.positive = positive
1954        # XXX : we don't use super().__init__
1955        # to avoid setting n_nonzero_coefs
1956
1957
1958class LassoLarsIC(LassoLars):
1959    """Lasso model fit with Lars using BIC or AIC for model selection.
1960
1961    The optimization objective for Lasso is::
1962
1963    (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
1964
    AIC is the Akaike information criterion [2]_ and BIC is the Bayesian
    information criterion [3]_. Such criteria are useful to select the value
    of the regularization parameter by making a trade-off between the
    goodness of fit and the complexity of the model. A good model should
    explain the data well while being simple.
1970
1971    Read more in the :ref:`User Guide <lasso_lars_ic>`.
1972
1973    Parameters
1974    ----------
1975    criterion : {'aic', 'bic'}, default='aic'
1976        The type of criterion to use.
1977
1978    fit_intercept : bool, default=True
1979        Whether to calculate the intercept for this model. If set
1980        to false, no intercept will be used in calculations
1981        (i.e. data is expected to be centered).
1982
1983    verbose : bool or int, default=False
1984        Sets the verbosity amount.
1985
1986    normalize : bool, default=True
1987        This parameter is ignored when ``fit_intercept`` is set to False.
1988        If True, the regressors X will be normalized before regression by
1989        subtracting the mean and dividing by the l2-norm.
1990        If you wish to standardize, please use
1991        :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``
1992        on an estimator with ``normalize=False``.
1993
1994        .. deprecated:: 1.0
1995            ``normalize`` was deprecated in version 1.0. It will default
1996            to False in 1.2 and be removed in 1.4.
1997
1998    precompute : bool, 'auto' or array-like, default='auto'
1999        Whether to use a precomputed Gram matrix to speed up
2000        calculations. If set to ``'auto'`` let us decide. The Gram
2001        matrix can also be passed as argument.
2002
2003    max_iter : int, default=500
2004        Maximum number of iterations to perform. Can be used for
2005        early stopping.
2006
2007    eps : float, default=np.finfo(float).eps
2008        The machine-precision regularization in the computation of the
2009        Cholesky diagonal factors. Increase this for very ill-conditioned
2010        systems. Unlike the ``tol`` parameter in some iterative
2011        optimization-based algorithms, this parameter does not control
2012        the tolerance of the optimization.
2013
2014    copy_X : bool, default=True
2015        If True, X will be copied; else, it may be overwritten.
2016
    positive : bool, default=False
        Restrict coefficients to be >= 0. When enabled, you may also want
        to set ``fit_intercept=False`` (it is ``True`` by default).
        Under the positive restriction the model coefficients do not converge
        to the ordinary-least-squares solution for small values of alpha.
        Only coefficients up to the smallest alpha value (``alphas_[alphas_ >
        0.].min()`` when ``fit_path=True``) reached by the stepwise Lars-Lasso
        algorithm are typically consistent with the solution of the
        coordinate descent Lasso estimator.
        As a consequence, using LassoLarsIC only makes sense for problems
        where a sparse solution is expected and/or reached.
2028
    noise_variance : float, default=None
        The estimated noise variance of the data. If `None`, an unbiased
        estimate is computed by an OLS model. However, this is only possible
        in the case where `n_samples > n_features + fit_intercept`.
2033
2034        .. versionadded:: 1.1
2035
2036    Attributes
2037    ----------
    coef_ : array-like of shape (n_features,)
        Parameter vector (w in the problem formulation).

    intercept_ : float
        Independent term in the decision function.

    alpha_ : float
        The alpha parameter chosen by the information criterion.
2046
2047    alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays
2048        Maximum of covariances (in absolute value) at each iteration.
2049        ``n_alphas`` is either ``max_iter``, ``n_features`` or the
2050        number of nodes in the path with ``alpha >= alpha_min``, whichever
2051        is smaller. If a list, it will be of length `n_targets`.
2052
    n_iter_ : int
        Number of iterations run by lars_path to find the grid of
        alphas.
2056
2057    criterion_ : array-like of shape (n_alphas,)
2058        The value of the information criteria ('aic', 'bic') across all
2059        alphas. The alpha which has the smallest information criterion is
2060        chosen, as specified in [1]_.
2061
2062    noise_variance_ : float
2063        The estimated noise variance from the data used to compute the
2064        criterion.
2065
2066        .. versionadded:: 1.1
2067
2068    n_features_in_ : int
2069        Number of features seen during :term:`fit`.
2070
2071        .. versionadded:: 0.24
2072
2073    feature_names_in_ : ndarray of shape (`n_features_in_`,)
2074        Names of features seen during :term:`fit`. Defined only when `X`
2075        has feature names that are all strings.
2076
2077        .. versionadded:: 1.0
2078
2079    See Also
2080    --------
2081    lars_path : Compute Least Angle Regression or Lasso
2082        path using LARS algorithm.
2083    lasso_path : Compute Lasso path with coordinate descent.
2084    Lasso : Linear Model trained with L1 prior as
2085        regularizer (aka the Lasso).
2086    LassoCV : Lasso linear model with iterative fitting
2087        along a regularization path.
2088    LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.
    LassoLarsCV : Cross-validated Lasso, using the LARS algorithm.
2090    sklearn.decomposition.sparse_encode : Sparse coding.
2091
2092    Notes
2093    -----
2094    The number of degrees of freedom is computed as in [1]_.
2095
2096    To have more details regarding the mathematical formulation of the
2097    AIC and BIC criteria, please refer to :ref:`User Guide <lasso_lars_ic>`.
2098
2099    References
2100    ----------
2101    .. [1] :arxiv:`Zou, Hui, Trevor Hastie, and Robert Tibshirani.
2102            "On the degrees of freedom of the lasso."
2103            The Annals of Statistics 35.5 (2007): 2173-2192.
2104            <0712.0881>`
2105
2106    .. [2] `Wikipedia entry on the Akaike information criterion
2107            <https://en.wikipedia.org/wiki/Akaike_information_criterion>`_
2108
2109    .. [3] `Wikipedia entry on the Bayesian information criterion
2110            <https://en.wikipedia.org/wiki/Bayesian_information_criterion>`_
2111
2112    Examples
2113    --------
2114    >>> from sklearn import linear_model
2115    >>> reg = linear_model.LassoLarsIC(criterion='bic', normalize=False)
2116    >>> X = [[-2, 2], [-1, 1], [0, 0], [1, 1], [2, 2]]
2117    >>> y = [-2.2222, -1.1111, 0, -1.1111, -2.2222]
2118    >>> reg.fit(X, y)
2119    LassoLarsIC(criterion='bic', normalize=False)
2120    >>> print(reg.coef_)
2121    [ 0.  -1.11...]
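
    When ``n_samples <= n_features + fit_intercept``, the OLS-based noise
    variance estimate is unavailable and ``noise_variance`` has to be
    supplied explicitly (a sketch; the value ``1.0`` is only illustrative):

    >>> reg_hd = linear_model.LassoLarsIC(criterion='bic', normalize=False,
    ...                                   noise_variance=1.0)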
2122    """
2123
2124    def __init__(
2125        self,
2126        criterion="aic",
2127        *,
2128        fit_intercept=True,
2129        verbose=False,
2130        normalize="deprecated",
2131        precompute="auto",
2132        max_iter=500,
2133        eps=np.finfo(float).eps,
2134        copy_X=True,
2135        positive=False,
2136        noise_variance=None,
2137    ):
2138        self.criterion = criterion
2139        self.fit_intercept = fit_intercept
2140        self.positive = positive
2141        self.max_iter = max_iter
2142        self.verbose = verbose
2143        self.normalize = normalize
2144        self.copy_X = copy_X
2145        self.precompute = precompute
2146        self.eps = eps
2147        self.fit_path = True
2148        self.noise_variance = noise_variance
2149
2150    def _more_tags(self):
2151        return {"multioutput": False}
2152
2153    def fit(self, X, y, copy_X=None):
2154        """Fit the model using X, y as training data.
2155
2156        Parameters
2157        ----------
2158        X : array-like of shape (n_samples, n_features)
2159            Training data.
2160
2161        y : array-like of shape (n_samples,)
2162            Target values. Will be cast to X's dtype if necessary.
2163
2164        copy_X : bool, default=None
2165            If provided, this parameter will override the choice
2166            of copy_X made at instance creation.
2167            If ``True``, X will be copied; else, it may be overwritten.
2168
2169        Returns
2170        -------
2171        self : object
2172            Returns an instance of self.
2173        """
2174        _normalize = _deprecate_normalize(
2175            self.normalize, default=True, estimator_name=self.__class__.__name__
2176        )
2177
2178        if copy_X is None:
2179            copy_X = self.copy_X
2180        X, y = self._validate_data(X, y, y_numeric=True)
2181
2182        X, y, Xmean, ymean, Xstd = LinearModel._preprocess_data(
2183            X, y, self.fit_intercept, _normalize, copy_X
2184        )
2185
2186        Gram = self.precompute
2187
2188        alphas_, _, coef_path_, self.n_iter_ = lars_path(
2189            X,
2190            y,
2191            Gram=Gram,
2192            copy_X=copy_X,
2193            copy_Gram=True,
2194            alpha_min=0.0,
2195            method="lasso",
2196            verbose=self.verbose,
2197            max_iter=self.max_iter,
2198            eps=self.eps,
2199            return_n_iter=True,
2200            positive=self.positive,
2201        )
2202
2203        n_samples = X.shape[0]
2204
2205        if self.criterion == "aic":
2206            criterion_factor = 2
2207        elif self.criterion == "bic":
2208            criterion_factor = log(n_samples)
2209        else:
2210            raise ValueError(
2211                f"criterion should be either bic or aic, got {self.criterion!r}"
2212            )
2213
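        # coef_path_ has shape (n_features, n_alphas): one column per knot of
        # the regularization path, so the residual sum of squares below is
        # computed for every candidate model at once.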
2214        residuals = y[:, np.newaxis] - np.dot(X, coef_path_)
2215        residuals_sum_squares = np.sum(residuals ** 2, axis=0)
2216        degrees_of_freedom = np.zeros(coef_path_.shape[1], dtype=int)
2217        for k, coef in enumerate(coef_path_.T):
2218            mask = np.abs(coef) > np.finfo(coef.dtype).eps
2219            if not np.any(mask):
2220                continue
            # The number of degrees of freedom equals the number of non-zero
            # coefficients, i.e. Trace(Xc @ inv(Xc.T @ Xc) @ Xc.T) with
            # Xc = X[:, mask].
2224            degrees_of_freedom[k] = np.sum(mask)
2225
2226        self.alphas_ = alphas_
2227
2228        if self.noise_variance is None:
2229            self.noise_variance_ = self._estimate_noise_variance(
2230                X, y, positive=self.positive
2231            )
2232        else:
2233            self.noise_variance_ = self.noise_variance
2234
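        # Information criterion as used here:
        #   criterion = n_samples * log(2 * pi * sigma^2) + RSS / sigma^2
        #               + factor * degrees_of_freedom
        # with factor = 2 for AIC and log(n_samples) for BIC, and sigma^2 the
        # (estimated or user-provided) noise variance.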
2235        self.criterion_ = (
2236            n_samples * np.log(2 * np.pi * self.noise_variance_)
2237            + residuals_sum_squares / self.noise_variance_
2238            + criterion_factor * degrees_of_freedom
2239        )
2240        n_best = np.argmin(self.criterion_)
2241
2242        self.alpha_ = alphas_[n_best]
2243        self.coef_ = coef_path_[:, n_best]
2244        self._set_intercept(Xmean, ymean, Xstd)
2245        return self
2246
2247    def _estimate_noise_variance(self, X, y, positive):
2248        """Compute an estimate of the variance with an OLS model.
2249
2250        Parameters
2251        ----------
2252        X : ndarray of shape (n_samples, n_features)
2253            Data to be fitted by the OLS model. We expect the data to be
2254            centered.
2255
2256        y : ndarray of shape (n_samples,)
2257            Associated target.
2258
        positive : bool, default=False
            Restrict coefficients to be >= 0. This should be in line with
            the `positive` parameter from `LassoLarsIC`.
2262
2263        Returns
2264        -------
2265        noise_variance : float
2266            An estimator of the noise variance of an OLS model.
2267        """
2268        if X.shape[0] <= X.shape[1] + self.fit_intercept:
2269            raise ValueError(
2270                f"You are using {self.__class__.__name__} in the case where the number "
2271                "of samples is smaller than the number of features. In this setting, "
2272                "getting a good estimate for the variance of the noise is not "
2273                "possible. Provide an estimate of the noise variance in the "
2274                "constructor."
2275            )
2276        # X and y are already centered and we don't need to fit with an intercept
2277        ols_model = LinearRegression(positive=positive, fit_intercept=False)
2278        y_pred = ols_model.fit(X, y).predict(X)
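        # Unbiased estimate: residual sum of squares divided by the residual
        # degrees of freedom (n_samples - n_features - fit_intercept).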
2279        return np.sum((y - y_pred) ** 2) / (
2280            X.shape[0] - X.shape[1] - self.fit_intercept
2281        )
2282