1"""
2Generalized Linear Models.
3"""
4
5# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
6# Fabian Pedregosa <fabian.pedregosa@inria.fr>
7# Olivier Grisel <olivier.grisel@ensta.org>
8#         Vincent Michel <vincent.michel@inria.fr>
9#         Peter Prettenhofer <peter.prettenhofer@gmail.com>
10#         Mathieu Blondel <mathieu@mblondel.org>
11#         Lars Buitinck
12#         Maryan Morel <maryan.morel@polytechnique.edu>
13#         Giorgio Patrini <giorgio.patrini@anu.edu.au>
14#         Maria Telenczuk <https://github.com/maikia>
15# License: BSD 3 clause
16
from abc import ABCMeta, abstractmethod
import numbers
import warnings

import numpy as np
import scipy.sparse as sp
from scipy import linalg
from scipy import optimize
from scipy import sparse
from scipy.special import expit
from joblib import Parallel

from ..base import BaseEstimator, ClassifierMixin, RegressorMixin, MultiOutputMixin
from ..preprocessing._data import _is_constant_feature
from ..utils import check_array
from ..utils.validation import FLOAT_DTYPES
from ..utils import check_random_state
from ..utils.extmath import safe_sparse_dot
from ..utils.extmath import _incremental_mean_and_var
from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale
from ..utils.fixes import sparse_lsqr
from ..utils._seq_dataset import ArrayDataset32, CSRDataset32
from ..utils._seq_dataset import ArrayDataset64, CSRDataset64
from ..utils.validation import check_is_fitted, _check_sample_weight
from ..utils.fixes import delayed

# TODO: bayesian_ridge_regression and bayesian_regression_ard
# should be squashed into their respective objects.

SPARSE_INTERCEPT_DECAY = 0.01
# For sparse data, intercept updates are scaled by this decay factor to avoid
# intercept oscillation.


# FIXME in 1.2: parameter 'normalize' should be removed from linear models
# in cases where now normalize=False. The default value of 'normalize' should
# be changed to False in linear models where now normalize=True.
def _deprecate_normalize(normalize, default, estimator_name):
    """Select the appropriate deprecation message to display to the user.

    The `normalize` parameter is being deprecated from linear models; using a
    pipeline with a StandardScaler is recommended instead. The message shown
    to the user depends on the estimator's default `normalize` value (which
    varies between the linear models) and on the value the user selected.

    Parameters
    ----------
    normalize : bool
        Normalize value passed by the user.

    default : bool
        Default normalize value used by the estimator.

    estimator_name : str
        Name of the linear estimator which calls this function.
        The name will be used for writing the deprecation warnings.

    Returns
    -------
    normalize : bool
        Normalize value which should further be used by the estimator at this
        stage of the deprecation process.

    Notes
    -----
    This function should be updated in 1.2 depending on the value of
    `normalize`:
    - True, warning: `normalize` was deprecated in 1.2 and will be removed in
      1.4. Suggest to use a pipeline instead.
    - False, `normalize` was deprecated in 1.2 and will be removed in 1.4.
      Leave normalize to its default value.
    - `deprecated` - this should only be possible with default == False, as
      from 1.2 `normalize` in all the linear models should be either removed
      or the default should be set to False.
    This function should be completely removed in 1.4.
    """

    if normalize not in [True, False, "deprecated"]:
        raise ValueError(
            "Leave 'normalize' to its default value or set it to True or False"
        )

    if normalize == "deprecated":
        _normalize = default
    else:
        _normalize = normalize

    pipeline_msg = (
        "If you wish to scale the data, use Pipeline with a StandardScaler "
        "in a preprocessing stage. To reproduce the previous behavior:\n\n"
        "from sklearn.pipeline import make_pipeline\n\n"
        "model = make_pipeline(StandardScaler(with_mean=False), "
        f"{estimator_name}())\n\n"
        "If you wish to pass a sample_weight parameter, you need to pass it "
        "as a fit parameter to each step of the pipeline as follows:\n\n"
        "kwargs = {s[0] + '__sample_weight': sample_weight for s "
        "in model.steps}\n"
        "model.fit(X, y, **kwargs)\n\n"
    )

    if estimator_name == "Ridge" or estimator_name == "RidgeClassifier":
        alpha_msg = "Set parameter alpha to: original_alpha * n_samples. "
    elif "Lasso" in estimator_name:
        alpha_msg = "Set parameter alpha to: original_alpha * np.sqrt(n_samples). "
    elif "ElasticNet" in estimator_name:
        alpha_msg = (
            "Set parameter alpha to original_alpha * np.sqrt(n_samples) if "
            "l1_ratio is 1, and to original_alpha * n_samples if l1_ratio is "
            "0. For other values of l1_ratio, no analytic formula is "
            "available."
        )
    elif estimator_name == "RidgeCV" or estimator_name == "RidgeClassifierCV":
        alpha_msg = "Set parameter alphas to: original_alphas * n_samples. "
    else:
        alpha_msg = ""

    if default and normalize == "deprecated":
        warnings.warn(
            "The default of 'normalize' will be set to False in version 1.2 "
            "and deprecated in version 1.4.\n"
            + pipeline_msg
            + alpha_msg,
            FutureWarning,
        )
    elif normalize != "deprecated" and normalize and not default:
        warnings.warn(
            "'normalize' was deprecated in version 1.0 and will be removed in 1.2.\n"
            + pipeline_msg
            + alpha_msg,
            FutureWarning,
        )
    elif not normalize and not default:
        warnings.warn(
            "'normalize' was deprecated in version 1.0 and will be "
            "removed in 1.2. "
            "Please leave the normalize parameter to its default value to "
            "silence this warning. The default behavior of this estimator "
            "is to not do any normalization. If normalization is needed "
            "please use sklearn.preprocessing.StandardScaler instead.",
            FutureWarning,
        )

    return _normalize


def make_dataset(X, y, sample_weight, random_state=None):
    """Create ``Dataset`` abstraction for sparse and dense inputs.

    This also returns the ``intercept_decay``, which is different
    for sparse datasets.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data.

    y : array-like, shape (n_samples, )
        Target values.

    sample_weight : numpy array of shape (n_samples,)
        The weight of each sample.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for dataset shuffling and noise.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    dataset
        The ``Dataset`` abstraction.
    intercept_decay
        The intercept decay.
    """

    rng = check_random_state(random_state)
    # seed should never be 0 in SequentialDataset64
    seed = rng.randint(1, np.iinfo(np.int32).max)

    if X.dtype == np.float32:
        CSRData = CSRDataset32
        ArrayData = ArrayDataset32
    else:
        CSRData = CSRDataset64
        ArrayData = ArrayDataset64

    if sp.issparse(X):
        dataset = CSRData(X.data, X.indptr, X.indices, y, sample_weight, seed=seed)
        intercept_decay = SPARSE_INTERCEPT_DECAY
    else:
        X = np.ascontiguousarray(X)
        dataset = ArrayData(X, y, sample_weight, seed=seed)
        intercept_decay = 1.0

    return dataset, intercept_decay
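
# A minimal, illustrative sketch of how ``make_dataset`` is typically
# consumed (hypothetical values; in practice the returned ``Dataset`` is
# handed straight to the Cython SGD/SAG solvers):
#
#   X = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float64)
#   y = np.array([1.0, 2.0])
#   sample_weight = np.ones_like(y)
#   dataset, decay = make_dataset(X, y, sample_weight, random_state=0)
#   # dense float64 input -> ArrayDataset64 and decay == 1.0; a CSR input
#   # would yield CSRDataset64 and decay == SPARSE_INTERCEPT_DECAY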


def _preprocess_data(
    X,
    y,
    fit_intercept,
    normalize=False,
    copy=True,
    sample_weight=None,
    return_mean=False,
    check_input=True,
):
    """Center and scale data.

    Centers data to have mean zero along axis 0. If fit_intercept=False or if
    X is a sparse matrix, no centering is done, but normalization can still be
    applied. The function returns the statistics necessary to reconstruct the
    input data, which are X_offset, y_offset, X_scale, such that the output

        X = (X - X_offset) / X_scale

    X_scale is the L2 norm of X - X_offset (an array of ones when
    normalize=False). If sample_weight is not None, then the weighted mean of
    X and y is zero, and not the mean itself. If return_mean=True, the mean
    (weighted if sample_weight is given) is returned, independently of whether
    X was centered (option used for optimization with sparse data in
    coordinate descent).

    This is here because nearly all linear models will want their data to be
    centered. This function also systematically makes y consistent with
    X.dtype.
    """
    if isinstance(sample_weight, numbers.Number):
        sample_weight = None
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)

    if check_input:
        X = check_array(X, copy=copy, accept_sparse=["csr", "csc"], dtype=FLOAT_DTYPES)
    elif copy:
        if sp.issparse(X):
            X = X.copy()
        else:
            X = X.copy(order="K")

    y = np.asarray(y, dtype=X.dtype)

    if fit_intercept:
        if sp.issparse(X):
            X_offset, X_var = mean_variance_axis(X, axis=0, weights=sample_weight)
            if not return_mean:
                X_offset[:] = X.dtype.type(0)
        else:
            if normalize:
                X_offset, X_var, _ = _incremental_mean_and_var(
                    X,
                    last_mean=0.0,
                    last_variance=0.0,
                    last_sample_count=0.0,
                    sample_weight=sample_weight,
                )
            else:
                X_offset = np.average(X, axis=0, weights=sample_weight)

            X_offset = X_offset.astype(X.dtype, copy=False)
            X -= X_offset

        if normalize:
            X_var = X_var.astype(X.dtype, copy=False)
            # Detect constant features on the computed variance, before taking
            # the np.sqrt. Otherwise constant features cannot be detected with
            # sample weights.
            constant_mask = _is_constant_feature(X_var, X_offset, X.shape[0])
            if sample_weight is None:
                X_var *= X.shape[0]
            else:
                X_var *= sample_weight.sum()
            X_scale = np.sqrt(X_var, out=X_var)
            X_scale[constant_mask] = 1.0
            if sp.issparse(X):
                inplace_column_scale(X, 1.0 / X_scale)
            else:
                X /= X_scale
        else:
            X_scale = np.ones(X.shape[1], dtype=X.dtype)

        y_offset = np.average(y, axis=0, weights=sample_weight)
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
        X_scale = np.ones(X.shape[1], dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_scale
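
# An illustrative sketch of the invariants ``_preprocess_data`` returns
# (hypothetical values; dense input, no sample weights):
#
#   X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
#   y = np.array([1.0, 2.0, 3.0])
#   Xc, yc, X_offset, y_offset, X_scale = _preprocess_data(
#       X, y, fit_intercept=True, normalize=False, copy=True
#   )
#   # X_offset == X.mean(axis=0), y_offset == y.mean(), X_scale == ones,
#   # Xc == (X - X_offset) / X_scale, yc == y - y_offset; a model fit on
#   # (Xc, yc) is mapped back to the original scale by _set_intercept.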


# TODO: _rescale_data should be factored into _preprocess_data.
# Currently, the fact that sag implements its own way to deal with
# sample_weight makes the refactoring tricky.


def _rescale_data(X, y, sample_weight):
    """Rescale data sample-wise by square root of sample_weight.

    For many linear models, this enables easy support for sample_weight.

    Returns
    -------
    X_rescaled : {array-like, sparse matrix}

    y_rescaled : {array-like, sparse matrix}
    """
    n_samples = X.shape[0]
    sample_weight = np.asarray(sample_weight)
    if sample_weight.ndim == 0:
        sample_weight = np.full(n_samples, sample_weight, dtype=sample_weight.dtype)
    sample_weight = np.sqrt(sample_weight)
    sw_matrix = sparse.dia_matrix((sample_weight, 0), shape=(n_samples, n_samples))
    X = safe_sparse_dot(sw_matrix, X)
    y = safe_sparse_dot(sw_matrix, y)
    return X, y
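
# Why the square root: for squared-error models the weighted objective
#
#   sum_i w_i * (y_i - x_i @ coef) ** 2
#
# equals the unweighted objective on the rescaled data,
#
#   || diag(sqrt(w)) @ (y - X @ coef) || ** 2,
#
# so rescaling X and y by sqrt(sample_weight) reduces a weighted
# least-squares fit to an ordinary one.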


class LinearModel(BaseEstimator, metaclass=ABCMeta):
    """Base class for Linear Models"""

    @abstractmethod
    def fit(self, X, y):
        """Fit model."""

    def _decision_function(self, X):
        check_is_fitted(self)

        X = self._validate_data(X, accept_sparse=["csr", "csc", "coo"], reset=False)
        return safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_

    def predict(self, X):
        """
        Predict using the linear model.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Samples.

        Returns
        -------
        C : array, shape (n_samples,)
            Returns predicted values.
        """
        return self._decision_function(X)

    _preprocess_data = staticmethod(_preprocess_data)

    def _set_intercept(self, X_offset, y_offset, X_scale):
        """Set the intercept_"""
        if self.fit_intercept:
            self.coef_ = self.coef_ / X_scale
            self.intercept_ = y_offset - np.dot(X_offset, self.coef_.T)
        else:
            self.intercept_ = 0.0

    def _more_tags(self):
        return {"requires_y": True}


# XXX Should this derive from LinearModel? It should be a mixin, not an ABC.
# Maybe the n_features checking can be moved to LinearModel.
class LinearClassifierMixin(ClassifierMixin):
    """Mixin for linear classifiers.

    Handles prediction for sparse and dense X.
    """

    def decision_function(self, X):
        """
        Predict confidence scores for samples.

        The confidence score for a sample is proportional to the signed
        distance of that sample to the hyperplane.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data matrix for which we want to get the confidence scores.

        Returns
        -------
        scores : ndarray of shape (n_samples,) or (n_samples, n_classes)
            Confidence scores per `(n_samples, n_classes)` combination. In the
            binary case, confidence score for `self.classes_[1]` where >0 means
            this class would be predicted.
        """
        check_is_fitted(self)

        X = self._validate_data(X, accept_sparse="csr", reset=False)
        scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_
        return scores.ravel() if scores.shape[1] == 1 else scores

    def predict(self, X):
        """
        Predict class labels for samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data matrix for which we want to get the predictions.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Vector containing the class labels for each sample.
        """
        scores = self.decision_function(X)
        if scores.ndim == 1:
            indices = (scores > 0).astype(int)
        else:
            indices = scores.argmax(axis=1)
        return self.classes_[indices]

    def _predict_proba_lr(self, X):
        """Probability estimation for OvR logistic regression.

        Positive class probabilities are computed as
        1. / (1. + np.exp(-self.decision_function(X)));
        multiclass is handled by normalizing that over all classes.
        """
        prob = self.decision_function(X)
        expit(prob, out=prob)
        if prob.ndim == 1:
            return np.vstack([1 - prob, prob]).T
        else:
            # OvR normalization, like LibLinear's predict_probability
            prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
            return prob
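
# A worked numeric example of the OvR normalization in _predict_proba_lr
# (illustrative values, rounded to two decimals):
#
#   decision scores: [[ 2.0 , -1.0 ,  0.5 ]]
#   after expit:     [[ 0.88,  0.27,  0.62]]
#   row-normalized:  [[ 0.50,  0.15,  0.35]]   # each row sums to 1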


class SparseCoefMixin:
    """Mixin for converting coef_ to and from CSR format.

    L1-regularizing estimators should inherit this.
    """

    def densify(self):
        """
        Convert coefficient matrix to dense array format.

        Converts the ``coef_`` member (back) to a numpy.ndarray. This is the
        default format of ``coef_`` and is required for fitting, so calling
        this method is only required on models that have previously been
        sparsified; otherwise, it is a no-op.

        Returns
        -------
        self
            Fitted estimator.
        """
        msg = "Estimator, %(name)s, must be fitted before densifying."
        check_is_fitted(self, msg=msg)
        if sp.issparse(self.coef_):
            self.coef_ = self.coef_.toarray()
        return self

    def sparsify(self):
        """
        Convert coefficient matrix to sparse format.

        Converts the ``coef_`` member to a scipy.sparse matrix, which for
        L1-regularized models can be much more memory- and storage-efficient
        than the usual numpy.ndarray representation.

        The ``intercept_`` member is not converted.

        Returns
        -------
        self
            Fitted estimator.

        Notes
        -----
        For non-sparse models, i.e. when there are not many zeros in ``coef_``,
        this may actually *increase* memory usage, so use this method with
        care. A rule of thumb is that the number of zero elements, which can
        be computed with ``(coef_ == 0).sum()``, must be more than 50% for this
        to provide significant benefits.

        After calling this method, further fitting with the partial_fit
        method (if any) will not work until you call densify.
        """
        msg = "Estimator, %(name)s, must be fitted before sparsifying."
        check_is_fitted(self, msg=msg)
        self.coef_ = sp.csr_matrix(self.coef_)
        return self
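
# An illustrative round trip (``clf`` is hypothetical; any fitted
# L1-penalized estimator inheriting SparseCoefMixin behaves the same):
#
#   clf.sparsify()   # coef_ becomes a scipy.sparse.csr_matrix
#   clf.densify()    # coef_ is a dense ndarray again, which is required
#                    # before any further call to partial_fit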


class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
    """
    Ordinary least squares Linear Regression.

    LinearRegression fits a linear model with coefficients w = (w1, ..., wp)
    to minimize the residual sum of squares between the observed targets in
    the dataset, and the targets predicted by the linear approximation.

    Parameters
    ----------
    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model. If set
        to False, no intercept will be used in calculations
        (i.e. data is expected to be centered).

    normalize : bool, default=False
        This parameter is ignored when ``fit_intercept`` is set to False.
        If True, the regressors X will be normalized before regression by
        subtracting the mean and dividing by the l2-norm.
        If you wish to standardize, please use
        :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``
        on an estimator with ``normalize=False``.

        .. deprecated:: 1.0
           `normalize` was deprecated in version 1.0 and will be
           removed in 1.2.

    copy_X : bool, default=True
        If True, X will be copied; else, it may be overwritten.

    n_jobs : int, default=None
        The number of jobs to use for the computation. This only provides a
        speedup for sufficiently large problems, that is if `n_targets > 1`
        and either `X` is sparse or `positive` is set to `True`. ``None``
        means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1``
        means using all processors. See :term:`Glossary <n_jobs>` for more
        details.

    positive : bool, default=False
        When set to ``True``, forces the coefficients to be positive. This
        option is only supported for dense arrays.

        .. versionadded:: 0.24

    Attributes
    ----------
    coef_ : array of shape (n_features, ) or (n_targets, n_features)
        Estimated coefficients for the linear regression problem.
        If multiple targets are passed during the fit (y is 2D), this
        is a 2D array of shape (n_targets, n_features), while if only
        one target is passed, this is a 1D array of length n_features.

    rank_ : int
        Rank of matrix `X`. Only available when `X` is dense.

    singular_ : array of shape (min(n_samples, n_features),)
        Singular values of `X`. Only available when `X` is dense.

    intercept_ : float or array of shape (n_targets,)
        Independent term in the linear model. Set to 0.0 if
        `fit_intercept = False`.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Ridge : Ridge regression addresses some of the
        problems of Ordinary Least Squares by imposing a penalty on the
        size of the coefficients with l2 regularization.
    Lasso : The Lasso is a linear model that estimates
        sparse coefficients with l1 regularization.
    ElasticNet : Elastic-Net is a linear regression
        model trained with both l1 and l2 -norm regularization of the
        coefficients.

    Notes
    -----
    From the implementation point of view, this is just plain Ordinary
    Least Squares (scipy.linalg.lstsq) or Non-Negative Least Squares
    (scipy.optimize.nnls) wrapped as a predictor object.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import LinearRegression
    >>> X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
    >>> # y = 1 * x_0 + 2 * x_1 + 3
    >>> y = np.dot(X, np.array([1, 2])) + 3
    >>> reg = LinearRegression().fit(X, y)
    >>> reg.score(X, y)
    1.0
    >>> reg.coef_
    array([1., 2.])
    >>> reg.intercept_
    3.0...
    >>> reg.predict(np.array([[3, 5]]))
    array([16.])
    """

    def __init__(
        self,
        *,
        fit_intercept=True,
        normalize="deprecated",
        copy_X=True,
        n_jobs=None,
        positive=False,
    ):
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.copy_X = copy_X
        self.n_jobs = n_jobs
        self.positive = positive

    def fit(self, X, y, sample_weight=None):
        """
        Fit linear model.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values. Will be cast to X's dtype if necessary.

        sample_weight : array-like of shape (n_samples,), default=None
            Individual weights for each sample.

            .. versionadded:: 0.17
               parameter *sample_weight* support to LinearRegression.

        Returns
        -------
        self : object
            Fitted Estimator.
        """

        _normalize = _deprecate_normalize(
            self.normalize, default=False, estimator_name=self.__class__.__name__
        )

        n_jobs_ = self.n_jobs

        accept_sparse = False if self.positive else ["csr", "csc", "coo"]

        X, y = self._validate_data(
            X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
        )

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            normalize=_normalize,
            copy=self.copy_X,
            sample_weight=sample_weight,
            return_mean=True,
        )

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            X, y = _rescale_data(X, y, sample_weight)

        if self.positive:
            if y.ndim < 2:
                self.coef_, self._residues = optimize.nnls(X, y)
            else:
                # scipy.optimize.nnls cannot handle y with shape (M, K)
                outs = Parallel(n_jobs=n_jobs_)(
                    delayed(optimize.nnls)(X, y[:, j]) for j in range(y.shape[1])
                )
                self.coef_, self._residues = map(np.vstack, zip(*outs))
        elif sp.issparse(X):
            X_offset_scale = X_offset / X_scale
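            # Centering X explicitly would densify it, so the centered and
            # scaled design matrix, X - 1 @ X_offset_scale.T, is applied
            # implicitly through a LinearOperator:
            #   matvec(b)  = X @ b - (X_offset_scale @ b)   # scalar, broadcast
            #   rmatvec(b) = X.T @ b - X_offset_scale * b.sum()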

            def matvec(b):
                return X.dot(b) - b.dot(X_offset_scale)

            def rmatvec(b):
                return X.T.dot(b) - X_offset_scale * np.sum(b)

            X_centered = sparse.linalg.LinearOperator(
                shape=X.shape, matvec=matvec, rmatvec=rmatvec
            )

            if y.ndim < 2:
                out = sparse_lsqr(X_centered, y)
                self.coef_ = out[0]
                self._residues = out[3]
            else:
                # sparse_lsqr cannot handle y with shape (M, K)
                outs = Parallel(n_jobs=n_jobs_)(
                    delayed(sparse_lsqr)(X_centered, y[:, j].ravel())
                    for j in range(y.shape[1])
                )
                self.coef_ = np.vstack([out[0] for out in outs])
                self._residues = np.vstack([out[3] for out in outs])
        else:
            self.coef_, self._residues, self.rank_, self.singular_ = linalg.lstsq(X, y)
            self.coef_ = self.coef_.T

        if y.ndim == 1:
            self.coef_ = np.ravel(self.coef_)
        self._set_intercept(X_offset, y_offset, X_scale)
        return self


def _check_precomputed_gram_matrix(
    X, precompute, X_offset, X_scale, rtol=1e-7, atol=1e-5
):
    """Computes a single element of the Gram matrix and compares it to
    the corresponding element of the user-supplied Gram matrix.

    If the values do not match, a ValueError is raised.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data array.

    precompute : array-like of shape (n_features, n_features)
        User-supplied Gram matrix.

    X_offset : ndarray of shape (n_features,)
        Array of feature means used to center the design matrix.

    X_scale : ndarray of shape (n_features,)
        Array of feature scale factors used to normalize the design matrix.

    rtol : float, default=1e-7
        Relative tolerance; see :func:`numpy.allclose`.

    atol : float, default=1e-5
        Absolute tolerance; see :func:`numpy.allclose`. Note that the default
        here is more tolerant than the default for
        :func:`numpy.testing.assert_allclose`, where `atol=0`.

    Raises
    ------
    ValueError
        Raised when the provided Gram matrix is not consistent.
    """

    n_features = X.shape[1]
    f1 = n_features // 2
    f2 = min(f1 + 1, n_features - 1)

    v1 = (X[:, f1] - X_offset[f1]) * X_scale[f1]
    v2 = (X[:, f2] - X_offset[f2]) * X_scale[f2]

    expected = np.dot(v1, v2)
    actual = precompute[f1, f2]

    if not np.isclose(expected, actual, rtol=rtol, atol=atol):
        raise ValueError(
            "Gram matrix passed in via 'precompute' parameter "
            "did not pass validation when a single element was "
            "checked - please check that it was computed "
            f"properly. For element ({f1},{f2}) we computed "
            f"{expected} but the user-supplied value was "
            f"{actual}."
        )


def _pre_fit(
    X,
    y,
    Xy,
    precompute,
    normalize,
    fit_intercept,
    copy,
    check_input=True,
    sample_weight=None,
):
    """Aux function used at beginning of fit in linear models.

    Preprocesses (centers and optionally normalizes) X and y, rescales them
    by sample_weight if given, and validates or computes the Gram matrix
    ``precompute`` and the ``Xy`` product used by the coordinate descent
    solvers.
    """
    n_samples, n_features = X.shape

    if sparse.isspmatrix(X):
        # copy is not needed here as X is not modified inplace when X is sparse
        precompute = False
        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            fit_intercept=fit_intercept,
            normalize=normalize,
            copy=False,
            return_mean=True,
            check_input=check_input,
        )
    else:
        # copy was done in fit if necessary
        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            fit_intercept=fit_intercept,
            normalize=normalize,
            copy=copy,
            check_input=check_input,
            sample_weight=sample_weight,
        )
    if sample_weight is not None:
        X, y = _rescale_data(X, y, sample_weight=sample_weight)

    # FIXME: 'normalize' to be removed in 1.2
    if hasattr(precompute, "__array__"):
        if (
            fit_intercept
            and not np.allclose(X_offset, np.zeros(n_features))
            or normalize
            and not np.allclose(X_scale, np.ones(n_features))
        ):
            warnings.warn(
                "Gram matrix was provided but X was centered to fit "
                "intercept, or X was normalized: recomputing Gram matrix.",
                UserWarning,
            )
            # recompute Gram
            precompute = "auto"
            Xy = None
        elif check_input:
            # If we're going to use the user's precomputed Gram matrix, we
            # do a quick check to make sure it's not totally bogus.
            _check_precomputed_gram_matrix(X, precompute, X_offset, X_scale)

    # precompute if n_samples > n_features
    if isinstance(precompute, str) and precompute == "auto":
        precompute = n_samples > n_features
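    # Rationale: when n_samples > n_features, the Gram matrix X.T @ X has
    # shape (n_features, n_features) and is cheaper for the coordinate
    # descent solvers to work with than the taller X itself.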

    if precompute is True:
        # make sure that the 'precompute' array is contiguous.
        precompute = np.empty(shape=(n_features, n_features), dtype=X.dtype, order="C")
        np.dot(X.T, X, out=precompute)

    if not hasattr(precompute, "__array__"):
        Xy = None  # cannot use Xy if precompute is not Gram

    if hasattr(precompute, "__array__") and Xy is None:
        common_dtype = np.find_common_type([X.dtype, y.dtype], [])
        if y.ndim == 1:
            # Xy is 1d, make sure it is contiguous.
            Xy = np.empty(shape=n_features, dtype=common_dtype, order="C")
            np.dot(X.T, y, out=Xy)
        else:
            # Make sure that Xy is always F contiguous even if X or y are not
            # contiguous: the goal is to make it fast to extract the data for a
            # specific target.
            n_targets = y.shape[1]
            Xy = np.empty(shape=(n_features, n_targets), dtype=common_dtype, order="F")
            np.dot(y.T, X, out=Xy.T)

    return X, y, X_offset, y_offset, X_scale, precompute, Xy
