1"""
2The :mod:`sklearn.pipeline` module implements utilities to build a composite
3estimator, as a chain of transforms and estimators.
4"""
5# Author: Edouard Duchesnay
6#         Gael Varoquaux
7#         Virgile Fritsch
8#         Alexandre Gramfort
9#         Lars Buitinck
10# License: BSD
11
12from collections import defaultdict
13from itertools import islice
14
15import numpy as np
16from scipy import sparse
17from joblib import Parallel
18
19from .base import clone, TransformerMixin
20from .utils._estimator_html_repr import _VisualBlock
21from .utils.metaestimators import available_if
22from .utils import (
23    Bunch,
24    _print_elapsed_time,
25)
26from .utils.deprecation import deprecated
27from .utils._tags import _safe_tags
28from .utils.validation import check_memory
29from .utils.validation import check_is_fitted
30from .utils.fixes import delayed
31from .exceptions import NotFittedError
32
33from .utils.metaestimators import _BaseComposition
34
# Names exported by ``from sklearn.pipeline import *``; this is the public API
# of the module.
__all__ = ["Pipeline", "FeatureUnion", "make_pipeline", "make_union"]
36
37
def _final_estimator_has(attr):
    """Build a predicate checking that the final estimator exposes `attr`.

    The returned callable is meant to be handed to `available_if` when
    decorating `Pipeline` methods.
    """

    def check(self):
        # Plain attribute access on purpose: when `attr` is missing, the
        # original `AttributeError` from the final estimator propagates.
        final_estimator = self._final_estimator
        getattr(final_estimator, attr)
        return True

    return check
49
50
class Pipeline(_BaseComposition):
    """
    Pipeline of transforms with a final estimator.

    Sequentially apply a list of transforms and a final estimator.
    Intermediate steps of the pipeline must be 'transforms', that is, they
    must implement `fit` and `transform` methods.
    The final estimator only needs to implement `fit`.
    The transformers in the pipeline can be cached using ``memory`` argument.

    The purpose of the pipeline is to assemble several steps that can be
    cross-validated together while setting different parameters. For this, it
    enables setting parameters of the various steps using their names and the
    parameter name separated by a `'__'`, as in the example below. A step's
    estimator may be replaced entirely by setting the parameter with its name
    to another estimator, or a transformer removed by setting it to
    `'passthrough'` or `None`.

    Read more in the :ref:`User Guide <pipeline>`.

    .. versionadded:: 0.5

    Parameters
    ----------
    steps : list of tuple
        List of (name, transform) tuples (implementing `fit`/`transform`) that
        are chained, in the order in which they are chained, with the last
        object an estimator.

    memory : str or object with the joblib.Memory interface, default=None
        Used to cache the fitted transformers of the pipeline. By default,
        no caching is performed. If a string is given, it is the path to
        the caching directory. Enabling caching triggers a clone of
        the transformers before fitting. Therefore, the transformer
        instance given to the pipeline cannot be inspected
        directly. Use the attribute ``named_steps`` or ``steps`` to
        inspect estimators within the pipeline. Caching the
        transformers is advantageous when fitting is time consuming.

    verbose : bool, default=False
        If True, the time elapsed while fitting each step will be printed as it
        is completed.

    Attributes
    ----------
    named_steps : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.
        Read-only attribute to access any step parameter by user given name.
        Keys are step names and values are steps parameters.

    classes_ : ndarray of shape (n_classes,)
        The class labels. Only exists if the last step of the pipeline is a
        classifier.

    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying first estimator in `steps` exposes such an attribute
        when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Only defined if the
        underlying estimator exposes such an attribute when fit.

        .. versionadded:: 1.0

    See Also
    --------
    make_pipeline : Convenience function for simplified pipeline construction.

    Examples
    --------
    >>> from sklearn.svm import SVC
    >>> from sklearn.preprocessing import StandardScaler
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.pipeline import Pipeline
    >>> X, y = make_classification(random_state=0)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
    ...                                                     random_state=0)
    >>> pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])
    >>> # The pipeline can be used as any other estimator
    >>> # and avoids leaking the test set into the train set
    >>> pipe.fit(X_train, y_train)
    Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())])
    >>> pipe.score(X_test, y_test)
    0.88
    """

    # BaseEstimator interface
    _required_parameters = ["steps"]

    def __init__(self, steps, *, memory=None, verbose=False):
        self.steps = steps
        self.memory = memory
        self.verbose = verbose
        self._validate_steps()

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Returns the parameters given in the constructor as well as the
        estimators contained within the `steps` of the `Pipeline`.

        Parameters
        ----------
        deep : bool, default=True
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        return self._get_params("steps", deep=deep)

    def set_params(self, **kwargs):
        """Set the parameters of this estimator.

        Valid parameter keys can be listed with ``get_params()``. Note that
        you can directly set the parameters of the estimators contained in
        `steps`.

        Parameters
        ----------
        **kwargs : dict
            Parameters of this estimator or parameters of estimators contained
            in `steps`. Parameters of the steps may be set using its name and
            the parameter name separated by a '__'.

        Returns
        -------
        self : object
            Pipeline class instance.
        """
        self._set_params("steps", **kwargs)
        return self

    def _validate_steps(self):
        """Validate the step names and the interface of every step.

        Raises
        ------
        TypeError
            If an intermediate step is neither `None` nor `'passthrough'` and
            lacks `transform` or lacks both `fit` and `fit_transform`, or if
            the last step is neither `None` nor `'passthrough'` and lacks
            `fit`.
        """
        names, estimators = zip(*self.steps)

        # validate names
        self._validate_names(names)

        # validate estimators
        transformers = estimators[:-1]
        estimator = estimators[-1]

        for t in transformers:
            if t is None or t == "passthrough":
                continue
            if not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not hasattr(
                t, "transform"
            ):
                raise TypeError(
                    "All intermediate steps should be "
                    "transformers and implement fit and transform "
                    "or be the string 'passthrough' "
                    "'%s' (type %s) doesn't" % (t, type(t))
                )

        # We allow last estimator to be None as an identity transformation
        if (
            estimator is not None
            and estimator != "passthrough"
            and not hasattr(estimator, "fit")
        ):
            raise TypeError(
                "Last step of Pipeline should implement fit "
                "or be the string 'passthrough'. "
                "'%s' (type %s) doesn't" % (estimator, type(estimator))
            )

    def _iter(self, with_final=True, filter_passthrough=True):
        """
        Generate (idx, name, trans) tuples from self.steps

        When `with_final` is False, the last step is left out.

        When filter_passthrough is True, 'passthrough' and None transformers
        are filtered out.
        """
        stop = len(self.steps)
        if not with_final:
            stop -= 1

        for idx, (name, trans) in enumerate(islice(self.steps, 0, stop)):
            if not filter_passthrough:
                yield idx, name, trans
            elif trans is not None and trans != "passthrough":
                yield idx, name, trans

    def __len__(self):
        """
        Returns the length of the Pipeline
        """
        return len(self.steps)

    def __getitem__(self, ind):
        """Returns a sub-pipeline or a single estimator in the pipeline

        Indexing with an integer will return an estimator; using a slice
        returns another Pipeline instance which copies a slice of this
        Pipeline. This copy is shallow: modifying (or fitting) estimators in
        the sub-pipeline will affect the larger pipeline and vice-versa.
        However, replacing a value in `steps` will not affect a copy.

        Indexing with anything that the `steps` list rejects with a
        `TypeError` (e.g. a step name) falls back to a lookup in
        `named_steps`.
        """
        if isinstance(ind, slice):
            if ind.step not in (1, None):
                raise ValueError("Pipeline slicing only supports a step of 1")
            return self.__class__(
                self.steps[ind], memory=self.memory, verbose=self.verbose
            )
        try:
            name, est = self.steps[ind]
        except TypeError:
            # Not an int, try get step by name
            return self.named_steps[ind]
        return est

    @property
    def _estimator_type(self):
        """Estimator type, delegated to the last step."""
        return self.steps[-1][1]._estimator_type

    @property
    def named_steps(self):
        """Access the steps by name.

        Read-only attribute to access any step by given name.
        Keys are steps names and values are the steps objects."""
        # Use Bunch object to improve autocomplete
        return Bunch(**dict(self.steps))

    @property
    def _final_estimator(self):
        """Last step of the pipeline, with `None` normalized to 'passthrough'."""
        estimator = self.steps[-1][1]
        return "passthrough" if estimator is None else estimator

    def _log_message(self, step_idx):
        """Return the progress message for step `step_idx`, or None if not verbose."""
        if not self.verbose:
            return None
        name, _ = self.steps[step_idx]

        return "(step %d of %d) Processing %s" % (step_idx + 1, len(self.steps), name)

    def _check_fit_params(self, **fit_params):
        """Split `stepname__parameter` fit params into one dict per step.

        Returns a dict mapping every step name whose step is not `None` to the
        keyword arguments addressed to it.

        Raises
        ------
        ValueError
            If a parameter name does not contain `'__'`.
        KeyError
            If the prefix before `'__'` is not the name of a step, or names a
            step that is `None` (no entry is created for such steps).
        """
        fit_params_steps = {name: {} for name, step in self.steps if step is not None}
        for pname, pval in fit_params.items():
            if "__" not in pname:
                raise ValueError(
                    "Pipeline.fit does not accept the {} parameter. "
                    "You can pass parameters to specific steps of your "
                    "pipeline using the stepname__parameter format, e.g. "
                    "`Pipeline.fit(X, y, logisticregression__sample_weight"
                    "=sample_weight)`.".format(pname)
                )
            # Split on the first '__' only: the remainder may itself contain
            # '__' and is passed to the step untouched.
            step, param = pname.split("__", 1)
            fit_params_steps[step][param] = pval
        return fit_params_steps

    # Estimator interface

    def _fit(self, X, y=None, **fit_params_steps):
        """Fit and apply every step except the last one.

        Each fitted transformer replaces the corresponding entry of
        `self.steps`. `None` and 'passthrough' steps are skipped.

        Returns
        -------
        X : object
            The data after going through all intermediate transformers.
        """
        # shallow copy of steps - this should really be steps_
        self.steps = list(self.steps)
        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(_fit_transform_one)

        for (step_idx, name, transformer) in self._iter(
            with_final=False, filter_passthrough=False
        ):
            if transformer is None or transformer == "passthrough":
                # The context manager is entered only so that the verbose
                # message for the skipped step is still emitted.
                with _print_elapsed_time("Pipeline", self._log_message(step_idx)):
                    continue

            if hasattr(memory, "location"):
                # joblib >= 0.12
                if memory.location is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            elif hasattr(memory, "cachedir"):
                # joblib < 0.11
                if memory.cachedir is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            else:
                cloned_transformer = clone(transformer)
            # Fit or load from cache the current transformer
            X, fitted_transformer = fit_transform_one_cached(
                cloned_transformer,
                X,
                y,
                None,
                message_clsname="Pipeline",
                message=self._log_message(step_idx),
                **fit_params_steps[name],
            )
            # Replace the transformer of the step with the fitted
            # transformer. This is necessary when loading the transformer
            # from the cache.
            self.steps[step_idx] = (name, fitted_transformer)
        return X

    def fit(self, X, y=None, **fit_params):
        """Fit the model.

        Fit all the transformers one after the other and transform the
        data. Finally, fit the transformed data using the final estimator.

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of the
            pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps of
            the pipeline.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.

        Returns
        -------
        self : object
            Pipeline with fitted steps.
        """
        fit_params_steps = self._check_fit_params(**fit_params)
        Xt = self._fit(X, y, **fit_params_steps)
        with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
            if self._final_estimator != "passthrough":
                fit_params_last_step = fit_params_steps[self.steps[-1][0]]
                self._final_estimator.fit(Xt, y, **fit_params_last_step)

        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit the model and transform with the final estimator.

        Fits all the transformers one after the other and transform the
        data. Then uses `fit_transform` on transformed data with the final
        estimator.

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of the
            pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps of
            the pipeline.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_transformed_features)
            Transformed samples.
        """
        fit_params_steps = self._check_fit_params(**fit_params)
        Xt = self._fit(X, y, **fit_params_steps)

        last_step = self._final_estimator
        with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
            if last_step == "passthrough":
                return Xt
            fit_params_last_step = fit_params_steps[self.steps[-1][0]]
            if hasattr(last_step, "fit_transform"):
                return last_step.fit_transform(Xt, y, **fit_params_last_step)
            else:
                return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)

    @available_if(_final_estimator_has("predict"))
    def predict(self, X, **predict_params):
        """Transform the data, and apply `predict` with the final estimator.

        Call `transform` of each transformer in the pipeline. The transformed
        data are finally passed to the final estimator that calls `predict`
        method. Only valid if the final estimator implements `predict`.

        Parameters
        ----------
        X : iterable
            Data to predict on. Must fulfill input requirements of first step
            of the pipeline.

        **predict_params : dict of string -> object
            Parameters to the ``predict`` called at the end of all
            transformations in the pipeline. Note that while this may be
            used to return uncertainties from some models with return_std
            or return_cov, uncertainties that are generated by the
            transformations in the pipeline are not propagated to the
            final estimator.

            .. versionadded:: 0.20

        Returns
        -------
        y_pred : ndarray
            Result of calling `predict` on the final estimator.
        """
        Xt = X
        for _, _, transform in self._iter(with_final=False):
            Xt = transform.transform(Xt)
        return self.steps[-1][1].predict(Xt, **predict_params)

    @available_if(_final_estimator_has("fit_predict"))
    def fit_predict(self, X, y=None, **fit_params):
        """Transform the data, and apply `fit_predict` with the final estimator.

        Call `fit_transform` of each transformer in the pipeline. The
        transformed data are finally passed to the final estimator that calls
        `fit_predict` method. Only valid if the final estimator implements
        `fit_predict`.

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of
            the pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps
            of the pipeline.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.

        Returns
        -------
        y_pred : ndarray
            Result of calling `fit_predict` on the final estimator.
        """
        fit_params_steps = self._check_fit_params(**fit_params)
        Xt = self._fit(X, y, **fit_params_steps)

        fit_params_last_step = fit_params_steps[self.steps[-1][0]]
        with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
            y_pred = self.steps[-1][1].fit_predict(Xt, y, **fit_params_last_step)
        return y_pred

    @available_if(_final_estimator_has("predict_proba"))
    def predict_proba(self, X, **predict_proba_params):
        """Transform the data, and apply `predict_proba` with the final estimator.

        Call `transform` of each transformer in the pipeline. The transformed
        data are finally passed to the final estimator that calls
        `predict_proba` method. Only valid if the final estimator implements
        `predict_proba`.

        Parameters
        ----------
        X : iterable
            Data to predict on. Must fulfill input requirements of first step
            of the pipeline.

        **predict_proba_params : dict of string -> object
            Parameters to the `predict_proba` called at the end of all
            transformations in the pipeline.

        Returns
        -------
        y_proba : ndarray of shape (n_samples, n_classes)
            Result of calling `predict_proba` on the final estimator.
        """
        Xt = X
        for _, _, transform in self._iter(with_final=False):
            Xt = transform.transform(Xt)
        return self.steps[-1][1].predict_proba(Xt, **predict_proba_params)

    @available_if(_final_estimator_has("decision_function"))
    def decision_function(self, X):
        """Transform the data, and apply `decision_function` with the final estimator.

        Call `transform` of each transformer in the pipeline. The transformed
        data are finally passed to the final estimator that calls
        `decision_function` method. Only valid if the final estimator
        implements `decision_function`.

        Parameters
        ----------
        X : iterable
            Data to predict on. Must fulfill input requirements of first step
            of the pipeline.

        Returns
        -------
        y_score : ndarray of shape (n_samples, n_classes)
            Result of calling `decision_function` on the final estimator.
        """
        Xt = X
        for _, _, transform in self._iter(with_final=False):
            Xt = transform.transform(Xt)
        return self.steps[-1][1].decision_function(Xt)

    @available_if(_final_estimator_has("score_samples"))
    def score_samples(self, X):
        """Transform the data, and apply `score_samples` with the final estimator.

        Call `transform` of each transformer in the pipeline. The transformed
        data are finally passed to the final estimator that calls
        `score_samples` method. Only valid if the final estimator implements
        `score_samples`.

        Parameters
        ----------
        X : iterable
            Data to predict on. Must fulfill input requirements of first step
            of the pipeline.

        Returns
        -------
        y_score : ndarray of shape (n_samples,)
            Result of calling `score_samples` on the final estimator.
        """
        Xt = X
        for _, _, transformer in self._iter(with_final=False):
            Xt = transformer.transform(Xt)
        return self.steps[-1][1].score_samples(Xt)

    @available_if(_final_estimator_has("predict_log_proba"))
    def predict_log_proba(self, X, **predict_log_proba_params):
        """Transform the data, and apply `predict_log_proba` with the final estimator.

        Call `transform` of each transformer in the pipeline. The transformed
        data are finally passed to the final estimator that calls
        `predict_log_proba` method. Only valid if the final estimator
        implements `predict_log_proba`.

        Parameters
        ----------
        X : iterable
            Data to predict on. Must fulfill input requirements of first step
            of the pipeline.

        **predict_log_proba_params : dict of string -> object
            Parameters to the ``predict_log_proba`` called at the end of all
            transformations in the pipeline.

        Returns
        -------
        y_log_proba : ndarray of shape (n_samples, n_classes)
            Result of calling `predict_log_proba` on the final estimator.
        """
        Xt = X
        for _, _, transform in self._iter(with_final=False):
            Xt = transform.transform(Xt)
        return self.steps[-1][1].predict_log_proba(Xt, **predict_log_proba_params)

    def _can_transform(self):
        """Return True if the final step is 'passthrough' or has `transform`."""
        return self._final_estimator == "passthrough" or hasattr(
            self._final_estimator, "transform"
        )

    @available_if(_can_transform)
    def transform(self, X):
        """Transform the data, and apply `transform` with the final estimator.

        Call `transform` of each transformer in the pipeline. The transformed
        data are finally passed to the final estimator that calls
        `transform` method. Only valid if the final estimator
        implements `transform`.

        This also works where final estimator is `None` in which case all prior
        transformations are applied.

        Parameters
        ----------
        X : iterable
            Data to transform. Must fulfill input requirements of first step
            of the pipeline.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_transformed_features)
            Transformed data.
        """
        Xt = X
        for _, _, transform in self._iter():
            Xt = transform.transform(Xt)
        return Xt

    def _can_inverse_transform(self):
        """Return True if every non-passthrough step has `inverse_transform`."""
        return all(hasattr(t, "inverse_transform") for _, _, t in self._iter())

    @available_if(_can_inverse_transform)
    def inverse_transform(self, Xt):
        """Apply `inverse_transform` for each step in a reverse order.

        All estimators in the pipeline must support `inverse_transform`.

        Parameters
        ----------
        Xt : array-like of shape (n_samples, n_transformed_features)
            Data samples, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features. Must fulfill
            input requirements of last step of pipeline's
            ``inverse_transform`` method.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_features)
            Inverse transformed data, that is, data in the original feature
            space.
        """
        # `_iter` is a generator, so it is materialized before reversing.
        reverse_iter = reversed(list(self._iter()))
        for _, _, transform in reverse_iter:
            Xt = transform.inverse_transform(Xt)
        return Xt

    @available_if(_final_estimator_has("score"))
    def score(self, X, y=None, sample_weight=None):
        """Transform the data, and apply `score` with the final estimator.

        Call `transform` of each transformer in the pipeline. The transformed
        data are finally passed to the final estimator that calls
        `score` method. Only valid if the final estimator implements `score`.

        Parameters
        ----------
        X : iterable
            Data to predict on. Must fulfill input requirements of first step
            of the pipeline.

        y : iterable, default=None
            Targets used for scoring. Must fulfill label requirements for all
            steps of the pipeline.

        sample_weight : array-like, default=None
            If not None, this argument is passed as ``sample_weight`` keyword
            argument to the ``score`` method of the final estimator.

        Returns
        -------
        score : float
            Result of calling `score` on the final estimator.
        """
        Xt = X
        for _, _, transform in self._iter(with_final=False):
            Xt = transform.transform(Xt)
        # Only forward `sample_weight` when given, so that final estimators
        # whose `score` does not accept it keep working.
        score_params = {}
        if sample_weight is not None:
            score_params["sample_weight"] = sample_weight
        return self.steps[-1][1].score(Xt, y, **score_params)

    @property
    def classes_(self):
        """The class labels. Only exists if the last step is a classifier."""
        return self.steps[-1][1].classes_

    def _more_tags(self):
        # check if first estimator expects pairwise input
        return {"pairwise": _safe_tags(self.steps[0][1], "pairwise")}

    # TODO: Remove in 1.1
    # mypy error: Decorated property not supported
    @deprecated(  # type: ignore
        "Attribute `_pairwise` was deprecated in "
        "version 0.24 and will be removed in 1.1 (renaming of 0.26)."
    )
    @property
    def _pairwise(self):
        # check if first estimator expects pairwise input
        return getattr(self.steps[0][1], "_pairwise", False)

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Transform input features using the pipeline.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.

        Raises
        ------
        AttributeError
            If a non-passthrough step does not provide
            `get_feature_names_out`.
        """
        feature_names_out = input_features
        for _, name, transform in self._iter():
            if not hasattr(transform, "get_feature_names_out"):
                raise AttributeError(
                    "Estimator {} does not provide get_feature_names_out. "
                    "Did you mean to call pipeline[:-1].get_feature_names_out"
                    "()?".format(name)
                )
            feature_names_out = transform.get_feature_names_out(feature_names_out)
        return feature_names_out

    @property
    def n_features_in_(self):
        """Number of features seen during first step `fit` method."""
        # delegate to first step (which will call _check_is_fitted)
        return self.steps[0][1].n_features_in_

    @property
    def feature_names_in_(self):
        """Names of features seen during first step `fit` method."""
        # delegate to first step (which will call _check_is_fitted)
        return self.steps[0][1].feature_names_in_

    def __sklearn_is_fitted__(self):
        """Indicate whether pipeline has been fit.

        Only the last step that is an actual estimator is checked. A pipeline
        whose steps are all `None` or 'passthrough' has nothing to fit and is
        reported as fitted.
        """
        # Find the last step that is a real estimator. 'passthrough' and None
        # placeholders are skipped: they are not estimators, so passing them
        # to `check_is_fitted` raises TypeError instead of answering the
        # question.
        last_step = None
        for _, estimator in reversed(self.steps):
            if estimator is not None and estimator != "passthrough":
                last_step = estimator
                break

        if last_step is None:
            # Every step is a placeholder.
            return True

        try:
            # we only check the last estimator since if it is fit, it
            # means the previous steps should also be fit. This is faster than
            # checking if every step of the pipeline is fit.
            check_is_fitted(last_step)
            return True
        except NotFittedError:
            return False

    def _sk_visual_block_(self):
        """Build the `_VisualBlock` describing the steps as a serial chain."""
        _, estimators = zip(*self.steps)

        def _get_name(name, est):
            if est is None or est == "passthrough":
                return f"{name}: passthrough"
            # Is an estimator
            return f"{name}: {est.__class__.__name__}"

        names = [_get_name(name, est) for name, est in self.steps]
        name_details = [str(est) for est in estimators]
        return _VisualBlock(
            "serial",
            estimators,
            names=names,
            name_details=name_details,
            dash_wrapped=False,
        )
801
802
def _name_estimators(estimators):
    """Generate names for estimators.

    A string is used as its own name; any other object is named after the
    lowercase name of its type. Names that occur more than once receive a
    ``-<k>`` suffix, numbered from 1 in order of appearance.

    Parameters
    ----------
    estimators : sequence
        Estimator objects (or strings) to be named.

    Returns
    -------
    named_estimators : list of tuple
        List of `(name, estimator)` pairs, in the order of `estimators`.
    """
    base_names = [
        est if isinstance(est, str) else type(est).__name__.lower()
        for est in estimators
    ]

    occurrences = defaultdict(int)
    for base in base_names:
        occurrences[base] += 1

    # Only names shared by several estimators need a disambiguating suffix.
    remaining = {base: count for base, count in occurrences.items() if count > 1}

    # Walk backwards while counting down so that the first occurrence of a
    # duplicated name ends up with the suffix "-1".
    for pos in range(len(estimators) - 1, -1, -1):
        base = base_names[pos]
        if base in remaining:
            base_names[pos] = base + "-%d" % remaining[base]
            remaining[base] -= 1

    return list(zip(base_names, estimators))
825
826
def make_pipeline(*steps, memory=None, verbose=False):
    """Construct a :class:`Pipeline` from the given estimators.

    Shorthand for the :class:`Pipeline` constructor in which the steps are
    not named by the caller: each name is generated automatically from the
    lowercase name of the estimator's type.

    Parameters
    ----------
    *steps : list of Estimator objects
        List of the scikit-learn estimators that are chained together.

    memory : str or object with the joblib.Memory interface, default=None
        Used to cache the fitted transformers of the pipeline. By default,
        no caching is performed. If a string is given, it is the path to
        the caching directory. Enabling caching triggers a clone of
        the transformers before fitting. Therefore, the transformer
        instance given to the pipeline cannot be inspected
        directly. Use the attribute ``named_steps`` or ``steps`` to
        inspect estimators within the pipeline. Caching the
        transformers is advantageous when fitting is time consuming.

    verbose : bool, default=False
        If True, the time elapsed while fitting each step will be printed as it
        is completed.

    Returns
    -------
    p : Pipeline
        Returns a scikit-learn :class:`Pipeline` object.

    See Also
    --------
    Pipeline : Class for creating a pipeline of transforms with a final
        estimator.

    Examples
    --------
    >>> from sklearn.naive_bayes import GaussianNB
    >>> from sklearn.preprocessing import StandardScaler
    >>> from sklearn.pipeline import make_pipeline
    >>> make_pipeline(StandardScaler(), GaussianNB(priors=None))
    Pipeline(steps=[('standardscaler', StandardScaler()),
                    ('gaussiannb', GaussianNB())])
    """
    # Pair every estimator with its generated name, then hand the named
    # steps to the regular constructor.
    named_steps = _name_estimators(steps)
    return Pipeline(named_steps, memory=memory, verbose=verbose)
873
874
def _transform_one(transformer, X, y, weight, **fit_params):
    """Transform ``X`` with ``transformer`` and optionally scale the result.

    ``y`` and ``fit_params`` are accepted but not used. When ``weight`` is
    not ``None`` the transformed output is multiplied by it.
    """
    transformed = transformer.transform(X)
    # Only ``None`` means "no weight"; a weight of 0 is still applied.
    return transformed if weight is None else transformed * weight
881
882
def _fit_transform_one(
    transformer, X, y, weight, message_clsname="", message=None, **fit_params
):
    """
    Fit ``transformer`` on ``X`` and ``y`` and transform ``X`` with it.

    ``fit_transform`` is used when the transformer provides it; otherwise
    ``fit`` followed by ``transform`` is called. Returns the pair
    ``(transformed data, transformer)``, where the transformed data is
    multiplied by ``weight`` unless ``weight`` is ``None``.
    """
    with _print_elapsed_time(message_clsname, message):
        if not hasattr(transformer, "fit_transform"):
            fitted = transformer.fit(X, y, **fit_params)
            transformed = fitted.transform(X)
        else:
            transformed = transformer.fit_transform(X, y, **fit_params)

    if weight is not None:
        transformed = transformed * weight
    return transformed, transformer
900
901
def _fit_one(transformer, X, y, weight, message_clsname="", message=None, **fit_params):
    """
    Fit ``transformer`` on ``X`` and ``y`` and return the result of ``fit``.

    ``weight`` is accepted but not used here.
    """
    with _print_elapsed_time(message_clsname, message):
        fitted = transformer.fit(X, y, **fit_params)
    return fitted
908
909
class FeatureUnion(TransformerMixin, _BaseComposition):
    """Concatenates results of multiple transformer objects.

    Every transformer of `transformer_list` is applied to the same input
    data, possibly in parallel, and the individual results are stacked side
    by side. This is useful to combine several feature extraction mechanisms
    into a single transformer.

    Parameters of the transformers may be set using its name and the parameter
    name separated by a '__'. A transformer may be replaced entirely by
    setting the parameter with its name to another transformer,
    or removed by setting to 'drop'.

    Read more in the :ref:`User Guide <feature_union>`.

    .. versionadded:: 0.13

    Parameters
    ----------
    transformer_list : list of tuple
        List of tuple containing `(str, transformer)`. The first element
        of the tuple is name affected to the transformer while the
        second element is a scikit-learn transformer instance.
        The transformer instance can also be `"drop"` for it to be
        ignored.

        .. versionchanged:: 0.22
           Deprecated `None` as a transformer in favor of 'drop'.

    n_jobs : int, default=None
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

        .. versionchanged:: v0.20
           `n_jobs` default changed from 1 to None

    transformer_weights : dict, default=None
        Multiplicative weights for features per transformer.
        Keys are transformer names, values the weights.
        Raises ValueError if key not present in ``transformer_list``.

    verbose : bool, default=False
        If True, the time elapsed while fitting each transformer will be
        printed as it is completed.

    Attributes
    ----------
    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying first transformer in `transformer_list` exposes such an
        attribute when fit.

        .. versionadded:: 0.24

    See Also
    --------
    make_union : Convenience function for simplified feature union
        construction.

    Examples
    --------
    >>> from sklearn.pipeline import FeatureUnion
    >>> from sklearn.decomposition import PCA, TruncatedSVD
    >>> union = FeatureUnion([("pca", PCA(n_components=1)),
    ...                       ("svd", TruncatedSVD(n_components=2))])
    >>> X = [[0., 1., 3], [2., 2., 5]]
    >>> union.fit_transform(X)
    array([[ 1.5       ,  3.0...,  0.8...],
           [-1.5       ,  5.7..., -0.4...]])
    """

    _required_parameters = ["transformer_list"]

    def __init__(
        self, transformer_list, *, n_jobs=None, transformer_weights=None, verbose=False
    ):
        self.transformer_list = transformer_list
        self.n_jobs = n_jobs
        self.transformer_weights = transformer_weights
        self.verbose = verbose
        # Names and transformer interfaces are checked at construction time.
        self._validate_transformers()

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Returns the parameters given in the constructor as well as the
        estimators contained within the `transformer_list` of the
        `FeatureUnion`.

        Parameters
        ----------
        deep : bool, default=True
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        return self._get_params("transformer_list", deep=deep)

    def set_params(self, **kwargs):
        """Set the parameters of this estimator.

        Valid parameter keys can be listed with ``get_params()``. Note that
        you can directly set the parameters of the estimators contained in
        `transformer_list`.

        Parameters
        ----------
        **kwargs : dict
            Parameters of this estimator or parameters of estimators contained
            in `transformer_list`. Parameters of the transformers may be set
            using its name and the parameter name separated by a '__'.

        Returns
        -------
        self : object
            FeatureUnion class instance.
        """
        self._set_params("transformer_list", **kwargs)
        return self

    def _validate_transformers(self):
        """Check the step names and that each transformer can fit and transform."""
        names, transformers = zip(*self.transformer_list)

        # validate names
        self._validate_names(names)

        # validate estimators
        for candidate in transformers:
            if candidate == "drop":
                continue
            can_fit = hasattr(candidate, "fit") or hasattr(candidate, "fit_transform")
            if can_fit and hasattr(candidate, "transform"):
                continue
            raise TypeError(
                "All estimators should implement fit and "
                "transform. '%s' (type %s) doesn't" % (candidate, type(candidate))
            )

    def _validate_transformer_weights(self):
        """Raise ValueError if a weight refers to an unknown transformer name."""
        if not self.transformer_weights:
            return

        known_names = {name for name, _ in self.transformer_list}
        for name in self.transformer_weights:
            if name in known_names:
                continue
            raise ValueError(
                f'Attempting to weight transformer "{name}", '
                "but it is not present in transformer_list."
            )

    def _iter(self):
        """
        Generate (name, trans, weight) tuples excluding 'drop' transformers.

        The weight is ``None`` for transformers without an entry in
        ``transformer_weights``.
        """
        weights = self.transformer_weights or {}
        return (
            (name, trans, weights.get(name))
            for name, trans in self.transformer_list
            if trans != "drop"
        )

    @deprecated(
        "get_feature_names is deprecated in 1.0 and will be removed "
        "in 1.2. Please use get_feature_names_out instead."
    )
    def get_feature_names(self):
        """Get feature names from all transformers.

        Returns
        -------
        feature_names : list of strings
            Names of the features produced by transform.
        """
        collected = []
        for name, trans, _ in self._iter():
            if hasattr(trans, "get_feature_names"):
                # Prefix each feature with the name of its transformer.
                collected.extend(
                    [name + "__" + f for f in trans.get_feature_names()]
                )
                continue
            raise AttributeError(
                "Transformer %s (type %s) does not provide get_feature_names."
                % (str(name), type(trans).__name__)
            )
        return collected

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        collected = []
        for name, trans, _ in self._iter():
            if hasattr(trans, "get_feature_names_out"):
                # Prefix each feature with the name of its transformer.
                collected.extend(
                    [
                        f"{name}__{f}"
                        for f in trans.get_feature_names_out(input_features)
                    ]
                )
                continue
            raise AttributeError(
                "Transformer %s (type %s) does not provide get_feature_names_out."
                % (str(name), type(trans).__name__)
            )
        return np.asarray(collected, dtype=object)

    def fit(self, X, y=None, **fit_params):
        """Fit all transformers using X.

        Parameters
        ----------
        X : iterable or array-like, depending on transformers
            Input data, used to fit transformers.

        y : array-like of shape (n_samples, n_outputs), default=None
            Targets for supervised learning.

        **fit_params : dict, default=None
            Parameters to pass to the fit method of the estimator.

        Returns
        -------
        self : object
            FeatureUnion class instance.
        """
        fitted = self._parallel_func(X, y, fit_params, _fit_one)
        if fitted:
            self._update_transformer_list(fitted)
        # An empty result means every transformer was dropped, in which case
        # there is nothing to store.
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit all transformers, transform the data and concatenate results.

        Parameters
        ----------
        X : iterable or array-like, depending on transformers
            Input data to be transformed.

        y : array-like of shape (n_samples, n_outputs), default=None
            Targets for supervised learning.

        **fit_params : dict, default=None
            Parameters to pass to the fit method of the estimator.

        Returns
        -------
        X_t : array-like or sparse matrix of \
                shape (n_samples, sum_n_components)
            The `hstack` of results of transformers. `sum_n_components` is the
            sum of `n_components` (output dimension) over transformers.
        """
        results = self._parallel_func(X, y, fit_params, _fit_transform_one)
        if results:
            # Each result is a (transformed data, fitted transformer) pair.
            Xs, fitted = zip(*results)
            self._update_transformer_list(fitted)
            return self._hstack(Xs)

        # Every transformer was dropped: return an array without columns.
        return np.zeros((X.shape[0], 0))

    def _log_message(self, name, idx, total):
        """Return the progress message for one transformer, or None if quiet."""
        if self.verbose:
            return "(step %d of %d) Processing %s" % (idx, total, name)
        return None

    def _parallel_func(self, X, y, fit_params, func):
        """Runs func in parallel on X and y"""
        self.transformer_list = list(self.transformer_list)
        self._validate_transformers()
        self._validate_transformer_weights()
        active = list(self._iter())
        total = len(active)

        runner = Parallel(n_jobs=self.n_jobs)
        return runner(
            delayed(func)(
                trans,
                X,
                y,
                weight,
                message_clsname="FeatureUnion",
                message=self._log_message(name, position, total),
                **fit_params,
            )
            for position, (name, trans, weight) in enumerate(active, start=1)
        )

    def transform(self, X):
        """Transform X separately by each transformer, concatenate results.

        Parameters
        ----------
        X : iterable or array-like, depending on transformers
            Input data to be transformed.

        Returns
        -------
        X_t : array-like or sparse matrix of \
                shape (n_samples, sum_n_components)
            The `hstack` of results of transformers. `sum_n_components` is the
            sum of `n_components` (output dimension) over transformers.
        """
        runner = Parallel(n_jobs=self.n_jobs)
        Xs = runner(
            delayed(_transform_one)(trans, X, None, weight)
            for _, trans, weight in self._iter()
        )
        if Xs:
            return self._hstack(Xs)

        # Every transformer was dropped: return an array without columns.
        return np.zeros((X.shape[0], 0))

    def _hstack(self, Xs):
        """Stack the outputs horizontally, as CSR if any of them is sparse."""
        if any(sparse.issparse(block) for block in Xs):
            return sparse.hstack(Xs).tocsr()
        return np.hstack(Xs)

    def _update_transformer_list(self, transformers):
        """Replace each non-dropped entry, in order, by the next fitted one."""
        fitted = iter(transformers)
        refreshed = []
        for name, current in self.transformer_list:
            replacement = current if current == "drop" else next(fitted)
            refreshed.append((name, replacement))
        # Slice assignment keeps the same list object and updates it in place.
        self.transformer_list[:] = refreshed

    @property
    def n_features_in_(self):
        """Number of features seen during :term:`fit`."""

        # X is passed to all transformers so we just delegate to the first one
        first_transformer = self.transformer_list[0][1]
        return first_transformer.n_features_in_

    def _sk_visual_block_(self):
        """Build the `_VisualBlock` showing the transformers side by side."""
        names, transformers = zip(*self.transformer_list)
        return _VisualBlock("parallel", transformers, names=names)
1256
1257
1258def make_union(*transformers, n_jobs=None, verbose=False):
1259    """
1260    Construct a FeatureUnion from the given transformers.
1261
1262    This is a shorthand for the FeatureUnion constructor; it does not require,
1263    and does not permit, naming the transformers. Instead, they will be given
1264    names automatically based on their types. It also does not allow weighting.
1265
1266    Parameters
1267    ----------
1268    *transformers : list of estimators
1269
1270    n_jobs : int, default=None
1271        Number of jobs to run in parallel.
1272        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
1273        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
1274        for more details.
1275
1276        .. versionchanged:: v0.20
1277           `n_jobs` default changed from 1 to None
1278
1279    verbose : bool, default=False
1280        If True, the time elapsed while fitting each transformer will be
1281        printed as it is completed.
1282
1283    Returns
1284    -------
1285    f : FeatureUnion
1286
1287    See Also
1288    --------
1289    FeatureUnion : Class for concatenating the results of multiple transformer
1290        objects.
1291
1292    Examples
1293    --------
1294    >>> from sklearn.decomposition import PCA, TruncatedSVD
1295    >>> from sklearn.pipeline import make_union
1296    >>> make_union(PCA(), TruncatedSVD())
1297     FeatureUnion(transformer_list=[('pca', PCA()),
1298                                   ('truncatedsvd', TruncatedSVD())])
1299    """
1300    return FeatureUnion(_name_estimators(transformers), n_jobs=n_jobs, verbose=verbose)
1301