1""" 2The :mod:`sklearn.pipeline` module implements utilities to build a composite 3estimator, as a chain of transforms and estimators. 4""" 5# Author: Edouard Duchesnay 6# Gael Varoquaux 7# Virgile Fritsch 8# Alexandre Gramfort 9# Lars Buitinck 10# License: BSD 11 12from collections import defaultdict 13from itertools import islice 14 15import numpy as np 16from scipy import sparse 17from joblib import Parallel 18 19from .base import clone, TransformerMixin 20from .utils._estimator_html_repr import _VisualBlock 21from .utils.metaestimators import available_if 22from .utils import ( 23 Bunch, 24 _print_elapsed_time, 25) 26from .utils.deprecation import deprecated 27from .utils._tags import _safe_tags 28from .utils.validation import check_memory 29from .utils.validation import check_is_fitted 30from .utils.fixes import delayed 31from .exceptions import NotFittedError 32 33from .utils.metaestimators import _BaseComposition 34 35__all__ = ["Pipeline", "FeatureUnion", "make_pipeline", "make_union"] 36 37 38def _final_estimator_has(attr): 39 """Check that final_estimator has `attr`. 40 41 Used together with `avaliable_if` in `Pipeline`.""" 42 43 def check(self): 44 # raise original `AttributeError` if `attr` does not exist 45 getattr(self._final_estimator, attr) 46 return True 47 48 return check 49 50 51class Pipeline(_BaseComposition): 52 """ 53 Pipeline of transforms with a final estimator. 54 55 Sequentially apply a list of transforms and a final estimator. 56 Intermediate steps of the pipeline must be 'transforms', that is, they 57 must implement `fit` and `transform` methods. 58 The final estimator only needs to implement `fit`. 59 The transformers in the pipeline can be cached using ``memory`` argument. 60 61 The purpose of the pipeline is to assemble several steps that can be 62 cross-validated together while setting different parameters. For this, it 63 enables setting parameters of the various steps using their names and the 64 parameter name separated by a `'__'`, as in the example below. A step's 65 estimator may be replaced entirely by setting the parameter with its name 66 to another estimator, or a transformer removed by setting it to 67 `'passthrough'` or `None`. 68 69 Read more in the :ref:`User Guide <pipeline>`. 70 71 .. versionadded:: 0.5 72 73 Parameters 74 ---------- 75 steps : list of tuple 76 List of (name, transform) tuples (implementing `fit`/`transform`) that 77 are chained, in the order in which they are chained, with the last 78 object an estimator. 79 80 memory : str or object with the joblib.Memory interface, default=None 81 Used to cache the fitted transformers of the pipeline. By default, 82 no caching is performed. If a string is given, it is the path to 83 the caching directory. Enabling caching triggers a clone of 84 the transformers before fitting. Therefore, the transformer 85 instance given to the pipeline cannot be inspected 86 directly. Use the attribute ``named_steps`` or ``steps`` to 87 inspect estimators within the pipeline. Caching the 88 transformers is advantageous when fitting is time consuming. 89 90 verbose : bool, default=False 91 If True, the time elapsed while fitting each step will be printed as it 92 is completed. 93 94 Attributes 95 ---------- 96 named_steps : :class:`~sklearn.utils.Bunch` 97 Dictionary-like object, with the following attributes. 98 Read-only attribute to access any step parameter by user given name. 99 Keys are step names and values are steps parameters. 100 101 classes_ : ndarray of shape (n_classes,) 102 The classes labels. Only exist if the last step of the pipeline is a 103 classifier. 104 105 n_features_in_ : int 106 Number of features seen during :term:`fit`. Only defined if the 107 underlying first estimator in `steps` exposes such an attribute 108 when fit. 109 110 .. versionadded:: 0.24 111 112 feature_names_in_ : ndarray of shape (`n_features_in_`,) 113 Names of features seen during :term:`fit`. Only defined if the 114 underlying estimator exposes such an attribute when fit. 115 116 .. versionadded:: 1.0 117 118 See Also 119 -------- 120 make_pipeline : Convenience function for simplified pipeline construction. 121 122 Examples 123 -------- 124 >>> from sklearn.svm import SVC 125 >>> from sklearn.preprocessing import StandardScaler 126 >>> from sklearn.datasets import make_classification 127 >>> from sklearn.model_selection import train_test_split 128 >>> from sklearn.pipeline import Pipeline 129 >>> X, y = make_classification(random_state=0) 130 >>> X_train, X_test, y_train, y_test = train_test_split(X, y, 131 ... random_state=0) 132 >>> pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())]) 133 >>> # The pipeline can be used as any other estimator 134 >>> # and avoids leaking the test set into the train set 135 >>> pipe.fit(X_train, y_train) 136 Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())]) 137 >>> pipe.score(X_test, y_test) 138 0.88 139 """ 140 141 # BaseEstimator interface 142 _required_parameters = ["steps"] 143 144 def __init__(self, steps, *, memory=None, verbose=False): 145 self.steps = steps 146 self.memory = memory 147 self.verbose = verbose 148 self._validate_steps() 149 150 def get_params(self, deep=True): 151 """Get parameters for this estimator. 152 153 Returns the parameters given in the constructor as well as the 154 estimators contained within the `steps` of the `Pipeline`. 155 156 Parameters 157 ---------- 158 deep : bool, default=True 159 If True, will return the parameters for this estimator and 160 contained subobjects that are estimators. 161 162 Returns 163 ------- 164 params : mapping of string to any 165 Parameter names mapped to their values. 166 """ 167 return self._get_params("steps", deep=deep) 168 169 def set_params(self, **kwargs): 170 """Set the parameters of this estimator. 171 172 Valid parameter keys can be listed with ``get_params()``. Note that 173 you can directly set the parameters of the estimators contained in 174 `steps`. 175 176 Parameters 177 ---------- 178 **kwargs : dict 179 Parameters of this estimator or parameters of estimators contained 180 in `steps`. Parameters of the steps may be set using its name and 181 the parameter name separated by a '__'. 182 183 Returns 184 ------- 185 self : object 186 Pipeline class instance. 187 """ 188 self._set_params("steps", **kwargs) 189 return self 190 191 def _validate_steps(self): 192 names, estimators = zip(*self.steps) 193 194 # validate names 195 self._validate_names(names) 196 197 # validate estimators 198 transformers = estimators[:-1] 199 estimator = estimators[-1] 200 201 for t in transformers: 202 if t is None or t == "passthrough": 203 continue 204 if not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not hasattr( 205 t, "transform" 206 ): 207 raise TypeError( 208 "All intermediate steps should be " 209 "transformers and implement fit and transform " 210 "or be the string 'passthrough' " 211 "'%s' (type %s) doesn't" % (t, type(t)) 212 ) 213 214 # We allow last estimator to be None as an identity transformation 215 if ( 216 estimator is not None 217 and estimator != "passthrough" 218 and not hasattr(estimator, "fit") 219 ): 220 raise TypeError( 221 "Last step of Pipeline should implement fit " 222 "or be the string 'passthrough'. " 223 "'%s' (type %s) doesn't" % (estimator, type(estimator)) 224 ) 225 226 def _iter(self, with_final=True, filter_passthrough=True): 227 """ 228 Generate (idx, (name, trans)) tuples from self.steps 229 230 When filter_passthrough is True, 'passthrough' and None transformers 231 are filtered out. 232 """ 233 stop = len(self.steps) 234 if not with_final: 235 stop -= 1 236 237 for idx, (name, trans) in enumerate(islice(self.steps, 0, stop)): 238 if not filter_passthrough: 239 yield idx, name, trans 240 elif trans is not None and trans != "passthrough": 241 yield idx, name, trans 242 243 def __len__(self): 244 """ 245 Returns the length of the Pipeline 246 """ 247 return len(self.steps) 248 249 def __getitem__(self, ind): 250 """Returns a sub-pipeline or a single estimator in the pipeline 251 252 Indexing with an integer will return an estimator; using a slice 253 returns another Pipeline instance which copies a slice of this 254 Pipeline. This copy is shallow: modifying (or fitting) estimators in 255 the sub-pipeline will affect the larger pipeline and vice-versa. 256 However, replacing a value in `step` will not affect a copy. 257 """ 258 if isinstance(ind, slice): 259 if ind.step not in (1, None): 260 raise ValueError("Pipeline slicing only supports a step of 1") 261 return self.__class__( 262 self.steps[ind], memory=self.memory, verbose=self.verbose 263 ) 264 try: 265 name, est = self.steps[ind] 266 except TypeError: 267 # Not an int, try get step by name 268 return self.named_steps[ind] 269 return est 270 271 @property 272 def _estimator_type(self): 273 return self.steps[-1][1]._estimator_type 274 275 @property 276 def named_steps(self): 277 """Access the steps by name. 278 279 Read-only attribute to access any step by given name. 280 Keys are steps names and values are the steps objects.""" 281 # Use Bunch object to improve autocomplete 282 return Bunch(**dict(self.steps)) 283 284 @property 285 def _final_estimator(self): 286 estimator = self.steps[-1][1] 287 return "passthrough" if estimator is None else estimator 288 289 def _log_message(self, step_idx): 290 if not self.verbose: 291 return None 292 name, _ = self.steps[step_idx] 293 294 return "(step %d of %d) Processing %s" % (step_idx + 1, len(self.steps), name) 295 296 def _check_fit_params(self, **fit_params): 297 fit_params_steps = {name: {} for name, step in self.steps if step is not None} 298 for pname, pval in fit_params.items(): 299 if "__" not in pname: 300 raise ValueError( 301 "Pipeline.fit does not accept the {} parameter. " 302 "You can pass parameters to specific steps of your " 303 "pipeline using the stepname__parameter format, e.g. " 304 "`Pipeline.fit(X, y, logisticregression__sample_weight" 305 "=sample_weight)`.".format(pname) 306 ) 307 step, param = pname.split("__", 1) 308 fit_params_steps[step][param] = pval 309 return fit_params_steps 310 311 # Estimator interface 312 313 def _fit(self, X, y=None, **fit_params_steps): 314 # shallow copy of steps - this should really be steps_ 315 self.steps = list(self.steps) 316 self._validate_steps() 317 # Setup the memory 318 memory = check_memory(self.memory) 319 320 fit_transform_one_cached = memory.cache(_fit_transform_one) 321 322 for (step_idx, name, transformer) in self._iter( 323 with_final=False, filter_passthrough=False 324 ): 325 if transformer is None or transformer == "passthrough": 326 with _print_elapsed_time("Pipeline", self._log_message(step_idx)): 327 continue 328 329 if hasattr(memory, "location"): 330 # joblib >= 0.12 331 if memory.location is None: 332 # we do not clone when caching is disabled to 333 # preserve backward compatibility 334 cloned_transformer = transformer 335 else: 336 cloned_transformer = clone(transformer) 337 elif hasattr(memory, "cachedir"): 338 # joblib < 0.11 339 if memory.cachedir is None: 340 # we do not clone when caching is disabled to 341 # preserve backward compatibility 342 cloned_transformer = transformer 343 else: 344 cloned_transformer = clone(transformer) 345 else: 346 cloned_transformer = clone(transformer) 347 # Fit or load from cache the current transformer 348 X, fitted_transformer = fit_transform_one_cached( 349 cloned_transformer, 350 X, 351 y, 352 None, 353 message_clsname="Pipeline", 354 message=self._log_message(step_idx), 355 **fit_params_steps[name], 356 ) 357 # Replace the transformer of the step with the fitted 358 # transformer. This is necessary when loading the transformer 359 # from the cache. 360 self.steps[step_idx] = (name, fitted_transformer) 361 return X 362 363 def fit(self, X, y=None, **fit_params): 364 """Fit the model. 365 366 Fit all the transformers one after the other and transform the 367 data. Finally, fit the transformed data using the final estimator. 368 369 Parameters 370 ---------- 371 X : iterable 372 Training data. Must fulfill input requirements of first step of the 373 pipeline. 374 375 y : iterable, default=None 376 Training targets. Must fulfill label requirements for all steps of 377 the pipeline. 378 379 **fit_params : dict of string -> object 380 Parameters passed to the ``fit`` method of each step, where 381 each parameter name is prefixed such that parameter ``p`` for step 382 ``s`` has key ``s__p``. 383 384 Returns 385 ------- 386 self : object 387 Pipeline with fitted steps. 388 """ 389 fit_params_steps = self._check_fit_params(**fit_params) 390 Xt = self._fit(X, y, **fit_params_steps) 391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): 392 if self._final_estimator != "passthrough": 393 fit_params_last_step = fit_params_steps[self.steps[-1][0]] 394 self._final_estimator.fit(Xt, y, **fit_params_last_step) 395 396 return self 397 398 def fit_transform(self, X, y=None, **fit_params): 399 """Fit the model and transform with the final estimator. 400 401 Fits all the transformers one after the other and transform the 402 data. Then uses `fit_transform` on transformed data with the final 403 estimator. 404 405 Parameters 406 ---------- 407 X : iterable 408 Training data. Must fulfill input requirements of first step of the 409 pipeline. 410 411 y : iterable, default=None 412 Training targets. Must fulfill label requirements for all steps of 413 the pipeline. 414 415 **fit_params : dict of string -> object 416 Parameters passed to the ``fit`` method of each step, where 417 each parameter name is prefixed such that parameter ``p`` for step 418 ``s`` has key ``s__p``. 419 420 Returns 421 ------- 422 Xt : ndarray of shape (n_samples, n_transformed_features) 423 Transformed samples. 424 """ 425 fit_params_steps = self._check_fit_params(**fit_params) 426 Xt = self._fit(X, y, **fit_params_steps) 427 428 last_step = self._final_estimator 429 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): 430 if last_step == "passthrough": 431 return Xt 432 fit_params_last_step = fit_params_steps[self.steps[-1][0]] 433 if hasattr(last_step, "fit_transform"): 434 return last_step.fit_transform(Xt, y, **fit_params_last_step) 435 else: 436 return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt) 437 438 @available_if(_final_estimator_has("predict")) 439 def predict(self, X, **predict_params): 440 """Transform the data, and apply `predict` with the final estimator. 441 442 Call `transform` of each transformer in the pipeline. The transformed 443 data are finally passed to the final estimator that calls `predict` 444 method. Only valid if the final estimator implements `predict`. 445 446 Parameters 447 ---------- 448 X : iterable 449 Data to predict on. Must fulfill input requirements of first step 450 of the pipeline. 451 452 **predict_params : dict of string -> object 453 Parameters to the ``predict`` called at the end of all 454 transformations in the pipeline. Note that while this may be 455 used to return uncertainties from some models with return_std 456 or return_cov, uncertainties that are generated by the 457 transformations in the pipeline are not propagated to the 458 final estimator. 459 460 .. versionadded:: 0.20 461 462 Returns 463 ------- 464 y_pred : ndarray 465 Result of calling `predict` on the final estimator. 466 """ 467 Xt = X 468 for _, name, transform in self._iter(with_final=False): 469 Xt = transform.transform(Xt) 470 return self.steps[-1][1].predict(Xt, **predict_params) 471 472 @available_if(_final_estimator_has("fit_predict")) 473 def fit_predict(self, X, y=None, **fit_params): 474 """Transform the data, and apply `fit_predict` with the final estimator. 475 476 Call `fit_transform` of each transformer in the pipeline. The 477 transformed data are finally passed to the final estimator that calls 478 `fit_predict` method. Only valid if the final estimator implements 479 `fit_predict`. 480 481 Parameters 482 ---------- 483 X : iterable 484 Training data. Must fulfill input requirements of first step of 485 the pipeline. 486 487 y : iterable, default=None 488 Training targets. Must fulfill label requirements for all steps 489 of the pipeline. 490 491 **fit_params : dict of string -> object 492 Parameters passed to the ``fit`` method of each step, where 493 each parameter name is prefixed such that parameter ``p`` for step 494 ``s`` has key ``s__p``. 495 496 Returns 497 ------- 498 y_pred : ndarray 499 Result of calling `fit_predict` on the final estimator. 500 """ 501 fit_params_steps = self._check_fit_params(**fit_params) 502 Xt = self._fit(X, y, **fit_params_steps) 503 504 fit_params_last_step = fit_params_steps[self.steps[-1][0]] 505 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): 506 y_pred = self.steps[-1][1].fit_predict(Xt, y, **fit_params_last_step) 507 return y_pred 508 509 @available_if(_final_estimator_has("predict_proba")) 510 def predict_proba(self, X, **predict_proba_params): 511 """Transform the data, and apply `predict_proba` with the final estimator. 512 513 Call `transform` of each transformer in the pipeline. The transformed 514 data are finally passed to the final estimator that calls 515 `predict_proba` method. Only valid if the final estimator implements 516 `predict_proba`. 517 518 Parameters 519 ---------- 520 X : iterable 521 Data to predict on. Must fulfill input requirements of first step 522 of the pipeline. 523 524 **predict_proba_params : dict of string -> object 525 Parameters to the `predict_proba` called at the end of all 526 transformations in the pipeline. 527 528 Returns 529 ------- 530 y_proba : ndarray of shape (n_samples, n_classes) 531 Result of calling `predict_proba` on the final estimator. 532 """ 533 Xt = X 534 for _, name, transform in self._iter(with_final=False): 535 Xt = transform.transform(Xt) 536 return self.steps[-1][1].predict_proba(Xt, **predict_proba_params) 537 538 @available_if(_final_estimator_has("decision_function")) 539 def decision_function(self, X): 540 """Transform the data, and apply `decision_function` with the final estimator. 541 542 Call `transform` of each transformer in the pipeline. The transformed 543 data are finally passed to the final estimator that calls 544 `decision_function` method. Only valid if the final estimator 545 implements `decision_function`. 546 547 Parameters 548 ---------- 549 X : iterable 550 Data to predict on. Must fulfill input requirements of first step 551 of the pipeline. 552 553 Returns 554 ------- 555 y_score : ndarray of shape (n_samples, n_classes) 556 Result of calling `decision_function` on the final estimator. 557 """ 558 Xt = X 559 for _, name, transform in self._iter(with_final=False): 560 Xt = transform.transform(Xt) 561 return self.steps[-1][1].decision_function(Xt) 562 563 @available_if(_final_estimator_has("score_samples")) 564 def score_samples(self, X): 565 """Transform the data, and apply `score_samples` with the final estimator. 566 567 Call `transform` of each transformer in the pipeline. The transformed 568 data are finally passed to the final estimator that calls 569 `score_samples` method. Only valid if the final estimator implements 570 `score_samples`. 571 572 Parameters 573 ---------- 574 X : iterable 575 Data to predict on. Must fulfill input requirements of first step 576 of the pipeline. 577 578 Returns 579 ------- 580 y_score : ndarray of shape (n_samples,) 581 Result of calling `score_samples` on the final estimator. 582 """ 583 Xt = X 584 for _, _, transformer in self._iter(with_final=False): 585 Xt = transformer.transform(Xt) 586 return self.steps[-1][1].score_samples(Xt) 587 588 @available_if(_final_estimator_has("predict_log_proba")) 589 def predict_log_proba(self, X, **predict_log_proba_params): 590 """Transform the data, and apply `predict_log_proba` with the final estimator. 591 592 Call `transform` of each transformer in the pipeline. The transformed 593 data are finally passed to the final estimator that calls 594 `predict_log_proba` method. Only valid if the final estimator 595 implements `predict_log_proba`. 596 597 Parameters 598 ---------- 599 X : iterable 600 Data to predict on. Must fulfill input requirements of first step 601 of the pipeline. 602 603 **predict_log_proba_params : dict of string -> object 604 Parameters to the ``predict_log_proba`` called at the end of all 605 transformations in the pipeline. 606 607 Returns 608 ------- 609 y_log_proba : ndarray of shape (n_samples, n_classes) 610 Result of calling `predict_log_proba` on the final estimator. 611 """ 612 Xt = X 613 for _, name, transform in self._iter(with_final=False): 614 Xt = transform.transform(Xt) 615 return self.steps[-1][1].predict_log_proba(Xt, **predict_log_proba_params) 616 617 def _can_transform(self): 618 return self._final_estimator == "passthrough" or hasattr( 619 self._final_estimator, "transform" 620 ) 621 622 @available_if(_can_transform) 623 def transform(self, X): 624 """Transform the data, and apply `transform` with the final estimator. 625 626 Call `transform` of each transformer in the pipeline. The transformed 627 data are finally passed to the final estimator that calls 628 `transform` method. Only valid if the final estimator 629 implements `transform`. 630 631 This also works where final estimator is `None` in which case all prior 632 transformations are applied. 633 634 Parameters 635 ---------- 636 X : iterable 637 Data to transform. Must fulfill input requirements of first step 638 of the pipeline. 639 640 Returns 641 ------- 642 Xt : ndarray of shape (n_samples, n_transformed_features) 643 Transformed data. 644 """ 645 Xt = X 646 for _, _, transform in self._iter(): 647 Xt = transform.transform(Xt) 648 return Xt 649 650 def _can_inverse_transform(self): 651 return all(hasattr(t, "inverse_transform") for _, _, t in self._iter()) 652 653 @available_if(_can_inverse_transform) 654 def inverse_transform(self, Xt): 655 """Apply `inverse_transform` for each step in a reverse order. 656 657 All estimators in the pipeline must support `inverse_transform`. 658 659 Parameters 660 ---------- 661 Xt : array-like of shape (n_samples, n_transformed_features) 662 Data samples, where ``n_samples`` is the number of samples and 663 ``n_features`` is the number of features. Must fulfill 664 input requirements of last step of pipeline's 665 ``inverse_transform`` method. 666 667 Returns 668 ------- 669 Xt : ndarray of shape (n_samples, n_features) 670 Inverse transformed data, that is, data in the original feature 671 space. 672 """ 673 reverse_iter = reversed(list(self._iter())) 674 for _, _, transform in reverse_iter: 675 Xt = transform.inverse_transform(Xt) 676 return Xt 677 678 @available_if(_final_estimator_has("score")) 679 def score(self, X, y=None, sample_weight=None): 680 """Transform the data, and apply `score` with the final estimator. 681 682 Call `transform` of each transformer in the pipeline. The transformed 683 data are finally passed to the final estimator that calls 684 `score` method. Only valid if the final estimator implements `score`. 685 686 Parameters 687 ---------- 688 X : iterable 689 Data to predict on. Must fulfill input requirements of first step 690 of the pipeline. 691 692 y : iterable, default=None 693 Targets used for scoring. Must fulfill label requirements for all 694 steps of the pipeline. 695 696 sample_weight : array-like, default=None 697 If not None, this argument is passed as ``sample_weight`` keyword 698 argument to the ``score`` method of the final estimator. 699 700 Returns 701 ------- 702 score : float 703 Result of calling `score` on the final estimator. 704 """ 705 Xt = X 706 for _, name, transform in self._iter(with_final=False): 707 Xt = transform.transform(Xt) 708 score_params = {} 709 if sample_weight is not None: 710 score_params["sample_weight"] = sample_weight 711 return self.steps[-1][1].score(Xt, y, **score_params) 712 713 @property 714 def classes_(self): 715 """The classes labels. Only exist if the last step is a classifier.""" 716 return self.steps[-1][1].classes_ 717 718 def _more_tags(self): 719 # check if first estimator expects pairwise input 720 return {"pairwise": _safe_tags(self.steps[0][1], "pairwise")} 721 722 # TODO: Remove in 1.1 723 # mypy error: Decorated property not supported 724 @deprecated( # type: ignore 725 "Attribute `_pairwise` was deprecated in " 726 "version 0.24 and will be removed in 1.1 (renaming of 0.26)." 727 ) 728 @property 729 def _pairwise(self): 730 # check if first estimator expects pairwise input 731 return getattr(self.steps[0][1], "_pairwise", False) 732 733 def get_feature_names_out(self, input_features=None): 734 """Get output feature names for transformation. 735 736 Transform input features using the pipeline. 737 738 Parameters 739 ---------- 740 input_features : array-like of str or None, default=None 741 Input features. 742 743 Returns 744 ------- 745 feature_names_out : ndarray of str objects 746 Transformed feature names. 747 """ 748 feature_names_out = input_features 749 for _, name, transform in self._iter(): 750 if not hasattr(transform, "get_feature_names_out"): 751 raise AttributeError( 752 "Estimator {} does not provide get_feature_names_out. " 753 "Did you mean to call pipeline[:-1].get_feature_names_out" 754 "()?".format(name) 755 ) 756 feature_names_out = transform.get_feature_names_out(feature_names_out) 757 return feature_names_out 758 759 @property 760 def n_features_in_(self): 761 """Number of features seen during first step `fit` method.""" 762 # delegate to first step (which will call _check_is_fitted) 763 return self.steps[0][1].n_features_in_ 764 765 @property 766 def feature_names_in_(self): 767 """Names of features seen during first step `fit` method.""" 768 # delegate to first step (which will call _check_is_fitted) 769 return self.steps[0][1].feature_names_in_ 770 771 def __sklearn_is_fitted__(self): 772 """Indicate whether pipeline has been fit.""" 773 try: 774 # check if the last step of the pipeline is fitted 775 # we only check the last step since if the last step is fit, it 776 # means the previous steps should also be fit. This is faster than 777 # checking if every step of the pipeline is fit. 778 check_is_fitted(self.steps[-1][1]) 779 return True 780 except NotFittedError: 781 return False 782 783 def _sk_visual_block_(self): 784 _, estimators = zip(*self.steps) 785 786 def _get_name(name, est): 787 if est is None or est == "passthrough": 788 return f"{name}: passthrough" 789 # Is an estimator 790 return f"{name}: {est.__class__.__name__}" 791 792 names = [_get_name(name, est) for name, est in self.steps] 793 name_details = [str(est) for est in estimators] 794 return _VisualBlock( 795 "serial", 796 estimators, 797 names=names, 798 name_details=name_details, 799 dash_wrapped=False, 800 ) 801 802 803def _name_estimators(estimators): 804 """Generate names for estimators.""" 805 806 names = [ 807 estimator if isinstance(estimator, str) else type(estimator).__name__.lower() 808 for estimator in estimators 809 ] 810 namecount = defaultdict(int) 811 for est, name in zip(estimators, names): 812 namecount[name] += 1 813 814 for k, v in list(namecount.items()): 815 if v == 1: 816 del namecount[k] 817 818 for i in reversed(range(len(estimators))): 819 name = names[i] 820 if name in namecount: 821 names[i] += "-%d" % namecount[name] 822 namecount[name] -= 1 823 824 return list(zip(names, estimators)) 825 826 827def make_pipeline(*steps, memory=None, verbose=False): 828 """Construct a :class:`Pipeline` from the given estimators. 829 830 This is a shorthand for the :class:`Pipeline` constructor; it does not 831 require, and does not permit, naming the estimators. Instead, their names 832 will be set to the lowercase of their types automatically. 833 834 Parameters 835 ---------- 836 *steps : list of Estimator objects 837 List of the scikit-learn estimators that are chained together. 838 839 memory : str or object with the joblib.Memory interface, default=None 840 Used to cache the fitted transformers of the pipeline. By default, 841 no caching is performed. If a string is given, it is the path to 842 the caching directory. Enabling caching triggers a clone of 843 the transformers before fitting. Therefore, the transformer 844 instance given to the pipeline cannot be inspected 845 directly. Use the attribute ``named_steps`` or ``steps`` to 846 inspect estimators within the pipeline. Caching the 847 transformers is advantageous when fitting is time consuming. 848 849 verbose : bool, default=False 850 If True, the time elapsed while fitting each step will be printed as it 851 is completed. 852 853 Returns 854 ------- 855 p : Pipeline 856 Returns a scikit-learn :class:`Pipeline` object. 857 858 See Also 859 -------- 860 Pipeline : Class for creating a pipeline of transforms with a final 861 estimator. 862 863 Examples 864 -------- 865 >>> from sklearn.naive_bayes import GaussianNB 866 >>> from sklearn.preprocessing import StandardScaler 867 >>> from sklearn.pipeline import make_pipeline 868 >>> make_pipeline(StandardScaler(), GaussianNB(priors=None)) 869 Pipeline(steps=[('standardscaler', StandardScaler()), 870 ('gaussiannb', GaussianNB())]) 871 """ 872 return Pipeline(_name_estimators(steps), memory=memory, verbose=verbose) 873 874 875def _transform_one(transformer, X, y, weight, **fit_params): 876 res = transformer.transform(X) 877 # if we have a weight for this transformer, multiply output 878 if weight is None: 879 return res 880 return res * weight 881 882 883def _fit_transform_one( 884 transformer, X, y, weight, message_clsname="", message=None, **fit_params 885): 886 """ 887 Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned 888 with the fitted transformer. If ``weight`` is not ``None``, the result will 889 be multiplied by ``weight``. 890 """ 891 with _print_elapsed_time(message_clsname, message): 892 if hasattr(transformer, "fit_transform"): 893 res = transformer.fit_transform(X, y, **fit_params) 894 else: 895 res = transformer.fit(X, y, **fit_params).transform(X) 896 897 if weight is None: 898 return res, transformer 899 return res * weight, transformer 900 901 902def _fit_one(transformer, X, y, weight, message_clsname="", message=None, **fit_params): 903 """ 904 Fits ``transformer`` to ``X`` and ``y``. 905 """ 906 with _print_elapsed_time(message_clsname, message): 907 return transformer.fit(X, y, **fit_params) 908 909 910class FeatureUnion(TransformerMixin, _BaseComposition): 911 """Concatenates results of multiple transformer objects. 912 913 This estimator applies a list of transformer objects in parallel to the 914 input data, then concatenates the results. This is useful to combine 915 several feature extraction mechanisms into a single transformer. 916 917 Parameters of the transformers may be set using its name and the parameter 918 name separated by a '__'. A transformer may be replaced entirely by 919 setting the parameter with its name to another transformer, 920 or removed by setting to 'drop'. 921 922 Read more in the :ref:`User Guide <feature_union>`. 923 924 .. versionadded:: 0.13 925 926 Parameters 927 ---------- 928 transformer_list : list of tuple 929 List of tuple containing `(str, transformer)`. The first element 930 of the tuple is name affected to the transformer while the 931 second element is a scikit-learn transformer instance. 932 The transformer instance can also be `"drop"` for it to be 933 ignored. 934 935 .. versionchanged:: 0.22 936 Deprecated `None` as a transformer in favor of 'drop'. 937 938 n_jobs : int, default=None 939 Number of jobs to run in parallel. 940 ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 941 ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 942 for more details. 943 944 .. versionchanged:: v0.20 945 `n_jobs` default changed from 1 to None 946 947 transformer_weights : dict, default=None 948 Multiplicative weights for features per transformer. 949 Keys are transformer names, values the weights. 950 Raises ValueError if key not present in ``transformer_list``. 951 952 verbose : bool, default=False 953 If True, the time elapsed while fitting each transformer will be 954 printed as it is completed. 955 956 Attributes 957 ---------- 958 n_features_in_ : int 959 Number of features seen during :term:`fit`. Only defined if the 960 underlying first transformer in `transformer_list` exposes such an 961 attribute when fit. 962 963 .. versionadded:: 0.24 964 965 See Also 966 -------- 967 make_union : Convenience function for simplified feature union 968 construction. 969 970 Examples 971 -------- 972 >>> from sklearn.pipeline import FeatureUnion 973 >>> from sklearn.decomposition import PCA, TruncatedSVD 974 >>> union = FeatureUnion([("pca", PCA(n_components=1)), 975 ... ("svd", TruncatedSVD(n_components=2))]) 976 >>> X = [[0., 1., 3], [2., 2., 5]] 977 >>> union.fit_transform(X) 978 array([[ 1.5 , 3.0..., 0.8...], 979 [-1.5 , 5.7..., -0.4...]]) 980 """ 981 982 _required_parameters = ["transformer_list"] 983 984 def __init__( 985 self, transformer_list, *, n_jobs=None, transformer_weights=None, verbose=False 986 ): 987 self.transformer_list = transformer_list 988 self.n_jobs = n_jobs 989 self.transformer_weights = transformer_weights 990 self.verbose = verbose 991 self._validate_transformers() 992 993 def get_params(self, deep=True): 994 """Get parameters for this estimator. 995 996 Returns the parameters given in the constructor as well as the 997 estimators contained within the `transformer_list` of the 998 `FeatureUnion`. 999 1000 Parameters 1001 ---------- 1002 deep : bool, default=True 1003 If True, will return the parameters for this estimator and 1004 contained subobjects that are estimators. 1005 1006 Returns 1007 ------- 1008 params : mapping of string to any 1009 Parameter names mapped to their values. 1010 """ 1011 return self._get_params("transformer_list", deep=deep) 1012 1013 def set_params(self, **kwargs): 1014 """Set the parameters of this estimator. 1015 1016 Valid parameter keys can be listed with ``get_params()``. Note that 1017 you can directly set the parameters of the estimators contained in 1018 `tranformer_list`. 1019 1020 Parameters 1021 ---------- 1022 **kwargs : dict 1023 Parameters of this estimator or parameters of estimators contained 1024 in `transform_list`. Parameters of the transformers may be set 1025 using its name and the parameter name separated by a '__'. 1026 1027 Returns 1028 ------- 1029 self : object 1030 FeatureUnion class instance. 1031 """ 1032 self._set_params("transformer_list", **kwargs) 1033 return self 1034 1035 def _validate_transformers(self): 1036 names, transformers = zip(*self.transformer_list) 1037 1038 # validate names 1039 self._validate_names(names) 1040 1041 # validate estimators 1042 for t in transformers: 1043 if t == "drop": 1044 continue 1045 if not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not hasattr( 1046 t, "transform" 1047 ): 1048 raise TypeError( 1049 "All estimators should implement fit and " 1050 "transform. '%s' (type %s) doesn't" % (t, type(t)) 1051 ) 1052 1053 def _validate_transformer_weights(self): 1054 if not self.transformer_weights: 1055 return 1056 1057 transformer_names = set(name for name, _ in self.transformer_list) 1058 for name in self.transformer_weights: 1059 if name not in transformer_names: 1060 raise ValueError( 1061 f'Attempting to weight transformer "{name}", ' 1062 "but it is not present in transformer_list." 1063 ) 1064 1065 def _iter(self): 1066 """ 1067 Generate (name, trans, weight) tuples excluding None and 1068 'drop' transformers. 1069 """ 1070 get_weight = (self.transformer_weights or {}).get 1071 return ( 1072 (name, trans, get_weight(name)) 1073 for name, trans in self.transformer_list 1074 if trans != "drop" 1075 ) 1076 1077 @deprecated( 1078 "get_feature_names is deprecated in 1.0 and will be removed " 1079 "in 1.2. Please use get_feature_names_out instead." 1080 ) 1081 def get_feature_names(self): 1082 """Get feature names from all transformers. 1083 1084 Returns 1085 ------- 1086 feature_names : list of strings 1087 Names of the features produced by transform. 1088 """ 1089 feature_names = [] 1090 for name, trans, weight in self._iter(): 1091 if not hasattr(trans, "get_feature_names"): 1092 raise AttributeError( 1093 "Transformer %s (type %s) does not provide get_feature_names." 1094 % (str(name), type(trans).__name__) 1095 ) 1096 feature_names.extend([name + "__" + f for f in trans.get_feature_names()]) 1097 return feature_names 1098 1099 def get_feature_names_out(self, input_features=None): 1100 """Get output feature names for transformation. 1101 1102 Parameters 1103 ---------- 1104 input_features : array-like of str or None, default=None 1105 Input features. 1106 1107 Returns 1108 ------- 1109 feature_names_out : ndarray of str objects 1110 Transformed feature names. 1111 """ 1112 feature_names = [] 1113 for name, trans, _ in self._iter(): 1114 if not hasattr(trans, "get_feature_names_out"): 1115 raise AttributeError( 1116 "Transformer %s (type %s) does not provide get_feature_names_out." 1117 % (str(name), type(trans).__name__) 1118 ) 1119 feature_names.extend( 1120 [f"{name}__{f}" for f in trans.get_feature_names_out(input_features)] 1121 ) 1122 return np.asarray(feature_names, dtype=object) 1123 1124 def fit(self, X, y=None, **fit_params): 1125 """Fit all transformers using X. 1126 1127 Parameters 1128 ---------- 1129 X : iterable or array-like, depending on transformers 1130 Input data, used to fit transformers. 1131 1132 y : array-like of shape (n_samples, n_outputs), default=None 1133 Targets for supervised learning. 1134 1135 **fit_params : dict, default=None 1136 Parameters to pass to the fit method of the estimator. 1137 1138 Returns 1139 ------- 1140 self : object 1141 FeatureUnion class instance. 1142 """ 1143 transformers = self._parallel_func(X, y, fit_params, _fit_one) 1144 if not transformers: 1145 # All transformers are None 1146 return self 1147 1148 self._update_transformer_list(transformers) 1149 return self 1150 1151 def fit_transform(self, X, y=None, **fit_params): 1152 """Fit all transformers, transform the data and concatenate results. 1153 1154 Parameters 1155 ---------- 1156 X : iterable or array-like, depending on transformers 1157 Input data to be transformed. 1158 1159 y : array-like of shape (n_samples, n_outputs), default=None 1160 Targets for supervised learning. 1161 1162 **fit_params : dict, default=None 1163 Parameters to pass to the fit method of the estimator. 1164 1165 Returns 1166 ------- 1167 X_t : array-like or sparse matrix of \ 1168 shape (n_samples, sum_n_components) 1169 The `hstack` of results of transformers. `sum_n_components` is the 1170 sum of `n_components` (output dimension) over transformers. 1171 """ 1172 results = self._parallel_func(X, y, fit_params, _fit_transform_one) 1173 if not results: 1174 # All transformers are None 1175 return np.zeros((X.shape[0], 0)) 1176 1177 Xs, transformers = zip(*results) 1178 self._update_transformer_list(transformers) 1179 1180 return self._hstack(Xs) 1181 1182 def _log_message(self, name, idx, total): 1183 if not self.verbose: 1184 return None 1185 return "(step %d of %d) Processing %s" % (idx, total, name) 1186 1187 def _parallel_func(self, X, y, fit_params, func): 1188 """Runs func in parallel on X and y""" 1189 self.transformer_list = list(self.transformer_list) 1190 self._validate_transformers() 1191 self._validate_transformer_weights() 1192 transformers = list(self._iter()) 1193 1194 return Parallel(n_jobs=self.n_jobs)( 1195 delayed(func)( 1196 transformer, 1197 X, 1198 y, 1199 weight, 1200 message_clsname="FeatureUnion", 1201 message=self._log_message(name, idx, len(transformers)), 1202 **fit_params, 1203 ) 1204 for idx, (name, transformer, weight) in enumerate(transformers, 1) 1205 ) 1206 1207 def transform(self, X): 1208 """Transform X separately by each transformer, concatenate results. 1209 1210 Parameters 1211 ---------- 1212 X : iterable or array-like, depending on transformers 1213 Input data to be transformed. 1214 1215 Returns 1216 ------- 1217 X_t : array-like or sparse matrix of \ 1218 shape (n_samples, sum_n_components) 1219 The `hstack` of results of transformers. `sum_n_components` is the 1220 sum of `n_components` (output dimension) over transformers. 1221 """ 1222 Xs = Parallel(n_jobs=self.n_jobs)( 1223 delayed(_transform_one)(trans, X, None, weight) 1224 for name, trans, weight in self._iter() 1225 ) 1226 if not Xs: 1227 # All transformers are None 1228 return np.zeros((X.shape[0], 0)) 1229 1230 return self._hstack(Xs) 1231 1232 def _hstack(self, Xs): 1233 if any(sparse.issparse(f) for f in Xs): 1234 Xs = sparse.hstack(Xs).tocsr() 1235 else: 1236 Xs = np.hstack(Xs) 1237 return Xs 1238 1239 def _update_transformer_list(self, transformers): 1240 transformers = iter(transformers) 1241 self.transformer_list[:] = [ 1242 (name, old if old == "drop" else next(transformers)) 1243 for name, old in self.transformer_list 1244 ] 1245 1246 @property 1247 def n_features_in_(self): 1248 """Number of features seen during :term:`fit`.""" 1249 1250 # X is passed to all transformers so we just delegate to the first one 1251 return self.transformer_list[0][1].n_features_in_ 1252 1253 def _sk_visual_block_(self): 1254 names, transformers = zip(*self.transformer_list) 1255 return _VisualBlock("parallel", transformers, names=names) 1256 1257 1258def make_union(*transformers, n_jobs=None, verbose=False): 1259 """ 1260 Construct a FeatureUnion from the given transformers. 1261 1262 This is a shorthand for the FeatureUnion constructor; it does not require, 1263 and does not permit, naming the transformers. Instead, they will be given 1264 names automatically based on their types. It also does not allow weighting. 1265 1266 Parameters 1267 ---------- 1268 *transformers : list of estimators 1269 1270 n_jobs : int, default=None 1271 Number of jobs to run in parallel. 1272 ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 1273 ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 1274 for more details. 1275 1276 .. versionchanged:: v0.20 1277 `n_jobs` default changed from 1 to None 1278 1279 verbose : bool, default=False 1280 If True, the time elapsed while fitting each transformer will be 1281 printed as it is completed. 1282 1283 Returns 1284 ------- 1285 f : FeatureUnion 1286 1287 See Also 1288 -------- 1289 FeatureUnion : Class for concatenating the results of multiple transformer 1290 objects. 1291 1292 Examples 1293 -------- 1294 >>> from sklearn.decomposition import PCA, TruncatedSVD 1295 >>> from sklearn.pipeline import make_union 1296 >>> make_union(PCA(), TruncatedSVD()) 1297 FeatureUnion(transformer_list=[('pca', PCA()), 1298 ('truncatedsvd', TruncatedSVD())]) 1299 """ 1300 return FeatureUnion(_name_estimators(transformers), n_jobs=n_jobs, verbose=verbose) 1301