# -*- coding: utf-8 -*-
"""
This module borrows and adapts `Pipeline` from `sklearn.pipeline` and
`TransformerMixin` from `sklearn.base` in the scikit-learn framework
(commit hash d205638475ca542dc46862652e3bb0be663a8eac, to be precise).
Both are BSD licensed and allow for this sort of thing; attribution
is given as a comment above each class.
"""
from collections import defaultdict
from itertools import islice


# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
# License: BSD 3 clause
class TransformerMixin(object):
    """Mixin class for all transformers."""

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fit to data, then transform it.

        Fits transformer to X and y with optional parameters fit_params
        and returns a transformed version of X.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training set.
        y : ndarray of shape (n_samples,), default=None
            Target values.
        **fit_params : dict
            Additional fit parameters.

        Returns
        -------
        X_new : ndarray array of shape (n_samples, n_features_new)
            Transformed array.
        """
        # non-optimized default implementation; override when a better
        # method is possible for a given transformer
        if y is None:
            # fit method of arity 1 (unsupervised transformation)
            return self.fit(X, **fit_params).transform(X)
        # fit method of arity 2 (supervised transformation)
        return self.fit(X, y, **fit_params).transform(X)


# Author: Edouard Duchesnay
#         Gael Varoquaux
#         Virgile Fritsch
#         Alexandre Gramfort
#         Lars Buitinck
# License: BSD
class Pipeline(object):
    """Chain of transformers followed by a final estimator.

    Parameters
    ----------
    steps : list of (name, estimator) tuples
        Every step but the last must implement ``transform`` and either
        ``fit`` or ``fit_transform``; the last must implement ``fit``.
        Any step may instead be ``None`` or the string ``'passthrough'``,
        in which case it is skipped.
    verbose : bool, default=False
        When true, ``_log_message`` builds a progress message; otherwise it
        returns ``None``.

    Raises
    ------
    ValueError
        If ``steps`` is empty or an entry is not a 2-item pair.
    TypeError
        If a step does not implement the required methods.
    """

    def __init__(self, steps, verbose=False):
        self.steps = steps
        self.verbose = verbose
        self._validate_steps()

    def _validate_steps(self):
        """Check that every step implements the methods its position needs.

        Raises
        ------
        ValueError
            If there are no steps, or an entry is not a 2-item pair.
        TypeError
            If an intermediate step is not a transformer, or the last step
            does not implement ``fit``.
        """
        unzipped = tuple(zip(*self.steps))
        if not unzipped:
            # zip(*[]) yields nothing; fail with an explicit message rather
            # than the cryptic tuple-unpacking error.
            raise ValueError(
                "Pipeline requires at least one (name, estimator) step")
        # Only the estimators are validated; unpacking still raises
        # ValueError for entries that are not 2-item pairs.
        _, estimators = unzipped

        # validate estimators
        transformers = estimators[:-1]
        estimator = estimators[-1]

        for t in transformers:
            if t is None or t == 'passthrough':
                continue
            if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not
                    hasattr(t, "transform")):
                raise TypeError("All intermediate steps should be "
                                "transformers and implement fit and transform "
                                "or be the string 'passthrough' "
                                "'%s' (type %s) doesn't" % (t, type(t)))

        # We allow last estimator to be None as an identity transformation
        if (estimator is not None and estimator != 'passthrough'
                and not hasattr(estimator, "fit")):
            raise TypeError(
                "Last step of Pipeline should implement fit "
                "or be the string 'passthrough'. "
                "'%s' (type %s) doesn't" % (estimator, type(estimator)))

    def _iter(self, with_final=True, filter_passthrough=True):
        """
        Generate (idx, name, trans) tuples from self.steps

        When with_final is False, the last step is left out.
        When filter_passthrough is True, 'passthrough' and None transformers
        are filtered out.
        """
        stop = len(self.steps)
        if not with_final:
            stop -= 1

        for idx, (name, trans) in enumerate(islice(self.steps, 0, stop)):
            if not filter_passthrough:
                yield idx, name, trans
            elif trans is not None and trans != 'passthrough':
                yield idx, name, trans

    def __len__(self):
        """
        Returns the length of the Pipeline
        """
        return len(self.steps)

    def __getitem__(self, ind):
        """Returns a sub-pipeline or a single estimator in the pipeline

        Indexing with an integer will return an estimator; using a slice
        returns another Pipeline instance which copies a slice of this
        Pipeline. This copy is shallow: modifying (or fitting) estimators in
        the sub-pipeline will affect the larger pipeline and vice-versa.
        However, replacing a value in `steps` will not affect a copy.
        Any other index is looked up as a step name.

        Raises
        ------
        ValueError
            If a slice has a step other than 1, or selects no steps.
        """
        if isinstance(ind, slice):
            if ind.step not in (1, None):
                raise ValueError('Pipeline slicing only supports a step of 1')
            # Carry ``verbose`` over so the sub-pipeline is configured like
            # its parent instead of silently resetting to the default.
            return self.__class__(self.steps[ind], verbose=self.verbose)
        try:
            _, est = self.steps[ind]
        except TypeError:
            # Not an int, try get step by name
            return self.named_steps[ind]
        return est

    @property
    def _estimator_type(self):
        # Delegates to the last step; AttributeError if it has no such
        # attribute (e.g. None or 'passthrough').
        return self.steps[-1][1]._estimator_type

    @property
    def named_steps(self):
        """Mapping of step name to estimator, rebuilt on every access."""
        return dict(self.steps)

    @property
    def _final_estimator(self):
        # A final step of None is normalised to 'passthrough'.
        estimator = self.steps[-1][1]
        return 'passthrough' if estimator is None else estimator

    def _log_message(self, step_idx):
        """Return a progress message for a step, or None if not verbose."""
        if not self.verbose:
            return None
        name, _ = self.steps[step_idx]

        return '(step %d of %d) Processing %s' % (step_idx + 1,
                                                  len(self.steps),
                                                  name)

    # Estimator interface

    def _fit(self, X, y=None, **fit_params):
        """Fit and apply every step but the last.

        Returns
        -------
        (Xt, final_fit_params) : tuple
            The data after all intermediate transformers, and the fit
            parameters routed to the final step (``{}`` when the final step
            is a passthrough).

        Raises
        ------
        ValueError
            If a fit parameter name has no ``'__'`` separator.
        KeyError
            If a fit parameter names a step that does not exist or is None.
        """
        # shallow copy of steps - this should really be steps_
        self.steps = list(self.steps)
        self._validate_steps()

        # Steps set to None get no entry, so routing a parameter to one
        # raises KeyError below.
        fit_params_steps = {name: {} for name, step in self.steps
                            if step is not None}
        for pname, pval in fit_params.items():
            if '__' not in pname:
                raise ValueError(
                    "Pipeline.fit does not accept the {} parameter. "
                    "You can pass parameters to specific steps of your "
                    "pipeline using the stepname__parameter format, e.g. "
                    "`Pipeline.fit(X, y, logisticregression__sample_weight"
                    "=sample_weight)`.".format(pname))
            step, param = pname.split('__', 1)
            fit_params_steps[step][param] = pval
        for (step_idx,
             name,
             transformer) in self._iter(with_final=False,
                                        filter_passthrough=False):
            if transformer is None or transformer == 'passthrough':
                continue

            # Fit the current transformer and push the data through it
            X, fitted_transformer = _fit_transform_one(
                transformer, X, y, None,
                **fit_params_steps[name])
            # Store the object returned by the fit helper back into the
            # step list.
            self.steps[step_idx] = (name, fitted_transformer)
        if self._final_estimator == 'passthrough':
            return X, {}
        return X, fit_params_steps[self.steps[-1][0]]

    def fit(self, X, y=None, **fit_params):
        """Fit the model

        Fit all the transforms one after the other and transform the
        data, then fit the transformed data using the final estimator.

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of the
            pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps of
            the pipeline.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.

        Returns
        -------
        self : Pipeline
            This estimator
        """
        Xt, fit_params = self._fit(X, y, **fit_params)
        if self._final_estimator != 'passthrough':
            self._final_estimator.fit(Xt, y, **fit_params)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit the model and transform with the final estimator

        Fits all the transforms one after the other and transforms the
        data, then uses fit_transform on transformed data with the final
        estimator.

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of the
            pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps of
            the pipeline.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.

        Returns
        -------
        Xt : array-like of shape (n_samples, n_transformed_features)
            Transformed samples
        """
        last_step = self._final_estimator
        Xt, fit_params = self._fit(X, y, **fit_params)
        if last_step == 'passthrough':
            return Xt
        if hasattr(last_step, 'fit_transform'):
            return last_step.fit_transform(Xt, y, **fit_params)
        return last_step.fit(Xt, y, **fit_params).transform(Xt)

    @property
    def transform(self):
        """Apply transforms, and transform with the final estimator

        This also works where final estimator is ``None``: all prior
        transformations are applied.

        Parameters
        ----------
        X : iterable
            Data to transform. Must fulfill input requirements of first step
            of the pipeline.

        Returns
        -------
        Xt : array-like of shape (n_samples, n_transformed_features)
        """
        # _final_estimator is None or has transform, otherwise attribute error
        # XXX: Handling the None case means we can't use if_delegate_has_method
        if self._final_estimator != 'passthrough':
            # Bare attribute access on purpose: it raises AttributeError
            # when the final estimator has no ``transform``.
            self._final_estimator.transform
        return self._transform

    def _transform(self, X):
        """Run X through every non-passthrough step, including the last."""
        Xt = X
        for _, _, transform in self._iter():
            Xt = transform.transform(Xt)
        return Xt

    @property
    def classes_(self):
        # Delegates to the last step's ``classes_`` attribute.
        return self.steps[-1][-1].classes_

    @property
    def _pairwise(self):
        # check if first estimator expects pairwise input
        return getattr(self.steps[0][1], '_pairwise', False)

    @property
    def n_features_in_(self):
        # delegate to first step
        return self.steps[0][1].n_features_in_


def _name_estimators(estimators):
    """Generate names for estimators.

    A string estimator is named by itself; anything else by its lowercased
    type name. Names that occur more than once get a ``-<k>`` suffix, with
    ``k`` counting from 1 in order of appearance.

    Returns
    -------
    list of (name, estimator) tuples
    """

    names = [
        estimator
        if isinstance(estimator, str) else type(estimator).__name__.lower()
        for estimator in estimators
    ]
    namecount = defaultdict(int)
    for name in names:
        namecount[name] += 1

    # Keep counts only for names that need disambiguating.
    for k, v in list(namecount.items()):
        if v == 1:
            del namecount[k]

    # Walk backwards so the last duplicate gets the highest suffix.
    for i in reversed(range(len(estimators))):
        name = names[i]
        if name in namecount:
            names[i] += "-%d" % namecount[name]
            namecount[name] -= 1

    return list(zip(names, estimators))


def make_pipeline(*steps, **kwargs):
    """Construct a Pipeline from the given estimators.

    This is a shorthand for the Pipeline constructor; it does not require, and
    does not permit, naming the estimators. Instead, their names will be set
    to the lowercase of their types automatically.

    Parameters
    ----------
    *steps : list of estimators.

    verbose : bool, default=False
        Forwarded to the Pipeline constructor as ``verbose``.

    Returns
    -------
    p : Pipeline

    Raises
    ------
    TypeError
        If any keyword argument other than ``verbose`` is given.
    """
    verbose = kwargs.pop('verbose', False)
    if kwargs:
        raise TypeError('Unknown keyword arguments: "{}"'
                        .format(list(kwargs.keys())[0]))
    return Pipeline(_name_estimators(steps), verbose=verbose)


def _transform_one(transformer, X, y, weight, **fit_params):
    """
    Transforms ``X`` with ``transformer``. If ``weight`` is not ``None``, the
    result is multiplied by ``weight``. ``y`` and ``fit_params`` are accepted
    but not used.
    """
    res = transformer.transform(X)
    # if we have a weight for this transformer, multiply output
    if weight is None:
        return res
    return res * weight


def _fit_transform_one(transformer,
                       X,
                       y,
                       weight,
                       **fit_params):
    """
    Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned
    with the fitted transformer. If ``weight`` is not ``None``, the result will
    be multiplied by ``weight``.
    """
    # Prefer the transformer's own fit_transform when it has one.
    if hasattr(transformer, 'fit_transform'):
        res = transformer.fit_transform(X, y, **fit_params)
    else:
        res = transformer.fit(X, y, **fit_params).transform(X)

    if weight is None:
        return res, transformer
    return res * weight, transformer