1# -*- coding: utf-8 -*-
2"""
3This module borrows and adapts `Pipeline` from `sklearn.pipeline` and
4`TransformerMixin` from `sklearn.base` in the scikit-learn framework
(commit hash d205638475ca542dc46862652e3bb0be663a8eac, to be precise).
6Both are BSD licensed and allow for this sort of thing; attribution
7is given as a comment above each class.
8"""
9from collections import defaultdict
10from itertools import islice
11
12
13# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
14# License: BSD 3 clause
class TransformerMixin(object):
    """Mixin that supplies a default ``fit_transform`` for transformers.

    Classes using this mixin are expected to provide ``fit`` (returning the
    fitted object) and ``transform``.
    """

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fit to data, then transform it.

        Calls ``fit`` on ``X`` (and ``y`` when one is given) with the
        optional ``fit_params``, then returns the result of calling
        ``transform`` on the same ``X``.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training set.

        y : ndarray of shape (n_samples,), default=None
            Target values. When ``None``, ``y`` is not passed to ``fit``
            at all.

        **fit_params : dict
            Additional keyword arguments forwarded to ``fit``.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_features_new)
            Transformed array.
        """
        # Generic two-pass default; subclasses may override it when fitting
        # and transforming can be done more efficiently in one go.
        if y is not None:
            # supervised transformation: fit takes both X and y
            fitted = self.fit(X, y, **fit_params)
        else:
            # unsupervised transformation: fit only takes X
            fitted = self.fit(X, **fit_params)
        return fitted.transform(X)
44
45
46# Author: Edouard Duchesnay
47#         Gael Varoquaux
48#         Virgile Fritsch
49#         Alexandre Gramfort
50#         Lars Buitinck
51# License: BSD
class Pipeline(object):
    """Sequentially apply a list of transformers and a final estimator.

    Parameters
    ----------
    steps : list of (name, estimator) tuples
        Steps chained in the given order. Every step except the last must
        implement ``transform`` together with ``fit`` or ``fit_transform``;
        the last step only needs ``fit``. Any step may be ``None`` or the
        string ``'passthrough'``, in which case it is skipped.

    verbose : bool, default=False
        When true, ``_log_message`` returns a progress string for a step
        instead of ``None``.
    """

    def __init__(self, steps, verbose=False):
        self.steps = steps
        self.verbose = verbose
        self._validate_steps()

    def _validate_steps(self):
        """Check that ``self.steps`` is non-empty and has usable estimators.

        Raises
        ------
        ValueError
            If there are no steps.
        TypeError
            If an intermediate step is not a transformer, or the last step
            does not implement ``fit`` (``None`` and ``'passthrough'`` are
            always accepted).
        """
        pairs = list(self.steps)
        if not pairs:
            # Without this guard the unpacking below fails with an opaque
            # "not enough values to unpack" ValueError.
            raise ValueError("Pipeline requires at least one step; "
                             "got an empty list of steps.")
        _, estimators = zip(*pairs)

        # validate estimators
        transformers = estimators[:-1]
        estimator = estimators[-1]

        for t in transformers:
            if t is None or t == 'passthrough':
                continue
            can_fit = hasattr(t, "fit") or hasattr(t, "fit_transform")
            if not can_fit or not hasattr(t, "transform"):
                raise TypeError("All intermediate steps should be "
                                "transformers and implement fit and transform "
                                "or be the string 'passthrough' "
                                "'%s' (type %s) doesn't" % (t, type(t)))

        # We allow last estimator to be None as an identity transformation
        if (estimator is not None and estimator != 'passthrough'
                and not hasattr(estimator, "fit")):
            raise TypeError(
                "Last step of Pipeline should implement fit "
                "or be the string 'passthrough'. "
                "'%s' (type %s) doesn't" % (estimator, type(estimator)))

    def _iter(self, with_final=True, filter_passthrough=True):
        """
        Generate (idx, name, trans) tuples from self.steps

        When with_final is False the last step is left out. When
        filter_passthrough is True, 'passthrough' and None transformers
        are filtered out; idx is always the position in self.steps.
        """
        stop = len(self.steps)
        if not with_final:
            stop -= 1

        for idx, (name, trans) in enumerate(islice(self.steps, 0, stop)):
            if (not filter_passthrough
                    or (trans is not None and trans != 'passthrough')):
                yield idx, name, trans

    def __len__(self):
        """
        Returns the length of the Pipeline
        """
        return len(self.steps)

    def __getitem__(self, ind):
        """Returns a sub-pipeline or a single estimator in the pipeline

        Indexing with an integer will return an estimator; using a slice
        returns another Pipeline instance which copies a slice of this
        Pipeline (keeping its ``verbose`` setting). This copy is shallow:
        modifying (or fitting) estimators in the sub-pipeline will affect
        the larger pipeline and vice-versa. However, replacing a value in
        `steps` will not affect a copy. Any other index is looked up as a
        step name.

        Raises
        ------
        ValueError
            If a slice has a step other than 1, or selects no steps.
        """
        if isinstance(ind, slice):
            if ind.step not in (1, None):
                raise ValueError('Pipeline slicing only supports a step of 1')
            # Carry verbose over so the sub-pipeline behaves like its parent.
            return self.__class__(self.steps[ind], verbose=self.verbose)
        try:
            _, est = self.steps[ind]
        except TypeError:
            # Not an int, try get step by name
            return self.named_steps[ind]
        return est

    @property
    def _estimator_type(self):
        """``_estimator_type`` of the last step."""
        return self.steps[-1][1]._estimator_type

    @property
    def named_steps(self):
        """Dictionary mapping each step name to its estimator."""
        return dict(self.steps)

    @property
    def _final_estimator(self):
        """Last step's estimator, with ``None`` reported as 'passthrough'."""
        estimator = self.steps[-1][1]
        return 'passthrough' if estimator is None else estimator

    def _log_message(self, step_idx):
        """Return a progress message for step ``step_idx``, or ``None``
        when ``verbose`` is false."""
        if not self.verbose:
            return None
        name = self.steps[step_idx][0]

        return '(step %d of %d) Processing %s' % (step_idx + 1,
                                                  len(self.steps),
                                                  name)

    # Estimator interface

    def _fit(self, X, y=None, **fit_params):
        """Fit and apply every step but the last.

        ``fit_params`` keys must look like ``stepname__parameter``; they are
        split on the first ``__`` and routed to the named step.

        Returns
        -------
        (Xt, final_fit_params) : tuple
            The data after all intermediate transformers, and the fit
            parameters addressed to the final step (``{}`` when the final
            step is a passthrough).

        Raises
        ------
        ValueError
            If a fit parameter name contains no ``__``.
        KeyError
            If the prefix of a fit parameter does not name a step.
        """
        # shallow copy of steps - this should really be steps_
        self.steps = list(self.steps)
        self._validate_steps()

        fit_params_steps = {name: {} for name, step in self.steps
                            if step is not None}
        for pname, pval in fit_params.items():
            if '__' not in pname:
                raise ValueError(
                    "Pipeline.fit does not accept the {} parameter. "
                    "You can pass parameters to specific steps of your "
                    "pipeline using the stepname__parameter format, e.g. "
                    "`Pipeline.fit(X, y, logisticregression__sample_weight"
                    "=sample_weight)`.".format(pname))
            step, param = pname.split('__', 1)
            fit_params_steps[step][param] = pval

        # _iter already skips None / 'passthrough' steps while keeping the
        # original positions, so step_idx indexes straight into self.steps.
        for step_idx, name, transformer in self._iter(with_final=False):
            X, fitted_transformer = _fit_transform_one(
                transformer, X, y, None,
                **fit_params_steps[name])
            # Store whatever transformer object the helper handed back.
            self.steps[step_idx] = (name, fitted_transformer)
        if self._final_estimator == 'passthrough':
            return X, {}
        return X, fit_params_steps[self.steps[-1][0]]

    def fit(self, X, y=None, **fit_params):
        """Fit the model

        Fit all the transforms one after the other and transform the
        data, then fit the transformed data using the final estimator.

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of the
            pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps of
            the pipeline.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.

        Returns
        -------
        self : Pipeline
            This estimator
        """
        Xt, fit_params = self._fit(X, y, **fit_params)
        if self._final_estimator != 'passthrough':
            self._final_estimator.fit(Xt, y, **fit_params)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit the model and transform with the final estimator

        Fits all the transforms one after the other and transforms the
        data, then uses fit_transform on transformed data with the final
        estimator (or ``fit`` followed by ``transform`` when the final
        estimator has no ``fit_transform``).

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of the
            pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps of
            the pipeline.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.

        Returns
        -------
        Xt : array-like of shape  (n_samples, n_transformed_features)
            Transformed samples
        """
        last_step = self._final_estimator
        Xt, fit_params = self._fit(X, y, **fit_params)
        if last_step == 'passthrough':
            return Xt
        if hasattr(last_step, 'fit_transform'):
            return last_step.fit_transform(Xt, y, **fit_params)
        return last_step.fit(Xt, y, **fit_params).transform(Xt)

    @property
    def transform(self):
        """Apply transforms, and transform with the final estimator

        This also works where final estimator is ``None``: all prior
        transformations are applied.

        Parameters
        ----------
        X : iterable
            Data to transform. Must fulfill input requirements of first step
            of the pipeline.

        Returns
        -------
        Xt : array-like of shape  (n_samples, n_transformed_features)
        """
        # Exposed as a property so that merely accessing ``transform``
        # raises AttributeError when the final estimator cannot transform.
        if self._final_estimator != 'passthrough':
            self._final_estimator.transform
        return self._transform

    def _transform(self, X):
        """Run ``X`` through ``transform`` of every non-passthrough step."""
        Xt = X
        for _, _, step in self._iter():
            Xt = step.transform(Xt)
        return Xt

    @property
    def classes_(self):
        """``classes_`` of the last step."""
        return self.steps[-1][-1].classes_

    @property
    def _pairwise(self):
        # check if first estimator expects pairwise input
        return getattr(self.steps[0][1], '_pairwise', False)

    @property
    def n_features_in_(self):
        # delegate to first step
        return self.steps[0][1].n_features_in_
296
297
298def _name_estimators(estimators):
299    """Generate names for estimators."""
300
301    names = [
302        estimator
303        if isinstance(estimator, str) else type(estimator).__name__.lower()
304        for estimator in estimators
305    ]
306    namecount = defaultdict(int)
307    for est, name in zip(estimators, names):
308        namecount[name] += 1
309
310    for k, v in list(namecount.items()):
311        if v == 1:
312            del namecount[k]
313
314    for i in reversed(range(len(estimators))):
315        name = names[i]
316        if name in namecount:
317            names[i] += "-%d" % namecount[name]
318            namecount[name] -= 1
319
320    return list(zip(names, estimators))
321
322
def make_pipeline(*steps, **kwargs):
    """Construct a Pipeline from the given estimators.

    Shorthand for the Pipeline constructor: the estimators are not named by
    the caller; names are generated by ``_name_estimators`` instead.

    Parameters
    ----------
    *steps : list of estimators.

    verbose : bool, default=False
        Forwarded unchanged to the ``Pipeline`` constructor.

    Returns
    -------
    p : Pipeline

    Raises
    ------
    TypeError
        If any keyword argument other than ``verbose`` is given.
    """
    # 'verbose' is the only keyword accepted; report the first stray one.
    unexpected = [key for key in kwargs if key != 'verbose']
    if unexpected:
        message = 'Unknown keyword arguments: "{}"'.format(unexpected[0])
        raise TypeError(message)

    named_steps = _name_estimators(steps)
    return Pipeline(named_steps, verbose=kwargs.get('verbose', False))
347
348
349def _transform_one(transformer, X, y, weight, **fit_params):
350    res = transformer.transform(X)
351    # if we have a weight for this transformer, multiply output
352    if weight is None:
353        return res
354    return res * weight
355
356
357def _fit_transform_one(transformer,
358                       X,
359                       y,
360                       weight,
361                       **fit_params):
362    """
363    Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned
364    with the fitted transformer. If ``weight`` is not ``None``, the result will
365    be multiplied by ``weight``.
366    """
367    if hasattr(transformer, 'fit_transform'):
368        res = transformer.fit_transform(X, y, **fit_params)
369    else:
370        res = transformer.fit(X, y, **fit_params).transform(X)
371
372    if weight is None:
373        return res, transformer
374    return res * weight, transformer
375