1import numpy as np
2from sklearn.ensemble import RandomForestRegressor as _sk_RandomForestRegressor
3from sklearn.ensemble import ExtraTreesRegressor as _sk_ExtraTreesRegressor
4
5
6def _return_std(X, trees, predictions, min_variance):
7    """
8    Returns `std(Y | X)`.
9
10    Can be calculated by E[Var(Y | Tree)] + Var(E[Y | Tree]) where
11    P(Tree) is `1 / len(trees)`.
12
13    Parameters
14    ----------
15    X : array-like, shape=(n_samples, n_features)
16        Input data.
17
18    trees : list, shape=(n_estimators,)
19        List of fit sklearn trees as obtained from the ``estimators_``
20        attribute of a fit RandomForestRegressor or ExtraTreesRegressor.
21
22    predictions : array-like, shape=(n_samples,)
23        Prediction of each data point as returned by RandomForestRegressor
24        or ExtraTreesRegressor.
25
26    Returns
27    -------
28    std : array-like, shape=(n_samples,)
29        Standard deviation of `y` at `X`. If criterion
30        is set to "mse", then `std[i] ~= std(y | X[i])`.
31
32    """
33    # This derives std(y | x) as described in 4.3.2 of arXiv:1211.0906
34    std = np.zeros(len(X))
35
36    for tree in trees:
37        var_tree = tree.tree_.impurity[tree.apply(X)]
38
39        # This rounding off is done in accordance with the
40        # adjustment done in section 4.3.3
41        # of http://arxiv.org/pdf/1211.0906v2.pdf to account
42        # for cases such as leaves with 1 sample in which there
43        # is zero variance.
44        var_tree[var_tree < min_variance] = min_variance
45        mean_tree = tree.predict(X)
46        std += var_tree + mean_tree ** 2
47
48    std /= len(trees)
49    std -= predictions ** 2.0
50    std[std < 0.0] = 0.0
51    std = std ** 0.5
52    return std
53
54
55class RandomForestRegressor(_sk_RandomForestRegressor):
56    """
57    RandomForestRegressor that supports conditional std computation.
58
59    Parameters
60    ----------
61    n_estimators : integer, optional (default=10)
62        The number of trees in the forest.
63
64    criterion : string, optional (default="mse")
65        The function to measure the quality of a split. Supported criteria
66        are "mse" for the mean squared error, which is equal to variance
67        reduction as feature selection criterion, and "mae" for the mean
68        absolute error.
69
70    max_features : int, float, string or None, optional (default="auto")
71        The number of features to consider when looking for the best split:
72
73        - If int, then consider `max_features` features at each split.
74        - If float, then `max_features` is a percentage and
75          `int(max_features * n_features)` features are considered at each
76          split.
77        - If "auto", then `max_features=n_features`.
78        - If "sqrt", then `max_features=sqrt(n_features)`.
79        - If "log2", then `max_features=log2(n_features)`.
80        - If None, then `max_features=n_features`.
81
82        .. note::
83            The search for a split does not stop until at least one
84            valid partition of the node samples is found, even if it
85            requires to effectively inspect more than ``max_features``
86            features.
87
88    max_depth : integer or None, optional (default=None)
89        The maximum depth of the tree. If None, then nodes are expanded until
90        all leaves are pure or until all leaves contain less than
91        min_samples_split samples.
92
93    min_samples_split : int, float, optional (default=2)
94        The minimum number of samples required to split an internal node:
95
96        - If int, then consider `min_samples_split` as the minimum number.
97        - If float, then `min_samples_split` is a percentage and
98          `ceil(min_samples_split * n_samples)` are the minimum
99          number of samples for each split.
100
101    min_samples_leaf : int, float, optional (default=1)
102        The minimum number of samples required to be at a leaf node:
103
104        - If int, then consider `min_samples_leaf` as the minimum number.
105        - If float, then `min_samples_leaf` is a percentage and
106          `ceil(min_samples_leaf * n_samples)` are the minimum
107          number of samples for each node.
108
109    min_weight_fraction_leaf : float, optional (default=0.)
110        The minimum weighted fraction of the sum total of weights (of all
111        the input samples) required to be at a leaf node. Samples have
112        equal weight when sample_weight is not provided.
113
114    max_leaf_nodes : int or None, optional (default=None)
115        Grow trees with ``max_leaf_nodes`` in best-first fashion.
116        Best nodes are defined as relative reduction in impurity.
117        If None then unlimited number of leaf nodes.
118
119    min_impurity_decrease : float, optional (default=0.)
120        A node will be split if this split induces a decrease of the impurity
121        greater than or equal to this value.
122        The weighted impurity decrease equation is the following::
123
124            N_t / N * (impurity - N_t_R / N_t * right_impurity
125                                - N_t_L / N_t * left_impurity)
126
127        where ``N`` is the total number of samples, ``N_t`` is the number of
128        samples at the current node, ``N_t_L`` is the number of samples in the
129        left child, and ``N_t_R`` is the number of samples in the right child.
130        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
131        if ``sample_weight`` is passed.
132
133    bootstrap : boolean, optional (default=True)
134        Whether bootstrap samples are used when building trees.
135
136    oob_score : bool, optional (default=False)
137        whether to use out-of-bag samples to estimate
138        the R^2 on unseen data.
139
140    n_jobs : integer, optional (default=1)
141        The number of jobs to run in parallel for both `fit` and `predict`.
142        If -1, then the number of jobs is set to the number of cores.
143
144    random_state : int, RandomState instance or None, optional (default=None)
145        If int, random_state is the seed used by the random number generator;
146        If RandomState instance, random_state is the random number generator;
147        If None, the random number generator is the RandomState instance used
148        by `np.random`.
149
150    verbose : int, optional (default=0)
151        Controls the verbosity of the tree building process.
152
153    warm_start : bool, optional (default=False)
154        When set to ``True``, reuse the solution of the previous call to fit
155        and add more estimators to the ensemble, otherwise, just fit a whole
156        new forest.
157
158    Attributes
159    ----------
160    estimators_ : list of DecisionTreeRegressor
161        The collection of fitted sub-estimators.
162
163    feature_importances_ : array of shape = [n_features]
164        The feature importances (the higher, the more important the feature).
165
166    n_features_ : int
167        The number of features when ``fit`` is performed.
168
169    n_outputs_ : int
170        The number of outputs when ``fit`` is performed.
171
172    oob_score_ : float
173        Score of the training dataset obtained using an out-of-bag estimate.
174
175    oob_prediction_ : array of shape = [n_samples]
176        Prediction computed with out-of-bag estimate on the training set.
177
178    Notes
179    -----
180    The default values for the parameters controlling the size of the trees
181    (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
182    unpruned trees which can potentially be very large on some data sets. To
183    reduce memory consumption, the complexity and size of the trees should be
184    controlled by setting those parameter values.
185    The features are always randomly permuted at each split. Therefore,
186    the best found split may vary, even with the same training data,
187    ``max_features=n_features`` and ``bootstrap=False``, if the improvement
188    of the criterion is identical for several splits enumerated during the
189    search of the best split. To obtain a deterministic behaviour during
190    fitting, ``random_state`` has to be fixed.
191
192    References
193    ----------
194    .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.
195
196    """
197    def __init__(self, n_estimators=10, criterion='mse', max_depth=None,
198                 min_samples_split=2, min_samples_leaf=1,
199                 min_weight_fraction_leaf=0.0, max_features='auto',
200                 max_leaf_nodes=None, min_impurity_decrease=0.,
201                 bootstrap=True, oob_score=False,
202                 n_jobs=1, random_state=None, verbose=0, warm_start=False,
203                 min_variance=0.0):
204        self.min_variance = min_variance
205        super(RandomForestRegressor, self).__init__(
206            n_estimators=n_estimators, criterion=criterion,
207            max_depth=max_depth,
208            min_samples_split=min_samples_split,
209            min_samples_leaf=min_samples_leaf,
210            min_weight_fraction_leaf=min_weight_fraction_leaf,
211            max_features=max_features, max_leaf_nodes=max_leaf_nodes,
212            min_impurity_decrease=min_impurity_decrease,
213            bootstrap=bootstrap, oob_score=oob_score,
214            n_jobs=n_jobs, random_state=random_state,
215            verbose=verbose, warm_start=warm_start)
216
217    def predict(self, X, return_std=False):
218        """Predict continuous output for X.
219
220        Parameters
221        ----------
222        X : array of shape = (n_samples, n_features)
223            Input data.
224
225        return_std : boolean
226            Whether or not to return the standard deviation.
227
228        Returns
229        -------
230        predictions : array-like of shape = (n_samples,)
231            Predicted values for X. If criterion is set to "mse",
232            then `predictions[i] ~= mean(y | X[i])`.
233
234        std : array-like of shape=(n_samples,)
235            Standard deviation of `y` at `X`. If criterion
236            is set to "mse", then `std[i] ~= std(y | X[i])`.
237
238        """
239        mean = super(RandomForestRegressor, self).predict(X)
240
241        if return_std:
242            if self.criterion != "mse":
243                raise ValueError(
244                    "Expected impurity to be 'mse', got %s instead"
245                    % self.criterion)
246            std = _return_std(X, self.estimators_, mean, self.min_variance)
247            return mean, std
248        return mean
249
250
251class ExtraTreesRegressor(_sk_ExtraTreesRegressor):
252    """
253    ExtraTreesRegressor that supports conditional standard deviation.
254
255    Parameters
256    ----------
257    n_estimators : integer, optional (default=10)
258        The number of trees in the forest.
259
260    criterion : string, optional (default="mse")
261        The function to measure the quality of a split. Supported criteria
262        are "mse" for the mean squared error, which is equal to variance
263        reduction as feature selection criterion, and "mae" for the mean
264        absolute error.
265
266    max_features : int, float, string or None, optional (default="auto")
267        The number of features to consider when looking for the best split:
268
269        - If int, then consider `max_features` features at each split.
270        - If float, then `max_features` is a percentage and
271          `int(max_features * n_features)` features are considered at each
272          split.
273        - If "auto", then `max_features=n_features`.
274        - If "sqrt", then `max_features=sqrt(n_features)`.
275        - If "log2", then `max_features=log2(n_features)`.
276        - If None, then `max_features=n_features`.
277
278        .. note::
279            The search for a split does not stop until at least one
280            valid partition of the node samples is found, even if it
281            requires to effectively inspect more than ``max_features``
282            features.
283
284    max_depth : integer or None, optional (default=None)
285        The maximum depth of the tree. If None, then nodes are expanded until
286        all leaves are pure or until all leaves contain less than
287        min_samples_split samples.
288
289    min_samples_split : int, float, optional (default=2)
290        The minimum number of samples required to split an internal node:
291
292        - If int, then consider `min_samples_split` as the minimum number.
293        - If float, then `min_samples_split` is a percentage and
294          `ceil(min_samples_split * n_samples)` are the minimum
295          number of samples for each split.
296
297    min_samples_leaf : int, float, optional (default=1)
298        The minimum number of samples required to be at a leaf node:
299
300        - If int, then consider `min_samples_leaf` as the minimum number.
301        - If float, then `min_samples_leaf` is a percentage and
302          `ceil(min_samples_leaf * n_samples)` are the minimum
303          number of samples for each node.
304
305    min_weight_fraction_leaf : float, optional (default=0.)
306        The minimum weighted fraction of the sum total of weights (of all
307        the input samples) required to be at a leaf node. Samples have
308        equal weight when sample_weight is not provided.
309
310    max_leaf_nodes : int or None, optional (default=None)
311        Grow trees with ``max_leaf_nodes`` in best-first fashion.
312        Best nodes are defined as relative reduction in impurity.
313        If None then unlimited number of leaf nodes.
314
315    min_impurity_decrease : float, optional (default=0.)
316        A node will be split if this split induces a decrease of the impurity
317        greater than or equal to this value.
318        The weighted impurity decrease equation is the following::
319
320            N_t / N * (impurity - N_t_R / N_t * right_impurity
321                                - N_t_L / N_t * left_impurity)
322
323        where ``N`` is the total number of samples, ``N_t`` is the number of
324        samples at the current node, ``N_t_L`` is the number of samples in the
325        left child, and ``N_t_R`` is the number of samples in the right child.
326        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
327        if ``sample_weight`` is passed.
328
329    bootstrap : boolean, optional (default=True)
330        Whether bootstrap samples are used when building trees.
331
332    oob_score : bool, optional (default=False)
333        whether to use out-of-bag samples to estimate
334        the R^2 on unseen data.
335
336    n_jobs : integer, optional (default=1)
337        The number of jobs to run in parallel for both `fit` and `predict`.
338        If -1, then the number of jobs is set to the number of cores.
339
340    random_state : int, RandomState instance or None, optional (default=None)
341        If int, random_state is the seed used by the random number generator;
342        If RandomState instance, random_state is the random number generator;
343        If None, the random number generator is the RandomState instance used
344        by `np.random`.
345
346    verbose : int, optional (default=0)
347        Controls the verbosity of the tree building process.
348
349    warm_start : bool, optional (default=False)
350        When set to ``True``, reuse the solution of the previous call to fit
351        and add more estimators to the ensemble, otherwise, just fit a whole
352        new forest.
353
354    Attributes
355    ----------
356    estimators_ : list of DecisionTreeRegressor
357        The collection of fitted sub-estimators.
358
359    feature_importances_ : array of shape = [n_features]
360        The feature importances (the higher, the more important the feature).
361
362    n_features_ : int
363        The number of features when ``fit`` is performed.
364
365    n_outputs_ : int
366        The number of outputs when ``fit`` is performed.
367
368    oob_score_ : float
369        Score of the training dataset obtained using an out-of-bag estimate.
370
371    oob_prediction_ : array of shape = [n_samples]
372        Prediction computed with out-of-bag estimate on the training set.
373
374    Notes
375    -----
376    The default values for the parameters controlling the size of the trees
377    (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
378    unpruned trees which can potentially be very large on some data sets. To
379    reduce memory consumption, the complexity and size of the trees should be
380    controlled by setting those parameter values.
381    The features are always randomly permuted at each split. Therefore,
382    the best found split may vary, even with the same training data,
383    ``max_features=n_features`` and ``bootstrap=False``, if the improvement
384    of the criterion is identical for several splits enumerated during the
385    search of the best split. To obtain a deterministic behaviour during
386    fitting, ``random_state`` has to be fixed.
387
388    References
389    ----------
390    .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.
391
392    """
393    def __init__(self, n_estimators=10, criterion='mse', max_depth=None,
394                 min_samples_split=2, min_samples_leaf=1,
395                 min_weight_fraction_leaf=0.0, max_features='auto',
396                 max_leaf_nodes=None, min_impurity_decrease=0.,
397                 bootstrap=False, oob_score=False,
398                 n_jobs=1, random_state=None, verbose=0, warm_start=False,
399                 min_variance=0.0):
400        self.min_variance = min_variance
401        super(ExtraTreesRegressor, self).__init__(
402            n_estimators=n_estimators, criterion=criterion,
403            max_depth=max_depth,
404            min_samples_split=min_samples_split,
405            min_samples_leaf=min_samples_leaf,
406            min_weight_fraction_leaf=min_weight_fraction_leaf,
407            max_features=max_features, max_leaf_nodes=max_leaf_nodes,
408            min_impurity_decrease=min_impurity_decrease,
409            bootstrap=bootstrap, oob_score=oob_score,
410            n_jobs=n_jobs, random_state=random_state,
411            verbose=verbose, warm_start=warm_start)
412
413    def predict(self, X, return_std=False):
414        """
415        Predict continuous output for X.
416
417        Parameters
418        ----------
419        X : array-like of shape=(n_samples, n_features)
420            Input data.
421
422        return_std : boolean
423            Whether or not to return the standard deviation.
424
425        Returns
426        -------
427        predictions : array-like of shape=(n_samples,)
428            Predicted values for X. If criterion is set to "mse",
429            then `predictions[i] ~= mean(y | X[i])`.
430
431        std : array-like of shape=(n_samples,)
432            Standard deviation of `y` at `X`. If criterion
433            is set to "mse", then `std[i] ~= std(y | X[i])`.
434        """
435        mean = super(ExtraTreesRegressor, self).predict(X)
436
437        if return_std:
438            if self.criterion != "mse":
439                raise ValueError(
440                    "Expected impurity to be 'mse', got %s instead"
441                    % self.criterion)
442            std = _return_std(X, self.estimators_, mean, self.min_variance)
443            return mean, std
444
445        return mean
446