import numpy as np
from sklearn.ensemble import RandomForestRegressor as _sk_RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor as _sk_ExtraTreesRegressor


def _return_std(X, trees, predictions, min_variance):
    """
    Returns `std(Y | X)`.

    The variance `Var(Y | X)` is computed as
    `E[Var(Y | Tree)] + Var(E[Y | Tree])`, where `P(Tree)` is
    `1 / len(trees)`; the returned value is its square root.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        Input data.

    trees : list, shape=(n_estimators,)
        List of fit sklearn trees as obtained from the ``estimators_``
        attribute of a fit RandomForestRegressor or ExtraTreesRegressor.

    predictions : array-like, shape=(n_samples,)
        Prediction of each data point as returned by RandomForestRegressor
        or ExtraTreesRegressor.

    min_variance : float
        Lower bound on the variance of each leaf. Leaf variances below
        this value are clipped to it before aggregation.

    Returns
    -------
    std : array-like, shape=(n_samples,)
        Standard deviation of `y` at `X`. If criterion
        is set to "mse", then `std[i] ~= std(y | X[i])`.

    """
    # This derives std(y | x) as described in 4.3.2 of arXiv:1211.0906
    std = np.zeros(len(X))

    for tree in trees:
        # Variance of the training targets in the leaf that each row of X
        # falls into, read off the tree's stored impurities.
        var_tree = tree.tree_.impurity[tree.apply(X)]

        # This clipping is done in accordance with the adjustment in
        # section 4.3.3 of http://arxiv.org/pdf/1211.0906v2.pdf to
        # account for cases such as leaves with 1 sample, in which
        # there is zero variance.
        var_tree[var_tree < min_variance] = min_variance
        mean_tree = tree.predict(X)
        # Accumulate E[Var(Y | Tree)] + E[E[Y | Tree]^2] tree by tree.
        std += var_tree + mean_tree ** 2

    std /= len(trees)
    # Subtracting E[Y | X]^2 turns the accumulator into Var(Y | X);
    # clip negative values caused by floating-point error before sqrt.
    std -= predictions ** 2.0
    std[std < 0.0] = 0.0
    std = std ** 0.5
    return std
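

# A minimal sketch (not part of the module's API) of the identity used in
# ``_return_std`` above. For a uniform mixture over trees,
#
#     Var(Y) = E[Var(Y | Tree)] + Var(E[Y | Tree])
#            = mean(var_tree + mean_tree**2) - mean(mean_tree)**2,
#
# which is exactly what the accumulator computes. The numbers below are
# made up purely for illustration:
#
#     >>> mean_trees = np.array([1.0, 2.0, 3.0])   # E[Y | Tree] per tree
#     >>> var_trees = np.array([0.5, 0.25, 0.75])  # Var(Y | Tree) per tree
#     >>> lhs = np.mean(var_trees + mean_trees ** 2) - np.mean(mean_trees) ** 2
#     >>> rhs = np.mean(var_trees) + np.var(mean_trees)
#     >>> bool(np.isclose(lhs, rhs))
#     True

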
class RandomForestRegressor(_sk_RandomForestRegressor):
    """
    RandomForestRegressor that supports conditional std computation.

    Parameters
    ----------
    n_estimators : integer, optional (default=10)
        The number of trees in the forest.

    criterion : string, optional (default="mse")
        The function to measure the quality of a split. Supported criteria
        are "mse" for the mean squared error, which is equal to variance
        reduction as feature selection criterion, and "mae" for the mean
        absolute error.

    max_features : int, float, string or None, optional (default="auto")
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a percentage and
          `int(max_features * n_features)` features are considered at each
          split.
        - If "auto", then `max_features=n_features`.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        .. note::
            The search for a split does not stop until at least one
            valid partition of the node samples is found, even if it
            requires effectively inspecting more than ``max_features``
            features.

    max_depth : integer or None, optional (default=None)
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.

    min_samples_split : int, float, optional (default=2)
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a percentage and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.

    min_samples_leaf : int, float, optional (default=1)
        The minimum number of samples required to be at a leaf node:

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a percentage and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.

    min_weight_fraction_leaf : float, optional (default=0.)
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.

    max_leaf_nodes : int or None, optional (default=None)
        Grow trees with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.

    min_impurity_decrease : float, optional (default=0.)
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.
        The weighted impurity decrease equation is the following::

            N_t / N * (impurity - N_t_R / N_t * right_impurity
                                - N_t_L / N_t * left_impurity)

        where ``N`` is the total number of samples, ``N_t`` is the number of
        samples at the current node, ``N_t_L`` is the number of samples in the
        left child, and ``N_t_R`` is the number of samples in the right child.
        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
        if ``sample_weight`` is passed.

    bootstrap : boolean, optional (default=True)
        Whether bootstrap samples are used when building trees.

    oob_score : bool, optional (default=False)
        Whether to use out-of-bag samples to estimate
        the R^2 on unseen data.

    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : int, optional (default=0)
        Controls the verbosity of the tree building process.

    warm_start : bool, optional (default=False)
        When set to ``True``, reuse the solution of the previous call to fit
        and add more estimators to the ensemble, otherwise, just fit a whole
        new forest.

    min_variance : float, optional (default=0.0)
        Lower bound on the variance of each leaf used when computing the
        conditional standard deviation in ``predict``. Leaf variances
        below this value are clipped to it.

    Attributes
    ----------
    estimators_ : list of DecisionTreeRegressor
        The collection of fitted sub-estimators.

    feature_importances_ : array of shape = [n_features]
        The feature importances (the higher, the more important the feature).

    n_features_ : int
        The number of features when ``fit`` is performed.

    n_outputs_ : int
        The number of outputs when ``fit`` is performed.

    oob_score_ : float
        Score of the training dataset obtained using an out-of-bag estimate.

    oob_prediction_ : array of shape = [n_samples]
        Prediction computed with out-of-bag estimate on the training set.

    Notes
    -----
    The default values for the parameters controlling the size of the trees
    (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
    unpruned trees which can potentially be very large on some data sets. To
    reduce memory consumption, the complexity and size of the trees should be
    controlled by setting those parameter values.

    The features are always randomly permuted at each split. Therefore,
    the best found split may vary, even with the same training data,
    ``max_features=n_features`` and ``bootstrap=False``, if the improvement
    of the criterion is identical for several splits enumerated during the
    search of the best split. To obtain a deterministic behaviour during
    fitting, ``random_state`` has to be fixed.

    References
    ----------
    .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.

    """
    def __init__(self, n_estimators=10, criterion='mse', max_depth=None,
                 min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features='auto',
                 max_leaf_nodes=None, min_impurity_decrease=0.,
                 bootstrap=True, oob_score=False,
                 n_jobs=1, random_state=None, verbose=0, warm_start=False,
                 min_variance=0.0):
        self.min_variance = min_variance
        super(RandomForestRegressor, self).__init__(
            n_estimators=n_estimators, criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features, max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            bootstrap=bootstrap, oob_score=oob_score,
            n_jobs=n_jobs, random_state=random_state,
            verbose=verbose, warm_start=warm_start)

    def predict(self, X, return_std=False):
        """Predict continuous output for X.

        Parameters
        ----------
        X : array of shape = (n_samples, n_features)
            Input data.

        return_std : boolean
            Whether or not to return the standard deviation.

        Returns
        -------
        predictions : array-like of shape = (n_samples,)
            Predicted values for X. If criterion is set to "mse",
            then `predictions[i] ~= mean(y | X[i])`.

        std : array-like of shape=(n_samples,)
            Standard deviation of `y` at `X`. If criterion
            is set to "mse", then `std[i] ~= std(y | X[i])`.
            Returned only if `return_std` is True.

        """
        mean = super(RandomForestRegressor, self).predict(X)

        if return_std:
            if self.criterion != "mse":
                raise ValueError(
                    "Expected criterion to be 'mse', got %s instead"
                    % self.criterion)
            std = _return_std(X, self.estimators_, mean, self.min_variance)
            return mean, std
        return mean
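

# A usage sketch, illustrative only: the synthetic data below is made up,
# and it assumes an sklearn version that accepts the "mse" criterion used
# by the defaults above.
#
#     >>> rng = np.random.RandomState(0)
#     >>> X = rng.rand(100, 3)
#     >>> y = X[:, 0] + 0.1 * rng.randn(100)
#     >>> model = RandomForestRegressor(n_estimators=50, random_state=0)
#     >>> mean, std = model.fit(X, y).predict(X[:5], return_std=True)
#     >>> mean.shape, std.shape
#     ((5,), (5,))

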
class ExtraTreesRegressor(_sk_ExtraTreesRegressor):
    """
    ExtraTreesRegressor that supports conditional standard deviation.

    Parameters
    ----------
    n_estimators : integer, optional (default=10)
        The number of trees in the forest.

    criterion : string, optional (default="mse")
        The function to measure the quality of a split. Supported criteria
        are "mse" for the mean squared error, which is equal to variance
        reduction as feature selection criterion, and "mae" for the mean
        absolute error.

    max_features : int, float, string or None, optional (default="auto")
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a percentage and
          `int(max_features * n_features)` features are considered at each
          split.
        - If "auto", then `max_features=n_features`.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        .. note::
            The search for a split does not stop until at least one
            valid partition of the node samples is found, even if it
            requires effectively inspecting more than ``max_features``
            features.

    max_depth : integer or None, optional (default=None)
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.

    min_samples_split : int, float, optional (default=2)
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a percentage and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.

    min_samples_leaf : int, float, optional (default=1)
        The minimum number of samples required to be at a leaf node:

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a percentage and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.

    min_weight_fraction_leaf : float, optional (default=0.)
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.

    max_leaf_nodes : int or None, optional (default=None)
        Grow trees with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.

    min_impurity_decrease : float, optional (default=0.)
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.
        The weighted impurity decrease equation is the following::

            N_t / N * (impurity - N_t_R / N_t * right_impurity
                                - N_t_L / N_t * left_impurity)

        where ``N`` is the total number of samples, ``N_t`` is the number of
        samples at the current node, ``N_t_L`` is the number of samples in the
        left child, and ``N_t_R`` is the number of samples in the right child.
        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
        if ``sample_weight`` is passed.

    bootstrap : boolean, optional (default=False)
        Whether bootstrap samples are used when building trees.

    oob_score : bool, optional (default=False)
        Whether to use out-of-bag samples to estimate
        the R^2 on unseen data.

    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : int, optional (default=0)
        Controls the verbosity of the tree building process.

    warm_start : bool, optional (default=False)
        When set to ``True``, reuse the solution of the previous call to fit
        and add more estimators to the ensemble, otherwise, just fit a whole
        new forest.

    min_variance : float, optional (default=0.0)
        Lower bound on the variance of each leaf used when computing the
        conditional standard deviation in ``predict``. Leaf variances
        below this value are clipped to it.

    Attributes
    ----------
    estimators_ : list of DecisionTreeRegressor
        The collection of fitted sub-estimators.

    feature_importances_ : array of shape = [n_features]
        The feature importances (the higher, the more important the feature).

    n_features_ : int
        The number of features when ``fit`` is performed.

    n_outputs_ : int
        The number of outputs when ``fit`` is performed.

    oob_score_ : float
        Score of the training dataset obtained using an out-of-bag estimate.

    oob_prediction_ : array of shape = [n_samples]
        Prediction computed with out-of-bag estimate on the training set.

    Notes
    -----
    The default values for the parameters controlling the size of the trees
    (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
    unpruned trees which can potentially be very large on some data sets. To
    reduce memory consumption, the complexity and size of the trees should be
    controlled by setting those parameter values.

    The features are always randomly permuted at each split. Therefore,
    the best found split may vary, even with the same training data,
    ``max_features=n_features`` and ``bootstrap=False``, if the improvement
    of the criterion is identical for several splits enumerated during the
    search of the best split. To obtain a deterministic behaviour during
    fitting, ``random_state`` has to be fixed.

    References
    ----------
    .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.

    """
    def __init__(self, n_estimators=10, criterion='mse', max_depth=None,
                 min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features='auto',
                 max_leaf_nodes=None, min_impurity_decrease=0.,
                 bootstrap=False, oob_score=False,
                 n_jobs=1, random_state=None, verbose=0, warm_start=False,
                 min_variance=0.0):
        self.min_variance = min_variance
        super(ExtraTreesRegressor, self).__init__(
            n_estimators=n_estimators, criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features, max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            bootstrap=bootstrap, oob_score=oob_score,
            n_jobs=n_jobs, random_state=random_state,
            verbose=verbose, warm_start=warm_start)

    def predict(self, X, return_std=False):
        """
        Predict continuous output for X.

        Parameters
        ----------
        X : array-like of shape=(n_samples, n_features)
            Input data.

        return_std : boolean
            Whether or not to return the standard deviation.

        Returns
        -------
        predictions : array-like of shape=(n_samples,)
            Predicted values for X. If criterion is set to "mse",
            then `predictions[i] ~= mean(y | X[i])`.

        std : array-like of shape=(n_samples,)
            Standard deviation of `y` at `X`. If criterion
            is set to "mse", then `std[i] ~= std(y | X[i])`.
            Returned only if `return_std` is True.

        """
        mean = super(ExtraTreesRegressor, self).predict(X)

        if return_std:
            if self.criterion != "mse":
                raise ValueError(
                    "Expected criterion to be 'mse', got %s instead"
                    % self.criterion)
            std = _return_std(X, self.estimators_, mean, self.min_variance)
            return mean, std

        return mean
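

if __name__ == "__main__":
    # Self-contained sketch (illustrative, not part of the library): shows
    # how ``min_variance`` floors the per-leaf variance. With fully grown
    # trees, single-sample leaves have zero impurity, so a positive floor
    # guarantees std >= sqrt(min_variance) at every query point. The
    # synthetic data and parameter values below are made up for this demo.
    rng = np.random.RandomState(1)
    X = rng.rand(50, 2)
    y = rng.rand(50)

    et = ExtraTreesRegressor(n_estimators=20, random_state=1,
                             min_variance=0.05).fit(X, y)
    _, std = et.predict(X[:5], return_std=True)
    # Small tolerance for floating-point error in the aggregation.
    assert np.all(std >= np.sqrt(0.05) - 1e-12)
    print("std with min_variance=0.05:", std)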