"""An implementation of `An Efficient Approach for Assessing Hyperparameter Importance`.

See http://proceedings.mlr.press/v32/hutter14.pdf and https://automl.github.io/fanova/cite.html
for how to cite the original work.

This implementation is inspired by the efficient algorithm in
`fanova` (https://github.com/automl/fanova) and
`pyrfr` (https://github.com/automl/random_forest_run) by the original authors.

Differences include relying on scikit-learn to fit random forests
(`sklearn.ensemble.RandomForestRegressor`) and that it is otherwise written entirely in Python.
This stands in contrast to the original implementation, which is partially written in C++.
Since Python runtime overhead may become noticeable, several optimizations are included
instead, e.g. vectorized NumPy functions to compute the marginals rather than keeping
running statistics. Known slow cases include assessing higher-order importances
(e.g. pairwise importances), because the number of partitions to visit grows exponentially,
and assessing categorical features with a large number of choices, because each choice is
given a unique one-hot encoded raw feature.
19 20""" 21 22import itertools 23from typing import Dict 24from typing import List 25from typing import Optional 26from typing import Tuple 27from typing import Union 28 29import numpy 30 31from optuna._imports import try_import 32from optuna.importance._fanova._tree import _FanovaTree 33 34 35with try_import() as _imports: 36 from sklearn.ensemble import RandomForestRegressor 37 38 39class _Fanova(object): 40 def __init__( 41 self, 42 n_trees: int, 43 max_depth: int, 44 min_samples_split: Union[int, float], 45 min_samples_leaf: Union[int, float], 46 seed: Optional[int], 47 ) -> None: 48 _imports.check() 49 50 self._forest = RandomForestRegressor( 51 n_estimators=n_trees, 52 max_depth=max_depth, 53 min_samples_split=min_samples_split, 54 min_samples_leaf=min_samples_leaf, 55 random_state=seed, 56 ) 57 self._trees: Optional[List[_FanovaTree]] = None 58 self._variances: Optional[Dict[Tuple[int, ...], numpy.ndarray]] = None 59 self._column_to_encoded_columns: Optional[List[numpy.ndarray]] = None 60 61 def fit( 62 self, 63 X: numpy.ndarray, 64 y: numpy.ndarray, 65 search_spaces: numpy.ndarray, 66 column_to_encoded_columns: List[numpy.ndarray], 67 ) -> None: 68 assert X.shape[0] == y.shape[0] 69 assert X.shape[1] == search_spaces.shape[0] 70 assert search_spaces.shape[1] == 2 71 72 self._forest.fit(X, y) 73 74 self._trees = [_FanovaTree(e.tree_, search_spaces) for e in self._forest.estimators_] 75 self._column_to_encoded_columns = column_to_encoded_columns 76 self._variances = {} 77 78 if all(tree.variance == 0 for tree in self._trees): 79 # If all trees have 0 variance, we cannot assess any importances. 80 # This could occur if for instance `X.shape[0] == 1`. 81 raise RuntimeError("Encountered zero total variance in all trees.") 82 83 def get_importance(self, features: Tuple[int, ...]) -> Tuple[float, float]: 84 # Assert that `fit` has been called. 
85 assert self._trees is not None 86 assert self._variances is not None 87 88 self._compute_variances(features) 89 90 fractions: Union[List[float], numpy.ndarray] = [] 91 92 for tree_index, tree in enumerate(self._trees): 93 tree_variance = tree.variance 94 if tree_variance > 0.0: 95 fraction = self._variances[features][tree_index] / tree_variance 96 fractions = numpy.append(fractions, fraction) 97 98 fractions = numpy.asarray(fractions) 99 100 return float(fractions.mean()), float(fractions.std()) 101 102 def _compute_variances(self, features: Tuple[int, ...]) -> None: 103 assert self._trees is not None 104 assert self._variances is not None 105 assert self._column_to_encoded_columns is not None 106 107 if features in self._variances: 108 return 109 110 for k in range(1, len(features)): 111 for sub_features in itertools.combinations(features, k): 112 if sub_features not in self._variances: 113 self._compute_variances(sub_features) 114 115 raw_features = numpy.concatenate([self._column_to_encoded_columns[f] for f in features]) 116 117 variances = numpy.empty(len(self._trees), dtype=numpy.float64) 118 119 for tree_index, tree in enumerate(self._trees): 120 marginal_variance = tree.get_marginal_variance(raw_features) 121 122 # See `fANOVA.__compute_marginals` in 123 # https://github.com/automl/fanova/blob/master/fanova/fanova.py. 124 for k in range(1, len(features)): 125 for sub_features in itertools.combinations(features, k): 126 marginal_variance -= self._variances[sub_features][tree_index] 127 128 variances[tree_index] = numpy.clip(marginal_variance, 0.0, numpy.inf) 129 130 self._variances[features] = variances 131