1"""An implementation of `An Efficient Approach for Assessing Hyperparameter Importance`.
2
3See http://proceedings.mlr.press/v32/hutter14.pdf and https://automl.github.io/fanova/cite.html
4for how to cite the original work.
5
6This implementation is inspired by the efficient algorithm in
7`fanova` (https://github.com/automl/fanova) and
8`pyrfr` (https://github.com/automl/random_forest_run) by the original authors.
9
10Differences include relying on scikit-learn to fit random forests
11(`sklearn.ensemble.RandomForestRegressor`) and that it is otherwise written entirely in Python.
This stands in contrast to the original implementation, which is partially written in C++.
Since Python runtime overhead may become noticeable, several optimizations are included
instead, e.g. vectorized NumPy functions to compute the marginals rather than keeping
running statistics. Some cases may nevertheless remain slow: assessing higher-order
importances, e.g. pairwise importances, because the number of partitions to visit grows
exponentially, and assessing categorical features with a large number of choices, because
each choice is given a unique one-hot encoded raw feature.
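
A minimal usage sketch (the shapes and values below are illustrative only and not part
of any public API):

    import numpy

    # Illustrative data only. Two parameters: parameter 0 is numerical (raw column 0)
    # and parameter 1 is categorical with two choices, one-hot encoded into raw
    # columns 1 and 2.
    X = numpy.array([[0.1, 1.0, 0.0], [0.9, 0.0, 1.0], [0.5, 1.0, 0.0]])
    y = numpy.array([1.0, 2.0, 1.5])
    search_spaces = numpy.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0]])
    column_to_encoded_columns = [numpy.array([0]), numpy.array([1, 2])]

    fanova = _Fanova(
        n_trees=16, max_depth=64, min_samples_split=2, min_samples_leaf=1, seed=0
    )
    fanova.fit(X, y, search_spaces, column_to_encoded_columns)
    mean_importance, std_importance = fanova.get_importance((1,))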

"""

import itertools
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

import numpy

from optuna._imports import try_import
from optuna.importance._fanova._tree import _FanovaTree


with try_import() as _imports:
    from sklearn.ensemble import RandomForestRegressor


class _Fanova(object):
    def __init__(
        self,
        n_trees: int,
        max_depth: int,
        min_samples_split: Union[int, float],
        min_samples_leaf: Union[int, float],
        seed: Optional[int],
    ) -> None:
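        """Set up the random-forest surrogate used for the fANOVA decomposition.

        The parameters are forwarded to ``sklearn.ensemble.RandomForestRegressor``
        (``n_trees`` as ``n_estimators`` and ``seed`` as ``random_state``).
        """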
        _imports.check()

        self._forest = RandomForestRegressor(
            n_estimators=n_trees,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=seed,
        )
        self._trees: Optional[List[_FanovaTree]] = None
        self._variances: Optional[Dict[Tuple[int, ...], numpy.ndarray]] = None
        self._column_to_encoded_columns: Optional[List[numpy.ndarray]] = None

    def fit(
        self,
        X: numpy.ndarray,
        y: numpy.ndarray,
        search_spaces: numpy.ndarray,
        column_to_encoded_columns: List[numpy.ndarray],
    ) -> None:
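        """Fit the forest and prepare the per-tree statistics.

        Expected shapes, as enforced by the assertions below: ``X`` is
        ``(n_samples, n_raw_features)`` with categorical parameters already one-hot
        encoded into raw features, ``y`` is ``(n_samples,)``, ``search_spaces`` is
        ``(n_raw_features, 2)`` holding the bounds of each raw feature, and
        ``column_to_encoded_columns`` holds one index array per original parameter,
        mapping it to its raw (encoded) column(s).
        """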
        assert X.shape[0] == y.shape[0]
        assert X.shape[1] == search_spaces.shape[0]
        assert search_spaces.shape[1] == 2

        self._forest.fit(X, y)

        self._trees = [_FanovaTree(e.tree_, search_spaces) for e in self._forest.estimators_]
        self._column_to_encoded_columns = column_to_encoded_columns
        self._variances = {}

        if all(tree.variance == 0 for tree in self._trees):
            # If all trees have zero variance, we cannot assess any importances.
            # This could occur if, for instance, `X.shape[0] == 1`.
            raise RuntimeError("Encountered zero total variance in all trees.")

    def get_importance(self, features: Tuple[int, ...]) -> Tuple[float, float]:
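        """Return the mean and standard deviation, across trees, of the fraction of
        each tree's total variance that is attributable to ``features``.

        ``features`` contains original parameter indices, e.g. ``(0,)`` for a single
        importance or ``(0, 1)`` for a pairwise importance.
        """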
        # Assert that `fit` has been called.
        assert self._trees is not None
        assert self._variances is not None

        self._compute_variances(features)

        # Collect, for every tree with non-zero total variance, the fraction of that
        # variance attributed to `features`. Appending to a list and converting once
        # avoids repeated reallocation by `numpy.append`.
        fractions: List[float] = []

        for tree_index, tree in enumerate(self._trees):
            tree_variance = tree.variance
            if tree_variance > 0.0:
                fractions.append(self._variances[features][tree_index] / tree_variance)

        fraction_array = numpy.asarray(fractions)

        return float(fraction_array.mean()), float(fraction_array.std())

    def _compute_variances(self, features: Tuple[int, ...]) -> None:
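        """Compute, for every tree, the variance attributable to exactly ``features``.

        Following the fANOVA decomposition, the variance of a feature subset is its
        marginal variance minus the variances of all of its proper non-empty subsets,
        which are therefore computed recursively first.
        """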
        assert self._trees is not None
        assert self._variances is not None
        assert self._column_to_encoded_columns is not None

        if features in self._variances:
            return

        # Ensure that the variances of all proper non-empty subsets are available,
        # since they are subtracted from the marginal variance below.
        for k in range(1, len(features)):
            for sub_features in itertools.combinations(features, k):
                if sub_features not in self._variances:
                    self._compute_variances(sub_features)

        # Map the original parameter indices to their raw (one-hot encoded) columns.
        raw_features = numpy.concatenate([self._column_to_encoded_columns[f] for f in features])

        variances = numpy.empty(len(self._trees), dtype=numpy.float64)

        for tree_index, tree in enumerate(self._trees):
            marginal_variance = tree.get_marginal_variance(raw_features)

            # See `fANOVA.__compute_marginals` in
            # https://github.com/automl/fanova/blob/master/fanova/fanova.py.
            for k in range(1, len(features)):
                for sub_features in itertools.combinations(features, k):
                    marginal_variance -= self._variances[sub_features][tree_index]

            # Numerical errors can make the corrected variance slightly negative;
            # clip it to zero.
            variances[tree_index] = numpy.clip(marginal_variance, 0.0, numpy.inf)

        self._variances[features] = variances