from logging import DEBUG
from logging import INFO
from logging import WARNING
from numbers import Integral
from numbers import Number
from time import time
from typing import Any
from typing import Callable
from typing import Dict
from typing import Iterable
from typing import List
from typing import Mapping
from typing import Optional
from typing import Union

import numpy as np
import scipy as sp
from scipy.sparse import spmatrix

from optuna import distributions
from optuna import logging
from optuna import samplers
from optuna import study as study_module
from optuna import TrialPruned
from optuna._experimental import experimental
from optuna._imports import try_import
from optuna.study import StudyDirection
from optuna.trial import FrozenTrial
from optuna.trial import Trial


with try_import() as _imports:
    import pandas as pd
    import sklearn
    from sklearn.base import BaseEstimator
    from sklearn.base import clone
    from sklearn.base import is_classifier
    from sklearn.metrics import check_scoring
    from sklearn.model_selection import BaseCrossValidator
    from sklearn.model_selection import check_cv
    from sklearn.model_selection import cross_validate
    from sklearn.utils import check_random_state
    from sklearn.utils.metaestimators import _safe_split

    if sklearn.__version__ >= "0.22":
        from sklearn.utils import _safe_indexing as sklearn_safe_indexing
    else:
        from sklearn.utils import safe_indexing as sklearn_safe_indexing
    from sklearn.utils.validation import check_is_fitted

if not _imports.is_successful():
    BaseEstimator = object  # NOQA

ArrayLikeType = Union[List, np.ndarray, "pd.Series", spmatrix]
OneDimArrayLikeType = Union[List[float], np.ndarray, "pd.Series"]
TwoDimArrayLikeType = Union[List[List[float]], np.ndarray, "pd.DataFrame", spmatrix]
IterableType = Union[List, "pd.DataFrame", np.ndarray, "pd.Series", spmatrix, None]
IndexableType = Union[Iterable, None]

_logger = logging.get_logger(__name__)


def _check_fit_params(
    X: TwoDimArrayLikeType, fit_params: Dict, indices: OneDimArrayLikeType
) -> Dict:

    fit_params_validated = {}
    for key, value in fit_params.items():

        # NOTE Original implementation:
        # https://github.com/scikit-learn/scikit-learn/blob/ \
        # 2467e1b84aeb493a22533fa15ff92e0d7c05ed1c/sklearn/utils/validation.py#L1324-L1328
        # Scikit-learn does not accept non-iterable inputs.
        # This line is for keeping backward compatibility.
        # (See: https://github.com/scikit-learn/scikit-learn/issues/15805)
        if not _is_arraylike(value) or _num_samples(value) != _num_samples(X):
            fit_params_validated[key] = value
        else:
            fit_params_validated[key] = _make_indexable(value)
            fit_params_validated[key] = _safe_indexing(fit_params_validated[key], indices)
    return fit_params_validated


# NOTE Original implementation:
# https://github.com/scikit-learn/scikit-learn/blob/ \
# 8caa93889f85254fc3ca84caa0a24a1640eebdd1/sklearn/utils/validation.py#L131-L135
def _is_arraylike(x: Any) -> bool:

    return hasattr(x, "__len__") or hasattr(x, "shape") or hasattr(x, "__array__")


# NOTE Original implementation:
# https://github.com/scikit-learn/scikit-learn/blob/ \
# 8caa93889f85254fc3ca84caa0a24a1640eebdd1/sklearn/utils/validation.py#L217-L234
def _make_indexable(iterable: IterableType) -> IndexableType:

    tocsr_func = getattr(iterable, "tocsr", None)
    if tocsr_func is not None and sp.sparse.issparse(iterable):
        # ``tocsr_func`` is already bound to ``iterable``, so it is called
        # with no arguments to convert the sparse matrix to CSR format.
        return tocsr_func()
    elif hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc"):
        return iterable
    elif iterable is None:
        return iterable
    return np.array(iterable)


def _num_samples(x: ArrayLikeType) -> int:

    # NOTE For dask dataframes
    # https://github.com/scikit-learn/scikit-learn/blob/ \
    # 8caa93889f85254fc3ca84caa0a24a1640eebdd1/sklearn/utils/validation.py#L155-L158
    x_shape = getattr(x, "shape", None)
    if x_shape is not None:
        if isinstance(x_shape[0], Integral):
            return int(x_shape[0])

    try:
        return len(x)
    except TypeError:
        raise TypeError("Expected sequence or array-like, got %s." % type(x)) from None


def _safe_indexing(
    X: Union[OneDimArrayLikeType, TwoDimArrayLikeType], indices: OneDimArrayLikeType
) -> Union[OneDimArrayLikeType, TwoDimArrayLikeType]:

    if X is None:
        return X

    return sklearn_safe_indexing(X, indices)
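

# A minimal illustration (kept in comments so nothing runs at import time) of
# the helpers above: ``_check_fit_params`` subsets per-sample parameters such
# as ``sample_weight`` by ``indices`` and passes parameters that are not
# sample-aligned through unchanged:
#
#     X = np.zeros((4, 2))
#     fit_params = {"sample_weight": np.asarray([0.1, 0.2, 0.3, 0.4]), "verbose": True}
#     _check_fit_params(X, fit_params, indices=np.asarray([0, 2]))
#     # -> {"sample_weight": array([0.1, 0.3]), "verbose": True}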


class _Objective(object):
    """Callable that implements objective function.

    Args:
        estimator:
            Object to use to fit the data. This is assumed to implement the
            scikit-learn estimator interface. Either this needs to provide
            ``score``, or ``scoring`` must be passed.

        param_distributions:
            Dictionary where keys are parameters and values are distributions.
            Distributions are assumed to implement the optuna distribution
            interface.

        X:
            Training data.

        y:
            Target variable.

        cv:
            Cross-validation strategy.

        enable_pruning:
            If :obj:`True`, pruning is performed in the case where the
            underlying estimator supports ``partial_fit``.

        error_score:
            Value to assign to the score if an error occurs in fitting. If
            'raise', the error is raised. If numeric,
            ``sklearn.exceptions.FitFailedWarning`` is raised. This does not
            affect the refit step, which will always raise the error.

        fit_params:
            Parameters passed to ``fit`` on the estimator.

        groups:
            Group labels for the samples used while splitting the dataset into
            train/validation set.

        max_iter:
            Maximum number of epochs. This is only used if the underlying
            estimator supports ``partial_fit``.

        return_train_score:
            If :obj:`True`, training scores will be included. Computing
            training scores is used to get insights on how different
            hyperparameter settings impact the overfitting/underfitting
            trade-off. However, computing training scores can be
            computationally expensive and is not strictly required to select
            the hyperparameters that yield the best generalization
            performance.

        scoring:
            Scorer function.
    """

    def __init__(
        self,
        estimator: "BaseEstimator",
        param_distributions: Mapping[str, distributions.BaseDistribution],
        X: TwoDimArrayLikeType,
        y: Optional[Union[OneDimArrayLikeType, TwoDimArrayLikeType]],
        cv: "BaseCrossValidator",
        enable_pruning: bool,
        error_score: Union[Number, float, str],
        fit_params: Dict[str, Any],
        groups: Optional[OneDimArrayLikeType],
        max_iter: int,
        return_train_score: bool,
        scoring: Callable[..., Number],
    ) -> None:

        self.cv = cv
        self.enable_pruning = enable_pruning
        self.error_score = error_score
        self.estimator = estimator
        self.fit_params = fit_params
        self.groups = groups
        self.max_iter = max_iter
        self.param_distributions = param_distributions
        self.return_train_score = return_train_score
        self.scoring = scoring
        self.X = X
        self.y = y

    def __call__(self, trial: Trial) -> float:

        estimator = clone(self.estimator)
        params = self._get_params(trial)

        estimator.set_params(**params)

        if self.enable_pruning:
            scores = self._cross_validate_with_pruning(trial, estimator)
        else:
            scores = cross_validate(
                estimator,
                self.X,
                self.y,
                cv=self.cv,
                error_score=self.error_score,
                fit_params=self.fit_params,
                groups=self.groups,
                return_train_score=self.return_train_score,
                scoring=self.scoring,
            )

        self._store_scores(trial, scores)

        return trial.user_attrs["mean_test_score"]

    def _cross_validate_with_pruning(
        self, trial: Trial, estimator: "BaseEstimator"
    ) -> Mapping[str, OneDimArrayLikeType]:

        if is_classifier(estimator):
            # ``partial_fit`` of classifiers requires the full set of class
            # labels on the first call.
            partial_fit_params = self.fit_params.copy()
            classes = np.unique(self.y)

            partial_fit_params.setdefault("classes", classes)

        else:
            partial_fit_params = self.fit_params

        n_splits = self.cv.get_n_splits(self.X, self.y, groups=self.groups)
        estimators = [clone(estimator) for _ in range(n_splits)]
        scores = {
            "fit_time": np.zeros(n_splits),
            "score_time": np.zeros(n_splits),
            "test_score": np.empty(n_splits),
        }

        if self.return_train_score:
            scores["train_score"] = np.empty(n_splits)

        for step in range(self.max_iter):
            for i, (train, test) in enumerate(self.cv.split(self.X, self.y, groups=self.groups)):
                out = self._partial_fit_and_score(estimators[i], train, test, partial_fit_params)

                if self.return_train_score:
                    scores["train_score"][i] = out.pop(0)

                scores["test_score"][i] = out[0]
                scores["fit_time"][i] += out[1]
                scores["score_time"][i] += out[2]

            intermediate_value = np.nanmean(scores["test_score"])

            trial.report(intermediate_value, step=step)

            if trial.should_prune():
                self._store_scores(trial, scores)

                raise TrialPruned("trial was pruned at iteration {}.".format(step))

        return scores

    def _get_params(self, trial: Trial) -> Dict[str, Any]:

        return {
            name: trial._suggest(name, distribution)
            for name, distribution in self.param_distributions.items()
        }

    def _partial_fit_and_score(
        self,
        estimator: "BaseEstimator",
        train: List[int],
        test: List[int],
        partial_fit_params: Dict[str, Any],
    ) -> List[Number]:

        X_train, y_train = _safe_split(estimator, self.X, self.y, train)
        X_test, y_test = _safe_split(estimator, self.X, self.y, test, train_indices=train)

        start_time = time()

        try:
            estimator.partial_fit(X_train, y_train, **partial_fit_params)

        except Exception as e:
            if self.error_score == "raise":
                raise e

            elif isinstance(self.error_score, Number):
                fit_time = time() - start_time
                test_score = self.error_score
                score_time = 0.0

                if self.return_train_score:
                    train_score = self.error_score

            else:
                raise ValueError("error_score must be 'raise' or numeric.") from e

        else:
            fit_time = time() - start_time
            test_score = self.scoring(estimator, X_test, y_test)
            # ``fit_time`` was measured above, so the remainder of the elapsed
            # time is the scoring time.
            score_time = time() - fit_time - start_time

            if self.return_train_score:
                train_score = self.scoring(estimator, X_train, y_train)

        # Required for type checking but is never expected to fail.
        assert isinstance(fit_time, Number)
        assert isinstance(score_time, Number)

        ret = [test_score, fit_time, score_time]

        if self.return_train_score:
            ret.insert(0, train_score)

        return ret

    def _store_scores(self, trial: Trial, scores: Mapping[str, OneDimArrayLikeType]) -> None:

        for name, array in scores.items():
            if name in ["test_score", "train_score"]:
                for i, score in enumerate(array):
                    trial.set_user_attr("split{}_{}".format(i, name), score)

            trial.set_user_attr("mean_{}".format(name), np.nanmean(array))
            trial.set_user_attr("std_{}".format(name), np.nanstd(array))
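

# A sketch (in comments, so nothing runs at import time) of how ``_Objective``
# is driven; ``OptunaSearchCV.fit`` below does exactly this. ``estimator``,
# ``param_distributions``, ``X``, ``y``, ``cv`` and ``scorer`` are
# placeholders for illustration:
#
#     objective = _Objective(
#         estimator, param_distributions, X, y, cv,
#         enable_pruning=False, error_score="raise", fit_params={},
#         groups=None, max_iter=1000, return_train_score=False, scoring=scorer,
#     )
#     study = study_module.create_study(direction="maximize")
#     study.optimize(objective, n_trials=10)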


@experimental("0.17.0")
class OptunaSearchCV(BaseEstimator):
    """Hyperparameter search with cross-validation.

    Args:
        estimator:
            Object to use to fit the data. This is assumed to implement the
            scikit-learn estimator interface. Either this needs to provide
            ``score``, or ``scoring`` must be passed.

        param_distributions:
            Dictionary where keys are parameters and values are distributions.
            Distributions are assumed to implement the optuna distribution
            interface.

        cv:
            Cross-validation strategy. Possible inputs for cv are:

            - integer to specify the number of folds in a CV splitter,
            - a CV splitter,
            - an iterable yielding (train, validation) splits as arrays of indices.

            For integer, if :obj:`estimator` is a classifier and :obj:`y` is
            either binary or multiclass,
            ``sklearn.model_selection.StratifiedKFold`` is used. Otherwise,
            ``sklearn.model_selection.KFold`` is used.

        enable_pruning:
            If :obj:`True`, pruning is performed in the case where the
            underlying estimator supports ``partial_fit``.

        error_score:
            Value to assign to the score if an error occurs in fitting. If
            'raise', the error is raised. If numeric,
            ``sklearn.exceptions.FitFailedWarning`` is raised. This does not
            affect the refit step, which will always raise the error.

        max_iter:
            Maximum number of epochs. This is only used if the underlying
            estimator supports ``partial_fit``.

        n_jobs:
            Number of :obj:`threading` based parallel jobs. :obj:`-1` means
            the number of jobs is set to the CPU count.

            .. note::
                ``n_jobs`` allows parallelization using :obj:`threading` and may suffer from
                `Python's GIL <https://wiki.python.org/moin/GlobalInterpreterLock>`_.
                It is recommended to use :ref:`process-based parallelization<distributed>`
                if ``func`` is CPU bound.

            .. warning::
                Deprecated in v2.7.0. This feature will be removed in the future.
                It is recommended to use :ref:`process-based parallelization<distributed>`.
                The removal of this feature is currently scheduled for v4.0.0, but this
                schedule is subject to change.
                See https://github.com/optuna/optuna/releases/tag/v2.7.0.

        n_trials:
            Number of trials. If :obj:`None`, there is no limitation on the
            number of trials. If :obj:`timeout` is also set to :obj:`None`,
            the study continues to create trials until it receives a
            termination signal such as Ctrl+C or SIGTERM. This trades off
            runtime vs quality of the solution.

        random_state:
            Seed of the pseudo random number generator. If int, this is the
            seed used by the random number generator. If
            ``numpy.random.RandomState`` object, this is the random number
            generator. If :obj:`None`, the global random state from
            ``numpy.random`` is used.

        refit:
            If :obj:`True`, refit the estimator with the best found
            hyperparameters. The refitted estimator is made available at the
            ``best_estimator_`` attribute and permits using ``predict``
            directly.

        return_train_score:
            If :obj:`True`, training scores will be included. Computing
            training scores is used to get insights on how different
            hyperparameter settings impact the overfitting/underfitting
            trade-off. However, computing training scores can be
            computationally expensive and is not strictly required to select
            the hyperparameters that yield the best generalization
            performance.

        scoring:
            String or callable to evaluate the predictions on the validation data.
            If :obj:`None`, ``score`` on the estimator is used.

        study:
            Study corresponding to the optimization task. If :obj:`None`, a new
            study is created.

        subsample:
            Proportion of samples that are used during hyperparameter search.

            - If int, then draw ``subsample`` samples.
            - If float, then draw ``subsample`` * ``X.shape[0]`` samples.

        timeout:
            Time limit in seconds for the search of appropriate models. If
            :obj:`None`, the study is executed without time limitation. If
            :obj:`n_trials` is also set to :obj:`None`, the study continues to
            create trials until it receives a termination signal such as
            Ctrl+C or SIGTERM. This trades off runtime vs quality of the
            solution.

        verbose:
            Verbosity level. The higher, the more messages.

    Attributes:
        best_estimator_:
            Estimator that was chosen by the search. This is present only if
            ``refit`` is set to :obj:`True`.

        n_splits_:
            Number of cross-validation splits.

        refit_time_:
            Time for refitting the best estimator. This is present only if
            ``refit`` is set to :obj:`True`.

        sample_indices_:
            Indices of samples that are used during hyperparameter search.

        scorer_:
            Scorer function.

        study_:
            Actual study.

    Examples:

        .. testcode::

            import optuna
            from sklearn.datasets import load_iris
            from sklearn.svm import SVC

            clf = SVC(gamma="auto")
            param_distributions = {"C": optuna.distributions.LogUniformDistribution(1e-10, 1e10)}
            optuna_search = optuna.integration.OptunaSearchCV(clf, param_distributions)
            X, y = load_iris(return_X_y=True)
            optuna_search.fit(X, y)
            y_pred = optuna_search.predict(X)
    """

    _required_parameters = ["estimator", "param_distributions"]

    @property
    def _estimator_type(self) -> str:

        return self.estimator._estimator_type

    @property
    def best_index_(self) -> int:
        """Index which corresponds to the best candidate parameter setting."""

        df = self.trials_dataframe()

        # The study maximizes the mean test score, so the best trial is the
        # one with the largest value.
        return df["value"].idxmax()

    @property
    def best_params_(self) -> Dict[str, Any]:
        """Parameters of the best trial in the :class:`~optuna.study.Study`."""

        self._check_is_fitted()

        return self.study_.best_params

    @property
    def best_score_(self) -> float:
        """Mean cross-validated score of the best estimator."""

        self._check_is_fitted()

        return self.study_.best_value

    @property
    def best_trial_(self) -> FrozenTrial:
        """Best trial in the :class:`~optuna.study.Study`."""

        self._check_is_fitted()

        return self.study_.best_trial

    @property
    def classes_(self) -> OneDimArrayLikeType:
        """Class labels."""

        self._check_is_fitted()

        return self.best_estimator_.classes_

    @property
    def n_trials_(self) -> int:
        """Actual number of trials."""

        return len(self.trials_)

    @property
    def trials_(self) -> List[FrozenTrial]:
        """All trials in the :class:`~optuna.study.Study`."""

        self._check_is_fitted()

        return self.study_.trials

    @property
    def user_attrs_(self) -> Dict[str, Any]:
        """User attributes in the :class:`~optuna.study.Study`."""

        self._check_is_fitted()

        return self.study_.user_attrs

    @property
    def decision_function(self) -> Callable[..., Union[OneDimArrayLikeType, TwoDimArrayLikeType]]:
        """Call ``decision_function`` on the best estimator.

        This is available only if the underlying estimator supports
        ``decision_function`` and ``refit`` is set to :obj:`True`.
        """

        self._check_is_fitted()

        return self.best_estimator_.decision_function

    @property
    def inverse_transform(self) -> Callable[..., TwoDimArrayLikeType]:
        """Call ``inverse_transform`` on the best estimator.

        This is available only if the underlying estimator supports
        ``inverse_transform`` and ``refit`` is set to :obj:`True`.
        """

        self._check_is_fitted()

        return self.best_estimator_.inverse_transform

    @property
    def predict(self) -> Callable[..., Union[OneDimArrayLikeType, TwoDimArrayLikeType]]:
        """Call ``predict`` on the best estimator.

        This is available only if the underlying estimator supports ``predict``
        and ``refit`` is set to :obj:`True`.
        """

        self._check_is_fitted()

        return self.best_estimator_.predict

    @property
    def predict_log_proba(self) -> Callable[..., TwoDimArrayLikeType]:
        """Call ``predict_log_proba`` on the best estimator.

        This is available only if the underlying estimator supports
        ``predict_log_proba`` and ``refit`` is set to :obj:`True`.
        """

        self._check_is_fitted()

        return self.best_estimator_.predict_log_proba

    @property
    def predict_proba(self) -> Callable[..., TwoDimArrayLikeType]:
        """Call ``predict_proba`` on the best estimator.

        This is available only if the underlying estimator supports
        ``predict_proba`` and ``refit`` is set to :obj:`True`.
        """

        self._check_is_fitted()

        return self.best_estimator_.predict_proba

    @property
    def score_samples(self) -> Callable[..., OneDimArrayLikeType]:
        """Call ``score_samples`` on the best estimator.

        This is available only if the underlying estimator supports
        ``score_samples`` and ``refit`` is set to :obj:`True`.
        """

        self._check_is_fitted()

        return self.best_estimator_.score_samples

    @property
    def set_user_attr(self) -> Callable[..., None]:
        """Call ``set_user_attr`` on the :class:`~optuna.study.Study`."""

        self._check_is_fitted()

        return self.study_.set_user_attr

    @property
    def transform(self) -> Callable[..., TwoDimArrayLikeType]:
        """Call ``transform`` on the best estimator.

        This is available only if the underlying estimator supports
        ``transform`` and ``refit`` is set to :obj:`True`.
        """

        self._check_is_fitted()

        return self.best_estimator_.transform

    @property
    def trials_dataframe(self) -> Callable[..., "pd.DataFrame"]:
        """Call ``trials_dataframe`` on the :class:`~optuna.study.Study`."""

        self._check_is_fitted()

        return self.study_.trials_dataframe
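
    # Illustration (comments only) of the delegating properties above: after
    # ``fit`` has run with ``refit=True``,
    #
    #     search.predict(X)          # resolves to search.best_estimator_.predict(X)
    #     search.trials_dataframe()  # resolves to search.study_.trials_dataframe()
    #
    # where ``search`` is a fitted ``OptunaSearchCV`` instance (a hypothetical
    # name used only in this sketch).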

    def __init__(
        self,
        estimator: "BaseEstimator",
        param_distributions: Mapping[str, distributions.BaseDistribution],
        cv: Optional[Union["BaseCrossValidator", int]] = 5,
        enable_pruning: bool = False,
        error_score: Union[Number, float, str] = np.nan,
        max_iter: int = 1000,
        n_jobs: int = 1,
        n_trials: int = 10,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
        refit: bool = True,
        return_train_score: bool = False,
        scoring: Optional[Union[Callable[..., float], str]] = None,
        study: Optional[study_module.Study] = None,
        subsample: Union[float, int] = 1.0,
        timeout: Optional[float] = None,
        verbose: int = 0,
    ) -> None:

        _imports.check()

        self.cv = cv
        self.enable_pruning = enable_pruning
        self.error_score = error_score
        self.estimator = estimator
        self.max_iter = max_iter
        self.n_trials = n_trials
        self.n_jobs = n_jobs
        self.param_distributions = param_distributions
        self.random_state = random_state
        self.refit = refit
        self.return_train_score = return_train_score
        self.scoring = scoring
        self.study = study
        self.subsample = subsample
        self.timeout = timeout
        self.verbose = verbose

    def _check_is_fitted(self) -> None:

        attributes = ["n_splits_", "sample_indices_", "scorer_", "study_"]

        if self.refit:
            attributes += ["best_estimator_", "refit_time_"]

        check_is_fitted(self, attributes)

    def _check_params(self) -> None:

        if not hasattr(self.estimator, "fit"):
            raise ValueError("estimator must be a scikit-learn estimator.")

        if type(self.param_distributions) is not dict:
            raise ValueError("param_distributions must be a dictionary.")

        for name, distribution in self.param_distributions.items():
            if not isinstance(distribution, distributions.BaseDistribution):
                raise ValueError("Value of {} must be an optuna distribution.".format(name))

        if self.enable_pruning and not hasattr(self.estimator, "partial_fit"):
            raise ValueError("estimator must support partial_fit.")

        if self.max_iter <= 0:
            raise ValueError("max_iter must be > 0, got {}.".format(self.max_iter))

        if self.study is not None and self.study.direction != StudyDirection.MAXIMIZE:
            raise ValueError("direction of study must be 'maximize'.")

    def _more_tags(self) -> Dict[str, bool]:

        return {"non_deterministic": True, "no_validation": True}

    def _refit(
        self,
        X: TwoDimArrayLikeType,
        y: Optional[Union[OneDimArrayLikeType, TwoDimArrayLikeType]] = None,
        **fit_params: Any,
    ) -> "OptunaSearchCV":

        n_samples = _num_samples(X)

        self.best_estimator_ = clone(self.estimator)

        try:
            self.best_estimator_.set_params(**self.study_.best_params)
        except ValueError as e:
            _logger.exception(e)

        _logger.info("Refitting the estimator using {} samples...".format(n_samples))

        start_time = time()

        self.best_estimator_.fit(X, y, **fit_params)

        self.refit_time_ = time() - start_time

        _logger.info("Finished refitting! (elapsed time: {:.3f} sec.)".format(self.refit_time_))

        return self

    def fit(
        self,
        X: TwoDimArrayLikeType,
        y: Optional[Union[OneDimArrayLikeType, TwoDimArrayLikeType]] = None,
        groups: Optional[OneDimArrayLikeType] = None,
        **fit_params: Any,
    ) -> "OptunaSearchCV":
        """Run fit with all sets of parameters.

        Args:
            X:
                Training data.

            y:
                Target variable.

            groups:
                Group labels for the samples used while splitting the dataset
                into train/validation set.

            **fit_params:
                Parameters passed to ``fit`` on the estimator.

        Returns:
            self:
                Return self.
        """

        self._check_params()

        random_state = check_random_state(self.random_state)
        max_samples = self.subsample
        n_samples = _num_samples(X)
        old_level = _logger.getEffectiveLevel()

        if self.verbose > 1:
            _logger.setLevel(DEBUG)
        elif self.verbose > 0:
            _logger.setLevel(INFO)
        else:
            _logger.setLevel(WARNING)

        self.sample_indices_ = np.arange(n_samples)

        # A float ``subsample`` is interpreted as a proportion of the data.
        if type(max_samples) is float:
            max_samples = int(max_samples * n_samples)

        if max_samples < n_samples:
            self.sample_indices_ = random_state.choice(
                self.sample_indices_, max_samples, replace=False
            )

            self.sample_indices_.sort()

        X_res = _safe_indexing(X, self.sample_indices_)
        y_res = _safe_indexing(y, self.sample_indices_)
        groups_res = _safe_indexing(groups, self.sample_indices_)
        fit_params_res = fit_params

        if fit_params_res is not None:
            fit_params_res = _check_fit_params(X, fit_params, self.sample_indices_)

        classifier = is_classifier(self.estimator)
        cv = check_cv(self.cv, y_res, classifier=classifier)

        self.n_splits_ = cv.get_n_splits(X_res, y_res, groups=groups_res)
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        if self.study is None:
            seed = random_state.randint(0, np.iinfo("int32").max)
            sampler = samplers.TPESampler(seed=seed)

            self.study_ = study_module.create_study(direction="maximize", sampler=sampler)

        else:
            self.study_ = self.study

        objective = _Objective(
            self.estimator,
            self.param_distributions,
            X_res,
            y_res,
            cv,
            self.enable_pruning,
            self.error_score,
            fit_params_res,
            groups_res,
            self.max_iter,
            self.return_train_score,
            self.scorer_,
        )

        _logger.info(
            "Searching the best hyperparameters using {} "
            "samples...".format(_num_samples(self.sample_indices_))
        )

        self.study_.optimize(
            objective, n_jobs=self.n_jobs, n_trials=self.n_trials, timeout=self.timeout
        )

        _logger.info("Finished hyperparameter search!")

        if self.refit:
            self._refit(X, y, **fit_params)

        _logger.setLevel(old_level)

        return self

    def score(
        self,
        X: TwoDimArrayLikeType,
        y: Optional[Union[OneDimArrayLikeType, TwoDimArrayLikeType]] = None,
    ) -> float:
        """Return the score on the given data.

        Args:
            X:
                Data.

            y:
                Target variable.

        Returns:
            score:
                Scalar score.
        """

        return self.scorer_(self.best_estimator_, X, y)
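

# A minimal, self-contained usage sketch of the pruning path, guarded by
# ``__main__`` so it never runs on import. ``SGDClassifier`` and the ``alpha``
# search space are assumptions chosen for this sketch: any estimator that
# supports ``partial_fit`` works, and ``enable_pruning=True`` exercises
# ``_Objective._cross_validate_with_pruning`` above.
if __name__ == "__main__":
    from sklearn.datasets import load_digits
    from sklearn.linear_model import SGDClassifier

    X, y = load_digits(return_X_y=True)
    clf = SGDClassifier(random_state=0)
    param_distributions = {
        # Hypothetical search space for illustration only.
        "alpha": distributions.LogUniformDistribution(1e-5, 1e-1),
    }
    optuna_search = OptunaSearchCV(
        clf,
        param_distributions,
        enable_pruning=True,  # report the mean CV score per epoch so trials can be pruned
        max_iter=10,  # epochs of ``partial_fit`` per trial
        n_trials=5,
        random_state=0,
    )
    optuna_search.fit(X, y)
    print(optuna_search.best_params_, optuna_search.best_score_)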