"""
Testing for the partial dependence module.
"""

import numpy as np
import pytest

import sklearn
from sklearn.inspection import partial_dependence
from sklearn.inspection._partial_dependence import (
    _grid_from_X,
    _partial_dependence_brute,
    _partial_dependence_recursion,
)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import MultiTaskLasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import load_iris
from sklearn.datasets import make_classification, make_regression
from sklearn.cluster import KMeans
from sklearn.compose import make_column_transformer
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import scale
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.exceptions import NotFittedError
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_array_equal
from sklearn.utils import _IS_32BIT
from sklearn.utils.validation import check_random_state
from sklearn.tree.tests.test_tree import assert_is_subtree


# Toy sample: six 2-feature points, three labelled -1 and three labelled 1.
# Shared at module level by the tests that do not build their own dataset.
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]


# Each dataset fixture is a pair ``((X, y), n_targets)`` where ``n_targets``
# is the size of the first axis expected in the output of
# partial_dependence() (1 for binary classification).
binary_classification_data = (make_classification(n_samples=50, random_state=0), 1)
multiclass_classification_data = (
    make_classification(
        n_samples=50, n_classes=3, n_clusters_per_class=1, random_state=0
    ),
    3,
)
regression_data = (make_regression(n_samples=50, random_state=0), 1)
multioutput_regression_data = (
    make_regression(n_samples=50, n_targets=2, random_state=0),
    2,
)

# iris
iris = load_iris()


@pytest.mark.filterwarnings("ignore:A Bunch will be returned")
@pytest.mark.parametrize(
    "Estimator, method, data",
    [
        (GradientBoostingClassifier, "auto", binary_classification_data),
        (GradientBoostingClassifier, "auto", multiclass_classification_data),
        (GradientBoostingClassifier, "brute", binary_classification_data),
        (GradientBoostingClassifier, "brute", multiclass_classification_data),
        (GradientBoostingRegressor, "auto", regression_data),
        (GradientBoostingRegressor, "brute", regression_data),
        (DecisionTreeRegressor, "brute", regression_data),
        (LinearRegression, "brute", regression_data),
        (LinearRegression, "brute", multioutput_regression_data),
        (LogisticRegression, "brute", binary_classification_data),
        (LogisticRegression, "brute", multiclass_classification_data),
        (MultiTaskLasso, "brute", multioutput_regression_data),
    ],
)
@pytest.mark.parametrize("grid_resolution", (5, 10))
@pytest.mark.parametrize("features", ([1], [1, 2]))
@pytest.mark.parametrize("kind", ("legacy", "average", "individual", "both"))
def test_output_shape(Estimator, method, data, grid_resolution, features, kind):
    """Check the shapes returned by partial_dependence for each `kind`."""
    # Check that partial_dependence has consistent output shape for different
    # kinds of estimators:
    # - classifiers with binary and multiclass settings
    # - regressors
    # - multi-task regressors

    est = Estimator()

    # n_target corresponds to the number of classes (1 for binary classif) or
    # the number of tasks / outputs in multi task settings. It's equal to 1 for
    # classical regression_data.
    (X, y), n_targets = data
    n_instances = X.shape[0]

    est.fit(X, y)
    result = partial_dependence(
        est,
        X=X,
        features=features,
        method=method,
        kind=kind,
        grid_resolution=grid_resolution,
    )
    # FIXME: Remove 'legacy' support in 1.1
    pdp, axes = result if kind == "legacy" else (result, result["values"])

    # one grid axis of length `grid_resolution` per requested feature
    grid_shape = (grid_resolution,) * len(features)
    expected_pdp_shape = (n_targets, *grid_shape)
    expected_ice_shape = (n_targets, n_instances, *grid_shape)
    if kind == "legacy":
        assert pdp.shape == expected_pdp_shape
    elif kind == "average":
        assert pdp.average.shape == expected_pdp_shape
    elif kind == "individual":
        assert pdp.individual.shape == expected_ice_shape
    else:  # 'both'
        assert pdp.average.shape == expected_pdp_shape
        assert pdp.individual.shape == expected_ice_shape

    expected_axes_shape = (len(features), grid_resolution)
    assert axes is not None
    assert np.asarray(axes).shape == expected_axes_shape


def test_grid_from_X():
    """Sanity-check the values and shapes produced by _grid_from_X."""
    # tests for _grid_from_X: sanity check for output, and for shapes.

    # Make sure that the grid is a cartesian product of the input (it will use
    # the unique values instead of the percentiles)
    percentiles = (0.05, 0.95)
    grid_resolution = 100
    X = np.asarray([[1, 2], [3, 4]])
    grid, axes = _grid_from_X(X, percentiles, grid_resolution)
    assert_array_equal(grid, [[1, 2], [1, 4], [3, 2], [3, 4]])
    assert_array_equal(axes, X.T)

    # test shapes of returned objects depending on the number of unique values
    # for a feature.
    rng = np.random.RandomState(0)
    grid_resolution = 15

    # n_unique_values > grid_resolution
    X = rng.normal(size=(20, 2))
    grid, axes = _grid_from_X(X, percentiles, grid_resolution=grid_resolution)
    assert grid.shape == (grid_resolution * grid_resolution, X.shape[1])
    assert np.asarray(axes).shape == (2, grid_resolution)

    # n_unique_values < grid_resolution, will use actual values
    n_unique_values = 12
    X[n_unique_values - 1 :, 0] = 12345
    rng.shuffle(X)  # just to make sure the order is irrelevant
    grid, axes = _grid_from_X(X, percentiles, grid_resolution=grid_resolution)
    assert grid.shape == (n_unique_values * grid_resolution, X.shape[1])
    # axes is a list of arrays of different shapes
    assert axes[0].shape == (n_unique_values,)
    assert axes[1].shape == (grid_resolution,)


@pytest.mark.parametrize(
    "grid_resolution, percentiles, err_msg",
    [
        (2, (0, 0.0001), "percentiles are too close"),
        (100, (1, 2, 3, 4), "'percentiles' must be a sequence of 2 elements"),
        (100, 12345, "'percentiles' must be a sequence of 2 elements"),
        (100, (-1, 0.95), r"'percentiles' values must be in \[0, 1\]"),
        (100, (0.05, 2), r"'percentiles' values must be in \[0, 1\]"),
        (100, (0.9, 0.1), r"percentiles\[0\] must be strictly less than"),
        (1, (0.05, 0.95), "'grid_resolution' must be strictly greater than 1"),
    ],
)
def test_grid_from_X_error(grid_resolution, percentiles, err_msg):
    """Check that invalid arguments to _grid_from_X raise ValueError."""
    X = np.asarray([[1, 2], [3, 4]])
    with pytest.raises(ValueError, match=err_msg):
        _grid_from_X(X, grid_resolution=grid_resolution, percentiles=percentiles)


@pytest.mark.parametrize("target_feature", range(5))
@pytest.mark.parametrize(
    "est, method",
    [
        (LinearRegression(), "brute"),
        (GradientBoostingRegressor(random_state=0), "brute"),
        (GradientBoostingRegressor(random_state=0), "recursion"),
        (HistGradientBoostingRegressor(random_state=0), "brute"),
        (HistGradientBoostingRegressor(random_state=0), "recursion"),
    ],
)
def test_partial_dependence_helpers(est, method, target_feature):
    """Compare the private PD helpers against a manual mean prediction."""
    # Check that what is returned by _partial_dependence_brute or
    # _partial_dependence_recursion is equivalent to manually setting a target
    # feature to a given value, and computing the average prediction over all
    # samples.
    # This also checks that the brute and recursion methods give the same
    # output.
    # Note that even on the trainset, the brute and the recursion methods
    # aren't always strictly equivalent, in particular when the slow method
    # generates unrealistic samples that have low mass in the joint
    # distribution of the input features, and when some of the features are
    # dependent. Hence the high tolerance on the checks.

    X, y = make_regression(random_state=0, n_features=5, n_informative=5)
    # The 'init' estimator for GBDT (here the average prediction) isn't taken
    # into account with the recursion method, for technical reasons. We set
    # the mean to 0 so that this 'bug' doesn't have any effect.
    y = y - y.mean()
    est.fit(X, y)

    # target feature will be set to .5 and then to 123
    features = np.array([target_feature], dtype=np.int32)
    grid = np.array([[0.5], [123]])

    if method == "brute":
        # the per-sample predictions returned alongside are not needed here
        pdp, _ = _partial_dependence_brute(
            est, grid, features, X, response_method="auto"
        )
    else:
        pdp = _partial_dependence_recursion(est, grid, features)

    mean_predictions = []
    for val in (0.5, 123):
        X_ = X.copy()
        X_[:, target_feature] = val
        mean_predictions.append(est.predict(X_).mean())

    pdp = pdp[0]  # (shape is (1, 2) so make it (2,))

    # allow for greater margin for error with recursion method
    rtol = 1e-1 if method == "recursion" else 1e-3
    # atol=1e-8 keeps the same tolerance formula as np.allclose while
    # reporting the mismatching values on failure.
    assert_allclose(pdp, mean_predictions, rtol=rtol, atol=1e-8)


@pytest.mark.parametrize("seed", range(1))
def test_recursion_decision_tree_vs_forest_and_gbdt(seed):
    """Recursion PD must agree across tree, 1-tree forest and 1-tree GBDT."""
    # Make sure that the recursion method gives the same results on a
    # DecisionTreeRegressor and a GradientBoostingRegressor or a
    # RandomForestRegressor with 1 tree and equivalent parameters.

    rng = np.random.RandomState(seed)

    # Purely random dataset to avoid correlated features
    n_samples = 1000
    n_features = 5
    X = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples) * 10

    # The 'init' estimator for GBDT (here the average prediction) isn't taken
    # into account with the recursion method, for technical reasons. We set
    # the mean to 0 so that this 'bug' doesn't have any effect.
    y = y - y.mean()

    # set max_depth not too high to avoid splits with same gain but different
    # features
    max_depth = 5

    tree_seed = 0
    forest = RandomForestRegressor(
        n_estimators=1,
        max_features=None,
        bootstrap=False,
        max_depth=max_depth,
        random_state=tree_seed,
    )
    # The forest will use ensemble.base._set_random_states to set the
    # random_state of the tree sub-estimator. We simulate this here to have
    # equivalent estimators.
    equiv_random_state = check_random_state(tree_seed).randint(np.iinfo(np.int32).max)
    gbdt = GradientBoostingRegressor(
        n_estimators=1,
        learning_rate=1,
        criterion="squared_error",
        max_depth=max_depth,
        random_state=equiv_random_state,
    )
    tree = DecisionTreeRegressor(max_depth=max_depth, random_state=equiv_random_state)

    forest.fit(X, y)
    gbdt.fit(X, y)
    tree.fit(X, y)

    # sanity check: if the trees aren't the same, the PD values won't be equal
    try:
        assert_is_subtree(tree.tree_, gbdt[0, 0].tree_)
        assert_is_subtree(tree.tree_, forest[0].tree_)
    except AssertionError:
        # For some reason the trees aren't exactly equal on 32bits, so the PDs
        # cannot be equal either. See
        # https://github.com/scikit-learn/scikit-learn/issues/8853
        assert _IS_32BIT, "this should only fail on 32 bit platforms"
        return

    grid = rng.randn(50).reshape(-1, 1)
    for f in range(n_features):
        features = np.array([f], dtype=np.int32)

        pdp_forest = _partial_dependence_recursion(forest, grid, features)
        pdp_gbdt = _partial_dependence_recursion(gbdt, grid, features)
        pdp_tree = _partial_dependence_recursion(tree, grid, features)

        np.testing.assert_allclose(pdp_gbdt, pdp_tree)
        np.testing.assert_allclose(pdp_forest, pdp_tree)


@pytest.mark.parametrize(
    "est",
    (
        GradientBoostingClassifier(random_state=0),
        HistGradientBoostingClassifier(random_state=0),
    ),
)
@pytest.mark.parametrize("target_feature", (0, 1, 2, 3, 4, 5))
def test_recursion_decision_function(est, target_feature):
    """Recursion must match brute with response_method=decision_function."""
    # Make sure the recursion method (implicitly uses decision_function) has
    # the same result as using brute method with
    # response_method=decision_function

    X, y = make_classification(n_classes=2, n_clusters_per_class=1, random_state=1)
    assert np.mean(y) == 0.5  # make sure the init estimator predicts 0 anyway

    est.fit(X, y)

    preds_1 = partial_dependence(
        est,
        X,
        [target_feature],
        response_method="decision_function",
        method="recursion",
        kind="average",
    )
    preds_2 = partial_dependence(
        est,
        X,
        [target_feature],
        response_method="decision_function",
        method="brute",
        kind="average",
    )

    assert_allclose(preds_1["average"], preds_2["average"], atol=1e-7)


@pytest.mark.parametrize(
    "est",
    (
        LinearRegression(),
        GradientBoostingRegressor(random_state=0),
        HistGradientBoostingRegressor(
            random_state=0, min_samples_leaf=1, max_leaf_nodes=None, max_iter=1
        ),
        DecisionTreeRegressor(random_state=0),
    ),
)
@pytest.mark.parametrize("power", (1, 2))
def test_partial_dependence_easy_target(est, power):
    """PD of the single informative feature must follow the target shape."""
    # If the target y only depends on one feature in an obvious way (linear or
    # quadratic) then the partial dependence for that feature should reflect
    # it.
    # We here fit a linear regression_data model (with polynomial features if
    # needed) and compute r_squared to check that the partial dependence
    # correctly reflects the target.

    rng = np.random.RandomState(0)
    n_samples = 200
    target_variable = 2
    X = rng.normal(size=(n_samples, 5))
    y = X[:, target_variable] ** power

    est.fit(X, y)

    pdp = partial_dependence(
        est, features=[target_variable], X=X, grid_resolution=1000, kind="average"
    )

    new_X = pdp["values"][0].reshape(-1, 1)
    new_y = pdp["average"][0]
    # add polynomial features if needed
    new_X = PolynomialFeatures(degree=power).fit_transform(new_X)

    lr = LinearRegression().fit(new_X, new_y)
    r2 = r2_score(new_y, lr.predict(new_X))

    assert r2 > 0.99


@pytest.mark.parametrize(
    "Estimator",
    (
        sklearn.tree.DecisionTreeClassifier,
        sklearn.tree.ExtraTreeClassifier,
        sklearn.ensemble.ExtraTreesClassifier,
        sklearn.neighbors.KNeighborsClassifier,
        sklearn.neighbors.RadiusNeighborsClassifier,
        sklearn.ensemble.RandomForestClassifier,
    ),
)
def test_multiclass_multioutput(Estimator):
    """Multiclass-multioutput classifiers must be rejected with ValueError."""
    # Make sure error is raised for multiclass-multioutput classifiers

    # make multiclass-multioutput dataset
    X, y = make_classification(n_classes=3, n_clusters_per_class=1, random_state=0)
    y = np.array([y, y]).T

    est = Estimator()
    est.fit(X, y)

    with pytest.raises(
        ValueError, match="Multiclass-multioutput estimators are not supported"
    ):
        partial_dependence(est, X, [0])


class NoPredictProbaNoDecisionFunction(ClassifierMixin, BaseEstimator):
    """Stub classifier defining neither predict_proba nor decision_function."""

    def fit(self, X, y):
        """Pretend to fit: only set ``classes_`` and return ``self``."""
        # simulate that we have some classes
        self.classes_ = [0, 1]
        return self


@pytest.mark.filterwarnings("ignore:A Bunch will be returned")
@pytest.mark.parametrize(
    "estimator, params, err_msg",
    [
        (
            KMeans(),
            {"features": [0]},
            "'estimator' must be a fitted regressor or classifier",
        ),
        (
            LinearRegression(),
            {"features": [0], "response_method": "predict_proba"},
            "The response_method parameter is ignored for regressors",
        ),
        (
            GradientBoostingClassifier(random_state=0),
            {
                "features": [0],
                "response_method": "predict_proba",
                "method": "recursion",
            },
            "'recursion' method, the response_method must be 'decision_function'",
        ),
        (
            GradientBoostingClassifier(random_state=0),
            {"features": [0], "response_method": "predict_proba", "method": "auto"},
            "'recursion' method, the response_method must be 'decision_function'",
        ),
        (
            GradientBoostingClassifier(random_state=0),
            {"features": [0], "response_method": "blahblah"},
            "response_method blahblah is invalid. Accepted response_method",
        ),
        (
            NoPredictProbaNoDecisionFunction(),
            {"features": [0], "response_method": "auto"},
            "The estimator has no predict_proba and no decision_function method",
        ),
        (
            NoPredictProbaNoDecisionFunction(),
            {"features": [0], "response_method": "predict_proba"},
            "The estimator has no predict_proba method.",
        ),
        (
            NoPredictProbaNoDecisionFunction(),
            {"features": [0], "response_method": "decision_function"},
            "The estimator has no decision_function method.",
        ),
        (
            LinearRegression(),
            {"features": [0], "method": "blahblah"},
            "blahblah is invalid. Accepted method names are brute, recursion, auto",
        ),
        (
            LinearRegression(),
            {"features": [0], "method": "recursion", "kind": "individual"},
            "The 'recursion' method only applies when 'kind' is set to 'average'",
        ),
        (
            LinearRegression(),
            {"features": [0], "method": "recursion", "kind": "both"},
            "The 'recursion' method only applies when 'kind' is set to 'average'",
        ),
        (
            LinearRegression(),
            {"features": [0], "method": "recursion"},
            "Only the following estimators support the 'recursion' method:",
        ),
    ],
)
def test_partial_dependence_error(estimator, params, err_msg):
    """Check the ValueError raised for each invalid estimator/params pair."""
    X, y = make_classification(random_state=0)
    estimator.fit(X, y)

    with pytest.raises(ValueError, match=err_msg):
        partial_dependence(estimator, X, **params)


@pytest.mark.parametrize(
    "with_dataframe, err_msg",
    [
        (True, "Only array-like or scalar are supported"),
        (False, "Only array-like or scalar are supported"),
    ],
)
def test_partial_dependence_slice_error(with_dataframe, err_msg):
    """Passing a slice as `features` must raise TypeError."""
    X, y = make_classification(random_state=0)
    if with_dataframe:
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X)
    estimator = LogisticRegression().fit(X, y)

    with pytest.raises(TypeError, match=err_msg):
        partial_dependence(estimator, X, features=slice(0, 2, 1))


@pytest.mark.parametrize(
    "estimator", [LinearRegression(), GradientBoostingClassifier(random_state=0)]
)
@pytest.mark.parametrize("features", [-1, 10000])
def test_partial_dependence_unknown_feature_indices(estimator, features):
    """Out-of-range feature indices must raise ValueError."""
    X, y = make_classification(random_state=0)
    estimator.fit(X, y)

    err_msg = "all features must be in"
    with pytest.raises(ValueError, match=err_msg):
        partial_dependence(estimator, X, [features])


@pytest.mark.parametrize(
    "estimator", [LinearRegression(), GradientBoostingClassifier(random_state=0)]
)
def test_partial_dependence_unknown_feature_string(estimator):
    """An unknown column name must raise ValueError for dataframe input."""
    pd = pytest.importorskip("pandas")
    X, y = make_classification(random_state=0)
    df = pd.DataFrame(X)
    estimator.fit(df, y)

    features = ["random"]
    err_msg = "A given column is not a column of the dataframe"
    with pytest.raises(ValueError, match=err_msg):
        partial_dependence(estimator, df, features)


@pytest.mark.parametrize(
    "estimator", [LinearRegression(), GradientBoostingClassifier(random_state=0)]
)
def test_partial_dependence_X_list(estimator):
    """A plain list must be accepted as `X`."""
    # check that array-like objects are accepted
    X, y = make_classification(random_state=0)
    estimator.fit(X, y)
    partial_dependence(estimator, list(X), [0], kind="average")


def test_warning_recursion_non_constant_init():
    """Recursion with a non-constant GBDT init must emit a UserWarning."""
    # make sure that passing a non-constant init parameter to a GBDT and using
    # recursion method yields a warning.

    # X, y here are the module-level toy sample
    gbc = GradientBoostingClassifier(init=DummyClassifier(), random_state=0)
    gbc.fit(X, y)

    with pytest.warns(
        UserWarning, match="Using recursion method with a non-constant init predictor"
    ):
        partial_dependence(gbc, X, [0], method="recursion", kind="average")


def test_partial_dependence_sample_weight():
    """PD must follow the heavily weighted y = x samples."""
    # Test near perfect correlation between partial dependence and diagonal
    # when sample weights emphasize y = x predictions
    # non-regression test for #13193
    # TODO: extend to HistGradientBoosting once sample_weight is supported
    N = 1000
    rng = np.random.RandomState(123456)
    mask = rng.randint(2, size=N, dtype=bool)

    x = rng.rand(N)
    # set y = x on mask and y = -x outside
    y = x.copy()
    y[~mask] = -y[~mask]
    X = np.c_[mask, x]
    # sample weights to emphasize data points where y = x
    sample_weight = np.ones(N)
    sample_weight[mask] = 1000.0

    reg = GradientBoostingRegressor(n_estimators=10, random_state=1)
    reg.fit(X, y, sample_weight=sample_weight)

    pdp = partial_dependence(reg, X, features=[1], kind="average")

    assert np.corrcoef(pdp["average"], pdp["values"])[0, 1] > 0.99


def test_hist_gbdt_sw_not_supported():
    """PD on an HGBT fitted with sample weights must raise."""
    # TODO: remove/fix when PDP supports HGBT with sample weights
    # X, y here are the module-level toy sample
    reg = HistGradientBoostingRegressor(random_state=1)
    reg.fit(X, y, sample_weight=np.ones(len(X)))

    with pytest.raises(
        NotImplementedError, match="does not support partial dependence"
    ):
        partial_dependence(reg, X, features=[1])


def test_partial_dependence_pipeline():
    """PD on a pipeline must match PD on its final step with scaled data."""
    # check that the partial dependence support pipeline
    iris = load_iris()

    scaler = StandardScaler()
    clf = DummyClassifier(random_state=42)
    pipe = make_pipeline(scaler, clf)

    clf.fit(scaler.fit_transform(iris.data), iris.target)
    pipe.fit(iris.data, iris.target)

    features = 0
    pdp_pipe = partial_dependence(
        pipe, iris.data, features=[features], grid_resolution=10, kind="average"
    )
    pdp_clf = partial_dependence(
        clf,
        scaler.transform(iris.data),
        features=[features],
        grid_resolution=10,
        kind="average",
    )
    assert_allclose(pdp_pipe["average"], pdp_clf["average"])
    # the pipeline grid is in the original units: undo the standardization
    assert_allclose(
        pdp_pipe["values"][0],
        pdp_clf["values"][0] * scaler.scale_[features] + scaler.mean_[features],
    )


@pytest.mark.parametrize(
    "estimator",
    [
        LogisticRegression(max_iter=1000, random_state=0),
        GradientBoostingClassifier(random_state=0, n_estimators=5),
    ],
    ids=["estimator-brute", "estimator-recursion"],
)
@pytest.mark.parametrize(
    "preprocessor",
    [
        None,
        make_column_transformer(
            (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
            (RobustScaler(), [iris.feature_names[i] for i in (1, 3)]),
        ),
        make_column_transformer(
            (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
            remainder="passthrough",
        ),
    ],
    ids=["None", "column-transformer", "column-transformer-passthrough"],
)
@pytest.mark.parametrize(
    "features",
    [[0, 2], [iris.feature_names[i] for i in (0, 2)]],
    ids=["features-integer", "features-string"],
)
def test_partial_dependence_dataframe(estimator, preprocessor, features):
    """PD must support dataframes and pipelines with a column transformer."""
    # check that the partial dependence support dataframe and pipeline
    # including a column transformer
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(scale(iris.data), columns=iris.feature_names)

    pipe = make_pipeline(preprocessor, estimator)
    pipe.fit(df, iris.target)
    pdp_pipe = partial_dependence(
        pipe, df, features=features, grid_resolution=10, kind="average"
    )

    # the column transformer will reorder the column when transforming
    # we mixed the index to be sure that we are computing the partial
    # dependence of the right columns
    if preprocessor is not None:
        X_proc = clone(preprocessor).fit_transform(df)
        features_clf = [0, 1]
    else:
        X_proc = df
        features_clf = [0, 2]

    clf = clone(estimator).fit(X_proc, iris.target)
    pdp_clf = partial_dependence(
        clf,
        X_proc,
        features=features_clf,
        method="brute",
        grid_resolution=10,
        kind="average",
    )

    assert_allclose(pdp_pipe["average"], pdp_clf["average"])
    if preprocessor is not None:
        scaler = preprocessor.named_transformers_["standardscaler"]
        assert_allclose(
            pdp_pipe["values"][1],
            pdp_clf["values"][1] * scaler.scale_[1] + scaler.mean_[1],
        )
    else:
        assert_allclose(pdp_pipe["values"][1], pdp_clf["values"][1])


@pytest.mark.parametrize(
    "features, expected_pd_shape",
    [
        (0, (3, 10)),
        (iris.feature_names[0], (3, 10)),
        ([0, 2], (3, 10, 10)),
        ([iris.feature_names[i] for i in (0, 2)], (3, 10, 10)),
        ([True, False, True, False], (3, 10, 10)),
    ],
    ids=["scalar-int", "scalar-str", "list-int", "list-str", "mask"],
)
def test_partial_dependence_feature_type(features, expected_pd_shape):
    """Check every supported way of specifying `features`."""
    # check all possible features type supported in PDP
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(iris.data, columns=iris.feature_names)

    preprocessor = make_column_transformer(
        (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
        (RobustScaler(), [iris.feature_names[i] for i in (1, 3)]),
    )
    pipe = make_pipeline(
        preprocessor, LogisticRegression(max_iter=1000, random_state=0)
    )
    pipe.fit(df, iris.target)
    pdp_pipe = partial_dependence(
        pipe, df, features=features, grid_resolution=10, kind="average"
    )
    assert pdp_pipe["average"].shape == expected_pd_shape
    assert len(pdp_pipe["values"]) == len(pdp_pipe["average"].shape) - 1


@pytest.mark.parametrize(
    "estimator",
    [
        LinearRegression(),
        LogisticRegression(),
        GradientBoostingRegressor(),
        GradientBoostingClassifier(),
    ],
)
def test_partial_dependence_unfitted(estimator):
    """Unfitted estimators and pipelines must raise NotFittedError."""
    X = iris.data
    preprocessor = make_column_transformer(
        (StandardScaler(), [0, 2]), (RobustScaler(), [1, 3])
    )
    pipe = make_pipeline(preprocessor, estimator)
    with pytest.raises(NotFittedError, match="is not fitted yet"):
        partial_dependence(pipe, X, features=[0, 2], grid_resolution=10)
    with pytest.raises(NotFittedError, match="is not fitted yet"):
        partial_dependence(estimator, X, features=[0, 2], grid_resolution=10)


@pytest.mark.parametrize(
    "Estimator, data",
    [
        (LinearRegression, multioutput_regression_data),
        (LogisticRegression, binary_classification_data),
    ],
)
def test_kind_average_and_average_of_individual(Estimator, data):
    """kind='average' must equal the sample mean of kind='individual'."""
    est = Estimator()
    (X, y), _ = data
    est.fit(X, y)

    pdp_avg = partial_dependence(est, X=X, features=[1, 2], kind="average")
    pdp_ind = partial_dependence(est, X=X, features=[1, 2], kind="individual")
    # axis 1 of the individual output is averaged out before comparing
    avg_ind = np.mean(pdp_ind["individual"], axis=1)
    assert_allclose(avg_ind, pdp_avg["average"])


def test_warning_for_kind_legacy():
    """Default and explicit kind='legacy' must both emit a FutureWarning."""
    est = LogisticRegression()
    (X, y), _ = binary_classification_data
    est.fit(X, y)

    warn_msg = "A Bunch will be returned in place of 'predictions' from version 1.1"
    with pytest.warns(FutureWarning, match=warn_msg):
        partial_dependence(est, X=X, features=[1, 2])

    with pytest.warns(FutureWarning, match=warn_msg):
        partial_dependence(est, X=X, features=[1, 2], kind="legacy")