1from time import time 2from collections import namedtuple 3import warnings 4 5from scipy import stats 6import numpy as np 7 8from ..base import clone 9from ..exceptions import ConvergenceWarning 10from ..preprocessing import normalize 11from ..utils import check_array, check_random_state, _safe_indexing, is_scalar_nan 12from ..utils.validation import FLOAT_DTYPES, check_is_fitted 13from ..utils._mask import _get_mask 14 15from ._base import _BaseImputer 16from ._base import SimpleImputer 17from ._base import _check_inputs_dtype 18 19 20_ImputerTriplet = namedtuple( 21 "_ImputerTriplet", ["feat_idx", "neighbor_feat_idx", "estimator"] 22) 23 24 25class IterativeImputer(_BaseImputer): 26 """Multivariate imputer that estimates each feature from all the others. 27 28 A strategy for imputing missing values by modeling each feature with 29 missing values as a function of other features in a round-robin fashion. 30 31 Read more in the :ref:`User Guide <iterative_imputer>`. 32 33 .. versionadded:: 0.21 34 35 .. note:: 36 37 This estimator is still **experimental** for now: the predictions 38 and the API might change without any deprecation cycle. To use it, 39 you need to explicitly import `enable_iterative_imputer`:: 40 41 >>> # explicitly require this experimental feature 42 >>> from sklearn.experimental import enable_iterative_imputer # noqa 43 >>> # now you can import normally from sklearn.impute 44 >>> from sklearn.impute import IterativeImputer 45 46 Parameters 47 ---------- 48 estimator : estimator object, default=BayesianRidge() 49 The estimator to use at each step of the round-robin imputation. 50 If `sample_posterior=True`, the estimator must support 51 `return_std` in its `predict` method. 52 53 missing_values : int or np.nan, default=np.nan 54 The placeholder for the missing values. All occurrences of 55 `missing_values` will be imputed. For pandas' dataframes with 56 nullable integer dtypes with missing values, `missing_values` 57 should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. 58 59 sample_posterior : bool, default=False 60 Whether to sample from the (Gaussian) predictive posterior of the 61 fitted estimator for each imputation. Estimator must support 62 `return_std` in its `predict` method if set to `True`. Set to 63 `True` if using `IterativeImputer` for multiple imputations. 64 65 max_iter : int, default=10 66 Maximum number of imputation rounds to perform before returning the 67 imputations computed during the final round. A round is a single 68 imputation of each feature with missing values. The stopping criterion 69 is met once `max(abs(X_t - X_{t-1}))/max(abs(X[known_vals])) < tol`, 70 where `X_t` is `X` at iteration `t`. Note that early stopping is only 71 applied if `sample_posterior=False`. 72 73 tol : float, default=1e-3 74 Tolerance of the stopping condition. 75 76 n_nearest_features : int, default=None 77 Number of other features to use to estimate the missing values of 78 each feature column. Nearness between features is measured using 79 the absolute correlation coefficient between each feature pair (after 80 initial imputation). To ensure coverage of features throughout the 81 imputation process, the neighbor features are not necessarily nearest, 82 but are drawn with probability proportional to correlation for each 83 imputed target feature. Can provide significant speed-up when the 84 number of features is huge. If `None`, all features will be used. 85 86 initial_strategy : {'mean', 'median', 'most_frequent', 'constant'}, \ 87 default='mean' 88 Which strategy to use to initialize the missing values. Same as the 89 `strategy` parameter in :class:`~sklearn.impute.SimpleImputer`. 90 91 imputation_order : {'ascending', 'descending', 'roman', 'arabic', \ 92 'random'}, default='ascending' 93 The order in which the features will be imputed. Possible values: 94 95 - `'ascending'`: From features with fewest missing values to most. 96 - `'descending'`: From features with most missing values to fewest. 97 - `'roman'`: Left to right. 98 - `'arabic'`: Right to left. 99 - `'random'`: A random order for each round. 100 101 skip_complete : bool, default=False 102 If `True` then features with missing values during :meth:`transform` 103 which did not have any missing values during :meth:`fit` will be 104 imputed with the initial imputation method only. Set to `True` if you 105 have many features with no missing values at both :meth:`fit` and 106 :meth:`transform` time to save compute. 107 108 min_value : float or array-like of shape (n_features,), default=-np.inf 109 Minimum possible imputed value. Broadcast to shape `(n_features,)` if 110 scalar. If array-like, expects shape `(n_features,)`, one min value for 111 each feature. The default is `-np.inf`. 112 113 .. versionchanged:: 0.23 114 Added support for array-like. 115 116 max_value : float or array-like of shape (n_features,), default=np.inf 117 Maximum possible imputed value. Broadcast to shape `(n_features,)` if 118 scalar. If array-like, expects shape `(n_features,)`, one max value for 119 each feature. The default is `np.inf`. 120 121 .. versionchanged:: 0.23 122 Added support for array-like. 123 124 verbose : int, default=0 125 Verbosity flag, controls the debug messages that are issued 126 as functions are evaluated. The higher, the more verbose. Can be 0, 1, 127 or 2. 128 129 random_state : int, RandomState instance or None, default=None 130 The seed of the pseudo random number generator to use. Randomizes 131 selection of estimator features if `n_nearest_features` is not `None`, 132 the `imputation_order` if `random`, and the sampling from posterior if 133 `sample_posterior=True`. Use an integer for determinism. 134 See :term:`the Glossary <random_state>`. 135 136 add_indicator : bool, default=False 137 If `True`, a :class:`MissingIndicator` transform will stack onto output 138 of the imputer's transform. This allows a predictive estimator 139 to account for missingness despite imputation. If a feature has no 140 missing values at fit/train time, the feature won't appear on 141 the missing indicator even if there are missing values at 142 transform/test time. 143 144 Attributes 145 ---------- 146 initial_imputer_ : object of type :class:`~sklearn.impute.SimpleImputer` 147 Imputer used to initialize the missing values. 148 149 imputation_sequence_ : list of tuples 150 Each tuple has `(feat_idx, neighbor_feat_idx, estimator)`, where 151 `feat_idx` is the current feature to be imputed, 152 `neighbor_feat_idx` is the array of other features used to impute the 153 current feature, and `estimator` is the trained estimator used for 154 the imputation. Length is `self.n_features_with_missing_ * 155 self.n_iter_`. 156 157 n_iter_ : int 158 Number of iteration rounds that occurred. Will be less than 159 `self.max_iter` if early stopping criterion was reached. 160 161 n_features_in_ : int 162 Number of features seen during :term:`fit`. 163 164 .. versionadded:: 0.24 165 166 feature_names_in_ : ndarray of shape (`n_features_in_`,) 167 Names of features seen during :term:`fit`. Defined only when `X` 168 has feature names that are all strings. 169 170 .. versionadded:: 1.0 171 172 n_features_with_missing_ : int 173 Number of features with missing values. 174 175 indicator_ : :class:`~sklearn.impute.MissingIndicator` 176 Indicator used to add binary indicators for missing values. 177 `None` if `add_indicator=False`. 178 179 random_state_ : RandomState instance 180 RandomState instance that is generated either from a seed, the random 181 number generator or by `np.random`. 182 183 See Also 184 -------- 185 SimpleImputer : Univariate imputation of missing values. 186 187 Notes 188 ----- 189 To support imputation in inductive mode we store each feature's estimator 190 during the :meth:`fit` phase, and predict without refitting (in order) 191 during the :meth:`transform` phase. 192 193 Features which contain all missing values at :meth:`fit` are discarded upon 194 :meth:`transform`. 195 196 References 197 ---------- 198 .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: 199 Multivariate Imputation by Chained Equations in R". Journal of 200 Statistical Software 45: 1-67. 201 <https://www.jstatsoft.org/article/view/v045i03>`_ 202 203 .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in 204 Multivariate Data Suitable for use with an Electronic Computer". 205 Journal of the Royal Statistical Society 22(2): 302-306. 206 <https://www.jstor.org/stable/2984099>`_ 207 208 Examples 209 -------- 210 >>> import numpy as np 211 >>> from sklearn.experimental import enable_iterative_imputer 212 >>> from sklearn.impute import IterativeImputer 213 >>> imp_mean = IterativeImputer(random_state=0) 214 >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) 215 IterativeImputer(random_state=0) 216 >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] 217 >>> imp_mean.transform(X) 218 array([[ 6.9584..., 2. , 3. ], 219 [ 4. , 2.6000..., 6. ], 220 [10. , 4.9999..., 9. ]]) 221 """ 222 223 def __init__( 224 self, 225 estimator=None, 226 *, 227 missing_values=np.nan, 228 sample_posterior=False, 229 max_iter=10, 230 tol=1e-3, 231 n_nearest_features=None, 232 initial_strategy="mean", 233 imputation_order="ascending", 234 skip_complete=False, 235 min_value=-np.inf, 236 max_value=np.inf, 237 verbose=0, 238 random_state=None, 239 add_indicator=False, 240 ): 241 super().__init__(missing_values=missing_values, add_indicator=add_indicator) 242 243 self.estimator = estimator 244 self.sample_posterior = sample_posterior 245 self.max_iter = max_iter 246 self.tol = tol 247 self.n_nearest_features = n_nearest_features 248 self.initial_strategy = initial_strategy 249 self.imputation_order = imputation_order 250 self.skip_complete = skip_complete 251 self.min_value = min_value 252 self.max_value = max_value 253 self.verbose = verbose 254 self.random_state = random_state 255 256 def _impute_one_feature( 257 self, 258 X_filled, 259 mask_missing_values, 260 feat_idx, 261 neighbor_feat_idx, 262 estimator=None, 263 fit_mode=True, 264 ): 265 """Impute a single feature from the others provided. 266 267 This function predicts the missing values of one of the features using 268 the current estimates of all the other features. The `estimator` must 269 support `return_std=True` in its `predict` method for this function 270 to work. 271 272 Parameters 273 ---------- 274 X_filled : ndarray 275 Input data with the most recent imputations. 276 277 mask_missing_values : ndarray 278 Input data's missing indicator matrix. 279 280 feat_idx : int 281 Index of the feature currently being imputed. 282 283 neighbor_feat_idx : ndarray 284 Indices of the features to be used in imputing `feat_idx`. 285 286 estimator : object 287 The estimator to use at this step of the round-robin imputation. 288 If `sample_posterior=True`, the estimator must support 289 `return_std` in its `predict` method. 290 If None, it will be cloned from self._estimator. 291 292 fit_mode : boolean, default=True 293 Whether to fit and predict with the estimator or just predict. 294 295 Returns 296 ------- 297 X_filled : ndarray 298 Input data with `X_filled[missing_row_mask, feat_idx]` updated. 299 300 estimator : estimator with sklearn API 301 The fitted estimator used to impute 302 `X_filled[missing_row_mask, feat_idx]`. 303 """ 304 if estimator is None and fit_mode is False: 305 raise ValueError( 306 "If fit_mode is False, then an already-fitted " 307 "estimator should be passed in." 308 ) 309 310 if estimator is None: 311 estimator = clone(self._estimator) 312 313 missing_row_mask = mask_missing_values[:, feat_idx] 314 if fit_mode: 315 X_train = _safe_indexing(X_filled[:, neighbor_feat_idx], ~missing_row_mask) 316 y_train = _safe_indexing(X_filled[:, feat_idx], ~missing_row_mask) 317 estimator.fit(X_train, y_train) 318 319 # if no missing values, don't predict 320 if np.sum(missing_row_mask) == 0: 321 return X_filled, estimator 322 323 # get posterior samples if there is at least one missing value 324 X_test = _safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask) 325 if self.sample_posterior: 326 mus, sigmas = estimator.predict(X_test, return_std=True) 327 imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) 328 # two types of problems: (1) non-positive sigmas 329 # (2) mus outside legal range of min_value and max_value 330 # (results in inf sample) 331 positive_sigmas = sigmas > 0 332 imputed_values[~positive_sigmas] = mus[~positive_sigmas] 333 mus_too_low = mus < self._min_value[feat_idx] 334 imputed_values[mus_too_low] = self._min_value[feat_idx] 335 mus_too_high = mus > self._max_value[feat_idx] 336 imputed_values[mus_too_high] = self._max_value[feat_idx] 337 # the rest can be sampled without statistical issues 338 inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high 339 mus = mus[inrange_mask] 340 sigmas = sigmas[inrange_mask] 341 a = (self._min_value[feat_idx] - mus) / sigmas 342 b = (self._max_value[feat_idx] - mus) / sigmas 343 344 truncated_normal = stats.truncnorm(a=a, b=b, loc=mus, scale=sigmas) 345 imputed_values[inrange_mask] = truncated_normal.rvs( 346 random_state=self.random_state_ 347 ) 348 else: 349 imputed_values = estimator.predict(X_test) 350 imputed_values = np.clip( 351 imputed_values, self._min_value[feat_idx], self._max_value[feat_idx] 352 ) 353 354 # update the feature 355 X_filled[missing_row_mask, feat_idx] = imputed_values 356 return X_filled, estimator 357 358 def _get_neighbor_feat_idx(self, n_features, feat_idx, abs_corr_mat): 359 """Get a list of other features to predict `feat_idx`. 360 361 If `self.n_nearest_features` is less than or equal to the total 362 number of features, then use a probability proportional to the absolute 363 correlation between `feat_idx` and each other feature to randomly 364 choose a subsample of the other features (without replacement). 365 366 Parameters 367 ---------- 368 n_features : int 369 Number of features in `X`. 370 371 feat_idx : int 372 Index of the feature currently being imputed. 373 374 abs_corr_mat : ndarray, shape (n_features, n_features) 375 Absolute correlation matrix of `X`. The diagonal has been zeroed 376 out and each feature has been normalized to sum to 1. Can be None. 377 378 Returns 379 ------- 380 neighbor_feat_idx : array-like 381 The features to use to impute `feat_idx`. 382 """ 383 if self.n_nearest_features is not None and self.n_nearest_features < n_features: 384 p = abs_corr_mat[:, feat_idx] 385 neighbor_feat_idx = self.random_state_.choice( 386 np.arange(n_features), self.n_nearest_features, replace=False, p=p 387 ) 388 else: 389 inds_left = np.arange(feat_idx) 390 inds_right = np.arange(feat_idx + 1, n_features) 391 neighbor_feat_idx = np.concatenate((inds_left, inds_right)) 392 return neighbor_feat_idx 393 394 def _get_ordered_idx(self, mask_missing_values): 395 """Decide in what order we will update the features. 396 397 As a homage to the MICE R package, we will have 4 main options of 398 how to order the updates, and use a random order if anything else 399 is specified. 400 401 Also, this function skips features which have no missing values. 402 403 Parameters 404 ---------- 405 mask_missing_values : array-like, shape (n_samples, n_features) 406 Input data's missing indicator matrix, where `n_samples` is the 407 number of samples and `n_features` is the number of features. 408 409 Returns 410 ------- 411 ordered_idx : ndarray, shape (n_features,) 412 The order in which to impute the features. 413 """ 414 frac_of_missing_values = mask_missing_values.mean(axis=0) 415 if self.skip_complete: 416 missing_values_idx = np.flatnonzero(frac_of_missing_values) 417 else: 418 missing_values_idx = np.arange(np.shape(frac_of_missing_values)[0]) 419 if self.imputation_order == "roman": 420 ordered_idx = missing_values_idx 421 elif self.imputation_order == "arabic": 422 ordered_idx = missing_values_idx[::-1] 423 elif self.imputation_order == "ascending": 424 n = len(frac_of_missing_values) - len(missing_values_idx) 425 ordered_idx = np.argsort(frac_of_missing_values, kind="mergesort")[n:] 426 elif self.imputation_order == "descending": 427 n = len(frac_of_missing_values) - len(missing_values_idx) 428 ordered_idx = np.argsort(frac_of_missing_values, kind="mergesort")[n:][::-1] 429 elif self.imputation_order == "random": 430 ordered_idx = missing_values_idx 431 self.random_state_.shuffle(ordered_idx) 432 else: 433 raise ValueError( 434 "Got an invalid imputation order: '{0}'. It must " 435 "be one of the following: 'roman', 'arabic', " 436 "'ascending', 'descending', or " 437 "'random'.".format(self.imputation_order) 438 ) 439 return ordered_idx 440 441 def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): 442 """Get absolute correlation matrix between features. 443 444 Parameters 445 ---------- 446 X_filled : ndarray, shape (n_samples, n_features) 447 Input data with the most recent imputations. 448 449 tolerance : float, default=1e-6 450 `abs_corr_mat` can have nans, which will be replaced 451 with `tolerance`. 452 453 Returns 454 ------- 455 abs_corr_mat : ndarray, shape (n_features, n_features) 456 Absolute correlation matrix of `X` at the beginning of the 457 current round. The diagonal has been zeroed out and each feature's 458 absolute correlations with all others have been normalized to sum 459 to 1. 460 """ 461 n_features = X_filled.shape[1] 462 if self.n_nearest_features is None or self.n_nearest_features >= n_features: 463 return None 464 with np.errstate(invalid="ignore"): 465 # if a feature in the neighborhood has only a single value 466 # (e.g., categorical feature), the std. dev. will be null and 467 # np.corrcoef will raise a warning due to a division by zero 468 abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) 469 # np.corrcoef is not defined for features with zero std 470 abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance 471 # ensures exploration, i.e. at least some probability of sampling 472 np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) 473 # features are not their own neighbors 474 np.fill_diagonal(abs_corr_mat, 0) 475 # needs to sum to 1 for np.random.choice sampling 476 abs_corr_mat = normalize(abs_corr_mat, norm="l1", axis=0, copy=False) 477 return abs_corr_mat 478 479 def _initial_imputation(self, X, in_fit=False): 480 """Perform initial imputation for input `X`. 481 482 Parameters 483 ---------- 484 X : ndarray, shape (n_samples, n_features) 485 Input data, where `n_samples` is the number of samples and 486 `n_features` is the number of features. 487 488 in_fit : bool, default=False 489 Whether function is called in :meth:`fit`. 490 491 Returns 492 ------- 493 Xt : ndarray, shape (n_samples, n_features) 494 Input data, where `n_samples` is the number of samples and 495 `n_features` is the number of features. 496 497 X_filled : ndarray, shape (n_samples, n_features) 498 Input data with the most recent imputations. 499 500 mask_missing_values : ndarray, shape (n_samples, n_features) 501 Input data's missing indicator matrix, where `n_samples` is the 502 number of samples and `n_features` is the number of features. 503 504 X_missing_mask : ndarray, shape (n_samples, n_features) 505 Input data's mask matrix indicating missing datapoints, where 506 `n_samples` is the number of samples and `n_features` is the 507 number of features. 508 """ 509 if is_scalar_nan(self.missing_values): 510 force_all_finite = "allow-nan" 511 else: 512 force_all_finite = True 513 514 X = self._validate_data( 515 X, 516 dtype=FLOAT_DTYPES, 517 order="F", 518 reset=in_fit, 519 force_all_finite=force_all_finite, 520 ) 521 _check_inputs_dtype(X, self.missing_values) 522 523 X_missing_mask = _get_mask(X, self.missing_values) 524 mask_missing_values = X_missing_mask.copy() 525 if self.initial_imputer_ is None: 526 self.initial_imputer_ = SimpleImputer( 527 missing_values=self.missing_values, strategy=self.initial_strategy 528 ) 529 X_filled = self.initial_imputer_.fit_transform(X) 530 else: 531 X_filled = self.initial_imputer_.transform(X) 532 533 valid_mask = np.flatnonzero( 534 np.logical_not(np.isnan(self.initial_imputer_.statistics_)) 535 ) 536 Xt = X[:, valid_mask] 537 mask_missing_values = mask_missing_values[:, valid_mask] 538 539 return Xt, X_filled, mask_missing_values, X_missing_mask 540 541 @staticmethod 542 def _validate_limit(limit, limit_type, n_features): 543 """Validate the limits (min/max) of the feature values. 544 545 Converts scalar min/max limits to vectors of shape `(n_features,)`. 546 547 Parameters 548 ---------- 549 limit: scalar or array-like 550 The user-specified limit (i.e, min_value or max_value). 551 limit_type: {'max', 'min'} 552 Type of limit to validate. 553 n_features: int 554 Number of features in the dataset. 555 556 Returns 557 ------- 558 limit: ndarray, shape(n_features,) 559 Array of limits, one for each feature. 560 """ 561 limit_bound = np.inf if limit_type == "max" else -np.inf 562 limit = limit_bound if limit is None else limit 563 if np.isscalar(limit): 564 limit = np.full(n_features, limit) 565 limit = check_array(limit, force_all_finite=False, copy=False, ensure_2d=False) 566 if not limit.shape[0] == n_features: 567 raise ValueError( 568 f"'{limit_type}_value' should be of " 569 f"shape ({n_features},) when an array-like " 570 f"is provided. Got {limit.shape}, instead." 571 ) 572 return limit 573 574 def fit_transform(self, X, y=None): 575 """Fit the imputer on `X` and return the transformed `X`. 576 577 Parameters 578 ---------- 579 X : array-like, shape (n_samples, n_features) 580 Input data, where `n_samples` is the number of samples and 581 `n_features` is the number of features. 582 583 y : Ignored 584 Not used, present for API consistency by convention. 585 586 Returns 587 ------- 588 Xt : array-like, shape (n_samples, n_features) 589 The imputed input data. 590 """ 591 self.random_state_ = getattr( 592 self, "random_state_", check_random_state(self.random_state) 593 ) 594 595 if self.max_iter < 0: 596 raise ValueError( 597 "'max_iter' should be a positive integer. Got {} instead.".format( 598 self.max_iter 599 ) 600 ) 601 602 if self.tol < 0: 603 raise ValueError( 604 "'tol' should be a non-negative float. Got {} instead.".format(self.tol) 605 ) 606 607 if self.estimator is None: 608 from ..linear_model import BayesianRidge 609 610 self._estimator = BayesianRidge() 611 else: 612 self._estimator = clone(self.estimator) 613 614 self.imputation_sequence_ = [] 615 616 self.initial_imputer_ = None 617 618 X, Xt, mask_missing_values, complete_mask = self._initial_imputation( 619 X, in_fit=True 620 ) 621 622 super()._fit_indicator(complete_mask) 623 X_indicator = super()._transform_indicator(complete_mask) 624 625 if self.max_iter == 0 or np.all(mask_missing_values): 626 self.n_iter_ = 0 627 return super()._concatenate_indicator(Xt, X_indicator) 628 629 # Edge case: a single feature. We return the initial ... 630 if Xt.shape[1] == 1: 631 self.n_iter_ = 0 632 return super()._concatenate_indicator(Xt, X_indicator) 633 634 self._min_value = self._validate_limit(self.min_value, "min", X.shape[1]) 635 self._max_value = self._validate_limit(self.max_value, "max", X.shape[1]) 636 637 if not np.all(np.greater(self._max_value, self._min_value)): 638 raise ValueError("One (or more) features have min_value >= max_value.") 639 640 # order in which to impute 641 # note this is probably too slow for large feature data (d > 100000) 642 # and a better way would be good. 643 # see: https://goo.gl/KyCNwj and subsequent comments 644 ordered_idx = self._get_ordered_idx(mask_missing_values) 645 self.n_features_with_missing_ = len(ordered_idx) 646 647 abs_corr_mat = self._get_abs_corr_mat(Xt) 648 649 n_samples, n_features = Xt.shape 650 if self.verbose > 0: 651 print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) 652 start_t = time() 653 if not self.sample_posterior: 654 Xt_previous = Xt.copy() 655 normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values])) 656 for self.n_iter_ in range(1, self.max_iter + 1): 657 if self.imputation_order == "random": 658 ordered_idx = self._get_ordered_idx(mask_missing_values) 659 660 for feat_idx in ordered_idx: 661 neighbor_feat_idx = self._get_neighbor_feat_idx( 662 n_features, feat_idx, abs_corr_mat 663 ) 664 Xt, estimator = self._impute_one_feature( 665 Xt, 666 mask_missing_values, 667 feat_idx, 668 neighbor_feat_idx, 669 estimator=None, 670 fit_mode=True, 671 ) 672 estimator_triplet = _ImputerTriplet( 673 feat_idx, neighbor_feat_idx, estimator 674 ) 675 self.imputation_sequence_.append(estimator_triplet) 676 677 if self.verbose > 1: 678 print( 679 "[IterativeImputer] Ending imputation round " 680 "%d/%d, elapsed time %0.2f" 681 % (self.n_iter_, self.max_iter, time() - start_t) 682 ) 683 684 if not self.sample_posterior: 685 inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, axis=None) 686 if self.verbose > 0: 687 print( 688 "[IterativeImputer] Change: {}, scaled tolerance: {} ".format( 689 inf_norm, normalized_tol 690 ) 691 ) 692 if inf_norm < normalized_tol: 693 if self.verbose > 0: 694 print("[IterativeImputer] Early stopping criterion reached.") 695 break 696 Xt_previous = Xt.copy() 697 else: 698 if not self.sample_posterior: 699 warnings.warn( 700 "[IterativeImputer] Early stopping criterion not reached.", 701 ConvergenceWarning, 702 ) 703 Xt[~mask_missing_values] = X[~mask_missing_values] 704 return super()._concatenate_indicator(Xt, X_indicator) 705 706 def transform(self, X): 707 """Impute all missing values in `X`. 708 709 Note that this is stochastic, and that if `random_state` is not fixed, 710 repeated calls, or permuted input, results will differ. 711 712 Parameters 713 ---------- 714 X : array-like of shape (n_samples, n_features) 715 The input data to complete. 716 717 Returns 718 ------- 719 Xt : array-like, shape (n_samples, n_features) 720 The imputed input data. 721 """ 722 check_is_fitted(self) 723 724 X, Xt, mask_missing_values, complete_mask = self._initial_imputation(X) 725 726 X_indicator = super()._transform_indicator(complete_mask) 727 728 if self.n_iter_ == 0 or np.all(mask_missing_values): 729 return super()._concatenate_indicator(Xt, X_indicator) 730 731 imputations_per_round = len(self.imputation_sequence_) // self.n_iter_ 732 i_rnd = 0 733 if self.verbose > 0: 734 print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) 735 start_t = time() 736 for it, estimator_triplet in enumerate(self.imputation_sequence_): 737 Xt, _ = self._impute_one_feature( 738 Xt, 739 mask_missing_values, 740 estimator_triplet.feat_idx, 741 estimator_triplet.neighbor_feat_idx, 742 estimator=estimator_triplet.estimator, 743 fit_mode=False, 744 ) 745 if not (it + 1) % imputations_per_round: 746 if self.verbose > 1: 747 print( 748 "[IterativeImputer] Ending imputation round " 749 "%d/%d, elapsed time %0.2f" 750 % (i_rnd + 1, self.n_iter_, time() - start_t) 751 ) 752 i_rnd += 1 753 754 Xt[~mask_missing_values] = X[~mask_missing_values] 755 756 return super()._concatenate_indicator(Xt, X_indicator) 757 758 def fit(self, X, y=None): 759 """Fit the imputer on `X` and return self. 760 761 Parameters 762 ---------- 763 X : array-like, shape (n_samples, n_features) 764 Input data, where `n_samples` is the number of samples and 765 `n_features` is the number of features. 766 767 y : Ignored 768 Not used, present for API consistency by convention. 769 770 Returns 771 ------- 772 self : object 773 Fitted estimator. 774 """ 775 self.fit_transform(X) 776 return self 777