1""" 2This file contains preprocessing tools based on polynomials. 3""" 4import collections 5import numbers 6from itertools import chain, combinations 7from itertools import combinations_with_replacement as combinations_w_r 8 9import numpy as np 10from scipy import sparse 11from scipy.interpolate import BSpline 12from scipy.special import comb 13 14from ..base import BaseEstimator, TransformerMixin 15from ..utils import check_array 16from ..utils.deprecation import deprecated 17from ..utils.fixes import linspace 18from ..utils.validation import check_is_fitted, FLOAT_DTYPES, _check_sample_weight 19from ..utils.validation import _check_feature_names_in 20from ..utils.stats import _weighted_percentile 21 22from ._csr_polynomial_expansion import _csr_polynomial_expansion 23 24 25__all__ = [ 26 "PolynomialFeatures", 27 "SplineTransformer", 28] 29 30 31class PolynomialFeatures(TransformerMixin, BaseEstimator): 32 """Generate polynomial and interaction features. 33 34 Generate a new feature matrix consisting of all polynomial combinations 35 of the features with degree less than or equal to the specified degree. 36 For example, if an input sample is two dimensional and of the form 37 [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. 38 39 Read more in the :ref:`User Guide <polynomial_features>`. 40 41 Parameters 42 ---------- 43 degree : int or tuple (min_degree, max_degree), default=2 44 If a single int is given, it specifies the maximal degree of the 45 polynomial features. If a tuple `(min_degree, max_degree)` is passed, 46 then `min_degree` is the minimum and `max_degree` is the maximum 47 polynomial degree of the generated features. Note that `min_degree=0` 48 and `min_degree=1` are equivalent as outputting the degree zero term is 49 determined by `include_bias`. 50 51 interaction_only : bool, default=False 52 If `True`, only interaction features are produced: features that are 53 products of at most `degree` *distinct* input features, i.e. terms with 54 power of 2 or higher of the same input feature are excluded: 55 56 - included: `x[0]`, `x[1]`, `x[0] * x[1]`, etc. 57 - excluded: `x[0] ** 2`, `x[0] ** 2 * x[1]`, etc. 58 59 include_bias : bool, default=True 60 If `True` (default), then include a bias column, the feature in which 61 all polynomial powers are zero (i.e. a column of ones - acts as an 62 intercept term in a linear model). 63 64 order : {'C', 'F'}, default='C' 65 Order of output array in the dense case. `'F'` order is faster to 66 compute, but may slow down subsequent estimators. 67 68 .. versionadded:: 0.21 69 70 Attributes 71 ---------- 72 powers_ : ndarray of shape (`n_output_features_`, `n_features_in_`) 73 `powers_[i, j]` is the exponent of the jth input in the ith output. 74 75 n_input_features_ : int 76 The total number of input features. 77 78 .. deprecated:: 1.0 79 This attribute is deprecated in 1.0 and will be removed in 1.2. 80 Refer to `n_features_in_` instead. 81 82 n_features_in_ : int 83 Number of features seen during :term:`fit`. 84 85 .. versionadded:: 0.24 86 87 feature_names_in_ : ndarray of shape (`n_features_in_`,) 88 Names of features seen during :term:`fit`. Defined only when `X` 89 has feature names that are all strings. 90 91 .. versionadded:: 1.0 92 93 n_output_features_ : int 94 The total number of polynomial output features. The number of output 95 features is computed by iterating over all suitably sized combinations 96 of input features. 

    See Also
    --------
    SplineTransformer : Transformer that generates univariate B-spline bases
        for features.

    Notes
    -----
    Be aware that the number of features in the output array scales
    polynomially in the number of features of the input array, and
    exponentially in the degree. High degrees can cause overfitting.

    See :ref:`examples/linear_model/plot_polynomial_interpolation.py
    <sphx_glr_auto_examples_linear_model_plot_polynomial_interpolation.py>`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import PolynomialFeatures
    >>> X = np.arange(6).reshape(3, 2)
    >>> X
    array([[0, 1],
           [2, 3],
           [4, 5]])
    >>> poly = PolynomialFeatures(2)
    >>> poly.fit_transform(X)
    array([[ 1.,  0.,  1.,  0.,  0.,  1.],
           [ 1.,  2.,  3.,  4.,  6.,  9.],
           [ 1.,  4.,  5., 16., 20., 25.]])
    >>> poly = PolynomialFeatures(interaction_only=True)
    >>> poly.fit_transform(X)
    array([[ 1.,  0.,  1.,  0.],
           [ 1.,  2.,  3.,  6.],
           [ 1.,  4.,  5., 20.]])
    """

    def __init__(
        self, degree=2, *, interaction_only=False, include_bias=True, order="C"
    ):
        self.degree = degree
        self.interaction_only = interaction_only
        self.include_bias = include_bias
        self.order = order

    @staticmethod
    def _combinations(
        n_features, min_degree, max_degree, interaction_only, include_bias
    ):
        comb = combinations if interaction_only else combinations_w_r
        start = max(1, min_degree)
        iter = chain.from_iterable(
            comb(range(n_features), i) for i in range(start, max_degree + 1)
        )
        if include_bias:
            iter = chain(comb(range(n_features), 0), iter)
        return iter

    @staticmethod
    def _num_combinations(
        n_features, min_degree, max_degree, interaction_only, include_bias
    ):
        """Calculate number of terms in polynomial expansion

        This should be equivalent to counting the number of terms returned by
        _combinations(...) but much faster.
        """

        if interaction_only:
            combinations = sum(
                [
                    comb(n_features, i, exact=True)
                    for i in range(max(1, min_degree), min(max_degree, n_features) + 1)
                ]
            )
        else:
            combinations = comb(n_features + max_degree, max_degree, exact=True) - 1
            if min_degree > 0:
                d = min_degree - 1
                combinations -= comb(n_features + d, d, exact=True) - 1

        if include_bias:
            combinations += 1

        return combinations

    @property
    def powers_(self):
        """Exponent for each of the inputs in the output."""
        check_is_fitted(self)

        combinations = self._combinations(
            n_features=self.n_features_in_,
            min_degree=self._min_degree,
            max_degree=self._max_degree,
            interaction_only=self.interaction_only,
            include_bias=self.include_bias,
        )
        return np.vstack(
            [np.bincount(c, minlength=self.n_features_in_) for c in combinations]
        )

    @deprecated(
        "get_feature_names is deprecated in 1.0 and will be removed "
        "in 1.2. Please use get_feature_names_out instead."
    )
    def get_feature_names(self, input_features=None):
        """Return feature names for output features.

        Parameters
        ----------
        input_features : list of str of shape (n_features,), default=None
            String names for input features if available. By default,
            "x0", "x1", ... "xn_features" is used.

        Returns
        -------
        output_feature_names : list of str of shape (n_output_features,)
            Transformed feature names.
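            For example, with `input_features=["a", "b"]` and `degree=2`, the
            names are `["1", "a", "b", "a^2", "a b", "b^2"]`.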
215 """ 216 powers = self.powers_ 217 if input_features is None: 218 input_features = ["x%d" % i for i in range(powers.shape[1])] 219 feature_names = [] 220 for row in powers: 221 inds = np.where(row)[0] 222 if len(inds): 223 name = " ".join( 224 "%s^%d" % (input_features[ind], exp) 225 if exp != 1 226 else input_features[ind] 227 for ind, exp in zip(inds, row[inds]) 228 ) 229 else: 230 name = "1" 231 feature_names.append(name) 232 return feature_names 233 234 def get_feature_names_out(self, input_features=None): 235 """Get output feature names for transformation. 236 237 Parameters 238 ---------- 239 input_features : array-like of str or None, default=None 240 Input features. 241 242 - If `input_features is None`, then `feature_names_in_` is 243 used as feature names in. If `feature_names_in_` is not defined, 244 then names are generated: `[x0, x1, ..., x(n_features_in_)]`. 245 - If `input_features` is an array-like, then `input_features` must 246 match `feature_names_in_` if `feature_names_in_` is defined. 247 248 Returns 249 ------- 250 feature_names_out : ndarray of str objects 251 Transformed feature names. 252 """ 253 powers = self.powers_ 254 input_features = _check_feature_names_in(self, input_features) 255 feature_names = [] 256 for row in powers: 257 inds = np.where(row)[0] 258 if len(inds): 259 name = " ".join( 260 "%s^%d" % (input_features[ind], exp) 261 if exp != 1 262 else input_features[ind] 263 for ind, exp in zip(inds, row[inds]) 264 ) 265 else: 266 name = "1" 267 feature_names.append(name) 268 return np.asarray(feature_names, dtype=object) 269 270 def fit(self, X, y=None): 271 """ 272 Compute number of output features. 273 274 Parameters 275 ---------- 276 X : {array-like, sparse matrix} of shape (n_samples, n_features) 277 The data. 278 279 y : Ignored 280 Not used, present here for API consistency by convention. 281 282 Returns 283 ------- 284 self : object 285 Fitted transformer. 286 """ 287 _, n_features = self._validate_data(X, accept_sparse=True).shape 288 289 if isinstance(self.degree, numbers.Integral): 290 if self.degree < 0: 291 raise ValueError( 292 f"degree must be a non-negative integer, got {self.degree}." 293 ) 294 self._min_degree = 0 295 self._max_degree = self.degree 296 elif ( 297 isinstance(self.degree, collections.abc.Iterable) and len(self.degree) == 2 298 ): 299 self._min_degree, self._max_degree = self.degree 300 if not ( 301 isinstance(self._min_degree, numbers.Integral) 302 and isinstance(self._max_degree, numbers.Integral) 303 and self._min_degree >= 0 304 and self._min_degree <= self._max_degree 305 ): 306 raise ValueError( 307 "degree=(min_degree, max_degree) must " 308 "be non-negative integers that fulfil " 309 "min_degree <= max_degree, got " 310 f"{self.degree}." 311 ) 312 else: 313 raise ValueError( 314 "degree must be a non-negative int or tuple " 315 "(min_degree, max_degree), got " 316 f"{self.degree}." 317 ) 318 319 self.n_output_features_ = self._num_combinations( 320 n_features=n_features, 321 min_degree=self._min_degree, 322 max_degree=self._max_degree, 323 interaction_only=self.interaction_only, 324 include_bias=self.include_bias, 325 ) 326 # We also record the number of output features for 327 # _max_degree = 0 328 self._n_out_full = self._num_combinations( 329 n_features=n_features, 330 min_degree=0, 331 max_degree=self._max_degree, 332 interaction_only=self.interaction_only, 333 include_bias=self.include_bias, 334 ) 335 336 return self 337 338 def transform(self, X): 339 """Transform data to polynomial features. 

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data to transform, row by row.

            Prefer CSR over CSC for sparse input (for speed), but CSC is
            required if the degree is 4 or higher. If the degree is less than
            4 and the input format is CSC, it will be converted to CSR, have
            its polynomial features generated, then converted back to CSC.

            If the degree is 2 or 3, the method described in "Leveraging
            Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices
            Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is
            used, which is much faster than the method used on CSC input. For
            this reason, a CSC input will be converted to CSR, and the output
            will be converted back to CSC prior to being returned, hence the
            preference for CSR.

        Returns
        -------
        XP : {ndarray, sparse matrix} of shape (n_samples, NP)
            The matrix of features, where `NP` is the number of polynomial
            features generated from the combination of inputs. If a sparse
            matrix is provided, it will be converted into a sparse
            `csr_matrix`.
        """
        check_is_fitted(self)

        X = self._validate_data(
            X, order="F", dtype=FLOAT_DTYPES, reset=False, accept_sparse=("csr", "csc")
        )

        n_samples, n_features = X.shape

        if sparse.isspmatrix_csr(X):
            if self._max_degree > 3:
                return self.transform(X.tocsc()).tocsr()
            to_stack = []
            if self.include_bias:
                to_stack.append(
                    sparse.csc_matrix(np.ones(shape=(n_samples, 1), dtype=X.dtype))
                )
            if self._min_degree <= 1:
                to_stack.append(X)
            for deg in range(max(2, self._min_degree), self._max_degree + 1):
                Xp_next = _csr_polynomial_expansion(
                    X.data, X.indices, X.indptr, X.shape[1], self.interaction_only, deg
                )
                if Xp_next is None:
                    break
                to_stack.append(Xp_next)
            if len(to_stack) == 0:
                # edge case: deal with empty matrix
                XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype)
            else:
                XP = sparse.hstack(to_stack, format="csr")
        elif sparse.isspmatrix_csc(X) and self._max_degree < 4:
            return self.transform(X.tocsr()).tocsc()
        elif sparse.isspmatrix(X):
            combinations = self._combinations(
                n_features=n_features,
                min_degree=self._min_degree,
                max_degree=self._max_degree,
                interaction_only=self.interaction_only,
                include_bias=self.include_bias,
            )
            columns = []
            for combi in combinations:
                if combi:
                    out_col = 1
                    for col_idx in combi:
                        out_col = X[:, col_idx].multiply(out_col)
                    columns.append(out_col)
                else:
                    bias = sparse.csc_matrix(np.ones((X.shape[0], 1)))
                    columns.append(bias)
            XP = sparse.hstack(columns, dtype=X.dtype).tocsc()
        else:
            # Do as if _min_degree = 0 and cut down array after the
            # computation, i.e. use _n_out_full instead of n_output_features_.
            XP = np.empty(
                shape=(n_samples, self._n_out_full), dtype=X.dtype, order=self.order
            )

            # What follows is a faster implementation of:
            # for i, comb in enumerate(combinations):
            #     XP[:, i] = X[:, comb].prod(1)
            # This implementation uses two optimisations.
            # First one is broadcasting,
            # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1]
            # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2]
            # ...
            # multiply (X[:, start:end], X[:, start]) -> ...
            # Second optimisation happens for degrees >= 3.
            # Xi^3 is computed reusing previous computation:
            # Xi^3 = Xi^2 * Xi.
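
            # For example, with n_features=2, degree=2 and include_bias=True,
            # the columns of XP are laid out as [1, x0, x1, x0^2, x0*x1, x1^2]:
            # the degree-1 block [x0, x1] times x0 fills [x0^2, x0*x1], then
            # [x1] times x1 fills [x1^2]. The list `index` below records where
            # each feature's block of same-degree terms starts.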

            # degree 0 term
            if self.include_bias:
                XP[:, 0] = 1
                current_col = 1
            else:
                current_col = 0

            # degree 1 term
            XP[:, current_col : current_col + n_features] = X
            index = list(range(current_col, current_col + n_features))
            current_col += n_features
            index.append(current_col)

            # loop over degree >= 2 terms
            for _ in range(2, self._max_degree + 1):
                new_index = []
                end = index[-1]
                for feature_idx in range(n_features):
                    start = index[feature_idx]
                    new_index.append(current_col)
                    if self.interaction_only:
                        start += index[feature_idx + 1] - index[feature_idx]
                    next_col = current_col + end - start
                    if next_col <= current_col:
                        break
                    # XP[:, start:end] are terms of degree d - 1
                    # that exclude feature #feature_idx.
                    np.multiply(
                        XP[:, start:end],
                        X[:, feature_idx : feature_idx + 1],
                        out=XP[:, current_col:next_col],
                        casting="no",
                    )
                    current_col = next_col

                new_index.append(current_col)
                index = new_index

            if self._min_degree > 1:
                n_XP, n_Xout = self._n_out_full, self.n_output_features_
                if self.include_bias:
                    Xout = np.empty(
                        shape=(n_samples, n_Xout), dtype=XP.dtype, order=self.order
                    )
                    Xout[:, 0] = 1
                    Xout[:, 1:] = XP[:, n_XP - n_Xout + 1 :]
                else:
                    Xout = XP[:, n_XP - n_Xout :].copy()
                XP = Xout
        return XP

    # TODO: Remove in 1.2
    # mypy error: Decorated property not supported
    @deprecated(  # type: ignore
        "The attribute `n_input_features_` was "
        "deprecated in version 1.0 and will be removed in 1.2."
    )
    @property
    def n_input_features_(self):
        return self.n_features_in_


# TODO:
# - sparse support (either scipy or own cython solution)?
class SplineTransformer(TransformerMixin, BaseEstimator):
    """Generate univariate B-spline bases for features.

    Generate a new feature matrix consisting of
    `n_splines=n_knots + degree - 1` (`n_knots - 1` for
    `extrapolation="periodic"`) spline basis functions
    (B-splines) of polynomial order=`degree` for each feature.

    Read more in the :ref:`User Guide <spline_transformer>`.

    .. versionadded:: 1.0

    Parameters
    ----------
    n_knots : int, default=5
        Number of knots of the splines if `knots` equals one of
        {'uniform', 'quantile'}. Must be >= 2. Ignored if `knots`
        is array-like.

    degree : int, default=3
        The polynomial degree of the spline basis. Must be a non-negative
        integer.

    knots : {'uniform', 'quantile'} or array-like of shape \
        (n_knots, n_features), default='uniform'
        Set knot positions such that first knot <= features <= last knot.

        - If 'uniform', `n_knots` number of knots are distributed uniformly
          from min to max values of the features.
        - If 'quantile', they are distributed uniformly along the quantiles of
          the features.
        - If an array-like is given, it directly specifies the sorted knot
          positions including the boundary knots. Note that, internally,
          `degree` number of knots are added before the first knot, the same
          after the last knot.

    extrapolation : {'error', 'constant', 'linear', 'continue', 'periodic'}, \
        default='constant'
        If 'error', values outside the min and max values of the training
        features raise a `ValueError`. If 'constant', the value of the
        splines at minimum and maximum value of the features is used as
        constant extrapolation. If 'linear', a linear extrapolation is used.
        If 'continue', the splines are extrapolated as is, i.e. option
        `extrapolate=True` in :class:`scipy.interpolate.BSpline`. If
        'periodic', periodic splines with a periodicity equal to the distance
        between the first and last knot are used. Periodic splines enforce
        equal function values and derivatives at the first and last knot.
        For example, this makes it possible to avoid introducing an arbitrary
        jump between Dec 31st and Jan 1st in spline features derived from a
        naturally periodic "day-of-year" input feature. In this case it is
        recommended to manually set the knot values to control the period.

    include_bias : bool, default=True
        If True (default), then the last spline element inside the data range
        of a feature is dropped. As B-splines sum to one over the spline basis
        functions for each data point, they implicitly include a bias term,
        i.e. a column of ones. It acts as an intercept term in linear models.

    order : {'C', 'F'}, default='C'
        Order of output array. 'F' order is faster to compute, but may slow
        down subsequent estimators.

    Attributes
    ----------
    bsplines_ : list of shape (n_features,)
        List of BSpline objects, one for each feature.

    n_features_in_ : int
        The total number of input features.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_features_out_ : int
        The total number of output features, which is computed as
        `n_features * n_splines`, where `n_splines` is
        the number of basis elements of the B-splines,
        `n_knots + degree - 1` for non-periodic splines and
        `n_knots - 1` for periodic ones.
        If `include_bias=False`, then it is only
        `n_features * (n_splines - 1)`.

    See Also
    --------
    KBinsDiscretizer : Transformer that bins continuous data into intervals.

    PolynomialFeatures : Transformer that generates polynomial and interaction
        features.

    Notes
    -----
    High degrees and a high number of knots can cause overfitting.

    See :ref:`examples/linear_model/plot_polynomial_interpolation.py
    <sphx_glr_auto_examples_linear_model_plot_polynomial_interpolation.py>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import SplineTransformer
    >>> X = np.arange(6).reshape(6, 1)
    >>> spline = SplineTransformer(degree=2, n_knots=3)
    >>> spline.fit_transform(X)
    array([[0.5 , 0.5 , 0.  , 0.  ],
           [0.18, 0.74, 0.08, 0.  ],
           [0.02, 0.66, 0.32, 0.  ],
           [0.  , 0.32, 0.66, 0.02],
           [0.  , 0.08, 0.74, 0.18],
           [0.  , 0.  , 0.5 , 0.5 ]])
    """

    def __init__(
        self,
        n_knots=5,
        degree=3,
        *,
        knots="uniform",
        extrapolation="constant",
        include_bias=True,
        order="C",
    ):
        self.n_knots = n_knots
        self.degree = degree
        self.knots = knots
        self.extrapolation = extrapolation
        self.include_bias = include_bias
        self.order = order

    @staticmethod
    def _get_base_knot_positions(X, n_knots=10, knots="uniform", sample_weight=None):
        """Calculate base knot positions.

        Base knots such that first knot <= feature <= last knot. For the
        B-spline construction with scipy.interpolate.BSpline, 2*degree knots
        beyond the base interval are added.
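
        For 'uniform', the `n_knots` positions are spread evenly between a
        feature's min and max; for 'quantile', they are placed at equidistant
        percentiles, e.g. min, median and max for `n_knots=3`.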

        Returns
        -------
        knots : ndarray of shape (n_knots, n_features), dtype=np.float64
            Knot positions (points) of base interval.
        """
        if knots == "quantile":
            percentiles = 100 * np.linspace(
                start=0, stop=1, num=n_knots, dtype=np.float64
            )

            if sample_weight is None:
                knots = np.percentile(X, percentiles, axis=0)
            else:
                knots = np.array(
                    [
                        _weighted_percentile(X, sample_weight, percentile)
                        for percentile in percentiles
                    ]
                )

        else:
            # knots == 'uniform':
            # Note that the variable `knots` has already been validated and
            # `else` is therefore safe.
            # Disregard observations with zero weight.
            mask = slice(None, None, 1) if sample_weight is None else sample_weight > 0
            x_min = np.amin(X[mask], axis=0)
            x_max = np.amax(X[mask], axis=0)

            knots = linspace(
                start=x_min,
                stop=x_max,
                num=n_knots,
                endpoint=True,
                dtype=np.float64,
            )

        return knots

    @deprecated(
        "get_feature_names is deprecated in 1.0 and will be removed "
        "in 1.2. Please use get_feature_names_out instead."
    )
    def get_feature_names(self, input_features=None):
        """Return feature names for output features.

        Parameters
        ----------
        input_features : list of str of shape (n_features,), default=None
            String names for input features if available. By default,
            "x0", "x1", ... "xn_features" is used.

        Returns
        -------
        output_feature_names : list of str of shape (n_output_features,)
            Transformed feature names.
        """
        # c.shape[1] equals n_splines in all cases; c.shape[0] is larger for
        # periodic splines due to the repeated coefficients added in fit.
        n_splines = self.bsplines_[0].c.shape[1]
        if input_features is None:
            input_features = ["x%d" % i for i in range(self.n_features_in_)]
        feature_names = []
        for i in range(self.n_features_in_):
            for j in range(n_splines - 1 + self.include_bias):
                feature_names.append(f"{input_features[i]}_sp_{j}")
        return feature_names

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then names are generated: `[x0, x1, ..., x(n_features_in_)]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        n_splines = self.bsplines_[0].c.shape[1]
        input_features = _check_feature_names_in(self, input_features)
        feature_names = []
        for i in range(self.n_features_in_):
            for j in range(n_splines - 1 + self.include_bias):
                feature_names.append(f"{input_features[i]}_sp_{j}")
        return np.asarray(feature_names, dtype=object)

    def fit(self, X, y=None, sample_weight=None):
        """Compute knot positions of splines.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data.

        y : None
            Ignored.

        sample_weight : array-like of shape (n_samples,), default=None
            Individual weights for each sample. Used to calculate quantiles if
            `knots="quantile"`. For `knots="uniform"`, zero weighted
            observations are ignored for finding the min and max of `X`.

        Returns
        -------
        self : object
            Fitted transformer.
754 """ 755 X = self._validate_data( 756 X, 757 reset=True, 758 accept_sparse=False, 759 ensure_min_samples=2, 760 ensure_2d=True, 761 ) 762 if sample_weight is not None: 763 sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) 764 765 _, n_features = X.shape 766 767 if not (isinstance(self.degree, numbers.Integral) and self.degree >= 0): 768 raise ValueError( 769 f"degree must be a non-negative integer, got {self.degree}." 770 ) 771 772 if isinstance(self.knots, str) and self.knots in [ 773 "uniform", 774 "quantile", 775 ]: 776 if not (isinstance(self.n_knots, numbers.Integral) and self.n_knots >= 2): 777 raise ValueError( 778 f"n_knots must be a positive integer >= 2, got: {self.n_knots}" 779 ) 780 781 base_knots = self._get_base_knot_positions( 782 X, n_knots=self.n_knots, knots=self.knots, sample_weight=sample_weight 783 ) 784 else: 785 base_knots = check_array(self.knots, dtype=np.float64) 786 if base_knots.shape[0] < 2: 787 raise ValueError("Number of knots, knots.shape[0], must be >= 2.") 788 elif base_knots.shape[1] != n_features: 789 raise ValueError("knots.shape[1] == n_features is violated.") 790 elif not np.all(np.diff(base_knots, axis=0) > 0): 791 raise ValueError("knots must be sorted without duplicates.") 792 793 if self.extrapolation not in ( 794 "error", 795 "constant", 796 "linear", 797 "continue", 798 "periodic", 799 ): 800 raise ValueError( 801 "extrapolation must be one of 'error', " 802 "'constant', 'linear', 'continue' or 'periodic'." 803 ) 804 805 if not isinstance(self.include_bias, (bool, np.bool_)): 806 raise ValueError("include_bias must be bool.") 807 808 # number of knots for base interval 809 n_knots = base_knots.shape[0] 810 811 if self.extrapolation == "periodic" and n_knots <= self.degree: 812 raise ValueError( 813 "Periodic splines require degree < n_knots. Got n_knots=" 814 f"{n_knots} and degree={self.degree}." 815 ) 816 817 # number of splines basis functions 818 if self.extrapolation != "periodic": 819 n_splines = n_knots + self.degree - 1 820 else: 821 # periodic splines have self.degree less degrees of freedom 822 n_splines = n_knots - 1 823 824 degree = self.degree 825 n_out = n_features * n_splines 826 # We have to add degree number of knots below, and degree number knots 827 # above the base knots in order to make the spline basis complete. 828 if self.extrapolation == "periodic": 829 # For periodic splines the spacing of the first / last degree knots 830 # needs to be a continuation of the spacing of the last / first 831 # base knots. 832 period = base_knots[-1] - base_knots[0] 833 knots = np.r_[ 834 base_knots[-(degree + 1) : -1] - period, 835 base_knots, 836 base_knots[1 : (degree + 1)] + period, 837 ] 838 839 else: 840 # Eilers & Marx in "Flexible smoothing with B-splines and 841 # penalties" https://doi.org/10.1214/ss/1038425655 advice 842 # against repeating first and last knot several times, which 843 # would have inferior behaviour at boundaries if combined with 844 # a penalty (hence P-Spline). We follow this advice even if our 845 # splines are unpenalized. Meaning we do not: 846 # knots = np.r_[ 847 # np.tile(base_knots.min(axis=0), reps=[degree, 1]), 848 # base_knots, 849 # np.tile(base_knots.max(axis=0), reps=[degree, 1]) 850 # ] 851 # Instead, we reuse the distance of the 2 fist/last knots. 
            dist_min = base_knots[1] - base_knots[0]
            dist_max = base_knots[-1] - base_knots[-2]

            knots = np.r_[
                linspace(
                    base_knots[0] - degree * dist_min,
                    base_knots[0] - dist_min,
                    num=degree,
                ),
                base_knots,
                linspace(
                    base_knots[-1] + dist_max,
                    base_knots[-1] + degree * dist_max,
                    num=degree,
                ),
            ]

        # With a diagonal coefficient matrix, we get back the spline basis
        # elements, i.e. the design matrix of the spline.
        # Note, BSpline appreciates C-contiguous float64 arrays as c=coef.
        coef = np.eye(n_splines, dtype=np.float64)
        if self.extrapolation == "periodic":
            coef = np.concatenate((coef, coef[:degree, :]))

        extrapolate = self.extrapolation in ["periodic", "continue"]

        bsplines = [
            BSpline.construct_fast(
                knots[:, i], coef, self.degree, extrapolate=extrapolate
            )
            for i in range(n_features)
        ]
        self.bsplines_ = bsplines

        self.n_features_out_ = n_out - n_features * (1 - self.include_bias)
        return self

    def transform(self, X):
        """Transform each feature data to B-splines.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.

        Returns
        -------
        XBS : ndarray of shape (n_samples, n_features * n_splines)
            The matrix of features, where n_splines is the number of basis
            elements of the B-splines, n_knots + degree - 1.
        """
        check_is_fitted(self)

        X = self._validate_data(X, reset=False, accept_sparse=False, ensure_2d=True)

        n_samples, n_features = X.shape
        n_splines = self.bsplines_[0].c.shape[1]
        degree = self.degree

        # Note that scipy BSpline returns float64 arrays and converts input
        # x=X[:, i] to c-contiguous float64.
        n_out = self.n_features_out_ + n_features * (1 - self.include_bias)
        if X.dtype in FLOAT_DTYPES:
            dtype = X.dtype
        else:
            dtype = np.float64
        XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order)

        for i in range(n_features):
            spl = self.bsplines_[i]

            if self.extrapolation in ("continue", "error", "periodic"):

                if self.extrapolation == "periodic":
                    # With periodic extrapolation we map x to the segment
                    # [spl.t[k], spl.t[n]].
                    # This is equivalent to BSpline(..., extrapolate="periodic")
                    # for scipy>=1.0.0.
                    n = spl.t.size - spl.k - 1
                    # Assign to new array to avoid inplace operation
                    x = spl.t[spl.k] + (X[:, i] - spl.t[spl.k]) % (
                        spl.t[n] - spl.t[spl.k]
                    )
                else:
                    x = X[:, i]

                XBS[:, (i * n_splines) : ((i + 1) * n_splines)] = spl(x)

            else:
                xmin = spl.t[degree]
                xmax = spl.t[-degree - 1]
                mask = (xmin <= X[:, i]) & (X[:, i] <= xmax)
                XBS[mask, (i * n_splines) : ((i + 1) * n_splines)] = spl(X[mask, i])

            # Note for extrapolation:
            # 'continue' is already returned as is by scipy BSplines
            if self.extrapolation == "error":
                # BSpline with extrapolate=False does not raise an error, but
                # outputs np.nan.
                if np.any(np.isnan(XBS[:, (i * n_splines) : ((i + 1) * n_splines)])):
                    raise ValueError(
                        "X contains values beyond the limits of the knots."
                    )
            elif self.extrapolation == "constant":
                # Set all values beyond xmin and xmax to the value of the
                # spline basis functions at those two positions.
                # Only the first `degree` and last `degree` spline basis
                # elements have non-zero values at the boundaries.
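                # (Each basis element B_j has local support on
                # (t[j], t[j + degree + 1]); since the base knots are not
                # repeated, only B_0, ..., B_{degree-1} are non-zero at xmin,
                # and symmetrically at xmax.)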

                # spline values at boundaries
                f_min = spl(xmin)
                f_max = spl(xmax)
                mask = X[:, i] < xmin
                if np.any(mask):
                    XBS[mask, (i * n_splines) : (i * n_splines + degree)] = f_min[
                        :degree
                    ]

                mask = X[:, i] > xmax
                if np.any(mask):
                    XBS[
                        mask,
                        ((i + 1) * n_splines - degree) : ((i + 1) * n_splines),
                    ] = f_max[-degree:]

            elif self.extrapolation == "linear":
                # Continue the first `degree` and last `degree` spline bases
                # linearly beyond the boundaries, with slope = derivative at
                # the boundary.
                # Note that all others have derivative = value = 0 at the
                # boundaries.

                # spline values at boundaries
                f_min, f_max = spl(xmin), spl(xmax)
                # spline derivatives = slopes at boundaries
                fp_min, fp_max = spl(xmin, nu=1), spl(xmax, nu=1)
                # Compute the linear continuation.
                if degree <= 1:
                    # For degree=1, the derivative of 2nd spline is not zero at
                    # boundary. For degree=0 it is the same as 'constant'.
                    degree += 1
                for j in range(degree):
                    mask = X[:, i] < xmin
                    if np.any(mask):
                        XBS[mask, i * n_splines + j] = (
                            f_min[j] + (X[mask, i] - xmin) * fp_min[j]
                        )

                    mask = X[:, i] > xmax
                    if np.any(mask):
                        k = n_splines - 1 - j
                        XBS[mask, i * n_splines + k] = (
                            f_max[k] + (X[mask, i] - xmax) * fp_max[k]
                        )

        if self.include_bias:
            return XBS
        else:
            # We throw away one spline basis per feature.
            # We choose the last one.
            indices = [j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0]
            return XBS[:, indices]
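

if __name__ == "__main__":
    # Minimal usage sketch, illustrative only and not part of the library
    # code. Because of the relative imports above, run it as a module, e.g.
    # `python -m sklearn.preprocessing._polynomial`.
    X = np.arange(6, dtype=np.float64).reshape(3, 2)

    # Degrees 2 and 3 only, no bias column:
    # C(5, 3) - 1 - (C(3, 1) - 1) = 7 output features.
    poly = PolynomialFeatures(degree=(2, 3), include_bias=False)
    print(poly.fit_transform(X).shape)  # (3, 7)

    # Periodic cubic splines for a day-of-year style feature: with 5 base
    # knots, n_splines = n_knots - 1 = 4, minus one for include_bias=False.
    day = np.linspace(0.0, 365.0, num=8).reshape(-1, 1)
    spline = SplineTransformer(
        degree=3,
        knots=np.linspace(0.0, 365.0, num=5).reshape(-1, 1),
        extrapolation="periodic",
        include_bias=False,
    )
    print(spline.fit_transform(day).shape)  # (8, 3)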