1# -*- coding: utf8 2"""Random Projection transformers. 3 4Random Projections are a simple and computationally efficient way to 5reduce the dimensionality of the data by trading a controlled amount 6of accuracy (as additional variance) for faster processing times and 7smaller model sizes. 8 9The dimensions and distribution of Random Projections matrices are 10controlled so as to preserve the pairwise distances between any two 11samples of the dataset. 12 13The main theoretical result behind the efficiency of random projection is the 14`Johnson-Lindenstrauss lemma (quoting Wikipedia) 15<https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma>`_: 16 17 In mathematics, the Johnson-Lindenstrauss lemma is a result 18 concerning low-distortion embeddings of points from high-dimensional 19 into low-dimensional Euclidean space. The lemma states that a small set 20 of points in a high-dimensional space can be embedded into a space of 21 much lower dimension in such a way that distances between the points are 22 nearly preserved. The map used for the embedding is at least Lipschitz, 23 and can even be taken to be an orthogonal projection. 24 25""" 26# Authors: Olivier Grisel <olivier.grisel@ensta.org>, 27# Arnaud Joly <a.joly@ulg.ac.be> 28# License: BSD 3 clause 29 30import warnings 31from abc import ABCMeta, abstractmethod 32 33import numpy as np 34import scipy.sparse as sp 35 36from .base import BaseEstimator, TransformerMixin 37 38from .utils import check_random_state 39from .utils.extmath import safe_sparse_dot 40from .utils.random import sample_without_replacement 41from .utils.validation import check_is_fitted 42from .exceptions import DataDimensionalityWarning 43 44 45__all__ = [ 46 "SparseRandomProjection", 47 "GaussianRandomProjection", 48 "johnson_lindenstrauss_min_dim", 49] 50 51 52def johnson_lindenstrauss_min_dim(n_samples, *, eps=0.1): 53 """Find a 'safe' number of components to randomly project to. 54 55 The distortion introduced by a random projection `p` only changes the 56 distance between two points by a factor (1 +- eps) in an euclidean space 57 with good probability. The projection `p` is an eps-embedding as defined 58 by: 59 60 (1 - eps) ||u - v||^2 < ||p(u) - p(v)||^2 < (1 + eps) ||u - v||^2 61 62 Where u and v are any rows taken from a dataset of shape (n_samples, 63 n_features), eps is in ]0, 1[ and p is a projection by a random Gaussian 64 N(0, 1) matrix of shape (n_components, n_features) (or a sparse 65 Achlioptas matrix). 66 67 The minimum number of components to guarantee the eps-embedding is 68 given by: 69 70 n_components >= 4 log(n_samples) / (eps^2 / 2 - eps^3 / 3) 71 72 Note that the number of dimensions is independent of the original 73 number of features but instead depends on the size of the dataset: 74 the larger the dataset, the higher is the minimal dimensionality of 75 an eps-embedding. 76 77 Read more in the :ref:`User Guide <johnson_lindenstrauss>`. 78 79 Parameters 80 ---------- 81 n_samples : int or array-like of int 82 Number of samples that should be a integer greater than 0. If an array 83 is given, it will compute a safe number of components array-wise. 84 85 eps : float or ndarray of shape (n_components,), dtype=float, \ 86 default=0.1 87 Maximum distortion rate in the range (0,1 ) as defined by the 88 Johnson-Lindenstrauss lemma. If an array is given, it will compute a 89 safe number of components array-wise. 90 91 Returns 92 ------- 93 n_components : int or ndarray of int 94 The minimal number of components to guarantee with good probability 95 an eps-embedding with n_samples. 96 97 Examples 98 -------- 99 >>> from sklearn.random_projection import johnson_lindenstrauss_min_dim 100 >>> johnson_lindenstrauss_min_dim(1e6, eps=0.5) 101 663 102 103 >>> johnson_lindenstrauss_min_dim(1e6, eps=[0.5, 0.1, 0.01]) 104 array([ 663, 11841, 1112658]) 105 106 >>> johnson_lindenstrauss_min_dim([1e4, 1e5, 1e6], eps=0.1) 107 array([ 7894, 9868, 11841]) 108 109 References 110 ---------- 111 112 .. [1] https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma 113 114 .. [2] Sanjoy Dasgupta and Anupam Gupta, 1999, 115 "An elementary proof of the Johnson-Lindenstrauss Lemma." 116 http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.45.3654 117 118 """ 119 eps = np.asarray(eps) 120 n_samples = np.asarray(n_samples) 121 122 if np.any(eps <= 0.0) or np.any(eps >= 1): 123 raise ValueError("The JL bound is defined for eps in ]0, 1[, got %r" % eps) 124 125 if np.any(n_samples) <= 0: 126 raise ValueError( 127 "The JL bound is defined for n_samples greater than zero, got %r" 128 % n_samples 129 ) 130 131 denominator = (eps ** 2 / 2) - (eps ** 3 / 3) 132 return (4 * np.log(n_samples) / denominator).astype(np.int64) 133 134 135def _check_density(density, n_features): 136 """Factorize density check according to Li et al.""" 137 if density == "auto": 138 density = 1 / np.sqrt(n_features) 139 140 elif density <= 0 or density > 1: 141 raise ValueError("Expected density in range ]0, 1], got: %r" % density) 142 return density 143 144 145def _check_input_size(n_components, n_features): 146 """Factorize argument checking for random matrix generation.""" 147 if n_components <= 0: 148 raise ValueError( 149 "n_components must be strictly positive, got %d" % n_components 150 ) 151 if n_features <= 0: 152 raise ValueError("n_features must be strictly positive, got %d" % n_features) 153 154 155def _gaussian_random_matrix(n_components, n_features, random_state=None): 156 """Generate a dense Gaussian random matrix. 157 158 The components of the random matrix are drawn from 159 160 N(0, 1.0 / n_components). 161 162 Read more in the :ref:`User Guide <gaussian_random_matrix>`. 163 164 Parameters 165 ---------- 166 n_components : int, 167 Dimensionality of the target projection space. 168 169 n_features : int, 170 Dimensionality of the original source space. 171 172 random_state : int, RandomState instance or None, default=None 173 Controls the pseudo random number generator used to generate the matrix 174 at fit time. 175 Pass an int for reproducible output across multiple function calls. 176 See :term:`Glossary <random_state>`. 177 178 Returns 179 ------- 180 components : ndarray of shape (n_components, n_features) 181 The generated Gaussian random matrix. 182 183 See Also 184 -------- 185 GaussianRandomProjection 186 """ 187 _check_input_size(n_components, n_features) 188 rng = check_random_state(random_state) 189 components = rng.normal( 190 loc=0.0, scale=1.0 / np.sqrt(n_components), size=(n_components, n_features) 191 ) 192 return components 193 194 195def _sparse_random_matrix(n_components, n_features, density="auto", random_state=None): 196 """Generalized Achlioptas random sparse matrix for random projection. 197 198 Setting density to 1 / 3 will yield the original matrix by Dimitris 199 Achlioptas while setting a lower value will yield the generalization 200 by Ping Li et al. 201 202 If we note :math:`s = 1 / density`, the components of the random matrix are 203 drawn from: 204 205 - -sqrt(s) / sqrt(n_components) with probability 1 / 2s 206 - 0 with probability 1 - 1 / s 207 - +sqrt(s) / sqrt(n_components) with probability 1 / 2s 208 209 Read more in the :ref:`User Guide <sparse_random_matrix>`. 210 211 Parameters 212 ---------- 213 n_components : int, 214 Dimensionality of the target projection space. 215 216 n_features : int, 217 Dimensionality of the original source space. 218 219 density : float or 'auto', default='auto' 220 Ratio of non-zero component in the random projection matrix in the 221 range `(0, 1]` 222 223 If density = 'auto', the value is set to the minimum density 224 as recommended by Ping Li et al.: 1 / sqrt(n_features). 225 226 Use density = 1 / 3.0 if you want to reproduce the results from 227 Achlioptas, 2001. 228 229 random_state : int, RandomState instance or None, default=None 230 Controls the pseudo random number generator used to generate the matrix 231 at fit time. 232 Pass an int for reproducible output across multiple function calls. 233 See :term:`Glossary <random_state>`. 234 235 Returns 236 ------- 237 components : {ndarray, sparse matrix} of shape (n_components, n_features) 238 The generated Gaussian random matrix. Sparse matrix will be of CSR 239 format. 240 241 See Also 242 -------- 243 SparseRandomProjection 244 245 References 246 ---------- 247 248 .. [1] Ping Li, T. Hastie and K. W. Church, 2006, 249 "Very Sparse Random Projections". 250 https://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf 251 252 .. [2] D. Achlioptas, 2001, "Database-friendly random projections", 253 http://www.cs.ucsc.edu/~optas/papers/jl.pdf 254 255 """ 256 _check_input_size(n_components, n_features) 257 density = _check_density(density, n_features) 258 rng = check_random_state(random_state) 259 260 if density == 1: 261 # skip index generation if totally dense 262 components = rng.binomial(1, 0.5, (n_components, n_features)) * 2 - 1 263 return 1 / np.sqrt(n_components) * components 264 265 else: 266 # Generate location of non zero elements 267 indices = [] 268 offset = 0 269 indptr = [offset] 270 for _ in range(n_components): 271 # find the indices of the non-zero components for row i 272 n_nonzero_i = rng.binomial(n_features, density) 273 indices_i = sample_without_replacement( 274 n_features, n_nonzero_i, random_state=rng 275 ) 276 indices.append(indices_i) 277 offset += n_nonzero_i 278 indptr.append(offset) 279 280 indices = np.concatenate(indices) 281 282 # Among non zero components the probability of the sign is 50%/50% 283 data = rng.binomial(1, 0.5, size=np.size(indices)) * 2 - 1 284 285 # build the CSR structure by concatenating the rows 286 components = sp.csr_matrix( 287 (data, indices, indptr), shape=(n_components, n_features) 288 ) 289 290 return np.sqrt(1 / density) / np.sqrt(n_components) * components 291 292 293class BaseRandomProjection(TransformerMixin, BaseEstimator, metaclass=ABCMeta): 294 """Base class for random projections. 295 296 Warning: This class should not be used directly. 297 Use derived classes instead. 298 """ 299 300 @abstractmethod 301 def __init__( 302 self, n_components="auto", *, eps=0.1, dense_output=False, random_state=None 303 ): 304 self.n_components = n_components 305 self.eps = eps 306 self.dense_output = dense_output 307 self.random_state = random_state 308 309 @abstractmethod 310 def _make_random_matrix(self, n_components, n_features): 311 """Generate the random projection matrix. 312 313 Parameters 314 ---------- 315 n_components : int, 316 Dimensionality of the target projection space. 317 318 n_features : int, 319 Dimensionality of the original source space. 320 321 Returns 322 ------- 323 components : {ndarray, sparse matrix} of shape \ 324 (n_components, n_features) 325 The generated random matrix. Sparse matrix will be of CSR format. 326 327 """ 328 329 def fit(self, X, y=None): 330 """Generate a sparse random projection matrix. 331 332 Parameters 333 ---------- 334 X : {ndarray, sparse matrix} of shape (n_samples, n_features) 335 Training set: only the shape is used to find optimal random 336 matrix dimensions based on the theory referenced in the 337 afore mentioned papers. 338 339 y : Ignored 340 Not used, present here for API consistency by convention. 341 342 Returns 343 ------- 344 self : object 345 BaseRandomProjection class instance. 346 """ 347 X = self._validate_data(X, accept_sparse=["csr", "csc"]) 348 349 n_samples, n_features = X.shape 350 351 if self.n_components == "auto": 352 self.n_components_ = johnson_lindenstrauss_min_dim( 353 n_samples=n_samples, eps=self.eps 354 ) 355 356 if self.n_components_ <= 0: 357 raise ValueError( 358 "eps=%f and n_samples=%d lead to a target dimension of " 359 "%d which is invalid" % (self.eps, n_samples, self.n_components_) 360 ) 361 362 elif self.n_components_ > n_features: 363 raise ValueError( 364 "eps=%f and n_samples=%d lead to a target dimension of " 365 "%d which is larger than the original space with " 366 "n_features=%d" 367 % (self.eps, n_samples, self.n_components_, n_features) 368 ) 369 else: 370 if self.n_components <= 0: 371 raise ValueError( 372 "n_components must be greater than 0, got %s" % self.n_components 373 ) 374 375 elif self.n_components > n_features: 376 warnings.warn( 377 "The number of components is higher than the number of" 378 " features: n_features < n_components (%s < %s)." 379 "The dimensionality of the problem will not be reduced." 380 % (n_features, self.n_components), 381 DataDimensionalityWarning, 382 ) 383 384 self.n_components_ = self.n_components 385 386 # Generate a projection matrix of size [n_components, n_features] 387 self.components_ = self._make_random_matrix(self.n_components_, n_features) 388 389 # Check contract 390 assert self.components_.shape == (self.n_components_, n_features), ( 391 "An error has occurred the self.components_ matrix has " 392 " not the proper shape." 393 ) 394 395 return self 396 397 def transform(self, X): 398 """Project the data by using matrix product with the random matrix. 399 400 Parameters 401 ---------- 402 X : {ndarray, sparse matrix} of shape (n_samples, n_features) 403 The input data to project into a smaller dimensional space. 404 405 Returns 406 ------- 407 X_new : {ndarray, sparse matrix} of shape (n_samples, n_components) 408 Projected array. 409 """ 410 check_is_fitted(self) 411 X = self._validate_data(X, accept_sparse=["csr", "csc"], reset=False) 412 413 if X.shape[1] != self.components_.shape[1]: 414 raise ValueError( 415 "Impossible to perform projection:" 416 "X at fit stage had a different number of features. " 417 "(%s != %s)" % (X.shape[1], self.components_.shape[1]) 418 ) 419 420 X_new = safe_sparse_dot(X, self.components_.T, dense_output=self.dense_output) 421 return X_new 422 423 424class GaussianRandomProjection(BaseRandomProjection): 425 """Reduce dimensionality through Gaussian random projection. 426 427 The components of the random matrix are drawn from N(0, 1 / n_components). 428 429 Read more in the :ref:`User Guide <gaussian_random_matrix>`. 430 431 .. versionadded:: 0.13 432 433 Parameters 434 ---------- 435 n_components : int or 'auto', default='auto' 436 Dimensionality of the target projection space. 437 438 n_components can be automatically adjusted according to the 439 number of samples in the dataset and the bound given by the 440 Johnson-Lindenstrauss lemma. In that case the quality of the 441 embedding is controlled by the ``eps`` parameter. 442 443 It should be noted that Johnson-Lindenstrauss lemma can yield 444 very conservative estimated of the required number of components 445 as it makes no assumption on the structure of the dataset. 446 447 eps : float, default=0.1 448 Parameter to control the quality of the embedding according to 449 the Johnson-Lindenstrauss lemma when `n_components` is set to 450 'auto'. The value should be strictly positive. 451 452 Smaller values lead to better embedding and higher number of 453 dimensions (n_components) in the target projection space. 454 455 random_state : int, RandomState instance or None, default=None 456 Controls the pseudo random number generator used to generate the 457 projection matrix at fit time. 458 Pass an int for reproducible output across multiple function calls. 459 See :term:`Glossary <random_state>`. 460 461 Attributes 462 ---------- 463 n_components_ : int 464 Concrete number of components computed when n_components="auto". 465 466 components_ : ndarray of shape (n_components, n_features) 467 Random matrix used for the projection. 468 469 n_features_in_ : int 470 Number of features seen during :term:`fit`. 471 472 .. versionadded:: 0.24 473 474 feature_names_in_ : ndarray of shape (`n_features_in_`,) 475 Names of features seen during :term:`fit`. Defined only when `X` 476 has feature names that are all strings. 477 478 .. versionadded:: 1.0 479 480 See Also 481 -------- 482 SparseRandomProjection : Reduce dimensionality through sparse 483 random projection. 484 485 Examples 486 -------- 487 >>> import numpy as np 488 >>> from sklearn.random_projection import GaussianRandomProjection 489 >>> rng = np.random.RandomState(42) 490 >>> X = rng.rand(25, 3000) 491 >>> transformer = GaussianRandomProjection(random_state=rng) 492 >>> X_new = transformer.fit_transform(X) 493 >>> X_new.shape 494 (25, 2759) 495 """ 496 497 def __init__(self, n_components="auto", *, eps=0.1, random_state=None): 498 super().__init__( 499 n_components=n_components, 500 eps=eps, 501 dense_output=True, 502 random_state=random_state, 503 ) 504 505 def _make_random_matrix(self, n_components, n_features): 506 """ Generate the random projection matrix. 507 508 Parameters 509 ---------- 510 n_components : int, 511 Dimensionality of the target projection space. 512 513 n_features : int, 514 Dimensionality of the original source space. 515 516 Returns 517 ------- 518 components : {ndarray, sparse matrix} of shape \ 519 (n_components, n_features) 520 The generated random matrix. Sparse matrix will be of CSR format. 521 522 """ 523 random_state = check_random_state(self.random_state) 524 return _gaussian_random_matrix( 525 n_components, n_features, random_state=random_state 526 ) 527 528 529class SparseRandomProjection(BaseRandomProjection): 530 """Reduce dimensionality through sparse random projection. 531 532 Sparse random matrix is an alternative to dense random 533 projection matrix that guarantees similar embedding quality while being 534 much more memory efficient and allowing faster computation of the 535 projected data. 536 537 If we note `s = 1 / density` the components of the random matrix are 538 drawn from: 539 540 - -sqrt(s) / sqrt(n_components) with probability 1 / 2s 541 - 0 with probability 1 - 1 / s 542 - +sqrt(s) / sqrt(n_components) with probability 1 / 2s 543 544 Read more in the :ref:`User Guide <sparse_random_matrix>`. 545 546 .. versionadded:: 0.13 547 548 Parameters 549 ---------- 550 n_components : int or 'auto', default='auto' 551 Dimensionality of the target projection space. 552 553 n_components can be automatically adjusted according to the 554 number of samples in the dataset and the bound given by the 555 Johnson-Lindenstrauss lemma. In that case the quality of the 556 embedding is controlled by the ``eps`` parameter. 557 558 It should be noted that Johnson-Lindenstrauss lemma can yield 559 very conservative estimated of the required number of components 560 as it makes no assumption on the structure of the dataset. 561 562 density : float or 'auto', default='auto' 563 Ratio in the range (0, 1] of non-zero component in the random 564 projection matrix. 565 566 If density = 'auto', the value is set to the minimum density 567 as recommended by Ping Li et al.: 1 / sqrt(n_features). 568 569 Use density = 1 / 3.0 if you want to reproduce the results from 570 Achlioptas, 2001. 571 572 eps : float, default=0.1 573 Parameter to control the quality of the embedding according to 574 the Johnson-Lindenstrauss lemma when n_components is set to 575 'auto'. This value should be strictly positive. 576 577 Smaller values lead to better embedding and higher number of 578 dimensions (n_components) in the target projection space. 579 580 dense_output : bool, default=False 581 If True, ensure that the output of the random projection is a 582 dense numpy array even if the input and random projection matrix 583 are both sparse. In practice, if the number of components is 584 small the number of zero components in the projected data will 585 be very small and it will be more CPU and memory efficient to 586 use a dense representation. 587 588 If False, the projected data uses a sparse representation if 589 the input is sparse. 590 591 random_state : int, RandomState instance or None, default=None 592 Controls the pseudo random number generator used to generate the 593 projection matrix at fit time. 594 Pass an int for reproducible output across multiple function calls. 595 See :term:`Glossary <random_state>`. 596 597 Attributes 598 ---------- 599 n_components_ : int 600 Concrete number of components computed when n_components="auto". 601 602 components_ : sparse matrix of shape (n_components, n_features) 603 Random matrix used for the projection. Sparse matrix will be of CSR 604 format. 605 606 density_ : float in range 0.0 - 1.0 607 Concrete density computed from when density = "auto". 608 609 n_features_in_ : int 610 Number of features seen during :term:`fit`. 611 612 .. versionadded:: 0.24 613 614 feature_names_in_ : ndarray of shape (`n_features_in_`,) 615 Names of features seen during :term:`fit`. Defined only when `X` 616 has feature names that are all strings. 617 618 .. versionadded:: 1.0 619 620 See Also 621 -------- 622 GaussianRandomProjection : Reduce dimensionality through Gaussian 623 random projection. 624 625 References 626 ---------- 627 628 .. [1] Ping Li, T. Hastie and K. W. Church, 2006, 629 "Very Sparse Random Projections". 630 https://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf 631 632 .. [2] D. Achlioptas, 2001, "Database-friendly random projections", 633 https://users.soe.ucsc.edu/~optas/papers/jl.pdf 634 635 Examples 636 -------- 637 >>> import numpy as np 638 >>> from sklearn.random_projection import SparseRandomProjection 639 >>> rng = np.random.RandomState(42) 640 >>> X = rng.rand(25, 3000) 641 >>> transformer = SparseRandomProjection(random_state=rng) 642 >>> X_new = transformer.fit_transform(X) 643 >>> X_new.shape 644 (25, 2759) 645 >>> # very few components are non-zero 646 >>> np.mean(transformer.components_ != 0) 647 0.0182... 648 """ 649 650 def __init__( 651 self, 652 n_components="auto", 653 *, 654 density="auto", 655 eps=0.1, 656 dense_output=False, 657 random_state=None, 658 ): 659 super().__init__( 660 n_components=n_components, 661 eps=eps, 662 dense_output=dense_output, 663 random_state=random_state, 664 ) 665 666 self.density = density 667 668 def _make_random_matrix(self, n_components, n_features): 669 """ Generate the random projection matrix 670 671 Parameters 672 ---------- 673 n_components : int 674 Dimensionality of the target projection space. 675 676 n_features : int 677 Dimensionality of the original source space. 678 679 Returns 680 ------- 681 components : {ndarray, sparse matrix} of shape \ 682 (n_components, n_features) 683 The generated random matrix. Sparse matrix will be of CSR format. 684 685 """ 686 random_state = check_random_state(self.random_state) 687 self.density_ = _check_density(self.density, n_features) 688 return _sparse_random_matrix( 689 n_components, n_features, density=self.density_, random_state=random_state 690 ) 691