1# -*- coding: utf8
2"""Random Projection transformers.
3
4Random Projections are a simple and computationally efficient way to
5reduce the dimensionality of the data by trading a controlled amount
6of accuracy (as additional variance) for faster processing times and
7smaller model sizes.
8
9The dimensions and distribution of Random Projections matrices are
10controlled so as to preserve the pairwise distances between any two
11samples of the dataset.
12
13The main theoretical result behind the efficiency of random projection is the
14`Johnson-Lindenstrauss lemma (quoting Wikipedia)
15<https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma>`_:
16
17  In mathematics, the Johnson-Lindenstrauss lemma is a result
18  concerning low-distortion embeddings of points from high-dimensional
19  into low-dimensional Euclidean space. The lemma states that a small set
20  of points in a high-dimensional space can be embedded into a space of
21  much lower dimension in such a way that distances between the points are
22  nearly preserved. The map used for the embedding is at least Lipschitz,
23  and can even be taken to be an orthogonal projection.
24
25"""
26# Authors: Olivier Grisel <olivier.grisel@ensta.org>,
27#          Arnaud Joly <a.joly@ulg.ac.be>
28# License: BSD 3 clause
29
30import warnings
31from abc import ABCMeta, abstractmethod
32
33import numpy as np
34import scipy.sparse as sp
35
36from .base import BaseEstimator, TransformerMixin
37
38from .utils import check_random_state
39from .utils.extmath import safe_sparse_dot
40from .utils.random import sample_without_replacement
41from .utils.validation import check_is_fitted
42from .exceptions import DataDimensionalityWarning
43
44
45__all__ = [
46    "SparseRandomProjection",
47    "GaussianRandomProjection",
48    "johnson_lindenstrauss_min_dim",
49]
50
51
52def johnson_lindenstrauss_min_dim(n_samples, *, eps=0.1):
53    """Find a 'safe' number of components to randomly project to.
54
55    The distortion introduced by a random projection `p` only changes the
56    distance between two points by a factor (1 +- eps) in an euclidean space
57    with good probability. The projection `p` is an eps-embedding as defined
58    by:
59
60      (1 - eps) ||u - v||^2 < ||p(u) - p(v)||^2 < (1 + eps) ||u - v||^2
61
62    Where u and v are any rows taken from a dataset of shape (n_samples,
63    n_features), eps is in ]0, 1[ and p is a projection by a random Gaussian
64    N(0, 1) matrix of shape (n_components, n_features) (or a sparse
65    Achlioptas matrix).
66
67    The minimum number of components to guarantee the eps-embedding is
68    given by:
69
70      n_components >= 4 log(n_samples) / (eps^2 / 2 - eps^3 / 3)
71
72    Note that the number of dimensions is independent of the original
73    number of features but instead depends on the size of the dataset:
74    the larger the dataset, the higher is the minimal dimensionality of
75    an eps-embedding.
76
77    Read more in the :ref:`User Guide <johnson_lindenstrauss>`.
78
79    Parameters
80    ----------
81    n_samples : int or array-like of int
82        Number of samples that should be a integer greater than 0. If an array
83        is given, it will compute a safe number of components array-wise.
84
85    eps : float or ndarray of shape (n_components,), dtype=float, \
86            default=0.1
87        Maximum distortion rate in the range (0,1 ) as defined by the
88        Johnson-Lindenstrauss lemma. If an array is given, it will compute a
89        safe number of components array-wise.
90
91    Returns
92    -------
93    n_components : int or ndarray of int
94        The minimal number of components to guarantee with good probability
95        an eps-embedding with n_samples.
96
97    Examples
98    --------
99    >>> from sklearn.random_projection import johnson_lindenstrauss_min_dim
100    >>> johnson_lindenstrauss_min_dim(1e6, eps=0.5)
101    663
102
103    >>> johnson_lindenstrauss_min_dim(1e6, eps=[0.5, 0.1, 0.01])
104    array([    663,   11841, 1112658])
105
106    >>> johnson_lindenstrauss_min_dim([1e4, 1e5, 1e6], eps=0.1)
107    array([ 7894,  9868, 11841])
108
109    References
110    ----------
111
112    .. [1] https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma
113
114    .. [2] Sanjoy Dasgupta and Anupam Gupta, 1999,
115           "An elementary proof of the Johnson-Lindenstrauss Lemma."
116           http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.45.3654
117
118    """
119    eps = np.asarray(eps)
120    n_samples = np.asarray(n_samples)
121
122    if np.any(eps <= 0.0) or np.any(eps >= 1):
123        raise ValueError("The JL bound is defined for eps in ]0, 1[, got %r" % eps)
124
125    if np.any(n_samples) <= 0:
126        raise ValueError(
127            "The JL bound is defined for n_samples greater than zero, got %r"
128            % n_samples
129        )
130
131    denominator = (eps ** 2 / 2) - (eps ** 3 / 3)
132    return (4 * np.log(n_samples) / denominator).astype(np.int64)
133
134
135def _check_density(density, n_features):
136    """Factorize density check according to Li et al."""
137    if density == "auto":
138        density = 1 / np.sqrt(n_features)
139
140    elif density <= 0 or density > 1:
141        raise ValueError("Expected density in range ]0, 1], got: %r" % density)
142    return density
143
144
145def _check_input_size(n_components, n_features):
146    """Factorize argument checking for random matrix generation."""
147    if n_components <= 0:
148        raise ValueError(
149            "n_components must be strictly positive, got %d" % n_components
150        )
151    if n_features <= 0:
152        raise ValueError("n_features must be strictly positive, got %d" % n_features)
153
154
155def _gaussian_random_matrix(n_components, n_features, random_state=None):
156    """Generate a dense Gaussian random matrix.
157
158    The components of the random matrix are drawn from
159
160        N(0, 1.0 / n_components).
161
162    Read more in the :ref:`User Guide <gaussian_random_matrix>`.
163
164    Parameters
165    ----------
166    n_components : int,
167        Dimensionality of the target projection space.
168
169    n_features : int,
170        Dimensionality of the original source space.
171
172    random_state : int, RandomState instance or None, default=None
173        Controls the pseudo random number generator used to generate the matrix
174        at fit time.
175        Pass an int for reproducible output across multiple function calls.
176        See :term:`Glossary <random_state>`.
177
178    Returns
179    -------
180    components : ndarray of shape (n_components, n_features)
181        The generated Gaussian random matrix.
182
183    See Also
184    --------
185    GaussianRandomProjection
186    """
187    _check_input_size(n_components, n_features)
188    rng = check_random_state(random_state)
189    components = rng.normal(
190        loc=0.0, scale=1.0 / np.sqrt(n_components), size=(n_components, n_features)
191    )
192    return components
193
194
195def _sparse_random_matrix(n_components, n_features, density="auto", random_state=None):
196    """Generalized Achlioptas random sparse matrix for random projection.
197
198    Setting density to 1 / 3 will yield the original matrix by Dimitris
199    Achlioptas while setting a lower value will yield the generalization
200    by Ping Li et al.
201
202    If we note :math:`s = 1 / density`, the components of the random matrix are
203    drawn from:
204
205      - -sqrt(s) / sqrt(n_components)   with probability 1 / 2s
206      -  0                              with probability 1 - 1 / s
207      - +sqrt(s) / sqrt(n_components)   with probability 1 / 2s
208
209    Read more in the :ref:`User Guide <sparse_random_matrix>`.
210
211    Parameters
212    ----------
213    n_components : int,
214        Dimensionality of the target projection space.
215
216    n_features : int,
217        Dimensionality of the original source space.
218
219    density : float or 'auto', default='auto'
220        Ratio of non-zero component in the random projection matrix in the
221        range `(0, 1]`
222
223        If density = 'auto', the value is set to the minimum density
224        as recommended by Ping Li et al.: 1 / sqrt(n_features).
225
226        Use density = 1 / 3.0 if you want to reproduce the results from
227        Achlioptas, 2001.
228
229    random_state : int, RandomState instance or None, default=None
230        Controls the pseudo random number generator used to generate the matrix
231        at fit time.
232        Pass an int for reproducible output across multiple function calls.
233        See :term:`Glossary <random_state>`.
234
235    Returns
236    -------
237    components : {ndarray, sparse matrix} of shape (n_components, n_features)
238        The generated Gaussian random matrix. Sparse matrix will be of CSR
239        format.
240
241    See Also
242    --------
243    SparseRandomProjection
244
245    References
246    ----------
247
248    .. [1] Ping Li, T. Hastie and K. W. Church, 2006,
249           "Very Sparse Random Projections".
250           https://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf
251
252    .. [2] D. Achlioptas, 2001, "Database-friendly random projections",
253           http://www.cs.ucsc.edu/~optas/papers/jl.pdf
254
255    """
256    _check_input_size(n_components, n_features)
257    density = _check_density(density, n_features)
258    rng = check_random_state(random_state)
259
260    if density == 1:
261        # skip index generation if totally dense
262        components = rng.binomial(1, 0.5, (n_components, n_features)) * 2 - 1
263        return 1 / np.sqrt(n_components) * components
264
265    else:
266        # Generate location of non zero elements
267        indices = []
268        offset = 0
269        indptr = [offset]
270        for _ in range(n_components):
271            # find the indices of the non-zero components for row i
272            n_nonzero_i = rng.binomial(n_features, density)
273            indices_i = sample_without_replacement(
274                n_features, n_nonzero_i, random_state=rng
275            )
276            indices.append(indices_i)
277            offset += n_nonzero_i
278            indptr.append(offset)
279
280        indices = np.concatenate(indices)
281
282        # Among non zero components the probability of the sign is 50%/50%
283        data = rng.binomial(1, 0.5, size=np.size(indices)) * 2 - 1
284
285        # build the CSR structure by concatenating the rows
286        components = sp.csr_matrix(
287            (data, indices, indptr), shape=(n_components, n_features)
288        )
289
290        return np.sqrt(1 / density) / np.sqrt(n_components) * components
291
292
293class BaseRandomProjection(TransformerMixin, BaseEstimator, metaclass=ABCMeta):
294    """Base class for random projections.
295
296    Warning: This class should not be used directly.
297    Use derived classes instead.
298    """
299
300    @abstractmethod
301    def __init__(
302        self, n_components="auto", *, eps=0.1, dense_output=False, random_state=None
303    ):
304        self.n_components = n_components
305        self.eps = eps
306        self.dense_output = dense_output
307        self.random_state = random_state
308
309    @abstractmethod
310    def _make_random_matrix(self, n_components, n_features):
311        """Generate the random projection matrix.
312
313        Parameters
314        ----------
315        n_components : int,
316            Dimensionality of the target projection space.
317
318        n_features : int,
319            Dimensionality of the original source space.
320
321        Returns
322        -------
323        components : {ndarray, sparse matrix} of shape \
324                (n_components, n_features)
325            The generated random matrix. Sparse matrix will be of CSR format.
326
327        """
328
329    def fit(self, X, y=None):
330        """Generate a sparse random projection matrix.
331
332        Parameters
333        ----------
334        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
335            Training set: only the shape is used to find optimal random
336            matrix dimensions based on the theory referenced in the
337            afore mentioned papers.
338
339        y : Ignored
340            Not used, present here for API consistency by convention.
341
342        Returns
343        -------
344        self : object
345            BaseRandomProjection class instance.
346        """
347        X = self._validate_data(X, accept_sparse=["csr", "csc"])
348
349        n_samples, n_features = X.shape
350
351        if self.n_components == "auto":
352            self.n_components_ = johnson_lindenstrauss_min_dim(
353                n_samples=n_samples, eps=self.eps
354            )
355
356            if self.n_components_ <= 0:
357                raise ValueError(
358                    "eps=%f and n_samples=%d lead to a target dimension of "
359                    "%d which is invalid" % (self.eps, n_samples, self.n_components_)
360                )
361
362            elif self.n_components_ > n_features:
363                raise ValueError(
364                    "eps=%f and n_samples=%d lead to a target dimension of "
365                    "%d which is larger than the original space with "
366                    "n_features=%d"
367                    % (self.eps, n_samples, self.n_components_, n_features)
368                )
369        else:
370            if self.n_components <= 0:
371                raise ValueError(
372                    "n_components must be greater than 0, got %s" % self.n_components
373                )
374
375            elif self.n_components > n_features:
376                warnings.warn(
377                    "The number of components is higher than the number of"
378                    " features: n_features < n_components (%s < %s)."
379                    "The dimensionality of the problem will not be reduced."
380                    % (n_features, self.n_components),
381                    DataDimensionalityWarning,
382                )
383
384            self.n_components_ = self.n_components
385
386        # Generate a projection matrix of size [n_components, n_features]
387        self.components_ = self._make_random_matrix(self.n_components_, n_features)
388
389        # Check contract
390        assert self.components_.shape == (self.n_components_, n_features), (
391            "An error has occurred the self.components_ matrix has "
392            " not the proper shape."
393        )
394
395        return self
396
397    def transform(self, X):
398        """Project the data by using matrix product with the random matrix.
399
400        Parameters
401        ----------
402        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
403            The input data to project into a smaller dimensional space.
404
405        Returns
406        -------
407        X_new : {ndarray, sparse matrix} of shape (n_samples, n_components)
408            Projected array.
409        """
410        check_is_fitted(self)
411        X = self._validate_data(X, accept_sparse=["csr", "csc"], reset=False)
412
413        if X.shape[1] != self.components_.shape[1]:
414            raise ValueError(
415                "Impossible to perform projection:"
416                "X at fit stage had a different number of features. "
417                "(%s != %s)" % (X.shape[1], self.components_.shape[1])
418            )
419
420        X_new = safe_sparse_dot(X, self.components_.T, dense_output=self.dense_output)
421        return X_new
422
423
424class GaussianRandomProjection(BaseRandomProjection):
425    """Reduce dimensionality through Gaussian random projection.
426
427    The components of the random matrix are drawn from N(0, 1 / n_components).
428
429    Read more in the :ref:`User Guide <gaussian_random_matrix>`.
430
431    .. versionadded:: 0.13
432
433    Parameters
434    ----------
435    n_components : int or 'auto', default='auto'
436        Dimensionality of the target projection space.
437
438        n_components can be automatically adjusted according to the
439        number of samples in the dataset and the bound given by the
440        Johnson-Lindenstrauss lemma. In that case the quality of the
441        embedding is controlled by the ``eps`` parameter.
442
443        It should be noted that Johnson-Lindenstrauss lemma can yield
444        very conservative estimated of the required number of components
445        as it makes no assumption on the structure of the dataset.
446
447    eps : float, default=0.1
448        Parameter to control the quality of the embedding according to
449        the Johnson-Lindenstrauss lemma when `n_components` is set to
450        'auto'. The value should be strictly positive.
451
452        Smaller values lead to better embedding and higher number of
453        dimensions (n_components) in the target projection space.
454
455    random_state : int, RandomState instance or None, default=None
456        Controls the pseudo random number generator used to generate the
457        projection matrix at fit time.
458        Pass an int for reproducible output across multiple function calls.
459        See :term:`Glossary <random_state>`.
460
461    Attributes
462    ----------
463    n_components_ : int
464        Concrete number of components computed when n_components="auto".
465
466    components_ : ndarray of shape (n_components, n_features)
467        Random matrix used for the projection.
468
469    n_features_in_ : int
470        Number of features seen during :term:`fit`.
471
472        .. versionadded:: 0.24
473
474    feature_names_in_ : ndarray of shape (`n_features_in_`,)
475        Names of features seen during :term:`fit`. Defined only when `X`
476        has feature names that are all strings.
477
478        .. versionadded:: 1.0
479
480    See Also
481    --------
482    SparseRandomProjection : Reduce dimensionality through sparse
483        random projection.
484
485    Examples
486    --------
487    >>> import numpy as np
488    >>> from sklearn.random_projection import GaussianRandomProjection
489    >>> rng = np.random.RandomState(42)
490    >>> X = rng.rand(25, 3000)
491    >>> transformer = GaussianRandomProjection(random_state=rng)
492    >>> X_new = transformer.fit_transform(X)
493    >>> X_new.shape
494    (25, 2759)
495    """
496
497    def __init__(self, n_components="auto", *, eps=0.1, random_state=None):
498        super().__init__(
499            n_components=n_components,
500            eps=eps,
501            dense_output=True,
502            random_state=random_state,
503        )
504
505    def _make_random_matrix(self, n_components, n_features):
506        """ Generate the random projection matrix.
507
508        Parameters
509        ----------
510        n_components : int,
511            Dimensionality of the target projection space.
512
513        n_features : int,
514            Dimensionality of the original source space.
515
516        Returns
517        -------
518        components : {ndarray, sparse matrix} of shape \
519                (n_components, n_features)
520            The generated random matrix. Sparse matrix will be of CSR format.
521
522        """
523        random_state = check_random_state(self.random_state)
524        return _gaussian_random_matrix(
525            n_components, n_features, random_state=random_state
526        )
527
528
529class SparseRandomProjection(BaseRandomProjection):
530    """Reduce dimensionality through sparse random projection.
531
532    Sparse random matrix is an alternative to dense random
533    projection matrix that guarantees similar embedding quality while being
534    much more memory efficient and allowing faster computation of the
535    projected data.
536
537    If we note `s = 1 / density` the components of the random matrix are
538    drawn from:
539
540      - -sqrt(s) / sqrt(n_components)   with probability 1 / 2s
541      -  0                              with probability 1 - 1 / s
542      - +sqrt(s) / sqrt(n_components)   with probability 1 / 2s
543
544    Read more in the :ref:`User Guide <sparse_random_matrix>`.
545
546    .. versionadded:: 0.13
547
548    Parameters
549    ----------
550    n_components : int or 'auto', default='auto'
551        Dimensionality of the target projection space.
552
553        n_components can be automatically adjusted according to the
554        number of samples in the dataset and the bound given by the
555        Johnson-Lindenstrauss lemma. In that case the quality of the
556        embedding is controlled by the ``eps`` parameter.
557
558        It should be noted that Johnson-Lindenstrauss lemma can yield
559        very conservative estimated of the required number of components
560        as it makes no assumption on the structure of the dataset.
561
562    density : float or 'auto', default='auto'
563        Ratio in the range (0, 1] of non-zero component in the random
564        projection matrix.
565
566        If density = 'auto', the value is set to the minimum density
567        as recommended by Ping Li et al.: 1 / sqrt(n_features).
568
569        Use density = 1 / 3.0 if you want to reproduce the results from
570        Achlioptas, 2001.
571
572    eps : float, default=0.1
573        Parameter to control the quality of the embedding according to
574        the Johnson-Lindenstrauss lemma when n_components is set to
575        'auto'. This value should be strictly positive.
576
577        Smaller values lead to better embedding and higher number of
578        dimensions (n_components) in the target projection space.
579
580    dense_output : bool, default=False
581        If True, ensure that the output of the random projection is a
582        dense numpy array even if the input and random projection matrix
583        are both sparse. In practice, if the number of components is
584        small the number of zero components in the projected data will
585        be very small and it will be more CPU and memory efficient to
586        use a dense representation.
587
588        If False, the projected data uses a sparse representation if
589        the input is sparse.
590
591    random_state : int, RandomState instance or None, default=None
592        Controls the pseudo random number generator used to generate the
593        projection matrix at fit time.
594        Pass an int for reproducible output across multiple function calls.
595        See :term:`Glossary <random_state>`.
596
597    Attributes
598    ----------
599    n_components_ : int
600        Concrete number of components computed when n_components="auto".
601
602    components_ : sparse matrix of shape (n_components, n_features)
603        Random matrix used for the projection. Sparse matrix will be of CSR
604        format.
605
606    density_ : float in range 0.0 - 1.0
607        Concrete density computed from when density = "auto".
608
609    n_features_in_ : int
610        Number of features seen during :term:`fit`.
611
612        .. versionadded:: 0.24
613
614    feature_names_in_ : ndarray of shape (`n_features_in_`,)
615        Names of features seen during :term:`fit`. Defined only when `X`
616        has feature names that are all strings.
617
618        .. versionadded:: 1.0
619
620    See Also
621    --------
622    GaussianRandomProjection : Reduce dimensionality through Gaussian
623        random projection.
624
625    References
626    ----------
627
628    .. [1] Ping Li, T. Hastie and K. W. Church, 2006,
629           "Very Sparse Random Projections".
630           https://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf
631
632    .. [2] D. Achlioptas, 2001, "Database-friendly random projections",
633           https://users.soe.ucsc.edu/~optas/papers/jl.pdf
634
635    Examples
636    --------
637    >>> import numpy as np
638    >>> from sklearn.random_projection import SparseRandomProjection
639    >>> rng = np.random.RandomState(42)
640    >>> X = rng.rand(25, 3000)
641    >>> transformer = SparseRandomProjection(random_state=rng)
642    >>> X_new = transformer.fit_transform(X)
643    >>> X_new.shape
644    (25, 2759)
645    >>> # very few components are non-zero
646    >>> np.mean(transformer.components_ != 0)
647    0.0182...
648    """
649
650    def __init__(
651        self,
652        n_components="auto",
653        *,
654        density="auto",
655        eps=0.1,
656        dense_output=False,
657        random_state=None,
658    ):
659        super().__init__(
660            n_components=n_components,
661            eps=eps,
662            dense_output=dense_output,
663            random_state=random_state,
664        )
665
666        self.density = density
667
668    def _make_random_matrix(self, n_components, n_features):
669        """ Generate the random projection matrix
670
671        Parameters
672        ----------
673        n_components : int
674            Dimensionality of the target projection space.
675
676        n_features : int
677            Dimensionality of the original source space.
678
679        Returns
680        -------
681        components : {ndarray, sparse matrix} of shape \
682                (n_components, n_features)
683            The generated random matrix. Sparse matrix will be of CSR format.
684
685        """
686        random_state = check_random_state(self.random_state)
687        self.density_ = _check_density(self.density, n_features)
688        return _sparse_random_matrix(
689            n_components, n_features, density=self.density_, random_state=random_state
690        )
691