1import functools
2from typing import List, Any
3
4import numpy as np
5import scipy.sparse as sp
6import pytest
7
8from sklearn.metrics import euclidean_distances
9
10from sklearn.random_projection import johnson_lindenstrauss_min_dim
11from sklearn.random_projection import _gaussian_random_matrix
12from sklearn.random_projection import _sparse_random_matrix
13from sklearn.random_projection import SparseRandomProjection
14from sklearn.random_projection import GaussianRandomProjection
15
16from sklearn.utils._testing import assert_array_equal
17from sklearn.utils._testing import assert_almost_equal
18from sklearn.utils._testing import assert_array_almost_equal
19from sklearn.exceptions import DataDimensionalityWarning
20
21all_sparse_random_matrix: List[Any] = [_sparse_random_matrix]
22all_dense_random_matrix: List[Any] = [_gaussian_random_matrix]
23all_random_matrix = all_sparse_random_matrix + all_dense_random_matrix
24
25all_SparseRandomProjection: List[Any] = [SparseRandomProjection]
26all_DenseRandomProjection: List[Any] = [GaussianRandomProjection]
27all_RandomProjection = set(all_SparseRandomProjection + all_DenseRandomProjection)
28
29
30# Make some random data with uniformly located non zero entries with
31# Gaussian distributed values
32def make_sparse_random_data(n_samples, n_features, n_nonzeros):
33    rng = np.random.RandomState(0)
34    data_coo = sp.coo_matrix(
35        (
36            rng.randn(n_nonzeros),
37            (
38                rng.randint(n_samples, size=n_nonzeros),
39                rng.randint(n_features, size=n_nonzeros),
40            ),
41        ),
42        shape=(n_samples, n_features),
43    )
44    return data_coo.toarray(), data_coo.tocsr()
45
46
47def densify(matrix):
48    if not sp.issparse(matrix):
49        return matrix
50    else:
51        return matrix.toarray()
52
53
54n_samples, n_features = (10, 1000)
55n_nonzeros = int(n_samples * n_features / 100.0)
56data, data_csr = make_sparse_random_data(n_samples, n_features, n_nonzeros)
57
58
59###############################################################################
60# test on JL lemma
61###############################################################################
62
63
64@pytest.mark.parametrize(
65    "n_samples, eps", [(100, 1.1), (100, 0.0), (100, -0.1), (0, 0.5)]
66)
67def test_invalid_jl_domain(n_samples, eps):
68    with pytest.raises(ValueError):
69        johnson_lindenstrauss_min_dim(n_samples, eps=eps)
70
71
72def test_input_size_jl_min_dim():
73    with pytest.raises(ValueError):
74        johnson_lindenstrauss_min_dim(3 * [100], eps=2 * [0.9])
75
76    johnson_lindenstrauss_min_dim(
77        np.random.randint(1, 10, size=(10, 10)), eps=np.full((10, 10), 0.5)
78    )
79
80
81###############################################################################
82# tests random matrix generation
83###############################################################################
84def check_input_size_random_matrix(random_matrix):
85    inputs = [(0, 0), (-1, 1), (1, -1), (1, 0), (-1, 0)]
86    for n_components, n_features in inputs:
87        with pytest.raises(ValueError):
88            random_matrix(n_components, n_features)
89
90
91def check_size_generated(random_matrix):
92    inputs = [(1, 5), (5, 1), (5, 5), (1, 1)]
93    for n_components, n_features in inputs:
94        assert random_matrix(n_components, n_features).shape == (
95            n_components,
96            n_features,
97        )
98
99
100def check_zero_mean_and_unit_norm(random_matrix):
101    # All random matrix should produce a transformation matrix
102    # with zero mean and unit norm for each columns
103
104    A = densify(random_matrix(10000, 1, random_state=0))
105
106    assert_array_almost_equal(0, np.mean(A), 3)
107    assert_array_almost_equal(1.0, np.linalg.norm(A), 1)
108
109
110def check_input_with_sparse_random_matrix(random_matrix):
111    n_components, n_features = 5, 10
112
113    for density in [-1.0, 0.0, 1.1]:
114        with pytest.raises(ValueError):
115            random_matrix(n_components, n_features, density=density)
116
117
118@pytest.mark.parametrize("random_matrix", all_random_matrix)
119def test_basic_property_of_random_matrix(random_matrix):
120    # Check basic properties of random matrix generation
121    check_input_size_random_matrix(random_matrix)
122    check_size_generated(random_matrix)
123    check_zero_mean_and_unit_norm(random_matrix)
124
125
126@pytest.mark.parametrize("random_matrix", all_sparse_random_matrix)
127def test_basic_property_of_sparse_random_matrix(random_matrix):
128    check_input_with_sparse_random_matrix(random_matrix)
129
130    random_matrix_dense = functools.partial(random_matrix, density=1.0)
131
132    check_zero_mean_and_unit_norm(random_matrix_dense)
133
134
135def test_gaussian_random_matrix():
136    # Check some statical properties of Gaussian random matrix
137    # Check that the random matrix follow the proper distribution.
138    # Let's say that each element of a_{ij} of A is taken from
139    #   a_ij ~ N(0.0, 1 / n_components).
140    #
141    n_components = 100
142    n_features = 1000
143    A = _gaussian_random_matrix(n_components, n_features, random_state=0)
144
145    assert_array_almost_equal(0.0, np.mean(A), 2)
146    assert_array_almost_equal(np.var(A, ddof=1), 1 / n_components, 1)
147
148
149def test_sparse_random_matrix():
150    # Check some statical properties of sparse random matrix
151    n_components = 100
152    n_features = 500
153
154    for density in [0.3, 1.0]:
155        s = 1 / density
156
157        A = _sparse_random_matrix(
158            n_components, n_features, density=density, random_state=0
159        )
160        A = densify(A)
161
162        # Check possible values
163        values = np.unique(A)
164        assert np.sqrt(s) / np.sqrt(n_components) in values
165        assert -np.sqrt(s) / np.sqrt(n_components) in values
166
167        if density == 1.0:
168            assert np.size(values) == 2
169        else:
170            assert 0.0 in values
171            assert np.size(values) == 3
172
173        # Check that the random matrix follow the proper distribution.
174        # Let's say that each element of a_{ij} of A is taken from
175        #
176        # - -sqrt(s) / sqrt(n_components)   with probability 1 / 2s
177        # -  0                              with probability 1 - 1 / s
178        # - +sqrt(s) / sqrt(n_components)   with probability 1 / 2s
179        #
180        assert_almost_equal(np.mean(A == 0.0), 1 - 1 / s, decimal=2)
181        assert_almost_equal(
182            np.mean(A == np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2
183        )
184        assert_almost_equal(
185            np.mean(A == -np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2
186        )
187
188        assert_almost_equal(np.var(A == 0.0, ddof=1), (1 - 1 / s) * 1 / s, decimal=2)
189        assert_almost_equal(
190            np.var(A == np.sqrt(s) / np.sqrt(n_components), ddof=1),
191            (1 - 1 / (2 * s)) * 1 / (2 * s),
192            decimal=2,
193        )
194        assert_almost_equal(
195            np.var(A == -np.sqrt(s) / np.sqrt(n_components), ddof=1),
196            (1 - 1 / (2 * s)) * 1 / (2 * s),
197            decimal=2,
198        )
199
200
201###############################################################################
202# tests on random projection transformer
203###############################################################################
204
205
206@pytest.mark.parametrize("density", [1.1, 0, -0.1])
207def test_sparse_random_projection_transformer_invalid_density(density):
208    for RandomProjection in all_SparseRandomProjection:
209        with pytest.raises(ValueError):
210            RandomProjection(density=density).fit(data)
211
212
213@pytest.mark.parametrize("n_components, fit_data", [("auto", [[0, 1, 2]]), (-10, data)])
214def test_random_projection_transformer_invalid_input(n_components, fit_data):
215    for RandomProjection in all_RandomProjection:
216        with pytest.raises(ValueError):
217            RandomProjection(n_components=n_components).fit(fit_data)
218
219
220def test_try_to_transform_before_fit():
221    for RandomProjection in all_RandomProjection:
222        with pytest.raises(ValueError):
223            RandomProjection(n_components="auto").transform(data)
224
225
226def test_too_many_samples_to_find_a_safe_embedding():
227    data, _ = make_sparse_random_data(1000, 100, 1000)
228
229    for RandomProjection in all_RandomProjection:
230        rp = RandomProjection(n_components="auto", eps=0.1)
231        expected_msg = (
232            "eps=0.100000 and n_samples=1000 lead to a target dimension"
233            " of 5920 which is larger than the original space with"
234            " n_features=100"
235        )
236        with pytest.raises(ValueError, match=expected_msg):
237            rp.fit(data)
238
239
240def test_random_projection_embedding_quality():
241    data, _ = make_sparse_random_data(8, 5000, 15000)
242    eps = 0.2
243
244    original_distances = euclidean_distances(data, squared=True)
245    original_distances = original_distances.ravel()
246    non_identical = original_distances != 0.0
247
248    # remove 0 distances to avoid division by 0
249    original_distances = original_distances[non_identical]
250
251    for RandomProjection in all_RandomProjection:
252        rp = RandomProjection(n_components="auto", eps=eps, random_state=0)
253        projected = rp.fit_transform(data)
254
255        projected_distances = euclidean_distances(projected, squared=True)
256        projected_distances = projected_distances.ravel()
257
258        # remove 0 distances to avoid division by 0
259        projected_distances = projected_distances[non_identical]
260
261        distances_ratio = projected_distances / original_distances
262
263        # check that the automatically tuned values for the density respect the
264        # contract for eps: pairwise distances are preserved according to the
265        # Johnson-Lindenstrauss lemma
266        assert distances_ratio.max() < 1 + eps
267        assert 1 - eps < distances_ratio.min()
268
269
270def test_SparseRandomProj_output_representation():
271    for SparseRandomProj in all_SparseRandomProjection:
272        # when using sparse input, the projected data can be forced to be a
273        # dense numpy array
274        rp = SparseRandomProj(n_components=10, dense_output=True, random_state=0)
275        rp.fit(data)
276        assert isinstance(rp.transform(data), np.ndarray)
277
278        sparse_data = sp.csr_matrix(data)
279        assert isinstance(rp.transform(sparse_data), np.ndarray)
280
281        # the output can be left to a sparse matrix instead
282        rp = SparseRandomProj(n_components=10, dense_output=False, random_state=0)
283        rp = rp.fit(data)
284        # output for dense input will stay dense:
285        assert isinstance(rp.transform(data), np.ndarray)
286
287        # output for sparse output will be sparse:
288        assert sp.issparse(rp.transform(sparse_data))
289
290
291def test_correct_RandomProjection_dimensions_embedding():
292    for RandomProjection in all_RandomProjection:
293        rp = RandomProjection(n_components="auto", random_state=0, eps=0.5).fit(data)
294
295        # the number of components is adjusted from the shape of the training
296        # set
297        assert rp.n_components == "auto"
298        assert rp.n_components_ == 110
299
300        if RandomProjection in all_SparseRandomProjection:
301            assert rp.density == "auto"
302            assert_almost_equal(rp.density_, 0.03, 2)
303
304        assert rp.components_.shape == (110, n_features)
305
306        projected_1 = rp.transform(data)
307        assert projected_1.shape == (n_samples, 110)
308
309        # once the RP is 'fitted' the projection is always the same
310        projected_2 = rp.transform(data)
311        assert_array_equal(projected_1, projected_2)
312
313        # fit transform with same random seed will lead to the same results
314        rp2 = RandomProjection(random_state=0, eps=0.5)
315        projected_3 = rp2.fit_transform(data)
316        assert_array_equal(projected_1, projected_3)
317
318        # Try to transform with an input X of size different from fitted.
319        with pytest.raises(ValueError):
320            rp.transform(data[:, 1:5])
321
322        # it is also possible to fix the number of components and the density
323        # level
324        if RandomProjection in all_SparseRandomProjection:
325            rp = RandomProjection(n_components=100, density=0.001, random_state=0)
326            projected = rp.fit_transform(data)
327            assert projected.shape == (n_samples, 100)
328            assert rp.components_.shape == (100, n_features)
329            assert rp.components_.nnz < 115  # close to 1% density
330            assert 85 < rp.components_.nnz  # close to 1% density
331
332
333def test_warning_n_components_greater_than_n_features():
334    n_features = 20
335    data, _ = make_sparse_random_data(5, n_features, int(n_features / 4))
336
337    for RandomProjection in all_RandomProjection:
338        with pytest.warns(DataDimensionalityWarning):
339            RandomProjection(n_components=n_features + 1).fit(data)
340
341
342def test_works_with_sparse_data():
343    n_features = 20
344    data, _ = make_sparse_random_data(5, n_features, int(n_features / 4))
345
346    for RandomProjection in all_RandomProjection:
347        rp_dense = RandomProjection(n_components=3, random_state=1).fit(data)
348        rp_sparse = RandomProjection(n_components=3, random_state=1).fit(
349            sp.csr_matrix(data)
350        )
351        assert_array_almost_equal(
352            densify(rp_dense.components_), densify(rp_sparse.components_)
353        )
354
355
356def test_johnson_lindenstrauss_min_dim():
357    """Test Johnson-Lindenstrauss for small eps.
358
359    Regression test for #17111: before #19374, 32-bit systems would fail.
360    """
361    assert johnson_lindenstrauss_min_dim(100, eps=1e-5) == 368416070986
362