1import functools 2from typing import List, Any 3 4import numpy as np 5import scipy.sparse as sp 6import pytest 7 8from sklearn.metrics import euclidean_distances 9 10from sklearn.random_projection import johnson_lindenstrauss_min_dim 11from sklearn.random_projection import _gaussian_random_matrix 12from sklearn.random_projection import _sparse_random_matrix 13from sklearn.random_projection import SparseRandomProjection 14from sklearn.random_projection import GaussianRandomProjection 15 16from sklearn.utils._testing import assert_array_equal 17from sklearn.utils._testing import assert_almost_equal 18from sklearn.utils._testing import assert_array_almost_equal 19from sklearn.exceptions import DataDimensionalityWarning 20 21all_sparse_random_matrix: List[Any] = [_sparse_random_matrix] 22all_dense_random_matrix: List[Any] = [_gaussian_random_matrix] 23all_random_matrix = all_sparse_random_matrix + all_dense_random_matrix 24 25all_SparseRandomProjection: List[Any] = [SparseRandomProjection] 26all_DenseRandomProjection: List[Any] = [GaussianRandomProjection] 27all_RandomProjection = set(all_SparseRandomProjection + all_DenseRandomProjection) 28 29 30# Make some random data with uniformly located non zero entries with 31# Gaussian distributed values 32def make_sparse_random_data(n_samples, n_features, n_nonzeros): 33 rng = np.random.RandomState(0) 34 data_coo = sp.coo_matrix( 35 ( 36 rng.randn(n_nonzeros), 37 ( 38 rng.randint(n_samples, size=n_nonzeros), 39 rng.randint(n_features, size=n_nonzeros), 40 ), 41 ), 42 shape=(n_samples, n_features), 43 ) 44 return data_coo.toarray(), data_coo.tocsr() 45 46 47def densify(matrix): 48 if not sp.issparse(matrix): 49 return matrix 50 else: 51 return matrix.toarray() 52 53 54n_samples, n_features = (10, 1000) 55n_nonzeros = int(n_samples * n_features / 100.0) 56data, data_csr = make_sparse_random_data(n_samples, n_features, n_nonzeros) 57 58 59############################################################################### 60# test on JL lemma 61############################################################################### 62 63 64@pytest.mark.parametrize( 65 "n_samples, eps", [(100, 1.1), (100, 0.0), (100, -0.1), (0, 0.5)] 66) 67def test_invalid_jl_domain(n_samples, eps): 68 with pytest.raises(ValueError): 69 johnson_lindenstrauss_min_dim(n_samples, eps=eps) 70 71 72def test_input_size_jl_min_dim(): 73 with pytest.raises(ValueError): 74 johnson_lindenstrauss_min_dim(3 * [100], eps=2 * [0.9]) 75 76 johnson_lindenstrauss_min_dim( 77 np.random.randint(1, 10, size=(10, 10)), eps=np.full((10, 10), 0.5) 78 ) 79 80 81############################################################################### 82# tests random matrix generation 83############################################################################### 84def check_input_size_random_matrix(random_matrix): 85 inputs = [(0, 0), (-1, 1), (1, -1), (1, 0), (-1, 0)] 86 for n_components, n_features in inputs: 87 with pytest.raises(ValueError): 88 random_matrix(n_components, n_features) 89 90 91def check_size_generated(random_matrix): 92 inputs = [(1, 5), (5, 1), (5, 5), (1, 1)] 93 for n_components, n_features in inputs: 94 assert random_matrix(n_components, n_features).shape == ( 95 n_components, 96 n_features, 97 ) 98 99 100def check_zero_mean_and_unit_norm(random_matrix): 101 # All random matrix should produce a transformation matrix 102 # with zero mean and unit norm for each columns 103 104 A = densify(random_matrix(10000, 1, random_state=0)) 105 106 assert_array_almost_equal(0, np.mean(A), 3) 107 assert_array_almost_equal(1.0, np.linalg.norm(A), 1) 108 109 110def check_input_with_sparse_random_matrix(random_matrix): 111 n_components, n_features = 5, 10 112 113 for density in [-1.0, 0.0, 1.1]: 114 with pytest.raises(ValueError): 115 random_matrix(n_components, n_features, density=density) 116 117 118@pytest.mark.parametrize("random_matrix", all_random_matrix) 119def test_basic_property_of_random_matrix(random_matrix): 120 # Check basic properties of random matrix generation 121 check_input_size_random_matrix(random_matrix) 122 check_size_generated(random_matrix) 123 check_zero_mean_and_unit_norm(random_matrix) 124 125 126@pytest.mark.parametrize("random_matrix", all_sparse_random_matrix) 127def test_basic_property_of_sparse_random_matrix(random_matrix): 128 check_input_with_sparse_random_matrix(random_matrix) 129 130 random_matrix_dense = functools.partial(random_matrix, density=1.0) 131 132 check_zero_mean_and_unit_norm(random_matrix_dense) 133 134 135def test_gaussian_random_matrix(): 136 # Check some statical properties of Gaussian random matrix 137 # Check that the random matrix follow the proper distribution. 138 # Let's say that each element of a_{ij} of A is taken from 139 # a_ij ~ N(0.0, 1 / n_components). 140 # 141 n_components = 100 142 n_features = 1000 143 A = _gaussian_random_matrix(n_components, n_features, random_state=0) 144 145 assert_array_almost_equal(0.0, np.mean(A), 2) 146 assert_array_almost_equal(np.var(A, ddof=1), 1 / n_components, 1) 147 148 149def test_sparse_random_matrix(): 150 # Check some statical properties of sparse random matrix 151 n_components = 100 152 n_features = 500 153 154 for density in [0.3, 1.0]: 155 s = 1 / density 156 157 A = _sparse_random_matrix( 158 n_components, n_features, density=density, random_state=0 159 ) 160 A = densify(A) 161 162 # Check possible values 163 values = np.unique(A) 164 assert np.sqrt(s) / np.sqrt(n_components) in values 165 assert -np.sqrt(s) / np.sqrt(n_components) in values 166 167 if density == 1.0: 168 assert np.size(values) == 2 169 else: 170 assert 0.0 in values 171 assert np.size(values) == 3 172 173 # Check that the random matrix follow the proper distribution. 174 # Let's say that each element of a_{ij} of A is taken from 175 # 176 # - -sqrt(s) / sqrt(n_components) with probability 1 / 2s 177 # - 0 with probability 1 - 1 / s 178 # - +sqrt(s) / sqrt(n_components) with probability 1 / 2s 179 # 180 assert_almost_equal(np.mean(A == 0.0), 1 - 1 / s, decimal=2) 181 assert_almost_equal( 182 np.mean(A == np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2 183 ) 184 assert_almost_equal( 185 np.mean(A == -np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2 186 ) 187 188 assert_almost_equal(np.var(A == 0.0, ddof=1), (1 - 1 / s) * 1 / s, decimal=2) 189 assert_almost_equal( 190 np.var(A == np.sqrt(s) / np.sqrt(n_components), ddof=1), 191 (1 - 1 / (2 * s)) * 1 / (2 * s), 192 decimal=2, 193 ) 194 assert_almost_equal( 195 np.var(A == -np.sqrt(s) / np.sqrt(n_components), ddof=1), 196 (1 - 1 / (2 * s)) * 1 / (2 * s), 197 decimal=2, 198 ) 199 200 201############################################################################### 202# tests on random projection transformer 203############################################################################### 204 205 206@pytest.mark.parametrize("density", [1.1, 0, -0.1]) 207def test_sparse_random_projection_transformer_invalid_density(density): 208 for RandomProjection in all_SparseRandomProjection: 209 with pytest.raises(ValueError): 210 RandomProjection(density=density).fit(data) 211 212 213@pytest.mark.parametrize("n_components, fit_data", [("auto", [[0, 1, 2]]), (-10, data)]) 214def test_random_projection_transformer_invalid_input(n_components, fit_data): 215 for RandomProjection in all_RandomProjection: 216 with pytest.raises(ValueError): 217 RandomProjection(n_components=n_components).fit(fit_data) 218 219 220def test_try_to_transform_before_fit(): 221 for RandomProjection in all_RandomProjection: 222 with pytest.raises(ValueError): 223 RandomProjection(n_components="auto").transform(data) 224 225 226def test_too_many_samples_to_find_a_safe_embedding(): 227 data, _ = make_sparse_random_data(1000, 100, 1000) 228 229 for RandomProjection in all_RandomProjection: 230 rp = RandomProjection(n_components="auto", eps=0.1) 231 expected_msg = ( 232 "eps=0.100000 and n_samples=1000 lead to a target dimension" 233 " of 5920 which is larger than the original space with" 234 " n_features=100" 235 ) 236 with pytest.raises(ValueError, match=expected_msg): 237 rp.fit(data) 238 239 240def test_random_projection_embedding_quality(): 241 data, _ = make_sparse_random_data(8, 5000, 15000) 242 eps = 0.2 243 244 original_distances = euclidean_distances(data, squared=True) 245 original_distances = original_distances.ravel() 246 non_identical = original_distances != 0.0 247 248 # remove 0 distances to avoid division by 0 249 original_distances = original_distances[non_identical] 250 251 for RandomProjection in all_RandomProjection: 252 rp = RandomProjection(n_components="auto", eps=eps, random_state=0) 253 projected = rp.fit_transform(data) 254 255 projected_distances = euclidean_distances(projected, squared=True) 256 projected_distances = projected_distances.ravel() 257 258 # remove 0 distances to avoid division by 0 259 projected_distances = projected_distances[non_identical] 260 261 distances_ratio = projected_distances / original_distances 262 263 # check that the automatically tuned values for the density respect the 264 # contract for eps: pairwise distances are preserved according to the 265 # Johnson-Lindenstrauss lemma 266 assert distances_ratio.max() < 1 + eps 267 assert 1 - eps < distances_ratio.min() 268 269 270def test_SparseRandomProj_output_representation(): 271 for SparseRandomProj in all_SparseRandomProjection: 272 # when using sparse input, the projected data can be forced to be a 273 # dense numpy array 274 rp = SparseRandomProj(n_components=10, dense_output=True, random_state=0) 275 rp.fit(data) 276 assert isinstance(rp.transform(data), np.ndarray) 277 278 sparse_data = sp.csr_matrix(data) 279 assert isinstance(rp.transform(sparse_data), np.ndarray) 280 281 # the output can be left to a sparse matrix instead 282 rp = SparseRandomProj(n_components=10, dense_output=False, random_state=0) 283 rp = rp.fit(data) 284 # output for dense input will stay dense: 285 assert isinstance(rp.transform(data), np.ndarray) 286 287 # output for sparse output will be sparse: 288 assert sp.issparse(rp.transform(sparse_data)) 289 290 291def test_correct_RandomProjection_dimensions_embedding(): 292 for RandomProjection in all_RandomProjection: 293 rp = RandomProjection(n_components="auto", random_state=0, eps=0.5).fit(data) 294 295 # the number of components is adjusted from the shape of the training 296 # set 297 assert rp.n_components == "auto" 298 assert rp.n_components_ == 110 299 300 if RandomProjection in all_SparseRandomProjection: 301 assert rp.density == "auto" 302 assert_almost_equal(rp.density_, 0.03, 2) 303 304 assert rp.components_.shape == (110, n_features) 305 306 projected_1 = rp.transform(data) 307 assert projected_1.shape == (n_samples, 110) 308 309 # once the RP is 'fitted' the projection is always the same 310 projected_2 = rp.transform(data) 311 assert_array_equal(projected_1, projected_2) 312 313 # fit transform with same random seed will lead to the same results 314 rp2 = RandomProjection(random_state=0, eps=0.5) 315 projected_3 = rp2.fit_transform(data) 316 assert_array_equal(projected_1, projected_3) 317 318 # Try to transform with an input X of size different from fitted. 319 with pytest.raises(ValueError): 320 rp.transform(data[:, 1:5]) 321 322 # it is also possible to fix the number of components and the density 323 # level 324 if RandomProjection in all_SparseRandomProjection: 325 rp = RandomProjection(n_components=100, density=0.001, random_state=0) 326 projected = rp.fit_transform(data) 327 assert projected.shape == (n_samples, 100) 328 assert rp.components_.shape == (100, n_features) 329 assert rp.components_.nnz < 115 # close to 1% density 330 assert 85 < rp.components_.nnz # close to 1% density 331 332 333def test_warning_n_components_greater_than_n_features(): 334 n_features = 20 335 data, _ = make_sparse_random_data(5, n_features, int(n_features / 4)) 336 337 for RandomProjection in all_RandomProjection: 338 with pytest.warns(DataDimensionalityWarning): 339 RandomProjection(n_components=n_features + 1).fit(data) 340 341 342def test_works_with_sparse_data(): 343 n_features = 20 344 data, _ = make_sparse_random_data(5, n_features, int(n_features / 4)) 345 346 for RandomProjection in all_RandomProjection: 347 rp_dense = RandomProjection(n_components=3, random_state=1).fit(data) 348 rp_sparse = RandomProjection(n_components=3, random_state=1).fit( 349 sp.csr_matrix(data) 350 ) 351 assert_array_almost_equal( 352 densify(rp_dense.components_), densify(rp_sparse.components_) 353 ) 354 355 356def test_johnson_lindenstrauss_min_dim(): 357 """Test Johnson-Lindenstrauss for small eps. 358 359 Regression test for #17111: before #19374, 32-bit systems would fail. 360 """ 361 assert johnson_lindenstrauss_min_dim(100, eps=1e-5) == 368416070986 362