1 // Copyright (C) 2010 Davis E. King (davis@dlib.net) 2 // License: Boost Software License See LICENSE.txt for the full license. 3 4 #include "tester.h" 5 #include <dlib/manifold_regularization.h> 6 #include <dlib/svm.h> 7 #include <dlib/rand.h> 8 #include <dlib/string.h> 9 #include <dlib/graph_utils_threaded.h> 10 #include <vector> 11 #include <sstream> 12 #include <ctime> 13 14 namespace 15 { 16 using namespace test; 17 using namespace dlib; 18 using namespace std; 19 dlib::logger dlog("test.linear_manifold_regularizer"); 20 21 template <typename hash_type, typename samples_type> test_find_k_nearest_neighbors_lsh(const samples_type & samples)22 void test_find_k_nearest_neighbors_lsh( 23 const samples_type& samples 24 ) 25 { 26 std::vector<sample_pair> edges1, edges2; 27 28 find_k_nearest_neighbors(samples, cosine_distance(), 2, edges1); 29 find_k_nearest_neighbors_lsh(samples, cosine_distance(), hash_type(), 2, 6, edges2, 2); 30 31 std::sort(edges1.begin(), edges1.end(), order_by_index<sample_pair>); 32 std::sort(edges2.begin(), edges2.end(), order_by_index<sample_pair>); 33 34 DLIB_TEST_MSG(edges1.size() == edges2.size(), edges1.size() << " " << edges2.size()); 35 for (unsigned long i = 0; i < edges1.size(); ++i) 36 { 37 DLIB_TEST(edges1[i] == edges2[i]); 38 DLIB_TEST_MSG(std::abs(edges1[i].distance() - edges2[i].distance()) < 1e-7, 39 edges1[i].distance() - edges2[i].distance()); 40 } 41 } 42 43 template <typename scalar_type> test_knn_lsh_sparse()44 void test_knn_lsh_sparse() 45 { 46 dlib::rand rnd; 47 std::vector<std::map<unsigned long,scalar_type> > samples; 48 samples.resize(20); 49 for (unsigned int i = 0; i < samples.size(); ++i) 50 { 51 samples[i][0] = rnd.get_random_gaussian(); 52 samples[i][2] = rnd.get_random_gaussian(); 53 } 54 55 test_find_k_nearest_neighbors_lsh<hash_similar_angles_64>(samples); 56 test_find_k_nearest_neighbors_lsh<hash_similar_angles_128>(samples); 57 test_find_k_nearest_neighbors_lsh<hash_similar_angles_256>(samples); 58 test_find_k_nearest_neighbors_lsh<hash_similar_angles_512>(samples); 59 } 60 61 template <typename scalar_type> test_knn_lsh_dense()62 void test_knn_lsh_dense() 63 { 64 dlib::rand rnd; 65 std::vector<matrix<scalar_type,0,1> > samples; 66 samples.resize(20); 67 for (unsigned int i = 0; i < samples.size(); ++i) 68 { 69 samples[i].set_size(2); 70 samples[i](0) = rnd.get_random_gaussian(); 71 samples[i](1) = rnd.get_random_gaussian(); 72 } 73 74 test_find_k_nearest_neighbors_lsh<hash_similar_angles_64>(samples); 75 test_find_k_nearest_neighbors_lsh<hash_similar_angles_128>(samples); 76 test_find_k_nearest_neighbors_lsh<hash_similar_angles_256>(samples); 77 test_find_k_nearest_neighbors_lsh<hash_similar_angles_512>(samples); 78 } 79 80 81 82 class linear_manifold_regularizer_tester : public tester 83 { 84 /*! 85 WHAT THIS OBJECT REPRESENTS 86 This object represents a unit test. When it is constructed 87 it adds itself into the testing framework. 88 !*/ 89 public: linear_manifold_regularizer_tester()90 linear_manifold_regularizer_tester ( 91 ) : 92 tester ( 93 "test_linear_manifold_regularizer", // the command line argument name for this test 94 "Run tests on the linear_manifold_regularizer object.", // the command line argument description 95 0 // the number of command line arguments for this test 96 ) 97 { 98 seed = 1; 99 } 100 101 dlib::rand rnd; 102 103 unsigned long seed; 104 105 typedef matrix<double, 0, 1> sample_type; 106 typedef radial_basis_kernel<sample_type> kernel_type; 107 do_the_test()108 void do_the_test() 109 { 110 print_spinner(); 111 std::vector<sample_type> samples; 112 113 // Declare an instance of the kernel we will be using. 114 const kernel_type kern(0.1); 115 116 const unsigned long num_points = 200; 117 118 // create a large dataset with two concentric circles. 119 generate_circle(samples, 1, num_points); // circle of radius 1 120 generate_circle(samples, 5, num_points); // circle of radius 5 121 122 std::vector<sample_pair> edges; 123 find_percent_shortest_edges_randomly(samples, squared_euclidean_distance(0.1, 4), 1, 10000, "random seed", edges); 124 125 dlog << LTRACE << "number of edges generated: " << edges.size(); 126 127 empirical_kernel_map<kernel_type> ekm; 128 129 ekm.load(kern, randomly_subsample(samples, 100)); 130 131 // Project all the samples into the span of our 50 basis samples 132 for (unsigned long i = 0; i < samples.size(); ++i) 133 samples[i] = ekm.project(samples[i]); 134 135 136 // Now create the manifold regularizer. The result is a transformation matrix that 137 // embodies the manifold assumption discussed above. 138 linear_manifold_regularizer<sample_type> lmr; 139 lmr.build(samples, edges, use_gaussian_weights(0.1)); 140 matrix<double> T = lmr.get_transformation_matrix(10000); 141 142 print_spinner(); 143 144 // generate the T matrix manually and make sure it matches. The point of this test 145 // is to make sure that the more complex version of this that happens inside the linear_manifold_regularizer 146 // is correct. It uses a tedious block of loops to do it in a way that is a lot faster for sparse 147 // W matrices but isn't super straight forward. 148 matrix<double> X(samples[0].size(), samples.size()); 149 for (unsigned long i = 0; i < samples.size(); ++i) 150 set_colm(X,i) = samples[i]; 151 152 matrix<double> W(samples.size(), samples.size()); 153 W = 0; 154 for (unsigned long i = 0; i < edges.size(); ++i) 155 { 156 W(edges[i].index1(), edges[i].index2()) = use_gaussian_weights(0.1)(edges[i]); 157 W(edges[i].index2(), edges[i].index1()) = use_gaussian_weights(0.1)(edges[i]); 158 } 159 matrix<double> L = diagm(sum_rows(W)) - W; 160 matrix<double> trueT = inv_lower_triangular(chol(identity_matrix<double>(X.nr()) + (10000.0/sum(lowerm(W)))*X*L*trans(X))); 161 162 dlog << LTRACE << "T error: "<< max(abs(T - trueT)); 163 DLIB_TEST(max(abs(T - trueT)) < 1e-7); 164 165 166 print_spinner(); 167 // Apply the transformation generated by the linear_manifold_regularizer to 168 // all our samples. 169 for (unsigned long i = 0; i < samples.size(); ++i) 170 samples[i] = T*samples[i]; 171 172 173 // For convenience, generate a projection_function and merge the transformation 174 // matrix T into it. 175 projection_function<kernel_type> proj = ekm.get_projection_function(); 176 proj.weights = T*proj.weights; 177 178 179 // Pick 2 different labeled points. One on the inner circle and another on the outer. 180 // For each of these test points we will see if using the single plane that separates 181 // them is a good way to separate the concentric circles. Also do this a bunch 182 // of times with different randomly chosen points so we can see how robust the result is. 183 for (int itr = 0; itr < 10; ++itr) 184 { 185 print_spinner(); 186 std::vector<sample_type> test_points; 187 // generate a random point from the radius 1 circle 188 generate_circle(test_points, 1, 1); 189 // generate a random point from the radius 5 circle 190 generate_circle(test_points, 5, 1); 191 192 // project the two test points into kernel space. Recall that this projection_function 193 // has the manifold regularizer incorporated into it. 194 const sample_type class1_point = proj(test_points[0]); 195 const sample_type class2_point = proj(test_points[1]); 196 197 double num_wrong = 0; 198 199 // Now attempt to classify all the data samples according to which point 200 // they are closest to. The output of this program shows that without manifold 201 // regularization this test will fail but with it it will perfectly classify 202 // all the points. 203 for (unsigned long i = 0; i < samples.size(); ++i) 204 { 205 double distance_to_class1 = length(samples[i] - class1_point); 206 double distance_to_class2 = length(samples[i] - class2_point); 207 208 bool predicted_as_class_1 = (distance_to_class1 < distance_to_class2); 209 210 bool really_is_class_1 = (i < num_points); 211 212 // now count how many times we make a mistake 213 if (predicted_as_class_1 != really_is_class_1) 214 ++num_wrong; 215 } 216 217 DLIB_TEST_MSG(num_wrong == 0, num_wrong); 218 } 219 220 } 221 generate_circle(std::vector<sample_type> & samples,double radius,const long num)222 void generate_circle ( 223 std::vector<sample_type>& samples, 224 double radius, 225 const long num 226 ) 227 { 228 sample_type m(2,1); 229 230 for (long i = 0; i < num; ++i) 231 { 232 double sign = 1; 233 if (rnd.get_random_double() < 0.5) 234 sign = -1; 235 m(0) = 2*radius*rnd.get_random_double()-radius; 236 m(1) = sign*sqrt(radius*radius - m(0)*m(0)); 237 238 samples.push_back(m); 239 } 240 } 241 242 test_knn1()243 void test_knn1() 244 { 245 std::vector<matrix<double,2,1> > samples; 246 247 matrix<double,2,1> test; 248 249 test = 0,0; samples.push_back(test); 250 test = 1,1; samples.push_back(test); 251 test = 1,-1; samples.push_back(test); 252 test = -1,1; samples.push_back(test); 253 test = -1,-1; samples.push_back(test); 254 255 std::vector<sample_pair> edges; 256 find_k_nearest_neighbors(samples, squared_euclidean_distance(), 1, edges); 257 DLIB_TEST(edges.size() == 4); 258 259 std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>); 260 261 DLIB_TEST(edges[0] == sample_pair(0,1,0)); 262 DLIB_TEST(edges[1] == sample_pair(0,2,0)); 263 DLIB_TEST(edges[2] == sample_pair(0,3,0)); 264 DLIB_TEST(edges[3] == sample_pair(0,4,0)); 265 266 find_k_nearest_neighbors(samples, squared_euclidean_distance(), 3, edges); 267 DLIB_TEST(edges.size() == 8); 268 269 find_k_nearest_neighbors(samples, squared_euclidean_distance(3.9, 4.1), 3, edges); 270 DLIB_TEST(edges.size() == 4); 271 272 std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>); 273 274 DLIB_TEST(edges[0] == sample_pair(1,2,0)); 275 DLIB_TEST(edges[1] == sample_pair(1,3,0)); 276 DLIB_TEST(edges[2] == sample_pair(2,4,0)); 277 DLIB_TEST(edges[3] == sample_pair(3,4,0)); 278 279 find_k_nearest_neighbors(samples, squared_euclidean_distance(30000, 4.1), 3, edges); 280 DLIB_TEST(edges.size() == 0); 281 } 282 test_knn1_approx()283 void test_knn1_approx() 284 { 285 std::vector<matrix<double,2,1> > samples; 286 287 matrix<double,2,1> test; 288 289 test = 0,0; samples.push_back(test); 290 test = 1,1; samples.push_back(test); 291 test = 1,-1; samples.push_back(test); 292 test = -1,1; samples.push_back(test); 293 test = -1,-1; samples.push_back(test); 294 295 std::vector<sample_pair> edges; 296 find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 1, 10000, seed, edges); 297 DLIB_TEST(edges.size() == 4); 298 299 std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>); 300 301 DLIB_TEST(edges[0] == sample_pair(0,1,0)); 302 DLIB_TEST(edges[1] == sample_pair(0,2,0)); 303 DLIB_TEST(edges[2] == sample_pair(0,3,0)); 304 DLIB_TEST(edges[3] == sample_pair(0,4,0)); 305 306 find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 3, 10000, seed, edges); 307 DLIB_TEST(edges.size() == 8); 308 309 find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(3.9, 4.1), 3, 10000, seed, edges); 310 DLIB_TEST(edges.size() == 4); 311 312 std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>); 313 314 DLIB_TEST(edges[0] == sample_pair(1,2,0)); 315 DLIB_TEST(edges[1] == sample_pair(1,3,0)); 316 DLIB_TEST(edges[2] == sample_pair(2,4,0)); 317 DLIB_TEST(edges[3] == sample_pair(3,4,0)); 318 319 find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(30000, 4.1), 3, 10000, seed, edges); 320 DLIB_TEST(edges.size() == 0); 321 } 322 test_knn2()323 void test_knn2() 324 { 325 std::vector<matrix<double,2,1> > samples; 326 327 matrix<double,2,1> test; 328 329 test = 1,1; samples.push_back(test); 330 test = 1,-1; samples.push_back(test); 331 test = -1,1; samples.push_back(test); 332 test = -1,-1; samples.push_back(test); 333 334 std::vector<sample_pair> edges; 335 find_k_nearest_neighbors(samples, squared_euclidean_distance(), 2, edges); 336 DLIB_TEST(edges.size() == 4); 337 338 std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>); 339 340 DLIB_TEST(edges[0] == sample_pair(0,1,0)); 341 DLIB_TEST(edges[1] == sample_pair(0,2,0)); 342 DLIB_TEST(edges[2] == sample_pair(1,3,0)); 343 DLIB_TEST(edges[3] == sample_pair(2,3,0)); 344 345 find_k_nearest_neighbors(samples, squared_euclidean_distance(), 200, edges); 346 DLIB_TEST(edges.size() == 4*3/2); 347 } 348 test_knn2_approx()349 void test_knn2_approx() 350 { 351 std::vector<matrix<double,2,1> > samples; 352 353 matrix<double,2,1> test; 354 355 test = 1,1; samples.push_back(test); 356 test = 1,-1; samples.push_back(test); 357 test = -1,1; samples.push_back(test); 358 test = -1,-1; samples.push_back(test); 359 360 std::vector<sample_pair> edges; 361 // For this simple graph and high number of samples we will do we should obtain the exact 362 // knn solution. 363 find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 2, 10000, seed, edges); 364 DLIB_TEST(edges.size() == 4); 365 366 std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>); 367 368 DLIB_TEST(edges[0] == sample_pair(0,1,0)); 369 DLIB_TEST(edges[1] == sample_pair(0,2,0)); 370 DLIB_TEST(edges[2] == sample_pair(1,3,0)); 371 DLIB_TEST(edges[3] == sample_pair(2,3,0)); 372 373 374 find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 200, 10000, seed, edges); 375 DLIB_TEST(edges.size() == 4*3/2); 376 } 377 perform_test()378 void perform_test ( 379 ) 380 { 381 for (int i = 0; i < 5; ++i) 382 { 383 do_the_test(); 384 385 ++seed; 386 test_knn1_approx(); 387 test_knn2_approx(); 388 } 389 test_knn1(); 390 test_knn2(); 391 test_knn_lsh_sparse<double>(); 392 test_knn_lsh_sparse<float>(); 393 test_knn_lsh_dense<double>(); 394 test_knn_lsh_dense<float>(); 395 396 } 397 }; 398 399 // Create an instance of this object. Doing this causes this test 400 // to be automatically inserted into the testing framework whenever this cpp file 401 // is linked into the project. Note that since we are inside an unnamed-namespace 402 // we won't get any linker errors about the symbol a being defined multiple times. 403 linear_manifold_regularizer_tester a; 404 405 } 406 407 408 409