// Copyright (C) 2010  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.

#include "tester.h"
#include <dlib/manifold_regularization.h>
#include <dlib/svm.h>
#include <dlib/rand.h>
#include <dlib/string.h>
#include <dlib/graph_utils_threaded.h>
#include <vector>
#include <sstream>
#include <ctime>

namespace
{
    using namespace test;
    using namespace dlib;
    using namespace std;
    dlib::logger dlog("test.linear_manifold_regularizer");

    template <typename hash_type, typename samples_type>
    void test_find_k_nearest_neighbors_lsh(
        const samples_type& samples
    )
    {
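        // Find the 2 nearest neighbors of each sample twice, once with the exact brute
        // force search and once with the LSH based approximation, then check that both
        // methods produce exactly the same edges and distances.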
        std::vector<sample_pair> edges1, edges2;

        find_k_nearest_neighbors(samples, cosine_distance(), 2, edges1);
        find_k_nearest_neighbors_lsh(samples, cosine_distance(), hash_type(), 2, 6, edges2, 2);

        std::sort(edges1.begin(), edges1.end(), order_by_index<sample_pair>);
        std::sort(edges2.begin(), edges2.end(), order_by_index<sample_pair>);

        DLIB_TEST_MSG(edges1.size() == edges2.size(), edges1.size() << "    " << edges2.size());
        for (unsigned long i = 0; i < edges1.size(); ++i)
        {
            DLIB_TEST(edges1[i] == edges2[i]);
            DLIB_TEST_MSG(std::abs(edges1[i].distance() - edges2[i].distance()) < 1e-7,
                edges1[i].distance() - edges2[i].distance());
        }
    }

    template <typename scalar_type>
    void test_knn_lsh_sparse()
    {
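        // Make 20 random sparse vectors, each with two non-zero Gaussian distributed
        // components, and run the LSH nearest neighbor test with every supported hash size.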
        dlib::rand rnd;
        std::vector<std::map<unsigned long,scalar_type> > samples;
        samples.resize(20);
        for (unsigned int i = 0; i < samples.size(); ++i)
        {
            samples[i][0] = rnd.get_random_gaussian();
            samples[i][2] = rnd.get_random_gaussian();
        }

        test_find_k_nearest_neighbors_lsh<hash_similar_angles_64>(samples);
        test_find_k_nearest_neighbors_lsh<hash_similar_angles_128>(samples);
        test_find_k_nearest_neighbors_lsh<hash_similar_angles_256>(samples);
        test_find_k_nearest_neighbors_lsh<hash_similar_angles_512>(samples);
    }

    template <typename scalar_type>
    void test_knn_lsh_dense()
    {
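        // Same as test_knn_lsh_sparse() above but using dense 2-dimensional column vectors.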
        dlib::rand rnd;
        std::vector<matrix<scalar_type,0,1> > samples;
        samples.resize(20);
        for (unsigned int i = 0; i < samples.size(); ++i)
        {
            samples[i].set_size(2);
            samples[i](0) = rnd.get_random_gaussian();
            samples[i](1) = rnd.get_random_gaussian();
        }

        test_find_k_nearest_neighbors_lsh<hash_similar_angles_64>(samples);
        test_find_k_nearest_neighbors_lsh<hash_similar_angles_128>(samples);
        test_find_k_nearest_neighbors_lsh<hash_similar_angles_256>(samples);
        test_find_k_nearest_neighbors_lsh<hash_similar_angles_512>(samples);
    }



    class linear_manifold_regularizer_tester : public tester
    {
        /*!
            WHAT THIS OBJECT REPRESENTS
                This object represents a unit test.  When it is constructed
                it adds itself into the testing framework.
        !*/
    public:
        linear_manifold_regularizer_tester (
        ) :
            tester (
                "test_linear_manifold_regularizer",       // the command line argument name for this test
                "Run tests on the linear_manifold_regularizer object.", // the command line argument description
                0                     // the number of command line arguments for this test
            )
        {
            seed = 1;
        }

        dlib::rand rnd;

        unsigned long seed;

        typedef matrix<double, 0, 1> sample_type;
        typedef radial_basis_kernel<sample_type> kernel_type;

        void do_the_test()
        {
            print_spinner();
            std::vector<sample_type> samples;

            // Declare an instance of the kernel we will be using.
            const kernel_type kern(0.1);

            const unsigned long num_points = 200;

            // create a large dataset with two concentric circles.
            generate_circle(samples, 1, num_points);  // circle of radius 1
            generate_circle(samples, 5, num_points);  // circle of radius 5

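            // Connect nearby samples with edges.  find_percent_shortest_edges_randomly()
            // randomly samples 10000 candidate pairs and keeps the closest ones, and the
            // squared_euclidean_distance(0.1, 4) object rejects any pair whose squared
            // distance falls outside [0.1, 4], so only reasonably close samples get linked.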
            std::vector<sample_pair> edges;
            find_percent_shortest_edges_randomly(samples, squared_euclidean_distance(0.1, 4), 1, 10000, "random seed", edges);

            dlog << LTRACE << "number of edges generated: " << edges.size();

            empirical_kernel_map<kernel_type> ekm;

            ekm.load(kern, randomly_subsample(samples, 100));

            // Project all the samples into the span of our 100 basis samples
            for (unsigned long i = 0; i < samples.size(); ++i)
                samples[i] = ekm.project(samples[i]);


            // Now create the manifold regularizer.  The result is a transformation matrix
            // that embodies the manifold assumption, i.e. that samples connected by an edge
            // in the graph should be mapped close to each other.
            linear_manifold_regularizer<sample_type> lmr;
            lmr.build(samples, edges, use_gaussian_weights(0.1));
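            // The argument to get_transformation_matrix() is the intrinsic regularization
            // strength, i.e. how strongly samples connected by an edge are pulled together.
            // The exact formula this call should implement is verified further below.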
            matrix<double> T = lmr.get_transformation_matrix(10000);

            print_spinner();

            // Generate the T matrix manually and make sure it matches.  The point of this
            // test is to verify the more complex computation done inside the
            // linear_manifold_regularizer, which uses a tedious block of loops that is much
            // faster for sparse W matrices but isn't very straightforward.
            matrix<double> X(samples[0].size(), samples.size());
            for (unsigned long i = 0; i < samples.size(); ++i)
                set_colm(X,i) = samples[i];

            matrix<double> W(samples.size(), samples.size());
            W = 0;
            for (unsigned long i = 0; i < edges.size(); ++i)
            {
                W(edges[i].index1(), edges[i].index2()) = use_gaussian_weights(0.1)(edges[i]);
                W(edges[i].index2(), edges[i].index1()) = use_gaussian_weights(0.1)(edges[i]);
            }
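            // T should equal inv(chol(I + (lambda/sum_of_edge_weights)*X*L*trans(X))) where
            // L = diagm(sum_rows(W)) - W is the graph Laplacian of the edge weight matrix W
            // and lambda = 10000 is the regularization strength passed in above.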
            matrix<double> L = diagm(sum_rows(W)) - W;
            matrix<double> trueT = inv_lower_triangular(chol(identity_matrix<double>(X.nr()) + (10000.0/sum(lowerm(W)))*X*L*trans(X)));

            dlog << LTRACE << "T error: "<< max(abs(T - trueT));
            DLIB_TEST(max(abs(T - trueT)) < 1e-7);


            print_spinner();
            // Apply the transformation generated by the linear_manifold_regularizer to
            // all our samples.
            for (unsigned long i = 0; i < samples.size(); ++i)
                samples[i] = T*samples[i];


            // For convenience, generate a projection_function and merge the transformation
            // matrix T into it.
            projection_function<kernel_type> proj = ekm.get_projection_function();
            proj.weights = T*proj.weights;


            // Pick 2 different labeled points.  One on the inner circle and another on the outer.
            // For each of these test points we will see if using the single plane that separates
            // them is a good way to separate the concentric circles.  Also do this a bunch
            // of times with different randomly chosen points so we can see how robust the result is.
            for (int itr = 0; itr < 10; ++itr)
            {
                print_spinner();
                std::vector<sample_type> test_points;
                // generate a random point from the radius 1 circle
                generate_circle(test_points, 1, 1);
                // generate a random point from the radius 5 circle
                generate_circle(test_points, 5, 1);

                // project the two test points into kernel space.  Recall that this projection_function
                // has the manifold regularizer incorporated into it.
                const sample_type class1_point = proj(test_points[0]);
                const sample_type class2_point = proj(test_points[1]);

                double num_wrong = 0;

                // Now attempt to classify all the data samples according to which test point
                // they are closest to.  Without manifold regularization this nearest-point rule
                // would misclassify many samples, but with it every sample should be classified
                // correctly.
                for (unsigned long i = 0; i < samples.size(); ++i)
                {
                    double distance_to_class1 = length(samples[i] - class1_point);
                    double distance_to_class2 = length(samples[i] - class2_point);

                    bool predicted_as_class_1 = (distance_to_class1 < distance_to_class2);

                    bool really_is_class_1 = (i < num_points);

                    // now count how many times we make a mistake
                    if (predicted_as_class_1 != really_is_class_1)
                        ++num_wrong;
                }

                DLIB_TEST_MSG(num_wrong == 0, num_wrong);
            }

        }

        void generate_circle (
            std::vector<sample_type>& samples,
            double radius,
            const long num
        )
        {
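            // Sample num points lying exactly on a circle of the given radius by drawing
            // the x coordinate uniformly from [-radius, radius], solving for the matching
            // y value, and flipping the sign of y with probability 0.5.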
            sample_type m(2,1);

            for (long i = 0; i < num; ++i)
            {
                double sign = 1;
                if (rnd.get_random_double() < 0.5)
                    sign = -1;
                m(0) = 2*radius*rnd.get_random_double()-radius;
                m(1) = sign*sqrt(radius*radius - m(0)*m(0));

                samples.push_back(m);
            }
        }


        void test_knn1()
        {
            std::vector<matrix<double,2,1> > samples;

            matrix<double,2,1> test;

            test = 0,0;  samples.push_back(test);
            test = 1,1;  samples.push_back(test);
            test = 1,-1;  samples.push_back(test);
            test = -1,1;  samples.push_back(test);
            test = -1,-1;  samples.push_back(test);

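            // With k = 1 each corner point's single nearest neighbor is the origin, so the
            // 1-nn graph should consist of exactly the 4 edges (0,1), (0,2), (0,3), and (0,4).
            // The later calls check the k = 3 graph and the effect of restricting the allowed
            // range of squared distances.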
            std::vector<sample_pair> edges;
            find_k_nearest_neighbors(samples, squared_euclidean_distance(), 1, edges);
            DLIB_TEST(edges.size() == 4);

            std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);

            DLIB_TEST(edges[0] == sample_pair(0,1,0));
            DLIB_TEST(edges[1] == sample_pair(0,2,0));
            DLIB_TEST(edges[2] == sample_pair(0,3,0));
            DLIB_TEST(edges[3] == sample_pair(0,4,0));

            find_k_nearest_neighbors(samples, squared_euclidean_distance(), 3, edges);
            DLIB_TEST(edges.size() == 8);

            find_k_nearest_neighbors(samples, squared_euclidean_distance(3.9, 4.1), 3, edges);
            DLIB_TEST(edges.size() == 4);

            std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);

            DLIB_TEST(edges[0] == sample_pair(1,2,0));
            DLIB_TEST(edges[1] == sample_pair(1,3,0));
            DLIB_TEST(edges[2] == sample_pair(2,4,0));
            DLIB_TEST(edges[3] == sample_pair(3,4,0));

            find_k_nearest_neighbors(samples, squared_euclidean_distance(30000, 4.1), 3, edges);
            DLIB_TEST(edges.size() == 0);
        }

        void test_knn1_approx()
        {
            std::vector<matrix<double,2,1> > samples;

            matrix<double,2,1> test;

            test = 0,0;  samples.push_back(test);
            test = 1,1;  samples.push_back(test);
            test = 1,-1;  samples.push_back(test);
            test = -1,1;  samples.push_back(test);
            test = -1,-1;  samples.push_back(test);

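            // Same point configuration as test_knn1(), but using the randomized approximate
            // search.  With 10000 sampled pairs over only 5 points the approximation should
            // recover the exact k-nn graph.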
            std::vector<sample_pair> edges;
            find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 1, 10000, seed, edges);
            DLIB_TEST(edges.size() == 4);

            std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);

            DLIB_TEST(edges[0] == sample_pair(0,1,0));
            DLIB_TEST(edges[1] == sample_pair(0,2,0));
            DLIB_TEST(edges[2] == sample_pair(0,3,0));
            DLIB_TEST(edges[3] == sample_pair(0,4,0));

            find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 3, 10000, seed, edges);
            DLIB_TEST(edges.size() == 8);

            find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(3.9, 4.1), 3, 10000, seed, edges);
            DLIB_TEST(edges.size() == 4);

            std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);

            DLIB_TEST(edges[0] == sample_pair(1,2,0));
            DLIB_TEST(edges[1] == sample_pair(1,3,0));
            DLIB_TEST(edges[2] == sample_pair(2,4,0));
            DLIB_TEST(edges[3] == sample_pair(3,4,0));

            find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(30000, 4.1), 3, 10000, seed, edges);
            DLIB_TEST(edges.size() == 0);
        }

        void test_knn2()
        {
            std::vector<matrix<double,2,1> > samples;

            matrix<double,2,1> test;

            test = 1,1;  samples.push_back(test);
            test = 1,-1;  samples.push_back(test);
            test = -1,1;  samples.push_back(test);
            test = -1,-1;  samples.push_back(test);

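            // The samples are the 4 corners of a square.  With k = 2 each corner's two
            // nearest neighbors are the adjacent corners, giving the 4 edges checked below.
            // With k = 200 we should simply get the complete graph on 4 nodes, which has
            // 4*3/2 = 6 edges.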
            std::vector<sample_pair> edges;
            find_k_nearest_neighbors(samples, squared_euclidean_distance(), 2, edges);
            DLIB_TEST(edges.size() == 4);

            std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);

            DLIB_TEST(edges[0] == sample_pair(0,1,0));
            DLIB_TEST(edges[1] == sample_pair(0,2,0));
            DLIB_TEST(edges[2] == sample_pair(1,3,0));
            DLIB_TEST(edges[3] == sample_pair(2,3,0));

            find_k_nearest_neighbors(samples, squared_euclidean_distance(), 200, edges);
            DLIB_TEST(edges.size() == 4*3/2);
        }

        void test_knn2_approx()
        {
            std::vector<matrix<double,2,1> > samples;

            matrix<double,2,1> test;

            test = 1,1;  samples.push_back(test);
            test = 1,-1;  samples.push_back(test);
            test = -1,1;  samples.push_back(test);
            test = -1,-1;  samples.push_back(test);

            std::vector<sample_pair> edges;
            // For this simple graph, and given the large number of pairs we sample, we
            // should obtain the exact knn solution.
            find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 2, 10000, seed,  edges);
            DLIB_TEST(edges.size() == 4);

            std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);

            DLIB_TEST(edges[0] == sample_pair(0,1,0));
            DLIB_TEST(edges[1] == sample_pair(0,2,0));
            DLIB_TEST(edges[2] == sample_pair(1,3,0));
            DLIB_TEST(edges[3] == sample_pair(2,3,0));


            find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 200, 10000, seed,  edges);
            DLIB_TEST(edges.size() == 4*3/2);
        }

        void perform_test (
        )
        {
            for (int i = 0; i < 5; ++i)
            {
                do_the_test();

                ++seed;
                test_knn1_approx();
                test_knn2_approx();
            }
            test_knn1();
            test_knn2();
            test_knn_lsh_sparse<double>();
            test_knn_lsh_sparse<float>();
            test_knn_lsh_dense<double>();
            test_knn_lsh_dense<float>();

        }
    };

    // Create an instance of this object.  Doing this causes this test
    // to be automatically inserted into the testing framework whenever this cpp file
    // is linked into the project.  Note that since we are inside an unnamed-namespace
    // we won't get any linker errors about the symbol a being defined multiple times.
    linear_manifold_regularizer_tester a;

}
