// Copyright (C) 2010  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.

#include "tester.h"
#include <dlib/manifold_regularization.h>
#include <dlib/svm.h>
#include <dlib/rand.h>
#include <dlib/string.h>
#include <dlib/graph_utils_threaded.h>
#include <vector>
#include <sstream>
#include <ctime>

namespace
{
    using namespace test;
    using namespace dlib;
    using namespace std;
    dlib::logger dlog("test.linear_manifold_regularizer");

    template <typename hash_type, typename samples_type>
    void test_find_k_nearest_neighbors_lsh(
        const samples_type& samples
    )
    {
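        // Find the 2 nearest neighbors of each sample twice, once with the exact brute
        // force search and once with the LSH based approximation, then check that both
        // methods produce exactly the same edges and distances.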
        std::vector<sample_pair> edges1, edges2;

        find_k_nearest_neighbors(samples, cosine_distance(), 2, edges1);
        find_k_nearest_neighbors_lsh(samples, cosine_distance(), hash_type(), 2, 6, edges2, 2);

        std::sort(edges1.begin(), edges1.end(), order_by_index<sample_pair>);
        std::sort(edges2.begin(), edges2.end(), order_by_index<sample_pair>);

        DLIB_TEST_MSG(edges1.size() == edges2.size(), edges1.size() << "    " << edges2.size());
        for (unsigned long i = 0; i < edges1.size(); ++i)
        {
            DLIB_TEST(edges1[i] == edges2[i]);
            DLIB_TEST_MSG(std::abs(edges1[i].distance() - edges2[i].distance()) < 1e-7,
                edges1[i].distance() - edges2[i].distance());
        }
    }

    template <typename scalar_type>
    void test_knn_lsh_sparse()
    {
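        // Make 20 random sparse vectors, each with two non-zero Gaussian distributed
        // components, and run the LSH nearest neighbor test with every supported hash size.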
        dlib::rand rnd;
        std::vector<std::map<unsigned long,scalar_type> > samples;
        samples.resize(20);
        for (unsigned int i = 0; i < samples.size(); ++i)
        {
            samples[i][0] = rnd.get_random_gaussian();
            samples[i][2] = rnd.get_random_gaussian();
        }

        test_find_k_nearest_neighbors_lsh<hash_similar_angles_64>(samples);
        test_find_k_nearest_neighbors_lsh<hash_similar_angles_128>(samples);
        test_find_k_nearest_neighbors_lsh<hash_similar_angles_256>(samples);
        test_find_k_nearest_neighbors_lsh<hash_similar_angles_512>(samples);
    }

    template <typename scalar_type>
    void test_knn_lsh_dense()
    {
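        // Same as test_knn_lsh_sparse() above but using dense 2-dimensional column vectors.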
        dlib::rand rnd;
        std::vector<matrix<scalar_type,0,1> > samples;
        samples.resize(20);
        for (unsigned int i = 0; i < samples.size(); ++i)
        {
            samples[i].set_size(2);
            samples[i](0) = rnd.get_random_gaussian();
            samples[i](1) = rnd.get_random_gaussian();
        }

        test_find_k_nearest_neighbors_lsh<hash_similar_angles_64>(samples);
        test_find_k_nearest_neighbors_lsh<hash_similar_angles_128>(samples);
        test_find_k_nearest_neighbors_lsh<hash_similar_angles_256>(samples);
        test_find_k_nearest_neighbors_lsh<hash_similar_angles_512>(samples);
    }



    class linear_manifold_regularizer_tester : public tester
    {
        /*!
            WHAT THIS OBJECT REPRESENTS
                This object represents a unit test.  When it is constructed
                it adds itself into the testing framework.
        !*/
    public:
        linear_manifold_regularizer_tester (
        ) :
            tester (
                "test_linear_manifold_regularizer",       // the command line argument name for this test
                "Run tests on the linear_manifold_regularizer object.", // the command line argument description
                0                     // the number of command line arguments for this test
            )
        {
            seed = 1;
        }

        dlib::rand rnd;

        unsigned long seed;

        typedef matrix<double, 0, 1> sample_type;
        typedef radial_basis_kernel<sample_type> kernel_type;

        void do_the_test()
        {
            print_spinner();
            std::vector<sample_type> samples;

            // Declare an instance of the kernel we will be using.
            const kernel_type kern(0.1);

            const unsigned long num_points = 200;

            // create a large dataset with two concentric circles.
            generate_circle(samples, 1, num_points);  // circle of radius 1
            generate_circle(samples, 5, num_points);  // circle of radius 5

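            // Connect nearby samples with edges.  find_percent_shortest_edges_randomly()
            // randomly samples 10000 candidate pairs and keeps the closest ones, and the
            // squared_euclidean_distance(0.1, 4) object rejects any pair whose squared
            // distance falls outside [0.1, 4], so only reasonably close samples get linked.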
            std::vector<sample_pair> edges;
            find_percent_shortest_edges_randomly(samples, squared_euclidean_distance(0.1, 4), 1, 10000, "random seed", edges);

            dlog << LTRACE << "number of edges generated: " << edges.size();

            empirical_kernel_map<kernel_type> ekm;

            ekm.load(kern, randomly_subsample(samples, 100));

            // Project all the samples into the span of our 100 basis samples
            for (unsigned long i = 0; i < samples.size(); ++i)
                samples[i] = ekm.project(samples[i]);


            // Now create the manifold regularizer.  The result is a transformation matrix
            // that embodies the manifold assumption, i.e. that samples connected by an edge
            // in the graph should be mapped close to each other.
            linear_manifold_regularizer<sample_type> lmr;
            lmr.build(samples, edges, use_gaussian_weights(0.1));
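            // The argument to get_transformation_matrix() is the intrinsic regularization
            // strength, i.e. how strongly samples connected by an edge are pulled together.
            // The exact formula this call should implement is verified further below.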
            matrix<double> T = lmr.get_transformation_matrix(10000);

            print_spinner();

            // Generate the T matrix manually and make sure it matches.  The point of this
            // test is to verify the more complex computation done inside the
            // linear_manifold_regularizer, which uses a tedious block of loops that is much
            // faster for sparse W matrices but isn't very straightforward.
            matrix<double> X(samples[0].size(), samples.size());
            for (unsigned long i = 0; i < samples.size(); ++i)
                set_colm(X,i) = samples[i];

            matrix<double> W(samples.size(), samples.size());
            W = 0;
            for (unsigned long i = 0; i < edges.size(); ++i)
            {
                W(edges[i].index1(), edges[i].index2()) = use_gaussian_weights(0.1)(edges[i]);
                W(edges[i].index2(), edges[i].index1()) = use_gaussian_weights(0.1)(edges[i]);
            }
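            // T should equal inv(chol(I + (lambda/sum_of_edge_weights)*X*L*trans(X))) where
            // L = diagm(sum_rows(W)) - W is the graph Laplacian of the edge weight matrix W
            // and lambda = 10000 is the regularization strength passed in above.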
            matrix<double> L = diagm(sum_rows(W)) - W;
            matrix<double> trueT = inv_lower_triangular(chol(identity_matrix<double>(X.nr()) + (10000.0/sum(lowerm(W)))*X*L*trans(X)));

            dlog << LTRACE << "T error: "<< max(abs(T - trueT));
            DLIB_TEST(max(abs(T - trueT)) < 1e-7);


            print_spinner();
            // Apply the transformation generated by the linear_manifold_regularizer to
            // all our samples.
            for (unsigned long i = 0; i < samples.size(); ++i)
                samples[i] = T*samples[i];


            // For convenience, generate a projection_function and merge the transformation
            // matrix T into it.
            projection_function<kernel_type> proj = ekm.get_projection_function();
            proj.weights = T*proj.weights;


            // Pick 2 different labeled points.  One on the inner circle and another on the outer.
            // For each of these test points we will see if using the single plane that separates
            // them is a good way to separate the concentric circles.  Also do this a bunch
            // of times with different randomly chosen points so we can see how robust the result is.
            for (int itr = 0; itr < 10; ++itr)
            {
                print_spinner();
                std::vector<sample_type> test_points;
                // generate a random point from the radius 1 circle
                generate_circle(test_points, 1, 1);
                // generate a random point from the radius 5 circle
                generate_circle(test_points, 5, 1);

                // project the two test points into kernel space.  Recall that this projection_function
                // has the manifold regularizer incorporated into it.
                const sample_type class1_point = proj(test_points[0]);
                const sample_type class2_point = proj(test_points[1]);

                double num_wrong = 0;

                // Now attempt to classify all the data samples according to which test point
                // they are closest to.  Without manifold regularization this nearest-point rule
                // would misclassify many samples, but with it every sample should be classified
                // correctly.
                for (unsigned long i = 0; i < samples.size(); ++i)
                {
                    double distance_to_class1 = length(samples[i] - class1_point);
                    double distance_to_class2 = length(samples[i] - class2_point);

                    bool predicted_as_class_1 = (distance_to_class1 < distance_to_class2);

                    bool really_is_class_1 = (i < num_points);

                    // now count how many times we make a mistake
                    if (predicted_as_class_1 != really_is_class_1)
                        ++num_wrong;
                }

                DLIB_TEST_MSG(num_wrong == 0, num_wrong);
            }

        }

        void generate_circle (
            std::vector<sample_type>& samples,
            double radius,
            const long num
        )
        {
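            // Sample num points lying exactly on a circle of the given radius by drawing
            // the x coordinate uniformly from [-radius, radius], solving for the matching
            // y value, and flipping the sign of y with probability 0.5.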
            sample_type m(2,1);

            for (long i = 0; i < num; ++i)
            {
                double sign = 1;
                if (rnd.get_random_double() < 0.5)
                    sign = -1;
                m(0) = 2*radius*rnd.get_random_double()-radius;
                m(1) = sign*sqrt(radius*radius - m(0)*m(0));

                samples.push_back(m);
            }
        }


        void test_knn1()
        {
            std::vector<matrix<double,2,1> > samples;

            matrix<double,2,1> test;

            test = 0,0;  samples.push_back(test);
            test = 1,1;  samples.push_back(test);
            test = 1,-1;  samples.push_back(test);
            test = -1,1;  samples.push_back(test);
            test = -1,-1;  samples.push_back(test);

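            // With k = 1 each corner point's single nearest neighbor is the origin, so the
            // 1-nn graph should consist of exactly the 4 edges (0,1), (0,2), (0,3), and (0,4).
            // The later calls check the k = 3 graph and the effect of restricting the allowed
            // range of squared distances.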
            std::vector<sample_pair> edges;
            find_k_nearest_neighbors(samples, squared_euclidean_distance(), 1, edges);
            DLIB_TEST(edges.size() == 4);

            std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);

            DLIB_TEST(edges[0] == sample_pair(0,1,0));
            DLIB_TEST(edges[1] == sample_pair(0,2,0));
            DLIB_TEST(edges[2] == sample_pair(0,3,0));
            DLIB_TEST(edges[3] == sample_pair(0,4,0));

            find_k_nearest_neighbors(samples, squared_euclidean_distance(), 3, edges);
            DLIB_TEST(edges.size() == 8);

            find_k_nearest_neighbors(samples, squared_euclidean_distance(3.9, 4.1), 3, edges);
            DLIB_TEST(edges.size() == 4);

            std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);

            DLIB_TEST(edges[0] == sample_pair(1,2,0));
            DLIB_TEST(edges[1] == sample_pair(1,3,0));
            DLIB_TEST(edges[2] == sample_pair(2,4,0));
            DLIB_TEST(edges[3] == sample_pair(3,4,0));

            find_k_nearest_neighbors(samples, squared_euclidean_distance(30000, 4.1), 3, edges);
            DLIB_TEST(edges.size() == 0);
        }

        void test_knn1_approx()
        {
            std::vector<matrix<double,2,1> > samples;

            matrix<double,2,1> test;

            test = 0,0;  samples.push_back(test);
            test = 1,1;  samples.push_back(test);
            test = 1,-1;  samples.push_back(test);
            test = -1,1;  samples.push_back(test);
            test = -1,-1;  samples.push_back(test);

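            // Same point configuration as test_knn1(), but using the randomized approximate
            // search.  With 10000 sampled pairs over only 5 points the approximation should
            // recover the exact k-nn graph.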
            std::vector<sample_pair> edges;
            find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 1, 10000, seed, edges);
            DLIB_TEST(edges.size() == 4);

            std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);

            DLIB_TEST(edges[0] == sample_pair(0,1,0));
            DLIB_TEST(edges[1] == sample_pair(0,2,0));
            DLIB_TEST(edges[2] == sample_pair(0,3,0));
            DLIB_TEST(edges[3] == sample_pair(0,4,0));

            find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 3, 10000, seed, edges);
            DLIB_TEST(edges.size() == 8);

            find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(3.9, 4.1), 3, 10000, seed, edges);
            DLIB_TEST(edges.size() == 4);

            std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);

            DLIB_TEST(edges[0] == sample_pair(1,2,0));
            DLIB_TEST(edges[1] == sample_pair(1,3,0));
            DLIB_TEST(edges[2] == sample_pair(2,4,0));
            DLIB_TEST(edges[3] == sample_pair(3,4,0));

            find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(30000, 4.1), 3, 10000, seed, edges);
            DLIB_TEST(edges.size() == 0);
        }

        void test_knn2()
        {
            std::vector<matrix<double,2,1> > samples;

            matrix<double,2,1> test;

            test = 1,1;  samples.push_back(test);
            test = 1,-1;  samples.push_back(test);
            test = -1,1;  samples.push_back(test);
            test = -1,-1;  samples.push_back(test);

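            // The samples are the 4 corners of a square.  With k = 2 each corner's two
            // nearest neighbors are the adjacent corners, giving the 4 edges checked below.
            // With k = 200 we should simply get the complete graph on 4 nodes, which has
            // 4*3/2 = 6 edges.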
            std::vector<sample_pair> edges;
            find_k_nearest_neighbors(samples, squared_euclidean_distance(), 2, edges);
            DLIB_TEST(edges.size() == 4);

            std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);

            DLIB_TEST(edges[0] == sample_pair(0,1,0));
            DLIB_TEST(edges[1] == sample_pair(0,2,0));
            DLIB_TEST(edges[2] == sample_pair(1,3,0));
            DLIB_TEST(edges[3] == sample_pair(2,3,0));

            find_k_nearest_neighbors(samples, squared_euclidean_distance(), 200, edges);
            DLIB_TEST(edges.size() == 4*3/2);
        }

        void test_knn2_approx()
        {
            std::vector<matrix<double,2,1> > samples;

            matrix<double,2,1> test;

            test = 1,1;  samples.push_back(test);
            test = 1,-1;  samples.push_back(test);
            test = -1,1;  samples.push_back(test);
            test = -1,-1;  samples.push_back(test);

            std::vector<sample_pair> edges;
            // For this simple graph, and given the large number of pairs we sample, we
            // should obtain the exact knn solution.
            find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 2, 10000, seed,  edges);
            DLIB_TEST(edges.size() == 4);

            std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);

            DLIB_TEST(edges[0] == sample_pair(0,1,0));
            DLIB_TEST(edges[1] == sample_pair(0,2,0));
            DLIB_TEST(edges[2] == sample_pair(1,3,0));
            DLIB_TEST(edges[3] == sample_pair(2,3,0));


            find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 200, 10000, seed,  edges);
            DLIB_TEST(edges.size() == 4*3/2);
        }

        void perform_test (
        )
        {
            for (int i = 0; i < 5; ++i)
            {
                do_the_test();

                ++seed;
                test_knn1_approx();
                test_knn2_approx();
            }
            test_knn1();
            test_knn2();
            test_knn_lsh_sparse<double>();
            test_knn_lsh_sparse<float>();
            test_knn_lsh_dense<double>();
            test_knn_lsh_dense<float>();

        }
    };

    // Create an instance of this object.  Doing this causes this test
    // to be automatically inserted into the testing framework whenever this cpp file
    // is linked into the project.  Note that since we are inside an unnamed-namespace
    // we won't get any linker errors about the symbol a being defined multiple times.
    linear_manifold_regularizer_tester a;

}
