1 // Copyright (C) 2008  Davis E. King (davis@dlib.net)
2 // License: Boost Software License   See LICENSE.txt for the full license.
3 #ifndef DLIB_REDUCEd_TRAINERS_
4 #define DLIB_REDUCEd_TRAINERS_
5 
6 #include "reduced_abstract.h"
7 #include "../matrix.h"
8 #include "../algs.h"
9 #include "function.h"
10 #include "kernel.h"
11 #include "kcentroid.h"
12 #include "linearly_independent_subset_finder.h"
13 #include "../optimization.h"
14 
15 namespace dlib
16 {
17 
18 // ----------------------------------------------------------------------------------------
19 // ----------------------------------------------------------------------------------------
20 // ----------------------------------------------------------------------------------------
21 
22     template <
23         typename trainer_type
24         >
25     class reduced_decision_function_trainer
26     {
27     public:
28         typedef typename trainer_type::kernel_type kernel_type;
29         typedef typename trainer_type::scalar_type scalar_type;
30         typedef typename trainer_type::sample_type sample_type;
31         typedef typename trainer_type::mem_manager_type mem_manager_type;
32         typedef typename trainer_type::trained_function_type trained_function_type;
33 
reduced_decision_function_trainer()34         reduced_decision_function_trainer (
35         ) :num_bv(0) {}
36 
reduced_decision_function_trainer(const trainer_type & trainer_,const unsigned long num_sb_)37         reduced_decision_function_trainer (
38             const trainer_type& trainer_,
39             const unsigned long num_sb_
40         ) :
41             trainer(trainer_),
42             num_bv(num_sb_)
43         {
44             // make sure requires clause is not broken
45             DLIB_ASSERT(num_bv > 0,
46                         "\t reduced_decision_function_trainer()"
47                         << "\n\t you have given invalid arguments to this function"
48                         << "\n\t num_bv: " << num_bv
49             );
50         }
51 
52         template <
53             typename in_sample_vector_type,
54             typename in_scalar_vector_type
55             >
train(const in_sample_vector_type & x,const in_scalar_vector_type & y)56         const decision_function<kernel_type> train (
57             const in_sample_vector_type& x,
58             const in_scalar_vector_type& y
59         ) const
60         {
61             // make sure requires clause is not broken
62             DLIB_ASSERT(num_bv > 0,
63                         "\t reduced_decision_function_trainer::train(x,y)"
64                         << "\n\t You have tried to use an uninitialized version of this object"
65                         << "\n\t num_bv: " << num_bv );
66             return do_train(mat(x), mat(y));
67         }
68 
69     private:
70 
71     // ------------------------------------------------------------------------------------
72 
73         template <
74             typename in_sample_vector_type,
75             typename in_scalar_vector_type
76             >
do_train(const in_sample_vector_type & x,const in_scalar_vector_type & y)77         const decision_function<kernel_type> do_train (
78             const in_sample_vector_type& x,
79             const in_scalar_vector_type& y
80         ) const
81         {
82             // get the decision function object we are going to try and approximate
83             const decision_function<kernel_type>& dec_funct = trainer.train(x,y);
84 
85             // now find a linearly independent subset of the training points of num_bv points.
86             linearly_independent_subset_finder<kernel_type> lisf(dec_funct.kernel_function, num_bv);
87             fill_lisf(lisf, x);
88 
89             // The next few statements just find the best weights with which to approximate
90             // the dec_funct object with the smaller set of vectors in the lisf dictionary.  This
91             // is really just a simple application of some linear algebra.  For the details
92             // see page 554 of Learning with kernels by Scholkopf and Smola where they talk
93             // about "Optimal Expansion Coefficients."
94 
95             const kernel_type kern(dec_funct.kernel_function);
96 
97             matrix<scalar_type,0,1,mem_manager_type> alpha;
98 
99             alpha = lisf.get_inv_kernel_marix()*(kernel_matrix(kern,lisf,dec_funct.basis_vectors)*dec_funct.alpha);
100 
101             decision_function<kernel_type> new_df(alpha,
102                                                   0,
103                                                   kern,
104                                                   lisf.get_dictionary());
105 
106             // now we have to figure out what the new bias should be.  It might be a little
107             // different since we just messed with all the weights and vectors.
108             double bias = 0;
109             for (long i = 0; i < x.nr(); ++i)
110             {
111                 bias += new_df(x(i)) - dec_funct(x(i));
112             }
113 
114             new_df.b = bias/x.nr();
115 
116             return new_df;
117         }
118 
119     // ------------------------------------------------------------------------------------
120 
121         trainer_type trainer;
122         unsigned long num_bv;
123 
124 
125     }; // end of class reduced_decision_function_trainer
126 
127     template <typename trainer_type>
reduced(const trainer_type & trainer,const unsigned long num_bv)128     const reduced_decision_function_trainer<trainer_type> reduced (
129         const trainer_type& trainer,
130         const unsigned long num_bv
131     )
132     {
133         // make sure requires clause is not broken
134         DLIB_ASSERT(num_bv > 0,
135                     "\tconst reduced_decision_function_trainer reduced()"
136                     << "\n\t you have given invalid arguments to this function"
137                     << "\n\t num_bv: " << num_bv
138         );
139 
140         return reduced_decision_function_trainer<trainer_type>(trainer, num_bv);
141     }
142 
143 // ----------------------------------------------------------------------------------------
144 // ----------------------------------------------------------------------------------------
145 // ----------------------------------------------------------------------------------------
146 
147     namespace red_impl
148     {
149 
150     // ------------------------------------------------------------------------------------
151 
152         template <typename kernel_type>
153         class objective
154         {
155             /*
156                 This object represents the objective function we will try to
157                 minimize in approximate_distance_function().
158 
159                 The objective is the distance, in kernel induced feature space, between
160                 the original distance function and the approximated version.
161 
162             */
163             typedef typename kernel_type::scalar_type scalar_type;
164             typedef typename kernel_type::sample_type sample_type;
165             typedef typename kernel_type::mem_manager_type mem_manager_type;
166         public:
objective(const distance_function<kernel_type> & dist_funct_,matrix<scalar_type,0,1,mem_manager_type> & b_,matrix<sample_type,0,1,mem_manager_type> & out_vectors_)167             objective(
168                 const distance_function<kernel_type>& dist_funct_,
169                 matrix<scalar_type,0,1,mem_manager_type>& b_,
170                 matrix<sample_type,0,1,mem_manager_type>& out_vectors_
171             ) :
172                 dist_funct(dist_funct_),
173                 b(b_),
174                 out_vectors(out_vectors_)
175             {
176             }
177 
state_to_vector()178             const matrix<scalar_type, 0, 1, mem_manager_type> state_to_vector (
179             ) const
180             /*!
181                 ensures
182                     - returns a vector that contains all the information necessary to
183                       reproduce the current state of the approximated distance function
184             !*/
185             {
186                 matrix<scalar_type, 0, 1, mem_manager_type> z(b.nr() + out_vectors.size()*out_vectors(0).nr());
187                 long i = 0;
188                 for (long j = 0; j < b.nr(); ++j)
189                 {
190                     z(i) = b(j);
191                     ++i;
192                 }
193 
194                 for (long j = 0; j < out_vectors.size(); ++j)
195                 {
196                     for (long k = 0; k < out_vectors(j).size(); ++k)
197                     {
198                         z(i) = out_vectors(j)(k);
199                         ++i;
200                     }
201                 }
202                 return z;
203             }
204 
205 
vector_to_state(const matrix<scalar_type,0,1,mem_manager_type> & z)206             void vector_to_state (
207                 const matrix<scalar_type, 0, 1, mem_manager_type>& z
208             ) const
209             /*!
210                 requires
211                     - z came from the state_to_vector() function or has a compatible format
212                 ensures
213                     - loads the vector z into the state variables of the approximate
214                       distance function (i.e. b and out_vectors)
215             !*/
216             {
217                 long i = 0;
218                 for (long j = 0; j < b.nr(); ++j)
219                 {
220                     b(j) = z(i);
221                     ++i;
222                 }
223 
224                 for (long j = 0; j < out_vectors.size(); ++j)
225                 {
226                     for (long k = 0; k < out_vectors(j).size(); ++k)
227                     {
228                         out_vectors(j)(k) = z(i);
229                         ++i;
230                     }
231                 }
232             }
233 
operator()234             double operator() (
235                 const matrix<scalar_type, 0, 1, mem_manager_type>& z
236             ) const
237             /*!
238                 ensures
239                     - loads the current approximate distance function with z
240                     - returns the distance between the original distance function
241                       and the approximate one.
242             !*/
243             {
244                 vector_to_state(z);
245                 const kernel_type k(dist_funct.get_kernel());
246 
247                 double temp = 0;
248                 for (long i = 0; i < out_vectors.size(); ++i)
249                 {
250                     for (long j = 0; j < dist_funct.get_basis_vectors().nr(); ++j)
251                     {
252                         temp -= b(i)*dist_funct.get_alpha()(j)*k(out_vectors(i), dist_funct.get_basis_vectors()(j));
253                     }
254                 }
255 
256                 temp *= 2;
257 
258                 for (long i = 0; i < out_vectors.size(); ++i)
259                 {
260                     for (long j = 0; j < out_vectors.size(); ++j)
261                     {
262                         temp += b(i)*b(j)*k(out_vectors(i), out_vectors(j));
263                     }
264                 }
265 
266                 return temp + dist_funct.get_squared_norm();
267             }
268 
269         private:
270 
271             const distance_function<kernel_type>& dist_funct;
272             matrix<scalar_type,0,1,mem_manager_type>& b;
273             matrix<sample_type,0,1,mem_manager_type>& out_vectors;
274 
275         };
276 
277     // ------------------------------------------------------------------------------------
278 
279         template <typename kernel_type>
280         class objective_derivative
281         {
282             /*!
283                 This object represents the derivative of the objective object
284             !*/
285             typedef typename kernel_type::scalar_type scalar_type;
286             typedef typename kernel_type::sample_type sample_type;
287             typedef typename kernel_type::mem_manager_type mem_manager_type;
288         public:
289 
290 
objective_derivative(const distance_function<kernel_type> & dist_funct_,matrix<scalar_type,0,1,mem_manager_type> & b_,matrix<sample_type,0,1,mem_manager_type> & out_vectors_)291             objective_derivative(
292                 const distance_function<kernel_type>& dist_funct_,
293                 matrix<scalar_type,0,1,mem_manager_type>& b_,
294                 matrix<sample_type,0,1,mem_manager_type>& out_vectors_
295             ) :
296                 dist_funct(dist_funct_),
297                 b(b_),
298                 out_vectors(out_vectors_)
299             {
300             }
301 
vector_to_state(const matrix<scalar_type,0,1,mem_manager_type> & z)302             void vector_to_state (
303                 const matrix<scalar_type, 0, 1, mem_manager_type>& z
304             ) const
305             /*!
306                 requires
307                     - z came from the state_to_vector() function or has a compatible format
308                 ensures
309                     - loads the vector z into the state variables of the approximate
310                       distance function (i.e. b and out_vectors)
311             !*/
312             {
313                 long i = 0;
314                 for (long j = 0; j < b.nr(); ++j)
315                 {
316                     b(j) = z(i);
317                     ++i;
318                 }
319 
320                 for (long j = 0; j < out_vectors.size(); ++j)
321                 {
322                     for (long k = 0; k < out_vectors(j).size(); ++k)
323                     {
324                         out_vectors(j)(k) = z(i);
325                         ++i;
326                     }
327                 }
328             }
329 
operator()330             const matrix<scalar_type,0,1,mem_manager_type>& operator() (
331                 const matrix<scalar_type, 0, 1, mem_manager_type>& z
332             ) const
333             /*!
334                 ensures
335                     - loads the current approximate distance function with z
336                     - returns the derivative of the distance between the original
337                       distance function and the approximate one.
338             !*/
339             {
340                 vector_to_state(z);
341                 res.set_size(z.nr());
342                 set_all_elements(res,0);
343                 const kernel_type k(dist_funct.get_kernel());
344                 const kernel_derivative<kernel_type> K_der(k);
345 
346                 // first compute the gradient for the beta weights
347                 for (long i = 0; i < out_vectors.size(); ++i)
348                 {
349                     for (long j = 0; j < out_vectors.size(); ++j)
350                     {
351                         res(i) += b(j)*k(out_vectors(i), out_vectors(j));
352                     }
353                 }
354                 for (long i = 0; i < out_vectors.size(); ++i)
355                 {
356                     for (long j = 0; j < dist_funct.get_basis_vectors().size(); ++j)
357                     {
358                         res(i) -= dist_funct.get_alpha()(j)*k(out_vectors(i), dist_funct.get_basis_vectors()(j));
359                     }
360                 }
361 
362 
363                 // now compute the gradient of the actual vectors that go into the kernel functions
364                 long pos = out_vectors.size();
365                 const long num = out_vectors(0).nr();
366                 temp.set_size(num,1);
367                 for (long i = 0; i < out_vectors.size(); ++i)
368                 {
369                     set_all_elements(temp,0);
370                     for (long j = 0; j < out_vectors.size(); ++j)
371                     {
372                         temp += b(j)*K_der(out_vectors(j), out_vectors(i));
373                     }
374                     for (long j = 0; j < dist_funct.get_basis_vectors().nr(); ++j)
375                     {
376                         temp -= dist_funct.get_alpha()(j)*K_der(dist_funct.get_basis_vectors()(j), out_vectors(i) );
377                     }
378 
379                     // store the gradient for out_vectors(i) into result in the proper spot
380                     set_subm(res,pos,0,num,1) = b(i)*temp;
381                     pos += num;
382                 }
383 
384 
385                 res *= 2;
386                 return res;
387             }
388 
389         private:
390 
391             mutable matrix<scalar_type, 0, 1, mem_manager_type> res;
392             mutable sample_type temp;
393 
394             const distance_function<kernel_type>& dist_funct;
395             matrix<scalar_type,0,1,mem_manager_type>& b;
396             matrix<sample_type,0,1,mem_manager_type>& out_vectors;
397 
398         };
399 
400     // ------------------------------------------------------------------------------------
401 
402     }
403 
404     template <
405         typename K,
406         typename stop_strategy_type,
407         typename T
408         >
approximate_distance_function(stop_strategy_type stop_strategy,const distance_function<K> & target,const T & starting_basis)409     distance_function<K> approximate_distance_function (
410         stop_strategy_type stop_strategy,
411         const distance_function<K>& target,
412         const T& starting_basis
413     )
414     {
415         // make sure requires clause is not broken
416         DLIB_ASSERT(target.get_basis_vectors().size() > 0 &&
417                     starting_basis.size() > 0,
418                     "\t  distance_function approximate_distance_function()"
419                     << "\n\t Invalid inputs were given to this function."
420                     << "\n\t target.get_basis_vectors().size(): " << target.get_basis_vectors().size()
421                     << "\n\t starting_basis.size():             " << starting_basis.size()
422         );
423 
424         using namespace red_impl;
425         // The next few statements just find the best weights with which to approximate
426         // the target object with the set of basis vectors in starting_basis.  This
427         // is really just a simple application of some linear algebra.  For the details
428         // see page 554 of Learning with kernels by Scholkopf and Smola where they talk
429         // about "Optimal Expansion Coefficients."
430 
431         const K kern(target.get_kernel());
432         typedef typename K::scalar_type scalar_type;
433         typedef typename K::sample_type sample_type;
434         typedef typename K::mem_manager_type mem_manager_type;
435 
436         matrix<scalar_type,0,1,mem_manager_type> beta;
437 
438         // Now we compute the fist approximate distance function.
439         beta = pinv(kernel_matrix(kern,starting_basis)) *
440             (kernel_matrix(kern,starting_basis,target.get_basis_vectors())*target.get_alpha());
441         matrix<sample_type,0,1,mem_manager_type> out_vectors(mat(starting_basis));
442 
443 
444         // Now setup to do a global optimization of all the parameters in the approximate
445         // distance function.
446         const objective<K> obj(target, beta, out_vectors);
447         const objective_derivative<K> obj_der(target, beta, out_vectors);
448         matrix<scalar_type,0,1,mem_manager_type> opt_starting_point(obj.state_to_vector());
449 
450 
451         // perform a full optimization of all the parameters (i.e. both beta and the basis vectors together)
452         find_min(lbfgs_search_strategy(20),
453                  stop_strategy,
454                  obj, obj_der, opt_starting_point, 0);
455 
456         // now make sure that the final optimized value is loaded into the beta and
457         // out_vectors matrices
458         obj.vector_to_state(opt_starting_point);
459 
460         // Do a final reoptimization of beta just to make sure it is optimal given the new
461         // set of basis vectors.
462         beta = pinv(kernel_matrix(kern,out_vectors))*(kernel_matrix(kern,out_vectors,target.get_basis_vectors())*target.get_alpha());
463 
464         // It is possible that some of the beta weights will be very close to zero.  Lets remove
465         // the basis vectors with these essentially zero weights.
466         const scalar_type eps = max(abs(beta))*std::numeric_limits<scalar_type>::epsilon();
467         for (long i = 0; i < beta.size(); ++i)
468         {
469             // if beta(i) is zero (but leave at least one beta no matter what)
470             if (std::abs(beta(i)) < eps && beta.size() > 1)
471             {
472                 beta = remove_row(beta, i);
473                 out_vectors = remove_row(out_vectors, i);
474                 --i;
475             }
476         }
477 
478         return distance_function<K>(beta, kern, out_vectors);
479     }
480 
481 // ----------------------------------------------------------------------------------------
482 // ----------------------------------------------------------------------------------------
483 // ----------------------------------------------------------------------------------------
484 
485     template <
486         typename trainer_type
487         >
488     class reduced_decision_function_trainer2
489     {
490     public:
491         typedef typename trainer_type::kernel_type kernel_type;
492         typedef typename trainer_type::scalar_type scalar_type;
493         typedef typename trainer_type::sample_type sample_type;
494         typedef typename trainer_type::mem_manager_type mem_manager_type;
495         typedef typename trainer_type::trained_function_type trained_function_type;
496 
reduced_decision_function_trainer2()497         reduced_decision_function_trainer2 () : num_bv(0) {}
498         reduced_decision_function_trainer2 (
499             const trainer_type& trainer_,
500             const long num_sb_,
501             const double eps_ = 1e-3
502         ) :
trainer(trainer_)503             trainer(trainer_),
504             num_bv(num_sb_),
505             eps(eps_)
506         {
507             COMPILE_TIME_ASSERT(is_matrix<sample_type>::value);
508 
509             // make sure requires clause is not broken
510             DLIB_ASSERT(num_bv > 0 && eps > 0,
511                         "\t reduced_decision_function_trainer2()"
512                         << "\n\t you have given invalid arguments to this function"
513                         << "\n\t num_bv: " << num_bv
514                         << "\n\t eps:    " << eps
515             );
516         }
517 
518         template <
519             typename in_sample_vector_type,
520             typename in_scalar_vector_type
521             >
train(const in_sample_vector_type & x,const in_scalar_vector_type & y)522         const decision_function<kernel_type> train (
523             const in_sample_vector_type& x,
524             const in_scalar_vector_type& y
525         ) const
526         {
527             // make sure requires clause is not broken
528             DLIB_ASSERT(num_bv > 0,
529                         "\t reduced_decision_function_trainer2::train(x,y)"
530                         << "\n\t You have tried to use an uninitialized version of this object"
531                         << "\n\t num_bv: " << num_bv );
532             return do_train(mat(x), mat(y));
533         }
534 
535     private:
536 
537         template <
538             typename in_sample_vector_type,
539             typename in_scalar_vector_type
540             >
do_train(const in_sample_vector_type & x,const in_scalar_vector_type & y)541         const decision_function<kernel_type> do_train (
542             const in_sample_vector_type& x,
543             const in_scalar_vector_type& y
544         ) const
545         {
546             // get the decision function object we are going to try and approximate
547             const decision_function<kernel_type>& dec_funct = trainer.train(x,y);
548             const kernel_type kern(dec_funct.kernel_function);
549 
550             // now find a linearly independent subset of the training points of num_bv points.
551             linearly_independent_subset_finder<kernel_type> lisf(kern, num_bv);
552             fill_lisf(lisf,x);
553 
554             distance_function<kernel_type> approx, target;
555             target = dec_funct;
556             approx = approximate_distance_function(objective_delta_stop_strategy(eps), target, lisf);
557 
558             decision_function<kernel_type> new_df(approx.get_alpha(),
559                                                   0,
560                                                   kern,
561                                                   approx.get_basis_vectors());
562 
563             // now we have to figure out what the new bias should be.  It might be a little
564             // different since we just messed with all the weights and vectors.
565             double bias = 0;
566             for (long i = 0; i < x.nr(); ++i)
567             {
568                 bias += new_df(x(i)) - dec_funct(x(i));
569             }
570 
571             new_df.b = bias/x.nr();
572 
573             return new_df;
574 
575         }
576 
577     // ------------------------------------------------------------------------------------
578 
579         trainer_type trainer;
580         long num_bv;
581         double eps;
582 
583 
584     }; // end of class reduced_decision_function_trainer2
585 
586     template <typename trainer_type>
587     const reduced_decision_function_trainer2<trainer_type> reduced2 (
588         const trainer_type& trainer,
589         const long num_bv,
590         double eps = 1e-3
591     )
592     {
593         COMPILE_TIME_ASSERT(is_matrix<typename trainer_type::sample_type>::value);
594 
595         // make sure requires clause is not broken
596         DLIB_ASSERT(num_bv > 0 && eps > 0,
597                     "\tconst reduced_decision_function_trainer2 reduced2()"
598                     << "\n\t you have given invalid arguments to this function"
599                     << "\n\t num_bv: " << num_bv
600                     << "\n\t eps:    " << eps
601         );
602 
603         return reduced_decision_function_trainer2<trainer_type>(trainer, num_bv, eps);
604     }
605 
606 // ----------------------------------------------------------------------------------------
607 // ----------------------------------------------------------------------------------------
608 // ----------------------------------------------------------------------------------------
609 
610 }
611 
612 #endif // DLIB_REDUCEd_TRAINERS_
613 
614