1 // Copyright (C) 2008 Davis E. King (davis@dlib.net) 2 // License: Boost Software License See LICENSE.txt for the full license. 3 #ifndef DLIB_REDUCEd_TRAINERS_ 4 #define DLIB_REDUCEd_TRAINERS_ 5 6 #include "reduced_abstract.h" 7 #include "../matrix.h" 8 #include "../algs.h" 9 #include "function.h" 10 #include "kernel.h" 11 #include "kcentroid.h" 12 #include "linearly_independent_subset_finder.h" 13 #include "../optimization.h" 14 15 namespace dlib 16 { 17 18 // ---------------------------------------------------------------------------------------- 19 // ---------------------------------------------------------------------------------------- 20 // ---------------------------------------------------------------------------------------- 21 22 template < 23 typename trainer_type 24 > 25 class reduced_decision_function_trainer 26 { 27 public: 28 typedef typename trainer_type::kernel_type kernel_type; 29 typedef typename trainer_type::scalar_type scalar_type; 30 typedef typename trainer_type::sample_type sample_type; 31 typedef typename trainer_type::mem_manager_type mem_manager_type; 32 typedef typename trainer_type::trained_function_type trained_function_type; 33 reduced_decision_function_trainer()34 reduced_decision_function_trainer ( 35 ) :num_bv(0) {} 36 reduced_decision_function_trainer(const trainer_type & trainer_,const unsigned long num_sb_)37 reduced_decision_function_trainer ( 38 const trainer_type& trainer_, 39 const unsigned long num_sb_ 40 ) : 41 trainer(trainer_), 42 num_bv(num_sb_) 43 { 44 // make sure requires clause is not broken 45 DLIB_ASSERT(num_bv > 0, 46 "\t reduced_decision_function_trainer()" 47 << "\n\t you have given invalid arguments to this function" 48 << "\n\t num_bv: " << num_bv 49 ); 50 } 51 52 template < 53 typename in_sample_vector_type, 54 typename in_scalar_vector_type 55 > train(const in_sample_vector_type & x,const in_scalar_vector_type & y)56 const decision_function<kernel_type> train ( 57 const in_sample_vector_type& x, 58 const in_scalar_vector_type& y 59 ) const 60 { 61 // make sure requires clause is not broken 62 DLIB_ASSERT(num_bv > 0, 63 "\t reduced_decision_function_trainer::train(x,y)" 64 << "\n\t You have tried to use an uninitialized version of this object" 65 << "\n\t num_bv: " << num_bv ); 66 return do_train(mat(x), mat(y)); 67 } 68 69 private: 70 71 // ------------------------------------------------------------------------------------ 72 73 template < 74 typename in_sample_vector_type, 75 typename in_scalar_vector_type 76 > do_train(const in_sample_vector_type & x,const in_scalar_vector_type & y)77 const decision_function<kernel_type> do_train ( 78 const in_sample_vector_type& x, 79 const in_scalar_vector_type& y 80 ) const 81 { 82 // get the decision function object we are going to try and approximate 83 const decision_function<kernel_type>& dec_funct = trainer.train(x,y); 84 85 // now find a linearly independent subset of the training points of num_bv points. 86 linearly_independent_subset_finder<kernel_type> lisf(dec_funct.kernel_function, num_bv); 87 fill_lisf(lisf, x); 88 89 // The next few statements just find the best weights with which to approximate 90 // the dec_funct object with the smaller set of vectors in the lisf dictionary. This 91 // is really just a simple application of some linear algebra. For the details 92 // see page 554 of Learning with kernels by Scholkopf and Smola where they talk 93 // about "Optimal Expansion Coefficients." 94 95 const kernel_type kern(dec_funct.kernel_function); 96 97 matrix<scalar_type,0,1,mem_manager_type> alpha; 98 99 alpha = lisf.get_inv_kernel_marix()*(kernel_matrix(kern,lisf,dec_funct.basis_vectors)*dec_funct.alpha); 100 101 decision_function<kernel_type> new_df(alpha, 102 0, 103 kern, 104 lisf.get_dictionary()); 105 106 // now we have to figure out what the new bias should be. It might be a little 107 // different since we just messed with all the weights and vectors. 108 double bias = 0; 109 for (long i = 0; i < x.nr(); ++i) 110 { 111 bias += new_df(x(i)) - dec_funct(x(i)); 112 } 113 114 new_df.b = bias/x.nr(); 115 116 return new_df; 117 } 118 119 // ------------------------------------------------------------------------------------ 120 121 trainer_type trainer; 122 unsigned long num_bv; 123 124 125 }; // end of class reduced_decision_function_trainer 126 127 template <typename trainer_type> reduced(const trainer_type & trainer,const unsigned long num_bv)128 const reduced_decision_function_trainer<trainer_type> reduced ( 129 const trainer_type& trainer, 130 const unsigned long num_bv 131 ) 132 { 133 // make sure requires clause is not broken 134 DLIB_ASSERT(num_bv > 0, 135 "\tconst reduced_decision_function_trainer reduced()" 136 << "\n\t you have given invalid arguments to this function" 137 << "\n\t num_bv: " << num_bv 138 ); 139 140 return reduced_decision_function_trainer<trainer_type>(trainer, num_bv); 141 } 142 143 // ---------------------------------------------------------------------------------------- 144 // ---------------------------------------------------------------------------------------- 145 // ---------------------------------------------------------------------------------------- 146 147 namespace red_impl 148 { 149 150 // ------------------------------------------------------------------------------------ 151 152 template <typename kernel_type> 153 class objective 154 { 155 /* 156 This object represents the objective function we will try to 157 minimize in approximate_distance_function(). 158 159 The objective is the distance, in kernel induced feature space, between 160 the original distance function and the approximated version. 161 162 */ 163 typedef typename kernel_type::scalar_type scalar_type; 164 typedef typename kernel_type::sample_type sample_type; 165 typedef typename kernel_type::mem_manager_type mem_manager_type; 166 public: objective(const distance_function<kernel_type> & dist_funct_,matrix<scalar_type,0,1,mem_manager_type> & b_,matrix<sample_type,0,1,mem_manager_type> & out_vectors_)167 objective( 168 const distance_function<kernel_type>& dist_funct_, 169 matrix<scalar_type,0,1,mem_manager_type>& b_, 170 matrix<sample_type,0,1,mem_manager_type>& out_vectors_ 171 ) : 172 dist_funct(dist_funct_), 173 b(b_), 174 out_vectors(out_vectors_) 175 { 176 } 177 state_to_vector()178 const matrix<scalar_type, 0, 1, mem_manager_type> state_to_vector ( 179 ) const 180 /*! 181 ensures 182 - returns a vector that contains all the information necessary to 183 reproduce the current state of the approximated distance function 184 !*/ 185 { 186 matrix<scalar_type, 0, 1, mem_manager_type> z(b.nr() + out_vectors.size()*out_vectors(0).nr()); 187 long i = 0; 188 for (long j = 0; j < b.nr(); ++j) 189 { 190 z(i) = b(j); 191 ++i; 192 } 193 194 for (long j = 0; j < out_vectors.size(); ++j) 195 { 196 for (long k = 0; k < out_vectors(j).size(); ++k) 197 { 198 z(i) = out_vectors(j)(k); 199 ++i; 200 } 201 } 202 return z; 203 } 204 205 vector_to_state(const matrix<scalar_type,0,1,mem_manager_type> & z)206 void vector_to_state ( 207 const matrix<scalar_type, 0, 1, mem_manager_type>& z 208 ) const 209 /*! 210 requires 211 - z came from the state_to_vector() function or has a compatible format 212 ensures 213 - loads the vector z into the state variables of the approximate 214 distance function (i.e. b and out_vectors) 215 !*/ 216 { 217 long i = 0; 218 for (long j = 0; j < b.nr(); ++j) 219 { 220 b(j) = z(i); 221 ++i; 222 } 223 224 for (long j = 0; j < out_vectors.size(); ++j) 225 { 226 for (long k = 0; k < out_vectors(j).size(); ++k) 227 { 228 out_vectors(j)(k) = z(i); 229 ++i; 230 } 231 } 232 } 233 operator()234 double operator() ( 235 const matrix<scalar_type, 0, 1, mem_manager_type>& z 236 ) const 237 /*! 238 ensures 239 - loads the current approximate distance function with z 240 - returns the distance between the original distance function 241 and the approximate one. 242 !*/ 243 { 244 vector_to_state(z); 245 const kernel_type k(dist_funct.get_kernel()); 246 247 double temp = 0; 248 for (long i = 0; i < out_vectors.size(); ++i) 249 { 250 for (long j = 0; j < dist_funct.get_basis_vectors().nr(); ++j) 251 { 252 temp -= b(i)*dist_funct.get_alpha()(j)*k(out_vectors(i), dist_funct.get_basis_vectors()(j)); 253 } 254 } 255 256 temp *= 2; 257 258 for (long i = 0; i < out_vectors.size(); ++i) 259 { 260 for (long j = 0; j < out_vectors.size(); ++j) 261 { 262 temp += b(i)*b(j)*k(out_vectors(i), out_vectors(j)); 263 } 264 } 265 266 return temp + dist_funct.get_squared_norm(); 267 } 268 269 private: 270 271 const distance_function<kernel_type>& dist_funct; 272 matrix<scalar_type,0,1,mem_manager_type>& b; 273 matrix<sample_type,0,1,mem_manager_type>& out_vectors; 274 275 }; 276 277 // ------------------------------------------------------------------------------------ 278 279 template <typename kernel_type> 280 class objective_derivative 281 { 282 /*! 283 This object represents the derivative of the objective object 284 !*/ 285 typedef typename kernel_type::scalar_type scalar_type; 286 typedef typename kernel_type::sample_type sample_type; 287 typedef typename kernel_type::mem_manager_type mem_manager_type; 288 public: 289 290 objective_derivative(const distance_function<kernel_type> & dist_funct_,matrix<scalar_type,0,1,mem_manager_type> & b_,matrix<sample_type,0,1,mem_manager_type> & out_vectors_)291 objective_derivative( 292 const distance_function<kernel_type>& dist_funct_, 293 matrix<scalar_type,0,1,mem_manager_type>& b_, 294 matrix<sample_type,0,1,mem_manager_type>& out_vectors_ 295 ) : 296 dist_funct(dist_funct_), 297 b(b_), 298 out_vectors(out_vectors_) 299 { 300 } 301 vector_to_state(const matrix<scalar_type,0,1,mem_manager_type> & z)302 void vector_to_state ( 303 const matrix<scalar_type, 0, 1, mem_manager_type>& z 304 ) const 305 /*! 306 requires 307 - z came from the state_to_vector() function or has a compatible format 308 ensures 309 - loads the vector z into the state variables of the approximate 310 distance function (i.e. b and out_vectors) 311 !*/ 312 { 313 long i = 0; 314 for (long j = 0; j < b.nr(); ++j) 315 { 316 b(j) = z(i); 317 ++i; 318 } 319 320 for (long j = 0; j < out_vectors.size(); ++j) 321 { 322 for (long k = 0; k < out_vectors(j).size(); ++k) 323 { 324 out_vectors(j)(k) = z(i); 325 ++i; 326 } 327 } 328 } 329 operator()330 const matrix<scalar_type,0,1,mem_manager_type>& operator() ( 331 const matrix<scalar_type, 0, 1, mem_manager_type>& z 332 ) const 333 /*! 334 ensures 335 - loads the current approximate distance function with z 336 - returns the derivative of the distance between the original 337 distance function and the approximate one. 338 !*/ 339 { 340 vector_to_state(z); 341 res.set_size(z.nr()); 342 set_all_elements(res,0); 343 const kernel_type k(dist_funct.get_kernel()); 344 const kernel_derivative<kernel_type> K_der(k); 345 346 // first compute the gradient for the beta weights 347 for (long i = 0; i < out_vectors.size(); ++i) 348 { 349 for (long j = 0; j < out_vectors.size(); ++j) 350 { 351 res(i) += b(j)*k(out_vectors(i), out_vectors(j)); 352 } 353 } 354 for (long i = 0; i < out_vectors.size(); ++i) 355 { 356 for (long j = 0; j < dist_funct.get_basis_vectors().size(); ++j) 357 { 358 res(i) -= dist_funct.get_alpha()(j)*k(out_vectors(i), dist_funct.get_basis_vectors()(j)); 359 } 360 } 361 362 363 // now compute the gradient of the actual vectors that go into the kernel functions 364 long pos = out_vectors.size(); 365 const long num = out_vectors(0).nr(); 366 temp.set_size(num,1); 367 for (long i = 0; i < out_vectors.size(); ++i) 368 { 369 set_all_elements(temp,0); 370 for (long j = 0; j < out_vectors.size(); ++j) 371 { 372 temp += b(j)*K_der(out_vectors(j), out_vectors(i)); 373 } 374 for (long j = 0; j < dist_funct.get_basis_vectors().nr(); ++j) 375 { 376 temp -= dist_funct.get_alpha()(j)*K_der(dist_funct.get_basis_vectors()(j), out_vectors(i) ); 377 } 378 379 // store the gradient for out_vectors(i) into result in the proper spot 380 set_subm(res,pos,0,num,1) = b(i)*temp; 381 pos += num; 382 } 383 384 385 res *= 2; 386 return res; 387 } 388 389 private: 390 391 mutable matrix<scalar_type, 0, 1, mem_manager_type> res; 392 mutable sample_type temp; 393 394 const distance_function<kernel_type>& dist_funct; 395 matrix<scalar_type,0,1,mem_manager_type>& b; 396 matrix<sample_type,0,1,mem_manager_type>& out_vectors; 397 398 }; 399 400 // ------------------------------------------------------------------------------------ 401 402 } 403 404 template < 405 typename K, 406 typename stop_strategy_type, 407 typename T 408 > approximate_distance_function(stop_strategy_type stop_strategy,const distance_function<K> & target,const T & starting_basis)409 distance_function<K> approximate_distance_function ( 410 stop_strategy_type stop_strategy, 411 const distance_function<K>& target, 412 const T& starting_basis 413 ) 414 { 415 // make sure requires clause is not broken 416 DLIB_ASSERT(target.get_basis_vectors().size() > 0 && 417 starting_basis.size() > 0, 418 "\t distance_function approximate_distance_function()" 419 << "\n\t Invalid inputs were given to this function." 420 << "\n\t target.get_basis_vectors().size(): " << target.get_basis_vectors().size() 421 << "\n\t starting_basis.size(): " << starting_basis.size() 422 ); 423 424 using namespace red_impl; 425 // The next few statements just find the best weights with which to approximate 426 // the target object with the set of basis vectors in starting_basis. This 427 // is really just a simple application of some linear algebra. For the details 428 // see page 554 of Learning with kernels by Scholkopf and Smola where they talk 429 // about "Optimal Expansion Coefficients." 430 431 const K kern(target.get_kernel()); 432 typedef typename K::scalar_type scalar_type; 433 typedef typename K::sample_type sample_type; 434 typedef typename K::mem_manager_type mem_manager_type; 435 436 matrix<scalar_type,0,1,mem_manager_type> beta; 437 438 // Now we compute the fist approximate distance function. 439 beta = pinv(kernel_matrix(kern,starting_basis)) * 440 (kernel_matrix(kern,starting_basis,target.get_basis_vectors())*target.get_alpha()); 441 matrix<sample_type,0,1,mem_manager_type> out_vectors(mat(starting_basis)); 442 443 444 // Now setup to do a global optimization of all the parameters in the approximate 445 // distance function. 446 const objective<K> obj(target, beta, out_vectors); 447 const objective_derivative<K> obj_der(target, beta, out_vectors); 448 matrix<scalar_type,0,1,mem_manager_type> opt_starting_point(obj.state_to_vector()); 449 450 451 // perform a full optimization of all the parameters (i.e. both beta and the basis vectors together) 452 find_min(lbfgs_search_strategy(20), 453 stop_strategy, 454 obj, obj_der, opt_starting_point, 0); 455 456 // now make sure that the final optimized value is loaded into the beta and 457 // out_vectors matrices 458 obj.vector_to_state(opt_starting_point); 459 460 // Do a final reoptimization of beta just to make sure it is optimal given the new 461 // set of basis vectors. 462 beta = pinv(kernel_matrix(kern,out_vectors))*(kernel_matrix(kern,out_vectors,target.get_basis_vectors())*target.get_alpha()); 463 464 // It is possible that some of the beta weights will be very close to zero. Lets remove 465 // the basis vectors with these essentially zero weights. 466 const scalar_type eps = max(abs(beta))*std::numeric_limits<scalar_type>::epsilon(); 467 for (long i = 0; i < beta.size(); ++i) 468 { 469 // if beta(i) is zero (but leave at least one beta no matter what) 470 if (std::abs(beta(i)) < eps && beta.size() > 1) 471 { 472 beta = remove_row(beta, i); 473 out_vectors = remove_row(out_vectors, i); 474 --i; 475 } 476 } 477 478 return distance_function<K>(beta, kern, out_vectors); 479 } 480 481 // ---------------------------------------------------------------------------------------- 482 // ---------------------------------------------------------------------------------------- 483 // ---------------------------------------------------------------------------------------- 484 485 template < 486 typename trainer_type 487 > 488 class reduced_decision_function_trainer2 489 { 490 public: 491 typedef typename trainer_type::kernel_type kernel_type; 492 typedef typename trainer_type::scalar_type scalar_type; 493 typedef typename trainer_type::sample_type sample_type; 494 typedef typename trainer_type::mem_manager_type mem_manager_type; 495 typedef typename trainer_type::trained_function_type trained_function_type; 496 reduced_decision_function_trainer2()497 reduced_decision_function_trainer2 () : num_bv(0) {} 498 reduced_decision_function_trainer2 ( 499 const trainer_type& trainer_, 500 const long num_sb_, 501 const double eps_ = 1e-3 502 ) : trainer(trainer_)503 trainer(trainer_), 504 num_bv(num_sb_), 505 eps(eps_) 506 { 507 COMPILE_TIME_ASSERT(is_matrix<sample_type>::value); 508 509 // make sure requires clause is not broken 510 DLIB_ASSERT(num_bv > 0 && eps > 0, 511 "\t reduced_decision_function_trainer2()" 512 << "\n\t you have given invalid arguments to this function" 513 << "\n\t num_bv: " << num_bv 514 << "\n\t eps: " << eps 515 ); 516 } 517 518 template < 519 typename in_sample_vector_type, 520 typename in_scalar_vector_type 521 > train(const in_sample_vector_type & x,const in_scalar_vector_type & y)522 const decision_function<kernel_type> train ( 523 const in_sample_vector_type& x, 524 const in_scalar_vector_type& y 525 ) const 526 { 527 // make sure requires clause is not broken 528 DLIB_ASSERT(num_bv > 0, 529 "\t reduced_decision_function_trainer2::train(x,y)" 530 << "\n\t You have tried to use an uninitialized version of this object" 531 << "\n\t num_bv: " << num_bv ); 532 return do_train(mat(x), mat(y)); 533 } 534 535 private: 536 537 template < 538 typename in_sample_vector_type, 539 typename in_scalar_vector_type 540 > do_train(const in_sample_vector_type & x,const in_scalar_vector_type & y)541 const decision_function<kernel_type> do_train ( 542 const in_sample_vector_type& x, 543 const in_scalar_vector_type& y 544 ) const 545 { 546 // get the decision function object we are going to try and approximate 547 const decision_function<kernel_type>& dec_funct = trainer.train(x,y); 548 const kernel_type kern(dec_funct.kernel_function); 549 550 // now find a linearly independent subset of the training points of num_bv points. 551 linearly_independent_subset_finder<kernel_type> lisf(kern, num_bv); 552 fill_lisf(lisf,x); 553 554 distance_function<kernel_type> approx, target; 555 target = dec_funct; 556 approx = approximate_distance_function(objective_delta_stop_strategy(eps), target, lisf); 557 558 decision_function<kernel_type> new_df(approx.get_alpha(), 559 0, 560 kern, 561 approx.get_basis_vectors()); 562 563 // now we have to figure out what the new bias should be. It might be a little 564 // different since we just messed with all the weights and vectors. 565 double bias = 0; 566 for (long i = 0; i < x.nr(); ++i) 567 { 568 bias += new_df(x(i)) - dec_funct(x(i)); 569 } 570 571 new_df.b = bias/x.nr(); 572 573 return new_df; 574 575 } 576 577 // ------------------------------------------------------------------------------------ 578 579 trainer_type trainer; 580 long num_bv; 581 double eps; 582 583 584 }; // end of class reduced_decision_function_trainer2 585 586 template <typename trainer_type> 587 const reduced_decision_function_trainer2<trainer_type> reduced2 ( 588 const trainer_type& trainer, 589 const long num_bv, 590 double eps = 1e-3 591 ) 592 { 593 COMPILE_TIME_ASSERT(is_matrix<typename trainer_type::sample_type>::value); 594 595 // make sure requires clause is not broken 596 DLIB_ASSERT(num_bv > 0 && eps > 0, 597 "\tconst reduced_decision_function_trainer2 reduced2()" 598 << "\n\t you have given invalid arguments to this function" 599 << "\n\t num_bv: " << num_bv 600 << "\n\t eps: " << eps 601 ); 602 603 return reduced_decision_function_trainer2<trainer_type>(trainer, num_bv, eps); 604 } 605 606 // ---------------------------------------------------------------------------------------- 607 // ---------------------------------------------------------------------------------------- 608 // ---------------------------------------------------------------------------------------- 609 610 } 611 612 #endif // DLIB_REDUCEd_TRAINERS_ 613 614