// This is mul/clsfy/clsfy_direct_boost_builder.cxx
//:
// \file
// \brief Functions to train classifiers using a direct boosting algorithm
// \author dac
// \date   Fri Mar  1 23:49:39 2002
//
//  Functions to train classifiers using a direct boosting algorithm.
//  DirectBoost combines a set of (usually simple, weak) classifiers into
//  a more powerful single classifier.  Essentially it selects the
//  classifiers one at a time, choosing the best at each step.
//  The classifiers are trained to distinguish the examples mis-classified
//  by the currently selected classifiers.
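//
//  A minimal usage sketch (the setter names are taken from the error
//  messages in build(); the weak-builder and data-wrapper set-up is
//  assumed, not shown in this file):
//  \code
//    clsfy_direct_boost_builder builder;
//    builder.set_max_n_clfrs(50);            // maximum number of weak classifiers
//    builder.set_batch_size(100);            // features rearranged per pass
//    builder.set_weak_builder(weak_builder); // some clsfy_builder_1d (argument form per the header)
//    clsfy_classifier_base* classifier = builder.new_classifier();
//    double train_error =
//        builder.build(*classifier, training_inputs, 1 /*ignored*/, training_outputs);
//  \endcode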

#include <iostream>
#include <cstdlib>
#include <algorithm>
#include "clsfy_direct_boost_builder.h"
#include "clsfy_direct_boost.h"
#include "clsfy_builder_1d.h"

#ifdef _MSC_VER
#  include "vcl_msvc_warnings.h"
#endif
#include <cassert>
#include <mbl/mbl_file_data_collector.h>
#include <mbl/mbl_data_collector_list.h>
#include <mbl/mbl_index_sort.h>

//=======================================================================

clsfy_direct_boost_builder::clsfy_direct_boost_builder() = default;

//=======================================================================

clsfy_direct_boost_builder::~clsfy_direct_boost_builder() = default;


//=======================================================================

bool clsfy_direct_boost_builder::is_class(std::string const& s) const
{
  return s == clsfy_direct_boost_builder::is_a() || clsfy_builder_base::is_class(s);
}

//=======================================================================

std::string clsfy_direct_boost_builder::is_a() const
{
  return std::string("clsfy_direct_boost_builder");
}


//: Calc proportion of identical elements in two boolean response vectors
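//  e.g. {true,false,true,true} and {true,true,true,false} agree on two of
//  the four elements, so 0.5 is returned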
double clsfy_direct_boost_builder::calc_prop_same(
                   const std::vector<bool>& vec1,
                   const std::vector<bool>& vec2) const
{
  unsigned n = vec1.size();
  assert( n==vec2.size() );
  int sum= 0;
  for (unsigned i=0;i<n;++i)
    if (vec1[i]==vec2[i])
      ++sum;

  return sum*1.0/n;
}


//: Calc threshold for current version of strong classifier
double clsfy_direct_boost_builder::calc_threshold(
                   clsfy_direct_boost& strong_classifier,
                   mbl_data_wrapper<vnl_vector<double> >& inputs,
                   const std::vector<unsigned>& outputs) const
{
  // calc classification score for each example
  unsigned long n = inputs.size();
  std::vector<double> scores(n);
  inputs.reset();
  for (unsigned long i=0;i<n;++i)
  {
    scores[i]= strong_classifier.log_l( inputs.current() );
    inputs.next();
  }

  // calc number of positive examples
  unsigned int tot_pos=0;
  for (unsigned long i=0;i<n;++i)
    if ( outputs[i] == 1 ) ++tot_pos;

  // then find threshold that gives min_error over training set
  std::vector<int> index;
  mbl_index_sort(scores, index);

  unsigned int n_pos=0;
  unsigned int n_neg=0;
  unsigned long min_error= n+1;
  double min_thresh= -1;
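  // Walk the examples in order of increasing score: at each position keep a
  // running count of the positives and negatives seen so far, and evaluate
  // the training error of placing the threshold just above the current score.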
  for (unsigned long i=0;i<n;++i)
  {
#ifdef DEBUG
    std::cout<<" scores[ index["<<i<<"] ] = "<< scores[ index[i] ]<<" ; "
            <<"outputs[ index["<<i<<"] ] = "<<outputs[ index[i] ]<<'\n';
#endif
    if ( outputs[ index[i] ] == 0 ) ++n_neg;
    else if ( outputs[ index[i] ] == 1 ) ++n_pos;
    else
    {
      std::cout<<"ERROR: clsfy_direct_boost_builder::calc_threshold()\n"
              <<"Unrecognised output value\n"
              <<"outputs[ index["<<i<<"] ] = outputs["<<index[i]<<"] = "
              <<outputs[index[i]]<<'\n';
      std::abort();
    }

#ifdef DEBUG
    std::cout<<"n = "<<n<<", n_pos= "<<n_pos<<", n_neg= "<<n_neg<<'\n';
#endif
    unsigned int error= n_neg+(tot_pos-n_pos);

    if ( error<= min_error )
    {
      min_error= error;
      min_thresh = scores[ index[i] ] + 0.001 ;
#ifdef DEBUG
      std::cout<<"error= "<<error<<", min_thresh= "<<min_thresh<<'\n';
#endif
    }
  }

  assert( n_pos + n_neg == n );
#ifdef DEBUG
  std::cout<<"min_error= "<<min_error<<", min_thresh= "<<min_thresh<<'\n';
#endif

  return min_thresh;
}


//: Build classifier composed of 1d classifiers working on individual vector elements
// Builds an n-component classifier, each component of which is a 1D classifier
// working on a single element of the input vector.
double clsfy_direct_boost_builder::build(clsfy_classifier_base& model,
                                         mbl_data_wrapper<vnl_vector<double> >& inputs,
                                         unsigned /* nClasses */,
                                         const std::vector<unsigned> &outputs) const
{
  // NB: the nClasses argument is ignored, i.e. this always builds a binary classifier

  assert( model.is_class("clsfy_direct_boost") );
  auto &strong_classifier = (clsfy_direct_boost&) model;


  // check parameters are OK
  if ( max_n_clfrs_ < 0 )
  {
    std::cout<<"Error: clsfy_direct_boost_builder::build\n"
            <<"max_n_clfrs_ = "<<max_n_clfrs_<<" ie < 0\n"
            <<"set using set_max_n_clfrs()\n";
    std::abort();
  }
  else
  {
    std::cout<<"Maximum number of classifiers to be found by direct boost = "
            <<max_n_clfrs_<<'\n';
  }

  if ( weak_builder_ == nullptr )
  {
    std::cout<<"Error: clsfy_direct_boost_builder::build\n"
            <<"weak_builder_ pointer has not been set\n"
            <<"need to provide a builder to build each weak classifier\n"
            <<"set using set_weak_builder()\n";
    std::abort();
  }
  else
  {
    std::cout<<"Weak learner used by direct boost = "
            <<weak_builder_->is_a()<<'\n';
  }

  if ( bs_ < 0 )
  {
    std::cout<<"Error: clsfy_direct_boost_builder::build\n"
            <<"bs_ = "<<bs_<<" ie < 0\n"
            <<"set using set_batch_size()\n";
    std::abort();
  }
  else
  {
    std::cout<<"Batch size when sorting data = "<<bs_<<'\n';
  }


  assert(bs_>0);
  assert(bs_!=1);
  assert (max_n_clfrs_ >= 0);

  // First rearrange the data: for each input dimension gather that feature's
  // value from every example into a single n-element vector (one vector per
  // candidate weak classifier), collect these vectors (in RAM or on disk),
  // then build and evaluate the weak classifiers from them.

  // number of examples
  unsigned long n = inputs.size();
  //std::cout<<"n = "<<n<<std::endl;

  // Dimensionality of data
  inputs.reset();
  unsigned int d = inputs.current().size();

  // Use a data collector rather than an in-memory std::vector so that the
  // per-feature vectors can be stored on disk instead of RAM if requested.

  std::string temp_path= "temp.dat";
  mbl_file_data_collector< vnl_vector<double> >
              file_collector( temp_path );

  mbl_data_collector_list< vnl_vector< double > >
              ram_collector;

  mbl_data_collector<vnl_vector< double> >*  collector;

  if (save_data_to_disk_)
  {
    std::cout<<"saving data to disk!\n";
    collector= &file_collector;
  }
  else
  {
    //bs_ = n ;
    std::cout<<"saving data to ram!\n";
    collector= &ram_collector;
  }


  // Load and rearrange the features in batches of bs_ at a time
  // (i.e. bs_ features per pass through the data).

  std::vector< vnl_vector<double> > vec(bs_);

  std::cout<<"d= "<<d<<std::endl;
  unsigned int b=0;
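  // Transpose the data in batches of bs_ features: for each feature index in
  // [b,b+r), gather that feature's value from every example into a single
  // n-element vector and hand it to the collector.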
  while ( b+1<d )
  {
    int r= std::min ( bs_, int(d-b) );
    assert(r>0);

    std::cout<<"arranging weak classifier data = "<<b<<" to "
            <<(b+r)-1<<" of "<<d<<std::endl;

    // have to resize all vectors
    for (int i=0; i< bs_; ++i)
      vec[i].set_size(n);

    // add data for both classes
    inputs.reset();
    for (unsigned long j=0;j<n;++j)
    {
      for (int i=0; i< r; ++i)
        vec[i](j)=( inputs.current()[b+i] );
      inputs.next();
    }


    for (int i=0; i< r; ++i)
    {
      // check the vector for each individual weak classifier is complete
      assert (vec[i].size() == n);
      assert (n != 0);

      // store the vector of feature values for each individual weak classifier
      collector->record(vec[i]);
    }

    b+=bs_;
  }


  mbl_data_wrapper< vnl_vector<double> >&
              wrapper=collector->data_wrapper();


  // now actually apply direct boost algorithm
  wrapper.reset();
  assert ( wrapper.current().size() == n );
  assert ( d == wrapper.size() );


  // nb the weak builder must have been set elsewhere (via set_weak_builder())
  clsfy_classifier_1d* c1d = weak_builder_->new_classifier();

  // uniform weights (1/n each): direct boost does not reweight examples,
  // but the weak builder interface expects a weight vector
  vnl_vector<double> wts(n,1.0/n);

  // need to train each weak classifier on the data
  // and record the error and output responses of the weak classifiers
  std::vector< double > errors(0);
  std::vector< std::vector<bool> > responses(0);
  std::vector< clsfy_classifier_1d* > classifiers(0);

  wrapper.reset();
  for (unsigned int i=0; i<d; ++i )
  {
    const vnl_vector<double>& vec= wrapper.current();
    double error= weak_builder_->build(*c1d, vec, wts, outputs);

    std::vector<bool> resp_vec(n);
    // now get responses
    for (unsigned long k=0; k<n;++k)
    {
      unsigned int r= c1d->classify( vec(k) );
      if (r==0)
        resp_vec[k]=false;
      else
        resp_vec[k]=true;
    }

    responses.push_back( resp_vec );
    errors.push_back( error );
    classifiers.push_back( c1d->clone() );

    wrapper.next();
  }

  delete c1d;


  // now use the outputs and errors to define a strong classifier

  // create a sorted index of the errors
  std::vector<int> index;
  mbl_index_sort(errors, index);


  // repeatedly pick the best remaining classifier and store its index,
  // weight and threshold, while the list of candidate indices is not empty
  strong_classifier.clear();
  strong_classifier.set_n_dims(d);


  for (int k=0; k<max_n_clfrs_; ++k)
  {
    if (index.empty())
      break;

    // store best classifier that is left in list
    int ind= index[0];
    std::cout<<"ind= "<<ind<<", errors["<<ind<<"]= "<<errors[ind]<<'\n';
    if (errors[ind]> 0.5 ) break;

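    // Weight the chosen weak classifier by the inverse of its training error;
    // a zero-error classifier is given weight 1.0 to avoid division by zero.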
    if (errors[ind]==0)
      strong_classifier.add_one_classifier( classifiers[ind], 1.0, ind);
    else
      strong_classifier.add_one_classifier( classifiers[ind], 1.0/errors[ind], ind);

    if (calc_all_thresholds_)
    {
      // calculating response from classifier so far
      // and using this to calc min_error threshold
      double t=calc_threshold( strong_classifier, inputs, outputs );
      strong_classifier.add_one_threshold(t);
    }
    else
    {
      // add dummy threshold, ie only calc properly at end
      strong_classifier.add_one_threshold(0.0);
    }

    if (errors[ind]==0) break;

    // discard the remaining classifiers whose responses are too similar to
    // the selected classifier (proportion of identical responses >= prop_)
    std::vector<int> new_index(0);
    std::vector<bool>& i_vec=responses[ind];
    unsigned int m=index.size();
    unsigned int n_rejects=0;
    for (unsigned int j=0; j<m; ++j)
    {
      std::vector<bool>& j_vec=responses[ index[j] ];
      double prop_same= calc_prop_same(i_vec,j_vec);
      //std::cout<<"prop_same= "<<prop_same<<", prop_= "<<prop_<<'\n';
      if ( prop_same < prop_ )
        new_index.push_back( index[j] );
      else
        ++n_rejects;
    }

    std::cout<<"number of rejects due to similarity= "<<n_rejects<<std::endl;

    //for (int p=0; p<new_index.size(); ++p)
    //  std::cout<<"new_index["<<p<<"]= "<<new_index[p]<<std::endl;

    index= new_index;

    //for (int p=0; p<index.size(); ++p)
    //  std::cout<<"index["<<p<<"]= "<<index[p]<<std::endl;
  }
  for (auto & classifier : classifiers)
    delete classifier;

  // calculate the response of the final classifier
  // and use this to calc the min_error threshold
  double t=calc_threshold( strong_classifier, inputs, outputs );
  strong_classifier.add_final_threshold(t);

  // does clsfy_test_error balk if have too much data?
  // should be OK because just passes mbl_data_wrapper and evaluates
  // one at a time, so if using mbl_file_data_wrapper should be OK!
  std::cout<<"calculating training error\n";
  return clsfy_test_error(strong_classifier, inputs, outputs);
}


//: Create empty classifier
// Caller is responsible for deletion
clsfy_classifier_base* clsfy_direct_boost_builder::new_classifier() const
{
  return new clsfy_direct_boost();
}


//=======================================================================

#if 0

// required if data stored on the heap is present in this derived class
clsfy_direct_boost_builder::clsfy_direct_boost_builder(const clsfy_direct_boost_builder& new_b):
  data_ptr_(0)
{
  *this = new_b;
}

//=======================================================================

// required if data stored on the heap is present in this derived class
clsfy_direct_boost_builder& clsfy_direct_boost_builder::operator=(const clsfy_direct_boost_builder& new_b)
{
  if (&new_b==this) return *this;

  // Copy heap member variables.
  delete data_ptr_; data_ptr_=0;

  if (new_b.data_ptr_)
    data_ptr_ = new_b.data_ptr_->clone();

  // Copy normal member variables
  data_ = new_b.data_;

  return *this;
}

#endif // 0


//=======================================================================

clsfy_builder_base* clsfy_direct_boost_builder::clone() const
{
  return new clsfy_direct_boost_builder(*this);
}

//=======================================================================

// required if data is present in this base class
void clsfy_direct_boost_builder::print_summary(std::ostream& /*os*/) const
{
#if 0
  clsfy_builder_base::print_summary(os); // Uncomment this line if it has one.
  vsl_print_summary(os, data_); // Example of data output
#endif

  std::cerr << "clsfy_direct_boost_builder::print_summary() NYI\n";
}

//=======================================================================

// required if data is present in this base class
void clsfy_direct_boost_builder::b_write(vsl_b_ostream& /*bfs*/) const
{
#if 0
  vsl_b_write(bfs, version_no());
  clsfy_builder_base::b_write(bfs);  // Needed if base has any data
  vsl_b_write(bfs, data_);
#endif
  std::cerr << "clsfy_direct_boost_builder::b_write() NYI\n";
}

//=======================================================================

// required if data is present in this base class
void clsfy_direct_boost_builder::b_read(vsl_b_istream& /*bfs*/)
{
  std::cerr << "clsfy_direct_boost_builder::b_read() NYI\n";
#if 0
  if (!bfs) return;

  short version;
  vsl_b_read(bfs,version);
  switch (version)
  {
   case 1:
    //clsfy_builder_base::b_read(bfs);  // Needed if base has any data
    vsl_b_read(bfs,data_);
    break;
   default:
    std::cerr << "I/O ERROR: vsl_b_read(vsl_b_istream&, clsfy_direct_boost_builder&)\n"
             << "           Unknown version number "<< version << '\n';
    bfs.is().clear(std::ios::badbit); // Set an unrecoverable IO error on stream
    return;
  }
#endif // 0
}