// This is mul/clsfy/clsfy_adaboost_sorted_builder.cxx
//:
// \file
// \brief Functions to train classifiers using AdaBoost algorithm
// \author dac
// \date   Fri Mar  1 23:49:39 2002
//
//  Functions to train classifiers using AdaBoost algorithm
//  AdaBoost combines a set of (usually simple, weak) classifiers into
//  a more powerful single classifier.  Essentially it selects the
//  classifiers one at a time, choosing the best at each step.
//  The classifiers are trained to distinguish the examples mis-classified
//  by the currently selected classifiers.
14 
15 #include <iostream>
16 #include <cstdlib>
17 #include <cmath>
18 #include <ctime>
19 #include <algorithm>
20 #include "clsfy_adaboost_sorted_builder.h"
21 #include "clsfy_simple_adaboost.h"
22 #include "clsfy_builder_1d.h"
23 
24 #ifdef _MSC_VER
25 #  include "vcl_msvc_warnings.h"
26 #endif
27 #include <cassert>
28 #include "vbl/vbl_triple.h"
29 #include <mbl/mbl_file_data_collector.h>
30 #include <mbl/mbl_data_collector_list.h>
31 
32 //=======================================================================
33 
34 clsfy_adaboost_sorted_builder::clsfy_adaboost_sorted_builder()
35 
36     = default;
37 
38 //=======================================================================
39 
40 clsfy_adaboost_sorted_builder::~clsfy_adaboost_sorted_builder() = default;
41 
42 
43 //=======================================================================
44 
is_class(std::string const & s) const45 bool clsfy_adaboost_sorted_builder::is_class(std::string const& s) const
46 {
47   return s == clsfy_adaboost_sorted_builder::is_a() || clsfy_builder_base::is_class(s);
48 }
49 
50 //=======================================================================
51 
is_a() const52 std::string clsfy_adaboost_sorted_builder::is_a() const
53 {
54   return std::string("clsfy_adaboost_sorted_builder");
55 }
56 
57 
58 //: Build classifier composed of 1d classifiers working on individual vector elements
59 // Builds an n-component classifier, each component of which is a 1D classifier
60 // working on a single element of the input vector.
build(clsfy_classifier_base & model,mbl_data_wrapper<vnl_vector<double>> & inputs,unsigned,const std::vector<unsigned> & outputs) const61 double clsfy_adaboost_sorted_builder::build(clsfy_classifier_base& model,
62                                             mbl_data_wrapper<vnl_vector<double> >& inputs,
63                                             unsigned /* nClasses */,
64                                             const std::vector<unsigned> &outputs) const
65 {
66   // N.B. ignore nClasses=1, i.e. always binary classifier
67 
68   assert( model.is_class("clsfy_simple_adaboost") );
69   auto &strong_classifier = (clsfy_simple_adaboost&) model;
70 
71 
72   // check parameters are OK
73   if ( max_n_clfrs_ < 0 )
74   {
75     std::cout<<"Error: clsfy_adaboost_sorted_builder::build\n"
76             <<"max_n_clfrs_ = "<<max_n_clfrs_<<" ie < 0\n"
77             <<"set using set_max_n_clfrs()\n";
78     std::abort();
79   }
80   else
81   {
82     std::cout<<"Maximum number of classifiers to be found by Adaboost = "
83             <<max_n_clfrs_<<'\n';
84   }
85 
86   if ( weak_builder_ == nullptr )
87   {
88     std::cout<<"Error: clsfy_adaboost_sorted_builder::build\n"
89             <<"weak_builder_ pointer has not been set\n"
90             <<"need to provide a builder to build each weak classifier\n"
91             <<"set using set_weak_builder()\n";
92     std::abort();
93   }
94   else
95   {
96     std::cout<<"Weak learner used by AdaBoost = "
97             <<weak_builder_->is_a()<<'\n';
98   }
99 
100   if ( bs_ < 0 )
101   {
102     std::cout<<"Error: clsfy_adaboost_sorted_builder::build\n"
103             <<"bs_ = "<<bs_<<" ie < 0\n"
104             <<"set using set_batch_size()\n";
105     std::abort();
106   }
107   else
108   {
109     std::cout<<"Batch size when sorting data =" <<bs_<<'\n';
110   }
111 
112 
113   assert(bs_>0);
114   assert(bs_!=1);
115   assert (max_n_clfrs_ >= 0);
116 
117   // first arrange the data in the form
118   // std::vector< < std::vector< vtl_triple<double,int,int> > > > data
119   // + vnl_vector wts
120   // then sort all data once, then build the classifier
121 
122   // number of examples
123   unsigned n= inputs.size();
124   //std::cout<<"n= "<<n<<'\n';
125 
126   // Dimensionality of data
127   inputs.reset();
128   int d = inputs.current().size();
129 
130   //need file data wrapper instead of old vector
131   //data stored on disk NOT ram
132   //std::vector< std::vector<vbl_triple<double,int,int> > > data(d);
133 
134   std::string temp_path= "temp.dat";
135   mbl_file_data_collector<
136       std::vector< vbl_triple<double,int,int> >
137                          >
138               file_collector( temp_path );
139 
140   mbl_data_collector_list<
141       std::vector< vbl_triple<double,int,int> >
142                          >
143               ram_collector;
144 
145   mbl_data_collector<std::vector< vbl_triple<double,int,int> >
146                          >*  collector;
147 
148   if (save_data_to_disk_)
149   {
150     std::cout<<"saving data to disk!\n";
151     collector= &file_collector;
152   }
153   else
154   {
155     //bs_ = n ;
156     std::cout<<"saving data to ram!\n";
157     collector= &ram_collector;
158   }
159 
160   // perhaps change this so load and sort several vectors at once!!
161   // far too slow at present
162   // say load in and sort 100 at once?????
163   // i.e. 100 features at once!
164 
165   //int bs= 100; //batch size
166   std::vector< std::vector< vbl_triple<double,int,int> > >vec(bs_);
167   vbl_triple<double,int,int> t;
168 
169   std::cout<<"d= "<<d<<'\n';
170   int b=0;
171   while ( b+1<d )
172   {
173     int r= std::min ( bs_, (d-b) );
174     assert(r>0);
175 
176     std::cout<<"sorting weak classifiers = "<<b<<" to "
177             <<(b+r)-1<<" of "<<d<<'\n';
178 
179 
180     // have to resize all vectors
181     for (int i=0; i< bs_; ++i)
182       vec[i].resize(0);
183 
184     // add data for both classes
185     inputs.reset();
186     for (unsigned int j=0;j<n;++j)
187     {
188       for (int i=0; i< r; ++i)
189       {
190         t.first=inputs.current()[b+i];
191         t.second=outputs[j];
192         t.third = j;
193         vec[i].push_back(t);
194       }
195       inputs.next();
196     }
197 
198 
199     for (int i=0; i< r; ++i)
200     {
201       // sort training data for each individual weak classifier
202       assert (vec[i].size() == n);
203       assert (n != 0);
204 
205       std::sort(vec[i].begin(), vec[i].end() );
206 
207       // store sorted vector of responses for each individual weak classifier
208       collector->record(vec[i]);
209     }
210 
211     b+=bs_;
212   }
213 
214 
215   mbl_data_wrapper< std::vector< vbl_triple<double,int,int> > >&
216               wrapper=collector->data_wrapper();
217 
218 
219   // now actually apply AdaBoost algorithm
220   wrapper.reset();
221   assert ( wrapper.current().size() == n );
222   assert ( d == (int)wrapper.size() );
223 
224 
225   // initialize weights
226   unsigned n0=0;
227   unsigned n1=0;
228   for (unsigned int i=0;i<n;++i)
229   {
230     if ( outputs[i] == 0 )
231       n0++;
232     else if ( outputs[i] == 1 )
233       n1++;
234     else
235     {
236       std::cout<<"Error : clsfy_adaboost_sorted_builder\n"
237               <<"unrecognised output value : outputs["<<i<<"]= "
238               <<outputs[i]<<'\n';
239       std::abort();
240     }
241   }
242 
243   assert (n0+n1==n );
244 
245   vnl_vector<double> wts(n);
246   for (unsigned int i=0; i<n; ++i)
247   {
248     if ( outputs[i] == 0 )
249       wts(i)=0.5/n0;
250     else if ( outputs[i] == 1 )
251       wts(i)=0.5/n1;
252     else
253     {
254       std::cout<<"Error : clsfy_adaboost_sorted_builder\n"
255               <<"unrecognised output value : outputs["<<i<<"]= "
256               <<outputs[i]<<'\n';
257       std::abort();
258     }
259   }
260 
261 
262   // clear classifier
263   // N.B. maybe shouldn't do this if going to build incrementally
264   // i.e. by rebuilding the training set from false positives of the
265   // current classifier
266   strong_classifier.clear();
267   strong_classifier.set_n_dims(d);
268   // N.B. have to set builder as a member variable elsewhere
269   clsfy_classifier_1d* c1d = weak_builder_->new_classifier();
270   clsfy_classifier_1d* best_c1d= weak_builder_->new_classifier();
271 
272   long old_time = std::clock();
273   double tot_time=0;
274 
275   for (unsigned int r=0;r<(unsigned)max_n_clfrs_;++r)
276   {
277     std::cout<<"adaboost training round = "<<r<<'\n';
278 
279     //std::cout<<"wts0= "<<wts0<<"\nwts1= "<<wts1<<'\n';
280 
281     int best_i=-1;
282     double min_error= 100000;
283     wrapper.reset();  // make sure pointing to first data vector
284     for (int i=0;i<d;++i)
285     {
286       const std::vector< vbl_triple<double,int,int> >& vec = wrapper.current();
287 
288       double error = weak_builder_->build_from_sorted_data(*c1d,&vec[0],wts);
289       if (i==0 || error<min_error)
290       {
291         min_error = error;
292         delete best_c1d;
293         best_c1d= c1d->clone();
294         best_i = i;
295       }
296 
297       wrapper.next();   // move to next data vector
298     }
299 
300     assert(best_i != -1);
301 
302     std::cout<<"best_i= "<<best_i<<'\n'
303             <<"min_error= "<<min_error<<'\n';
304 
305     if (min_error<1e-10)  // Hooray!
306     {
307       std::cout<<"min_error<1e-10 !!!\n";
308       double alpha  = std::log(2.0*n);   //is this appropriate???
309       strong_classifier.add_classifier( best_c1d, alpha, best_i);
310 
311       // delete classifiers on heap, because clones taken by strong_classifier
312       delete c1d;
313       delete best_c1d;
314       return clsfy_test_error(strong_classifier, inputs, outputs);
315     }
316 
317 
318     else if (0.5-min_error<1e-10) // Oh dear, no further improvement possible
319     {
320       std::cout<<"min_error => 0.5 !!!\n";
321 
322       // delete classifiers on heap, because clones taken by strong_classifier
323       delete c1d;
324       delete best_c1d;
325       return clsfy_test_error(strong_classifier, inputs, outputs);
326     }
327 
328     else { // update the classifier
329       double beta = min_error/(1.0-min_error);
330       double alpha  = -1.0*std::log(beta);
331       strong_classifier.add_classifier( best_c1d, alpha, best_i);
332 
333       // extract the best weak classifier results
334       wrapper.set_index(best_i);
335       const std::vector< vbl_triple<double,int,int> >& vec = wrapper.current();
336 
337       // update the wts using the best weak classifier
338       for (unsigned int j=0; j<n; ++j)
339         if ( best_c1d-> classify( vec[j].first ) == (unsigned) vec[j].second)
340           wts[vec[j].third]*=beta;
341       // no return yet!
342     }
343 
344     double w_sum= wts.mean()*n;
345     wts/=w_sum;
346 
347     long new_time = std::clock();
348 
349     double dt = (1.0*(new_time-old_time))/CLOCKS_PER_SEC;
350     std::cout<<"Time for AdaBoost round:      "<<dt<<" secs\n";
351     tot_time+=dt;
352     std::cout<<"Total time for rounds so far: "<<tot_time<<" secs\n";
353 
354     old_time = new_time;
355   }
356 
357   delete c1d;
358   delete best_c1d;
359 
360   // does clsfy_test_error balk if have too much data?
361   // should be OK because just passes mbl_data_wrapper and evaluates
362   // one at a time, so if using mbl_file_data_wrapper should be OK!
363   std::cout<<"calculating training error\n";
364   return clsfy_test_error(strong_classifier, inputs, outputs);
365 }
366 
367 
368 //: Create empty classifier
369 // Caller is responsible for deletion
new_classifier() const370 clsfy_classifier_base* clsfy_adaboost_sorted_builder::new_classifier() const
371 {
372   return new clsfy_simple_adaboost();
373 }
374 
375 
376 //=======================================================================
377 
378 #if 0
379     // required if data stored on the heap is present in this derived class
380 clsfy_adaboost_sorted_builder::clsfy_adaboost_sorted_builder(const clsfy_adaboost_sorted_builder& new_b):
381   data_ptr_(0)
382 {
383   *this = new_b;
384 }
385 
386 
387 //=======================================================================
388 
389     // required if data stored on the heap is present in this derived class
390 clsfy_adaboost_sorted_builder& clsfy_adaboost_sorted_builder::operator=(const clsfy_adaboost_sorted_builder& new_b)
391 {
392   if (&new_b==this) return *this;
393 
394   // Copy heap member variables.
395   delete data_ptr_; data_ptr_=0;
396 
397   if (new_b.data_ptr_)
398     data_ptr_ = new_b.data_ptr_->clone();
399 
400   // Copy normal member variables
401   data_ = new_b.data_;
402 
403   return *this;
404 }
405 #endif // 0
406 
407 //=======================================================================
408 
clone() const409 clsfy_builder_base* clsfy_adaboost_sorted_builder::clone() const
410 {
411   return new clsfy_adaboost_sorted_builder(*this);
412 }
413 
414 //=======================================================================
415 
416     // required if data is present in this base class
print_summary(std::ostream &) const417 void clsfy_adaboost_sorted_builder::print_summary(std::ostream& /*os*/) const
418 {
419 #if 0
420   clsfy_builder_base::print_summary(os); // Uncomment this line if it has one.
421   vsl_print_summary(os, data_); // Example of data output
422 #endif
423 
424   std::cerr << "clsfy_adaboost_sorted_builder::print_summary() NYI\n";
425 }
426 
427 //=======================================================================
428 
429   // required if data is present in this base class
b_write(vsl_b_ostream &) const430 void clsfy_adaboost_sorted_builder::b_write(vsl_b_ostream& /*bfs*/) const
431 {
432 #if 0
433   vsl_b_write(bfs, version_no());
434   clsfy_builder_base::b_write(bfs);  // Needed if base has any data
435   vsl_b_write(bfs, data_);
436 #endif
437   std::cerr << "clsfy_adaboost_sorted_builder::b_write() NYI\n";
438 }
439 
440 //=======================================================================
441 
442   // required if data is present in this base class
b_read(vsl_b_istream &)443 void clsfy_adaboost_sorted_builder::b_read(vsl_b_istream& /*bfs*/)
444 {
445   std::cerr << "clsfy_adaboost_sorted_builder::b_read() NYI\n";
446 #if 0
447   if (!bfs) return;
448 
449   short version;
450   vsl_b_read(bfs,version);
451   switch (version)
452   {
453    case 1:
454     //clsfy_builder_base::b_read(bfs);  // Needed if base has any data
455     vsl_b_read(bfs,data_);
456     break;
457    default:
458     std::cerr << "I/O ERROR: vsl_b_read(vsl_b_istream&, clsfy_adaboost_sorted_builder&)\n"
459              << "           Unknown version number "<< version << '\n';
460     bfs.is().clear(std::ios::badbit); // Set an unrecoverable IO error on stream
461     return;
462   }
463 #endif // 0
464 }
465