1 // This is mul/clsfy/clsfy_adaboost_sorted_builder.cxx
2 //:
3 // \file
4 // \brief Functions to train classifiers using AdaBoost algorithm
5 // \author dac
6 // \date Fri Mar 1 23:49:39 2002
7 //
8 // Functions to train classifiers using AdaBoost algorithm
9 // AdaBoost combines a set of (usually simple, weak) classifiers into
10 // a more powerful single classifier. Essentially it selects the
11 // classifiers one at a time, choosing the best at each step.
12 // The classifiers are trained to distinguish the examples mis-classified
13 // by the currently selected classifiers.
14
15 #include <iostream>
16 #include <cstdlib>
17 #include <cmath>
18 #include <ctime>
19 #include <algorithm>
20 #include "clsfy_adaboost_sorted_builder.h"
21 #include "clsfy_simple_adaboost.h"
22 #include "clsfy_builder_1d.h"
23
24 #ifdef _MSC_VER
25 # include "vcl_msvc_warnings.h"
26 #endif
27 #include <cassert>
28 #include "vbl/vbl_triple.h"
29 #include <mbl/mbl_file_data_collector.h>
30 #include <mbl/mbl_data_collector_list.h>
31
32 //=======================================================================
33
//: Default constructor.
// All configuration (weak builder, max classifier count, batch size, ...)
// is supplied afterwards via the set_*() members; no resources are acquired here.
clsfy_adaboost_sorted_builder::clsfy_adaboost_sorted_builder()

= default;
37
38 //=======================================================================
39
40 clsfy_adaboost_sorted_builder::~clsfy_adaboost_sorted_builder() = default;
41
42
43 //=======================================================================
44
is_class(std::string const & s) const45 bool clsfy_adaboost_sorted_builder::is_class(std::string const& s) const
46 {
47 return s == clsfy_adaboost_sorted_builder::is_a() || clsfy_builder_base::is_class(s);
48 }
49
50 //=======================================================================
51
is_a() const52 std::string clsfy_adaboost_sorted_builder::is_a() const
53 {
54 return std::string("clsfy_adaboost_sorted_builder");
55 }
56
57
58 //: Build classifier composed of 1d classifiers working on individual vector elements
59 // Builds an n-component classifier, each component of which is a 1D classifier
60 // working on a single element of the input vector.
build(clsfy_classifier_base & model,mbl_data_wrapper<vnl_vector<double>> & inputs,unsigned,const std::vector<unsigned> & outputs) const61 double clsfy_adaboost_sorted_builder::build(clsfy_classifier_base& model,
62 mbl_data_wrapper<vnl_vector<double> >& inputs,
63 unsigned /* nClasses */,
64 const std::vector<unsigned> &outputs) const
65 {
66 // N.B. ignore nClasses=1, i.e. always binary classifier
67
68 assert( model.is_class("clsfy_simple_adaboost") );
69 auto &strong_classifier = (clsfy_simple_adaboost&) model;
70
71
72 // check parameters are OK
73 if ( max_n_clfrs_ < 0 )
74 {
75 std::cout<<"Error: clsfy_adaboost_sorted_builder::build\n"
76 <<"max_n_clfrs_ = "<<max_n_clfrs_<<" ie < 0\n"
77 <<"set using set_max_n_clfrs()\n";
78 std::abort();
79 }
80 else
81 {
82 std::cout<<"Maximum number of classifiers to be found by Adaboost = "
83 <<max_n_clfrs_<<'\n';
84 }
85
86 if ( weak_builder_ == nullptr )
87 {
88 std::cout<<"Error: clsfy_adaboost_sorted_builder::build\n"
89 <<"weak_builder_ pointer has not been set\n"
90 <<"need to provide a builder to build each weak classifier\n"
91 <<"set using set_weak_builder()\n";
92 std::abort();
93 }
94 else
95 {
96 std::cout<<"Weak learner used by AdaBoost = "
97 <<weak_builder_->is_a()<<'\n';
98 }
99
100 if ( bs_ < 0 )
101 {
102 std::cout<<"Error: clsfy_adaboost_sorted_builder::build\n"
103 <<"bs_ = "<<bs_<<" ie < 0\n"
104 <<"set using set_batch_size()\n";
105 std::abort();
106 }
107 else
108 {
109 std::cout<<"Batch size when sorting data =" <<bs_<<'\n';
110 }
111
112
113 assert(bs_>0);
114 assert(bs_!=1);
115 assert (max_n_clfrs_ >= 0);
116
117 // first arrange the data in the form
118 // std::vector< < std::vector< vtl_triple<double,int,int> > > > data
119 // + vnl_vector wts
120 // then sort all data once, then build the classifier
121
122 // number of examples
123 unsigned n= inputs.size();
124 //std::cout<<"n= "<<n<<'\n';
125
126 // Dimensionality of data
127 inputs.reset();
128 int d = inputs.current().size();
129
130 //need file data wrapper instead of old vector
131 //data stored on disk NOT ram
132 //std::vector< std::vector<vbl_triple<double,int,int> > > data(d);
133
134 std::string temp_path= "temp.dat";
135 mbl_file_data_collector<
136 std::vector< vbl_triple<double,int,int> >
137 >
138 file_collector( temp_path );
139
140 mbl_data_collector_list<
141 std::vector< vbl_triple<double,int,int> >
142 >
143 ram_collector;
144
145 mbl_data_collector<std::vector< vbl_triple<double,int,int> >
146 >* collector;
147
148 if (save_data_to_disk_)
149 {
150 std::cout<<"saving data to disk!\n";
151 collector= &file_collector;
152 }
153 else
154 {
155 //bs_ = n ;
156 std::cout<<"saving data to ram!\n";
157 collector= &ram_collector;
158 }
159
160 // perhaps change this so load and sort several vectors at once!!
161 // far too slow at present
162 // say load in and sort 100 at once?????
163 // i.e. 100 features at once!
164
165 //int bs= 100; //batch size
166 std::vector< std::vector< vbl_triple<double,int,int> > >vec(bs_);
167 vbl_triple<double,int,int> t;
168
169 std::cout<<"d= "<<d<<'\n';
170 int b=0;
171 while ( b+1<d )
172 {
173 int r= std::min ( bs_, (d-b) );
174 assert(r>0);
175
176 std::cout<<"sorting weak classifiers = "<<b<<" to "
177 <<(b+r)-1<<" of "<<d<<'\n';
178
179
180 // have to resize all vectors
181 for (int i=0; i< bs_; ++i)
182 vec[i].resize(0);
183
184 // add data for both classes
185 inputs.reset();
186 for (unsigned int j=0;j<n;++j)
187 {
188 for (int i=0; i< r; ++i)
189 {
190 t.first=inputs.current()[b+i];
191 t.second=outputs[j];
192 t.third = j;
193 vec[i].push_back(t);
194 }
195 inputs.next();
196 }
197
198
199 for (int i=0; i< r; ++i)
200 {
201 // sort training data for each individual weak classifier
202 assert (vec[i].size() == n);
203 assert (n != 0);
204
205 std::sort(vec[i].begin(), vec[i].end() );
206
207 // store sorted vector of responses for each individual weak classifier
208 collector->record(vec[i]);
209 }
210
211 b+=bs_;
212 }
213
214
215 mbl_data_wrapper< std::vector< vbl_triple<double,int,int> > >&
216 wrapper=collector->data_wrapper();
217
218
219 // now actually apply AdaBoost algorithm
220 wrapper.reset();
221 assert ( wrapper.current().size() == n );
222 assert ( d == (int)wrapper.size() );
223
224
225 // initialize weights
226 unsigned n0=0;
227 unsigned n1=0;
228 for (unsigned int i=0;i<n;++i)
229 {
230 if ( outputs[i] == 0 )
231 n0++;
232 else if ( outputs[i] == 1 )
233 n1++;
234 else
235 {
236 std::cout<<"Error : clsfy_adaboost_sorted_builder\n"
237 <<"unrecognised output value : outputs["<<i<<"]= "
238 <<outputs[i]<<'\n';
239 std::abort();
240 }
241 }
242
243 assert (n0+n1==n );
244
245 vnl_vector<double> wts(n);
246 for (unsigned int i=0; i<n; ++i)
247 {
248 if ( outputs[i] == 0 )
249 wts(i)=0.5/n0;
250 else if ( outputs[i] == 1 )
251 wts(i)=0.5/n1;
252 else
253 {
254 std::cout<<"Error : clsfy_adaboost_sorted_builder\n"
255 <<"unrecognised output value : outputs["<<i<<"]= "
256 <<outputs[i]<<'\n';
257 std::abort();
258 }
259 }
260
261
262 // clear classifier
263 // N.B. maybe shouldn't do this if going to build incrementally
264 // i.e. by rebuilding the training set from false positives of the
265 // current classifier
266 strong_classifier.clear();
267 strong_classifier.set_n_dims(d);
268 // N.B. have to set builder as a member variable elsewhere
269 clsfy_classifier_1d* c1d = weak_builder_->new_classifier();
270 clsfy_classifier_1d* best_c1d= weak_builder_->new_classifier();
271
272 long old_time = std::clock();
273 double tot_time=0;
274
275 for (unsigned int r=0;r<(unsigned)max_n_clfrs_;++r)
276 {
277 std::cout<<"adaboost training round = "<<r<<'\n';
278
279 //std::cout<<"wts0= "<<wts0<<"\nwts1= "<<wts1<<'\n';
280
281 int best_i=-1;
282 double min_error= 100000;
283 wrapper.reset(); // make sure pointing to first data vector
284 for (int i=0;i<d;++i)
285 {
286 const std::vector< vbl_triple<double,int,int> >& vec = wrapper.current();
287
288 double error = weak_builder_->build_from_sorted_data(*c1d,&vec[0],wts);
289 if (i==0 || error<min_error)
290 {
291 min_error = error;
292 delete best_c1d;
293 best_c1d= c1d->clone();
294 best_i = i;
295 }
296
297 wrapper.next(); // move to next data vector
298 }
299
300 assert(best_i != -1);
301
302 std::cout<<"best_i= "<<best_i<<'\n'
303 <<"min_error= "<<min_error<<'\n';
304
305 if (min_error<1e-10) // Hooray!
306 {
307 std::cout<<"min_error<1e-10 !!!\n";
308 double alpha = std::log(2.0*n); //is this appropriate???
309 strong_classifier.add_classifier( best_c1d, alpha, best_i);
310
311 // delete classifiers on heap, because clones taken by strong_classifier
312 delete c1d;
313 delete best_c1d;
314 return clsfy_test_error(strong_classifier, inputs, outputs);
315 }
316
317
318 else if (0.5-min_error<1e-10) // Oh dear, no further improvement possible
319 {
320 std::cout<<"min_error => 0.5 !!!\n";
321
322 // delete classifiers on heap, because clones taken by strong_classifier
323 delete c1d;
324 delete best_c1d;
325 return clsfy_test_error(strong_classifier, inputs, outputs);
326 }
327
328 else { // update the classifier
329 double beta = min_error/(1.0-min_error);
330 double alpha = -1.0*std::log(beta);
331 strong_classifier.add_classifier( best_c1d, alpha, best_i);
332
333 // extract the best weak classifier results
334 wrapper.set_index(best_i);
335 const std::vector< vbl_triple<double,int,int> >& vec = wrapper.current();
336
337 // update the wts using the best weak classifier
338 for (unsigned int j=0; j<n; ++j)
339 if ( best_c1d-> classify( vec[j].first ) == (unsigned) vec[j].second)
340 wts[vec[j].third]*=beta;
341 // no return yet!
342 }
343
344 double w_sum= wts.mean()*n;
345 wts/=w_sum;
346
347 long new_time = std::clock();
348
349 double dt = (1.0*(new_time-old_time))/CLOCKS_PER_SEC;
350 std::cout<<"Time for AdaBoost round: "<<dt<<" secs\n";
351 tot_time+=dt;
352 std::cout<<"Total time for rounds so far: "<<tot_time<<" secs\n";
353
354 old_time = new_time;
355 }
356
357 delete c1d;
358 delete best_c1d;
359
360 // does clsfy_test_error balk if have too much data?
361 // should be OK because just passes mbl_data_wrapper and evaluates
362 // one at a time, so if using mbl_file_data_wrapper should be OK!
363 std::cout<<"calculating training error\n";
364 return clsfy_test_error(strong_classifier, inputs, outputs);
365 }
366
367
368 //: Create empty classifier
369 // Caller is responsible for deletion
new_classifier() const370 clsfy_classifier_base* clsfy_adaboost_sorted_builder::new_classifier() const
371 {
372 return new clsfy_simple_adaboost();
373 }
374
375
376 //=======================================================================
377
378 #if 0
379 // required if data stored on the heap is present in this derived class
380 clsfy_adaboost_sorted_builder::clsfy_adaboost_sorted_builder(const clsfy_adaboost_sorted_builder& new_b):
381 data_ptr_(0)
382 {
383 *this = new_b;
384 }
385
386
387 //=======================================================================
388
389 // required if data stored on the heap is present in this derived class
390 clsfy_adaboost_sorted_builder& clsfy_adaboost_sorted_builder::operator=(const clsfy_adaboost_sorted_builder& new_b)
391 {
392 if (&new_b==this) return *this;
393
394 // Copy heap member variables.
395 delete data_ptr_; data_ptr_=0;
396
397 if (new_b.data_ptr_)
398 data_ptr_ = new_b.data_ptr_->clone();
399
400 // Copy normal member variables
401 data_ = new_b.data_;
402
403 return *this;
404 }
405 #endif // 0
406
407 //=======================================================================
408
clone() const409 clsfy_builder_base* clsfy_adaboost_sorted_builder::clone() const
410 {
411 return new clsfy_adaboost_sorted_builder(*this);
412 }
413
414 //=======================================================================
415
416 // required if data is present in this base class
//: Print a summary of the builder's state to os.
// Currently a stub: nothing is written to os, only an NYI warning to std::cerr.
// The disabled code below sketches the intended implementation.
void clsfy_adaboost_sorted_builder::print_summary(std::ostream& /*os*/) const
{
#if 0
  clsfy_builder_base::print_summary(os); // Uncomment this line if it has one.
  vsl_print_summary(os, data_); // Example of data output
#endif

  std::cerr << "clsfy_adaboost_sorted_builder::print_summary() NYI\n";
}
426
427 //=======================================================================
428
429 // required if data is present in this base class
//: Binary save to stream.
// Currently a stub: nothing is written to bfs, only an NYI warning to
// std::cerr.  The disabled code below sketches the intended implementation.
void clsfy_adaboost_sorted_builder::b_write(vsl_b_ostream& /*bfs*/) const
{
#if 0
  vsl_b_write(bfs, version_no());
  clsfy_builder_base::b_write(bfs);  // Needed if base has any data
  vsl_b_write(bfs, data_);
#endif
  std::cerr << "clsfy_adaboost_sorted_builder::b_write() NYI\n";
}
439
440 //=======================================================================
441
442 // required if data is present in this base class
//: Binary load from stream.
// Currently a stub: bfs is not read, only an NYI warning goes to std::cerr.
// The disabled code below sketches the intended versioned read.
void clsfy_adaboost_sorted_builder::b_read(vsl_b_istream& /*bfs*/)
{
  std::cerr << "clsfy_adaboost_sorted_builder::b_read() NYI\n";
#if 0
  if (!bfs) return;

  short version;
  vsl_b_read(bfs,version);
  switch (version)
  {
    case 1:
      //clsfy_builder_base::b_read(bfs);    // Needed if base has any data
      vsl_b_read(bfs,data_);
      break;
    default:
      std::cerr << "I/O ERROR: vsl_b_read(vsl_b_istream&, clsfy_adaboost_sorted_builder&)\n"
               << "           Unknown version number "<< version << '\n';
      bfs.is().clear(std::ios::badbit); // Set an unrecoverable IO error on stream
      return;
  }
#endif // 0
}
465