1 // This is mul/clsfy/clsfy_direct_boost_builder.cxx
2 //:
3 // \file
// \brief Functions to train classifiers using a direct boosting algorithm
5 // \author dac
6 // \date Fri Mar 1 23:49:39 2002
7 //
8 // Functions to train classifiers using a direct boosting algorithm
9 // DirectBoost combines a set of (usually simple, weak) classifiers into
10 // a more powerful single classifier. Essentially it selects the
11 // classifiers one at a time, choosing the best at each step.
12 // The classifiers are trained to distinguish the examples mis-classified
13 // by the currently selected classifiers.
14
15 #include <iostream>
16 #include <cstdlib>
17 #include <algorithm>
18 #include "clsfy_direct_boost_builder.h"
19 #include "clsfy_direct_boost.h"
20 #include "clsfy_builder_1d.h"
21
22 #ifdef _MSC_VER
23 # include "vcl_msvc_warnings.h"
24 #endif
25 #include <cassert>
26 #include <mbl/mbl_file_data_collector.h>
27 #include <mbl/mbl_data_collector_list.h>
28 #include <mbl/mbl_index_sort.h>
29
30 //=======================================================================
31
//: Constructor (defaulted; members take the initial values declared in the header)
clsfy_direct_boost_builder::clsfy_direct_boost_builder()

= default;
35
36 //=======================================================================
37
//: Destructor (defaulted)
clsfy_direct_boost_builder::~clsfy_direct_boost_builder() = default;
39
40
41 //=======================================================================
42
is_class(std::string const & s) const43 bool clsfy_direct_boost_builder::is_class(std::string const& s) const
44 {
45 return s == clsfy_direct_boost_builder::is_a() || clsfy_builder_base::is_class(s);
46 }
47
48 //=======================================================================
49
is_a() const50 std::string clsfy_direct_boost_builder::is_a() const
51 {
52 return std::string("clsfy_direct_boost_builder");
53 }
54
55
56 //: Calc similarity between two 1d input vectors
calc_prop_same(const std::vector<bool> & vec1,const std::vector<bool> & vec2) const57 double clsfy_direct_boost_builder::calc_prop_same(
58 const std::vector<bool>& vec1,
59 const std::vector<bool>& vec2) const
60 {
61 unsigned n = vec1.size();
62 assert( n==vec2.size() );
63 int sum= 0;
64 for (unsigned i=0;i<n;++i)
65 if (vec1[i]==vec2[i])
66 ++sum;
67
68 return sum*1.0/n;
69 }
70
71
72 //: Calc threshold for current version of strong classifier
calc_threshold(clsfy_direct_boost & strong_classifier,mbl_data_wrapper<vnl_vector<double>> & inputs,const std::vector<unsigned> & outputs) const73 double clsfy_direct_boost_builder::calc_threshold(
74 clsfy_direct_boost& strong_classifier,
75 mbl_data_wrapper<vnl_vector<double> >& inputs,
76 const std::vector<unsigned>& outputs) const
77 {
78 // calc classification score for each example
79 unsigned long n = inputs.size();
80 std::vector<double> scores(n);
81 inputs.reset();
82 for (unsigned long i=0;i<n;++i)
83 {
84 scores[i]= strong_classifier.log_l( inputs.current() );
85 inputs.next();
86 }
87
88 // calc number of negative examples
89 unsigned int tot_pos=0;
90 for (unsigned long i=0;i<n;++i)
91 if ( outputs[i] == 1 ) ++tot_pos;
92
93 // then find threshold that gives min_error over training set
94 std::vector<int> index;
95 mbl_index_sort(scores, index);
96
97 unsigned int n_pos=0;
98 unsigned int n_neg=0;
99 unsigned long min_error= n+1;
100 double min_thresh= -1;
101 for (unsigned long i=0;i<n;++i)
102 {
103 #ifdef DEBUG
104 std::cout<<" scores[ index["<<i<<"] ] = "<< scores[ index[i] ]<<" ; "
105 <<"outputs[ index["<<i<<"] ] = "<<outputs[ index[i] ]<<'\n';
106 #endif
107 if ( outputs[ index[i] ] == 0 ) ++n_neg;
108 else if ( outputs[ index[i] ] == 1 ) ++n_pos;
109 else
110 {
111 std::cout<<"ERROR: clsfy_direct_boost_basic_builder::calc_threshold()\n"
112 <<"Unrecognised output value\n"
113 <<"outputs[ index["<<i<<"] ] = outputs["<<index[i]<<"] = "
114 <<outputs[index[i]]<<'\n';
115 std::abort();
116 }
117
118 #ifdef DEBUG
119 std::cout<<"n = "<<n<<", n_pos= "<<n_pos<<", n_neg= "<<n_neg<<'\n';
120 #endif
121 unsigned int error= n_neg+(tot_pos-n_pos);
122
123 if ( error<= min_error )
124 {
125 min_error= error;
126 min_thresh = scores[ index[i] ] + 0.001 ;
127 #ifdef DEBUG
128 std::cout<<"error= "<<error<<", min_thresh= "<<min_thresh<<'\n';
129 #endif
130 }
131 }
132
133 assert( n_pos + n_neg == n );
134 #ifdef DEBUG
135 std::cout<<"min_error= "<<min_error<<", min_thresh= "<<min_thresh<<'\n';
136 #endif
137
138 return min_thresh;
139 }
140
141
142 //: Build classifier composed of 1d classifiers working on individual vector elements
143 // Builds an n-component classifier, each component of which is a 1D classifier
144 // working on a single element of the input vector.
build(clsfy_classifier_base & model,mbl_data_wrapper<vnl_vector<double>> & inputs,unsigned,const std::vector<unsigned> & outputs) const145 double clsfy_direct_boost_builder::build(clsfy_classifier_base& model,
146 mbl_data_wrapper<vnl_vector<double> >& inputs,
147 unsigned /* nClasses */,
148 const std::vector<unsigned> &outputs) const
149 {
150 // nb ignore nClasses=1, ie always binary classifier
151
152 assert( model.is_class("clsfy_direct_boost") );
153 auto &strong_classifier = (clsfy_direct_boost&) model;
154
155
156 // check parameters are OK
157 if ( max_n_clfrs_ < 0 )
158 {
159 std::cout<<"Error: clsfy_direct_boost_builder::build\n"
160 <<"max_n_clfrs_ = "<<max_n_clfrs_<<" ie < 0\n"
161 <<"set using set_max_n_clfrs()\n";
162 std::abort();
163 }
164 else
165 {
166 std::cout<<"Maximum number of classifiers to be found by Adaboost ="
167 <<max_n_clfrs_<<'\n';
168 }
169
170 if ( weak_builder_ == nullptr )
171 {
172 std::cout<<"Error: clsfy_direct_boost_builder::build\n"
173 <<"weak_builder_ pointer has not been set\n"
174 <<"need to provide a builder to build each weak classifier\n"
175 <<"set using set_weak_builder()\n";
176 std::abort();
177 }
178 else
179 {
180 std::cout<<"Weak learner used by AdaBoost ="
181 <<weak_builder_->is_a()<<'\n';
182 }
183
184 if ( bs_ < 0 )
185 {
186 std::cout<<"Error: clsfy_direct_boost_builder::build\n"
187 <<"bs_ = "<<bs_<<" ie < 0\n"
188 <<"set using set_batch_size()\n";
189 std::abort();
190 }
191 else
192 {
193 std::cout<<"Batch size when sorting data =" <<bs_<<'\n';
194 }
195
196
197 assert(bs_>0);
198 assert(bs_!=1);
199 assert (max_n_clfrs_ >= 0);
200
201 // first arrange the data in the form
202 // std::vector< < std::vector< vbl_triple<double,int,int> > > > data
203 // + vnl_vector wts
204 // then sort all data once, then build the classifier
205
206 // number of examples
207 unsigned long n = inputs.size();
208 //std::cout<<"n = "<<n<<std::endl;
209
210 // Dimensionality of data
211 inputs.reset();
212 unsigned int d = inputs.current().size();
213
214 //need file data wrapper instead of old vector
215 //data stored on disk NOT ram
216 //std::vector< std::vector<vbl_triple<double,int,int> > > data(d);
217
218 std::string temp_path= "temp.dat";
219 mbl_file_data_collector< vnl_vector<double> >
220 file_collector( temp_path );
221
222 mbl_data_collector_list< vnl_vector< double > >
223 ram_collector;
224
225 mbl_data_collector<vnl_vector< double> >* collector;
226
227 if (save_data_to_disk_)
228 {
229 std::cout<<"saving data to disk!\n";
230 collector= &file_collector;
231 }
232 else
233 {
234 //bs_ = n ;
235 std::cout<<"saving data to ram!\n";
236 collector= &ram_collector;
237 }
238
239
240 // say load in and sort 100 at once?????
241 // ie 100 features at once!
242
243 //int bs= 100; //batch size
244 std::vector< vnl_vector< double > >vec(bs_);
245
246 std::cout<<"d= "<<d<<std::endl;
247 unsigned int b=0;
248 while ( b+1<d )
249 {
250 int r= std::min ( bs_, int(d-b) );
251 assert(r>0);
252
253 std::cout<<"arranging weak classifier data = "<<b<<" to "
254 <<(b+r)-1<<" of "<<d<<std::endl;
255
256 // have to resize all vectors
257 for (int i=0; i< bs_; ++i)
258 vec[i].set_size(n);
259
260 // add data for both classes
261 inputs.reset();
262 for (unsigned long j=0;j<n;++j)
263 {
264 for (int i=0; i< r; ++i)
265 vec[i](j)=( inputs.current()[b+i] );
266 inputs.next();
267 }
268
269
270 for (int i=0; i< r; ++i)
271 {
272 // sort training data for each individual weak classifier
273 assert (vec[i].size() == n);
274 assert (n != 0);
275
276 // store sorted vector of responses for each individual weak classifier
277 collector->record(vec[i]);
278 }
279
280 b+=bs_;
281 }
282
283
284 mbl_data_wrapper< vnl_vector<double> >&
285 wrapper=collector->data_wrapper();
286
287
288 // now actually apply direct boost algorithm
289 wrapper.reset();
290 assert ( wrapper.current().size() == n );
291 assert ( d == wrapper.size() );
292
293
294 // nb have to set builder as a member variable elsewhere
295 clsfy_classifier_1d* c1d = weak_builder_->new_classifier();
296
297 // wts not really used!!
298 vnl_vector<double> wts(n,1.0/n);
299
300 // need to train each weak classifier on the data
301 // and record the error and output responses of the weak classifiers
302 std::vector< double > errors(0);
303 std::vector< std::vector<bool> > responses(0);
304 std::vector< clsfy_classifier_1d* > classifiers(0);
305
306 wrapper.reset();
307 for (unsigned int i=0; i<d; ++i )
308 {
309 const vnl_vector<double>& vec= wrapper.current();
310 double error= weak_builder_->build(*c1d, vec, wts, outputs);
311
312 std::vector<bool> resp_vec(n);
313 // now get responses
314 for (unsigned long k=0; k<n;++k)
315 {
316 unsigned int r= c1d->classify( vec(k) );
317 if (r==0)
318 resp_vec[k]=false;
319 else
320 resp_vec[k]=true;
321 }
322
323 responses.push_back( resp_vec );
324 errors.push_back( error );
325 classifiers.push_back( c1d->clone() );
326
327 wrapper.next();
328 }
329
330 delete c1d;
331
332
333 // now use the outputs and errors to define a strong classifier
334
335 // create a sorted index of the errors
336 std::vector<int> index;
337 mbl_index_sort(errors, index);
338
339
340 // need to pick best classifier and store index + mean & variance
341 // whilst list of indices is NOT empty
342 strong_classifier.clear();
343 strong_classifier.set_n_dims(d);
344
345
346 for (int k=0; k<max_n_clfrs_; ++k)
347 {
348 if (index.empty())
349 break;
350
351 // store best classifier that is left in list
352 int ind= index[0];
353 std::cout<<"ind= "<<ind<<", errors["<<ind<<"]= "<<errors[ind]<<'\n';
354 if (errors[ind]> 0.5 ) break;
355
356 if (errors[ind]==0)
357 strong_classifier.add_one_classifier( classifiers[ind], 1.0, ind);
358 else
359 strong_classifier.add_one_classifier( classifiers[ind], 1.0/errors[ind], ind);
360
361 if (calc_all_thresholds_)
362 {
363 // calculating response from classifier so far
364 // and using this to calc min_error threshold
365 double t=calc_threshold( strong_classifier, inputs, outputs );
366 strong_classifier.add_one_threshold(t);
367 }
368 else
369 {
370 // add dummy threshold, ie only calc properly at end
371 strong_classifier.add_one_threshold(0.0);
372 }
373
374 if (errors[ind]==0) break;
375
376 // find all classifiers that are similar to the selected
377 // classifier i
378 std::vector<int> new_index(0);
379 std::vector<bool>& i_vec=responses[ind];
380 unsigned int m=index.size();
381 unsigned int n_rejects=0;
382 for (unsigned int j=0; j<m; ++j)
383 {
384 std::vector<bool>& j_vec=responses[ index[j] ];
385 double prop_same= calc_prop_same(i_vec,j_vec);
386 //std::cout<<"prop_same= "<<prop_same<<", prop_= "<<prop_<<'\n';
387 if ( prop_same < prop_ )
388 new_index.push_back( index[j] );
389 else
390 ++n_rejects;
391 }
392
393 std::cout<<"number of rejects due to similarity= "<<n_rejects<<std::endl;
394
395 //for (int p=0; p<new_index.size(); ++p)
396 // std::cout<<"new_index["<<p<<"]= "<<new_index[p]<<std::endl;
397
398 index= new_index;
399
400 //for (int p=0; p<index.size(); ++p)
401 // std::cout<<"index["<<p<<"]= "<<index[p]<<std::endl;
402 }
403 for (auto & classifier : classifiers)
404 delete classifier;
405
406 // calculating response from classifier so far
407 // and using this to calc min_error threshold
408 double t=calc_threshold( strong_classifier, inputs, outputs );
409 strong_classifier.add_final_threshold(t);
410
411 // does clsfy_test_error balk if have too much data?
412 // should be OK because just passes mbl_data_wrapper and evaluates
413 // one at a time, so if using mbl_file_data_wrapper should be OK!
414 std::cout<<"calculating training error\n";
415 return clsfy_test_error(strong_classifier, inputs, outputs);
416 }
417
418
419 //: Create empty classifier
420 // Caller is responsible for deletion
new_classifier() const421 clsfy_classifier_base* clsfy_direct_boost_builder::new_classifier() const
422 {
423 return new clsfy_direct_boost();
424 }
425
426
427 //=======================================================================
428
429 #if 0
430
431 // required if data stored on the heap is present in this derived class
432 clsfy_direct_boost_builder::clsfy_direct_boost_builder(const clsfy_direct_boost_builder& new_b):
433 data_ptr_(0)
434 {
435 *this = new_b;
436 }
437
438 //=======================================================================
439
440 // required if data stored on the heap is present in this derived class
441 clsfy_direct_boost_builder& clsfy_direct_boost_builder::operator=(const clsfy_direct_boost_builder& new_b)
442 {
443 if (&new_b==this) return *this;
444
445 // Copy heap member variables.
446 delete data_ptr_; data_ptr_=0;
447
448 if (new_b.data_ptr_)
449 data_ptr_ = new_b.data_ptr_->clone();
450
451 // Copy normal member variables
452 data_ = new_b.data_;
453
454 return *this;
455 }
456
457 #endif // 0
458
459
460 //=======================================================================
461
clone() const462 clsfy_builder_base* clsfy_direct_boost_builder::clone() const
463 {
464 return new clsfy_direct_boost_builder(*this);
465 }
466
467 //=======================================================================
468
// required if data is present in this base class
//: Print a summary of the builder's state to os.
//  Not yet implemented: writes an NYI notice to std::cerr and ignores os.
void clsfy_direct_boost_builder::print_summary(std::ostream& /*os*/) const
{
#if 0
  clsfy_builder_base::print_summary(os); // Uncomment this line if it has one.
  vsl_print_summary(os, data_); // Example of data output
#endif

  std::cerr << "clsfy_direct_boost_builder::print_summary() NYI\n";
}
479
480 //=======================================================================
481
// required if data is present in this base class
//: Binary save of builder state to stream.
//  Not yet implemented: writes an NYI notice to std::cerr; nothing goes to bfs.
void clsfy_direct_boost_builder::b_write(vsl_b_ostream& /*bfs*/) const
{
#if 0
  vsl_b_write(bfs, version_no());
  clsfy_builder_base::b_write(bfs); // Needed if base has any data
  vsl_b_write(bfs, data_);
#endif
  std::cerr << "clsfy_direct_boost_builder::b_write() NYI\n";
}
492
493 //=======================================================================
494
// required if data is present in this base class
//: Binary load of builder state from stream.
//  Not yet implemented: writes an NYI notice to std::cerr; bfs is not read.
void clsfy_direct_boost_builder::b_read(vsl_b_istream& /*bfs*/)
{
  std::cerr << "clsfy_direct_boost_builder::b_read() NYI\n";
#if 0
  if (!bfs) return;

  short version;
  vsl_b_read(bfs,version);
  switch (version)
  {
  case 1:
    //clsfy_builder_base::b_read(bfs); // Needed if base has any data
    vsl_b_read(bfs,data_);
    break;
  default:
    std::cerr << "I/O ERROR: vsl_b_read(vsl_b_istream&, clsfy_direct_boost_builder&)\n"
              << "           Unknown version number "<< version << '\n';
    bfs.is().clear(std::ios::badbit); // Set an unrecoverable IO error on stream
    return;
  }
#endif // 0
}
518