/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                        Intel License Agreement
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of Intel Corporation may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "old_ml_precomp.hpp"

static inline double
log_ratio( double val )
{
    const double eps = 1e-5;

    val = MAX( val, eps );
    val = MIN( val, 1. - eps );
    return log( val/(1. - val) );
}
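
// Worked example (added for clarity): log_ratio(0.5) = log(1) = 0, and the
// clamping of the argument to [eps, 1-eps] bounds the result, e.g.
// log_ratio(1.0) = log((1-1e-5)/1e-5) ~ 11.51 instead of +infinity. This
// keeps the boosting coefficients computed below finite even for perfectly
// separated nodes.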


CvBoostParams::CvBoostParams()
{
    boost_type = CvBoost::REAL;
    weak_count = 100;
    weight_trim_rate = 0.95;
    cv_folds = 0;
    max_depth = 1;
}


CvBoostParams::CvBoostParams( int _boost_type, int _weak_count,
                              double _weight_trim_rate, int _max_depth,
                              bool _use_surrogates, const float* _priors )
{
    boost_type = _boost_type;
    weak_count = _weak_count;
    weight_trim_rate = _weight_trim_rate;
    split_criteria = CvBoost::DEFAULT;
    cv_folds = 0;
    max_depth = _max_depth;
    use_surrogates = _use_surrogates;
    priors = _priors;
}
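
// Illustrative usage sketch (not from the original source): constructing
// parameters for, e.g., Gentle AdaBoost with 200 depth-2 trees would look like
//
//     CvBoostParams params( CvBoost::GENTLE, /*weak_count*/ 200,
//                           /*weight_trim_rate*/ 0.95, /*max_depth*/ 2,
//                           /*use_surrogates*/ false, /*priors*/ 0 );
//
// while the default constructor above yields Real AdaBoost over 100
// depth-1 trees ("stumps").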


///////////////////////////////// CvBoostTree ///////////////////////////////////

CvBoostTree::CvBoostTree()
{
    ensemble = 0;
}


CvBoostTree::~CvBoostTree()
{
    clear();
}


void
CvBoostTree::clear()
{
    CvDTree::clear();
    ensemble = 0;
}


bool
CvBoostTree::train( CvDTreeTrainData* _train_data,
                    const CvMat* _subsample_idx, CvBoost* _ensemble )
{
    clear();
    ensemble = _ensemble;
    data = _train_data;
    data->shared = true;
    return do_train( _subsample_idx );
}


bool
CvBoostTree::train( const CvMat*, int, const CvMat*, const CvMat*,
                    const CvMat*, const CvMat*, const CvMat*, CvDTreeParams )
{
    assert(0);
    return false;
}


bool
CvBoostTree::train( CvDTreeTrainData*, const CvMat* )
{
    assert(0);
    return false;
}


void
CvBoostTree::scale( double _scale )
{
    CvDTreeNode* node = root;

    // traverse the tree and scale all the node values
    for(;;)
    {
        CvDTreeNode* parent;
        for(;;)
        {
            node->value *= _scale;
            if( !node->left )
                break;
            node = node->left;
        }

        for( parent = node->parent; parent && parent->right == node;
             node = parent, parent = parent->parent )
            ;

        if( !parent )
            break;

        node = parent->right;
    }
}
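
// Note (added for clarity): scale() is an iterative depth-first traversal --
// descend along left children, then climb back up while the current node is a
// right child, and continue with the parent's right subtree. update_weights()
// below uses it to multiply a Discrete AdaBoost tree's {-1,1} leaf values by
// the tree's vote weight C.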


void
CvBoostTree::try_split_node( CvDTreeNode* node )
{
    CvDTree::try_split_node( node );

    if( !node->left )
    {
        // if the node has not been split,
        // store the responses for the corresponding training samples
        double* weak_eval = ensemble->get_weak_response()->data.db;
        cv::AutoBuffer<int> inn_buf(node->sample_count);
        const int* labels = data->get_cv_labels(node, inn_buf.data());
        int i, count = node->sample_count;
        double value = node->value;

        for( i = 0; i < count; i++ )
            weak_eval[labels[i]] = value;
    }
}


double
CvBoostTree::calc_node_dir( CvDTreeNode* node )
{
    char* dir = (char*)data->direction->data.ptr;
    const double* weights = ensemble->get_subtree_weights()->data.db;
    int i, n = node->sample_count, vi = node->split->var_idx;
    double L, R;

    assert( !node->split->inversed );

    if( data->get_var_type(vi) >= 0 ) // split on categorical var
    {
        cv::AutoBuffer<int> inn_buf(n);
        const int* cat_labels = data->get_cat_var_data(node, vi, inn_buf.data());
        const int* subset = node->split->subset;
        double sum = 0, sum_abs = 0;

        for( i = 0; i < n; i++ )
        {
            int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
            double w = weights[i];
            int d = idx >= 0 ? CV_DTREE_CAT_DIR(idx,subset) : 0;
            sum += d*w; sum_abs += (d & 1)*w;
            dir[i] = (char)d;
        }

        R = (sum_abs + sum) * 0.5;
        L = (sum_abs - sum) * 0.5;
    }
    else // split on ordered var
    {
        cv::AutoBuffer<uchar> inn_buf(2*n*sizeof(int)+n*sizeof(float));
        float* values_buf = (float*)inn_buf.data();
        int* sorted_indices_buf = (int*)(values_buf + n);
        int* sample_indices_buf = sorted_indices_buf + n;
        const float* values = 0;
        const int* sorted_indices = 0;
        data->get_ord_var_data( node, vi, values_buf, sorted_indices_buf, &values, &sorted_indices, sample_indices_buf );
        int split_point = node->split->ord.split_point;
        int n1 = node->get_num_valid(vi);

        assert( 0 <= split_point && split_point < n1-1 );
        L = R = 0;

        for( i = 0; i <= split_point; i++ )
        {
            int idx = sorted_indices[i];
            double w = weights[idx];
            dir[idx] = (char)-1;
            L += w;
        }

        for( ; i < n1; i++ )
        {
            int idx = sorted_indices[i];
            double w = weights[idx];
            dir[idx] = (char)1;
            R += w;
        }

        for( ; i < n; i++ )
            dir[sorted_indices[i]] = (char)0;
    }

    node->maxlr = MAX( L, R );
    return node->split->quality/(L + R);
}
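
// Note (added for clarity): L and R are the total sample weights sent to the
// left and right branches by the primary split; node->maxlr = MAX(L, R) later
// serves as the baseline a surrogate split must beat (see the
// find_surrogate_split_* functions), and the split quality is returned
// normalized by the total weight L + R.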


CvDTreeSplit*
CvBoostTree::find_split_ord_class( CvDTreeNode* node, int vi, float init_quality,
                                   CvDTreeSplit* _split, uchar* _ext_buf )
{
    const float epsilon = FLT_EPSILON*2;

    const double* weights = ensemble->get_subtree_weights()->data.db;
    int n = node->sample_count;
    int n1 = node->get_num_valid(vi);

    cv::AutoBuffer<uchar> inn_buf;
    if( !_ext_buf )
        inn_buf.allocate(n*(3*sizeof(int)+sizeof(float)));
    uchar* ext_buf = _ext_buf ? _ext_buf : inn_buf.data();
    float* values_buf = (float*)ext_buf;
    int* sorted_indices_buf = (int*)(values_buf + n);
    int* sample_indices_buf = sorted_indices_buf + n;
    const float* values = 0;
    const int* sorted_indices = 0;
    data->get_ord_var_data( node, vi, values_buf, sorted_indices_buf, &values, &sorted_indices, sample_indices_buf );
    int* responses_buf = sorted_indices_buf + n;
    const int* responses = data->get_class_labels( node, responses_buf );
    const double* rcw0 = weights + n;
    double lcw[2] = {0,0}, rcw[2];
    int i, best_i = -1;
    double best_val = init_quality;
    int boost_type = ensemble->get_params().boost_type;
    int split_criteria = ensemble->get_params().split_criteria;

    rcw[0] = rcw0[0]; rcw[1] = rcw0[1];
    for( i = n1; i < n; i++ )
    {
        int idx = sorted_indices[i];
        double w = weights[idx];
        rcw[responses[idx]] -= w;
    }

    if( split_criteria != CvBoost::GINI && split_criteria != CvBoost::MISCLASS )
        split_criteria = boost_type == CvBoost::DISCRETE ? CvBoost::MISCLASS : CvBoost::GINI;

    if( split_criteria == CvBoost::GINI )
    {
        double L = 0, R = rcw[0] + rcw[1];
        double lsum2 = 0, rsum2 = rcw[0]*rcw[0] + rcw[1]*rcw[1];

        for( i = 0; i < n1 - 1; i++ )
        {
            int idx = sorted_indices[i];
            double w = weights[idx], w2 = w*w;
            double lv, rv;
            idx = responses[idx];
            L += w; R -= w;
            lv = lcw[idx]; rv = rcw[idx];
            lsum2 += 2*lv*w + w2;
            rsum2 -= 2*rv*w - w2;
            lcw[idx] = lv + w; rcw[idx] = rv - w;

            if( values[i] + epsilon < values[i+1] )
            {
                double val = (lsum2*R + rsum2*L)/(L*R);
                if( best_val < val )
                {
                    best_val = val;
                    best_i = i;
                }
            }
        }
    }
    else
    {
        for( i = 0; i < n1 - 1; i++ )
        {
            int idx = sorted_indices[i];
            double w = weights[idx];
            idx = responses[idx];
            lcw[idx] += w;
            rcw[idx] -= w;

            if( values[i] + epsilon < values[i+1] )
            {
                double val = lcw[0] + rcw[1], val2 = lcw[1] + rcw[0];
                val = MAX(val, val2);
                if( best_val < val )
                {
                    best_val = val;
                    best_i = i;
                }
            }
        }
    }

    CvDTreeSplit* split = 0;
    if( best_i >= 0 )
    {
        split = _split ? _split : data->new_split_ord( 0, 0.0f, 0, 0, 0.0f );
        split->var_idx = vi;
        split->ord.c = (values[best_i] + values[best_i+1])*0.5f;
        split->ord.split_point = best_i;
        split->inversed = 0;
        split->quality = (float)best_val;
    }
    return split;
}
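
// Note on the GINI branch above (added for clarity): with per-class weight
// sums lcw[], rcw[] and branch weights L, R, the weighted Gini impurity of
// the split is
//     L*(1 - lsum2/L^2) + R*(1 - rsum2/R^2) = (L + R) - (lsum2/L + rsum2/R),
// where lsum2 = lcw[0]^2 + lcw[1]^2 and rsum2 likewise.  Since L + R is
// fixed for the node, maximizing
//     val = (lsum2*R + rsum2*L)/(L*R) = lsum2/L + rsum2/R
// minimizes the impurity.  The MISCLASS branch instead maximizes the weight
// of correctly separated samples, max(lcw[0]+rcw[1], lcw[1]+rcw[0]).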

template<typename T>
class LessThanPtr
{
public:
    bool operator()(T* a, T* b) const { return *a < *b; }
};

CvDTreeSplit*
CvBoostTree::find_split_cat_class( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf )
{
    int ci = data->get_var_type(vi);
    int n = node->sample_count;
    int mi = data->cat_count->data.i[ci];

    int base_size = (2*mi+3)*sizeof(double) + mi*sizeof(double*);
    cv::AutoBuffer<uchar> inn_buf(base_size);
    if( !_ext_buf )
        inn_buf.allocate( base_size + 2*n*sizeof(int) );
    uchar* base_buf = inn_buf.data();
    uchar* ext_buf = _ext_buf ? _ext_buf : base_buf + base_size;

    int* cat_labels_buf = (int*)ext_buf;
    const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf);
    int* responses_buf = cat_labels_buf + n;
    const int* responses = data->get_class_labels(node, responses_buf);
    double lcw[2]={0,0}, rcw[2]={0,0};

    double* cjk = (double*)cv::alignPtr(base_buf,sizeof(double))+2;
    const double* weights = ensemble->get_subtree_weights()->data.db;
    double** dbl_ptr = (double**)(cjk + 2*mi);
    int i, j, k, idx;
    double L = 0, R;
    double best_val = init_quality;
    int best_subset = -1, subset_i;
    int boost_type = ensemble->get_params().boost_type;
    int split_criteria = ensemble->get_params().split_criteria;

    // init the array of counters:
    // c_{jk} - the total weight of samples that have the vi-th input variable = j and response = k.
    for( j = -1; j < mi; j++ )
        cjk[j*2] = cjk[j*2+1] = 0;

    for( i = 0; i < n; i++ )
    {
        double w = weights[i];
        j = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
        k = responses[i];
        cjk[j*2 + k] += w;
    }

    for( j = 0; j < mi; j++ )
    {
        rcw[0] += cjk[j*2];
        rcw[1] += cjk[j*2+1];
        dbl_ptr[j] = cjk + j*2 + 1;
    }

    R = rcw[0] + rcw[1];

    if( split_criteria != CvBoost::GINI && split_criteria != CvBoost::MISCLASS )
        split_criteria = boost_type == CvBoost::DISCRETE ? CvBoost::MISCLASS : CvBoost::GINI;

    // sort rows of c_jk by increasing c_j,1
    // (i.e. by the weight of samples in the j-th category that belong to class 1)
    std::sort(dbl_ptr, dbl_ptr + mi, LessThanPtr<double>());

    for( subset_i = 0; subset_i < mi-1; subset_i++ )
    {
        idx = (int)(dbl_ptr[subset_i] - cjk)/2;
        const double* crow = cjk + idx*2;
        double w0 = crow[0], w1 = crow[1];
        double weight = w0 + w1;

        if( weight < FLT_EPSILON )
            continue;

        lcw[0] += w0; rcw[0] -= w0;
        lcw[1] += w1; rcw[1] -= w1;

        if( split_criteria == CvBoost::GINI )
        {
            double lsum2 = lcw[0]*lcw[0] + lcw[1]*lcw[1];
            double rsum2 = rcw[0]*rcw[0] + rcw[1]*rcw[1];

            L += weight;
            R -= weight;

            if( L > FLT_EPSILON && R > FLT_EPSILON )
            {
                double val = (lsum2*R + rsum2*L)/(L*R);
                if( best_val < val )
                {
                    best_val = val;
                    best_subset = subset_i;
                }
            }
        }
        else
        {
            double val = lcw[0] + rcw[1];
            double val2 = lcw[1] + rcw[0];

            val = MAX(val, val2);
            if( best_val < val )
            {
                best_val = val;
                best_subset = subset_i;
            }
        }
    }

    CvDTreeSplit* split = 0;
    if( best_subset >= 0 )
    {
        split = _split ? _split : data->new_split_cat( 0, -1.0f);
        split->var_idx = vi;
        split->quality = (float)best_val;
        memset( split->subset, 0, (data->max_c_count + 31)/32 * sizeof(int));
        for( i = 0; i <= best_subset; i++ )
        {
            idx = (int)(dbl_ptr[i] - cjk) >> 1;
            split->subset[idx >> 5] |= 1 << (idx & 31);
        }
    }
    return split;
}
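
// Note (added for clarity): scanning only the mi-1 "prefix" subsets of the
// sorted category order is a variant of the classic CART shortcut for
// two-class problems (Breiman et al. 1984), which avoids enumerating all
// 2^(mi-1) possible subsets.  The textbook guarantee assumes categories
// sorted by the *proportion* of class-1 weight; this code sorts by the raw
// class-1 weight c_{j,1}.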


CvDTreeSplit*
CvBoostTree::find_split_ord_reg( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf )
{
    const float epsilon = FLT_EPSILON*2;
    const double* weights = ensemble->get_subtree_weights()->data.db;
    int n = node->sample_count;
    int n1 = node->get_num_valid(vi);

    cv::AutoBuffer<uchar> inn_buf;
    if( !_ext_buf )
        inn_buf.allocate(2*n*(sizeof(int)+sizeof(float)));
    uchar* ext_buf = _ext_buf ? _ext_buf : inn_buf.data();

    float* values_buf = (float*)ext_buf;
    int* indices_buf = (int*)(values_buf + n);
    int* sample_indices_buf = indices_buf + n;
    const float* values = 0;
    const int* indices = 0;
    data->get_ord_var_data( node, vi, values_buf, indices_buf, &values, &indices, sample_indices_buf );
    float* responses_buf = (float*)(indices_buf + n);
    const float* responses = data->get_ord_responses( node, responses_buf, sample_indices_buf );

    int i, best_i = -1;
    double L = 0, R = weights[n];
    double best_val = init_quality, lsum = 0, rsum = node->value*R;

    // compensate for missing values
    for( i = n1; i < n; i++ )
    {
        int idx = indices[i];
        double w = weights[idx];
        rsum -= responses[idx]*w;
        R -= w;
    }

    // find the optimal split
    for( i = 0; i < n1 - 1; i++ )
    {
        int idx = indices[i];
        double w = weights[idx];
        double t = responses[idx]*w;
        L += w; R -= w;
        lsum += t; rsum -= t;

        if( values[i] + epsilon < values[i+1] )
        {
            double val = (lsum*lsum*R + rsum*rsum*L)/(L*R);
            if( best_val < val )
            {
                best_val = val;
                best_i = i;
            }
        }
    }

    CvDTreeSplit* split = 0;
    if( best_i >= 0 )
    {
        split = _split ? _split : data->new_split_ord( 0, 0.0f, 0, 0, 0.0f );
        split->var_idx = vi;
        split->ord.c = (values[best_i] + values[best_i+1])*0.5f;
        split->ord.split_point = best_i;
        split->inversed = 0;
        split->quality = (float)best_val;
    }
    return split;
}
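
// Note on the regression criterion above (added for clarity): with weighted
// response sums lsum, rsum and branch weights L, R, the weighted sum of
// squared errors after the split is
//     sum_i w_i*y_i^2 - (lsum^2/L + rsum^2/R),
// and the first term does not depend on the split point, so maximizing
//     val = (lsum^2*R + rsum^2*L)/(L*R) = lsum^2/L + rsum^2/R
// is equivalent to minimizing the squared error.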


CvDTreeSplit*
CvBoostTree::find_split_cat_reg( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf )
{
    const double* weights = ensemble->get_subtree_weights()->data.db;
    int ci = data->get_var_type(vi);
    int n = node->sample_count;
    int mi = data->cat_count->data.i[ci];
    int base_size = (2*mi+3)*sizeof(double) + mi*sizeof(double*);
    cv::AutoBuffer<uchar> inn_buf(base_size);
    if( !_ext_buf )
        inn_buf.allocate(base_size + n*(2*sizeof(int) + sizeof(float)));
    uchar* base_buf = inn_buf.data();
    uchar* ext_buf = _ext_buf ? _ext_buf : base_buf + base_size;

    int* cat_labels_buf = (int*)ext_buf;
    const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf);
    float* responses_buf = (float*)(cat_labels_buf + n);
    int* sample_indices_buf = (int*)(responses_buf + n);
    const float* responses = data->get_ord_responses(node, responses_buf, sample_indices_buf);

    double* sum = (double*)cv::alignPtr(base_buf,sizeof(double)) + 1;
    double* counts = sum + mi + 1;
    double** sum_ptr = (double**)(counts + mi);
    double L = 0, R = 0, best_val = init_quality, lsum = 0, rsum = 0;
    int i, best_subset = -1, subset_i;

    for( i = -1; i < mi; i++ )
        sum[i] = counts[i] = 0;

    // calculate the sum of responses and the total weight for each category of the input var
    for( i = 0; i < n; i++ )
    {
        int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
        double w = weights[i];
        double s = sum[idx] + responses[i]*w;
        double nc = counts[idx] + w;
        sum[idx] = s;
        counts[idx] = nc;
    }

    // calculate the average response in each category
    for( i = 0; i < mi; i++ )
    {
        R += counts[i];
        rsum += sum[i];
        sum[i] = fabs(counts[i]) > DBL_EPSILON ? sum[i]/counts[i] : 0;
        sum_ptr[i] = sum + i;
    }

    std::sort(sum_ptr, sum_ptr + mi, LessThanPtr<double>());

    // revert back to unnormalized sums
    // (there should be very little loss in accuracy)
    for( i = 0; i < mi; i++ )
        sum[i] *= counts[i];

    for( subset_i = 0; subset_i < mi-1; subset_i++ )
    {
        int idx = (int)(sum_ptr[subset_i] - sum);
        double ni = counts[idx];

        if( ni > FLT_EPSILON )
        {
            double s = sum[idx];
            lsum += s; L += ni;
            rsum -= s; R -= ni;

            if( L > FLT_EPSILON && R > FLT_EPSILON )
            {
                double val = (lsum*lsum*R + rsum*rsum*L)/(L*R);
                if( best_val < val )
                {
                    best_val = val;
                    best_subset = subset_i;
                }
            }
        }
    }

    CvDTreeSplit* split = 0;
    if( best_subset >= 0 )
    {
        split = _split ? _split : data->new_split_cat( 0, -1.0f);
        split->var_idx = vi;
        split->quality = (float)best_val;
        memset( split->subset, 0, (data->max_c_count + 31)/32 * sizeof(int));
        for( i = 0; i <= best_subset; i++ )
        {
            int idx = (int)(sum_ptr[i] - sum);
            split->subset[idx >> 5] |= 1 << (idx & 31);
        }
    }
    return split;
}
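
// Note (added for clarity): ordering the categories by their mean response
// and scanning only the mi-1 prefix subsets of that order is the standard
// shortcut for categorical splits in regression trees (Fisher's grouping
// result, also used by CART); for the weighted-SSE criterion the best subset
// split is contiguous in this order, so the 2^(mi-1) subsets never have to
// be enumerated.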


CvDTreeSplit*
CvBoostTree::find_surrogate_split_ord( CvDTreeNode* node, int vi, uchar* _ext_buf )
{
    const float epsilon = FLT_EPSILON*2;
    int n = node->sample_count;
    cv::AutoBuffer<uchar> inn_buf;
    if( !_ext_buf )
        inn_buf.allocate(n*(2*sizeof(int)+sizeof(float)));
    uchar* ext_buf = _ext_buf ? _ext_buf : inn_buf.data();
    float* values_buf = (float*)ext_buf;
    int* indices_buf = (int*)(values_buf + n);
    int* sample_indices_buf = indices_buf + n;
    const float* values = 0;
    const int* indices = 0;
    data->get_ord_var_data( node, vi, values_buf, indices_buf, &values, &indices, sample_indices_buf );

    const double* weights = ensemble->get_subtree_weights()->data.db;
    const char* dir = (char*)data->direction->data.ptr;
    int n1 = node->get_num_valid(vi);
    // LL - the total weight of samples that both the primary and the surrogate splits send to the left
    // LR - ... the primary split sends to the left and the surrogate split sends to the right
    // RL - ... the primary split sends to the right and the surrogate split sends to the left
    // RR - ... both send to the right
    int i, best_i = -1, best_inversed = 0;
    double best_val;
    double LL = 0, RL = 0, LR, RR;
    double worst_val = node->maxlr;
    double sum = 0, sum_abs = 0;
    best_val = worst_val;

    for( i = 0; i < n1; i++ )
    {
        int idx = indices[i];
        double w = weights[idx];
        int d = dir[idx];
        sum += d*w; sum_abs += (d & 1)*w;
    }

    // sum_abs = R + L; sum = R - L
    RR = (sum_abs + sum)*0.5;
    LR = (sum_abs - sum)*0.5;

    // initially all the samples are sent to the right by the surrogate split,
    // LR of them are sent to the left by the primary split, and RR - to the right.
    // now iteratively compute LL, LR, RL and RR for every possible surrogate split value.
    for( i = 0; i < n1 - 1; i++ )
    {
        int idx = indices[i];
        double w = weights[idx];
        int d = dir[idx];

        if( d < 0 )
        {
            LL += w; LR -= w;
            if( LL + RR > best_val && values[i] + epsilon < values[i+1] )
            {
                best_val = LL + RR;
                best_i = i; best_inversed = 0;
            }
        }
        else if( d > 0 )
        {
            RL += w; RR -= w;
            if( RL + LR > best_val && values[i] + epsilon < values[i+1] )
            {
                best_val = RL + LR;
                best_i = i; best_inversed = 1;
            }
        }
    }

    return best_i >= 0 && best_val > node->maxlr ? data->new_split_ord( vi,
        (values[best_i] + values[best_i+1])*0.5f, best_i,
        best_inversed, (float)best_val ) : 0;
}
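
// Note (added for clarity): a surrogate split is accepted only if its
// agreement weight (LL + RR, or RL + LR when inversed) exceeds node->maxlr =
// max(L, R), i.e. only if it predicts the primary split's direction better
// than the trivial rule "always follow the heavier branch".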


CvDTreeSplit*
CvBoostTree::find_surrogate_split_cat( CvDTreeNode* node, int vi, uchar* _ext_buf )
{
    const char* dir = (char*)data->direction->data.ptr;
    const double* weights = ensemble->get_subtree_weights()->data.db;
    int n = node->sample_count;
    int i, mi = data->cat_count->data.i[data->get_var_type(vi)];

    int base_size = (2*mi+3)*sizeof(double);
    cv::AutoBuffer<uchar> inn_buf(base_size);
    if( !_ext_buf )
        inn_buf.allocate(base_size + n*sizeof(int));
    uchar* ext_buf = _ext_buf ? _ext_buf : inn_buf.data();
    int* cat_labels_buf = (int*)ext_buf;
    const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf);

    // LL - the total weight of samples that both the primary and the surrogate splits send to the left
    // LR - ... the primary split sends to the left and the surrogate split sends to the right
    // RL - ... the primary split sends to the right and the surrogate split sends to the left
    // RR - ... both send to the right
    CvDTreeSplit* split = data->new_split_cat( vi, 0 );
    double best_val = 0;
    double* lc = (double*)cv::alignPtr(cat_labels_buf + n, sizeof(double)) + 1;
    double* rc = lc + mi + 1;

    for( i = -1; i < mi; i++ )
        lc[i] = rc[i] = 0;

    // 1. for each category calculate the weight of samples
    // sent to the left (lc) and to the right (rc) by the primary split
    for( i = 0; i < n; i++ )
    {
        int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
        double w = weights[i];
        int d = dir[i];
        double sum = lc[idx] + d*w;
        double sum_abs = rc[idx] + (d & 1)*w;
        lc[idx] = sum; rc[idx] = sum_abs;
    }

    for( i = 0; i < mi; i++ )
    {
        double sum = lc[i];
        double sum_abs = rc[i];
        lc[i] = (sum_abs - sum) * 0.5;
        rc[i] = (sum_abs + sum) * 0.5;
    }

    // 2. now form the split.
    // in each category send all the samples to the same direction as the majority
    for( i = 0; i < mi; i++ )
    {
        double lval = lc[i], rval = rc[i];
        if( lval > rval )
        {
            split->subset[i >> 5] |= 1 << (i & 31);
            best_val += lval;
        }
        else
            best_val += rval;
    }

    split->quality = (float)best_val;
    if( split->quality <= node->maxlr )
        cvSetRemoveByPtr( data->split_heap, split ), split = 0;

    return split;
}


void
CvBoostTree::calc_node_value( CvDTreeNode* node )
{
    int i, n = node->sample_count;
    const double* weights = ensemble->get_weights()->data.db;
    cv::AutoBuffer<uchar> inn_buf(n*(sizeof(int) + ( data->is_classifier ? sizeof(int) : sizeof(int) + sizeof(float))));
    int* labels_buf = (int*)inn_buf.data();
    const int* labels = data->get_cv_labels(node, labels_buf);
    double* subtree_weights = ensemble->get_subtree_weights()->data.db;
    double rcw[2] = {0,0};
    int boost_type = ensemble->get_params().boost_type;

    if( data->is_classifier )
    {
        int* _responses_buf = labels_buf + n;
        const int* _responses = data->get_class_labels(node, _responses_buf);
        int m = data->get_num_classes();
        int* cls_count = data->counts->data.i;
        for( int k = 0; k < m; k++ )
            cls_count[k] = 0;

        for( i = 0; i < n; i++ )
        {
            int idx = labels[i];
            double w = weights[idx];
            int r = _responses[i];
            rcw[r] += w;
            cls_count[r]++;
            subtree_weights[i] = w;
        }

        node->class_idx = rcw[1] > rcw[0];

        if( boost_type == CvBoost::DISCRETE )
        {
            // ignore cat_map for responses and use {-1,1},
            // as the whole ensemble response is computed as sign(sum_i(weak_response_i))
            node->value = node->class_idx*2 - 1;
        }
        else
        {
            double p = rcw[1]/(rcw[0] + rcw[1]);
            assert( boost_type == CvBoost::REAL );

            // store the log-ratio of the probability
            node->value = 0.5*log_ratio(p);
        }
    }
    else
    {
        // in case of regression tree:
        //  * node value is 1/n*sum_i(Y_i), where Y_i is the i-th response,
        //    n is the number of samples in the node.
        //  * node risk is the sum of squared errors: sum_i((Y_i - <node_value>)^2)
        double sum = 0, sum2 = 0, iw;
        float* values_buf = (float*)(labels_buf + n);
        int* sample_indices_buf = (int*)(values_buf + n);
        const float* values = data->get_ord_responses(node, values_buf, sample_indices_buf);

        for( i = 0; i < n; i++ )
        {
            int idx = labels[i];
            double w = weights[idx]/*priors[values[i] > 0]*/;
            double t = values[i];
            rcw[0] += w;
            subtree_weights[i] = w;
            sum += t*w;
            sum2 += t*t*w;
        }

        iw = 1./rcw[0];
        node->value = sum*iw;
        node->node_risk = sum2 - (sum*iw)*sum;

        // renormalize the risk, as in try_split_node the unweighted formula
        // sqrt(risk)/n is used, rather than sqrt(risk)/sum(weights_i)
        node->node_risk *= n*iw*n*iw;
    }

    // store the summary weights
    subtree_weights[n] = rcw[0];
    subtree_weights[n+1] = rcw[1];
}
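
// Worked example (added for clarity): for a Real AdaBoost leaf where the
// weighted fraction of class-1 samples is p = 0.9, the stored value is
// 0.5*log_ratio(0.9) = 0.5*log(0.9/0.1) ~ 1.10; for a pure leaf (p -> 1),
// log_ratio() clamps the argument, so the value saturates at
// 0.5*log((1-1e-5)/1e-5) ~ 5.76 instead of diverging to infinity.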


void CvBoostTree::read( CvFileStorage* fs, CvFileNode* fnode, CvBoost* _ensemble, CvDTreeTrainData* _data )
{
    CvDTree::read( fs, fnode, _data );
    ensemble = _ensemble;
}

void CvBoostTree::read( CvFileStorage*, CvFileNode* )
{
    assert(0);
}

void CvBoostTree::read( CvFileStorage* _fs, CvFileNode* _node,
                        CvDTreeTrainData* _data )
{
    CvDTree::read( _fs, _node, _data );
}


/////////////////////////////////// CvBoost /////////////////////////////////////

CvBoost::CvBoost()
{
    data = 0;
    weak = 0;
    default_model_name = "my_boost_tree";

    active_vars = active_vars_abs = orig_response = sum_response = weak_eval =
        subsample_mask = weights = subtree_weights = 0;
    have_active_cat_vars = have_subsample = false;

    clear();
}


void CvBoost::prune( CvSlice slice )
{
    if( weak && weak->total > 0 )
    {
        CvSeqReader reader;
        int i, count = cvSliceLength( slice, weak );

        cvStartReadSeq( weak, &reader );
        cvSetSeqReaderPos( &reader, slice.start_index );

        for( i = 0; i < count; i++ )
        {
            CvBoostTree* w;
            CV_READ_SEQ_ELEM( w, reader );
            delete w;
        }

        cvSeqRemoveSlice( weak, slice );
    }
}


void CvBoost::clear()
{
    if( weak )
    {
        prune( CV_WHOLE_SEQ );
        cvReleaseMemStorage( &weak->storage );
    }
    if( data )
        delete data;
    weak = 0;
    data = 0;
    cvReleaseMat( &active_vars );
    cvReleaseMat( &active_vars_abs );
    cvReleaseMat( &orig_response );
    cvReleaseMat( &sum_response );
    cvReleaseMat( &weak_eval );
    cvReleaseMat( &subsample_mask );
    cvReleaseMat( &weights );
    cvReleaseMat( &subtree_weights );

    have_subsample = false;
}


CvBoost::~CvBoost()
{
    clear();
}


CvBoost::CvBoost( const CvMat* _train_data, int _tflag,
                  const CvMat* _responses, const CvMat* _var_idx,
                  const CvMat* _sample_idx, const CvMat* _var_type,
                  const CvMat* _missing_mask, CvBoostParams _params )
{
    weak = 0;
    data = 0;
    default_model_name = "my_boost_tree";

    active_vars = active_vars_abs = orig_response = sum_response = weak_eval =
        subsample_mask = weights = subtree_weights = 0;

    train( _train_data, _tflag, _responses, _var_idx, _sample_idx,
           _var_type, _missing_mask, _params );
}


bool
CvBoost::set_params( const CvBoostParams& _params )
{
    bool ok = false;

    CV_FUNCNAME( "CvBoost::set_params" );

    __BEGIN__;

    params = _params;
    if( params.boost_type != DISCRETE && params.boost_type != REAL &&
        params.boost_type != LOGIT && params.boost_type != GENTLE )
        CV_ERROR( CV_StsBadArg, "Unknown/unsupported boosting type" );

    params.weak_count = MAX( params.weak_count, 1 );
    params.weight_trim_rate = MAX( params.weight_trim_rate, 0. );
    params.weight_trim_rate = MIN( params.weight_trim_rate, 1. );
    if( params.weight_trim_rate < FLT_EPSILON )
        params.weight_trim_rate = 1.f;

    if( params.boost_type == DISCRETE &&
        params.split_criteria != GINI && params.split_criteria != MISCLASS )
        params.split_criteria = MISCLASS;
    if( params.boost_type == REAL &&
        params.split_criteria != GINI && params.split_criteria != MISCLASS )
        params.split_criteria = GINI;
    if( (params.boost_type == LOGIT || params.boost_type == GENTLE) &&
        params.split_criteria != SQERR )
        params.split_criteria = SQERR;

    ok = true;

    __END__;

    return ok;
}


bool
CvBoost::train( const CvMat* _train_data, int _tflag,
                const CvMat* _responses, const CvMat* _var_idx,
                const CvMat* _sample_idx, const CvMat* _var_type,
                const CvMat* _missing_mask,
                CvBoostParams _params, bool _update )
{
    bool ok = false;
    CvMemStorage* storage = 0;

    CV_FUNCNAME( "CvBoost::train" );

    __BEGIN__;

    int i;

    set_params( _params );

    cvReleaseMat( &active_vars );
    cvReleaseMat( &active_vars_abs );

    if( !_update || !data )
    {
        clear();
        data = new CvDTreeTrainData( _train_data, _tflag, _responses, _var_idx,
            _sample_idx, _var_type, _missing_mask, _params, true, true );

        if( data->get_num_classes() != 2 )
            CV_ERROR( CV_StsNotImplemented,
                      "Boosted trees can only be used for 2-class classification." );
        CV_CALL( storage = cvCreateMemStorage() );
        weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage );
        storage = 0;
    }
    else
    {
        data->set_data( _train_data, _tflag, _responses, _var_idx,
            _sample_idx, _var_type, _missing_mask, _params, true, true, true );
    }

    if ( (_params.boost_type == LOGIT) || (_params.boost_type == GENTLE) )
        data->do_responses_copy();

    update_weights( 0 );

    for( i = 0; i < params.weak_count; i++ )
    {
        CvBoostTree* tree = new CvBoostTree;
        if( !tree->train( data, subsample_mask, this ) )
        {
            delete tree;
            break;
        }
        //cvCheckArr( get_weak_response());
        cvSeqPush( weak, &tree );
        update_weights( tree );
        trim_weights();
        if( cvCountNonZero(subsample_mask) == 0 )
            break;
    }

    if(weak->total > 0)
    {
        get_active_vars(); // recompute active_vars* maps and condensed_idx's in the splits.
        data->is_classifier = true;
        data->free_train_data();
        ok = true;
    }
    else
        clear();

    __END__;

    return ok;
}

bool CvBoost::train( CvMLData* _data,
                     CvBoostParams _params,
                     bool update )
{
    bool result = false;

    CV_FUNCNAME( "CvBoost::train" );

    __BEGIN__;

    const CvMat* values = _data->get_values();
    const CvMat* response = _data->get_responses();
    const CvMat* missing = _data->get_missing();
    const CvMat* var_types = _data->get_var_types();
    const CvMat* train_sidx = _data->get_train_sample_idx();
    const CvMat* var_idx = _data->get_var_idx();

    CV_CALL( result = train( values, CV_ROW_SAMPLE, response, var_idx,
                             train_sidx, var_types, missing, _params, update ) );

    __END__;

    return result;
}

void CvBoost::initialize_weights(double (&p)[2])
{
    p[0] = 1.;
    p[1] = 1.;
}

void
CvBoost::update_weights( CvBoostTree* tree )
{
    CV_FUNCNAME( "CvBoost::update_weights" );

    __BEGIN__;

    int i, n = data->sample_count;
    double sumw = 0.;
    int step = 0;
    float* fdata = 0;
    int *sample_idx_buf;
    const int* sample_idx = 0;
    cv::AutoBuffer<uchar> inn_buf;
    size_t _buf_size = (params.boost_type == LOGIT) || (params.boost_type == GENTLE) ? (size_t)(data->sample_count)*sizeof(int) : 0;
    if( !tree )
        _buf_size += n*sizeof(int);
    else
    {
        if( have_subsample )
            _buf_size += data->get_length_subbuf()*(sizeof(float)+sizeof(uchar));
    }
    inn_buf.allocate(_buf_size);
    uchar* cur_buf_pos = inn_buf.data();

    if ( (params.boost_type == LOGIT) || (params.boost_type == GENTLE) )
    {
        step = CV_IS_MAT_CONT(data->responses_copy->type) ?
            1 : data->responses_copy->step / CV_ELEM_SIZE(data->responses_copy->type);
        fdata = data->responses_copy->data.fl;
        sample_idx_buf = (int*)cur_buf_pos;
        cur_buf_pos = (uchar*)(sample_idx_buf + data->sample_count);
        sample_idx = data->get_sample_indices( data->data_root, sample_idx_buf );
    }
    CvMat* dtree_data_buf = data->buf;
    size_t length_buf_row = data->get_length_subbuf();
    if( !tree ) // before training the first tree, initialize the weights and other parameters
    {
        int* class_labels_buf = (int*)cur_buf_pos;
        cur_buf_pos = (uchar*)(class_labels_buf + n);
        const int* class_labels = data->get_class_labels(data->data_root, class_labels_buf);
        // in case of logitboost and gentle adaboost each weak tree is a regression tree,
        // so we need to convert class labels to floating-point values

        double w0 = 1./n;
        double p[2] = { 1., 1. };
        initialize_weights(p);

        cvReleaseMat( &orig_response );
        cvReleaseMat( &sum_response );
        cvReleaseMat( &weak_eval );
        cvReleaseMat( &subsample_mask );
        cvReleaseMat( &weights );
        cvReleaseMat( &subtree_weights );

        CV_CALL( orig_response = cvCreateMat( 1, n, CV_32S ));
        CV_CALL( weak_eval = cvCreateMat( 1, n, CV_64F ));
        CV_CALL( subsample_mask = cvCreateMat( 1, n, CV_8U ));
        CV_CALL( weights = cvCreateMat( 1, n, CV_64F ));
        CV_CALL( subtree_weights = cvCreateMat( 1, n + 2, CV_64F ));

        if( data->have_priors )
        {
            // compute the weight scale for each class from their prior probabilities
            int c1 = 0;
            for( i = 0; i < n; i++ )
                c1 += class_labels[i];
            p[0] = data->priors->data.db[0]*(c1 < n ? 1./(n - c1) : 0.);
            p[1] = data->priors->data.db[1]*(c1 > 0 ? 1./c1 : 0.);
            p[0] /= p[0] + p[1];
            p[1] = 1. - p[0];
        }

        if (data->is_buf_16u)
        {
            unsigned short* labels = (unsigned short*)(dtree_data_buf->data.s + data->data_root->buf_idx*length_buf_row +
                data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count);
            for( i = 0; i < n; i++ )
            {
                // save the original categorical responses {0,1}, convert them to {-1,1}
                orig_response->data.i[i] = class_labels[i]*2 - 1;
                // make all the samples active at start.
                // later, trim_weights() may deactivate/reactivate some of them, if needed
                subsample_mask->data.ptr[i] = (uchar)1;
                // make all the initial weights the same.
                weights->data.db[i] = w0*p[class_labels[i]];
                // set the labels to find (from within the weak tree learning proc)
                // the particular sample weight, and where to store the response.
                labels[i] = (unsigned short)i;
            }
        }
        else
        {
            int* labels = dtree_data_buf->data.i + data->data_root->buf_idx*length_buf_row +
                data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count;

            for( i = 0; i < n; i++ )
            {
                // save the original categorical responses {0,1}, convert them to {-1,1}
                orig_response->data.i[i] = class_labels[i]*2 - 1;
                // make all the samples active at start.
                // later, trim_weights() may deactivate/reactivate some of them, if needed
                subsample_mask->data.ptr[i] = (uchar)1;
                // make all the initial weights the same.
                weights->data.db[i] = w0*p[class_labels[i]];
                // set the labels to find (from within the weak tree learning proc)
                // the particular sample weight, and where to store the response.
                labels[i] = i;
            }
        }

        if( params.boost_type == LOGIT )
        {
            CV_CALL( sum_response = cvCreateMat( 1, n, CV_64F ));

            for( i = 0; i < n; i++ )
            {
                sum_response->data.db[i] = 0;
                fdata[sample_idx[i]*step] = orig_response->data.i[i] > 0 ? 2.f : -2.f;
            }

            // in case of logitboost each weak tree is a regression tree.
            // the target function values are recalculated for each of the trees
            data->is_classifier = false;
        }
        else if( params.boost_type == GENTLE )
        {
            for( i = 0; i < n; i++ )
                fdata[sample_idx[i]*step] = (float)orig_response->data.i[i];

            data->is_classifier = false;
        }
    }
    else
    {
        // at this moment, for all the samples that participated in the training of the most
        // recent weak classifier we know the responses. For other samples we need to compute them
        if( have_subsample )
        {
            float* values = (float*)cur_buf_pos;
            cur_buf_pos = (uchar*)(values + data->get_length_subbuf());
            uchar* missing = cur_buf_pos;
            cur_buf_pos = missing + data->get_length_subbuf() * (size_t)CV_ELEM_SIZE(data->buf->type);

            CvMat _sample, _mask;

            // invert the subsample mask
            cvXorS( subsample_mask, cvScalar(1.), subsample_mask );
            data->get_vectors( subsample_mask, values, missing, 0 );

            _sample = cvMat( 1, data->var_count, CV_32F );
            _mask = cvMat( 1, data->var_count, CV_8U );

            // run the tree through all the non-processed samples
            for( i = 0; i < n; i++ )
                if( subsample_mask->data.ptr[i] )
                {
                    _sample.data.fl = values;
                    _mask.data.ptr = missing;
                    values += _sample.cols;
                    missing += _mask.cols;
                    weak_eval->data.db[i] = tree->predict( &_sample, &_mask, true )->value;
                }
        }

        // now update the weights and other parameters for each type of boosting
        if( params.boost_type == DISCRETE )
        {
            // Discrete AdaBoost:
            //   weak_eval[i] (=f(x_i)) is in {-1,1}
            //   err = sum(w_i*(f(x_i) != y_i))/sum(w_i)
            //   C = log((1-err)/err)
            //   w_i *= exp(C*(f(x_i) != y_i))

            double C, err = 0.;
            double scale[] = { 1., 0. };

            for( i = 0; i < n; i++ )
            {
                double w = weights->data.db[i];
                sumw += w;
                err += w*(weak_eval->data.db[i] != orig_response->data.i[i]);
            }

            if( sumw != 0 )
                err /= sumw;
            C = err = -log_ratio( err );
            scale[1] = exp(err);

            sumw = 0;
            for( i = 0; i < n; i++ )
            {
                double w = weights->data.db[i]*
                    scale[weak_eval->data.db[i] != orig_response->data.i[i]];
                sumw += w;
                weights->data.db[i] = w;
            }

            tree->scale( C );
        }
        else if( params.boost_type == REAL )
        {
            // Real AdaBoost:
            //   weak_eval[i] = f(x_i) = 0.5*log(p(x_i)/(1-p(x_i))), p(x_i)=P(y=1|x_i)
            //   w_i *= exp(-y_i*f(x_i))

            for( i = 0; i < n; i++ )
                weak_eval->data.db[i] *= -orig_response->data.i[i];

            cvExp( weak_eval, weak_eval );

            for( i = 0; i < n; i++ )
            {
                double w = weights->data.db[i]*weak_eval->data.db[i];
                sumw += w;
                weights->data.db[i] = w;
            }
        }
        else if( params.boost_type == LOGIT )
        {
            // LogitBoost:
            //   weak_eval[i] = f(x_i) in [-z_max,z_max]
            //   sum_response = F(x_i).
            //   F(x_i) += 0.5*f(x_i)
            //   p(x_i) = exp(F(x_i))/(exp(F(x_i)) + exp(-F(x_i))) = 1/(1+exp(-2*F(x_i)))
            //   reuse weak_eval: weak_eval[i] <- p(x_i)
            //   w_i = p(x_i)*(1 - p(x_i))
            //   z_i = ((y_i+1)/2 - p(x_i))/(p(x_i)*(1 - p(x_i)))
            //   store z_i to the data->data_root as the new target responses

            const double lb_weight_thresh = FLT_EPSILON;
            const double lb_z_max = 10.;
            /*float* responses_buf = data->get_resp_float_buf();
            const float* responses = 0;
            data->get_ord_responses(data->data_root, responses_buf, &responses);*/

            for( i = 0; i < n; i++ )
            {
                double s = sum_response->data.db[i] + 0.5*weak_eval->data.db[i];
                sum_response->data.db[i] = s;
                weak_eval->data.db[i] = -2*s;
            }

            cvExp( weak_eval, weak_eval );

            for( i = 0; i < n; i++ )
            {
                double p = 1./(1. + weak_eval->data.db[i]);
                double w = p*(1 - p), z;
                w = MAX( w, lb_weight_thresh );
                weights->data.db[i] = w;
                sumw += w;
                if( orig_response->data.i[i] > 0 )
                {
                    z = 1./p;
                    fdata[sample_idx[i]*step] = (float)MIN(z, lb_z_max);
                }
                else
                {
                    z = 1./(1-p);
                    fdata[sample_idx[i]*step] = (float)-MIN(z, lb_z_max);
                }
            }
        }
        else
        {
            // Gentle AdaBoost:
            //   weak_eval[i] = f(x_i) in [-1,1]
            //   w_i *= exp(-y_i*f(x_i))
            assert( params.boost_type == GENTLE );

            for( i = 0; i < n; i++ )
                weak_eval->data.db[i] *= -orig_response->data.i[i];

            cvExp( weak_eval, weak_eval );

            for( i = 0; i < n; i++ )
            {
                double w = weights->data.db[i] * weak_eval->data.db[i];
                weights->data.db[i] = w;
                sumw += w;
            }
        }
    }

    // renormalize the weights
    if( sumw > FLT_EPSILON )
    {
        sumw = 1./sumw;
        for( i = 0; i < n; ++i )
            weights->data.db[i] *= sumw;
    }

    __END__;
}
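
// Worked example for the Discrete AdaBoost branch (added for clarity): if the
// weighted error of the new tree is err = 0.3, then C = log(0.7/0.3) ~ 0.847,
// the tree's {-1,1} leaf values are scaled by C via tree->scale(C), and every
// misclassified sample has its weight multiplied by exp(C) = 7/3 ~ 2.33
// before the whole weight vector is renormalized to sum to 1.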


void
CvBoost::trim_weights()
{
    //CV_FUNCNAME( "CvBoost::trim_weights" );

    __BEGIN__;

    int i, count = data->sample_count, nz_count = 0;
    double sum, threshold;

    if( params.weight_trim_rate <= 0. || params.weight_trim_rate >= 1. )
        EXIT;

    // use weak_eval as temporary buffer for sorted weights
    cvCopy( weights, weak_eval );

    std::sort(weak_eval->data.db, weak_eval->data.db + count);

    // as weight trimming occurs immediately after updating the weights,
    // where they are renormalized, we assume that the weight sum = 1.
    sum = 1. - params.weight_trim_rate;

    for( i = 0; i < count; i++ )
    {
        double w = weak_eval->data.db[i];
        if( sum <= 0 )
            break;
        sum -= w;
    }

    threshold = i < count ? weak_eval->data.db[i] : DBL_MAX;

    for( i = 0; i < count; i++ )
    {
        double w = weights->data.db[i];
        int f = w >= threshold;
        subsample_mask->data.ptr[i] = (uchar)f;
        nz_count += f;
    }

    have_subsample = nz_count < count;

    __END__;
}
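
// Note (added for clarity): with weight_trim_rate = 0.95 the loop above walks
// the sorted weights from the smallest and "spends" the bottom 5% of the
// total mass; the first weight it cannot afford becomes the threshold, so the
// samples kept active always carry at least 95% of the total weight.  Trimmed
// samples are only skipped when training the next tree; their weights are
// preserved, so they can become active again in a later round.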


const CvMat*
CvBoost::get_active_vars( bool absolute_idx )
{
    CvMat* mask = 0;
    CvMat* inv_map = 0;
    CvMat* result = 0;

    CV_FUNCNAME( "CvBoost::get_active_vars" );

    __BEGIN__;

    if( !weak )
        CV_ERROR( CV_StsError, "The boosted tree ensemble has not been trained yet" );

    if( !active_vars || !active_vars_abs )
    {
        CvSeqReader reader;
        int i, j, nactive_vars;
        CvBoostTree* wtree;
        const CvDTreeNode* node;

        assert(!active_vars && !active_vars_abs);
        mask = cvCreateMat( 1, data->var_count, CV_8U );
        inv_map = cvCreateMat( 1, data->var_count, CV_32S );
        cvZero( mask );
        cvSet( inv_map, cvScalar(-1) );

        // first pass: compute the mask of used variables
        cvStartReadSeq( weak, &reader );
        for( i = 0; i < weak->total; i++ )
        {
            CV_READ_SEQ_ELEM(wtree, reader);

            node = wtree->get_root();
            assert( node != 0 );
            for(;;)
            {
                const CvDTreeNode* parent;
                for(;;)
                {
                    CvDTreeSplit* split = node->split;
                    for( ; split != 0; split = split->next )
                        mask->data.ptr[split->var_idx] = 1;
                    if( !node->left )
                        break;
                    node = node->left;
                }

                for( parent = node->parent; parent && parent->right == node;
                     node = parent, parent = parent->parent )
                    ;

                if( !parent )
                    break;

                node = parent->right;
            }
        }

        nactive_vars = cvCountNonZero(mask);

        //if ( nactive_vars > 0 )
        {
            active_vars = cvCreateMat( 1, nactive_vars, CV_32S );
            active_vars_abs = cvCreateMat( 1, nactive_vars, CV_32S );

            have_active_cat_vars = false;

            for( i = j = 0; i < data->var_count; i++ )
            {
                if( mask->data.ptr[i] )
                {
                    active_vars->data.i[j] = i;
                    active_vars_abs->data.i[j] = data->var_idx ? data->var_idx->data.i[i] : i;
                    inv_map->data.i[i] = j;
                    if( data->var_type->data.i[i] >= 0 )
                        have_active_cat_vars = true;
                    j++;
                }
            }

            // second pass: now compute the condensed indices
            cvStartReadSeq( weak, &reader );
            for( i = 0; i < weak->total; i++ )
            {
                CV_READ_SEQ_ELEM(wtree, reader);
                node = wtree->get_root();
                for(;;)
                {
                    const CvDTreeNode* parent;
                    for(;;)
                    {
                        CvDTreeSplit* split = node->split;
                        for( ; split != 0; split = split->next )
                        {
                            split->condensed_idx = inv_map->data.i[split->var_idx];
                            assert( split->condensed_idx >= 0 );
                        }

                        if( !node->left )
                            break;
                        node = node->left;
                    }

                    for( parent = node->parent; parent && parent->right == node;
                         node = parent, parent = parent->parent )
                        ;

                    if( !parent )
                        break;

                    node = parent->right;
                }
            }
        }
    }

    result = absolute_idx ? active_vars_abs : active_vars;

    __END__;

    cvReleaseMat( &mask );
    cvReleaseMat( &inv_map );

    return result;
}


float
CvBoost::predict( const CvMat* _sample, const CvMat* _missing,
                  CvMat* weak_responses, CvSlice slice,
                  bool raw_mode, bool return_sum ) const
{
    float value = -FLT_MAX;

    CvSeqReader reader;
    double sum = 0;
    int wstep = 0;
    const float* sample_data;

    if( !weak )
        CV_Error( CV_StsError, "The boosted tree ensemble has not been trained yet" );

    if( !CV_IS_MAT(_sample) || CV_MAT_TYPE(_sample->type) != CV_32FC1 ||
        (_sample->cols != 1 && _sample->rows != 1) ||
        (_sample->cols + _sample->rows - 1 != data->var_all && !raw_mode) ||
        (active_vars && _sample->cols + _sample->rows - 1 != active_vars->cols && raw_mode) )
        CV_Error( CV_StsBadArg,
            "the input sample must be 1d floating-point vector with the same "
            "number of elements as the total number of variables or "
            "as the number of variables used for training" );

    if( _missing )
    {
        if( !CV_IS_MAT(_missing) || !CV_IS_MASK_ARR(_missing) ||
            !CV_ARE_SIZES_EQ(_missing, _sample) )
            CV_Error( CV_StsBadArg,
                "the missing data mask must be 8-bit vector of the same size as input sample" );
    }

    int i, weak_count = cvSliceLength( slice, weak );
    if( weak_count >= weak->total )
    {
        weak_count = weak->total;
        slice.start_index = 0;
    }

    if( weak_responses )
    {
        if( !CV_IS_MAT(weak_responses) ||
            CV_MAT_TYPE(weak_responses->type) != CV_32FC1 ||
            (weak_responses->cols != 1 && weak_responses->rows != 1) ||
            weak_responses->cols + weak_responses->rows - 1 != weak_count )
            CV_Error( CV_StsBadArg,
                "The output matrix of weak classifier responses must be valid "
                "floating-point vector of the same number of components as the length of input slice" );
        wstep = CV_IS_MAT_CONT(weak_responses->type) ? 1 : weak_responses->step/sizeof(float);
    }

    int var_count = active_vars->cols;
    const int* vtype = data->var_type->data.i;
    const int* cmap = data->cat_map->data.i;
    const int* cofs = data->cat_ofs->data.i;

    cv::Mat sample = cv::cvarrToMat(_sample);
    cv::Mat missing;
    if( _missing )
        missing = cv::cvarrToMat(_missing);

    // if needed, preprocess the input vector
    if( !raw_mode )
    {
        int sstep, mstep = 0;
        const float* src_sample;
        const uchar* src_mask = 0;
        float* dst_sample;
        uchar* dst_mask;
        const int* vidx = active_vars->data.i;
        const int* vidx_abs = active_vars_abs->data.i;
        bool have_mask = _missing != 0;

        sample = cv::Mat(1, var_count, CV_32FC1);
        missing = cv::Mat(1, var_count, CV_8UC1);

        dst_sample = sample.ptr<float>();
        dst_mask = missing.ptr<uchar>();

        src_sample = _sample->data.fl;
        sstep = CV_IS_MAT_CONT(_sample->type) ? 1 : _sample->step/sizeof(src_sample[0]);

        if( _missing )
        {
            src_mask = _missing->data.ptr;
            mstep = CV_IS_MAT_CONT(_missing->type) ? 1 : _missing->step;
        }

        for( i = 0; i < var_count; i++ )
        {
            int idx = vidx[i], idx_abs = vidx_abs[i];
            float val = src_sample[idx_abs*sstep];
            int ci = vtype[idx];
            uchar m = src_mask ? src_mask[idx_abs*mstep] : (uchar)0;

            if( ci >= 0 )
            {
                int a = cofs[ci], b = (ci+1 >= data->cat_ofs->cols) ? data->cat_map->cols : cofs[ci+1],
                    c = a;
                int ival = cvRound(val);
                if ( (ival != val) && (!m) )
                    CV_Error( CV_StsBadArg,
                        "one of the input categorical variables is not an integer" );

                while( a < b )
                {
                    c = (a + b) >> 1;
                    if( ival < cmap[c] )
                        b = c;
                    else if( ival > cmap[c] )
                        a = c+1;
                    else
                        break;
                }

                if( c < 0 || ival != cmap[c] )
                {
                    m = 1;
                    have_mask = true;
                }
                else
                {
                    val = (float)(c - cofs[ci]);
                }
            }

            dst_sample[i] = val;
            dst_mask[i] = m;
        }

        if( !have_mask )
            missing.release();
    }
    else
    {
        if( !CV_IS_MAT_CONT(_sample->type & (_missing ? _missing->type : -1)) )
            CV_Error( CV_StsBadArg, "In raw mode the input vectors must be continuous" );
    }

    cvStartReadSeq( weak, &reader );
    cvSetSeqReaderPos( &reader, slice.start_index );

    sample_data = sample.ptr<float>();

    if( !have_active_cat_vars && missing.empty() && !weak_responses )
    {
        for( i = 0; i < weak_count; i++ )
        {
            CvBoostTree* wtree;
            const CvDTreeNode* node;
            CV_READ_SEQ_ELEM( wtree, reader );

            node = wtree->get_root();
            while( node->left )
            {
                CvDTreeSplit* split = node->split;
                int vi = split->condensed_idx;
                float val = sample_data[vi];
                int dir = val <= split->ord.c ? -1 : 1;
                if( split->inversed )
                    dir = -dir;
                node = dir < 0 ? node->left : node->right;
            }
            sum += node->value;
        }
    }
    else
    {
        const int* avars = active_vars->data.i;
        const uchar* m = !missing.empty() ? missing.ptr<uchar>() : 0;

        // full-featured version
        for( i = 0; i < weak_count; i++ )
        {
            CvBoostTree* wtree;
            const CvDTreeNode* node;
            CV_READ_SEQ_ELEM( wtree, reader );

            node = wtree->get_root();
            while( node->left )
            {
                const CvDTreeSplit* split = node->split;
                int dir = 0;
                for( ; !dir && split != 0; split = split->next )
                {
                    int vi = split->condensed_idx;
                    int ci = vtype[avars[vi]];
                    float val = sample_data[vi];
                    if( m && m[vi] )
                        continue;
                    if( ci < 0 ) // ordered
                        dir = val <= split->ord.c ? -1 : 1;
                    else // categorical
                    {
                        int c = cvRound(val);
                        dir = CV_DTREE_CAT_DIR(c, split->subset);
                    }
                    if( split->inversed )
                        dir = -dir;
                }

                if( !dir )
                {
                    int diff = node->right->sample_count - node->left->sample_count;
                    dir = diff < 0 ? -1 : 1;
                }
                node = dir < 0 ? node->left : node->right;
            }
            if( weak_responses )
                weak_responses->data.fl[i*wstep] = (float)node->value;
            sum += node->value;
        }
    }

    if( return_sum )
        value = (float)sum;
    else
    {
        int cls_idx = sum >= 0;
        if( raw_mode )
            value = (float)cls_idx;
        else
            value = (float)cmap[cofs[vtype[data->var_count]] + cls_idx];
    }

    return value;
}
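
// Illustrative usage sketch (not from the original source; assumes a trained
// CvBoost 'boost', a 1 x var_all CV_32FC1 row vector 'sample_mat', and the
// default arguments declared for predict() in the class header):
//
//     float label  = boost.predict( &sample_mat );   // predicted class label
//     float margin = boost.predict( &sample_mat, 0, 0, CV_WHOLE_SEQ,
//                                   false, true );   // raw sum of weak responses
//
// The sign of 'margin' determines the class; its magnitude can serve as a
// confidence measure.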

float CvBoost::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
{
    float err = 0;
    const CvMat* values = _data->get_values();
    const CvMat* response = _data->get_responses();
    const CvMat* missing = _data->get_missing();
    const CvMat* sample_idx = (type == CV_TEST_ERROR) ? _data->get_test_sample_idx() : _data->get_train_sample_idx();
    const CvMat* var_types = _data->get_var_types();
    int* sidx = sample_idx ? sample_idx->data.i : 0;
    int r_step = CV_IS_MAT_CONT(response->type) ?
        1 : response->step / CV_ELEM_SIZE(response->type);
    bool is_classifier = var_types->data.ptr[var_types->cols-1] == CV_VAR_CATEGORICAL;
    int sample_count = sample_idx ? sample_idx->cols : 0;
    sample_count = (type == CV_TRAIN_ERROR && sample_count == 0) ? values->rows : sample_count;
    float* pred_resp = 0;
    if( resp && (sample_count > 0) )
    {
        resp->resize( sample_count );
        pred_resp = &((*resp)[0]);
    }
    if ( is_classifier )
    {
        for( int i = 0; i < sample_count; i++ )
        {
            CvMat sample, miss;
            int si = sidx ? sidx[i] : i;
            cvGetRow( values, &sample, si );
            if( missing )
                cvGetRow( missing, &miss, si );
            float r = (float)predict( &sample, missing ? &miss : 0 );
            if( pred_resp )
                pred_resp[i] = r;
            int d = fabs((double)r - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1;
            err += d;
        }
        err = sample_count ? err / (float)sample_count * 100 : -FLT_MAX;
    }
    else
    {
        for( int i = 0; i < sample_count; i++ )
        {
            CvMat sample, miss;
            int si = sidx ? sidx[i] : i;
            cvGetRow( values, &sample, si );
            if( missing )
                cvGetRow( missing, &miss, si );
            float r = (float)predict( &sample, missing ? &miss : 0 );
            if( pred_resp )
                pred_resp[i] = r;
            float d = r - response->data.fl[si*r_step];
            err += d*d;
        }
        err = sample_count ? err / (float)sample_count : -FLT_MAX;
    }
    return err;
}

void CvBoost::write_params( CvFileStorage* fs ) const
{
    const char* boost_type_str =
        params.boost_type == DISCRETE ? "DiscreteAdaboost" :
        params.boost_type == REAL ? "RealAdaboost" :
        params.boost_type == LOGIT ? "LogitBoost" :
        params.boost_type == GENTLE ? "GentleAdaboost" : 0;

    const char* split_crit_str =
        params.split_criteria == DEFAULT ? "Default" :
        params.split_criteria == GINI ? "Gini" :
        params.split_criteria == MISCLASS ? "Misclassification" :
        params.split_criteria == SQERR ? "SquaredErr" : 0;

    if( boost_type_str )
        cvWriteString( fs, "boosting_type", boost_type_str );
    else
        cvWriteInt( fs, "boosting_type", params.boost_type );

    if( split_crit_str )
        cvWriteString( fs, "splitting_criteria", split_crit_str );
    else
        cvWriteInt( fs, "splitting_criteria", params.split_criteria );

    cvWriteInt( fs, "ntrees", weak->total );
    cvWriteReal( fs, "weight_trimming_rate", params.weight_trim_rate );

    data->write_params( fs );
}
1916
1917
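/* For reference, write_params() above emits entries along these lines in a
   YAML storage (the full file also contains the tree-data parameters written
   by CvDTreeTrainData::write_params; the values here are only an example):

       boosting_type: RealAdaboost
       splitting_criteria: Gini
       ntrees: 100
       weight_trimming_rate: 0.95
*/
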
void CvBoost::read_params( CvFileStorage* fs, CvFileNode* fnode )
{
    CV_FUNCNAME( "CvBoost::read_params" );

    __BEGIN__;

    CvFileNode* temp;

    if( !fnode || !CV_NODE_IS_MAP(fnode->tag) )
        return;

    data = new CvDTreeTrainData();
    CV_CALL( data->read_params(fs, fnode));
    data->shared = true;

    params.max_depth = data->params.max_depth;
    params.min_sample_count = data->params.min_sample_count;
    params.max_categories = data->params.max_categories;
    params.priors = data->params.priors;
    params.regression_accuracy = data->params.regression_accuracy;
    params.use_surrogates = data->params.use_surrogates;

    temp = cvGetFileNodeByName( fs, fnode, "boosting_type" );
    if( !temp )
        return;

    if( CV_NODE_IS_STRING(temp->tag) )
    {
        const char* boost_type_str = cvReadString( temp, "" );
        params.boost_type = strcmp( boost_type_str, "DiscreteAdaboost" ) == 0 ? DISCRETE :
                            strcmp( boost_type_str, "RealAdaboost" ) == 0 ? REAL :
                            strcmp( boost_type_str, "LogitBoost" ) == 0 ? LOGIT :
                            strcmp( boost_type_str, "GentleAdaboost" ) == 0 ? GENTLE : -1;
    }
    else
        params.boost_type = cvReadInt( temp, -1 );

    if( params.boost_type < DISCRETE || params.boost_type > GENTLE )
        CV_ERROR( CV_StsBadArg, "Unknown boosting type" );

    temp = cvGetFileNodeByName( fs, fnode, "splitting_criteria" );
    if( temp && CV_NODE_IS_STRING(temp->tag) )
    {
        const char* split_crit_str = cvReadString( temp, "" );
        params.split_criteria = strcmp( split_crit_str, "Default" ) == 0 ? DEFAULT :
                                strcmp( split_crit_str, "Gini" ) == 0 ? GINI :
                                strcmp( split_crit_str, "Misclassification" ) == 0 ? MISCLASS :
                                strcmp( split_crit_str, "SquaredErr" ) == 0 ? SQERR : -1;
    }
    else
        params.split_criteria = cvReadInt( temp, -1 );

    // validate split_criteria (not boost_type) against its own range
    if( params.split_criteria < DEFAULT || params.split_criteria > SQERR )
        CV_ERROR( CV_StsBadArg, "Unknown splitting criteria" );

    params.weak_count = cvReadIntByName( fs, fnode, "ntrees" );
    params.weight_trim_rate = cvReadRealByName( fs, fnode, "weight_trimming_rate", 0. );

    __END__;
}


void
CvBoost::read( CvFileStorage* fs, CvFileNode* node )
{
    CV_FUNCNAME( "CvBoost::read" );

    __BEGIN__;

    CvSeqReader reader;
    CvFileNode* trees_fnode;
    CvMemStorage* storage;
    int i, ntrees;

    clear();
    read_params( fs, node );

    if( !data )
        EXIT;

    trees_fnode = cvGetFileNodeByName( fs, node, "trees" );
    if( !trees_fnode || !CV_NODE_IS_SEQ(trees_fnode->tag) )
        CV_ERROR( CV_StsParseError, "<trees> tag is missing" );

    cvStartReadSeq( trees_fnode->data.seq, &reader );
    ntrees = trees_fnode->data.seq->total;

    if( ntrees != params.weak_count )
        CV_ERROR( CV_StsUnmatchedSizes,
        "The number of trees stored does not match <ntrees> tag value" );

    CV_CALL( storage = cvCreateMemStorage() );
    weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage );

    for( i = 0; i < ntrees; i++ )
    {
        CvBoostTree* tree = new CvBoostTree();
        CV_CALL(tree->read( fs, (CvFileNode*)reader.ptr, this, data ));
        CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
        cvSeqPush( weak, &tree );
    }
    get_active_vars();

    __END__;
}


void
CvBoost::write( CvFileStorage* fs, const char* name ) const
{
    CV_FUNCNAME( "CvBoost::write" );

    __BEGIN__;

    CvSeqReader reader;
    int i;

    cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_BOOSTING );

    if( !weak )
        CV_ERROR( CV_StsBadArg, "The classifier has not been trained yet" );

    write_params( fs );
    cvStartWriteStruct( fs, "trees", CV_NODE_SEQ );

    cvStartReadSeq( weak, &reader );

    for( i = 0; i < weak->total; i++ )
    {
        CvBoostTree* tree;
        CV_READ_SEQ_ELEM( tree, reader );
        cvStartWriteStruct( fs, 0, CV_NODE_MAP );
        tree->write( fs );
        cvEndWriteStruct( fs );
    }

    cvEndWriteStruct( fs );
    cvEndWriteStruct( fs );

    __END__;
}


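/* Usage sketch (illustrative only): saving a trained booster with write() and
   restoring it with read() through the C file-storage API. The file and node
   names are hypothetical.

       CvFileStorage* fs = cvOpenFileStorage( "boost.yml", 0, CV_STORAGE_WRITE );
       boost.write( fs, "my_boost" );
       cvReleaseFileStorage( &fs );

       CvBoost boost2;
       fs = cvOpenFileStorage( "boost.yml", 0, CV_STORAGE_READ );
       boost2.read( fs, cvGetFileNodeByName( fs, 0, "my_boost" ) );
       cvReleaseFileStorage( &fs );

   The higher-level CvStatModel::save()/load() helpers wrap this same pair. */
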
CvMat*
CvBoost::get_weights()
{
    return weights;
}


CvMat*
CvBoost::get_subtree_weights()
{
    return subtree_weights;
}


CvMat*
CvBoost::get_weak_response()
{
    return weak_eval;
}


const CvBoostParams&
CvBoost::get_params() const
{
    return params;
}

CvSeq* CvBoost::get_weak_predictors()
{
    return weak;
}

const CvDTreeTrainData* CvBoost::get_data() const
{
    return data;
}

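/* Usage sketch (illustrative only): get_weak_predictors() returns the
   internal CvSeq of CvBoostTree pointers, which can be walked with a
   CvSeqReader, e.g. to inspect every weak tree:

       CvSeq* trees = boost.get_weak_predictors();
       CvSeqReader reader;
       cvStartReadSeq( trees, &reader );
       for( int i = 0; i < trees->total; i++ )
       {
           CvBoostTree* tree;
           CV_READ_SEQ_ELEM( tree, reader );
           // ... examine tree->get_root(), etc. ...
       }
*/
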
using namespace cv;

CvBoost::CvBoost( const Mat& _train_data, int _tflag,
                  const Mat& _responses, const Mat& _var_idx,
                  const Mat& _sample_idx, const Mat& _var_type,
                  const Mat& _missing_mask,
                  CvBoostParams _params )
{
    weak = 0;
    data = 0;
    default_model_name = "my_boost_tree";
    active_vars = active_vars_abs = orig_response = sum_response = weak_eval =
        subsample_mask = weights = subtree_weights = 0;

    train( _train_data, _tflag, _responses, _var_idx, _sample_idx,
           _var_type, _missing_mask, _params );
}


bool
CvBoost::train( const Mat& _train_data, int _tflag,
                const Mat& _responses, const Mat& _var_idx,
                const Mat& _sample_idx, const Mat& _var_type,
                const Mat& _missing_mask,
                CvBoostParams _params, bool _update )
{
    train_data_hdr = cvMat(_train_data);
    train_data_mat = _train_data;
    responses_hdr = cvMat(_responses);
    responses_mat = _responses;

    CvMat vidx = cvMat(_var_idx), sidx = cvMat(_sample_idx), vtype = cvMat(_var_type), mmask = cvMat(_missing_mask);

    // empty Mat arguments become null pointers for the C-API train()
    return train(&train_data_hdr, _tflag, &responses_hdr, vidx.data.ptr ? &vidx : 0,
                 sidx.data.ptr ? &sidx : 0, vtype.data.ptr ? &vtype : 0,
                 mmask.data.ptr ? &mmask : 0, _params, _update);
}

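/* Usage sketch (illustrative only, all names hypothetical): training a Real
   AdaBoost ensemble of depth-1 trees on row samples stored in a cv::Mat. The
   last entry of var_type marks the response as categorical, i.e. a
   classification problem.

       Mat train_data( nsamples, nvars, CV_32F );   // one sample per row
       Mat responses( nsamples, 1, CV_32F );        // class labels
       // ... fill train_data and responses ...
       Mat var_type( nvars + 1, 1, CV_8U, Scalar(CV_VAR_NUMERICAL) );
       var_type.at<uchar>(nvars) = CV_VAR_CATEGORICAL;

       CvBoost boost;
       boost.train( train_data, CV_ROW_SAMPLE, responses, Mat(), Mat(),
                    var_type, Mat(),
                    CvBoostParams(CvBoost::REAL, 100, 0.95, 1, false, 0) );
*/
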
float
CvBoost::predict( const Mat& _sample, const Mat& _missing,
                  const Range& slice, bool raw_mode, bool return_sum ) const
{
    CvMat sample = cvMat(_sample), mmask = cvMat(_missing);
    /*if( weak_responses )
    {
        int weak_count = cvSliceLength( slice, weak );
        if( weak_count >= weak->total )
        {
            weak_count = weak->total;
            slice.start_index = 0;
        }

        if( !(weak_responses->data && weak_responses->type() == CV_32FC1 &&
            (weak_responses->cols == 1 || weak_responses->rows == 1) &&
            weak_responses->cols + weak_responses->rows - 1 == weak_count) )
            weak_responses->create(weak_count, 1, CV_32FC1);
        pwr = &(wr = *weak_responses);
    }*/
    return predict(&sample, _missing.empty() ? 0 : &mmask, 0,
                   slice == Range::all() ? CV_WHOLE_SEQ : cvSlice(slice.start, slice.end),
                   raw_mode, return_sum);
}

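/* Usage sketch (illustrative only): classifying one row of features with the
   cv::Mat overload above; the default arguments select all weak trees and
   return the mapped class label. "boost" and "train_data" are hypothetical.

       Mat sample = train_data.row(0);
       float label  = boost.predict( sample, Mat() );
       float margin = boost.predict( sample, Mat(), Range::all(), false, true );
*/
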
/* End of file. */