/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                        Intel License Agreement
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of Intel Corporation may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "old_ml_precomp.hpp"

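// Clamped logit: log(p/(1-p)) with p forced into [eps, 1-eps] so the
// result stays finite. Used below both for the Real AdaBoost leaf values
// (0.5*log_ratio(p)) and for the Discrete AdaBoost weak classifier
// coefficient C = log((1-err)/err) = -log_ratio(err).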
static inline double
log_ratio( double val )
{
    const double eps = 1e-5;

    val = MAX( val, eps );
    val = MIN( val, 1. - eps );
    return log( val/(1. - val) );
}


CvBoostParams::CvBoostParams()
{
    boost_type = CvBoost::REAL;
    weak_count = 100;
    weight_trim_rate = 0.95;
    cv_folds = 0;
    max_depth = 1;
}


CvBoostParams::CvBoostParams( int _boost_type, int _weak_count,
                                        double _weight_trim_rate, int _max_depth,
                                        bool _use_surrogates, const float* _priors )
{
    boost_type = _boost_type;
    weak_count = _weak_count;
    weight_trim_rate = _weight_trim_rate;
    split_criteria = CvBoost::DEFAULT;
    cv_folds = 0;
    max_depth = _max_depth;
    use_surrogates = _use_surrogates;
    priors = _priors;
}
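
// Illustrative usage sketch (not part of the library): with the
// constructor above, a typical parameter setup might look like
//
//     CvBoostParams params( CvBoost::GENTLE, /*weak_count=*/200,
//                           /*weight_trim_rate=*/0.95, /*max_depth=*/3,
//                           /*use_surrogates=*/false, /*priors=*/0 );
//
// while the default constructor yields Real AdaBoost with 100 depth-1
// trees (decision stumps).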



///////////////////////////////// CvBoostTree ///////////////////////////////////

CvBoostTree::CvBoostTree()
{
    ensemble = 0;
}


CvBoostTree::~CvBoostTree()
{
    clear();
}


void
CvBoostTree::clear()
{
    CvDTree::clear();
    ensemble = 0;
}


bool
CvBoostTree::train( CvDTreeTrainData* _train_data,
                    const CvMat* _subsample_idx, CvBoost* _ensemble )
{
    clear();
    ensemble = _ensemble;
    data = _train_data;
    data->shared = true;
    return do_train( _subsample_idx );
}


bool
CvBoostTree::train( const CvMat*, int, const CvMat*, const CvMat*,
                    const CvMat*, const CvMat*, const CvMat*, CvDTreeParams )
{
    assert(0);
    return false;
}


bool
CvBoostTree::train( CvDTreeTrainData*, const CvMat* )
{
    assert(0);
    return false;
}


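// Multiplies every node value in the tree by _scale. CvBoost uses this to
// fold the weak classifier coefficient into the tree itself: see the
// Discrete AdaBoost branch of CvBoost::update_weights(), which calls
// tree->scale( C ) with C = log((1-err)/err).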
void
CvBoostTree::scale( double _scale )
{
    CvDTreeNode* node = root;

    // traverse the tree and scale all the node values
    for(;;)
    {
        CvDTreeNode* parent;
        for(;;)
        {
            node->value *= _scale;
            if( !node->left )
                break;
            node = node->left;
        }

        for( parent = node->parent; parent && parent->right == node;
            node = parent, parent = parent->parent )
            ;

        if( !parent )
            break;

        node = parent->right;
    }
}


void
CvBoostTree::try_split_node( CvDTreeNode* node )
{
    CvDTree::try_split_node( node );

    if( !node->left )
    {
        // if the node has not been split,
        // store the responses for the corresponding training samples
        double* weak_eval = ensemble->get_weak_response()->data.db;
        cv::AutoBuffer<int> inn_buf(node->sample_count);
        const int* labels = data->get_cv_labels(node, inn_buf.data());
        int i, count = node->sample_count;
        double value = node->value;

        for( i = 0; i < count; i++ )
            weak_eval[labels[i]] = value;
    }
}


double
CvBoostTree::calc_node_dir( CvDTreeNode* node )
{
    char* dir = (char*)data->direction->data.ptr;
    const double* weights = ensemble->get_subtree_weights()->data.db;
    int i, n = node->sample_count, vi = node->split->var_idx;
    double L, R;

    assert( !node->split->inversed );

    if( data->get_var_type(vi) >= 0 ) // split on categorical var
    {
        cv::AutoBuffer<int> inn_buf(n);
        const int* cat_labels = data->get_cat_var_data(node, vi, inn_buf.data());
        const int* subset = node->split->subset;
        double sum = 0, sum_abs = 0;

        for( i = 0; i < n; i++ )
        {
            int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
            double w = weights[i];
            int d = idx >= 0 ? CV_DTREE_CAT_DIR(idx,subset) : 0;
            sum += d*w; sum_abs += (d & 1)*w;
            dir[i] = (char)d;
        }

        R = (sum_abs + sum) * 0.5;
        L = (sum_abs - sum) * 0.5;
    }
    else // split on ordered var
    {
        cv::AutoBuffer<uchar> inn_buf(2*n*sizeof(int)+n*sizeof(float));
        float* values_buf = (float*)inn_buf.data();
        int* sorted_indices_buf = (int*)(values_buf + n);
        int* sample_indices_buf = sorted_indices_buf + n;
        const float* values = 0;
        const int* sorted_indices = 0;
        data->get_ord_var_data( node, vi, values_buf, sorted_indices_buf, &values, &sorted_indices, sample_indices_buf );
        int split_point = node->split->ord.split_point;
        int n1 = node->get_num_valid(vi);

        assert( 0 <= split_point && split_point < n1-1 );
        L = R = 0;

        for( i = 0; i <= split_point; i++ )
        {
            int idx = sorted_indices[i];
            double w = weights[idx];
            dir[idx] = (char)-1;
            L += w;
        }

        for( ; i < n1; i++ )
        {
            int idx = sorted_indices[i];
            double w = weights[idx];
            dir[idx] = (char)1;
            R += w;
        }

        for( ; i < n; i++ )
            dir[sorted_indices[i]] = (char)0;
    }

    node->maxlr = MAX( L, R );
    return node->split->quality/(L + R);
}


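// Finds the best threshold split of an ordered variable for the 2-class
// problem: candidate split points are scanned in sorted order while the
// weighted class sums of the left/right branches are updated incrementally.
// The split quality is either the weighted Gini gain (default for Real
// AdaBoost) or the weight of correctly classified samples (the
// misclassification criterion, default for Discrete AdaBoost).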
CvDTreeSplit*
CvBoostTree::find_split_ord_class( CvDTreeNode* node, int vi, float init_quality,
                                    CvDTreeSplit* _split, uchar* _ext_buf )
{
    const float epsilon = FLT_EPSILON*2;

    const double* weights = ensemble->get_subtree_weights()->data.db;
    int n = node->sample_count;
    int n1 = node->get_num_valid(vi);

    cv::AutoBuffer<uchar> inn_buf;
    if( !_ext_buf )
        inn_buf.allocate(n*(3*sizeof(int)+sizeof(float)));
    uchar* ext_buf = _ext_buf ? _ext_buf : inn_buf.data();
    float* values_buf = (float*)ext_buf;
    int* sorted_indices_buf = (int*)(values_buf + n);
    int* sample_indices_buf = sorted_indices_buf + n;
    const float* values = 0;
    const int* sorted_indices = 0;
    data->get_ord_var_data( node, vi, values_buf, sorted_indices_buf, &values, &sorted_indices, sample_indices_buf );
    int* responses_buf = sorted_indices_buf + n;
    const int* responses = data->get_class_labels( node, responses_buf );
    const double* rcw0 = weights + n;
    double lcw[2] = {0,0}, rcw[2];
    int i, best_i = -1;
    double best_val = init_quality;
    int boost_type = ensemble->get_params().boost_type;
    int split_criteria = ensemble->get_params().split_criteria;

    rcw[0] = rcw0[0]; rcw[1] = rcw0[1];
    for( i = n1; i < n; i++ )
    {
        int idx = sorted_indices[i];
        double w = weights[idx];
        rcw[responses[idx]] -= w;
    }

    if( split_criteria != CvBoost::GINI && split_criteria != CvBoost::MISCLASS )
        split_criteria = boost_type == CvBoost::DISCRETE ? CvBoost::MISCLASS : CvBoost::GINI;

    if( split_criteria == CvBoost::GINI )
    {
        double L = 0, R = rcw[0] + rcw[1];
        double lsum2 = 0, rsum2 = rcw[0]*rcw[0] + rcw[1]*rcw[1];

        for( i = 0; i < n1 - 1; i++ )
        {
            int idx = sorted_indices[i];
            double w = weights[idx], w2 = w*w;
            double lv, rv;
            idx = responses[idx];
            L += w; R -= w;
            lv = lcw[idx]; rv = rcw[idx];
            lsum2 += 2*lv*w + w2;
            rsum2 -= 2*rv*w - w2;
            lcw[idx] = lv + w; rcw[idx] = rv - w;

            if( values[i] + epsilon < values[i+1] )
            {
                double val = (lsum2*R + rsum2*L)/(L*R);
                if( best_val < val )
                {
                    best_val = val;
                    best_i = i;
                }
            }
        }
    }
    else
    {
        for( i = 0; i < n1 - 1; i++ )
        {
            int idx = sorted_indices[i];
            double w = weights[idx];
            idx = responses[idx];
            lcw[idx] += w;
            rcw[idx] -= w;

            if( values[i] + epsilon < values[i+1] )
            {
                double val = lcw[0] + rcw[1], val2 = lcw[1] + rcw[0];
                val = MAX(val, val2);
                if( best_val < val )
                {
                    best_val = val;
                    best_i = i;
                }
            }
        }
    }

    CvDTreeSplit* split = 0;
    if( best_i >= 0 )
    {
        split = _split ? _split : data->new_split_ord( 0, 0.0f, 0, 0, 0.0f );
        split->var_idx = vi;
        split->ord.c = (values[best_i] + values[best_i+1])*0.5f;
        split->ord.split_point = best_i;
        split->inversed = 0;
        split->quality = (float)best_val;
    }
    return split;
}

template<typename T>
class LessThanPtr
{
public:
    bool operator()(T* a, T* b) const { return *a < *b; }
};

CvDTreeSplit*
CvBoostTree::find_split_cat_class( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf )
{
    int ci = data->get_var_type(vi);
    int n = node->sample_count;
    int mi = data->cat_count->data.i[ci];

    int base_size = (2*mi+3)*sizeof(double) + mi*sizeof(double*);
    cv::AutoBuffer<uchar> inn_buf(base_size);
    if( !_ext_buf)
        inn_buf.allocate( base_size + 2*n*sizeof(int) );
    uchar* base_buf = inn_buf.data();
    uchar* ext_buf = _ext_buf ? _ext_buf : base_buf + base_size;

    int* cat_labels_buf = (int*)ext_buf;
    const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf);
    int* responses_buf = cat_labels_buf + n;
    const int* responses = data->get_class_labels(node, responses_buf);
    double lcw[2]={0,0}, rcw[2]={0,0};

    double* cjk = (double*)cv::alignPtr(base_buf,sizeof(double))+2;
    const double* weights = ensemble->get_subtree_weights()->data.db;
    double** dbl_ptr = (double**)(cjk + 2*mi);
    int i, j, k, idx;
    double L = 0, R;
    double best_val = init_quality;
    int best_subset = -1, subset_i;
    int boost_type = ensemble->get_params().boost_type;
    int split_criteria = ensemble->get_params().split_criteria;

    // init array of counters:
    // c_{jk} - the total weight of samples that have the vi-th input variable equal to j and response equal to k.
    for( j = -1; j < mi; j++ )
        cjk[j*2] = cjk[j*2+1] = 0;

    for( i = 0; i < n; i++ )
    {
        double w = weights[i];
        j = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
        k = responses[i];
        cjk[j*2 + k] += w;
    }

    for( j = 0; j < mi; j++ )
    {
        rcw[0] += cjk[j*2];
        rcw[1] += cjk[j*2+1];
        dbl_ptr[j] = cjk + j*2 + 1;
    }

    R = rcw[0] + rcw[1];

    if( split_criteria != CvBoost::GINI && split_criteria != CvBoost::MISCLASS )
        split_criteria = boost_type == CvBoost::DISCRETE ? CvBoost::MISCLASS : CvBoost::GINI;

    // sort rows of c_jk by increasing c_j,1
    // (i.e. by the weight of samples in j-th category that belong to class 1)
    std::sort(dbl_ptr, dbl_ptr + mi, LessThanPtr<double>());

    for( subset_i = 0; subset_i < mi-1; subset_i++ )
    {
        idx = (int)(dbl_ptr[subset_i] - cjk)/2;
        const double* crow = cjk + idx*2;
        double w0 = crow[0], w1 = crow[1];
        double weight = w0 + w1;

        if( weight < FLT_EPSILON )
            continue;

        lcw[0] += w0; rcw[0] -= w0;
        lcw[1] += w1; rcw[1] -= w1;

        if( split_criteria == CvBoost::GINI )
        {
            double lsum2 = lcw[0]*lcw[0] + lcw[1]*lcw[1];
            double rsum2 = rcw[0]*rcw[0] + rcw[1]*rcw[1];

            L += weight;
            R -= weight;

            if( L > FLT_EPSILON && R > FLT_EPSILON )
            {
                double val = (lsum2*R + rsum2*L)/(L*R);
                if( best_val < val )
                {
                    best_val = val;
                    best_subset = subset_i;
                }
            }
        }
        else
        {
            double val = lcw[0] + rcw[1];
            double val2 = lcw[1] + rcw[0];

            val = MAX(val, val2);
            if( best_val < val )
            {
                best_val = val;
                best_subset = subset_i;
            }
        }
    }

    CvDTreeSplit* split = 0;
    if( best_subset >= 0 )
    {
        split = _split ? _split : data->new_split_cat( 0, -1.0f);
        split->var_idx = vi;
        split->quality = (float)best_val;
        memset( split->subset, 0, (data->max_c_count + 31)/32 * sizeof(int));
        for( i = 0; i <= best_subset; i++ )
        {
            idx = (int)(dbl_ptr[i] - cjk) >> 1;
            split->subset[idx >> 5] |= 1 << (idx & 31);
        }
    }
    return split;
}


CvDTreeSplit*
CvBoostTree::find_split_ord_reg( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf )
{
    const float epsilon = FLT_EPSILON*2;
    const double* weights = ensemble->get_subtree_weights()->data.db;
    int n = node->sample_count;
    int n1 = node->get_num_valid(vi);

    cv::AutoBuffer<uchar> inn_buf;
    if( !_ext_buf )
        inn_buf.allocate(2*n*(sizeof(int)+sizeof(float)));
    uchar* ext_buf = _ext_buf ? _ext_buf : inn_buf.data();

    float* values_buf = (float*)ext_buf;
    int* indices_buf = (int*)(values_buf + n);
    int* sample_indices_buf = indices_buf + n;
    const float* values = 0;
    const int* indices = 0;
    data->get_ord_var_data( node, vi, values_buf, indices_buf, &values, &indices, sample_indices_buf );
    float* responses_buf = (float*)(indices_buf + n);
    const float* responses = data->get_ord_responses( node, responses_buf, sample_indices_buf );

    int i, best_i = -1;
    double L = 0, R = weights[n];
    double best_val = init_quality, lsum = 0, rsum = node->value*R;

    // compensate for missing values
    for( i = n1; i < n; i++ )
    {
        int idx = indices[i];
        double w = weights[idx];
        rsum -= responses[idx]*w;
        R -= w;
    }

    // find the optimal split
    for( i = 0; i < n1 - 1; i++ )
    {
        int idx = indices[i];
        double w = weights[idx];
        double t = responses[idx]*w;
        L += w; R -= w;
        lsum += t; rsum -= t;

        if( values[i] + epsilon < values[i+1] )
        {
            double val = (lsum*lsum*R + rsum*rsum*L)/(L*R);
            if( best_val < val )
            {
                best_val = val;
                best_i = i;
            }
        }
    }

    CvDTreeSplit* split = 0;
    if( best_i >= 0 )
    {
        split = _split ? _split : data->new_split_ord( 0, 0.0f, 0, 0, 0.0f );
        split->var_idx = vi;
        split->ord.c = (values[best_i] + values[best_i+1])*0.5f;
        split->ord.split_point = best_i;
        split->inversed = 0;
        split->quality = (float)best_val;
    }
    return split;
}


CvDTreeSplit*
CvBoostTree::find_split_cat_reg( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf )
{
    const double* weights = ensemble->get_subtree_weights()->data.db;
    int ci = data->get_var_type(vi);
    int n = node->sample_count;
    int mi = data->cat_count->data.i[ci];
    int base_size = (2*mi+3)*sizeof(double) + mi*sizeof(double*);
    cv::AutoBuffer<uchar> inn_buf(base_size);
    if( !_ext_buf )
        inn_buf.allocate(base_size + n*(2*sizeof(int) + sizeof(float)));
    uchar* base_buf = inn_buf.data();
    uchar* ext_buf = _ext_buf ? _ext_buf : base_buf + base_size;

    int* cat_labels_buf = (int*)ext_buf;
    const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf);
    float* responses_buf = (float*)(cat_labels_buf + n);
    int* sample_indices_buf = (int*)(responses_buf + n);
    const float* responses = data->get_ord_responses(node, responses_buf, sample_indices_buf);

    double* sum = (double*)cv::alignPtr(base_buf,sizeof(double)) + 1;
    double* counts = sum + mi + 1;
    double** sum_ptr = (double**)(counts + mi);
    double L = 0, R = 0, best_val = init_quality, lsum = 0, rsum = 0;
    int i, best_subset = -1, subset_i;

    for( i = -1; i < mi; i++ )
        sum[i] = counts[i] = 0;

    // calculate sum response and weight of each category of the input var
    for( i = 0; i < n; i++ )
    {
        int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
        double w = weights[i];
        double s = sum[idx] + responses[i]*w;
        double nc = counts[idx] + w;
        sum[idx] = s;
        counts[idx] = nc;
    }

    // calculate average response in each category
    for( i = 0; i < mi; i++ )
    {
        R += counts[i];
        rsum += sum[i];
        sum[i] = fabs(counts[i]) > DBL_EPSILON ? sum[i]/counts[i] : 0;
        sum_ptr[i] = sum + i;
    }

    std::sort(sum_ptr, sum_ptr + mi, LessThanPtr<double>());

    // revert to the unnormalized sums
    // (there should be very little loss in accuracy)
    for( i = 0; i < mi; i++ )
        sum[i] *= counts[i];

    for( subset_i = 0; subset_i < mi-1; subset_i++ )
    {
        int idx = (int)(sum_ptr[subset_i] - sum);
        double ni = counts[idx];

        if( ni > FLT_EPSILON )
        {
            double s = sum[idx];
            lsum += s; L += ni;
            rsum -= s; R -= ni;

            if( L > FLT_EPSILON && R > FLT_EPSILON )
            {
                double val = (lsum*lsum*R + rsum*rsum*L)/(L*R);
                if( best_val < val )
                {
                    best_val = val;
                    best_subset = subset_i;
                }
            }
        }
    }

    CvDTreeSplit* split = 0;
    if( best_subset >= 0 )
    {
        split = _split ? _split : data->new_split_cat( 0, -1.0f);
        split->var_idx = vi;
        split->quality = (float)best_val;
        memset( split->subset, 0, (data->max_c_count + 31)/32 * sizeof(int));
        for( i = 0; i <= best_subset; i++ )
        {
            int idx = (int)(sum_ptr[i] - sum);
            split->subset[idx >> 5] |= 1 << (idx & 31);
        }
    }
    return split;
}


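// Surrogate splits: when the primary split variable of a sample is missing,
// a surrogate split on another variable decides the direction instead. The
// two functions below look for the surrogate that best agrees, by total
// sample weight, with the directions already assigned by the primary split;
// a candidate is kept only if it beats node->maxlr, the weight of the larger
// branch, since sending everything to the majority side achieves that much
// agreement trivially.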
CvDTreeSplit*
CvBoostTree::find_surrogate_split_ord( CvDTreeNode* node, int vi, uchar* _ext_buf )
{
    const float epsilon = FLT_EPSILON*2;
    int n = node->sample_count;
    cv::AutoBuffer<uchar> inn_buf;
    if( !_ext_buf )
        inn_buf.allocate(n*(2*sizeof(int)+sizeof(float)));
    uchar* ext_buf = _ext_buf ? _ext_buf : inn_buf.data();
    float* values_buf = (float*)ext_buf;
    int* indices_buf = (int*)(values_buf + n);
    int* sample_indices_buf = indices_buf + n;
    const float* values = 0;
    const int* indices = 0;
    data->get_ord_var_data( node, vi, values_buf, indices_buf, &values, &indices, sample_indices_buf );

    const double* weights = ensemble->get_subtree_weights()->data.db;
    const char* dir = (char*)data->direction->data.ptr;
    int n1 = node->get_num_valid(vi);
    // LL - the total weight of the samples that both the primary and the surrogate splits send to the left
    // LR - ... the primary split sends to the left and the surrogate split sends to the right
    // RL - ... the primary split sends to the right and the surrogate split sends to the left
    // RR - ... both send to the right
    int i, best_i = -1, best_inversed = 0;
    double best_val;
    double LL = 0, RL = 0, LR, RR;
    double worst_val = node->maxlr;
    double sum = 0, sum_abs = 0;
    best_val = worst_val;

    for( i = 0; i < n1; i++ )
    {
        int idx = indices[i];
        double w = weights[idx];
        int d = dir[idx];
        sum += d*w; sum_abs += (d & 1)*w;
    }

    // sum_abs = R + L; sum = R - L
    RR = (sum_abs + sum)*0.5;
    LR = (sum_abs - sum)*0.5;

    // initially, all the samples are sent to the right by the surrogate split;
    // LR of them are sent to the left by the primary split, and RR - to the right.
    // now iteratively compute LL, LR, RL and RR for every possible surrogate split value.
    for( i = 0; i < n1 - 1; i++ )
    {
        int idx = indices[i];
        double w = weights[idx];
        int d = dir[idx];

        if( d < 0 )
        {
            LL += w; LR -= w;
            if( LL + RR > best_val && values[i] + epsilon < values[i+1] )
            {
                best_val = LL + RR;
                best_i = i; best_inversed = 0;
            }
        }
        else if( d > 0 )
        {
            RL += w; RR -= w;
            if( RL + LR > best_val && values[i] + epsilon < values[i+1] )
            {
                best_val = RL + LR;
                best_i = i; best_inversed = 1;
            }
        }
    }

    return best_i >= 0 && best_val > node->maxlr ? data->new_split_ord( vi,
        (values[best_i] + values[best_i+1])*0.5f, best_i,
        best_inversed, (float)best_val ) : 0;
}


CvDTreeSplit*
CvBoostTree::find_surrogate_split_cat( CvDTreeNode* node, int vi, uchar* _ext_buf )
{
    const char* dir = (char*)data->direction->data.ptr;
    const double* weights = ensemble->get_subtree_weights()->data.db;
    int n = node->sample_count;
    int i, mi = data->cat_count->data.i[data->get_var_type(vi)];

    int base_size = (2*mi+3)*sizeof(double);
    cv::AutoBuffer<uchar> inn_buf(base_size);
    if( !_ext_buf )
        inn_buf.allocate(base_size + n*sizeof(int));
    uchar* ext_buf = _ext_buf ? _ext_buf : inn_buf.data();
    int* cat_labels_buf = (int*)ext_buf;
    const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf);

    // LL - the total weight of the samples that both the primary and the surrogate splits send to the left
    // LR - ... the primary split sends to the left and the surrogate split sends to the right
    // RL - ... the primary split sends to the right and the surrogate split sends to the left
    // RR - ... both send to the right
    CvDTreeSplit* split = data->new_split_cat( vi, 0 );
    double best_val = 0;
    double* lc = (double*)cv::alignPtr(cat_labels_buf + n, sizeof(double)) + 1;
    double* rc = lc + mi + 1;

    for( i = -1; i < mi; i++ )
        lc[i] = rc[i] = 0;

    // 1. for each category calculate the weight of samples
    // sent to the left (lc) and to the right (rc) by the primary split
    for( i = 0; i < n; i++ )
    {
        int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
        double w = weights[i];
        int d = dir[i];
        double sum = lc[idx] + d*w;
        double sum_abs = rc[idx] + (d & 1)*w;
        lc[idx] = sum; rc[idx] = sum_abs;
    }

    for( i = 0; i < mi; i++ )
    {
        double sum = lc[i];
        double sum_abs = rc[i];
        lc[i] = (sum_abs - sum) * 0.5;
        rc[i] = (sum_abs + sum) * 0.5;
    }

    // 2. now form the split.
    // in each category send all the samples to the same direction as majority
    for( i = 0; i < mi; i++ )
    {
        double lval = lc[i], rval = rc[i];
        if( lval > rval )
        {
            split->subset[i >> 5] |= 1 << (i & 31);
            best_val += lval;
        }
        else
            best_val += rval;
    }

    split->quality = (float)best_val;
    if( split->quality <= node->maxlr )
        cvSetRemoveByPtr( data->split_heap, split ), split = 0;

    return split;
}


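// Computes the node value and fills the per-node sample weights. For
// classification the value is the Discrete AdaBoost vote in {-1,1} or the
// Real AdaBoost log-ratio 0.5*log(p/(1-p)); for the regression trees used
// by LogitBoost and Gentle AdaBoost it is the weighted mean response, with
// the node risk renormalized to match the unweighted formula used in
// try_split_node.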
void
CvBoostTree::calc_node_value( CvDTreeNode* node )
{
    int i, n = node->sample_count;
    const double* weights = ensemble->get_weights()->data.db;
    cv::AutoBuffer<uchar> inn_buf(n*(sizeof(int) + ( data->is_classifier ? sizeof(int) : sizeof(int) + sizeof(float))));
    int* labels_buf = (int*)inn_buf.data();
    const int* labels = data->get_cv_labels(node, labels_buf);
    double* subtree_weights = ensemble->get_subtree_weights()->data.db;
    double rcw[2] = {0,0};
    int boost_type = ensemble->get_params().boost_type;

    if( data->is_classifier )
    {
        int* _responses_buf = labels_buf + n;
        const int* _responses = data->get_class_labels(node, _responses_buf);
        int m = data->get_num_classes();
        int* cls_count = data->counts->data.i;
        for( int k = 0; k < m; k++ )
            cls_count[k] = 0;

        for( i = 0; i < n; i++ )
        {
            int idx = labels[i];
            double w = weights[idx];
            int r = _responses[i];
            rcw[r] += w;
            cls_count[r]++;
            subtree_weights[i] = w;
        }

        node->class_idx = rcw[1] > rcw[0];

        if( boost_type == CvBoost::DISCRETE )
        {
            // ignore cat_map for responses and use {-1,1},
            // as the whole ensemble response is computed as sign(sum_i(weak_response_i))
            node->value = node->class_idx*2 - 1;
        }
        else
        {
            double p = rcw[1]/(rcw[0] + rcw[1]);
            assert( boost_type == CvBoost::REAL );

            // store log-ratio of the probability
            node->value = 0.5*log_ratio(p);
        }
    }
    else
    {
        // in case of a regression tree:
        //  * the node value is the weighted mean of the responses:
        //    sum_i(w_i*Y_i)/sum_i(w_i) over the samples in the node
        //  * the node risk is the weighted sum of squared errors: sum_i(w_i*(Y_i - <node_value>)^2)
        double sum = 0, sum2 = 0, iw;
        float* values_buf = (float*)(labels_buf + n);
        int* sample_indices_buf = (int*)(values_buf + n);
        const float* values = data->get_ord_responses(node, values_buf, sample_indices_buf);

        for( i = 0; i < n; i++ )
        {
            int idx = labels[i];
            double w = weights[idx]/*priors[values[i] > 0]*/;
            double t = values[i];
            rcw[0] += w;
            subtree_weights[i] = w;
            sum += t*w;
            sum2 += t*t*w;
        }

        iw = 1./rcw[0];
        node->value = sum*iw;
        node->node_risk = sum2 - (sum*iw)*sum;

        // renormalize the risk, as in try_split_node the unweighted formula
        // sqrt(risk)/n is used, rather than sqrt(risk)/sum(weights_i)
        node->node_risk *= n*iw*n*iw;
    }

    // store summary weights
    subtree_weights[n] = rcw[0];
    subtree_weights[n+1] = rcw[1];
}


void CvBoostTree::read( CvFileStorage* fs, CvFileNode* fnode, CvBoost* _ensemble, CvDTreeTrainData* _data )
{
    CvDTree::read( fs, fnode, _data );
    ensemble = _ensemble;
}

void CvBoostTree::read( CvFileStorage*, CvFileNode* )
{
    assert(0);
}

void CvBoostTree::read( CvFileStorage* _fs, CvFileNode* _node,
                        CvDTreeTrainData* _data )
{
    CvDTree::read( _fs, _node, _data );
}


/////////////////////////////////// CvBoost /////////////////////////////////////

CvBoost::CvBoost()
{
    data = 0;
    weak = 0;
    default_model_name = "my_boost_tree";

    active_vars = active_vars_abs = orig_response = sum_response = weak_eval =
        subsample_mask = weights = subtree_weights = 0;
    have_active_cat_vars = have_subsample = false;

    clear();
}


void CvBoost::prune( CvSlice slice )
{
    if( weak && weak->total > 0 )
    {
        CvSeqReader reader;
        int i, count = cvSliceLength( slice, weak );

        cvStartReadSeq( weak, &reader );
        cvSetSeqReaderPos( &reader, slice.start_index );

        for( i = 0; i < count; i++ )
        {
            CvBoostTree* w;
            CV_READ_SEQ_ELEM( w, reader );
            delete w;
        }

        cvSeqRemoveSlice( weak, slice );
    }
}


void CvBoost::clear()
{
    if( weak )
    {
        prune( CV_WHOLE_SEQ );
        cvReleaseMemStorage( &weak->storage );
    }
    if( data )
        delete data;
    weak = 0;
    data = 0;
    cvReleaseMat( &active_vars );
    cvReleaseMat( &active_vars_abs );
    cvReleaseMat( &orig_response );
    cvReleaseMat( &sum_response );
    cvReleaseMat( &weak_eval );
    cvReleaseMat( &subsample_mask );
    cvReleaseMat( &weights );
    cvReleaseMat( &subtree_weights );

    have_subsample = false;
}


CvBoost::~CvBoost()
{
    clear();
}


CvBoost::CvBoost( const CvMat* _train_data, int _tflag,
                  const CvMat* _responses, const CvMat* _var_idx,
                  const CvMat* _sample_idx, const CvMat* _var_type,
                  const CvMat* _missing_mask, CvBoostParams _params )
{
    weak = 0;
    data = 0;
    default_model_name = "my_boost_tree";

    active_vars = active_vars_abs = orig_response = sum_response = weak_eval =
        subsample_mask = weights = subtree_weights = 0;

    train( _train_data, _tflag, _responses, _var_idx, _sample_idx,
           _var_type, _missing_mask, _params );
}


bool
CvBoost::set_params( const CvBoostParams& _params )
{
    bool ok = false;

    CV_FUNCNAME( "CvBoost::set_params" );

    __BEGIN__;

    params = _params;
    if( params.boost_type != DISCRETE && params.boost_type != REAL &&
        params.boost_type != LOGIT && params.boost_type != GENTLE )
        CV_ERROR( CV_StsBadArg, "Unknown/unsupported boosting type" );

    params.weak_count = MAX( params.weak_count, 1 );
    params.weight_trim_rate = MAX( params.weight_trim_rate, 0. );
    params.weight_trim_rate = MIN( params.weight_trim_rate, 1. );
    if( params.weight_trim_rate < FLT_EPSILON )
        params.weight_trim_rate = 1.f;

    if( params.boost_type == DISCRETE &&
        params.split_criteria != GINI && params.split_criteria != MISCLASS )
        params.split_criteria = MISCLASS;
    if( params.boost_type == REAL &&
        params.split_criteria != GINI && params.split_criteria != MISCLASS )
        params.split_criteria = GINI;
    if( (params.boost_type == LOGIT || params.boost_type == GENTLE) &&
        params.split_criteria != SQERR )
        params.split_criteria = SQERR;

    ok = true;

    __END__;

    return ok;
}


bool
CvBoost::train( const CvMat* _train_data, int _tflag,
              const CvMat* _responses, const CvMat* _var_idx,
              const CvMat* _sample_idx, const CvMat* _var_type,
              const CvMat* _missing_mask,
              CvBoostParams _params, bool _update )
{
    bool ok = false;
    CvMemStorage* storage = 0;

    CV_FUNCNAME( "CvBoost::train" );

    __BEGIN__;

    int i;

    set_params( _params );

    cvReleaseMat( &active_vars );
    cvReleaseMat( &active_vars_abs );

    if( !_update || !data )
    {
        clear();
        data = new CvDTreeTrainData( _train_data, _tflag, _responses, _var_idx,
            _sample_idx, _var_type, _missing_mask, _params, true, true );

        if( data->get_num_classes() != 2 )
            CV_ERROR( CV_StsNotImplemented,
            "Boosted trees can only be used for 2-class classification." );
        CV_CALL( storage = cvCreateMemStorage() );
        weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage );
        storage = 0;
    }
    else
    {
        data->set_data( _train_data, _tflag, _responses, _var_idx,
            _sample_idx, _var_type, _missing_mask, _params, true, true, true );
    }

    if ( (_params.boost_type == LOGIT) || (_params.boost_type == GENTLE) )
        data->do_responses_copy();

    update_weights( 0 );

    for( i = 0; i < params.weak_count; i++ )
    {
        CvBoostTree* tree = new CvBoostTree;
        if( !tree->train( data, subsample_mask, this ) )
        {
            delete tree;
            break;
        }
        //cvCheckArr( get_weak_response());
        cvSeqPush( weak, &tree );
        update_weights( tree );
        trim_weights();
        if( cvCountNonZero(subsample_mask) == 0 )
            break;
    }

    if(weak->total > 0)
    {
        get_active_vars(); // recompute active_vars* maps and condensed_idx's in the splits.
        data->is_classifier = true;
        data->free_train_data();
        ok = true;
    }
    else
        clear();

    __END__;

    return ok;
}
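
// Illustrative usage sketch (not part of the library): a minimal train/predict
// round trip with this legacy API, assuming the default arguments declared for
// train() and predict() in the header, might look like
//
//     CvBoost boost;
//     CvBoostParams params( CvBoost::REAL, 100, 0.95, 1, false, 0 );
//     boost.train( train_data, CV_ROW_SAMPLE, responses,
//                  0, 0, 0, 0, params );         // CvMat* inputs
//     float label = boost.predict( sample, 0 );  // predicted class label
//
// where train_data, responses and sample are user-supplied CvMat*.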

bool CvBoost::train( CvMLData* _data,
             CvBoostParams _params,
             bool update )
{
    bool result = false;

    CV_FUNCNAME( "CvBoost::train" );

    __BEGIN__;

    const CvMat* values = _data->get_values();
    const CvMat* response = _data->get_responses();
    const CvMat* missing = _data->get_missing();
    const CvMat* var_types = _data->get_var_types();
    const CvMat* train_sidx = _data->get_train_sample_idx();
    const CvMat* var_idx = _data->get_var_idx();

    CV_CALL( result = train( values, CV_ROW_SAMPLE, response, var_idx,
        train_sidx, var_types, missing, _params, update ) );

    __END__;

    return result;
}

void CvBoost::initialize_weights(double (&p)[2])
{
    p[0] = 1.;
    p[1] = 1.;
}

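// update_weights( tree ) implements the weight-update step of the four
// supported boosting flavors (see the per-branch comments below). When
// called with tree == 0 it performs initialization instead: unit (or
// prior-scaled) sample weights, the {-1,1} response encoding, and, for
// LogitBoost and Gentle AdaBoost, the pseudo-response buffer used by the
// regression trees.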
void
CvBoost::update_weights( CvBoostTree* tree )
{
    CV_FUNCNAME( "CvBoost::update_weights" );

    __BEGIN__;

    int i, n = data->sample_count;
    double sumw = 0.;
    int step = 0;
    float* fdata = 0;
    int *sample_idx_buf;
    const int* sample_idx = 0;
    cv::AutoBuffer<uchar> inn_buf;
    size_t _buf_size = (params.boost_type == LOGIT) || (params.boost_type == GENTLE) ? (size_t)(data->sample_count)*sizeof(int) : 0;
    if( !tree )
        _buf_size += n*sizeof(int);
    else
    {
        if( have_subsample )
            _buf_size += data->get_length_subbuf()*(sizeof(float)+sizeof(uchar));
    }
    inn_buf.allocate(_buf_size);
    uchar* cur_buf_pos = inn_buf.data();

    if ( (params.boost_type == LOGIT) || (params.boost_type == GENTLE) )
    {
        step = CV_IS_MAT_CONT(data->responses_copy->type) ?
            1 : data->responses_copy->step / CV_ELEM_SIZE(data->responses_copy->type);
        fdata = data->responses_copy->data.fl;
        sample_idx_buf = (int*)cur_buf_pos;
        cur_buf_pos = (uchar*)(sample_idx_buf + data->sample_count);
        sample_idx = data->get_sample_indices( data->data_root, sample_idx_buf );
    }
    CvMat* dtree_data_buf = data->buf;
    size_t length_buf_row = data->get_length_subbuf();
    if( !tree ) // before training the first tree, initialize weights and other parameters
    {
        int* class_labels_buf = (int*)cur_buf_pos;
        cur_buf_pos = (uchar*)(class_labels_buf + n);
        const int* class_labels = data->get_class_labels(data->data_root, class_labels_buf);
        // in case of logitboost and gentle adaboost each weak tree is a regression tree,
        // so we need to convert class labels to floating-point values

        double w0 = 1./ n;
        double p[2] = { 1., 1. };
        initialize_weights(p);

        cvReleaseMat( &orig_response );
        cvReleaseMat( &sum_response );
        cvReleaseMat( &weak_eval );
        cvReleaseMat( &subsample_mask );
        cvReleaseMat( &weights );
        cvReleaseMat( &subtree_weights );

        CV_CALL( orig_response = cvCreateMat( 1, n, CV_32S ));
        CV_CALL( weak_eval = cvCreateMat( 1, n, CV_64F ));
        CV_CALL( subsample_mask = cvCreateMat( 1, n, CV_8U ));
        CV_CALL( weights = cvCreateMat( 1, n, CV_64F ));
        CV_CALL( subtree_weights = cvCreateMat( 1, n + 2, CV_64F ));

        if( data->have_priors )
        {
            // compute weight scale for each class from their prior probabilities
            int c1 = 0;
            for( i = 0; i < n; i++ )
                c1 += class_labels[i];
            p[0] = data->priors->data.db[0]*(c1 < n ? 1./(n - c1) : 0.);
            p[1] = data->priors->data.db[1]*(c1 > 0 ? 1./c1 : 0.);
            p[0] /= p[0] + p[1];
            p[1] = 1. - p[0];
        }

        if (data->is_buf_16u)
        {
            unsigned short* labels = (unsigned short*)(dtree_data_buf->data.s + data->data_root->buf_idx*length_buf_row +
                data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count);
            for( i = 0; i < n; i++ )
            {
                // save original categorical responses {0,1}, convert them to {-1,1}
                orig_response->data.i[i] = class_labels[i]*2 - 1;
                // make all the samples active at start.
                // later, trim_weights() may deactivate/reactivate some of them, if needed
                subsample_mask->data.ptr[i] = (uchar)1;
                // make all the initial weights the same.
                weights->data.db[i] = w0*p[class_labels[i]];
                // set the labels so that the weak tree learning procedure can look up
                // each sample's weight and where to store its response.
                labels[i] = (unsigned short)i;
            }
        }
        else
        {
            int* labels = dtree_data_buf->data.i + data->data_root->buf_idx*length_buf_row +
                data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count;

            for( i = 0; i < n; i++ )
            {
                // save original categorical responses {0,1}, convert them to {-1,1}
                orig_response->data.i[i] = class_labels[i]*2 - 1;
                // make all the samples active at start.
                // later, trim_weights() may deactivate/reactivate some of them, if needed
                subsample_mask->data.ptr[i] = (uchar)1;
                // make all the initial weights the same.
                weights->data.db[i] = w0*p[class_labels[i]];
                // set the labels so that the weak tree learning procedure can look up
                // each sample's weight and where to store its response.
                labels[i] = i;
            }
        }

        if( params.boost_type == LOGIT )
        {
            CV_CALL( sum_response = cvCreateMat( 1, n, CV_64F ));

            for( i = 0; i < n; i++ )
            {
                sum_response->data.db[i] = 0;
                fdata[sample_idx[i]*step] = orig_response->data.i[i] > 0 ? 2.f : -2.f;
            }

            // in case of logitboost each weak tree is a regression tree.
            // the target function values are recalculated for each of the trees
            data->is_classifier = false;
        }
        else if( params.boost_type == GENTLE )
        {
            for( i = 0; i < n; i++ )
                fdata[sample_idx[i]*step] = (float)orig_response->data.i[i];

            data->is_classifier = false;
        }
    }
    else
    {
        // at this point we know the responses for all the samples that participated
        // in training the most recent weak classifier; for the other samples we compute them now
        if( have_subsample )
        {
            float* values = (float*)cur_buf_pos;
            cur_buf_pos = (uchar*)(values + data->get_length_subbuf());
            uchar* missing = cur_buf_pos;
            cur_buf_pos = missing + data->get_length_subbuf() * (size_t)CV_ELEM_SIZE(data->buf->type);

            CvMat _sample, _mask;

            // invert the subsample mask
            cvXorS( subsample_mask, cvScalar(1.), subsample_mask );
            data->get_vectors( subsample_mask, values, missing, 0 );

            _sample = cvMat( 1, data->var_count, CV_32F );
            _mask = cvMat( 1, data->var_count, CV_8U );

            // run tree through all the non-processed samples
            for( i = 0; i < n; i++ )
                if( subsample_mask->data.ptr[i] )
                {
                    _sample.data.fl = values;
                    _mask.data.ptr = missing;
                    values += _sample.cols;
                    missing += _mask.cols;
                    weak_eval->data.db[i] = tree->predict( &_sample, &_mask, true )->value;
                }
        }

        // now update weights and other parameters for each type of boosting
        if( params.boost_type == DISCRETE )
        {
            // Discrete AdaBoost:
            //   weak_eval[i] (=f(x_i)) is in {-1,1}
            //   err = sum(w_i*(f(x_i) != y_i))/sum(w_i)
            //   C = log((1-err)/err)
            //   w_i *= exp(C*(f(x_i) != y_i))

            double C, err = 0.;
            double scale[] = { 1., 0. };

            for( i = 0; i < n; i++ )
            {
                double w = weights->data.db[i];
                sumw += w;
                err += w*(weak_eval->data.db[i] != orig_response->data.i[i]);
            }

            if( sumw != 0 )
                err /= sumw;
            C = err = -log_ratio( err );
            scale[1] = exp(err);

            sumw = 0;
            for( i = 0; i < n; i++ )
            {
                double w = weights->data.db[i]*
                    scale[weak_eval->data.db[i] != orig_response->data.i[i]];
                sumw += w;
                weights->data.db[i] = w;
            }

            tree->scale( C );
        }
        else if( params.boost_type == REAL )
        {
            // Real AdaBoost:
            //   weak_eval[i] = f(x_i) = 0.5*log(p(x_i)/(1-p(x_i))), p(x_i)=P(y=1|x_i)
            //   w_i *= exp(-y_i*f(x_i))

            for( i = 0; i < n; i++ )
                weak_eval->data.db[i] *= -orig_response->data.i[i];

            cvExp( weak_eval, weak_eval );

            for( i = 0; i < n; i++ )
            {
                double w = weights->data.db[i]*weak_eval->data.db[i];
                sumw += w;
                weights->data.db[i] = w;
            }
        }
        else if( params.boost_type == LOGIT )
        {
            // LogitBoost:
            //   weak_eval[i] = f(x_i) in [-z_max,z_max]
            //   sum_response = F(x_i).
            //   F(x_i) += 0.5*f(x_i)
            //   p(x_i) = exp(F(x_i))/(exp(F(x_i)) + exp(-F(x_i))) = 1/(1+exp(-2*F(x_i)))
            //   reuse weak_eval: weak_eval[i] <- p(x_i)
            //   w_i = p(x_i)*(1 - p(x_i))
            //   z_i = ((y_i+1)/2 - p(x_i))/(p(x_i)*(1 - p(x_i)))
            //   store z_i to the data->data_root as the new target responses

            const double lb_weight_thresh = FLT_EPSILON;
            const double lb_z_max = 10.;

            for( i = 0; i < n; i++ )
            {
                double s = sum_response->data.db[i] + 0.5*weak_eval->data.db[i];
                sum_response->data.db[i] = s;
                weak_eval->data.db[i] = -2*s;
            }

            cvExp( weak_eval, weak_eval );

            for( i = 0; i < n; i++ )
            {
                double p = 1./(1. + weak_eval->data.db[i]);
                double w = p*(1 - p), z;
                w = MAX( w, lb_weight_thresh );
                weights->data.db[i] = w;
                sumw += w;
                if( orig_response->data.i[i] > 0 )
                {
                    z = 1./p;
                    fdata[sample_idx[i]*step] = (float)MIN(z, lb_z_max);
                }
                else
                {
                    z = 1./(1-p);
                    fdata[sample_idx[i]*step] = (float)-MIN(z, lb_z_max);
                }
            }
        }
        else
        {
            // Gentle AdaBoost:
            //   weak_eval[i] = f(x_i) in [-1,1]
            //   w_i *= exp(-y_i*f(x_i))
            assert( params.boost_type == GENTLE );

            for( i = 0; i < n; i++ )
                weak_eval->data.db[i] *= -orig_response->data.i[i];

            cvExp( weak_eval, weak_eval );

            for( i = 0; i < n; i++ )
            {
                double w = weights->data.db[i] * weak_eval->data.db[i];
                weights->data.db[i] = w;
                sumw += w;
            }
        }
    }

    // renormalize weights
    if( sumw > FLT_EPSILON )
    {
        sumw = 1./sumw;
        for( i = 0; i < n; ++i )
            weights->data.db[i] *= sumw;
    }

    __END__;
}
1425 
1426 
1427 void
trim_weights()1428 CvBoost::trim_weights()
1429 {
1430     //CV_FUNCNAME( "CvBoost::trim_weights" );
1431 
1432     __BEGIN__;
1433 
1434     int i, count = data->sample_count, nz_count = 0;
1435     double sum, threshold;
1436 
1437     if( params.weight_trim_rate <= 0. || params.weight_trim_rate >= 1. )
1438         EXIT;
1439 
1440     // use weak_eval as temporary buffer for sorted weights
1441     cvCopy( weights, weak_eval );
1442 
1443     std::sort(weak_eval->data.db, weak_eval->data.db + count);
1444 
1445     // as weight trimming occurs immediately after updating the weights,
1446     // where they are renormalized, we assume that the weight sum = 1.
1447     sum = 1. - params.weight_trim_rate;
1448 
1449     for( i = 0; i < count; i++ )
1450     {
1451         double w = weak_eval->data.db[i];
1452         if( sum <= 0 )
1453             break;
1454         sum -= w;
1455     }
1456 
1457     threshold = i < count ? weak_eval->data.db[i] : DBL_MAX;
1458 
1459     for( i = 0; i < count; i++ )
1460     {
1461         double w = weights->data.db[i];
1462         int f = w >= threshold;
1463         subsample_mask->data.ptr[i] = (uchar)f;
1464         nz_count += f;
1465     }
1466 
1467     have_subsample = nz_count < count;
1468 
1469     __END__;
1470 }
1471 
1472 
1473 const CvMat*
get_active_vars(bool absolute_idx)1474 CvBoost::get_active_vars( bool absolute_idx )
1475 {
1476     CvMat* mask = 0;
1477     CvMat* inv_map = 0;
1478     CvMat* result = 0;
1479 
1480     CV_FUNCNAME( "CvBoost::get_active_vars" );
1481 
1482     __BEGIN__;
1483 
1484     if( !weak )
1485         CV_ERROR( CV_StsError, "The boosted tree ensemble has not been trained yet" );
1486 
1487     if( !active_vars || !active_vars_abs )
1488     {
1489         CvSeqReader reader;
1490         int i, j, nactive_vars;
1491         CvBoostTree* wtree;
1492         const CvDTreeNode* node;
1493 
1494         assert(!active_vars && !active_vars_abs);
1495         mask = cvCreateMat( 1, data->var_count, CV_8U );
1496         inv_map = cvCreateMat( 1, data->var_count, CV_32S );
1497         cvZero( mask );
1498         cvSet( inv_map, cvScalar(-1) );
1499 
1500         // first pass: compute the mask of used variables
1501         cvStartReadSeq( weak, &reader );
1502         for( i = 0; i < weak->total; i++ )
1503         {
1504             CV_READ_SEQ_ELEM(wtree, reader);
1505 
1506             node = wtree->get_root();
1507             assert( node != 0 );
            for(;;)
            {
                const CvDTreeNode* parent;
                for(;;)
                {
                    CvDTreeSplit* split = node->split;
                    for( ; split != 0; split = split->next )
                        mask->data.ptr[split->var_idx] = 1;
                    if( !node->left )
                        break;
                    node = node->left;
                }

                for( parent = node->parent; parent && parent->right == node;
                    node = parent, parent = parent->parent )
                    ;

                if( !parent )
                    break;

                node = parent->right;
            }
        }

        nactive_vars = cvCountNonZero(mask);

        //if ( nactive_vars > 0 )
        {
            active_vars = cvCreateMat( 1, nactive_vars, CV_32S );
            active_vars_abs = cvCreateMat( 1, nactive_vars, CV_32S );

            have_active_cat_vars = false;

            for( i = j = 0; i < data->var_count; i++ )
            {
                if( mask->data.ptr[i] )
                {
                    active_vars->data.i[j] = i;
                    active_vars_abs->data.i[j] = data->var_idx ? data->var_idx->data.i[i] : i;
                    inv_map->data.i[i] = j;
                    if( data->var_type->data.i[i] >= 0 )
                        have_active_cat_vars = true;
                    j++;
                }
            }

            // second pass: now compute the condensed indices
            cvStartReadSeq( weak, &reader );
            for( i = 0; i < weak->total; i++ )
            {
                CV_READ_SEQ_ELEM(wtree, reader);
                node = wtree->get_root();
                for(;;)
                {
                    const CvDTreeNode* parent;
                    for(;;)
                    {
                        CvDTreeSplit* split = node->split;
                        for( ; split != 0; split = split->next )
                        {
                            split->condensed_idx = inv_map->data.i[split->var_idx];
                            assert( split->condensed_idx >= 0 );
                        }

                        if( !node->left )
                            break;
                        node = node->left;
                    }

                    for( parent = node->parent; parent && parent->right == node;
                        node = parent, parent = parent->parent )
                        ;

                    if( !parent )
                        break;

                    node = parent->right;
                }
            }
        }
    }

    result = absolute_idx ? active_vars_abs : active_vars;

    __END__;

    cvReleaseMat( &mask );
    cvReleaseMat( &inv_map );

    return result;
}
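
/* active_vars stores the training-time indices of the variables actually used
   by the ensemble, active_vars_abs maps them back to the original columns of
   the user's data, and split->condensed_idx indexes into this compact set;
   this is what lets predict() work on a short var_count-element buffer
   instead of the full feature vector. */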


float
CvBoost::predict( const CvMat* _sample, const CvMat* _missing,
                  CvMat* weak_responses, CvSlice slice,
                  bool raw_mode, bool return_sum ) const
{
    float value = -FLT_MAX;

    CvSeqReader reader;
    double sum = 0;
    int wstep = 0;
    const float* sample_data;

    if( !weak )
        CV_Error( CV_StsError, "The boosted tree ensemble has not been trained yet" );

    if( !CV_IS_MAT(_sample) || CV_MAT_TYPE(_sample->type) != CV_32FC1 ||
        (_sample->cols != 1 && _sample->rows != 1) ||
        (_sample->cols + _sample->rows - 1 != data->var_all && !raw_mode) ||
        (active_vars && _sample->cols + _sample->rows - 1 != active_vars->cols && raw_mode) )
            CV_Error( CV_StsBadArg,
        "the input sample must be a 1d floating-point vector with the same "
        "number of elements as the total number of variables or "
        "as the number of variables used for training" );

    if( _missing )
    {
        if( !CV_IS_MAT(_missing) || !CV_IS_MASK_ARR(_missing) ||
            !CV_ARE_SIZES_EQ(_missing, _sample) )
            CV_Error( CV_StsBadArg,
            "the missing data mask must be an 8-bit vector of the same size as the input sample" );
    }

    int i, weak_count = cvSliceLength( slice, weak );
    if( weak_count >= weak->total )
    {
        weak_count = weak->total;
        slice.start_index = 0;
    }

    if( weak_responses )
    {
        if( !CV_IS_MAT(weak_responses) ||
            CV_MAT_TYPE(weak_responses->type) != CV_32FC1 ||
            (weak_responses->cols != 1 && weak_responses->rows != 1) ||
            weak_responses->cols + weak_responses->rows - 1 != weak_count )
            CV_Error( CV_StsBadArg,
            "The output matrix of weak classifier responses must be a valid "
            "floating-point vector with the same number of components as the length of the input slice" );
        wstep = CV_IS_MAT_CONT(weak_responses->type) ? 1 : weak_responses->step/sizeof(float);
    }

    int var_count = active_vars->cols;
    const int* vtype = data->var_type->data.i;
    const int* cmap = data->cat_map->data.i;
    const int* cofs = data->cat_ofs->data.i;

    cv::Mat sample = cv::cvarrToMat(_sample);
    cv::Mat missing;
    if( _missing )
        missing = cv::cvarrToMat(_missing);

    // if needed, preprocess the input vector
    if( !raw_mode )
    {
        int sstep, mstep = 0;
        const float* src_sample;
        const uchar* src_mask = 0;
        float* dst_sample;
        uchar* dst_mask;
        const int* vidx = active_vars->data.i;
        const int* vidx_abs = active_vars_abs->data.i;
        bool have_mask = _missing != 0;

        sample = cv::Mat(1, var_count, CV_32FC1);
        missing = cv::Mat(1, var_count, CV_8UC1);

        dst_sample = sample.ptr<float>();
        dst_mask = missing.ptr<uchar>();

        src_sample = _sample->data.fl;
        sstep = CV_IS_MAT_CONT(_sample->type) ? 1 : _sample->step/sizeof(src_sample[0]);

        if( _missing )
        {
            src_mask = _missing->data.ptr;
            mstep = CV_IS_MAT_CONT(_missing->type) ? 1 : _missing->step;
        }

        for( i = 0; i < var_count; i++ )
        {
            int idx = vidx[i], idx_abs = vidx_abs[i];
            float val = src_sample[idx_abs*sstep];
            int ci = vtype[idx];
            uchar m = src_mask ? src_mask[idx_abs*mstep] : (uchar)0;

            if( ci >= 0 )
            {
                int a = cofs[ci], b = (ci+1 >= data->cat_ofs->cols) ? data->cat_map->cols : cofs[ci+1],
                    c = a;
                int ival = cvRound(val);
                if ( (ival != val) && (!m) )
                    CV_Error( CV_StsBadArg,
                        "one of the input categorical variables is not an integer" );

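                // binary search for ival in the sorted slice cmap[a..b) of
                // the category dictionary for this variable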
                while( a < b )
                {
                    c = (a + b) >> 1;
                    if( ival < cmap[c] )
                        b = c;
                    else if( ival > cmap[c] )
                        a = c+1;
                    else
                        break;
                }

                if( c < 0 || ival != cmap[c] )
                {
                    m = 1;
                    have_mask = true;
                }
                else
                {
                    val = (float)(c - cofs[ci]);
                }
            }

            dst_sample[i] = val;
            dst_mask[i] = m;
        }

        if( !have_mask )
            missing.release();
    }
    else
    {
        if( !CV_IS_MAT_CONT(_sample->type & (_missing ? _missing->type : -1)) )
            CV_Error( CV_StsBadArg, "In raw mode the input vectors must be continuous" );
    }

    cvStartReadSeq( weak, &reader );
    cvSetSeqReaderPos( &reader, slice.start_index );

    sample_data = sample.ptr<float>();

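    // fast path: every split is on an ordered variable, no values are missing
    // and per-tree responses are not requested, so each tree can be descended
    // with plain threshold comparisons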
    if( !have_active_cat_vars && missing.empty() && !weak_responses )
    {
        for( i = 0; i < weak_count; i++ )
        {
            CvBoostTree* wtree;
            const CvDTreeNode* node;
            CV_READ_SEQ_ELEM( wtree, reader );

            node = wtree->get_root();
            while( node->left )
            {
                CvDTreeSplit* split = node->split;
                int vi = split->condensed_idx;
                float val = sample_data[vi];
                int dir = val <= split->ord.c ? -1 : 1;
                if( split->inversed )
                    dir = -dir;
                node = dir < 0 ? node->left : node->right;
            }
            sum += node->value;
        }
    }
    else
    {
        const int* avars = active_vars->data.i;
        const uchar* m = !missing.empty() ? missing.ptr<uchar>() : 0;

        // full-featured version
        for( i = 0; i < weak_count; i++ )
        {
            CvBoostTree* wtree;
            const CvDTreeNode* node;
            CV_READ_SEQ_ELEM( wtree, reader );

            node = wtree->get_root();
            while( node->left )
            {
                const CvDTreeSplit* split = node->split;
                int dir = 0;
                for( ; !dir && split != 0; split = split->next )
                {
                    int vi = split->condensed_idx;
                    int ci = vtype[avars[vi]];
                    float val = sample_data[vi];
                    if( m && m[vi] )
                        continue;
                    if( ci < 0 ) // ordered
                        dir = val <= split->ord.c ? -1 : 1;
                    else // categorical
                    {
                        int c = cvRound(val);
                        dir = CV_DTREE_CAT_DIR(c, split->subset);
                    }
                    if( split->inversed )
                        dir = -dir;
                }

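                // neither the primary nor any surrogate split could be
                // evaluated (the needed values are missing), so send the
                // sample towards the branch that saw more training samples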
                if( !dir )
                {
                    int diff = node->right->sample_count - node->left->sample_count;
                    dir = diff < 0 ? -1 : 1;
                }
                node = dir < 0 ? node->left : node->right;
            }
            if( weak_responses )
                weak_responses->data.fl[i*wstep] = (float)node->value;
            sum += node->value;
        }
    }

    if( return_sum )
        value = (float)sum;
    else
    {
        int cls_idx = sum >= 0;
        if( raw_mode )
            value = (float)cls_idx;
        else
            value = (float)cmap[cofs[vtype[data->var_count]] + cls_idx];
    }

    return value;
}
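
/* Usage sketch (illustrative, not part of the original file; the model file
   name, "boost" and "s" are hypothetical):

       CvBoost boost;
       boost.load( "boost_model.xml" );                  // previously trained model
       cv::Mat sample = cv::Mat::zeros( 1, nvars, CV_32F ); // nvars == var_all
       CvMat s = cvMat( sample );
       float label  = boost.predict( &s );               // predicted class label
       float margin = boost.predict( &s, 0, 0, CV_WHOLE_SEQ, false, true ); // raw sum
*/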

float CvBoost::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
{
    float err = 0;
    const CvMat* values = _data->get_values();
    const CvMat* response = _data->get_responses();
    const CvMat* missing = _data->get_missing();
    const CvMat* sample_idx = (type == CV_TEST_ERROR) ? _data->get_test_sample_idx() : _data->get_train_sample_idx();
    const CvMat* var_types = _data->get_var_types();
    int* sidx = sample_idx ? sample_idx->data.i : 0;
    int r_step = CV_IS_MAT_CONT(response->type) ?
                1 : response->step / CV_ELEM_SIZE(response->type);
    bool is_classifier = var_types->data.ptr[var_types->cols-1] == CV_VAR_CATEGORICAL;
    int sample_count = sample_idx ? sample_idx->cols : 0;
    sample_count = (type == CV_TRAIN_ERROR && sample_count == 0) ? values->rows : sample_count;
    float* pred_resp = 0;
    if( resp && (sample_count > 0) )
    {
        resp->resize( sample_count );
        pred_resp = &((*resp)[0]);
    }
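    // classification error is reported as the percentage of misclassified
    // samples; regression error is the mean squared difference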
    if ( is_classifier )
    {
        for( int i = 0; i < sample_count; i++ )
        {
            CvMat sample, miss;
            int si = sidx ? sidx[i] : i;
            cvGetRow( values, &sample, si );
            if( missing )
                cvGetRow( missing, &miss, si );
            float r = (float)predict( &sample, missing ? &miss : 0 );
            if( pred_resp )
                pred_resp[i] = r;
            int d = fabs((double)r - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1;
            err += d;
        }
        err = sample_count ? err / (float)sample_count * 100 : -FLT_MAX;
    }
    else
    {
        for( int i = 0; i < sample_count; i++ )
        {
            CvMat sample, miss;
            int si = sidx ? sidx[i] : i;
            cvGetRow( values, &sample, si );
            if( missing )
                cvGetRow( missing, &miss, si );
            float r = (float)predict( &sample, missing ? &miss : 0 );
            if( pred_resp )
                pred_resp[i] = r;
            float d = r - response->data.fl[si*r_step];
            err += d*d;
        }
        err = sample_count ? err / (float)sample_count : -FLT_MAX;
    }
    return err;
}

void CvBoost::write_params( CvFileStorage* fs ) const
{
    const char* boost_type_str =
        params.boost_type == DISCRETE ? "DiscreteAdaboost" :
        params.boost_type == REAL ? "RealAdaboost" :
        params.boost_type == LOGIT ? "LogitBoost" :
        params.boost_type == GENTLE ? "GentleAdaboost" : 0;

    const char* split_crit_str =
        params.split_criteria == DEFAULT ? "Default" :
        params.split_criteria == GINI ? "Gini" :
        params.split_criteria == MISCLASS ? "Misclassification" :
        params.split_criteria == SQERR ? "SquaredErr" : 0;

    if( boost_type_str )
        cvWriteString( fs, "boosting_type", boost_type_str );
    else
        cvWriteInt( fs, "boosting_type", params.boost_type );

    if( split_crit_str )
        cvWriteString( fs, "splitting_criteria", split_crit_str );
    else
        cvWriteInt( fs, "splitting_criteria", params.split_criteria );

    cvWriteInt( fs, "ntrees", weak->total );
    cvWriteReal( fs, "weight_trimming_rate", params.weight_trim_rate );

    data->write_params( fs );
}


void CvBoost::read_params( CvFileStorage* fs, CvFileNode* fnode )
{
    CV_FUNCNAME( "CvBoost::read_params" );

    __BEGIN__;

    CvFileNode* temp;

    if( !fnode || !CV_NODE_IS_MAP(fnode->tag) )
        return;

    data = new CvDTreeTrainData();
    CV_CALL( data->read_params(fs, fnode));
    data->shared = true;

    params.max_depth = data->params.max_depth;
    params.min_sample_count = data->params.min_sample_count;
    params.max_categories = data->params.max_categories;
    params.priors = data->params.priors;
    params.regression_accuracy = data->params.regression_accuracy;
    params.use_surrogates = data->params.use_surrogates;

    temp = cvGetFileNodeByName( fs, fnode, "boosting_type" );
    if( !temp )
        return;

    if( CV_NODE_IS_STRING(temp->tag) )
    {
        const char* boost_type_str = cvReadString( temp, "" );
        params.boost_type = strcmp( boost_type_str, "DiscreteAdaboost" ) == 0 ? DISCRETE :
                            strcmp( boost_type_str, "RealAdaboost" ) == 0 ? REAL :
                            strcmp( boost_type_str, "LogitBoost" ) == 0 ? LOGIT :
                            strcmp( boost_type_str, "GentleAdaboost" ) == 0 ? GENTLE : -1;
    }
    else
        params.boost_type = cvReadInt( temp, -1 );

    if( params.boost_type < DISCRETE || params.boost_type > GENTLE )
        CV_ERROR( CV_StsBadArg, "Unknown boosting type" );

    temp = cvGetFileNodeByName( fs, fnode, "splitting_criteria" );
    if( temp && CV_NODE_IS_STRING(temp->tag) )
    {
        const char* split_crit_str = cvReadString( temp, "" );
        params.split_criteria = strcmp( split_crit_str, "Default" ) == 0 ? DEFAULT :
                                strcmp( split_crit_str, "Gini" ) == 0 ? GINI :
                                strcmp( split_crit_str, "Misclassification" ) == 0 ? MISCLASS :
                                strcmp( split_crit_str, "SquaredErr" ) == 0 ? SQERR : -1;
    }
    else
        params.split_criteria = cvReadInt( temp, -1 );

    if( params.split_criteria < DEFAULT || params.split_criteria > SQERR )
        CV_ERROR( CV_StsBadArg, "Unknown splitting criteria" );

    params.weak_count = cvReadIntByName( fs, fnode, "ntrees" );
    params.weight_trim_rate = cvReadRealByName( fs, fnode, "weight_trimming_rate", 0. );

    __END__;
}


void
CvBoost::read( CvFileStorage* fs, CvFileNode* node )
{
    CV_FUNCNAME( "CvBoost::read" );

    __BEGIN__;

    CvSeqReader reader;
    CvFileNode* trees_fnode;
    CvMemStorage* storage;
    int i, ntrees;

    clear();
    read_params( fs, node );

    if( !data )
        EXIT;

    trees_fnode = cvGetFileNodeByName( fs, node, "trees" );
    if( !trees_fnode || !CV_NODE_IS_SEQ(trees_fnode->tag) )
        CV_ERROR( CV_StsParseError, "<trees> tag is missing" );

    cvStartReadSeq( trees_fnode->data.seq, &reader );
    ntrees = trees_fnode->data.seq->total;

    if( ntrees != params.weak_count )
        CV_ERROR( CV_StsUnmatchedSizes,
        "The number of trees stored does not match <ntrees> tag value" );

    CV_CALL( storage = cvCreateMemStorage() );
    weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage );

    for( i = 0; i < ntrees; i++ )
    {
        CvBoostTree* tree = new CvBoostTree();
        CV_CALL(tree->read( fs, (CvFileNode*)reader.ptr, this, data ));
        CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
        cvSeqPush( weak, &tree );
    }
    get_active_vars();

    __END__;
}
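
/* Loading sketch (illustrative; the file and node names are hypothetical):

       CvBoost boost;
       CvFileStorage* fs = cvOpenFileStorage( "boost_model.xml", 0, CV_STORAGE_READ );
       boost.read( fs, cvGetFileNodeByName( fs, 0, "my_boost_tree" ) );
       cvReleaseFileStorage( &fs );

   In practice the inherited CvStatModel::load() wraps this kind of call. */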


void
CvBoost::write( CvFileStorage* fs, const char* name ) const
{
    CV_FUNCNAME( "CvBoost::write" );

    __BEGIN__;

    CvSeqReader reader;
    int i;

    cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_BOOSTING );

    if( !weak )
        CV_ERROR( CV_StsBadArg, "The classifier has not been trained yet" );

    write_params( fs );
    cvStartWriteStruct( fs, "trees", CV_NODE_SEQ );

    cvStartReadSeq( weak, &reader );

    for( i = 0; i < weak->total; i++ )
    {
        CvBoostTree* tree;
        CV_READ_SEQ_ELEM( tree, reader );
        cvStartWriteStruct( fs, 0, CV_NODE_MAP );
        tree->write( fs );
        cvEndWriteStruct( fs );
    }

    cvEndWriteStruct( fs );
    cvEndWriteStruct( fs );

    __END__;
}


CvMat*
CvBoost::get_weights()
{
    return weights;
}


CvMat*
CvBoost::get_subtree_weights()
{
    return subtree_weights;
}


CvMat*
CvBoost::get_weak_response()
{
    return weak_eval;
}


const CvBoostParams&
CvBoost::get_params() const
{
    return params;
}


CvSeq* CvBoost::get_weak_predictors()
{
    return weak;
}


const CvDTreeTrainData* CvBoost::get_data() const
{
    return data;
}

using namespace cv;

CvBoost::CvBoost( const Mat& _train_data, int _tflag,
               const Mat& _responses, const Mat& _var_idx,
               const Mat& _sample_idx, const Mat& _var_type,
               const Mat& _missing_mask,
               CvBoostParams _params )
{
    weak = 0;
    data = 0;
    default_model_name = "my_boost_tree";
    active_vars = active_vars_abs = orig_response = sum_response = weak_eval =
        subsample_mask = weights = subtree_weights = 0;

    train( _train_data, _tflag, _responses, _var_idx, _sample_idx,
          _var_type, _missing_mask, _params );
}


bool
CvBoost::train( const Mat& _train_data, int _tflag,
               const Mat& _responses, const Mat& _var_idx,
               const Mat& _sample_idx, const Mat& _var_type,
               const Mat& _missing_mask,
               CvBoostParams _params, bool _update )
{
    train_data_hdr = cvMat(_train_data);
    train_data_mat = _train_data;
    responses_hdr = cvMat(_responses);
    responses_mat = _responses;

    CvMat vidx = cvMat(_var_idx), sidx = cvMat(_sample_idx), vtype = cvMat(_var_type), mmask = cvMat(_missing_mask);

    return train(&train_data_hdr, _tflag, &responses_hdr, vidx.data.ptr ? &vidx : 0,
          sidx.data.ptr ? &sidx : 0, vtype.data.ptr ? &vtype : 0,
          mmask.data.ptr ? &mmask : 0, _params, _update);
}

float
CvBoost::predict( const Mat& _sample, const Mat& _missing,
                  const Range& slice, bool raw_mode, bool return_sum ) const
{
    CvMat sample = cvMat(_sample), mmask = cvMat(_missing);
    /*if( weak_responses )
    {
        int weak_count = cvSliceLength( slice, weak );
        if( weak_count >= weak->total )
        {
            weak_count = weak->total;
            slice.start_index = 0;
        }

        if( !(weak_responses->data && weak_responses->type() == CV_32FC1 &&
              (weak_responses->cols == 1 || weak_responses->rows == 1) &&
              weak_responses->cols + weak_responses->rows - 1 == weak_count) )
            weak_responses->create(weak_count, 1, CV_32FC1);
        pwr = &(wr = *weak_responses);
    }*/
    return predict(&sample, _missing.empty() ? 0 : &mmask, 0,
                   slice == Range::all() ? CV_WHOLE_SEQ : cvSlice(slice.start, slice.end),
                   raw_mode, return_sum);
}
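
/* End-to-end sketch of the cv::Mat interface above (illustrative; the data,
   labels and parameter values are hypothetical):

       cv::Mat train_data( nsamples, nvars, CV_32F ); // one sample per row
       cv::Mat responses( nsamples, 1, CV_32F );      // class labels
       // ... fill train_data and responses ...
       CvBoostParams params( CvBoost::REAL, 100, 0.95, 1, false, 0 );
       CvBoost boost( train_data, CV_ROW_SAMPLE, responses,
                      cv::Mat(), cv::Mat(), cv::Mat(), cv::Mat(), params );
       float label = boost.predict( train_data.row(0), cv::Mat() );
*/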

/* End of file. */