1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : May 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* A Classification and Regression Tree (CART) Program */
37 /* A basic implementation of many of the techniques in */
38 /* Briemen et al. 1984 */
39 /* */
40 /* Added decision list support, Feb 1997 */
41 /* Added stepwise use of features, Oct 1997 */
42 /* */
43 /*=======================================================================*/
44
45 #include <cstdlib>
46 #include <iostream>
47 #include <fstream>
48 #include <cstring>
49 #include "EST_Token.h"
50 #include "EST_FMatrix.h"
51 #include "EST_multistats.h"
52 #include "EST_Wagon.h"
53 #include "EST_math.h"
54
/* Global state for the Wagon CART builder.  These are configured by the */
/* command-line driver before tree building starts.                      */

Discretes wgn_discretes;        // table of discrete (categorical) feature value sets

WDataSet wgn_dataset;           // training data
WDataSet wgn_test_dataset;      // optional test data (shares the training description)
EST_FMatrix wgn_DistMatrix;     // distance matrix for wndt_cluster predictees
EST_Track wgn_VertexTrack;      // vertex values for wndt_vector/wndt_trajectory predictees
EST_Track wgn_VertexFeats;      // per-channel flags selecting which vertex channels are used
EST_Track wgn_UnitTrack;        // per-unit (start,length) indices into wgn_VertexTrack

int wgn_min_cluster_size = 50;  // minimum number of samples allowed in a leaf
int wgn_held_out = 0;           // percentage of training data held out for pruning
int wgn_prune = TRUE;           // whether to prune the built tree
int wgn_quiet = FALSE;
int wgn_verbose = FALSE;
int wgn_count_field = -1;       // index of optional per-sample weight field (-1 = none)
EST_String wgn_count_field_name = "";
int wgn_predictee = 0;          // index of the field being predicted
EST_String wgn_predictee_name = "";
float wgn_float_range_split = 10;  // number of candidate splits tried per float feature
float wgn_balance = 0;          // if non-zero, min cluster size becomes n/wgn_balance
EST_String wgn_opt_param = "";  // selects which score the test functions return
EST_String wgn_vertex_output = "mean";
EST_String wgn_vertex_otype = "mean";

// Forward declarations for the static helpers defined below.
static float do_summary(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_float(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_class(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_cluster(WNode &tree,WDataSet &dataset, ostream *output);
static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output);
static int wagon_split(int margin,WNode &node);
static WQuestion find_best_question(WVectorVector &dset);
static void construct_binary_ques(int feat,WQuestion &test_ques);
static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds);
static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds);
static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in);
static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat);

Declare_TList_T(WVector *, WVectorP)

Declare_TVector_Base_T(WVector *,NULL,NULL,WVectorP)
97
98 #if defined(INSTANTIATE_TEMPLATES)
99 // Instantiate class
100 #include "../base_class/EST_TList.cc"
101 #include "../base_class/EST_TVector.cc"
102
103 Instantiate_TList_T(WVector *, WVectorP)
104
105 Instantiate_TVector(WVector *)
106
107 #endif
108
void wgn_load_datadescription(EST_String fname,LISP ignores)
{
    // Load the field description (names and types) for a data file into
    // both the training and test datasets; fields listed in ignores are
    // marked to be skipped.
    wgn_dataset.load_description(fname,ignores);
    wgn_test_dataset.load_description(fname,ignores);
}
115
void wgn_load_dataset(WDataSet &dataset,EST_String fname)
{
    // Read the data set from a filename. One vector per line.
    // Each field is parsed according to the previously loaded data
    // description: floats are range-checked, discrete values are mapped
    // to their index in the class, malformed lines abort via wagon_error.
    EST_TokenStream ts;
    WVector *v;
    int nvec=0,i;

    if (ts.open(fname) == -1)
        wagon_error(EST_String("unable to open data file \"")+
                    fname+"\"");
    // Treat every whitespace-separated token as a plain field value.
    ts.set_PunctuationSymbols("");
    ts.set_PrePunctuationSymbols("");
    ts.set_SingleCharSymbols("");

    for ( ;!ts.eof(); )
    {
        v = new WVector(dataset.width());
        i = 0;
        do
        {
            int type = dataset.ftype(i);
            if ((type == wndt_float) ||
                (type == wndt_ols) ||
                (wgn_count_field == i))
            {
                // need to ensure this is not NaN or Infinity
                float f = atof(ts.get().string());
                if (isfinite(f))
                    v->set_flt_val(i,f);
                else
                {
                    cout << fname << ": bad float " << f
                         << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    v->set_flt_val(i,0.0);  // substitute 0 so loading can continue
                }
            }
            else if (type == wndt_binary)
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_cluster)  /* index into distmatrix */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_vector)   /* index into VertexTrack */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_trajectory) /* index to index and length */
            {   /* a number pointing to a vector in UnitTrack that */
                /* has an idex into VertexTrack and a number of Vertices */
                /* Thus if its 15, UnitTrack.a(15,0) is the start frame in */
                /* VertexTrack and UnitTrack.a(15,1) is the number of */
                /* frames in the unit */
                v->set_int_val(i,atoi(ts.get().string()));
            }
            else if (type == wndt_ignore)
            {
                ts.get();  // skip it, but keep the field slot
                v->set_int_val(i,0);
            }
            else // should check the different classes
            {
                // Categorical: map the token to its index in the class.
                EST_String s = ts.get().string();
                int n = wgn_discretes.discrete(type).name(s);
                if (n == -1)
                {
                    cout << fname << ": bad value " << s << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    n = 0;  // fall back to the first class member
                }
                v->set_int_val(i,n);
            }
            i++;
        }
        while (!ts.eoln() && i<dataset.width());
        nvec ++;
        // A line must supply exactly dataset.width() fields.
        if (i != dataset.width())
        {
            wagon_error(fname+": data vector "+itoString(nvec)+" contains "
                        +itoString(i)+" parameters instead of "+
                        itoString(dataset.width()));
        }
        if (!ts.eoln())
        {
            cerr << fname << ": data vector " << nvec <<
                " contains too many parameters instead of "
                 << dataset.width() << endl;
            wagon_error(EST_String("extra parameter(s) from ")+
                        ts.peek().string());
        }
        dataset.append(v);
    }

    cout << "Dataset of " << dataset.samples() << " vectors of " <<
        dataset.width() << " parameters from: " << fname << endl;
    ts.close();
}
212
summary_results(WNode & tree,ostream * output)213 float summary_results(WNode &tree,ostream *output)
214 {
215 if (wgn_test_dataset.samples() != 0)
216 return do_summary(tree,wgn_test_dataset,output);
217 else
218 return do_summary(tree,wgn_dataset,output);
219 }
220
do_summary(WNode & tree,WDataSet & ds,ostream * output)221 static float do_summary(WNode &tree,WDataSet &ds,ostream *output)
222 {
223 if (wgn_dataset.ftype(wgn_predictee) == wndt_cluster)
224 return test_tree_cluster(tree,ds,output);
225 else if (wgn_dataset.ftype(wgn_predictee) == wndt_vector)
226 return test_tree_vector(tree,ds,output);
227 else if (wgn_dataset.ftype(wgn_predictee) == wndt_trajectory)
228 return test_tree_trajectory(tree,ds,output);
229 else if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
230 return test_tree_ols(tree,ds,output);
231 else if (wgn_dataset.ftype(wgn_predictee) >= wndt_class)
232 return test_tree_class(tree,ds,output);
233 else
234 return test_tree_float(tree,ds,output);
235 }
236
wgn_build_tree(float & score)237 WNode *wgn_build_tree(float &score)
238 {
239 // Build init node and split it while reducing the impurity
240 WNode *top = new WNode();
241 int margin = 0;
242
243 wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,TRUE);
244
245 margin = 0;
246 wagon_split(margin,*top); // recursively split data;
247
248 if (wgn_held_out > 0)
249 {
250 wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,FALSE);
251 top->held_out_prune();
252 }
253
254 if (wgn_prune)
255 top->prune();
256
257 score = summary_results(*top,0);
258
259 return top;
260 }
261
wgn_set_up_data(WVectorVector & data,const WVectorList & ds,int held_out,int in)262 static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in)
263 {
264 // Set data ommitting held_out percent if in is true
265 // or only including 100-held_out percent if in is false
266 int i,j;
267 EST_Litem *d;
268
269 // Make it definitely big enough
270 data.resize(ds.length());
271
272 for (j=i=0,d=ds.head(); d != 0; d=d->next(),j++)
273 {
274 if ((in) && ((j%100) >= held_out))
275 data[i++] = ds(d);
276 // else if ((!in) && ((j%100 < held_out)))
277 // data[i++] = ds(d);
278 else if (!in)
279 data[i++] = ds(d);
280 // if ((in) && (j < held_out))
281 // data[i++] = ds(d);
282 // else if ((!in) && (j >=held_out))
283 // data[i++] = ds(d);
284 }
285 // make it the actual size, but don't reset values
286 data.resize(i,1);
287 }
288
static float test_tree_class(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results (CLASS predictee).
    // For every vector: predict a class from the tree leaf, accumulate
    // accuracy, the entropy of the true classes under the leaf
    // distributions, a confusion matrix, and (when wgn_opt_param is
    // "B_NB_F1") precision/recall counts for the "B" class.  Returns a
    // "bigger is better" score selected by wgn_opt_param.
    EST_StrStr_KVL pairs;       // (real,predict) pairs for the confusion matrix
    EST_StrList lex;            // class names in discrete order
    EST_Litem *p;
    EST_String predict,real;
    WNode *pnode;
    double H=0,prob;            // H: sum of count-weighted log leaf probabilities
    int i,type;
    float correct=0,total=0, count=0;

    float bcorrect=0, bpredicted=0, bactual=0;
    float precision=0, recall=0;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        pnode = tree.predict_node((*dataset(p)));
        // Predicted class is the leaf impurity's most probable value.
        predict = (EST_String)pnode->get_impurity().value();
        // Per-sample weight from the count field, when one is configured.
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        prob = pnode->get_impurity().pd().probability(predict);
        H += (log(prob))*count;
        type = dataset.ftype(wgn_predictee);
        real = wgn_discretes[type].name(dataset(p)->get_int_val(wgn_predictee));

        if (wgn_opt_param == "B_NB_F1")
        {
            // Track actual/predicted/correct weights for the "B" class.
            if (real == "B")
                bactual +=count;
            if (predict == "B")
            {
                bpredicted += count;
                if (real == predict)
                    bcorrect += count;
            }
        }
        if (real == predict)
            correct += count;
        total += count;
        pairs.add_item(real,predict,1);
    }
    for (i=0; i<wgn_discretes[dataset.ftype(wgn_predictee)].length(); i++)
        lex.append(wgn_discretes[dataset.ftype(wgn_predictee)].name(i));

    const EST_FMatrix &m = confusion(pairs,lex);

    if (output != NULL)
    {
        print_confusion(m,pairs,lex);   // should be to output not stdout
        *output << ";; entropy " << (-1*(H/total)) << " perplexity " <<
            pow(2.0,(-1*(H/total))) << endl;
    }


    // Minus it so bigger is better
    if (wgn_opt_param == "entropy")
        return -pow(2.0,(-1*(H/total)));
    else if(wgn_opt_param == "B_NB_F1")
    {
        // F1 on the B class; degenerate cases count precision/recall as 1.
        if(bpredicted == 0)
            precision = 1;
        else
            precision = bcorrect/bpredicted;
        if(bactual == 0)
            recall = 1;
        else
            recall = bcorrect/bactual;
        float fmeasure = 0;
        if((precision+recall) !=0)
            fmeasure = 2* (precision*recall)/(precision+recall);
        cout<< "F1 :" << fmeasure << " Prec:" << precision << " Rec:" << recall << " B-Pred:" << bpredicted << " B-Actual:" << bactual << " B-Correct:" << bcorrect << endl;
        return fmeasure;
    }
    else
        return (float)correct/(float)total;
}
370
static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results VECTOR
    // distance is calculated in zscores (as the values in vector may
    // have quite different ranges).
    // For each sample and each selected channel, the prediction is the
    // mean of that channel over the leaf's member vertices; RMSE,
    // correlation and mean absolute error are accumulated over all
    // (sample,channel) pairs.
    WNode *leaf;
    EST_Litem *p;
    float predict, actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;    // correlation / error accumulators
    EST_SuffStats b;                    // per-leaf stats for one channel
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);  // index into wgn_VertexTrack
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)   // channel selected for use
            {
                // Mean of this channel over the leaf's member vertices.
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalized the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                // NOTE(review): the next line discards the z-score
                // normalisation computed just above (leftover
                // "awb_debug"), leaving the error in raw units --
                // confirm this is intended before removing.
                error = predict-actual; /* awb_debug */
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient:
    //   (E[xy] - E[x]E[y]) / (sd(x) * sd(y))
    // Because when the variation in X is very small we can go negative
    // (and the sqrt would raise an FPE), guard the variance product and
    // report zero correlation instead.
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/ sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout) // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean()); // * -1 so bigger is better
    else
        return cor; // should really be % variance, I think
}
458
static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results TRAJECTORY
    // distance is calculated in zscores (as the values in vector may
    // have quite different ranges)
    // NOT WRITTEN YET -- the body below is currently identical to
    // test_tree_vector (it treats the predictee as a plain vertex index
    // rather than going through wgn_UnitTrack).
    WNode *leaf;
    EST_Litem *p;
    float predict, actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;    // correlation / error accumulators
    EST_SuffStats b;                    // per-leaf stats for one channel
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);  // index into wgn_VertexTrack
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)   // channel selected for use
            {
                // Mean of this channel over the leaf's member vertices.
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalized the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                // NOTE(review): the next line discards the z-score
                // normalisation computed just above (leftover
                // "awb_debug"), leaving the error in raw units --
                // confirm this is intended before removing.
                error = predict-actual; /* awb_debug */
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient, guarded against
    // a non-positive variance product (which would FPE in sqrt) -- see
    // the same computation in test_tree_vector.
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/ sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout) // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean()); // * -1 so bigger is better
    else
        return cor; // should really be % variance, I think
}
547
test_tree_cluster(WNode & tree,WDataSet & dataset,ostream * output)548 static float test_tree_cluster(WNode &tree,WDataSet &dataset,ostream *output)
549 {
550 // Test tree against data to get summary of results for cluster trees
551 WNode *leaf;
552 int real;
553 int right_cluster=0;
554 EST_SuffStats ranking, meandist;
555 EST_Litem *p;
556
557 for (p=dataset.head(); p != 0; p=p->next())
558 {
559 leaf = tree.predict_node((*dataset(p)));
560 real = dataset(p)->get_int_val(wgn_predictee);
561 meandist += leaf->get_impurity().cluster_distance(real);
562 right_cluster += leaf->get_impurity().in_cluster(real);
563 ranking += leaf->get_impurity().cluster_ranking(real);
564 }
565
566 if (output != NULL)
567 {
568 // Want number in right class, mean distance in sds, mean ranking
569 if (output != &cout) // save in output file
570 *output << ";; Right cluster " << right_cluster << " (" <<
571 (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
572 "%) mean ranking " << ranking.mean() << " mean distance "
573 << meandist.mean() << endl;
574 cout << "Right cluster " << right_cluster << " (" <<
575 (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
576 "%) mean ranking " << ranking.mean() << " mean distance "
577 << meandist.mean() << endl;
578 }
579
580 return 10000-meandist.mean(); // this doesn't work but I tested it
581 }
582
static float test_tree_float(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results FLOAT.
    // Accumulates (optionally count-weighted) sufficient statistics to
    // report RMSE, Pearson correlation and mean absolute error between
    // the tree's prediction and the real value over every vector.
    EST_Litem *p;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;    // correlation / error accumulators
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        predict = tree.predict((*dataset(p)));
        real = dataset(p)->get_flt_val(wgn_predictee);
        // Per-sample weight from the count field, when one is configured.
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient:
    //   (E[xy] - E[x]E[y]) / (sd(x) * sd(y))
    // Because when the variation in X is very small we can go negative
    // (and the sqrt would raise an FPE), guard the variance product and
    // report zero correlation instead.
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/ sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout) // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean()); // * -1 so bigger is better
    else
        return cor; // should really be % variance, I think
}
647
static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results OLS.
    // NOTE(review): the per-leaf OLS prediction is not implemented here
    // (predict is hard-coded to 0.0 below), so the reported RMSE and
    // correlation measure predicting the constant 0 -- confirm whether
    // the OLS model should be applied at the leaf.
    EST_Litem *p;
    WNode *leaf;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;    // correlation / error accumulators
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        // do ols to get predict;
        predict = 0.0;   // placeholder: OLS prediction not yet computed
        real = dataset(p)->get_flt_val(wgn_predictee);
        // Per-sample weight from the count field, when one is configured.
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient, guarded against
    // a non-positive variance product (which would FPE in sqrt).
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/ sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout) // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean()); // * -1 so bigger is better
    else
        return cor; // should really be % variance, I think
}
715
static int wagon_split(int margin, WNode &node)
{
    // Split the given node (if possible) and recurse on the children.
    // A split is made only when the best question's score is valid
    // (below WGN_HUGE_VAL) and strictly better than the node's own
    // impurity.  Returns TRUE if the node was split.
    // margin controls the indentation of verbose trace output; it is
    // passed by value, so the ++/-- bookkeeping below only affects the
    // recursive calls' indentation, not the caller.
    WQuestion q;
    WNode *l,*r;

    node.set_impurity(WImpurity(node.get_data()));
    q = find_best_question(node.get_data());

    double impurity_measure = node.get_impurity().measure();
    double question_score = q.get_score();

    if ((question_score < WGN_HUGE_VAL) &&
        (question_score < impurity_measure))

    {
        // Ok its worth a split: partition the data between two new
        // children and record the question on this node.
        l = new WNode();
        r = new WNode();
        wgn_find_split(q,node.get_data(),l->get_data(),r->get_data());
        node.set_subnodes(l,r);
        node.set_question(q);
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << q << endl;
        }
        margin++;
        wagon_split(margin,*l);
        margin++;
        wagon_split(margin,*r);
        margin--;
        return TRUE;
    }
    else
    {
        // No worthwhile split: this node becomes a leaf.
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << "stopped samples: " << node.samples() << " impurity: "
                 << node.get_impurity() << endl;
        }
        margin--;
        return FALSE;
    }
}
770
wgn_find_split(WQuestion & q,WVectorVector & ds,WVectorVector & y,WVectorVector & n)771 void wgn_find_split(WQuestion &q,WVectorVector &ds,
772 WVectorVector &y,WVectorVector &n)
773 {
774 int i, iy, in;
775
776 y.resize(q.get_yes());
777 n.resize(q.get_no());
778
779 for (iy=in=i=0; i < ds.n(); i++)
780 if (q.ask(*ds(i)) == TRUE)
781 y[iy++] = ds(i);
782 else
783 n[in++] = ds(i);
784
785 }
786
find_best_question(WVectorVector & dset)787 static WQuestion find_best_question(WVectorVector &dset)
788 {
789 // Ask all possible questions and find the best one
790 int i;
791 float bscore,tscore;
792 WQuestion test_ques, best_ques;
793
794 bscore = tscore = WGN_HUGE_VAL;
795 best_ques.set_score(bscore);
796 // test each feature with each possible question
797 for (i=0;i < wgn_dataset.width(); i++)
798 {
799 if ((wgn_dataset.ignore(i) == TRUE) ||
800 (i == wgn_predictee))
801 tscore = WGN_HUGE_VAL; // ignore this feature this time
802 else if (wgn_dataset.ftype(i) == wndt_binary)
803 {
804 construct_binary_ques(i,test_ques);
805 tscore = wgn_score_question(test_ques,dset);
806 }
807 else if (wgn_dataset.ftype(i) == wndt_float)
808 {
809 tscore = construct_float_ques(i,test_ques,dset);
810 }
811 else if (wgn_dataset.ftype(i) == wndt_ignore)
812 tscore = WGN_HUGE_VAL; // always ignore this feature
813 #if 0
814 // This doesn't work reasonably
815 else if (wgn_csubset && (wgn_dataset.ftype(i) >= wndt_class))
816 {
817 wagon_error("subset selection temporarily deleted");
818 tscore = construct_class_ques_subset(i,test_ques,dset);
819 }
820 #endif
821 else if (wgn_dataset.ftype(i) >= wndt_class)
822 tscore = construct_class_ques(i,test_ques,dset);
823 if (tscore < bscore)
824 {
825 best_ques = test_ques;
826 best_ques.set_score(tscore);
827 bscore = tscore;
828 }
829 }
830
831 return best_ques;
832 }
833
construct_class_ques(int feat,WQuestion & ques,WVectorVector & ds)834 static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds)
835 {
836 // Find out which member of a class gives the best split
837 float tscore,bscore = WGN_HUGE_VAL;
838 int cl;
839 WQuestion test_q;
840
841 test_q.set_fp(feat);
842 test_q.set_oper(wnop_is);
843 ques = test_q;
844
845 for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
846 {
847 test_q.set_operand1(EST_Val(cl));
848 tscore = wgn_score_question(test_q,ds);
849 if (tscore < bscore)
850 {
851 ques = test_q;
852 bscore = tscore;
853 }
854 }
855
856 return bscore;
857 }
858
/* Disabled code: class-subset questions (Breiman et al. 1984, p.247)  */
/* and their score-ordering helper.  Kept for reference; see the note  */
/* at the corresponding branch in find_best_question.                  */
#if 0
static float construct_class_ques_subset(int feat,WQuestion &ques,
                                         WVectorVector &ds)
{
    // Find out which subset of a class gives the best split.
    // We first measure the subset of the data for each member of
    // of the class.  Then order those splits.  Then go through finding
    // where the best split of that ordered list is.  This is described
    // on page 247 of Breiman et al.
    float tscore,bscore = WGN_HUGE_VAL;
    LISP l;
    int cl;

    ques.set_fp(feat);
    ques.set_oper(wnop_is);
    float *scores = new float[wgn_discretes[wgn_dataset.ftype(feat)].length()];

    // Only do it for exists values
    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        ques.set_operand(flocons(cl));
        scores[cl] = wgn_score_question(ques,ds);
    }

    LISP order = sort_class_scores(feat,scores);
    if (order == NIL)
        return WGN_HUGE_VAL;
    if (siod_llength(order) == 1)
    {   // Only one so we know the best "split"
        ques.set_oper(wnop_is);
        ques.set_operand(car(order));
        return scores[get_c_int(car(order))];
    }

    ques.set_oper(wnop_in);
    LISP best_l = NIL;
    for (l=cdr(order); CDR(l) != NIL; l = cdr(l))
    {
        ques.set_operand(l);
        tscore = wgn_score_question(ques,ds);
        if (tscore < bscore)
        {
            best_l = l;
            bscore = tscore;
        }

    }

    if (best_l != NIL)
    {
        if (siod_llength(best_l) == 1)
        {   // Degenerate subset of one: use a plain "is" question.
            ques.set_oper(wnop_is);
            ques.set_operand(car(best_l));
        }
        else if (equal(cdr(order),best_l) != NIL)
        {   // Complement is a single member: invert to an "is" question.
            ques.set_oper(wnop_is);
            ques.set_operand(car(order));
        }
        else
        {
            cout << "Found a good subset" << endl;
            ques.set_operand(best_l);
        }
    }
    return bscore;
}

static LISP sort_class_scores(int feat,float *scores)
{
    // returns sorted list of (non WGN_HUGE_VAL) items,
    // ascending by score (insertion sort into a LISP list)
    int i;
    LISP items = NIL;
    LISP l;

    for (i=0; i < wgn_discretes[wgn_dataset.ftype(feat)].length(); i++)
    {
        if (scores[i] != WGN_HUGE_VAL)
        {
            if (items == NIL)
                items = cons(flocons(i),NIL);
            else
            {
                for (l=items; l != NIL; l=cdr(l))
                {
                    if (scores[i] < scores[get_c_int(car(l))])
                    {
                        CDR(l) = cons(car(l),cdr(l));
                        CAR(l) = flocons(i);
                        break;
                    }
                }
                if (l == NIL)
                    items = l_append(items,cons(flocons(i),NIL));
            }
        }
    }
    return items;
}
#endif
960
construct_float_ques(int feat,WQuestion & ques,WVectorVector & ds)961 static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds)
962 {
963 // Find out a split of the range that gives the best score
964 // Naively does this by partitioning the range into float_range_split slots
965 float tscore,bscore = WGN_HUGE_VAL;
966 int d, i;
967 float p;
968 WQuestion test_q;
969 float max,min,val,incr;
970
971 test_q.set_fp(feat);
972 test_q.set_oper(wnop_lessthan);
973 ques = test_q;
974
975 min = max = ds(0)->get_flt_val(feat); /* set up some value */
976 for (d=0; d < ds.n(); d++)
977 {
978 val = ds(d)->get_flt_val(feat);
979 if (val < min)
980 min = val;
981 else if (val > max)
982 max = val;
983 }
984 if (max == min) // we're pure
985 return WGN_HUGE_VAL;
986 incr = (max-min)/wgn_float_range_split;
987 // so do float_range-1 splits
988 /* We calculate this based on the number splits, not the increments, */
989 /* becuase incr can be so small it doesn't increment p */
990 for (i=0,p=min+incr; i < wgn_float_range_split; i++,p += incr )
991 {
992 test_q.set_operand1(EST_Val(p));
993 tscore = wgn_score_question(test_q,ds);
994 if (tscore < bscore)
995 {
996 ques = test_q;
997 bscore = tscore;
998 }
999 }
1000
1001 return bscore;
1002 }
1003
construct_binary_ques(int feat,WQuestion & test_ques)1004 static void construct_binary_ques(int feat,WQuestion &test_ques)
1005 {
1006 // construct a question. Not sure about this in general
1007 // of course continuous/categorical features will require different
1008 // rule and non-binary ones will require some test point
1009
1010 test_ques.set_fp(feat);
1011 test_ques.set_oper(wnop_binary);
1012 test_ques.set_operand1(EST_Val(""));
1013 }
1014
score_question_set(WQuestion & q,WVectorVector & ds,int ignorenth)1015 static float score_question_set(WQuestion &q, WVectorVector &ds, int ignorenth)
1016 {
1017 // score this question as a possible split by finding
1018 // the sum of the impurities when ds is split with this question
1019 WImpurity y,n;
1020 int d, num_yes, num_no;
1021 float count;
1022 WVector *wv;
1023
1024 num_yes = num_no = 0;
1025 y.data = &ds;
1026 n.data = &ds;
1027 for (d=0; d < ds.n(); d++)
1028 {
1029 if ((ignorenth < 2) ||
1030 (d%ignorenth != ignorenth-1))
1031 {
1032 wv = ds(d);
1033 if (wgn_count_field == -1)
1034 count = 1.0;
1035 else
1036 count = (*wv)[wgn_count_field];
1037
1038 if (q.ask(*wv) == TRUE)
1039 {
1040 num_yes++;
1041 if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
1042 y.cumulate(d,count); // note the sample number not value
1043 else
1044 y.cumulate((*wv)[wgn_predictee],count);
1045 }
1046 else
1047 {
1048 num_no++;
1049 if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
1050 n.cumulate(d,count); // note the sample number not value
1051 else
1052 n.cumulate((*wv)[wgn_predictee],count);
1053 }
1054 }
1055 }
1056
1057 q.set_yes(num_yes);
1058 q.set_no(num_no);
1059
1060 int min_cluster;
1061
1062 if ((wgn_balance == 0.0) ||
1063 (ds.n()/wgn_balance < wgn_min_cluster_size))
1064 min_cluster = wgn_min_cluster_size;
1065 else
1066 min_cluster = (int)(ds.n()/wgn_balance);
1067
1068 if ((y.samples() < min_cluster) ||
1069 (n.samples() < min_cluster))
1070 return WGN_HUGE_VAL;
1071
1072 float ym,nm,bm;
1073 // printf("awb_debug score_question_set X %f Y %f\n",
1074 // y.samples(), n.samples());
1075 ym = y.measure();
1076 nm = n.measure();
1077 bm = ym + nm;
1078
1079 /* cout << q << endl;
1080 printf("test question y %f n %f b %f\n",
1081 ym, nm, bm); */
1082
1083 return bm/2.0;
1084 }
1085
wgn_score_question(WQuestion & q,WVectorVector & ds)1086 float wgn_score_question(WQuestion &q, WVectorVector &ds)
1087 {
1088 // This level of indirection was introduced for later expansion
1089
1090 return score_question_set(q,ds,1);
1091 }
1092
wagon_stepwise(float limit)1093 WNode *wagon_stepwise(float limit)
1094 {
1095 // Find the best single features and incrementally add features
1096 // that best improve result until it doesn't improve.
1097 // This is basically to automate what Kurt was doing in building
1098 // trees, he then automated it in PERL and as it seemed to work
1099 // I put it into wagon itself.
1100 // This can be pretty computationally intensive.
1101 WNode *best = 0,*new_best = 0;
1102 float bscore,best_score = -WGN_HUGE_VAL;
1103 int best_feat,i;
1104 int nf = 1;
1105
1106 // Set all features to ignore
1107 for (i=0; i < wgn_dataset.width(); i++)
1108 wgn_dataset.set_ignore(i,TRUE);
1109
1110 for (i=0; i < wgn_dataset.width(); i++)
1111 {
1112 if ((wgn_dataset.ftype(i) == wndt_ignore) || (i == wgn_predictee))
1113 {
1114 // This skips the round not because this has anything to
1115 // do with this feature being (user specified) ignored
1116 // but because it indicates there is one less cycle that is
1117 // necessary
1118 continue;
1119 }
1120 new_best = wagon_stepwise_find_next_best(bscore,best_feat);
1121
1122 if ((bscore - fabs(bscore * (limit/100))) <= best_score)
1123 {
1124 // gone as far as we can
1125 delete new_best;
1126 break;
1127 }
1128 else
1129 {
1130 best_score = bscore;
1131 delete best;
1132 best = new_best;
1133 wgn_dataset.set_ignore(best_feat,FALSE);
1134 if (!wgn_quiet)
1135 {
1136 fprintf(stdout,"FEATURE %d %s: %2.4f\n",
1137 nf,
1138 (const char *)wgn_dataset.feat_name(best_feat),
1139 best_score);
1140 fflush(stdout);
1141 nf++;
1142 }
1143 }
1144 }
1145
1146 return best;
1147 }
1148
wagon_stepwise_find_next_best(float & bscore,int & best_feat)1149 static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat)
1150 {
1151 // Find which of the currently ignored features will best improve
1152 // the result
1153 WNode *best = 0;
1154 float best_score = -WGN_HUGE_VAL;
1155 int best_new_feat = -1;
1156 int i;
1157
1158 for (i=0; i < wgn_dataset.width(); i++)
1159 {
1160 if (wgn_dataset.ftype(i) == wndt_ignore)
1161 continue; // user wants me to ignore this completely
1162 else if (i == wgn_predictee) // can't use the answer
1163 continue;
1164 else if (wgn_dataset.ignore(i) == TRUE)
1165 {
1166 WNode *current;
1167 float score;
1168
1169 // Allow this feature to participate
1170 wgn_dataset.set_ignore(i,FALSE);
1171
1172 current = wgn_build_tree(score);
1173
1174 if (score > best_score)
1175 {
1176 best_score = score;
1177 delete best;
1178 best = current;
1179 best_new_feat = i;
1180 // fprintf(stdout,"BETTER FEATURE %d %s: %2.4f\n",
1181 // i,
1182 // (const char *)wgn_dataset.feat_name(i),
1183 // best_score);
1184 // fflush(stdout);
1185 }
1186 else
1187 delete current;
1188
1189 // switch it off again
1190 wgn_dataset.set_ignore(i,TRUE);
1191 }
1192 }
1193
1194 bscore = best_score;
1195 best_feat = best_new_feat;
1196 return best;
1197 }
1198