/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                      Copyright (c) 1996,1997                          */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                     Author :  Alan W Black                            */
/*                     Date   :  May 1996                                */
/*-----------------------------------------------------------------------*/
/*  A Classification and Regression Tree (CART) Program                  */
/*  A basic implementation of many of the techniques in                  */
/*  Breiman et al. 1984                                                  */
/*                                                                       */
/*  Added decision list support, Feb 1997                                */
/*  Added stepwise use of features, Oct 1997                             */
/*                                                                       */
/*=======================================================================*/

#include <cstdlib>
#include <iostream>
#include <fstream>
#include <cstring>
#include "EST_Token.h"
#include "EST_FMatrix.h"
#include "EST_multistats.h"
#include "EST_Wagon.h"
#include "EST_math.h"

Discretes wgn_discretes;

WDataSet wgn_dataset;
WDataSet wgn_test_dataset;
EST_FMatrix wgn_DistMatrix;
EST_Track wgn_VertexTrack;
EST_Track wgn_VertexFeats;
EST_Track wgn_UnitTrack;

int wgn_min_cluster_size = 50;
int wgn_held_out = 0;
int wgn_prune = TRUE;
int wgn_quiet = FALSE;
int wgn_verbose = FALSE;
int wgn_count_field = -1;
EST_String wgn_count_field_name = "";
int wgn_predictee = 0;
EST_String wgn_predictee_name = "";
float wgn_float_range_split = 10;
float wgn_balance = 0;
EST_String wgn_opt_param = "";
EST_String wgn_vertex_output = "mean";
EST_String wgn_vertex_otype = "mean";

static float do_summary(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_float(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_class(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_cluster(WNode &tree,WDataSet &dataset, ostream *output);
static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output);
static int wagon_split(int margin,WNode &node);
static WQuestion find_best_question(WVectorVector &dset);
static void construct_binary_ques(int feat,WQuestion &test_ques);
static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds);
static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds);
static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in);
static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat);

Declare_TList_T(WVector *, WVectorP)

Declare_TVector_Base_T(WVector *,NULL,NULL,WVectorP)

#if defined(INSTANTIATE_TEMPLATES)
// Instantiate class
#include "../base_class/EST_TList.cc"
#include "../base_class/EST_TVector.cc"

Instantiate_TList_T(WVector *, WVectorP)

Instantiate_TVector(WVector *)

#endif

void wgn_load_datadescription(EST_String fname,LISP ignores)
{
    // Load field description for a file
    wgn_dataset.load_description(fname,ignores);
    wgn_test_dataset.load_description(fname,ignores);
}

void wgn_load_dataset(WDataSet &dataset,EST_String fname)
{
    // Read the data set from a file: one vector per line.
    // Fields that parse as numbers are numeric; anything else is categorical.
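    // For example (hypothetical data): with a description declaring one
    // float predictee followed by two categorical features, a line of the
    // data file might look like:
    //     0.543 b ax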
    EST_TokenStream ts;
    WVector *v;
    int nvec=0,i;

    if (ts.open(fname) == -1)
        wagon_error(EST_String("unable to open data file \"")+
                    fname+"\"");
    ts.set_PunctuationSymbols("");
    ts.set_PrePunctuationSymbols("");
    ts.set_SingleCharSymbols("");

    for ( ;!ts.eof(); )
    {
        v = new WVector(dataset.width());
        i = 0;
        do
        {
            int type = dataset.ftype(i);
            if ((type == wndt_float) ||
                (type == wndt_ols) ||
                (wgn_count_field == i))
            {
                // need to ensure this is not NaN or Infinity
                float f = atof(ts.get().string());
                if (isfinite(f))
                    v->set_flt_val(i,f);
                else
                {
                    cout << fname << ": bad float " << f
                        << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                            dataset.samples() << endl;
                    v->set_flt_val(i,0.0);
                }
            }
            else if (type == wndt_binary)
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_cluster)  /* index into distmatrix */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_vector)   /* index into VertexTrack */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_trajectory) /* index to index and length */
            {   /* a number pointing to a vector in UnitTrack that */
                /* has an index into VertexTrack and a number of vertices. */
                /* Thus if it's 15, UnitTrack.a(15,0) is the start frame in */
                /* VertexTrack and UnitTrack.a(15,1) is the number of */
                /* frames in the unit                                 */
                v->set_int_val(i,atoi(ts.get().string()));
            }
            else if (type == wndt_ignore)
            {
                ts.get();  // skip it
                v->set_int_val(i,0);
            }
            else // should check the different classes
            {
                EST_String s = ts.get().string();
                int n = wgn_discretes.discrete(type).name(s);
                if (n == -1)
                {
                    cout << fname << ": bad value " << s << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                            dataset.samples() << endl;
                    n = 0;
                }
                v->set_int_val(i,n);
            }
            i++;
        }
        while (!ts.eoln() && i<dataset.width());
        nvec ++;
        if (i != dataset.width())
        {
            wagon_error(fname+": data vector "+itoString(nvec)+" contains "
                        +itoString(i)+" parameters instead of "+
                        itoString(dataset.width()));
        }
        if (!ts.eoln())
        {
            cerr << fname << ": data vector " << nvec <<
                " contains too many parameters; expected "
                << dataset.width() << endl;
            wagon_error(EST_String("extra parameter(s) from ")+
                        ts.peek().string());
        }
        dataset.append(v);
    }

    cout << "Dataset of " << dataset.samples() << " vectors of " <<
        dataset.width() << " parameters from: " << fname << endl;
    ts.close();
}

float summary_results(WNode &tree,ostream *output)
{
    if (wgn_test_dataset.samples() != 0)
        return do_summary(tree,wgn_test_dataset,output);
    else
        return do_summary(tree,wgn_dataset,output);
}

static float do_summary(WNode &tree,WDataSet &ds,ostream *output)
{
    if (wgn_dataset.ftype(wgn_predictee) == wndt_cluster)
        return test_tree_cluster(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_vector)
        return test_tree_vector(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_trajectory)
        return test_tree_trajectory(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
        return test_tree_ols(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) >= wndt_class)
        return test_tree_class(tree,ds,output);
    else
        return test_tree_float(tree,ds,output);
}

WNode *wgn_build_tree(float &score)
{
    // Build init node and split it while reducing the impurity
    WNode *top = new WNode();
    int margin = 0;

    wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,TRUE);

    margin = 0;
    wagon_split(margin,*top);  // recursively split data

    if (wgn_held_out > 0)
    {
        wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,FALSE);
        top->held_out_prune();
    }

    if (wgn_prune)
        top->prune();

    score = summary_results(*top,0);

    return top;
}

static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in)
{
    // Set up data, omitting the held_out percent if in is true, or
    // including everything if in is false (see the commented-out
    // alternatives below)
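    // e.g. (hypothetical) with held_out=10 and in true, vectors whose
    // index mod 100 is 10..99 (roughly 90% of the data) are kept for
    // training; the held-out remainder is used for pruning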
    int i,j;
    EST_Litem *d;

    // Make it definitely big enough
    data.resize(ds.length());

    for (j=i=0,d=ds.head(); d != 0; d=d->next(),j++)
    {
        if ((in) && ((j%100) >= held_out))
            data[i++] = ds(d);
//      else if ((!in) && ((j%100 < held_out)))
//          data[i++] = ds(d);
        else if (!in)
            data[i++] = ds(d);
//      if ((in) && (j < held_out))
//          data[i++] = ds(d);
//      else if ((!in) && (j >=held_out))
//          data[i++] = ds(d);
    }
    // make it the actual size, but don't reset values
    data.resize(i,1);
}

static float test_tree_class(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results
    EST_StrStr_KVL pairs;
    EST_StrList lex;
    EST_Litem *p;
    EST_String predict,real;
    WNode *pnode;
    double H=0,prob;
    int i,type;
    float correct=0,total=0, count=0;

    float bcorrect=0, bpredicted=0, bactual=0;
    float precision=0, recall=0;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        pnode = tree.predict_node((*dataset(p)));
        predict = (EST_String)pnode->get_impurity().value();
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        prob = pnode->get_impurity().pd().probability(predict);
        H += (log(prob))*count;
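        // H accumulates count-weighted natural-log probabilities of the
        // predicted class; it is reported below as entropy -H/total and
        // perplexity pow(2,-H/total)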
        type = dataset.ftype(wgn_predictee);
        real = wgn_discretes[type].name(dataset(p)->get_int_val(wgn_predictee));

        if (wgn_opt_param == "B_NB_F1")
        {
            //cout << real << " " << predict << endl;
            if (real == "B")
                bactual += count;
            if (predict == "B")
            {
                bpredicted += count;
                if (real == predict)
                    bcorrect += count;
            }
            //cout << bactual << " " << bpredicted << " " << bcorrect << endl;
        }
        if (real == predict)
            correct += count;
        total += count;
        pairs.add_item(real,predict,1);
    }
    for (i=0; i<wgn_discretes[dataset.ftype(wgn_predictee)].length(); i++)
        lex.append(wgn_discretes[dataset.ftype(wgn_predictee)].name(i));

    const EST_FMatrix &m = confusion(pairs,lex);

    if (output != NULL)
    {
        print_confusion(m,pairs,lex);  // should be to output not stdout
        *output << ";; entropy " << (-1*(H/total)) << " perplexity " <<
            pow(2.0,(-1*(H/total))) << endl;
    }

    // Minus it so bigger is better
    if (wgn_opt_param == "entropy")
        return -pow(2.0,(-1*(H/total)));
    else if (wgn_opt_param == "B_NB_F1")
    {
        if (bpredicted == 0)
            precision = 1;
        else
            precision = bcorrect/bpredicted;
        if (bactual == 0)
            recall = 1;
        else
            recall = bcorrect/bactual;
        float fmeasure = 0;
        if ((precision+recall) != 0)
            fmeasure = 2*(precision*recall)/(precision+recall);
        cout << "F1 :" << fmeasure << " Prec:" << precision
             << " Rec:" << recall << " B-Pred:" << bpredicted
             << " B-Actual:" << bactual << " B-Correct:" << bcorrect << endl;
        return fmeasure;
    }
    else
        return (float)correct/(float)total;
}

static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results VECTOR
    // distance is calculated in zscores (as the values in the vector may
    // have quite different ranges)
    WNode *leaf;
    EST_Litem *p;
    float predict, actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    EST_SuffStats b;
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)
            {
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalize the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                error = predict-actual; /* awb_debug */
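                // note: this awb_debug assignment discards the
                // stddev-normalized error computed just above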
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrts to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/ sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)   // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
            << " Correlation is " << ftoString(cor,4,1)
            << " Mean (abs) Error " << ftoString(e.mean(),4,1)
            << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results TRAJECTORY
    // distance is calculated in zscores (as the values in the vector may
    // have quite different ranges)
    // NOT WRITTEN YET
    WNode *leaf;
    EST_Litem *p;
    float predict, actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    EST_SuffStats b;
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)
            {
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalize the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                error = predict-actual; /* awb_debug */
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrts to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/ sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)   // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
            << " Correlation is " << ftoString(cor,4,1)
            << " Mean (abs) Error " << ftoString(e.mean(),4,1)
            << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_cluster(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results for cluster trees
    WNode *leaf;
    int real;
    int right_cluster=0;
    EST_SuffStats ranking, meandist;
    EST_Litem *p;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        real = dataset(p)->get_int_val(wgn_predictee);
        meandist += leaf->get_impurity().cluster_distance(real);
        right_cluster += leaf->get_impurity().in_cluster(real);
        ranking += leaf->get_impurity().cluster_ranking(real);
    }

    if (output != NULL)
    {
        // Want number in right class, mean distance in sds, mean ranking
        if (output != &cout)   // save in output file
            *output << ";; Right cluster " << right_cluster << " (" <<
                (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
                    "%) mean ranking " << ranking.mean() << " mean distance "
                        << meandist.mean() << endl;
        cout << "Right cluster " << right_cluster << " (" <<
            (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
                "%) mean ranking " << ranking.mean() << " mean distance "
                    << meandist.mean() << endl;
    }

    return 10000-meandist.mean();  // this doesn't work but I tested it
}

static float test_tree_float(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results FLOAT
    EST_Litem *p;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        predict = tree.predict((*dataset(p)));
        real = dataset(p)->get_flt_val(wgn_predictee);
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrts to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/ sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)   // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
            << " Correlation is " << ftoString(cor,4,1)
            << " Mean (abs) Error " << ftoString(e.mean(),4,1)
            << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results OLS
    EST_Litem *p;
    WNode *leaf;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        // do ols to get predict;
        predict = 0.0;
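        // (the per-leaf OLS prediction is not computed here, so predict
        // stays 0.0 and the error stats below reflect the raw predictee)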
        real = dataset(p)->get_flt_val(wgn_predictee);
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrts to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/ sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)   // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
            << " Correlation is " << ftoString(cor,4,1)
            << " Mean (abs) Error " << ftoString(e.mean(),4,1)
            << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static int wagon_split(int margin, WNode &node)
{
    // Split given node (if possible)
    WQuestion q;
    WNode *l,*r;

    node.set_impurity(WImpurity(node.get_data()));
    q = find_best_question(node.get_data());

/*    printf("q.score() %f impurity %f\n",
           q.get_score(),
           node.get_impurity().measure()); */

    double impurity_measure = node.get_impurity().measure();
    double question_score = q.get_score();

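    // Split only if the best question found a usable score (below
    // WGN_HUGE_VAL) and that score improves on the node's own impurity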
    if ((question_score < WGN_HUGE_VAL) &&
        (question_score < impurity_measure))
    {
        // OK, it's worth a split
        l = new WNode();
        r = new WNode();
        wgn_find_split(q,node.get_data(),l->get_data(),r->get_data());
        node.set_subnodes(l,r);
        node.set_question(q);
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << q << endl;
        }
        margin++;
        wagon_split(margin,*l);
        margin++;
        wagon_split(margin,*r);
        margin--;
        return TRUE;
    }
    else
    {
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << "stopped samples: " << node.samples() << " impurity: "
                << node.get_impurity() << endl;
        }
        margin--;
        return FALSE;
    }
}

void wgn_find_split(WQuestion &q,WVectorVector &ds,
                    WVectorVector &y,WVectorVector &n)
{
    int i, iy, in;

    y.resize(q.get_yes());
    n.resize(q.get_no());

    for (iy=in=i=0; i < ds.n(); i++)
        if (q.ask(*ds(i)) == TRUE)
            y[iy++] = ds(i);
        else
            n[in++] = ds(i);
}

static WQuestion find_best_question(WVectorVector &dset)
{
    //  Ask all possible questions and find the best one
    int i;
    float bscore,tscore;
    WQuestion test_ques, best_ques;

    bscore = tscore = WGN_HUGE_VAL;
    best_ques.set_score(bscore);
    // test each feature with each possible question
    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ignore(i) == TRUE) ||
            (i == wgn_predictee))
            tscore = WGN_HUGE_VAL;     // ignore this feature this time
        else if (wgn_dataset.ftype(i) == wndt_binary)
        {
            construct_binary_ques(i,test_ques);
            tscore = wgn_score_question(test_ques,dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_float)
        {
            tscore = construct_float_ques(i,test_ques,dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_ignore)
            tscore = WGN_HUGE_VAL;    // always ignore this feature
#if 0
        // This doesn't work reasonably
        else if (wgn_csubset && (wgn_dataset.ftype(i) >= wndt_class))
        {
            wagon_error("subset selection temporarily deleted");
            tscore = construct_class_ques_subset(i,test_ques,dset);
        }
#endif
        else if (wgn_dataset.ftype(i) >= wndt_class)
            tscore = construct_class_ques(i,test_ques,dset);
        if (tscore < bscore)
        {
            best_ques = test_ques;
            best_ques.set_score(tscore);
            bscore = tscore;
        }
    }

    return best_ques;
}

static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds)
{
    // Find out which member of a class gives the best split
    float tscore,bscore = WGN_HUGE_VAL;
    int cl;
    WQuestion test_q;

    test_q.set_fp(feat);
    test_q.set_oper(wnop_is);
    ques = test_q;

    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        test_q.set_operand1(EST_Val(cl));
        tscore = wgn_score_question(test_q,ds);
        if (tscore < bscore)
        {
            ques = test_q;
            bscore = tscore;
        }
    }

    return bscore;
}

#if 0
static float construct_class_ques_subset(int feat,WQuestion &ques,
                                         WVectorVector &ds)
{
    // Find out which subset of a class gives the best split.
    // We first measure the subset of the data for each member
    // of the class.  Then order those splits.  Then go through finding
    // where the best split of that ordered list is.  This is described
    // on page 247 of Breiman et al.
    float tscore,bscore = WGN_HUGE_VAL;
    LISP l;
    int cl;

    ques.set_fp(feat);
    ques.set_oper(wnop_is);
    float *scores = new float[wgn_discretes[wgn_dataset.ftype(feat)].length()];

    // Only do it for existing values
    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        ques.set_operand(flocons(cl));
        scores[cl] = wgn_score_question(ques,ds);
    }

    LISP order = sort_class_scores(feat,scores);
    if (order == NIL)
        return WGN_HUGE_VAL;
    if (siod_llength(order) == 1)
    {   // Only one so we know the best "split"
        ques.set_oper(wnop_is);
        ques.set_operand(car(order));
        return scores[get_c_int(car(order))];
    }

    ques.set_oper(wnop_in);
    LISP best_l = NIL;
    for (l=cdr(order); CDR(l) != NIL; l = cdr(l))
    {
        ques.set_operand(l);
        tscore = wgn_score_question(ques,ds);
        if (tscore < bscore)
        {
            best_l = l;
            bscore = tscore;
        }
    }

    if (best_l != NIL)
    {
        if (siod_llength(best_l) == 1)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(best_l));
        }
        else if (equal(cdr(order),best_l) != NIL)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(order));
        }
        else
        {
            cout << "Found a good subset" << endl;
            ques.set_operand(best_l);
        }
    }
    return bscore;
}

static LISP sort_class_scores(int feat,float *scores)
{
    // returns sorted list of (non WGN_HUGE_VAL) items
    int i;
    LISP items = NIL;
    LISP l;

    for (i=0; i < wgn_discretes[wgn_dataset.ftype(feat)].length(); i++)
    {
        if (scores[i] != WGN_HUGE_VAL)
        {
            if (items == NIL)
                items = cons(flocons(i),NIL);
            else
            {
                for (l=items; l != NIL; l=cdr(l))
                {
                    if (scores[i] < scores[get_c_int(car(l))])
                    {
                        CDR(l) = cons(car(l),cdr(l));
                        CAR(l) = flocons(i);
                        break;
                    }
                }
                if (l == NIL)
                    items = l_append(items,cons(flocons(i),NIL));
            }
        }
    }
    return items;
}
#endif

static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds)
{
    // Find a split of the range that gives the best score.
    // Naively does this by partitioning the range into
    // wgn_float_range_split equal slots
    float tscore,bscore = WGN_HUGE_VAL;
    int d, i;
    float p;
    WQuestion test_q;
    float max,min,val,incr;

    test_q.set_fp(feat);
    test_q.set_oper(wnop_lessthan);
    ques = test_q;

    min = max = ds(0)->get_flt_val(feat);  /* set up some value */
    for (d=0; d < ds.n(); d++)
    {
        val = ds(d)->get_flt_val(feat);
        if (val < min)
            min = val;
        else if (val > max)
            max = val;
    }
    if (max == min)  // we're pure
        return WGN_HUGE_VAL;
    incr = (max-min)/wgn_float_range_split;
    /* We iterate over the number of splits, not the increments, */
    /* because incr can be so small it doesn't increment p */
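    /* e.g. (hypothetical values) min=0, max=10, wgn_float_range_split=10 */
    /* gives incr=1 and candidate questions "< 1", "< 2", ..., "< 10"     */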
    for (i=0,p=min+incr; i < wgn_float_range_split; i++,p += incr)
    {
        test_q.set_operand1(EST_Val(p));
        tscore = wgn_score_question(test_q,ds);
        if (tscore < bscore)
        {
            ques = test_q;
            bscore = tscore;
        }
    }

    return bscore;
}

static void construct_binary_ques(int feat,WQuestion &test_ques)
{
    // construct a question.  Not sure about this in general:
    // continuous/categorical features will require different rules
    // and non-binary ones will require some test point

    test_ques.set_fp(feat);
    test_ques.set_oper(wnop_binary);
    test_ques.set_operand1(EST_Val(""));
}

static float score_question_set(WQuestion &q, WVectorVector &ds, int ignorenth)
{
    // score this question as a possible split by finding
    // the sum of the impurities when ds is split with this question
    WImpurity y,n;
    int d, num_yes, num_no;
    float count;
    WVector *wv;

    num_yes = num_no = 0;
    y.data = &ds;
    n.data = &ds;
    for (d=0; d < ds.n(); d++)
    {
        if ((ignorenth < 2) ||
            (d%ignorenth != ignorenth-1))
        {
            wv = ds(d);
            if (wgn_count_field == -1)
                count = 1.0;
            else
                count = (*wv)[wgn_count_field];

            if (q.ask(*wv) == TRUE)
            {
                num_yes++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    y.cumulate(d,count);  // note the sample number not value
                else
                    y.cumulate((*wv)[wgn_predictee],count);
            }
            else
            {
                num_no++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    n.cumulate(d,count);  // note the sample number not value
                else
                    n.cumulate((*wv)[wgn_predictee],count);
            }
        }
    }

    q.set_yes(num_yes);
    q.set_no(num_no);

    int min_cluster;

    if ((wgn_balance == 0.0) ||
        (ds.n()/wgn_balance < wgn_min_cluster_size))
        min_cluster = wgn_min_cluster_size;
    else
        min_cluster = (int)(ds.n()/wgn_balance);
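    // e.g. (hypothetical numbers) with wgn_balance=2 and 1000 samples,
    // neither subset may fall below 500 samples; with wgn_balance=0 the
    // global wgn_min_cluster_size (default 50) applies instead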

    if ((y.samples() < min_cluster) ||
        (n.samples() < min_cluster))
        return WGN_HUGE_VAL;

    float ym,nm,bm;
    //    printf("awb_debug score_question_set X %f Y %f\n",
    //    y.samples(), n.samples());
    ym = y.measure();
    nm = n.measure();
    bm = ym + nm;

    /*    cout << q << endl;
    printf("test question y %f n %f b %f\n",
    ym, nm, bm); */

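    // return the mean of the two subset impurities (lower is better)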
    return bm/2.0;
}

float wgn_score_question(WQuestion &q, WVectorVector &ds)
{
    // This level of indirection was introduced for later expansion

    return score_question_set(q,ds,1);
}

WNode *wagon_stepwise(float limit)
{
    // Find the best single feature and incrementally add features
    // that best improve the result until it no longer improves.
    // This is basically to automate what Kurt was doing in building
    // trees; he automated it in Perl, and as that seemed to work
    // I put it into wagon itself.
    // This can be pretty computationally intensive.
    WNode *best = 0,*new_best = 0;
    float bscore,best_score = -WGN_HUGE_VAL;
    int best_feat,i;
    int nf = 1;

    // Set all features to ignore
    for (i=0; i < wgn_dataset.width(); i++)
        wgn_dataset.set_ignore(i,TRUE);

    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ftype(i) == wndt_ignore) || (i == wgn_predictee))
        {
            // Skip this round: not because this feature is (user
            // specified) ignored, but because its exclusion means one
            // fewer cycle is necessary
            continue;
        }
        new_best = wagon_stepwise_find_next_best(bscore,best_feat);

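        // Stop when the new score fails to beat the current best by more
        // than limit percent of itself (e.g. limit=2 demands a >2% gain)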
        if ((bscore - fabs(bscore * (limit/100))) <= best_score)
        {
            // gone as far as we can
            delete new_best;
            break;
        }
        else
        {
            best_score = bscore;
            delete best;
            best = new_best;
            wgn_dataset.set_ignore(best_feat,FALSE);
            if (!wgn_quiet)
            {
                fprintf(stdout,"FEATURE    %d %s: %2.4f\n",
                        nf,
                        (const char *)wgn_dataset.feat_name(best_feat),
                        best_score);
                fflush(stdout);
                nf++;
            }
        }
    }

    return best;
}

static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat)
{
    // Find which of the currently ignored features will best improve
    // the result
    WNode *best = 0;
    float best_score = -WGN_HUGE_VAL;
    int best_new_feat = -1;
    int i;

    for (i=0; i < wgn_dataset.width(); i++)
    {
        if (wgn_dataset.ftype(i) == wndt_ignore)
            continue; // user wants me to ignore this completely
        else if (i == wgn_predictee) // can't use the answer
            continue;
        else if (wgn_dataset.ignore(i) == TRUE)
        {
            WNode *current;
            float score;

            // Allow this feature to participate
            wgn_dataset.set_ignore(i,FALSE);

            current = wgn_build_tree(score);

            if (score > best_score)
            {
                best_score = score;
                delete best;
                best = current;
                best_new_feat = i;
//              fprintf(stdout,"BETTER FEATURE    %d %s: %2.4f\n",
//                      i,
//                      (const char *)wgn_dataset.feat_name(i),
//                      best_score);
//              fflush(stdout);
            }
            else
                delete current;

            // switch it off again
            wgn_dataset.set_ignore(i,TRUE);
        }
    }

    bscore = best_score;
    best_feat = best_new_feat;
    return best;
}