1 /*************************************************************************/
2 /*                                                                       */
3 /*                Centre for Speech Technology Research                  */
4 /*                     University of Edinburgh, UK                       */
5 /*                      Copyright (c) 1996,1997                          */
6 /*                        All Rights Reserved.                           */
7 /*                                                                       */
8 /*  Permission is hereby granted, free of charge, to use and distribute  */
9 /*  this software and its documentation without restriction, including   */
10 /*  without limitation the rights to use, copy, modify, merge, publish,  */
11 /*  distribute, sublicense, and/or sell copies of this work, and to      */
12 /*  permit persons to whom this work is furnished to do so, subject to   */
13 /*  the following conditions:                                            */
14 /*   1. The code must retain the above copyright notice, this list of    */
15 /*      conditions and the following disclaimer.                         */
16 /*   2. Any modifications must be clearly marked as such.                */
17 /*   3. Original authors' names are not deleted.                         */
18 /*   4. The authors' names are not used to endorse or promote products   */
19 /*      derived from this software without specific prior written        */
20 /*      permission.                                                      */
21 /*                                                                       */
22 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
25 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
30 /*  THIS SOFTWARE.                                                       */
31 /*                                                                       */
32 /*************************************************************************/
33 /*                     Author :  Alan W Black                            */
34 /*                     Date   :  October 1997                            */
35 /*-----------------------------------------------------------------------*/
36 /*  A program for testing a CART tree against data, also may be used to  */
37 /*  predict values using a tree and data                                 */
38 /*                                                                       */
39 /*=======================================================================*/
40 #include <cstdlib>
41 #include <iostream>
42 #include <fstream>
43 #include <cstring>
44 #include "EST_Wagon.h"
45 #include "EST_cutils.h"
46 #include "EST_multistats.h"
47 #include "EST_Token.h"
48 #include "EST_cmd_line.h"
49 
50 static int wagon_test_main(int argc, char **argv);
51 static LISP find_feature_value(const char *feature,
52 			       LISP vector, LISP description);
53 static LISP wagon_vector_predict(LISP tree, LISP vector, LISP description);
54 static LISP get_data_vector(EST_TokenStream &data, LISP description);
55 static void simple_predict(EST_TokenStream &data, FILE *output,
56 			   LISP tree, LISP description, int all_info);
57 static void test_tree_class(EST_TokenStream &data, FILE *output,
58 			    LISP tree, LISP description);
59 static void test_tree_float(EST_TokenStream &data, FILE *output,
60 			    LISP tree, LISP description);
61 
62 /** @name <command>wagon_test</command> <emphasis>Test CART models</emphasis>
63     @id wagon-test-manual
64   * @toc
65  */
66 
67 //@{
68 
69 
70 /**@name Synopsis
71   */
72 //@{
73 
74 //@synopsis
75 
76 /**
77 Wagon_test is used to test CART models on feature data.
78 
79 A detailed description of the CART model can be found in the
80 <link linkend="cart-overview">CART model overview</link> section.
81 */
82 
main(int argc,char ** argv)83 int main(int argc, char **argv)
84 {
85 
86     wagon_test_main(argc,argv);
87 
88     exit(0);
89     return 0;
90 }
91 
wagon_test_main(int argc,char ** argv)92 static int wagon_test_main(int argc, char **argv)
93 {
94     // Top level function sets up data and creates a tree
95     EST_Option al;
96     EST_StrList files;
97     LISP description,tree=NIL;;
98     EST_TokenStream data;
99     FILE *wgn_output;
100 
101     parse_command_line
102 	(argc, argv,
103 	 EST_String("<options>\n")+
104 	 "Summary: program to test CART models on data\n"+
105 	 "-desc <ifile>     Field description file\n"+
106 	 "-data <ifile>     Datafile, one vector per line\n"+
107 	 "-tree <ifile>     File containing CART tree\n"+
108 	 "-track <ifile>\n"+
109          "                  track for vertex indices\n"+
110 	 "-predict          Predict for each vector returning full vector\n"+
111 	 "-predict_val      Predict for each vector returning just value\n"+
112 	 "-predictee <string>\n"+
113 	 "                  name of field to predict (default is first field)\n"+
114 	 "-heap <int> {210000}\n"+
115 	 "              Set size of Lisp heap, should not normally need\n"+
116 	 "              to be changed from its default\n"+
117 	 "-o <ofile>        File to save output in\n",
118 	 files, al);
119 
120     siod_init(al.ival("-heap"));
121 
122     if (al.present("-desc"))
123     {
124 	gc_protect(&description);
125 	description = car(vload(al.val("-desc"),1));
126     }
127     else
128     {
129 	cerr << argv[0] << ": no description file specified" << endl;
130 	exit(-1);
131     }
132 
133     if (al.present("-tree"))
134     {
135 	gc_protect(&tree);
136 	tree = car(vload(al.val("-tree"),1));
137 	if (tree == NIL)
138 	{
139 	    cerr << argv[0] << ": no tree found in \"" << al.val("-tree")
140 		<< "\"" << endl;
141 	    exit(-1);
142 	}
143     }
144     else
145     {
146 	cerr << argv[0] << ": no tree file specified" << endl;
147 	exit(-1);
148     }
149 
150     if (al.present("-data"))
151     {
152 	if (data.open(al.val("-data")) != 0)
153 	{
154 	    cerr << argv[0] << ": can't open data file \"" <<
155 		al.val("-data") << "\" for input." << endl;
156 	    exit(-1);
157 	}
158     }
159     else
160     {
161 	cerr << argv[0] << ": no data file specified" << endl;
162 	exit(-1);
163     }
164 
165     if (al.present("-track"))
166     {
167         wgn_VertexTrack.load(al.val("-track"));
168     }
169 
170     if (al.present("-o"))
171     {
172 	if ((wgn_output = fopen(al.val("-o"),"w")) == NULL)
173 	{
174 	    cerr << argv[0] << ": can't open output file \"" <<
175 		al.val("-o") << "\"" << endl;
176 	}
177     }
178     else
179 	wgn_output = stdout;
180 
181     if (al.present("-predictee"))
182     {
183 	LISP l;
184 	int i;
185 	wgn_predictee_name = al.val("-predictee");
186 	for (l=description,i=0; l != NIL; l=cdr(l),i++)
187 	    if (streq(wgn_predictee_name,get_c_string(car(car(l)))))
188 	    {
189 		wgn_predictee = i;
190 		break;
191 	    }
192 	if (l==NIL)
193 	{
194 	    cerr << argv[0] << ": predictee \"" << wgn_predictee <<
195 		"\" not in description\n";
196 	}
197     }
198     const char *predict_type =
199 	get_c_string(car(cdr(siod_nth(wgn_predictee,description))));
200 
201     if (al.present("-predict"))
202 	simple_predict(data,wgn_output,tree,description,FALSE);
203     else if (al.present("-predict_val"))
204 	simple_predict(data,wgn_output,tree,description,TRUE);
205     else if (streq(predict_type,"float") ||
206 	     streq(predict_type,"int"))
207 	test_tree_float(data,wgn_output,tree,description);
208 #if 0
209     else if (streq(predict_type,"vector"))
210 	test_tree_vector(data,wgn_output,tree,description);
211 #endif
212     else
213 	test_tree_class(data,wgn_output,tree,description);
214 
215     if (wgn_output != stdout)
216 	fclose(wgn_output);
217     data.close();
218     return 0;
219 }
220 
get_data_vector(EST_TokenStream & data,LISP description)221 static LISP get_data_vector(EST_TokenStream &data, LISP description)
222 {
223     // read in one vector.  Should be terminated with an newline
224     LISP v=NIL,d;
225 
226     if (data.eof())
227 	return NIL;
228 
229     for (d=description; d != NIL; d=cdr(d))
230     {
231 	EST_Token t = data.get();
232 
233 	if ((d != description) && (t.whitespace().contains("\n")))
234 	{
235 	    cerr << "wagon_test: unexpected newline within vector " <<
236 		t.row() << " wrong number of features" << endl;
237 	    siod_error();
238 	}
239 	if (streq(get_c_string(car(cdr(car(d)))),"float") ||
240 	    streq(get_c_string(car(cdr(car(d)))),"int"))
241 	    v = cons(flocons(atof(t.string())),v);
242 	else if ((streq(get_c_string(car(cdr(car(d)))),"_other_")) &&
243 		 (siod_member_str(t.string(),cdr(car(d))) == NIL))
244 	    v = cons(strintern("_other_"),v);
245 	else
246 	    v = cons(strintern(t.string()),v);
247     }
248 
249     return reverse(v);
250 }
251 
simple_predict(EST_TokenStream & data,FILE * output,LISP tree,LISP description,int all_info)252 static void simple_predict(EST_TokenStream &data, FILE *output,
253 			   LISP tree, LISP description, int all_info)
254 {
255     LISP vector,predict;
256     EST_String val;
257 
258     for (vector=get_data_vector(data,description);
259 	 vector != NIL; vector=get_data_vector(data,description))
260     {
261 	predict = wagon_vector_predict(tree,vector,description);
262 	if (all_info)
263 	    val = siod_sprint(car(reverse(predict)));
264 	else
265 	    val = siod_sprint(predict);
266 	fprintf(output,"%s\n",(const char *)val);
267     }
268 }
269 
test_tree_float(EST_TokenStream & data,FILE * output,LISP tree,LISP description)270 static void test_tree_float(EST_TokenStream &data, FILE *output,
271 			    LISP tree, LISP description)
272 {
273     // Test tree against data to get summary of results FLOAT
274     float predict_val,real_val;
275     EST_SuffStats x,y,xx,yy,xy,se,e;
276     double cor,error;
277     LISP vector,predict;
278 
279     for (vector=get_data_vector(data,description);
280 	 vector != NIL; vector=get_data_vector(data,description))
281     {
282 	predict = wagon_vector_predict(tree,vector,description);
283 	predict_val = get_c_float(car(reverse(predict)));
284 	real_val = get_c_float(siod_nth(wgn_predictee,vector));
285 	x += predict_val;
286 	y += real_val;
287 	error = predict_val-real_val;
288 	se += error*error;
289 	e += fabs(error);
290 	xx += predict_val*predict_val;
291 	yy += real_val*real_val;
292 	xy += predict_val*real_val;
293     }
294 
295     cor = (xy.mean() - (x.mean()*y.mean()))/
296 	(sqrt(xx.mean()-(x.mean()*x.mean())) *
297 	 sqrt(yy.mean()-(y.mean()*y.mean())));
298 
299     fprintf(output,";; RMSE %1.4f Correlation is %1.4f Mean (abs) Error %1.4f (%1.4f)\n",
300 	    sqrt(se.mean()),
301 	    cor,
302 	    e.mean(),
303 	    e.stddev());
304 }
305 
test_tree_class(EST_TokenStream & data,FILE * output,LISP tree,LISP description)306 static void test_tree_class(EST_TokenStream &data, FILE *output,
307 			    LISP tree, LISP description)
308 {
309     // Test tree against class data to get summary of results
310     EST_StrStr_KVL pairs;
311     EST_StrList lex;
312     EST_String predict_class,real_class;
313     LISP vector,w,predict;
314     double H=0,Q=0,prob;
315     (void)output;
316 
317     for (vector=get_data_vector(data,description);
318 	 vector != NIL; vector=get_data_vector(data,description))
319     {
320 	predict = wagon_vector_predict(tree,vector,description);
321 	predict_class = get_c_string(car(reverse(predict)));
322 	real_class = get_c_string(siod_nth(wgn_predictee,vector));
323 	prob = get_c_float(car(cdr(siod_assoc_str(real_class,
324 						  predict))));
325 	if (prob == 0)
326 	    H += log(0.000001);
327 	else
328 	    H += log(prob);
329 	Q ++;
330 	pairs.add_item(real_class,predict_class,1);
331     }
332     for (w=cdr(siod_nth(wgn_predictee,description)); w != NIL; w = cdr(w))
333 	lex.append(get_c_string(car(w)));
334 
335     const EST_FMatrix &m = confusion(pairs,lex);
336     print_confusion(m,pairs,lex);
337     fprintf(stdout,";; entropy %g perplexity %g\n",
338 	    (-1*(H/Q)),pow(2.0,(-1*(H/Q))));
339 }
340 
341 #if 0
342 static void test_tree_vector(EST_TokenStream &data, FILE *output,
343                              LISP tree, LISP description)
344 {
345     // Test tree against class data to get summary of results
346     // Note we are talking about predicting vectors (a *bunch* of
347     // numbers, not just a single class here)
348     EST_StrStr_KVL pairs;
349     EST_StrList lex;
350     EST_String predict_class,real_class;
351     LISP vector,w,predict;
352     double H=0,Q=0,prob;
353     (void)output;
354 
355     for (vector=get_data_vector(data,description);
356 	 vector != NIL; vector=get_data_vector(data,description))
357     {
358 	predict = wagon_vector_predict(tree,vector,description);
359 	predict_class = get_c_string(car(reverse(predict)));
360 	real_class = get_c_string(siod_nth(wgn_predictee,vector));
361 	prob = get_c_float(car(cdr(siod_assoc_str(real_class,
362 						  predict))));
363 	if (prob == 0)
364 	    H += log(0.000001);
365 	else
366 	    H += log(prob);
367 	Q ++;
368 	pairs.add_item(real_class,predict_class,1);
369     }
370     for (w=cdr(siod_nth(wgn_predictee,description)); w != NIL; w = cdr(w))
371 	lex.append(get_c_string(car(w)));
372 
373     const EST_FMatrix &m = confusion(pairs,lex);
374     print_confusion(m,pairs,lex);
375     fprintf(stdout,";; entropy %g perplexity %g\n",
376 	    (-1*(H/Q)),pow(2.0,(-1*(H/Q))));
377 }
378 #endif
379 
380 
wagon_vector_predict(LISP tree,LISP vector,LISP description)381 static LISP wagon_vector_predict(LISP tree, LISP vector, LISP description)
382 {
383     // Using the LISP tree, vector and description, do standard prediction
384 
385     if (cdr(tree) == NIL)
386 	return car(tree);
387 
388     LISP value = find_feature_value(wgn_ques_feature(car(tree)),
389 				    vector, description);
390 
391     if (wagon_ask_question(car(tree),value))
392 	// Yes answer
393 	return wagon_vector_predict(car(cdr(tree)),vector,description);
394     else
395 	// No answer
396 	return wagon_vector_predict(car(cdr(cdr(tree))),vector,description);
397 }
398 
find_feature_value(const char * feature,LISP vector,LISP description)399 static LISP find_feature_value(const char *feature,
400 			       LISP vector, LISP description)
401 {
402     LISP v,d;
403 
404     for (v=vector,d=description; v != NIL; v=cdr(v),d=cdr(d))
405 	if (streq(feature,get_c_string(car(car(d)))))
406 	    return car(v);
407 
408     cerr << "wagon_test: can't find feature \"" << feature <<
409 	"\" in description" << endl;
410     siod_error();
411     return NIL;
412 
413 }
414 
415 /** @name Testing trees
416 <para>
417 Decision trees generated by wagon (or otherwise) can be applied
418 to and tested against data sets using this program.  This program
419 requires a data set which is in the same format as wagon (and
420 other programs) requires.  It also needs a dataset description
421 file naming the fields and given their type (see
422 <link linkend="wagon-manual">the wagon manual</link> for a description
423 for the actual format.
424 <screen>
425 wagon_test -data feats.data -desc feats.desc -tree feats.tree
426 </screen>
427 This will simply uses the tree against each sample in the data
428 file and compare the predicted value with the actual value and
429 produce a summary of the result.  For categorial predictees a
430 percentage correct and confusion matrix is generated.  For continuous
431 values the root mean squared error (RMSE) and correlation between the
432 predicted values and the actual values is given.
433 </para><para>
434 By default the predictee is the first field but may also be specified
435 on the command line.  The dataset may contain features which are not
436 used by the tree.
437 </para><para>
438 This program can also be used to generate output values for sampled
439 data.  In this case the sample data must still contain a "value" for
440 the predictee even if it is dummy.  The option
441 <command>-predict</command> will output the new sample vector with
442 the predicted value in place, and the option
443 <command>-predict_val</command> option will just output the value.
444 </para><para>
445 This program is specifically designed for testing purposes although it
446 can also just be used for prediction. It is probably more efficient
447 to use the Lisp function <command>wagon</command> or underlying
448 C++ function <command>wagon_predict()</command>.
449 
450 */
451 
452 //@}
453