1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : October 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* A program for testing a CART tree against data, also may be used to */
37 /* predict values using a tree and data */
38 /* */
39 /*=======================================================================*/
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <cmath>
#include <iostream>
#include <fstream>
#include "EST_Wagon.h"
#include "EST_cutils.h"
#include "EST_multistats.h"
#include "EST_Token.h"
#include "EST_cmd_line.h"
49
50 static int wagon_test_main(int argc, char **argv);
51 static LISP find_feature_value(const char *feature,
52 LISP vector, LISP description);
53 static LISP wagon_vector_predict(LISP tree, LISP vector, LISP description);
54 static LISP get_data_vector(EST_TokenStream &data, LISP description);
55 static void simple_predict(EST_TokenStream &data, FILE *output,
56 LISP tree, LISP description, int all_info);
57 static void test_tree_class(EST_TokenStream &data, FILE *output,
58 LISP tree, LISP description);
59 static void test_tree_float(EST_TokenStream &data, FILE *output,
60 LISP tree, LISP description);
61
62 /** @name <command>wagon_test</command> <emphasis>Test CART models</emphasis>
63 @id wagon-test-manual
64 * @toc
65 */
66
67 //@{
68
69
70 /**@name Synopsis
71 */
72 //@{
73
74 //@synopsis
75
76 /**
77 Wagon_test is used to test CART models on feature data.
78
79 A detailed description of the CART model can be found in the
80 <link linkend="cart-overview">CART model overview</link> section.
81 */
82
main(int argc,char ** argv)83 int main(int argc, char **argv)
84 {
85
86 wagon_test_main(argc,argv);
87
88 exit(0);
89 return 0;
90 }
91
wagon_test_main(int argc,char ** argv)92 static int wagon_test_main(int argc, char **argv)
93 {
94 // Top level function sets up data and creates a tree
95 EST_Option al;
96 EST_StrList files;
97 LISP description,tree=NIL;;
98 EST_TokenStream data;
99 FILE *wgn_output;
100
101 parse_command_line
102 (argc, argv,
103 EST_String("<options>\n")+
104 "Summary: program to test CART models on data\n"+
105 "-desc <ifile> Field description file\n"+
106 "-data <ifile> Datafile, one vector per line\n"+
107 "-tree <ifile> File containing CART tree\n"+
108 "-track <ifile>\n"+
109 " track for vertex indices\n"+
110 "-predict Predict for each vector returning full vector\n"+
111 "-predict_val Predict for each vector returning just value\n"+
112 "-predictee <string>\n"+
113 " name of field to predict (default is first field)\n"+
114 "-heap <int> {210000}\n"+
115 " Set size of Lisp heap, should not normally need\n"+
116 " to be changed from its default\n"+
117 "-o <ofile> File to save output in\n",
118 files, al);
119
120 siod_init(al.ival("-heap"));
121
122 if (al.present("-desc"))
123 {
124 gc_protect(&description);
125 description = car(vload(al.val("-desc"),1));
126 }
127 else
128 {
129 cerr << argv[0] << ": no description file specified" << endl;
130 exit(-1);
131 }
132
133 if (al.present("-tree"))
134 {
135 gc_protect(&tree);
136 tree = car(vload(al.val("-tree"),1));
137 if (tree == NIL)
138 {
139 cerr << argv[0] << ": no tree found in \"" << al.val("-tree")
140 << "\"" << endl;
141 exit(-1);
142 }
143 }
144 else
145 {
146 cerr << argv[0] << ": no tree file specified" << endl;
147 exit(-1);
148 }
149
150 if (al.present("-data"))
151 {
152 if (data.open(al.val("-data")) != 0)
153 {
154 cerr << argv[0] << ": can't open data file \"" <<
155 al.val("-data") << "\" for input." << endl;
156 exit(-1);
157 }
158 }
159 else
160 {
161 cerr << argv[0] << ": no data file specified" << endl;
162 exit(-1);
163 }
164
165 if (al.present("-track"))
166 {
167 wgn_VertexTrack.load(al.val("-track"));
168 }
169
170 if (al.present("-o"))
171 {
172 if ((wgn_output = fopen(al.val("-o"),"w")) == NULL)
173 {
174 cerr << argv[0] << ": can't open output file \"" <<
175 al.val("-o") << "\"" << endl;
176 }
177 }
178 else
179 wgn_output = stdout;
180
181 if (al.present("-predictee"))
182 {
183 LISP l;
184 int i;
185 wgn_predictee_name = al.val("-predictee");
186 for (l=description,i=0; l != NIL; l=cdr(l),i++)
187 if (streq(wgn_predictee_name,get_c_string(car(car(l)))))
188 {
189 wgn_predictee = i;
190 break;
191 }
192 if (l==NIL)
193 {
194 cerr << argv[0] << ": predictee \"" << wgn_predictee <<
195 "\" not in description\n";
196 }
197 }
198 const char *predict_type =
199 get_c_string(car(cdr(siod_nth(wgn_predictee,description))));
200
201 if (al.present("-predict"))
202 simple_predict(data,wgn_output,tree,description,FALSE);
203 else if (al.present("-predict_val"))
204 simple_predict(data,wgn_output,tree,description,TRUE);
205 else if (streq(predict_type,"float") ||
206 streq(predict_type,"int"))
207 test_tree_float(data,wgn_output,tree,description);
208 #if 0
209 else if (streq(predict_type,"vector"))
210 test_tree_vector(data,wgn_output,tree,description);
211 #endif
212 else
213 test_tree_class(data,wgn_output,tree,description);
214
215 if (wgn_output != stdout)
216 fclose(wgn_output);
217 data.close();
218 return 0;
219 }
220
get_data_vector(EST_TokenStream & data,LISP description)221 static LISP get_data_vector(EST_TokenStream &data, LISP description)
222 {
223 // read in one vector. Should be terminated with an newline
224 LISP v=NIL,d;
225
226 if (data.eof())
227 return NIL;
228
229 for (d=description; d != NIL; d=cdr(d))
230 {
231 EST_Token t = data.get();
232
233 if ((d != description) && (t.whitespace().contains("\n")))
234 {
235 cerr << "wagon_test: unexpected newline within vector " <<
236 t.row() << " wrong number of features" << endl;
237 siod_error();
238 }
239 if (streq(get_c_string(car(cdr(car(d)))),"float") ||
240 streq(get_c_string(car(cdr(car(d)))),"int"))
241 v = cons(flocons(atof(t.string())),v);
242 else if ((streq(get_c_string(car(cdr(car(d)))),"_other_")) &&
243 (siod_member_str(t.string(),cdr(car(d))) == NIL))
244 v = cons(strintern("_other_"),v);
245 else
246 v = cons(strintern(t.string()),v);
247 }
248
249 return reverse(v);
250 }
251
simple_predict(EST_TokenStream & data,FILE * output,LISP tree,LISP description,int all_info)252 static void simple_predict(EST_TokenStream &data, FILE *output,
253 LISP tree, LISP description, int all_info)
254 {
255 LISP vector,predict;
256 EST_String val;
257
258 for (vector=get_data_vector(data,description);
259 vector != NIL; vector=get_data_vector(data,description))
260 {
261 predict = wagon_vector_predict(tree,vector,description);
262 if (all_info)
263 val = siod_sprint(car(reverse(predict)));
264 else
265 val = siod_sprint(predict);
266 fprintf(output,"%s\n",(const char *)val);
267 }
268 }
269
test_tree_float(EST_TokenStream & data,FILE * output,LISP tree,LISP description)270 static void test_tree_float(EST_TokenStream &data, FILE *output,
271 LISP tree, LISP description)
272 {
273 // Test tree against data to get summary of results FLOAT
274 float predict_val,real_val;
275 EST_SuffStats x,y,xx,yy,xy,se,e;
276 double cor,error;
277 LISP vector,predict;
278
279 for (vector=get_data_vector(data,description);
280 vector != NIL; vector=get_data_vector(data,description))
281 {
282 predict = wagon_vector_predict(tree,vector,description);
283 predict_val = get_c_float(car(reverse(predict)));
284 real_val = get_c_float(siod_nth(wgn_predictee,vector));
285 x += predict_val;
286 y += real_val;
287 error = predict_val-real_val;
288 se += error*error;
289 e += fabs(error);
290 xx += predict_val*predict_val;
291 yy += real_val*real_val;
292 xy += predict_val*real_val;
293 }
294
295 cor = (xy.mean() - (x.mean()*y.mean()))/
296 (sqrt(xx.mean()-(x.mean()*x.mean())) *
297 sqrt(yy.mean()-(y.mean()*y.mean())));
298
299 fprintf(output,";; RMSE %1.4f Correlation is %1.4f Mean (abs) Error %1.4f (%1.4f)\n",
300 sqrt(se.mean()),
301 cor,
302 e.mean(),
303 e.stddev());
304 }
305
test_tree_class(EST_TokenStream & data,FILE * output,LISP tree,LISP description)306 static void test_tree_class(EST_TokenStream &data, FILE *output,
307 LISP tree, LISP description)
308 {
309 // Test tree against class data to get summary of results
310 EST_StrStr_KVL pairs;
311 EST_StrList lex;
312 EST_String predict_class,real_class;
313 LISP vector,w,predict;
314 double H=0,Q=0,prob;
315 (void)output;
316
317 for (vector=get_data_vector(data,description);
318 vector != NIL; vector=get_data_vector(data,description))
319 {
320 predict = wagon_vector_predict(tree,vector,description);
321 predict_class = get_c_string(car(reverse(predict)));
322 real_class = get_c_string(siod_nth(wgn_predictee,vector));
323 prob = get_c_float(car(cdr(siod_assoc_str(real_class,
324 predict))));
325 if (prob == 0)
326 H += log(0.000001);
327 else
328 H += log(prob);
329 Q ++;
330 pairs.add_item(real_class,predict_class,1);
331 }
332 for (w=cdr(siod_nth(wgn_predictee,description)); w != NIL; w = cdr(w))
333 lex.append(get_c_string(car(w)));
334
335 const EST_FMatrix &m = confusion(pairs,lex);
336 print_confusion(m,pairs,lex);
337 fprintf(stdout,";; entropy %g perplexity %g\n",
338 (-1*(H/Q)),pow(2.0,(-1*(H/Q))));
339 }
340
#if 0
// NOTE(review): disabled draft for vector-valued predictees; currently a
// verbatim copy of test_tree_class (its only call site, in
// wagon_test_main, is also under #if 0).
static void test_tree_vector(EST_TokenStream &data, FILE *output,
			     LISP tree, LISP description)
{
    // Test tree against class data to get summary of results
    // Note we are talking about predicting vectors (a *bunch* of
    // numbers, not just a single class here)
    EST_StrStr_KVL pairs;
    EST_StrList lex;
    EST_String predict_class,real_class;
    LISP vector,w,predict;
    double H=0,Q=0,prob;
    (void)output;

    for (vector=get_data_vector(data,description);
	 vector != NIL; vector=get_data_vector(data,description))
    {
	predict = wagon_vector_predict(tree,vector,description);
	predict_class = get_c_string(car(reverse(predict)));
	real_class = get_c_string(siod_nth(wgn_predictee,vector));
	prob = get_c_float(car(cdr(siod_assoc_str(real_class,
						  predict))));
	// Zero probabilities are floored before taking the log
	if (prob == 0)
	    H += log(0.000001);
	else
	    H += log(prob);
	Q ++;
	pairs.add_item(real_class,predict_class,1);
    }
    // Class inventory from the predictee's description entry
    for (w=cdr(siod_nth(wgn_predictee,description)); w != NIL; w = cdr(w))
	lex.append(get_c_string(car(w)));

    const EST_FMatrix &m = confusion(pairs,lex);
    print_confusion(m,pairs,lex);
    fprintf(stdout,";; entropy %g perplexity %g\n",
	    (-1*(H/Q)),pow(2.0,(-1*(H/Q))));
}
#endif
379
380
wagon_vector_predict(LISP tree,LISP vector,LISP description)381 static LISP wagon_vector_predict(LISP tree, LISP vector, LISP description)
382 {
383 // Using the LISP tree, vector and description, do standard prediction
384
385 if (cdr(tree) == NIL)
386 return car(tree);
387
388 LISP value = find_feature_value(wgn_ques_feature(car(tree)),
389 vector, description);
390
391 if (wagon_ask_question(car(tree),value))
392 // Yes answer
393 return wagon_vector_predict(car(cdr(tree)),vector,description);
394 else
395 // No answer
396 return wagon_vector_predict(car(cdr(cdr(tree))),vector,description);
397 }
398
find_feature_value(const char * feature,LISP vector,LISP description)399 static LISP find_feature_value(const char *feature,
400 LISP vector, LISP description)
401 {
402 LISP v,d;
403
404 for (v=vector,d=description; v != NIL; v=cdr(v),d=cdr(d))
405 if (streq(feature,get_c_string(car(car(d)))))
406 return car(v);
407
408 cerr << "wagon_test: can't find feature \"" << feature <<
409 "\" in description" << endl;
410 siod_error();
411 return NIL;
412
413 }
414
415 /** @name Testing trees
416 <para>
417 Decision trees generated by wagon (or otherwise) can be applied
418 to and tested against data sets using this program. This program
419 requires a data set which is in the same format as wagon (and
420 other programs) requires. It also needs a dataset description
file naming the fields and giving their types (see
<link linkend="wagon-manual">the wagon manual</link> for a description
of the actual format).
424 <screen>
425 wagon_test -data feats.data -desc feats.desc -tree feats.tree
426 </screen>
This will simply use the tree on each sample in the data
file, compare the predicted value with the actual value, and
produce a summary of the result. For categorical predictees a
430 percentage correct and confusion matrix is generated. For continuous
431 values the root mean squared error (RMSE) and correlation between the
432 predicted values and the actual values is given.
433 </para><para>
434 By default the predictee is the first field but may also be specified
435 on the command line. The dataset may contain features which are not
436 used by the tree.
437 </para><para>
438 This program can also be used to generate output values for sampled
439 data. In this case the sample data must still contain a "value" for
440 the predictee even if it is dummy. The option
441 <command>-predict</command> will output the new sample vector with
442 the predicted value in place, and the option
443 <command>-predict_val</command> option will just output the value.
444 </para><para>
445 This program is specifically designed for testing purposes although it
446 can also just be used for prediction. It is probably more efficient
447 to use the Lisp function <command>wagon</command> or underlying
448 C++ function <command>wagon_predict()</command>.
449
450 */
451
452 //@}
453