#!/usr/bin/env python2.7
# ----------------------------------
"""
Usage: dectreelearn.py [options] <trainfile> [testfile]

Generate a decision tree for classifying the examples in <trainfile>
and, depending on the options, print it, evaluate it, or use it to
classify further examples. See the usage examples below the option
list.

Options:

-x<n>
 Perform n-fold stratified cross-validation. If n is omitted, performs
 10-fold cross-validation. Folds are stratified by class as far as
 possible, but proportions of very rare classes may still be way off
 (if there are only 5 examples with class X, but you request 10-fold
 cross-validation, 5 folds will have _no_ example for that class).

-X<n>
 Same, but performs non-stratified cross-validation. THIS IS A
 PARTICULARLY BAD IDEA IF THE INPUT SET IS SORTED BY CLASS! I know of
 no reason why you should ever prefer -X to -x, but was reminded about
 the value of stratification the hard way.

-S<n>
 Provide a seed for the pseudo-random number generator used in
 splitting the set for cross-validation. If none is given, the current
 system time is used. Use this option to eliminate one random factor
 when comparing different parameters.

-g<limit>
 Set the relative information gain limit for refining nodes. Smaller
 values make for bigger trees. The default is 0.5, but this value is
 preliminary and needs more experimentation.

-a
 Use absolute information gain instead of relative information gain to
 select features. If you use -g0.0 -a -s2, this should behave (more or
 less) like a standard ID3 algorithm.

-s<n>
 Set the maximal number of subclasses for each numerical feature to
 n. This only applies at a given node in the tree; the same feature
 can later be picked again and refined.

-v
 Verbose output. The program will print a lot of stuff while doing
 its thing...

-c
 Classify the data from a second input file. If classes are given,
 also prints performance; otherwise just prints the predicted class
 for each instance.

-e
 Evaluate the generated tree on the training data and print a report.

-n
 Do NOT print the generated tree.

-h
 Print this information and exit.

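Examples (the file names are placeholders for your own data):

 dectreelearn.py train.examples
  Build a tree from train.examples and print it.

 dectreelearn.py -x10 -S1 train.examples
  Run 10-fold stratified cross-validation with a fixed seed.

 dectreelearn.py -n -c train.examples test.examples
  Build a tree, suppress its printout, and classify the examples in
  test.examples.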

Copyright 2003 Stephan Schulz, schulz@informatik.tu-muenchen.de

This code is part of the support structure for the equational
theorem prover E. Visit

 http://www4.informatik.tu-muenchen.de/~schulz/WORK/eprover.html

for more information.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston,
MA  02111-1307 USA

The original copyright holder can be contacted as

Stephan Schulz (I4)
Technische Universitaet Muenchen
Institut fuer Informatik
Boltzmannstrasse 3
Garching bei Muenchen
Germany

or via email (address above).
"""


import sys
import random

import pylib_io
import pylib_basics
import pylib_ml_examples
import pylib_probabilities
import pylib_dectrees

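# Illustrative only (not used below): a rough sketch of the quantities
# that drive feature selection. The authoritative definitions live in
# pylib_probabilities and pylib_dectrees; in particular, taking relative
# gain to be the absolute gain normalized by the a-priori entropy is my
# assumption. The tuple returned mirrors tree.entropies() below.

import math

def entropy_sketch(counts):
    # Shannon entropy (in bits) of a class distribution given as a list
    # of absolute counts, e.g. [9, 5].
    total = float(sum(counts))
    return -sum(c/total * math.log(c/total, 2) for c in counts if c)

def infgain_sketch(class_counts, branches):
    # class_counts: per-class counts before a split, e.g. [9, 5].
    # branches: per-branch class counts after the split, e.g.
    # [[4, 0], [5, 5]] for a two-way split (hypothetical numbers).
    total = float(sum(class_counts))
    apriori = entropy_sketch(class_counts)
    remainder = sum(sum(b)/total * entropy_sketch(b) for b in branches)
    absinfgain = apriori - remainder
    relinfgain = absinfgain/apriori if apriori else 0.0
    return (apriori, remainder, absinfgain, relinfgain)
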
relgain_limit = 0.5      # -g: relative information gain limit for refinements
entropy_compare_fun = cmp  # replaced by rl_lex_compare if -a is given
max_split  = 10          # -s: maximal subclasses per numerical feature
crossval   = 0           # -x/-X: number of cross-validation folds (0 = none)
eval_tree  = False       # -e: evaluate the tree on the training data
classify   = False       # -c: classify the examples in a second file
printtree  = True        # cleared by -n: print the generated tree
seed       = None        # -S: seed for the cross-validation split
stratified = True        # cleared by -X: stratify folds by class
feature_selection = "R"  # "R"elative or "A"bsolute information gain
dectree_constructor = pylib_dectrees.global_decision_tree

options = pylib_io.get_options()
args    = pylib_io.get_args()

# Process the command line options. Anything that does not match one of
# the patterns below is silently ignored.
for o in options:
    if o[0:2] == "-g":
        relgain_limit = float(o[2:])
    elif o == "-a":
        entropy_compare_fun = pylib_basics.rl_lex_compare
        feature_selection = "A"
    elif o[0:2] == "-s":
        max_split = int(float(o[2:]))
    elif o[0:2] == "-x":
        try:
            crossval = int(float(o[2:]))
        except ValueError:
            crossval = 10
    elif o[0:2] == "-X":
        try:
            crossval = int(float(o[2:]))
        except ValueError:
            crossval = 10
        stratified = False
    elif o == "-v":
        pylib_basics.globals.verbose = True
    elif o == "-c":
        classify = True
        pylib_io.check_argc(2, args)
    elif o == "-e":
        eval_tree = True
    elif o == "-n":
        printtree = False
    elif o == "-h":
        print __doc__
        sys.exit()
    elif o[0:2] == "-S":
        seed = long(o[2:])

pylib_io.check_argc(1, None, args)

# Read the training examples.
examples = pylib_ml_examples.ml_exampleset()
examples.parse(args[0])

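# Illustrative only (not used below): one way stratified fold assignment
# can work, sketched under my own assumptions; the authoritative
# implementation is the crossval_sets() method used below. Examples are
# grouped by class, shuffled within each group, and dealt round-robin
# onto the folds, so every class is spread as evenly as the fold count
# allows (which reproduces the -x caveat: 5 examples of a class cannot
# reach all of 10 folds).

def stratified_folds_sketch(labeled_items, n):
    # labeled_items: list of (item, label) pairs; n: number of folds.
    by_label = {}
    for item, label in labeled_items:
        by_label.setdefault(label, []).append(item)
    folds = [[] for _ in range(n)]
    i = 0
    for label in sorted(by_label):
        group = by_label[label]
        random.shuffle(group)  # uses the module-level random import
        for item in group:
            folds[i % n].append(item)
            i += 1
    return folds
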

if crossval:
    # Split the example set into crossval folds, build a tree on the
    # training part of each fold, and collect statistics on both the
    # training and the test part.
    random.seed(seed)
    jobs = examples.crossval_sets(crossval, stratified)
    tr_results = []
    te_results = []
    fold = 0
    for train, test in jobs:
        fold += 1
        tree = dectree_constructor(train,
                                   entropy_compare_fun,
                                   relgain_limit,
                                   max_split)
        (tsize, tdepth, tleaves)                     = tree.characteristics()
        (apriori, remainder, absinfgain, relinfgain) = tree.entropies()
        (succ, count) = tree.classify_set(train, pylib_basics.verbose())
        succ_percent = float(succ)/count*100
        print "Fold %-2d RIG: %5.3f (%2d,%4d,%4d) Train: %4d out of %4d, %7.3f%% " %\
              (fold, relinfgain, tdepth, tsize, tleaves, succ, count, succ_percent),
        tr_results.append((succ, count, succ_percent, relinfgain, tdepth,
                           tsize, tleaves))

        (succ, count) = tree.classify_set(test, pylib_basics.verbose())
        succ_percent = float(succ)/count*100
        print "Test: %4d out of %4d, %7.3f%%" % (succ, count, succ_percent)
        te_results.append((succ, count, succ_percent))

    # Aggregate the per-fold results into means and standard deviations.
    tr_percent = [x[2] for x in tr_results]
    te_percent = [x[2] for x in te_results]
    rig        = [x[3] for x in tr_results]
    depths     = [x[4] for x in tr_results]
    size       = [x[5] for x in tr_results]
    leaves     = [x[6] for x in tr_results]
    print "%s Splits %2d RIGL %5.3f RIG %5.3f+/-%5.3f (%5.2f+/-%4.2f, %7.2f+/-%4.2f, %7.2f+/-%4.2f) Train: %7.3f+/-%7.3f%%  Test:  %7.3f+/-%7.3f%%" %\
          (feature_selection,
           max_split,
           relgain_limit,
           pylib_basics.mean(rig),
           pylib_basics.standard_deviation(rig),
           pylib_basics.mean(depths),
           pylib_basics.standard_deviation(depths),
           pylib_basics.mean(size),
           pylib_basics.standard_deviation(size),
           pylib_basics.mean(leaves),
           pylib_basics.standard_deviation(leaves),
           pylib_basics.mean(tr_percent),
           pylib_basics.standard_deviation(tr_percent),
           pylib_basics.mean(te_percent),
           pylib_basics.standard_deviation(te_percent))

else:
    # Build a single tree from the full training set and print it,
    # evaluate it, or use it as a classifier, as requested.
    tree = dectree_constructor(examples,
                               entropy_compare_fun,
                               relgain_limit,
                               max_split)
    if printtree:
        tree.printout()
    if eval_tree:
        (succ, count) = tree.classify_set(examples, pylib_basics.verbose())
        print "Successes on training set: %d out of %d, %f%%" % (succ, count, float(succ)/count*100)
    if classify:
        testset = pylib_ml_examples.ml_exampleset()
        testset.parse(args[1])
        (succ, count) = tree.classify_set(testset, True)
        if count != 0:
            print "Successes on test set    : %d out of %d, %f%%" % (succ, count, float(succ)/count*100)