#!/usr/bin/env python2.7
# ----------------------------------
"""
Usage: dectreelearn.py [options] <trainfile> [testfile]

Generate a decision tree for classifying the examples in example
file and do stuff with it.

Options:

-x<n>
 Perform n-fold stratified cross-validation. If n is omitted, performs
 10-fold cross-validation. Folds are stratified for class as far as
 possible, but proportions of very rare classes may still be way off
 (if there are only 5 examples with class X, but you request 10-fold
 cross-validation, 5 folds will have _no_ example for that class).

-X<n>
 Same, but performs non-stratified cross-validation. THIS IS A
 PARTICULARLY BAD IDEA IF THE INPUT SET IS SORTED BY CLASS! I know of
 no reason why you should ever prefer -X to -x, but was reminded about
 the value of stratification the hard way.

-S<n>
 Provide a seed for the pseudo-random number generator used in
 splitting the set for cross-validation. If none is given, current
 system time is used. Use this option to eliminate one random factor
 when comparing different parameters.

-g<limit>
 Sets the relative information gain limit for refining nodes. Smaller
 values make for bigger trees. Default is 0.5, but needs more
 experiments and maybe that a knot in my thinking unknots.

-a
 Use absolute information gain instead of relative information gain to
 select features. If you use -g0.0 -a -s2, this should behave (more or
 less) like a standard ID3 algorithm.

-s<n>
 Set the maximal number of subclasses for each numerical feature to
 n. This only applies at a given node in the tree, the same feature
 can later be picked again and refined.

-v
 Verbose output. The program will print a lot of stuff while doing
 its thing...

-c
 Classify data from a second input file. If classes are given, will
 also print performance, otherwise will just give predicted class for
 each instance.

-e
 Evaluate the generated tree on the training data and print a report.

-n
 Do NOT print the generated tree.

-h
 Print this information and exit.


Copyright 2003 Stephan Schulz, schulz@informatik.tu-muenchen.de

This code is part of the support structure for the equational
theorem prover E. Visit

 http://www4.informatik.tu-muenchen.de/~schulz/WORK/eprover.html

for more information.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston,
MA 02111-1307 USA

The original copyright holder can be contacted as

Stephan Schulz (I4)
Technische Universitaet Muenchen
Institut fuer Informatik
Boltzmannstrasse 3
Garching bei Muenchen
Germany

or via email (address above).
"""


import sys
import string
import random

# Project-local support modules (E prover python library).
import pylib_io
import pylib_basics
import pylib_ml_examples
import pylib_probabilities
import pylib_dectrees

# --- Default parameter values; each may be overridden by a command
# --- line option parsed in the loop below.
relgain_limit = 0.5                 # -g: relative information gain limit for
                                    # refining a node (smaller => bigger trees)
entropy_compare_fun = cmp           # Python 2 builtin cmp; -a swaps in
                                    # rl_lex_compare for absolute-gain selection
max_split = 10                      # -s: max subclasses per numerical feature
crossval = 0                        # -x/-X: number of folds; 0 = no crossval
eval_tree = False                   # -e: evaluate tree on the training data
classify = False                    # -c: classify a second input file
printtree = True                    # -n clears this: suppress tree printout
seed = None                         # -S: PRNG seed; None = seed from system time
stratified = True                   # -X clears this: non-stratified folds
feature_selection = "R"             # "R"elative or "A"bsolute gain (for report)
dectree_constructor = pylib_dectrees.global_decision_tree

# Split the command line into option strings and positional arguments.
options = pylib_io.get_options()
args = pylib_io.get_args()

# Parse options. Note: value-carrying options are matched on their
# 2-character prefix, flags on the full string.
for o in options:
    if o[0:2] == "-g":
        relgain_limit = float(o[2:])
    if o == "-a":
        entropy_compare_fun = pylib_basics.rl_lex_compare
        feature_selection = "A"
    if o[0:2] == "-s":
        # int(float(...)) so that e.g. "-s2.0" is accepted as well.
        max_split = int(float(o[2:]))
    if o[0:2] == "-x":
        try:
            crossval = int(float(o[2:]))
        except ValueError:
            # Bare "-x": fall back to 10-fold cross-validation.
            crossval = 10
    if o[0:2] == "-X":
        # Same as -x, but without class stratification (see __doc__).
        try:
            crossval = int(float(o[2:]))
        except ValueError:
            crossval = 10
        stratified = False
    if o=="-v":
        pylib_basics.globals.verbose = True
    if o=="-c":
        classify = True
        # -c needs both a train file and a test file.
        # NOTE(review): presumably check_argc(2, args) requires exactly 2
        # positional args, while the 3-argument form below gives a range --
        # confirm against pylib_io.
        pylib_io.check_argc(2,args)
    if o=="-e":
        eval_tree = True
    if o=="-n":
        printtree = False
    if o=="-h":
        print __doc__
        sys.exit()
    if o[0:2] == "-S":
        seed = long(o[2:])

# At least one positional argument (the training file) is required.
pylib_io.check_argc(1,None,args)

# Parse the training examples.
# NOTE(review): "set" shadows the Python builtin of the same name for the
# rest of the script.
set = pylib_ml_examples.ml_exampleset()
set.parse(args[0])


if crossval:
    # --- n-fold cross-validation mode ---
    random.seed(seed)
    # Presumably each job is a (training set, test set) pair -- confirm
    # against pylib_ml_examples.crossval_sets.
    jobs = set.crossval_sets(crossval, stratified)
    tr_results = []        # per-fold training-set statistics
    te_results = []        # per-fold test-set statistics
    fold = 0
    for i in jobs:
        fold += 1
        # Learn a tree on this fold's training portion.
        tree = dectree_constructor(i[0],
                                   entropy_compare_fun,
                                   relgain_limit,
                                   max_split)
        (tsize, tdepth, tleaves) = tree.characteristics()
        (apriori, remainder, absinfgain, relinfgain) = tree.entropies()
        # classify_set returns (successes, number of examples classified).
        (succ, count) = tree.classify_set(i[0],pylib_basics.verbose())
        # NOTE(review): raises ZeroDivisionError if a fold is empty.
        succ_percent = float(succ)/count*100
        # Trailing comma: suppress the newline so the test result below is
        # printed on the same line.
        print "Fold %-2d RIG: %5.3f (%2d,%4d,%4d) Train: %4d out of %4d, %7.3f%% " %\
              (fold, relinfgain,tdepth, tsize, tleaves, succ, count, succ_percent),
        tr_results.append((succ,count,succ_percent,relinfgain, tdepth,
                           tsize, tleaves))

        # Evaluate the fold's held-out test portion.
        (succ, count) = tree.classify_set(i[1],pylib_basics.verbose())
        succ_percent = float(succ)/count*100
        print "Test: %4d out of %4d, %7.3f%%" % (succ, count, succ_percent)
        te_results.append((succ, count,succ_percent))

    # Collect the per-fold columns for the summary statistics
    # (Python 2 map returns a list).
    tr_percent = (map(lambda x:x[2],tr_results))
    te_percent = (map(lambda x:x[2],te_results))
    rig        = (map(lambda x:x[3],tr_results))
    depths     = (map(lambda x:x[4],tr_results))
    size       = (map(lambda x:x[5],tr_results))
    leaves     = (map(lambda x:x[6],tr_results))
    # One-line summary: mean +/- standard deviation over all folds.
    print "%s Splits %2d RIGL %5.3f RIG %5.3f+/-%5.3f (%5.2f+/-%4.2f, %7.2f+/-%4.2f, %7.2f+/-%4.2f) Train: %7.3f+/-%7.3f%% Test: %7.3f+/-%7.3f%%" %\
          (feature_selection,
           max_split,
           relgain_limit,
           pylib_basics.mean(rig),
           pylib_basics.standard_deviation(rig),
           pylib_basics.mean(depths),
           pylib_basics.standard_deviation(depths),
           pylib_basics.mean(size),
           pylib_basics.standard_deviation(size),
           pylib_basics.mean(leaves),
           pylib_basics.standard_deviation(leaves),
           pylib_basics.mean(tr_percent),
           pylib_basics.standard_deviation(tr_percent),
           pylib_basics.mean(te_percent),
           pylib_basics.standard_deviation(te_percent))

else:
    # --- single-tree mode: learn one tree on the full training set ---
    tree = dectree_constructor(set,
                               entropy_compare_fun,
                               relgain_limit,
                               max_split)
    if printtree:
        tree.printout()
    if eval_tree:
        # -e: report resubstitution accuracy on the training data.
        # NOTE(review): raises ZeroDivisionError if the training set is
        # empty (unlike the -c branch below, which guards count != 0).
        (succ, count) = tree.classify_set(set, pylib_basics.verbose())
        print "Successes on training set: %d out of %d, %f%%"% (succ, count, float(succ)/count*100)
    if classify:
        # -c: classify the second input file; accuracy is only meaningful
        # (and only printed) when the test set is non-empty.
        testset = pylib_ml_examples.ml_exampleset()
        testset.parse(args[1])
        (succ, count) = tree.classify_set(testset, True)
        if count!=0:
            print "Successes on test set    : %d out of %d, %f%%"% (succ, count, float(succ)/count*100)