#!/usr/bin/env python2.7
# ----------------------------------
#
# Module pylib_ml_examples.py
#
# Functions and classes for handling sets of examples for machine
# learning, where each example contains an identifier, a set of
# features of different types, and a desired class.
#
# Copyright 2003 Stephan Schulz, schulz@informatik.tu-muenchen.de
#
# This code is part of the support structure for the equational
# theorem prover E. Visit
#
#  http://www4.informatik.tu-muenchen.de/~schulz/WORK/eprover.html
#
# for more information.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA  02111-1307 USA
#
# The original copyright holder can be contacted as
#
# Stephan Schulz (I4)
# Technische Universitaet Muenchen
# Institut fuer Informatik
# Boltzmannstrasse 3
# Garching bei Muenchen
# Germany
#
# or via email (address above).
44 45from __future__ import generators 46 47from types import * 48import string 49import random 50from UserList import UserList 51 52import pylib_basics 53import pylib_io 54import pylib_probabilities 55 56 57def atofeatureval(str): 58 """Try to convert a string to an integer or a float, 59 and return the converted value""" 60 try: 61 res=string.atoi(str) 62 return res 63 except ValueError: 64 pass 65 try: 66 res=string.atof(str) 67 return res 68 except ValueError: 69 pass 70 return str 71 72 73def typemax(t1,t2): 74 """Given two types (string, int, float), return 75 the more general one""" 76 if t1 == StringType: 77 return t1 78 if t2 == StringType: 79 return t2 80 if t1 == FloatType: 81 return t1 82 if t2 == FloatType: 83 return t2 84 assert(t1==IntType and t2==IntType) 85 return t1 86 87 88def typemax_l(l): 89 """ 90 Reduce a list of types to the most specific one compatible with 91 all elements. 92 """ 93 return reduce(typemax, l, int) 94 95 96class ml_example: 97 """ 98 Representing an example for machine learning, with am identifier 99 (id), a list of features values (features), and (optionally) a 100 class (tclass). 101 """ 102 def __init__(self, 103 rep, 104 classadmin = pylib_basics.name_number_hash()): 105 self.classadmin = classadmin 106 if type(rep) == StringType: 107 tmp = rep.split(":"); 108 tmp = map(string.strip, tmp) 109 tmp[1] = tmp[1].split(",") 110 tmp[1] = map(string.strip, tmp[1]) 111 else: 112 assert type(rep) == TupleType 113 assert len(rep) == 3 or len(rep)==4 114 tmp = rep 115 116 assert type(tmp[1]) == ListType 117 self.id = tmp[0] 118 self.features = map(atofeatureval,tmp[1]) 119 if len(tmp)==3: 120 self.tclass = self.classadmin.insert(tmp[2]) 121 else: 122 self.tclass = None 123 124 def feature_no(self): 125 """ 126 Return number of different features of example. 
127 """ 128 return len(self.features) 129 130 def __repr__(self): 131 features = string.join(map(str, self.features),",") 132 res = self.id + " : " +features; 133 if self.tclass!=None: 134 res+= " : "+self.classadmin.get_name(self.tclass) 135 return res 136 137 def feature_val(self, f_no): 138 """ 139 Return value of a given feature. 140 """ 141 return self.features[f_no] 142 143 def tclass_val(self): 144 """ 145 Return target class of feature. 146 """ 147 return self.tclass 148 149 150class ml_exampleset(UserList): 151 """ 152 Representing a set of examples for machine learning. 153 """ 154 def __init__(self, data=[]): 155 UserList.__init__(self) 156 self.feature_no = None 157 self.name = None 158 self.class_omega = None 159 self.init_precomputed() 160 self.classadmin = pylib_basics.name_number_hash() 161 for i in data: 162 self.append(i) 163 164 def set_name(self, name): 165 self.name = name 166 167 def append(self, element): 168 if self.feature_no: 169 assert self.feature_no == element.feature_no() 170 else: 171 self.feature_no = element.feature_no() 172 UserList.append(self, element) 173 self.init_precomputed() 174 175 def extend(self, l): 176 for i in l: 177 self.append(i) 178 179 def init_precomputed(self): 180 """ 181 Initialize/reset precomputed values that potentially change 182 with each new inertion. 183 """ 184 self.feature_types = None 185 self.feature_values = None 186 self.feature_dvalues = None 187 self.feature_range = None 188 self.class_values = None 189 self.class_dvalues = None 190 191 def get_feature_type(self, feature): 192 """ 193 Give the type of the requested feature. 194 """ 195 return self.get_feature_types()[feature] 196 197 198 def get_feature_types(self): 199 """ 200 Return a list of types of features used in examples, 201 recomputing it if necessary. 
202 """ 203 if not self.feature_no: 204 return None 205 if self.feature_types: 206 return self.feature_types 207 self.feature_types = [] 208 for i in range(0, self.feature_no): 209 self.feature_types.append(typemax_l(map(lambda element, f=i: 210 type(element.feature_val(f)), 211 self.data))) 212 return self.feature_types 213 214 def get_feature_values(self, feature): 215 """ 216 Return (sorted) list of all values for a given feature. 217 """ 218 if not self.feature_no: 219 return None 220 if not self.feature_values: 221 self.feature_values = [None] * self.feature_no 222 if not self.feature_values[feature]: 223 self.feature_values[feature] =\ 224 map(lambda x, f=feature:x.feature_val(f), self) 225 self.feature_values[feature].sort() 226 return self.feature_values[feature] 227 228 229 def get_feature_range(self, feature): 230 """ 231 Return tuple with largest and smallest feature value (using 232 natural order of the type). 233 """ 234 if not self.feature_no: 235 return None 236 if not self.feature_range: 237 self.feature_range = [None] * self.feature_no 238 if not self.feature_range[feature]: 239 tmp = self.get_feature_values(feature) 240 self.feature_range[feature] = (tmp[0], tmp[-1]) 241 return self.feature_range[feature] 242 243 def get_distinct_feature_values(self, feature): 244 """ 245 Return (sorted) list of all distinct values for a given 246 feature. 247 """ 248 if not self.feature_no: 249 return None 250 if not self.feature_dvalues: 251 self.feature_dvalues = [None] * self.feature_no 252 if not self.feature_dvalues[feature]: 253 self.feature_dvalues[feature] =\ 254 pylib_basics.uniq(self.get_feature_values(feature)) 255 return self.feature_dvalues[feature] 256 257 def get_class_values(self): 258 """ 259 Return (sorted) list of all classes of examples. 
260 """ 261 if self.class_values == None: 262 self.class_values = map(lambda x:x.tclass_val(), self) 263 self.class_values.sort() 264 return self.class_values 265 266 def get_distinct_class_values(self): 267 """ 268 Return (sorted) list of all occuring classes. 269 """ 270 if self.class_dvalues == None: 271 self.class_dvalues = pylib_basics.uniq(self.get_class_values()) 272 return self.class_dvalues 273 274 def get_class_number(self): 275 """ 276 Return the number of distinct classes. 277 """ 278 return self.classadmin.get_entry_no() 279 280 def parse(self, file): 281 f = pylib_io.flexopen(file,'r') 282 l = f.readlines() 283 pylib_io.flexclose(f) 284 for line in l: 285 if line[0] == "#": 286 continue 287 ex = ml_example(line, self.classadmin) 288 self.append(ex) 289 290 def __repr__(self): 291 if self.name: 292 res = "# "+self.name +"\n" 293 else: 294 res = "" 295 for i in self: 296 res+=repr(i)+"\n" 297 return res 298 299 def abs_freq_vector(self, abstracter): 300 tmppart = partition() 301 tmppart.abstracter = abstracter 302 tmppart.insert_set(self) 303 return tmppart.abs_freq_vector() 304 305 def most_frequent_class(self): 306 """ 307 Find the most frequent class of examples in the set and return 308 it (and its relative frequency). 309 """ 310 classes = self.get_distinct_class_values() 311 class_occ = self.get_class_values() 312 class_freq = [(class_occ.count(cl),cl) for cl in classes] 313 class_freq.sort() 314 abs_freq, mf_class = class_freq[-1] 315 rel_freq = float(abs_freq)/len(class_occ) 316 return (mf_class, rel_freq) 317 318 def class_freq_vector(self): 319 """ 320 Return an in-order vector of absolute class frequencies. 
321 """ 322 res = [0] * self.get_class_number() 323 class_occ = self.get_class_values() 324 for i in class_occ: 325 res[i]+=1 326 # print res, len(res) 327 return res 328 329 def get_class_entropy(self): 330 return pylib_probabilities.compute_entropy_absdistrib(self.class_freq_vector()) 331 def plain_rel_inf_gain(self): 332 apriori_entropy = pylib_probabilities.compute_entropy_absdistrib([1] 333 * self.get_class_number()) 334 real_entropy = self.get_class_entropy() 335 return pylib_probabilities.rel_info_gain(apriori_entropy, 336 real_entropy, 337 real_entropy) 338 339 def random_split(self,n): 340 """ 341 Randomly split the example set into n neary equal-sized subsets. 342 """ 343 tmp = list(self) 344 random.shuffle(tmp) 345 res = [] 346 for i in range(n): 347 res.append([]) 348 count = 0 349 for i in tmp: 350 res[count].append(i) 351 count+=1 352 count = count % n 353 return res 354 355 def stratified_split(self,n): 356 """ 357 Randomly split the example set into n neary equal-sized 358 subsets stratified for class. 359 """ 360 def weird_cmp(ex1, ex2): 361 tmp = cmp(ex1.tclass_val(), ex2.tclass_val) 362 if tmp: 363 return tmp 364 return cmp(ex1.tmp, ex2.tmp) 365 366 tmp = list(self) 367 random.shuffle(tmp) 368 count = 0 369 for i in tmp: 370 i.tmp = count 371 count += 1 372 373 tmp.sort(weird_cmp) 374 res = [] 375 for i in range(n): 376 res.append([]) 377 count = 0 378 for i in tmp: 379 res[count].append(i) 380 count+=1 381 count = count % n 382 return res 383 384 385 def crossval_sets(self,n, stratified=True): 386 """ 387 Return a list of n tuples (training_set, test_set), so that 388 the union of both is the full set, the 10 test sets are 389 disjoint, and the union of the 10 test sets is the full set. 
390 """ 391 res = [] 392 if stratified: 393 tmp = self.stratified_split(n) 394 else: 395 tmp = self.random_split(n) 396 for i in range(len(tmp)): 397 train = ml_exampleset() 398 train.classadmin = self.classadmin 399 for j in range(len(tmp)): 400 if j!=i: 401 train.extend(tmp[j]) 402 test = ml_exampleset() 403 test.classadmin = self.classadmin 404 test.extend(tmp[i]) 405 res.append((train, test)) 406 return res 407 408def class_partitioner(example): 409 return example.tclass_val() 410 411class partition: 412 """ 413 Represent a partition of examples with named parts. Partition 414 type is defined by the abstracter() function that should return a 415 partition label for each subset. 416 """ 417 def __init__(self, examples=[]): 418 self.parts = {} 419 self.insert_set(examples) 420 self.entropy = None 421 self.class_entropy = None 422 self.remainder_entropy = None 423 424 def empty(self): 425 return len(self.parts)==0 426 427 def trivial(self): 428 return len(self.parts)<=1 429 430 def insert(self, example, class_admin=None): 431 """ 432 Insert a single example. 433 """ 434 self.entropy = None 435 self.class_entropy = None 436 self.remainder_entropy = None 437 part = self.abstracter(example) 438 try: 439 self.parts[part].append(example) 440 except KeyError: 441 tmp = ml_exampleset() 442 tmp.classadmin = class_admin 443 tmp.set_name(part) 444 tmp.append(example) 445 self.parts[part] = tmp 446 447 def insert_set(self, examples): 448 """ 449 Insert a whole set of examples. 450 """ 451 if isinstance(examples, ml_exampleset): 452 class_admin = examples.classadmin 453 for i in examples: 454 self.insert(i, class_admin) 455 else: 456 for i in examples: 457 self.insert(i) 458 459 460 def get_size(self): 461 return len(self.parts) 462 463 def abstracter(self, example): 464 """ 465 Return a label. This is only defined for concrete instances. 466 """ 467 assert false, "Virtual function only!" 
468 469 def __repr__(self): 470 res = "# Partition:\n" 471 tmp = self.parts.keys() 472 tmp.sort() 473 for i in tmp: 474 res = res+repr(i)+"\n"+repr(self.parts[i]) 475 return res 476 477 def get_class_distributions(self): 478 res = [i.class_freq_vector() for i in self.parts.values()] 479 return res 480 481 def compute_entropies(self): 482 if self.entropy != None: 483 return 484 if len(self.parts)==1: # Avoid rounding errors 485 tmp = self.parts.values()[0].get_class_entropy() 486 (self.entropy, self.class_entropy, self.remainder_entropy) =\ 487 (0,tmp,tmp) 488 else: 489 tmp = self.get_class_distributions() 490 (self.entropy, self.class_entropy, self.remainder_entropy) =\ 491 pylib_probabilities.compute_entropies(tmp) 492 493 def get_class_entropy(self): 494 """ 495 Return the a-priory entropy of the class distribution. 496 """ 497 self.compute_entropies() 498 return self.class_entropy 499 500 501 def get_entropy(self): 502 """ 503 Return the entropy of the partion. 504 """ 505 self.compute_entropies() 506 return self.entropy 507 508 def get_remainder_entropy(self): 509 """ 510 Return the remainder entropy of the class test after 511 performing the paritioning. 512 """ 513 self.compute_entropies() 514 return self.remainder_entropy 515 516 517class scalar_feature_test: 518 """ 519 Callable object representing a single interval constraint on a 520 scalar variable. It returns True for values with lb <= val < ub. 
521 """ 522 def __init__(self, feature, lower_bound, upper_bound): 523 assert not lower_bound or not upper_bound or lower_bound <= upper_bound 524 self.lower_bound = lower_bound 525 self.upper_bound = upper_bound 526 self.feature = feature 527 self.name = "feature["+repr(feature)+"]" 528 529 def __call__(self, example): 530 value = example.feature_val(self.feature) 531 if self.lower_bound==None: 532 lb = True 533 else: 534 lb = (value >= self.lower_bound) 535 if self.upper_bound==None: 536 ub = True 537 else: 538 ub = (value < self.upper_bound) 539 return lb and ub 540 541 def __repr__(self): 542 if self.lower_bound==None and self.upper_bound==None: 543 return "always_true" 544 elif self.lower_bound==None: 545 return self.name+"<"+repr(self.upper_bound) 546 elif self.upper_bound==None: 547 return self.name+">="+repr(self.lower_bound) 548 return self.name+">="+repr(self.lower_bound)+\ 549 " and "+self.name+"<"+repr(self.upper_bound) 550 551 552class scalar_feature_partitioner(partition): 553 def __init__(self, feature, limits=[]): 554 assert(pylib_basics.is_sorted(limits)) 555 self.feature_no = feature 556 tmpname = "feature["+repr(self.feature_no)+"]" 557 if len(limits)==0: 558 tmp = scalar_feature_test(None, None) 559 self.features = [tmp] 560 else: 561 lb = None 562 self.features = [] 563 for i in limits: 564 tmp = scalar_feature_test(feature,lb,i) 565 self.features.append(tmp) 566 lb = i 567 tmp = scalar_feature_test(feature,lb, None) 568 self.features.append(tmp) 569 570 def __call__(self, example): 571 for i in self.features: 572 if i(example): 573 return i 574 assert False, "Not a partition!" 575 576 def __repr__(self): 577 res = "[" 578 sep = "" 579 for i in self.features: 580 res = res+sep+repr(i) 581 sep = " ; " 582 res = res + "]" 583 return res 584 585 586class class_partition(partition): 587 """ 588 Generate a partion of a set of examples based on the target class 589 if the examples. 
590 """ 591 def __init__(self, examples): 592 self.abstracter = class_partitioner 593 partition.__init__(self, examples) 594 595 596class scalar_feature_partition(partition): 597 """ 598 Generate a partion of a set of examples based on a scalar 599 feature test. 600 """ 601 def __init__(self, examples, feature, limits): 602 self.abstracter = scalar_feature_partitioner(feature, limits) 603 partition.__init__(self, examples) 604 605 606 607def weird_filter(values, boundaries): 608 """ 609 Reduce a boundaries= [n0....nn] as follows: 610 For all i from 0 to n-1:If there is no element e in values with 611 ni<e<=ni+1, drop ni. 612 """ 613 tmpvals = list(values) 614 res = [] 615 for i in boundaries: 616 found = False 617 while tmpvals!=[] and tmpvals[0]<i: 618 found = True 619 tmpvals.pop(0) 620 if found: 621 res.append(i) 622 return res 623 624def equisplit_feature_space(feature_values, n): 625 """ 626 Return a set of boundaries that split the space of (distinct) 627 feature values into n approximately equal-sized parts. Works for 628 both distinct values and real values, as it will discard 629 boundaries occuring more than once. 630 """ 631 if n<=1: 632 return [] 633 tmp = len(feature_values) 634 if n>tmp: 635 n = tmp 636 step = tmp/float(n) 637 res = [feature_values[int(i*step)] for i in range(1, n)] 638 res = weird_filter(feature_values,res) 639 return res 640 641 642def equisplit_feature_range(dfeature_values, n): 643 """ 644 Return a set of boundaries that split the range of 645 feature values into n equal-sized parts. 646 """ 647 if n<=1: 648 return [] 649 tmp = dfeature_values[-1]-dfeature_values[0] 650 step = tmp/float(n) 651 res = [(i*step) for i in range(1, n)] 652 return weird_filter(dfeature_values,res) 653 654 655def prop_n_nary_split(feature_values, proportions): 656 """ 657 Return a set of boundaries located at the proporional values 658 given. 
659 """ 660 tmp = len(feature_values) 661 res = [feature_values[int(p*tmp)] for p in proportions] 662 return weird_filter(feature_values,res) 663 664def prop_n_nary_rangesplit(feature_values, proportions): 665 """ 666 Return a set of boundaries located at the proporional values 667 within the feature value range. 668 """ 669 tmp = feature_values[-1]-feature_values[0] 670 res = [((p*tmp)+feature_values[0]) for p in proportions] 671 return weird_filter(feature_values,res) 672 673def first_n_and_rest_split(dfeature_values, n): 674 """ 675 Return a set of boundaries so that the first n values have their 676 own class, and all the other ones are lumped into one. 677 """ 678 return dfeature_values[1:(n+1)] 679 680 681 682class discrete_feature_test: 683 def __init__(self, feature, set=[]): 684 self.set = set 685 self.feature = feature 686 self.name = "feature["+repr(feature)+"]" 687 self.positive = True 688 689 def __call__(self, example): 690 value = example.feature_val(self.feature) 691 return value in self.set 692 693 def __repr__(self): 694 res = self.name+" in "+repr(self.set) 695 return res 696 697class discrete_feature_else_test(discrete_feature_test): 698 def __init__(self, feature, set=[]): 699 discrete_feature_test.__init__(self,feature,set) 700 self.positive = False 701 702 def __call__(self, example): 703 value = example.feature_val(self.feature) 704 return not(value in self.set) 705 706 def __repr__(self): 707 res = self.name+" notin "+repr(self.set) 708 return res 709 710 711class discrete_feature_partitioner: 712 """ 713 Generate a functional object that will sort examples into feature 714 classes based on the possible values of a discrete feature. 
715 """ 716 def __init__(self, feature, values, value_distrib): 717 assert len(values)!=0 718 self.feature_no = feature 719 most_freq = None 720 most_freq_count = 0 721 for i in values: 722 tmp_count = value_distrib.count(i) 723 if tmp_count>most_freq_count: 724 most_freq_count = tmp_count 725 most_freq = i 726 assert(most_freq) 727 else_set = list(values) 728 else_set.remove(most_freq) 729 self.ctests = [] 730 for i in values: 731 if i==most_freq: 732 tmp = discrete_feature_else_test(feature,else_set) 733 self.ctests.append(tmp) 734 else: 735 tmp = discrete_feature_test(feature,[i]) 736 self.ctests.append(tmp) 737 738 def __call__(self, example): 739 for i in self.ctests: 740 if i(example): 741 return i 742 assert False, "Not a partition!" 743 744 def __repr__(self): 745 res = "[" 746 sep = "" 747 for i in self.ctests: 748 res = res+sep+repr(i) 749 sep = " ; " 750 res = res + "]" 751 return res 752 753 754class discrete_feature_partition(partition): 755 """ 756 Generate a partion of a set of examples based on a discrete 757 feature test. 758 """ 759 def __init__(self, examples, feature, subsetlimit=0): 760 self.abstracter = \ 761 discrete_feature_partitioner(feature,\ 762 examples.get_distinct_feature_values(feature),\ 763 examples.get_feature_values(feature)) 764 partition.__init__(self, examples) 765 766 767class one_and_rest_partitioner(discrete_feature_partitioner): 768 """ 769 Generate a functional object that will sort examples into two 770 classes based on wether a certain feature has a given value or 771 not. 
772 """ 773 def __init__(self, feature, value): 774 self.feature_no = feature 775 self.ctests = [] 776 tmp = discrete_feature_else_test(feature, [value]) 777 self.ctests.append(tmp) 778 tmp = discrete_feature_test(feature, [value]) 779 self.ctests.append(tmp) 780 781class one_and_rest_partition(partition): 782 """ 783 Generate a partion of a set of examples based on a binary outcome 784 feature test (feature = value or feature != value) 785 """ 786 def __init__(self, examples, feature, value): 787 self.abstracter = one_and_rest_partitioner(feature, value) 788 partition.__init__(self, examples) 789 790 791 792def partition_generator_feature(examples, feature, max_splits): 793 """ 794 Generate sequence of partitions for the given feature. Tries to 795 intelligently guess what to do and where to stop. 796 """ 797 type = examples.get_feature_type(feature) 798 if type == StringType: 799 tmp = discrete_feature_partition(examples, feature) 800 if not tmp.trivial(): 801 yield tmp 802 dvalues = examples.get_distinct_feature_values(feature) 803 if len(dvalues) > max_splits: 804 return 805 for i in dvalues: 806 yield one_and_rest_partition(examples, feature, i) 807 808 return 809 810 values = examples.get_feature_values(feature) 811 dvalues = examples.get_distinct_feature_values(feature) 812 max_split = min(max_splits, len(dvalues)) 813 814 #print values 815 #print dvalues 816 # Generate splits by evenly splitting sequence of all values 817 for i in range(2,max_split): 818 boundaries = equisplit_feature_space(values,i) 819 if len(boundaries)>0: 820 part = scalar_feature_partition(examples, feature, boundaries) 821 yield part 822 823 # Generate splits by evenly splitting sequence of all distinct values 824 for i in range(2,max_split): 825 boundaries = equisplit_feature_space(dvalues,i) 826 if len(boundaries)>0: 827 part = scalar_feature_partition(examples, feature, boundaries) 828 yield part 829 830 # Generate splits by evenly splitting the range of values 831 for i in 
range(2,max_split): 832 boundaries = equisplit_feature_range(dvalues,i) 833 if len(boundaries)>0: 834 part = scalar_feature_partition(examples, feature, boundaries) 835 yield part 836 837 # Generate some uneven binary splits 838 for i in range(1,9): 839 boundaries = prop_n_nary_split(dvalues, [0.1*i]) 840 if len(boundaries)>0: 841 part = scalar_feature_partition(examples, feature, boundaries) 842 yield part 843 boundaries = prop_n_nary_rangesplit(dvalues, [0.1*i]) 844 if len(boundaries)>0: 845 part = scalar_feature_partition(examples, feature, boundaries) 846 yield part 847 848 849 # Generate some weird splits: 850 ws = [[0.1,0.9], [0.2,0.8], [0.1,0.5,0.1]] 851 if max_splits >= 3: 852 for i in ws: 853 boundaries = prop_n_nary_split(dvalues, i) 854 if len(boundaries)>0: 855 part = scalar_feature_partition(examples, feature, boundaries) 856 yield part 857 boundaries = prop_n_nary_split(values, i) 858 if len(boundaries)>0: 859 part = scalar_feature_partition(examples, feature, boundaries) 860 yield part 861 boundaries = prop_n_nary_rangesplit(dvalues, i) 862 if len(boundaries)>0: 863 part = scalar_feature_partition(examples, feature, boundaries) 864 yield part 865 866 # Generate some even weirder splits ;-) 867 limit = min(6,max_splits) 868 for i in range(1,limit): 869 boundaries = first_n_and_rest_split(dvalues,1) 870 if len(boundaries)>0: 871 part = scalar_feature_partition(examples, feature, boundaries) 872 yield part 873 874 return 875 876def partition_generator(examples, max_split): 877 """ 878 Enumerate all possible feature partions for examples. 
879 """ 880 for i in range(0,examples.feature_no): 881 pg = partition_generator_feature(examples, i, max_split) 882 while 1: 883 try: 884 part = pg.next() 885 yield part 886 except StopIteration: 887 break 888 return 889 890def find_best_feature_partition(examples, 891 compare_fun, 892 feature, 893 max_splits): 894 assert isinstance(examples, ml_exampleset) 895 assert type(feature) == IntType 896 assert type(max_splits) == IntType 897 898 best_relinfgain = -1 899 best_absinfgain = -1 900 best_part = None 901 part_gen = partition_generator_feature(examples, feature, max_splits) 902 while 1: 903 try: 904 part = part_gen.next() 905 apriori = part.get_class_entropy() 906 cost = part.get_entropy() 907 remainder = part.get_remainder_entropy() 908 # print "# A-priori, cost, remainder = (%2.6f, %2.6f, %2.6f)" %\ 909 # (apriori,cost,remainder) 910 absinfgain = apriori-remainder 911 relinfgain = pylib_probabilities.rel_info_gain(apriori, 912 remainder, 913 cost) 914 915 #print relinfgain, absinfgain 916 if compare_fun((relinfgain, absinfgain), 917 (best_relinfgain,best_absinfgain)) > 0: 918 best_relinfgain = relinfgain 919 best_absinfgain = absinfgain 920 best_part = part 921 except StopIteration: 922 break 923 return (best_relinfgain, best_absinfgain, best_part) 924 925 926 927 928def find_best_partition(examples, 929 compare_fun, 930 max_splits): 931 assert type(max_splits == IntType) 932 best_relinfgain = -1 933 best_absinfgain = -1 934 best_part = None 935 936 if len(examples) > 1: 937 for i in range(0,examples.feature_no): 938 (relinfgain, absinfgain, part) = \ 939 find_best_feature_partition(examples, 940 compare_fun, 941 i, 942 max_splits) 943 if pylib_basics.verbose(): 944 print "# Evaluating feature %d: %1.6f, %1.6f "\ 945 %(i,relinfgain, absinfgain), 946 if part: 947 if pylib_basics.verbose(): 948 print part.abstracter 949 else: 950 if pylib_basics.verbose(): 951 print "# No split possible, feature is homogenous:", 952 print examples.get_distinct_feature_values(i) 
953 if compare_fun((relinfgain, absinfgain), 954 (best_relinfgain,best_absinfgain)) > 0: 955 best_relinfgain = relinfgain 956 best_absinfgain = absinfgain 957 best_part = part 958 959 return (best_relinfgain, best_absinfgain, best_part) 960 961