1#!/usr/bin/env python2.7
2# ----------------------------------
3#
4# Module pylib_ml_examples.py
5#
# Functions and classes for handling sets of examples for machine
7# learning, where each example contains an identifier, a set of
8# features of different types, and a desired class.
9#
10# Copyright 2003 Stephan Schulz, schulz@informatik.tu-muenchen.de
11#
12# This code is part of the support structure for the equational
13# theorem prover E. Visit
14#
15#  http://www4.informatik.tu-muenchen.de/~schulz/WORK/eprover.html
16#
17# for more information.
18#
19# This program is free software; you can redistribute it and/or modify
20# it under the terms of the GNU General Public License as published by
21# the Free Software Foundation; either version 2 of the License, or
22# (at your option) any later version.
23#
24# This program is distributed in the hope that it will be useful,
25# but WITHOUT ANY WARRANTY; without even the implied warranty of
26# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27# GNU General Public License for more details.
28#
29# You should have received a copy of the GNU General Public License
30# along with this program ; if not, write to the Free Software
31# Foundation, Inc., 59 Temple Place, Suite 330, Boston,
32# MA  02111-1307 USA
33#
34# The original copyright holder can be contacted as
35#
36# Stephan Schulz (I4)
37# Technische Universitaet Muenchen
38# Institut fuer Informatik
39# Boltzmannstrasse 3
40# Garching bei Muenchen
41# Germany
42#
43# or via email (address above).
44
45from __future__ import generators
46
47from types import *
48import string
49import random
50from UserList import UserList
51
52import pylib_basics
53import pylib_io
54import pylib_probabilities
55
56
def atofeatureval(value_str):
    """
    Convert a string feature representation to an int if possible,
    else to a float if possible; otherwise return the string
    unchanged.

    Note: parameter renamed from 'str' (shadowed the builtin); all
    in-file callers pass it positionally.
    """
    try:
        # int() accepts the same decimal strings string.atoi() did.
        return int(value_str)
    except ValueError:
        pass
    try:
        return float(value_str)
    except ValueError:
        pass
    return value_str
71
72
def typemax(t1, t2):
    """
    Given two feature types, return the more general one, using the
    widening order int < float < str. In Python 2, str/float/int are
    the same objects as types.StringType/FloatType/IntType, so this is
    behavior-identical to the original while avoiding the deprecated
    names.
    """
    if t1 == str:
        return t1
    if t2 == str:
        return t2
    if t1 == float:
        return t1
    if t2 == float:
        return t2
    # Only int/int remains legal at this point.
    assert t1 == int and t2 == int
    return t1
86
87
def typemax_l(l):
    """
    Reduce a list of feature types to the most GENERAL type compatible
    with all elements (int < float < str), starting from int.

    The previous docstring claimed "most specific", but typemax()
    widens types.
    """
    # reduce() is a builtin in Python 2 and lives in functools from
    # 2.6 on; importing it locally works in both.
    from functools import reduce
    return reduce(typemax, l, int)
94
95
class ml_example:
    """
    Represent a single example for machine learning, with an
    identifier (id), a list of feature values (features), and
    optionally a target class (tclass, stored as a number assigned by
    classadmin).

    String representations have the form "id : v1,v2,... [: class]".
    Tuple representations are (id, [values], class) or
    (id, [values], class, extra).
    """
    def __init__(self,
                 rep,
                 classadmin = pylib_basics.name_number_hash()):
        # NOTE(review): the default classadmin is evaluated once at
        # definition time and is therefore SHARED by every example
        # constructed without an explicit admin. This looks deliberate
        # (a common class-name registry) -- confirm before changing.
        self.classadmin = classadmin
        if type(rep) == str:
            # "id : v1,v2,... [: class]" -- split and strip each part.
            parts = [part.strip() for part in rep.split(":")]
            parts[1] = [val.strip() for val in parts[1].split(",")]
        else:
            assert type(rep) == tuple
            assert len(rep) == 3 or len(rep) == 4
            parts = rep

        assert type(parts[1]) == list
        self.id       = parts[0]
        self.features = [atofeatureval(val) for val in parts[1]]
        if len(parts) == 3:
            # Third component is the class name; store its number.
            self.tclass = self.classadmin.insert(parts[2])
        else:
            self.tclass = None

    def feature_no(self):
        """
        Return the number of features of the example.
        """
        return len(self.features)

    def __repr__(self):
        features = ",".join([str(f) for f in self.features])
        res = self.id + " : " + features
        if self.tclass is not None:
            res += " : " + self.classadmin.get_name(self.tclass)
        return res

    def feature_val(self, f_no):
        """
        Return the value of the given feature.
        """
        return self.features[f_no]

    def tclass_val(self):
        """
        Return the target class of the example (None if unclassified).
        """
        return self.tclass
148
149
class ml_exampleset(UserList):
    """
    Represent a set of machine learning examples that share the same
    feature structure and a common class-name administrator
    (classadmin).
    """
    def __init__(self, data=()):
        # Default changed from a shared mutable [] to an immutable ();
        # the argument is only iterated, so behavior is unchanged.
        UserList.__init__(self)
        self.feature_no  = None
        self.name        = None
        self.class_omega = None
        self.init_precomputed()
        self.classadmin = pylib_basics.name_number_hash()
        for example in data:
            self.append(example)

    def set_name(self, name):
        """Set a purely descriptive name for the set."""
        self.name = name

    def append(self, element):
        """
        Append an example, enforcing a consistent feature count.
        """
        if self.feature_no is None:
            # First example determines the feature count ('is None'
            # also handles the degenerate 0-feature case correctly).
            self.feature_no = element.feature_no()
        else:
            assert self.feature_no == element.feature_no()
        UserList.append(self, element)
        self.init_precomputed()

    def extend(self, l):
        """Append all examples in l (with consistency checks)."""
        for element in l:
            self.append(element)

    def init_precomputed(self):
        """
        Initialize/reset cached values that potentially change with
        each new insertion.
        """
        self.feature_types   = None
        self.feature_values  = None
        self.feature_dvalues = None
        self.feature_range   = None
        self.class_values    = None
        self.class_dvalues   = None

    def get_feature_type(self, feature):
        """
        Return the type of the requested feature.
        """
        return self.get_feature_types()[feature]

    def get_feature_types(self):
        """
        Return a list of the types of all features used in the
        examples, recomputing it if necessary. Returns None for an
        empty set.
        """
        if not self.feature_no:
            return None
        if self.feature_types:
            return self.feature_types
        self.feature_types = []
        for f in range(0, self.feature_no):
            observed = [type(example.feature_val(f))
                        for example in self.data]
            self.feature_types.append(typemax_l(observed))
        return self.feature_types

    def get_feature_values(self, feature):
        """
        Return a sorted list of all values of the given feature
        (with repetitions).
        """
        if not self.feature_no:
            return None
        if not self.feature_values:
            self.feature_values = [None] * self.feature_no
        if not self.feature_values[feature]:
            values = [example.feature_val(feature) for example in self]
            values.sort()
            self.feature_values[feature] = values
        return self.feature_values[feature]

    def get_feature_range(self, feature):
        """
        Return a (smallest, largest) tuple of values of the feature,
        using the natural order of the type.
        """
        if not self.feature_no:
            return None
        if not self.feature_range:
            self.feature_range = [None] * self.feature_no
        if not self.feature_range[feature]:
            values = self.get_feature_values(feature)
            self.feature_range[feature] = (values[0], values[-1])
        return self.feature_range[feature]

    def get_distinct_feature_values(self, feature):
        """
        Return a sorted list of all distinct values of the feature.
        """
        if not self.feature_no:
            return None
        if not self.feature_dvalues:
            self.feature_dvalues = [None] * self.feature_no
        if not self.feature_dvalues[feature]:
            self.feature_dvalues[feature] = \
                pylib_basics.uniq(self.get_feature_values(feature))
        return self.feature_dvalues[feature]

    def get_class_values(self):
        """
        Return a sorted list of the (numeric) classes of all examples.
        """
        if self.class_values is None:
            self.class_values = [example.tclass_val() for example in self]
            self.class_values.sort()
        return self.class_values

    def get_distinct_class_values(self):
        """
        Return a sorted list of all occurring classes.
        """
        if self.class_dvalues is None:
            self.class_dvalues = pylib_basics.uniq(self.get_class_values())
        return self.class_dvalues

    def get_class_number(self):
        """
        Return the number of distinct classes known to classadmin.
        """
        return self.classadmin.get_entry_no()

    def parse(self, file):
        """
        Read examples from a file, skipping '#' comment lines. All
        examples share this set's classadmin.
        """
        f = pylib_io.flexopen(file, 'r')
        lines = f.readlines()
        pylib_io.flexclose(f)
        for line in lines:
            if line[0] == "#":
                continue
            self.append(ml_example(line, self.classadmin))

    def __repr__(self):
        if self.name:
            res = "# " + self.name + "\n"
        else:
            res = ""
        for example in self:
            res += repr(example) + "\n"
        return res

    def abs_freq_vector(self, abstracter):
        """
        Build a partition of this set under abstracter and return its
        absolute frequency vector.
        """
        # NOTE(review): partition does not define abs_freq_vector() in
        # this module -- presumably provided elsewhere; verify.
        tmppart = partition()
        tmppart.abstracter = abstracter
        tmppart.insert_set(self)
        return tmppart.abs_freq_vector()

    def most_frequent_class(self):
        """
        Find the most frequent class of examples in the set and return
        it together with its relative frequency.
        """
        classes    = self.get_distinct_class_values()
        class_occ  = self.get_class_values()
        class_freq = [(class_occ.count(cl), cl) for cl in classes]
        class_freq.sort()
        abs_freq, mf_class = class_freq[-1]
        rel_freq = float(abs_freq) / len(class_occ)
        return (mf_class, rel_freq)

    def class_freq_vector(self):
        """
        Return an in-order vector of absolute class frequencies.
        """
        res = [0] * self.get_class_number()
        for cl in self.get_class_values():
            res[cl] += 1
        return res

    def get_class_entropy(self):
        """Return the entropy of the class distribution."""
        return pylib_probabilities.compute_entropy_absdistrib(
            self.class_freq_vector())

    def plain_rel_inf_gain(self):
        """
        Return the relative information gain of the actual class
        distribution against a uniform prior over all classes.
        """
        apriori_entropy = pylib_probabilities.compute_entropy_absdistrib(
            [1] * self.get_class_number())
        real_entropy = self.get_class_entropy()
        return pylib_probabilities.rel_info_gain(apriori_entropy,
                                                 real_entropy,
                                                 real_entropy)

    def random_split(self, n):
        """
        Randomly split the example set into n nearly equal-sized
        subsets (plain lists).
        """
        shuffled = list(self)
        random.shuffle(shuffled)
        res = [[] for _ in range(n)]
        count = 0
        for example in shuffled:
            res[count].append(example)
            count = (count + 1) % n
        return res

    def stratified_split(self, n):
        """
        Randomly split the example set into n nearly equal-sized
        subsets, stratified for class: sort by (class, random shuffle
        position) and deal examples round-robin.
        """
        shuffled = list(self)
        random.shuffle(shuffled)
        for pos, example in enumerate(shuffled):
            example.tmp = pos
        # BUG FIX: the original cmp-based sort compared a class value
        # against the tclass_val *method* (missing parentheses), so the
        # primary key never worked. The key-based sort below implements
        # the intended order.
        shuffled.sort(key=lambda ex: (ex.tclass_val(), ex.tmp))
        res = [[] for _ in range(n)]
        count = 0
        for example in shuffled:
            res[count].append(example)
            count = (count + 1) % n
        return res

    def crossval_sets(self, n, stratified=True):
        """
        Return a list of n (training_set, test_set) tuples such that
        the union of each pair is the full set, the n test sets are
        disjoint, and the union of the n test sets is the full set.
        """
        if stratified:
            splits = self.stratified_split(n)
        else:
            splits = self.random_split(n)
        res = []
        for i in range(len(splits)):
            train = ml_exampleset()
            train.classadmin = self.classadmin
            for j in range(len(splits)):
                if j != i:
                    train.extend(splits[j])
            test = ml_exampleset()
            test.classadmin = self.classadmin
            test.extend(splits[i])
            res.append((train, test))
        return res
407
def class_partitioner(example):
    """Partition label function: label an example by its target class."""
    return example.tclass_val()
410
class partition:
    """
    Represent a partition of examples with named parts. The partition
    type is defined by the abstracter() function, which returns a
    partition label for each example; examples with equal labels share
    a part (an ml_exampleset).
    """
    def __init__(self, examples=()):
        # Entropy caches are initialized before inserting so the object
        # is consistent even for an empty partition. Default changed
        # from a shared mutable [] to (); the argument is only iterated.
        self.parts = {}
        self.entropy           = None
        self.class_entropy     = None
        self.remainder_entropy = None
        self.insert_set(examples)

    def empty(self):
        """Return True if the partition has no parts."""
        return len(self.parts) == 0

    def trivial(self):
        """Return True if the partition has at most one part."""
        return len(self.parts) <= 1

    def insert(self, example, class_admin=None):
        """
        Insert a single example, invalidating the cached entropies.
        """
        self.entropy           = None
        self.class_entropy     = None
        self.remainder_entropy = None
        part = self.abstracter(example)
        try:
            self.parts[part].append(example)
        except KeyError:
            # First example with this label: open a new part.
            tmp = ml_exampleset()
            tmp.classadmin = class_admin
            tmp.set_name(part)
            tmp.append(example)
            self.parts[part] = tmp

    def insert_set(self, examples):
        """
        Insert a whole set of examples, propagating the classadmin if
        the source is an ml_exampleset.
        """
        if isinstance(examples, ml_exampleset):
            class_admin = examples.classadmin
            for example in examples:
                self.insert(example, class_admin)
        else:
            for example in examples:
                self.insert(example)

    def get_size(self):
        """Return the number of parts."""
        return len(self.parts)

    def abstracter(self, example):
        """
        Return a partition label for example. Only defined in concrete
        subclasses (or set per instance).
        """
        # BUG FIX: the original read 'assert false' -- a NameError, not
        # the intended AssertionError.
        assert False, "Virtual function only!"

    def __repr__(self):
        res = "# Partition:\n"
        for key in sorted(self.parts.keys()):
            res = res + repr(key) + "\n" + repr(self.parts[key])
        return res

    def get_class_distributions(self):
        """Return the class frequency vector of each part."""
        return [part.class_freq_vector() for part in self.parts.values()]

    def compute_entropies(self):
        """
        Compute and cache partition entropy, a-priori class entropy
        and remainder entropy (no-op if already cached).
        """
        if self.entropy is not None:
            return
        if len(self.parts) == 1:  # Avoid rounding errors
            only_part = list(self.parts.values())[0]
            tmp = only_part.get_class_entropy()
            (self.entropy, self.class_entropy, self.remainder_entropy) = \
                (0, tmp, tmp)
        else:
            distribs = self.get_class_distributions()
            (self.entropy, self.class_entropy, self.remainder_entropy) = \
                pylib_probabilities.compute_entropies(distribs)

    def get_class_entropy(self):
        """
        Return the a-priori entropy of the class distribution.
        """
        self.compute_entropies()
        return self.class_entropy

    def get_entropy(self):
        """
        Return the entropy of the partition.
        """
        self.compute_entropies()
        return self.entropy

    def get_remainder_entropy(self):
        """
        Return the remainder entropy of the class test after
        performing the partitioning.
        """
        self.compute_entropies()
        return self.remainder_entropy
515
516
class scalar_feature_test:
    """
    Callable object representing a single interval constraint on a
    scalar feature: returns True for values with lb <= val < ub. A
    bound of None means 'unconstrained' on that side.
    """
    def __init__(self, feature, lower_bound, upper_bound):
        # Use 'is None' so a legitimate bound of 0 (falsy) does not
        # silently skip the sanity check, matching the None-based
        # semantics used in __call__ and __repr__.
        assert lower_bound is None or upper_bound is None or \
               lower_bound <= upper_bound
        self.lower_bound = lower_bound
        self.upper_bound = upper_bound
        self.feature = feature
        self.name = "feature[" + repr(feature) + "]"

    def __call__(self, example):
        """Return True iff the example's feature value is in range."""
        value = example.feature_val(self.feature)
        if self.lower_bound is not None and value < self.lower_bound:
            return False
        if self.upper_bound is not None and value >= self.upper_bound:
            return False
        return True

    def __repr__(self):
        if self.lower_bound is None and self.upper_bound is None:
            return "always_true"
        elif self.lower_bound is None:
            return self.name + "<" + repr(self.upper_bound)
        elif self.upper_bound is None:
            return self.name + ">=" + repr(self.lower_bound)
        return self.name + ">=" + repr(self.lower_bound) + \
               " and " + self.name + "<" + repr(self.upper_bound)
550
551
class scalar_feature_partitioner(partition):
    """
    Callable that maps an example onto one of a sequence of interval
    tests that jointly cover the whole range of a scalar feature,
    with boundaries given by the sorted list 'limits'.
    """
    def __init__(self, feature, limits=[]):
        assert(pylib_basics.is_sorted(limits))
        self.feature_no = feature
        if len(limits) == 0:
            # Single all-accepting interval.
            # BUG FIX: scalar_feature_test() takes (feature, lb, ub);
            # the original call dropped the feature argument and raised
            # a TypeError for empty limits.
            self.features = [scalar_feature_test(feature, None, None)]
        else:
            lb = None
            self.features = []
            for limit in limits:
                self.features.append(scalar_feature_test(feature, lb, limit))
                lb = limit
            # Final open-ended interval above the last boundary.
            self.features.append(scalar_feature_test(feature, lb, None))

    def __call__(self, example):
        """Return the interval test that accepts the example."""
        for test in self.features:
            if test(example):
                return test
        assert False, "Not a partition!"

    def __repr__(self):
        reprs = [repr(test) for test in self.features]
        return "[" + " ; ".join(reprs) + "]"
584
585
class class_partition(partition):
    """
    Generate a partition of a set of examples based on the target
    class of the examples: one part per occurring class.
    """
    def __init__(self, examples):
        # Install the module-level class_partitioner as the label
        # function before the base class distributes the examples.
        self.abstracter = class_partitioner
        partition.__init__(self, examples)
594
595
class scalar_feature_partition(partition):
    """
    Generate a partition of a set of examples based on a scalar
    feature test: one part per interval defined by 'limits'.
    """
    def __init__(self, examples, feature, limits):
        # The partitioner maps each example to the interval test that
        # accepts its value of the given feature.
        self.abstracter = scalar_feature_partitioner(feature, limits)
        partition.__init__(self, examples)
604
605
606
def weird_filter(values, boundaries):
    """
    Reduce boundaries = [n0, ..., nk] as follows: a boundary is kept
    only if at least one not-yet-consumed value lies strictly below
    it; values below a kept (or dropped) boundary are consumed in
    order. This discards boundaries that would produce empty parts.

    Equivalent to the original pop(0)-based loop, but advances an
    index instead of popping the list front, so it runs in
    O(len(values) + len(boundaries)) instead of O(n^2).
    """
    pos = 0
    total = len(values)
    res = []
    for bound in boundaries:
        start = pos
        while pos < total and values[pos] < bound:
            pos += 1
        if pos > start:   # at least one value was below this boundary
            res.append(bound)
    return res
623
def equisplit_feature_space(feature_values, n):
    """
    Return a set of boundaries that split the (sorted) sequence of
    feature values into n approximately equal-sized parts. Works for
    both distinct and repeated values, since boundaries that would
    create empty parts are discarded by weird_filter().
    """
    if n <= 1:
        return []
    count = len(feature_values)
    parts = min(n, count)
    stride = count / float(parts)
    candidates = [feature_values[int(k * stride)] for k in range(1, parts)]
    return weird_filter(feature_values, candidates)
640
641
def equisplit_feature_range(dfeature_values, n):
    """
    Return a set of boundaries that split the value range
    [min, max] of the (sorted) feature values into n equal-sized
    intervals.
    """
    if n <= 1:
        return []
    lo = dfeature_values[0]
    span = dfeature_values[-1] - lo
    step = span / float(n)
    # BUG FIX: boundaries must be offset by the range minimum (compare
    # prop_n_nary_rangesplit); the original implicitly assumed the
    # range started at 0.
    res = [lo + i * step for i in range(1, n)]
    return weird_filter(dfeature_values, res)
653
654
def prop_n_nary_split(feature_values, proportions):
    """
    Return a set of boundaries picked at the given proportional
    positions of the (sorted) feature value sequence; degenerate
    boundaries are discarded by weird_filter().
    """
    size = len(feature_values)
    candidates = [feature_values[int(frac * size)] for frac in proportions]
    return weird_filter(feature_values, candidates)
663
def prop_n_nary_rangesplit(feature_values, proportions):
    """
    Return a set of boundaries at the given proportional positions
    within the feature value range [min, max]; degenerate boundaries
    are discarded by weird_filter().
    """
    lo = feature_values[0]
    span = feature_values[-1] - lo
    candidates = [lo + frac * span for frac in proportions]
    return weird_filter(feature_values, candidates)
672
def first_n_and_rest_split(dfeature_values, n):
    """
    Return boundaries giving each of the first n distinct values its
    own part, while all remaining values are lumped into one part.
    """
    return dfeature_values[1:n + 1]
679
680
681
class discrete_feature_test:
    """
    Callable predicate: True iff the example's value of the given
    feature is a member of the given value set.
    """
    def __init__(self, feature, set=[]):
        # Parameter name 'set' kept for interface compatibility; the
        # list is only read, never modified.
        self.set = set
        self.feature = feature
        self.name = "feature[" + repr(feature) + "]"
        self.positive = True

    def __call__(self, example):
        """Return True if the feature value occurs in self.set."""
        return example.feature_val(self.feature) in self.set

    def __repr__(self):
        return self.name + " in " + repr(self.set)
696
class discrete_feature_else_test(discrete_feature_test):
    """
    Complement of discrete_feature_test: True iff the example's
    feature value is NOT in the given set.
    """
    def __init__(self, feature, set=[]):
        discrete_feature_test.__init__(self, feature, set)
        self.positive = False

    def __call__(self, example):
        """Return True if the feature value does not occur in self.set."""
        return example.feature_val(self.feature) not in self.set

    def __repr__(self):
        return self.name + " notin " + repr(self.set)
709
710
class discrete_feature_partitioner:
    """
    Functional object that sorts examples into feature classes based
    on the possible values of a discrete feature. Every value gets a
    singleton membership test, except the most frequent one, which is
    represented by an 'else' (complement) test over the remaining
    values.
    """
    def __init__(self, feature, values, value_distrib):
        assert len(values) != 0
        self.feature_no = feature
        # Tally the distribution in one pass instead of calling
        # value_distrib.count() for every candidate (was O(n*m)).
        counts = {}
        for val in value_distrib:
            counts[val] = counts.get(val, 0) + 1
        most_freq = None
        most_freq_count = -1
        for val in values:
            occurrences = counts.get(val, 0)
            if occurrences > most_freq_count:
                most_freq_count = occurrences
                most_freq = val
        # BUG FIX: the original 'assert(most_freq)' also fired when the
        # most frequent value itself was falsy (0, "", ...); check for
        # None instead. With a non-empty values list a winner always
        # exists now (possibly with count 0).
        assert most_freq is not None
        else_set = list(values)
        else_set.remove(most_freq)
        self.ctests = []
        for val in values:
            if val == most_freq:
                self.ctests.append(
                    discrete_feature_else_test(feature, else_set))
            else:
                self.ctests.append(discrete_feature_test(feature, [val]))

    def __call__(self, example):
        """Return the test that accepts the example's feature value."""
        for test in self.ctests:
            if test(example):
                return test
        assert False, "Not a partition!"

    def __repr__(self):
        return "[" + " ; ".join([repr(test) for test in self.ctests]) + "]"
752
753
class discrete_feature_partition(partition):
    """
    Generate a partition of a set of examples based on a discrete
    feature test: one part per feature value, with the most frequent
    value handled by an 'else' test.
    """
    def __init__(self, examples, feature, subsetlimit=0):
        # NOTE(review): subsetlimit is currently unused -- confirm
        # whether it was meant to bound the number of parts.
        self.abstracter = \
                 discrete_feature_partitioner(feature,\
                                examples.get_distinct_feature_values(feature),\
                                examples.get_feature_values(feature))
        partition.__init__(self, examples)
765
766
class one_and_rest_partitioner(discrete_feature_partitioner):
    """
    Functional object that sorts examples into exactly two classes,
    depending on whether the given feature has the given value or not.
    """
    def __init__(self, feature, value):
        # Deliberately bypasses the parent constructor: the two
        # complementary tests are built directly, in the same order as
        # the original ('else' test first, membership test second).
        self.feature_no = feature
        self.ctests = [
            discrete_feature_else_test(feature, [value]),
            discrete_feature_test(feature, [value]),
        ]
780
class one_and_rest_partition(partition):
    """
    Generate a partition of a set of examples based on a binary
    outcome feature test (feature == value or feature != value).
    """
    def __init__(self, examples, feature, value):
        self.abstracter = one_and_rest_partitioner(feature, value)
        partition.__init__(self, examples)
789
790
791
def partition_generator_feature(examples, feature, max_splits):
    """
    Generate a sequence of candidate partitions for the given feature.
    Tries to intelligently guess what to do and where to stop:
    discrete (string-typed) features yield the full value partition
    plus one-against-rest partitions; scalar features yield a variety
    of interval-based splits.
    """
    # Local renamed from 'type' to avoid shadowing the builtin.
    ftype = examples.get_feature_type(feature)
    if ftype == str:   # str is types.StringType in Python 2
        tmp = discrete_feature_partition(examples, feature)
        if not tmp.trivial():
            yield tmp
        dvalues = examples.get_distinct_feature_values(feature)
        if len(dvalues) > max_splits:
            return
        for value in dvalues:
            yield one_and_rest_partition(examples, feature, value)
        return

    values  = examples.get_feature_values(feature)
    dvalues = examples.get_distinct_feature_values(feature)
    max_split = min(max_splits, len(dvalues))

    # Generate splits by evenly splitting the sequence of all values
    for i in range(2, max_split):
        boundaries = equisplit_feature_space(values, i)
        if len(boundaries) > 0:
            yield scalar_feature_partition(examples, feature, boundaries)

    # Generate splits by evenly splitting the sequence of distinct values
    for i in range(2, max_split):
        boundaries = equisplit_feature_space(dvalues, i)
        if len(boundaries) > 0:
            yield scalar_feature_partition(examples, feature, boundaries)

    # Generate splits by evenly splitting the range of values
    for i in range(2, max_split):
        boundaries = equisplit_feature_range(dvalues, i)
        if len(boundaries) > 0:
            yield scalar_feature_partition(examples, feature, boundaries)

    # Generate some uneven binary splits
    for i in range(1, 9):
        boundaries = prop_n_nary_split(dvalues, [0.1 * i])
        if len(boundaries) > 0:
            yield scalar_feature_partition(examples, feature, boundaries)
        boundaries = prop_n_nary_rangesplit(dvalues, [0.1 * i])
        if len(boundaries) > 0:
            yield scalar_feature_partition(examples, feature, boundaries)

    # Generate some weird splits:
    ws = [[0.1, 0.9], [0.2, 0.8], [0.1, 0.5, 0.1]]
    if max_splits >= 3:
        for proportions in ws:
            boundaries = prop_n_nary_split(dvalues, proportions)
            if len(boundaries) > 0:
                yield scalar_feature_partition(examples, feature, boundaries)
            boundaries = prop_n_nary_split(values, proportions)
            if len(boundaries) > 0:
                yield scalar_feature_partition(examples, feature, boundaries)
            boundaries = prop_n_nary_rangesplit(dvalues, proportions)
            if len(boundaries) > 0:
                yield scalar_feature_partition(examples, feature, boundaries)

    # Generate some even weirder splits ;-)
    limit = min(6, max_splits)
    for i in range(1, limit):
        # BUG FIX: the original passed the constant 1 instead of the
        # loop variable i, yielding the same split on every iteration.
        boundaries = first_n_and_rest_split(dvalues, i)
        if len(boundaries) > 0:
            yield scalar_feature_partition(examples, feature, boundaries)
    return
875
def partition_generator(examples, max_split):
    """
    Enumerate all candidate feature partitions for all features of the
    example set by chaining the per-feature generators.
    """
    # Direct iteration replaces the old while/try/pg.next() loop:
    # identical behavior, and no reliance on the Python-2-only
    # generator .next() method.
    for feature in range(0, examples.feature_no):
        for part in partition_generator_feature(examples, feature,
                                                max_split):
            yield part
    return
889
def find_best_feature_partition(examples,
                                compare_fun,
                                feature,
                                max_splits):
    """
    Search all candidate partitions of the given feature and return
    the triple (best_relinfgain, best_absinfgain, best_part) that is
    best according to compare_fun, which compares
    (relative gain, absolute gain) pairs.
    """
    assert isinstance(examples, ml_exampleset)
    assert type(feature) == int       # int is types.IntType in Python 2
    assert type(max_splits) == int

    best_relinfgain = -1
    best_absinfgain = -1
    best_part       = None
    # Iterate the generator directly instead of the old
    # while/try/.next() loop (identical behavior, clearer control flow).
    for part in partition_generator_feature(examples, feature, max_splits):
        apriori   = part.get_class_entropy()
        cost      = part.get_entropy()
        remainder = part.get_remainder_entropy()
        absinfgain = apriori - remainder
        relinfgain = pylib_probabilities.rel_info_gain(apriori,
                                                       remainder,
                                                       cost)
        if compare_fun((relinfgain, absinfgain),
                       (best_relinfgain, best_absinfgain)) > 0:
            best_relinfgain = relinfgain
            best_absinfgain = absinfgain
            best_part       = part
    return (best_relinfgain, best_absinfgain, best_part)
924
925
926
927
928def find_best_partition(examples,
929                        compare_fun,
930                        max_splits):
931    assert type(max_splits == IntType)
932    best_relinfgain = -1
933    best_absinfgain = -1
934    best_part       = None
935
936    if len(examples) > 1:
937        for i in range(0,examples.feature_no):
938            (relinfgain, absinfgain, part) = \
939                         find_best_feature_partition(examples,
940                                                     compare_fun,
941                                                     i,
942                                                     max_splits)
943            if pylib_basics.verbose():
944                print "# Evaluating feature %d: %1.6f, %1.6f "\
945                      %(i,relinfgain, absinfgain),
946            if part:
947                if pylib_basics.verbose():
948                    print part.abstracter
949            else:
950                if pylib_basics.verbose():
951                    print "# No split possible, feature is homogenous:",
952                    print examples.get_distinct_feature_values(i)
953            if compare_fun((relinfgain, absinfgain),
954                           (best_relinfgain,best_absinfgain)) > 0:
955                best_relinfgain = relinfgain
956                best_absinfgain = absinfgain
957                best_part = part
958
959    return  (best_relinfgain, best_absinfgain, best_part)
960
961