1"""This module contains an object that implements the Paice-Husk stemming
2algorithm.
3
4If you just want to use the standard Paice-Husk stemming rules, use the
5module's ``stem()`` function::
6
7    stemmed_word = stem(word)
8
9If you want to use a custom rule set, read the rules into a string where the
10rules are separated by newlines, and instantiate the object with the string,
11then use the object's stem method to stem words::
12
13    stemmer = PaiceHuskStemmer(my_rules_string)
14    stemmed_word = stemmer.stem(word)
15"""
16
17import re
18from collections import defaultdict
19
20
21class PaiceHuskStemmer(object):
22    """Implements the Paice-Husk stemming algorithm.
23    """
24
25    rule_expr = re.compile(r"""
26    ^(?P<ending>\w+)
27    (?P<intact>[*]?)
28    (?P<num>\d+)
29    (?P<append>\w*)
30    (?P<cont>[.>])
31    """, re.UNICODE | re.VERBOSE)
32
33    stem_expr = re.compile("^\w+", re.UNICODE)
34
35    def __init__(self, ruletable):
36        """
37        :param ruletable: a string containing the rule data, separated
38            by newlines.
39        """
40        self.rules = defaultdict(list)
41        self.read_rules(ruletable)
42
43    def read_rules(self, ruletable):
44        rule_expr = self.rule_expr
45        rules = self.rules
46
47        for line in ruletable.split("\n"):
48            line = line.strip()
49            if not line:
50                continue
51
52            match = rule_expr.match(line)
53            if match:
54                ending = match.group("ending")[::-1]
55                lastchar = ending[-1]
56                intact = match.group("intact") == "*"
57                num = int(match.group("num"))
58                append = match.group("append")
59                cont = match.group("cont") == ">"
60
61                rules[lastchar].append((ending, intact, num, append, cont))
62            else:
63                raise Exception("Bad rule: %r" % line)
64
65    def first_vowel(self, word):
66        vp = min([p for p in [word.find(v) for v in "aeiou"]
67                  if p > -1])
68        yp = word.find("y")
69        if yp > 0 and yp < vp:
70            return yp
71        return vp
72
73    def strip_prefix(self, word):
74        for prefix in ("kilo", "micro", "milli", "intra", "ultra", "mega",
75                       "nano", "pico", "pseudo"):
76            if word.startswith(prefix):
77                return word[len(prefix):]
78        return word
79
80    def stem(self, word):
81        """Returns a stemmed version of the argument string.
82        """
83
84        rules = self.rules
85        match = self.stem_expr.match(word)
86        if not match: return word
87        stem = self.strip_prefix(match.group(0))
88
89        is_intact = True
90        continuing = True
91        while continuing:
92            pfv = self.first_vowel(stem)
93            rulelist = rules.get(stem[-1])
94            if not rulelist: break
95
96            continuing = False
97            for ending, intact, num, append, cont in rulelist:
98                if stem.endswith(ending):
99                    if intact and not is_intact: continue
100                    newlen = len(stem) - num + len(append)
101
102                    if ((pfv == 0 and newlen < 2)
103                        or (pfv > 0 and newlen < 3)):
104                        # If word starts with vowel, minimum stem length is 2.
105                        # If word starts with consonant, minimum stem length is
106                        # 3.
107                            continue
108
109                    is_intact = False
110                    stem = stem[:0-num] + append
111
112                    continuing = cont
113                    break
114
115        return stem
116
117# The default rules for the Paice-Husk stemming algorithm
118
119defaultrules = """
120ai*2.     { -ia > -   if intact }
121a*1.      { -a > -    if intact }
122bb1.      { -bb > -b   }
123city3s.   { -ytic > -ys }
124ci2>      { -ic > -    }
125cn1t>     { -nc > -nt  }
126dd1.      { -dd > -d   }
127dei3y>    { -ied > -y  }
128deec2ss.  { -ceed > -cess }
129dee1.     { -eed > -ee }
130de2>      { -ed > -    }
131dooh4>    { -hood > -  }
132e1>       { -e > -     }
133feil1v.   { -lief > -liev }
134fi2>      { -if > -    }
135gni3>     { -ing > -   }
136gai3y.    { -iag > -y  }
137ga2>      { -ag > -    }
138gg1.      { -gg > -g   }
139ht*2.     { -th > -   if intact }
140hsiug5ct. { -guish > -ct }
141hsi3>     { -ish > -   }
142i*1.      { -i > -    if intact }
143i1y>      { -i > -y    }
144ji1d.     { -ij > -id   --  see nois4j> & vis3j> }
145juf1s.    { -fuj > -fus }
146ju1d.     { -uj > -ud  }
147jo1d.     { -oj > -od  }
148jeh1r.    { -hej > -her }
149jrev1t.   { -verj > -vert }
150jsim2t.   { -misj > -mit }
151jn1d.     { -nj > -nd  }
152j1s.      { -j > -s    }
153lbaifi6.  { -ifiabl > - }
154lbai4y.   { -iabl > -y }
155lba3>     { -abl > -   }
156lbi3.     { -ibl > -   }
157lib2l>    { -bil > -bl }
158lc1.      { -cl > c    }
159lufi4y.   { -iful > -y }
160luf3>     { -ful > -   }
161lu2.      { -ul > -    }
162lai3>     { -ial > -   }
163lau3>     { -ual > -   }
164la2>      { -al > -    }
165ll1.      { -ll > -l   }
166mui3.     { -ium > -   }
167mu*2.     { -um > -   if intact }
168msi3>     { -ism > -   }
169mm1.      { -mm > -m   }
170nois4j>   { -sion > -j }
171noix4ct.  { -xion > -ct }
172noi3>     { -ion > -   }
173nai3>     { -ian > -   }
174na2>      { -an > -    }
175nee0.     { protect  -een }
176ne2>      { -en > -    }
177nn1.      { -nn > -n   }
178pihs4>    { -ship > -  }
179pp1.      { -pp > -p   }
180re2>      { -er > -    }
181rae0.     { protect  -ear }
182ra2.      { -ar > -    }
183ro2>      { -or > -    }
184ru2>      { -ur > -    }
185rr1.      { -rr > -r   }
186rt1>      { -tr > -t   }
187rei3y>    { -ier > -y  }
188sei3y>    { -ies > -y  }
189sis2.     { -sis > -s  }
190si2>      { -is > -    }
191ssen4>    { -ness > -  }
192ss0.      { protect  -ss }
193suo3>     { -ous > -   }
194su*2.     { -us > -   if intact }
195s*1>      { -s > -    if intact }
196s0.       { -s > -s    }
197tacilp4y. { -plicat > -ply }
198ta2>      { -at > -    }
199tnem4>    { -ment > -  }
200tne3>     { -ent > -   }
201tna3>     { -ant > -   }
202tpir2b.   { -ript > -rib }
203tpro2b.   { -orpt > -orb }
204tcud1.    { -duct > -duc }
205tpmus2.   { -sumpt > -sum }
206tpec2iv.  { -cept > -ceiv }
207tulo2v.   { -olut > -olv }
208tsis0.    { protect  -sist }
209tsi3>     { -ist > -   }
210tt1.      { -tt > -t   }
211uqi3.     { -iqu > -   }
212ugo1.     { -ogu > -og }
213vis3j>    { -siv > -j  }
214vie0.     { protect  -eiv }
215vi2>      { -iv > -    }
216ylb1>     { -bly > -bl }
217yli3y>    { -ily > -y  }
218ylp0.     { protect  -ply }
219yl2>      { -ly > -    }
220ygo1.     { -ogy > -og }
221yhp1.     { -phy > -ph }
222ymo1.     { -omy > -om }
223ypo1.     { -opy > -op }
224yti3>     { -ity > -   }
225yte3>     { -ety > -   }
226ytl2.     { -lty > -l  }
227yrtsi5.   { -istry > - }
228yra3>     { -ary > -   }
229yro3>     { -ory > -   }
230yfi3.     { -ify > -   }
231ycn2t>    { -ncy > -nt }
232yca3>     { -acy > -   }
233zi2>      { -iz > -    }
234zy1s.     { -yz > -ys  }
235"""
236
237# Make the standard rules available as a module-level function
238
239stem = PaiceHuskStemmer(defaultrules).stem
240
241
242
243
244
245
246
247