"""This module contains an object that implements the Paice-Husk stemming
algorithm.

If you just want to use the standard Paice-Husk stemming rules, use the
module's ``stem()`` function::

    stemmed_word = stem(word)

If you want to use a custom rule set, read the rules into a string where the
rules are separated by newlines, and instantiate the object with the string,
then use the object's stem method to stem words::

    stemmer = PaiceHuskStemmer(my_rules_string)
    stemmed_word = stemmer.stem(word)
"""

import re
from collections import defaultdict


class PaiceHuskStemmer(object):
    """Implements the Paice-Husk stemming algorithm.

    Rules are parsed once at construction time and stored in ``self.rules``,
    a mapping from the final letter of a suffix to the list of rules for
    suffixes ending in that letter, in the order they appear in the rule
    table.
    """

    # One rule per line: the suffix spelled backwards, an optional ``*``
    # ("only apply if the word is still intact"), the number of characters
    # to remove, an optional string to append, and a terminator: ``.`` to
    # stop after this rule or ``>`` to keep stemming. Anything after the
    # terminator (e.g. a ``{ ... }`` comment) is ignored.
    rule_expr = re.compile(r"""
    ^(?P<ending>\w+)
    (?P<intact>[*]?)
    (?P<num>\d+)
    (?P<append>\w*)
    (?P<cont>[.>])
    """, re.UNICODE | re.VERBOSE)

    # Only the leading run of word characters of the input is stemmed.
    stem_expr = re.compile(r"^\w+", re.UNICODE)

    # Prefixes removed from a word before any rule is applied.
    prefixes = ("kilo", "micro", "milli", "intra", "ultra", "mega",
                "nano", "pico", "pseudo")

    def __init__(self, ruletable):
        """
        :param ruletable: a string containing the rule data, separated
            by newlines.
        """
        self.rules = defaultdict(list)
        self.read_rules(ruletable)

    def read_rules(self, ruletable):
        """Parses the rule table and adds its rules to ``self.rules``.

        Blank lines are skipped.

        :param ruletable: a string containing the rule data, separated
            by newlines.
        :raises ValueError: if a non-blank line is not a valid rule.
        """
        rule_expr = self.rule_expr
        rules = self.rules

        for line in ruletable.split("\n"):
            line = line.strip()
            if not line:
                continue

            match = rule_expr.match(line)
            if not match:
                raise ValueError("Bad rule: %r" % line)

            # The table spells each suffix backwards; store it forwards so
            # it can be compared with str.endswith().
            ending = match.group("ending")[::-1]
            lastchar = ending[-1]
            intact = match.group("intact") == "*"
            num = int(match.group("num"))
            append = match.group("append")
            cont = match.group("cont") == ">"

            rules[lastchar].append((ending, intact, num, append, cont))

    def first_vowel(self, word):
        """Returns the index of the first vowel in the word, or -1 if the
        word contains no vowel.

        The letters "aeiou" are always vowels. A "y" counts as a vowel
        only when it is not the first letter of the word.
        """
        positions = [p for p in (word.find(v) for v in "aeiou") if p > -1]
        yp = word.find("y")
        if not positions:
            # min() of an empty list would raise; fall back to a
            # non-initial "y", or report that there is no vowel at all.
            return yp if yp > 0 else -1

        vp = min(positions)
        if 0 < yp < vp:
            return yp
        return vp

    def strip_prefix(self, word):
        """Returns the word with the first matching entry of ``prefixes``
        removed from its start, or the word unchanged if none matches.
        """
        for prefix in self.prefixes:
            if word.startswith(prefix):
                return word[len(prefix):]
        return word

    def stem(self, word):
        """Returns a stemmed version of the argument string.

        Only the leading run of word characters is considered; if the
        string does not start with a word character it is returned
        unchanged. Tokens containing no vowel are returned without applying
        any rule.
        """
        rules = self.rules
        match = self.stem_expr.match(word)
        if not match:
            return word
        token = match.group(0)
        # If the token is nothing but a prefix (e.g. "kilo"), stripping it
        # would leave an empty string; keep the whole token instead.
        stem = self.strip_prefix(token) or token

        is_intact = True
        continuing = True
        while continuing:
            pfv = self.first_vowel(stem)
            if pfv < 0:
                # No vowel: no acceptable stem can be produced.
                break
            rulelist = rules.get(stem[-1])
            if not rulelist:
                break

            continuing = False
            for ending, intact, num, append, cont in rulelist:
                if not stem.endswith(ending):
                    continue
                if intact and not is_intact:
                    continue

                newlen = len(stem) - num + len(append)
                if ((pfv == 0 and newlen < 2)
                        or (pfv > 0 and newlen < 3)):
                    # If word starts with vowel, minimum stem length is 2.
                    # If word starts with consonant, minimum stem length is
                    # 3.
                    continue

                is_intact = False
                # Slice with an explicit end index: "stem[:-num]" would
                # yield an empty string for protect rules, where num == 0.
                new_stem = stem[:len(stem) - num] + append
                # A rule that leaves the stem unchanged must not continue,
                # or the same rule would match again forever.
                continuing = cont and new_stem != stem
                stem = new_stem
                break

        return stem


# The default rules for the Paice-Husk stemming algorithm

defaultrules = """
ai*2. { -ia > - if intact }
a*1. { -a > - if intact }
bb1. { -bb > -b }
city3s. { -ytic > -ys }
ci2> { -ic > - }
cn1t> { -nc > -nt }
dd1. { -dd > -d }
dei3y> { -ied > -y }
deec2ss. { -ceed > -cess }
dee1. { -eed > -ee }
de2> { -ed > - }
dooh4> { -hood > - }
e1> { -e > - }
feil1v. { -lief > -liev }
fi2> { -if > - }
gni3> { -ing > - }
gai3y. { -iag > -y }
ga2> { -ag > - }
gg1. { -gg > -g }
ht*2. { -th > - if intact }
hsiug5ct. { -guish > -ct }
hsi3> { -ish > - }
i*1. { -i > - if intact }
i1y> { -i > -y }
ji1d. { -ij > -id -- see nois4j> & vis3j> }
juf1s. { -fuj > -fus }
ju1d. { -uj > -ud }
jo1d. { -oj > -od }
jeh1r. { -hej > -her }
jrev1t. { -verj > -vert }
jsim2t. { -misj > -mit }
jn1d. { -nj > -nd }
j1s. { -j > -s }
lbaifi6. { -ifiabl > - }
lbai4y. { -iabl > -y }
lba3> { -abl > - }
lbi3. { -ibl > - }
lib2l> { -bil > -bl }
lc1. { -cl > c }
lufi4y. { -iful > -y }
luf3> { -ful > - }
lu2. { -ul > - }
lai3> { -ial > - }
lau3> { -ual > - }
la2> { -al > - }
ll1. { -ll > -l }
mui3. { -ium > - }
mu*2. { -um > - if intact }
msi3> { -ism > - }
mm1. { -mm > -m }
nois4j> { -sion > -j }
noix4ct. { -xion > -ct }
noi3> { -ion > - }
nai3> { -ian > - }
na2> { -an > - }
nee0. { protect -een }
ne2> { -en > - }
nn1. { -nn > -n }
pihs4> { -ship > - }
pp1. { -pp > -p }
re2> { -er > - }
rae0. { protect -ear }
ra2. { -ar > - }
ro2> { -or > - }
ru2> { -ur > - }
rr1. { -rr > -r }
rt1> { -tr > -t }
rei3y> { -ier > -y }
sei3y> { -ies > -y }
sis2. { -sis > -s }
si2> { -is > - }
ssen4> { -ness > - }
ss0. { protect -ss }
suo3> { -ous > - }
su*2. { -us > - if intact }
s*1> { -s > - if intact }
s0. { -s > -s }
tacilp4y. { -plicat > -ply }
ta2> { -at > - }
tnem4> { -ment > - }
tne3> { -ent > - }
tna3> { -ant > - }
tpir2b. { -ript > -rib }
tpro2b. { -orpt > -orb }
tcud1. { -duct > -duc }
tpmus2. { -sumpt > -sum }
tpec2iv. { -cept > -ceiv }
tulo2v. { -olut > -olv }
tsis0. { protect -sist }
tsi3> { -ist > - }
tt1. { -tt > -t }
uqi3. { -iqu > - }
ugo1. { -ogu > -og }
vis3j> { -siv > -j }
vie0. { protect -eiv }
vi2> { -iv > - }
ylb1> { -bly > -bl }
yli3y> { -ily > -y }
ylp0. { protect -ply }
yl2> { -ly > - }
ygo1. { -ogy > -og }
yhp1. { -phy > -ph }
ymo1. { -omy > -om }
ypo1. { -opy > -op }
yti3> { -ity > - }
yte3> { -ety > - }
ytl2. { -lty > -l }
yrtsi5. { -istry > - }
yra3> { -ary > - }
yro3> { -ory > - }
yfi3. { -ify > - }
ycn2t> { -ncy > -nt }
yca3> { -acy > - }
zi2> { -iz > - }
zy1s. { -yz > -ys }
"""

# Make the standard rules available as a module-level function

stem = PaiceHuskStemmer(defaultrules).stem