1"""An implementation of the Porter2 stemming algorithm. 2See http://snowball.tartarus.org/algorithms/english/stemmer.html 3 4Adapted from pyporter2 by Michael Dirolf. 5 6This algorithm is more correct but (at least in this implementation) 7several times slower than the original porter algorithm as implemented 8in stemming.porter. 9""" 10 11import re 12 13r_exp = re.compile(r"[^aeiouy]*[aeiouy]+[^aeiouy](\w*)") 14ewss_exp1 = re.compile(r"^[aeiouy][^aeiouy]$") 15ewss_exp2 = re.compile(r".*[^aeiouy][aeiouy][^aeiouywxY]$") 16ccy_exp = re.compile(r"([aeiouy])y") 17s1a_exp = re.compile(r"[aeiouy].") 18s1b_exp = re.compile(r"[aeiouy]") 19 20 21def get_r1(word): 22 # exceptional forms 23 if word.startswith('gener') or word.startswith('arsen'): 24 return 5 25 if word.startswith('commun'): 26 return 6 27 28 # normal form 29 match = r_exp.match(word) 30 if match: 31 return match.start(1) 32 return len(word) 33 34 35def get_r2(word): 36 match = r_exp.match(word, get_r1(word)) 37 if match: 38 return match.start(1) 39 return len(word) 40 41 42def ends_with_short_syllable(word): 43 if len(word) == 2: 44 if ewss_exp1.match(word): 45 return True 46 if ewss_exp2.match(word): 47 return True 48 return False 49 50 51def is_short_word(word): 52 if ends_with_short_syllable(word): 53 if get_r1(word) == len(word): 54 return True 55 return False 56 57 58def remove_initial_apostrophe(word): 59 if word.startswith("'"): 60 return word[1:] 61 return word 62 63 64def capitalize_consonant_ys(word): 65 if word.startswith('y'): 66 word = 'Y' + word[1:] 67 return ccy_exp.sub('\g<1>Y', word) 68 69 70def step_0(word): 71 if word.endswith("'s'"): 72 return word[:-3] 73 if word.endswith("'s"): 74 return word[:-2] 75 if word.endswith("'"): 76 return word[:-1] 77 return word 78 79 80def step_1a(word): 81 if word.endswith('sses'): 82 return word[:-4] + 'ss' 83 if word.endswith('ied') or word.endswith('ies'): 84 if len(word) > 4: 85 return word[:-3] + 'i' 86 else: 87 return word[:-3] + 'ie' 88 if word.endswith('us') or word.endswith('ss'): 89 return word 90 if word.endswith('s'): 91 preceding = word[:-1] 92 if s1a_exp.search(preceding): 93 return preceding 94 return word 95 return word 96 97 98doubles = ('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt') 99 100 101def ends_with_double(word): 102 for double in doubles: 103 if word.endswith(double): 104 return True 105 return False 106 107 108def step_1b_helper(word): 109 if word.endswith('at') or word.endswith('bl') or word.endswith('iz'): 110 return word + 'e' 111 if ends_with_double(word): 112 return word[:-1] 113 if is_short_word(word): 114 return word + 'e' 115 return word 116 117 118s1b_suffixes = ('ed', 'edly', 'ing', 'ingly') 119 120 121def step_1b(word, r1): 122 if word.endswith('eedly'): 123 if len(word) - 5 >= r1: 124 return word[:-3] 125 return word 126 if word.endswith('eed'): 127 if len(word) - 3 >= r1: 128 return word[:-1] 129 return word 130 131 for suffix in s1b_suffixes: 132 if word.endswith(suffix): 133 preceding = word[:-len(suffix)] 134 if s1b_exp.search(preceding): 135 return step_1b_helper(preceding) 136 return word 137 138 return word 139 140 141def step_1c(word): 142 if word.endswith('y') or word.endswith('Y') and len(word) > 1: 143 if word[-2] not in 'aeiouy': 144 if len(word) > 2: 145 return word[:-1] + 'i' 146 return word 147 148 149def step_2_helper(word, r1, end, repl, prev): 150 if word.endswith(end): 151 if len(word) - len(end) >= r1: 152 if prev == []: 153 return word[:-len(end)] + repl 154 for p in prev: 155 if word[:-len(end)].endswith(p): 156 return word[:-len(end)] + repl 157 return word 158 return None 159 160 161s2_triples = (('ization', 'ize', []), 162 ('ational', 'ate', []), 163 ('fulness', 'ful', []), 164 ('ousness', 'ous', []), 165 ('iveness', 'ive', []), 166 ('tional', 'tion', []), 167 ('biliti', 'ble', []), 168 ('lessli', 'less', []), 169 ('entli', 'ent', []), 170 ('ation', 'ate', []), 171 ('alism', 'al', []), 172 ('aliti', 'al', []), 173 ('ousli', 'ous', []), 174 ('iviti', 'ive', []), 175 ('fulli', 'ful', []), 176 ('enci', 'ence', []), 177 ('anci', 'ance', []), 178 ('abli', 'able', []), 179 ('izer', 'ize', []), 180 ('ator', 'ate', []), 181 ('alli', 'al', []), 182 ('bli', 'ble', []), 183 ('ogi', 'og', ['l']), 184 ('li', '', ['c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'])) 185 186 187def step_2(word, r1): 188 for trip in s2_triples: 189 attempt = step_2_helper(word, r1, trip[0], trip[1], trip[2]) 190 if attempt: 191 return attempt 192 return word 193 194 195def step_3_helper(word, r1, r2, end, repl, r2_necessary): 196 if word.endswith(end): 197 if len(word) - len(end) >= r1: 198 if not r2_necessary: 199 return word[:-len(end)] + repl 200 else: 201 if len(word) - len(end) >= r2: 202 return word[:-len(end)] + repl 203 return word 204 return None 205 206 207s3_triples = (('ational', 'ate', False), 208 ('tional', 'tion', False), 209 ('alize', 'al', False), 210 ('icate', 'ic', False), 211 ('iciti', 'ic', False), 212 ('ative', '', True), 213 ('ical', 'ic', False), 214 ('ness', '', False), 215 ('ful', '', False)) 216 217 218def step_3(word, r1, r2): 219 for trip in s3_triples: 220 attempt = step_3_helper(word, r1, r2, trip[0], trip[1], trip[2]) 221 if attempt: 222 return attempt 223 return word 224 225 226s4_delete_list = ('al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement', 227 'ment', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize') 228 229 230def step_4(word, r2): 231 for end in s4_delete_list: 232 if word.endswith(end): 233 if len(word) - len(end) >= r2: 234 return word[:-len(end)] 235 return word 236 237 if word.endswith('sion') or word.endswith('tion'): 238 if len(word) - 3 >= r2: 239 return word[:-3] 240 241 return word 242 243 244def step_5(word, r1, r2): 245 if word.endswith('l'): 246 if len(word) - 1 >= r2 and word[-2] == 'l': 247 return word[:-1] 248 return word 249 250 if word.endswith('e'): 251 if len(word) - 1 >= r2: 252 return word[:-1] 253 if len(word) - 1 >= r1 and not ends_with_short_syllable(word[:-1]): 254 return word[:-1] 255 256 return word 257 258 259def normalize_ys(word): 260 return word.replace('Y', 'y') 261 262 263exceptional_forms = {'skis': 'ski', 264 'skies': 'sky', 265 'dying': 'die', 266 'lying': 'lie', 267 'tying': 'tie', 268 'idly': 'idl', 269 'gently': 'gentl', 270 'ugly': 'ugli', 271 'early': 'earli', 272 'only': 'onli', 273 'singly': 'singl', 274 'sky': 'sky', 275 'news': 'news', 276 'howe': 'howe', 277 'atlas': 'atlas', 278 'cosmos': 'cosmos', 279 'bias': 'bias', 280 'andes': 'andes'} 281 282exceptional_early_exit_post_1a = frozenset(['inning', 'outing', 'canning', 'herring', 283 'earring', 'proceed', 'exceed', 'succeed']) 284 285 286def stem(word): 287 if len(word) <= 2: 288 return word 289 word = remove_initial_apostrophe(word) 290 291 # handle some exceptional forms 292 if word in exceptional_forms: 293 return exceptional_forms[word] 294 295 word = capitalize_consonant_ys(word) 296 r1 = get_r1(word) 297 r2 = get_r2(word) 298 word = step_0(word) 299 word = step_1a(word) 300 301 # handle some more exceptional forms 302 if word in exceptional_early_exit_post_1a: 303 return word 304 305 word = step_1b(word, r1) 306 word = step_1c(word) 307 word = step_2(word, r1) 308 word = step_3(word, r1, r2) 309 word = step_4(word, r2) 310 word = step_5(word, r1, r2) 311 word = normalize_ys(word) 312 313 return word 314