1"""An implementation of the Porter2 stemming algorithm.
2See http://snowball.tartarus.org/algorithms/english/stemmer.html
3
4Adapted from pyporter2 by Michael Dirolf.
5
6This algorithm is more correct but (at least in this implementation)
7several times slower than the original porter algorithm as implemented
8in stemming.porter.
9"""
10
11import re
12
13r_exp = re.compile(r"[^aeiouy]*[aeiouy]+[^aeiouy](\w*)")
14ewss_exp1 = re.compile(r"^[aeiouy][^aeiouy]$")
15ewss_exp2 = re.compile(r".*[^aeiouy][aeiouy][^aeiouywxY]$")
16ccy_exp = re.compile(r"([aeiouy])y")
17s1a_exp = re.compile(r"[aeiouy].")
18s1b_exp = re.compile(r"[aeiouy]")
19
20
21def get_r1(word):
22    # exceptional forms
23    if word.startswith('gener') or word.startswith('arsen'):
24        return 5
25    if word.startswith('commun'):
26        return 6
27
28    # normal form
29    match = r_exp.match(word)
30    if match:
31        return match.start(1)
32    return len(word)
33
34
35def get_r2(word):
36    match = r_exp.match(word, get_r1(word))
37    if match:
38        return match.start(1)
39    return len(word)
40
41
42def ends_with_short_syllable(word):
43    if len(word) == 2:
44        if ewss_exp1.match(word):
45            return True
46    if ewss_exp2.match(word):
47        return True
48    return False
49
50
51def is_short_word(word):
52    if ends_with_short_syllable(word):
53        if get_r1(word) == len(word):
54            return True
55    return False
56
57
58def remove_initial_apostrophe(word):
59    if word.startswith("'"):
60        return word[1:]
61    return word
62
63
64def capitalize_consonant_ys(word):
65    if word.startswith('y'):
66        word = 'Y' + word[1:]
67    return ccy_exp.sub('\g<1>Y', word)
68
69
70def step_0(word):
71    if word.endswith("'s'"):
72        return word[:-3]
73    if word.endswith("'s"):
74        return word[:-2]
75    if word.endswith("'"):
76        return word[:-1]
77    return word
78
79
80def step_1a(word):
81    if word.endswith('sses'):
82        return word[:-4] + 'ss'
83    if word.endswith('ied') or word.endswith('ies'):
84        if len(word) > 4:
85            return word[:-3] + 'i'
86        else:
87            return word[:-3] + 'ie'
88    if word.endswith('us') or word.endswith('ss'):
89        return word
90    if word.endswith('s'):
91        preceding = word[:-1]
92        if s1a_exp.search(preceding):
93            return preceding
94        return word
95    return word
96
97
98doubles = ('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt')
99
100
101def ends_with_double(word):
102    for double in doubles:
103        if word.endswith(double):
104            return True
105    return False
106
107
108def step_1b_helper(word):
109    if word.endswith('at') or word.endswith('bl') or word.endswith('iz'):
110        return word + 'e'
111    if ends_with_double(word):
112        return word[:-1]
113    if is_short_word(word):
114        return word + 'e'
115    return word
116
117
118s1b_suffixes = ('ed', 'edly', 'ing', 'ingly')
119
120
121def step_1b(word, r1):
122    if word.endswith('eedly'):
123        if len(word) - 5 >= r1:
124            return word[:-3]
125        return word
126    if word.endswith('eed'):
127        if len(word) - 3 >= r1:
128            return word[:-1]
129        return word
130
131    for suffix in s1b_suffixes:
132        if word.endswith(suffix):
133            preceding = word[:-len(suffix)]
134            if s1b_exp.search(preceding):
135                return step_1b_helper(preceding)
136            return word
137
138    return word
139
140
141def step_1c(word):
142    if word.endswith('y') or word.endswith('Y') and len(word) > 1:
143        if word[-2] not in 'aeiouy':
144            if len(word) > 2:
145                return word[:-1] + 'i'
146    return word
147
148
149def step_2_helper(word, r1, end, repl, prev):
150        if word.endswith(end):
151            if len(word) - len(end) >= r1:
152                if prev == []:
153                    return word[:-len(end)] + repl
154                for p in prev:
155                    if word[:-len(end)].endswith(p):
156                        return word[:-len(end)] + repl
157            return word
158        return None
159
160
161s2_triples = (('ization', 'ize', []),
162               ('ational', 'ate', []),
163               ('fulness', 'ful', []),
164               ('ousness', 'ous', []),
165               ('iveness', 'ive', []),
166               ('tional', 'tion', []),
167               ('biliti', 'ble', []),
168               ('lessli', 'less', []),
169               ('entli', 'ent', []),
170               ('ation', 'ate', []),
171               ('alism', 'al', []),
172               ('aliti', 'al', []),
173               ('ousli', 'ous', []),
174               ('iviti', 'ive', []),
175               ('fulli', 'ful', []),
176               ('enci', 'ence', []),
177               ('anci', 'ance', []),
178               ('abli', 'able', []),
179               ('izer', 'ize', []),
180               ('ator', 'ate', []),
181               ('alli', 'al', []),
182               ('bli', 'ble', []),
183               ('ogi', 'og', ['l']),
184               ('li', '', ['c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't']))
185
186
187def step_2(word, r1):
188    for trip in s2_triples:
189        attempt = step_2_helper(word, r1, trip[0], trip[1], trip[2])
190        if attempt:
191            return attempt
192    return word
193
194
195def step_3_helper(word, r1, r2, end, repl, r2_necessary):
196    if word.endswith(end):
197        if len(word) - len(end) >= r1:
198            if not r2_necessary:
199                return word[:-len(end)] + repl
200            else:
201                if len(word) - len(end) >= r2:
202                    return word[:-len(end)] + repl
203        return word
204    return None
205
206
207s3_triples = (('ational', 'ate', False),
208               ('tional', 'tion', False),
209               ('alize', 'al', False),
210               ('icate', 'ic', False),
211               ('iciti', 'ic', False),
212               ('ative', '', True),
213               ('ical', 'ic', False),
214               ('ness', '', False),
215               ('ful', '', False))
216
217
218def step_3(word, r1, r2):
219    for trip in s3_triples:
220        attempt = step_3_helper(word, r1, r2, trip[0], trip[1], trip[2])
221        if attempt:
222            return attempt
223    return word
224
225
226s4_delete_list = ('al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement',
227                  'ment', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize')
228
229
230def step_4(word, r2):
231    for end in s4_delete_list:
232        if word.endswith(end):
233            if len(word) - len(end) >= r2:
234                return word[:-len(end)]
235            return word
236
237    if word.endswith('sion') or word.endswith('tion'):
238        if len(word) - 3 >= r2:
239            return word[:-3]
240
241    return word
242
243
244def step_5(word, r1, r2):
245    if word.endswith('l'):
246        if len(word) - 1 >= r2 and word[-2] == 'l':
247            return word[:-1]
248        return word
249
250    if word.endswith('e'):
251        if len(word) - 1 >= r2:
252            return word[:-1]
253        if len(word) - 1 >= r1 and not ends_with_short_syllable(word[:-1]):
254            return word[:-1]
255
256    return word
257
258
259def normalize_ys(word):
260    return word.replace('Y', 'y')
261
262
263exceptional_forms = {'skis': 'ski',
264                    'skies': 'sky',
265                    'dying': 'die',
266                    'lying': 'lie',
267                    'tying': 'tie',
268                    'idly': 'idl',
269                    'gently': 'gentl',
270                    'ugly': 'ugli',
271                    'early': 'earli',
272                    'only': 'onli',
273                    'singly': 'singl',
274                    'sky': 'sky',
275                    'news': 'news',
276                    'howe': 'howe',
277                    'atlas': 'atlas',
278                    'cosmos': 'cosmos',
279                    'bias': 'bias',
280                    'andes': 'andes'}
281
282exceptional_early_exit_post_1a = frozenset(['inning', 'outing', 'canning', 'herring',
283                                            'earring', 'proceed', 'exceed', 'succeed'])
284
285
286def stem(word):
287    if len(word) <= 2:
288        return word
289    word = remove_initial_apostrophe(word)
290
291    # handle some exceptional forms
292    if word in exceptional_forms:
293        return exceptional_forms[word]
294
295    word = capitalize_consonant_ys(word)
296    r1 = get_r1(word)
297    r2 = get_r2(word)
298    word = step_0(word)
299    word = step_1a(word)
300
301    # handle some more exceptional forms
302    if word in exceptional_early_exit_post_1a:
303        return word
304
305    word = step_1b(word, r1)
306    word = step_1c(word)
307    word = step_2(word, r1)
308    word = step_3(word, r1, r2)
309    word = step_4(word, r2)
310    word = step_5(word, r1, r2)
311    word = normalize_ys(word)
312
313    return word
314