whoosh/lang/porter2.py

"""An implementation of the Porter2 stemming algorithm.
See http://snowball.tartarus.org/algorithms/english/stemmer.html

Adapted from pyporter2 by Michael Dirolf.

This algorithm is more correct but (at least in this implementation)
several times slower than the original porter algorithm as implemented
in stemming.porter.
"""

import re

r_exp = re.compile(r"[^aeiouy]*[aeiouy]+[^aeiouy](\w*)")
ewss_exp1 = re.compile(r"^[aeiouy][^aeiouy]$")
ewss_exp2 = re.compile(r".*[^aeiouy][aeiouy][^aeiouywxY]$")
ccy_exp = re.compile(r"([aeiouy])y")
s1a_exp = re.compile(r"[aeiouy].")
s1b_exp = re.compile(r"[aeiouy]")


def get_r1(word):
    # exceptional forms
    if word.startswith('gener') or word.startswith('arsen'):
        return 5
    if word.startswith('commun'):
        return 6

    # normal form
    match = r_exp.match(word)
    if match:
        return match.start(1)
    return len(word)


def get_r2(word):
    match = r_exp.match(word, get_r1(word))
    if match:
        return match.start(1)
    return len(word)


def ends_with_short_syllable(word):
    if len(word) == 2:
        if ewss_exp1.match(word):
            return True
    if ewss_exp2.match(word):
        return True
    return False


def is_short_word(word):
    if ends_with_short_syllable(word):
        if get_r1(word) == len(word):
            return True
    return False


def remove_initial_apostrophe(word):
    if word.startswith("'"):
        return word[1:]
    return word


def capitalize_consonant_ys(word):
    if word.startswith('y'):
        word = 'Y' + word[1:]
    return ccy_exp.sub('\g<1>Y', word)


def step_0(word):
    if word.endswith("'s'"):
        return word[:-3]
    if word.endswith("'s"):
        return word[:-2]
    if word.endswith("'"):
        return word[:-1]
    return word


def step_1a(word):
    if word.endswith('sses'):
        return word[:-4] + 'ss'
    if word.endswith('ied') or word.endswith('ies'):
        if len(word) > 4:
            return word[:-3] + 'i'
        else:
            return word[:-3] + 'ie'
    if word.endswith('us') or word.endswith('ss'):
        return word
    if word.endswith('s'):
        preceding = word[:-1]
        if s1a_exp.search(preceding):
            return preceding
        return word
    return word


doubles = ('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt')


def ends_with_double(word):
    for double in doubles:
        if word.endswith(double):
            return True
    return False


def step_1b_helper(word):
    if word.endswith('at') or word.endswith('bl') or word.endswith('iz'):
        return word + 'e'
    if ends_with_double(word):
        return word[:-1]
    if is_short_word(word):
        return word + 'e'
    return word


s1b_suffixes = ('ed', 'edly', 'ing', 'ingly')


def step_1b(word, r1):
    if word.endswith('eedly'):
        if len(word) - 5 >= r1:
            return word[:-3]
        return word
    if word.endswith('eed'):
        if len(word) - 3 >= r1:
            return word[:-1]
        return word

    for suffix in s1b_suffixes:
        if word.endswith(suffix):
            preceding = word[:-len(suffix)]
            if s1b_exp.search(preceding):
                return step_1b_helper(preceding)
            return word

    return word


def step_1c(word):
    if word.endswith('y') or word.endswith('Y') and len(word) > 1:
        if word[-2] not in 'aeiouy':
            if len(word) > 2:
                return word[:-1] + 'i'
    return word


def step_2_helper(word, r1, end, repl, prev):
        if word.endswith(end):
            if len(word) - len(end) >= r1:
                if prev == []:
                    return word[:-len(end)] + repl
                for p in prev:
                    if word[:-len(end)].endswith(p):
                        return word[:-len(end)] + repl
            return word
        return None


s2_triples = (('ization', 'ize', []),
               ('ational', 'ate', []),
               ('fulness', 'ful', []),
               ('ousness', 'ous', []),
               ('iveness', 'ive', []),
               ('tional', 'tion', []),
               ('biliti', 'ble', []),
               ('lessli', 'less', []),
               ('entli', 'ent', []),
               ('ation', 'ate', []),
               ('alism', 'al', []),
               ('aliti', 'al', []),
               ('ousli', 'ous', []),
               ('iviti', 'ive', []),
               ('fulli', 'ful', []),
               ('enci', 'ence', []),
               ('anci', 'ance', []),
               ('abli', 'able', []),
               ('izer', 'ize', []),
               ('ator', 'ate', []),
               ('alli', 'al', []),
               ('bli', 'ble', []),
               ('ogi', 'og', ['l']),
               ('li', '', ['c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't']))


def step_2(word, r1):
    for trip in s2_triples:
        attempt = step_2_helper(word, r1, trip[0], trip[1], trip[2])
        if attempt:
            return attempt
    return word


def step_3_helper(word, r1, r2, end, repl, r2_necessary):
    if word.endswith(end):
        if len(word) - len(end) >= r1:
            if not r2_necessary:
                return word[:-len(end)] + repl
            else:
                if len(word) - len(end) >= r2:
                    return word[:-len(end)] + repl
        return word
    return None


s3_triples = (('ational', 'ate', False),
               ('tional', 'tion', False),
               ('alize', 'al', False),
               ('icate', 'ic', False),
               ('iciti', 'ic', False),
               ('ative', '', True),
               ('ical', 'ic', False),
               ('ness', '', False),
               ('ful', '', False))


def step_3(word, r1, r2):
    for trip in s3_triples:
        attempt = step_3_helper(word, r1, r2, trip[0], trip[1], trip[2])
        if attempt:
            return attempt
    return word


s4_delete_list = ('al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement',
                  'ment', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize')


def step_4(word, r2):
    for end in s4_delete_list:
        if word.endswith(end):
            if len(word) - len(end) >= r2:
                return word[:-len(end)]
            return word

    if word.endswith('sion') or word.endswith('tion'):
        if len(word) - 3 >= r2:
            return word[:-3]

    return word


def step_5(word, r1, r2):
    if word.endswith('l'):
        if len(word) - 1 >= r2 and word[-2] == 'l':
            return word[:-1]
        return word

    if word.endswith('e'):
        if len(word) - 1 >= r2:
            return word[:-1]
        if len(word) - 1 >= r1 and not ends_with_short_syllable(word[:-1]):
            return word[:-1]

    return word


def normalize_ys(word):
    return word.replace('Y', 'y')


exceptional_forms = {'skis': 'ski',
                    'skies': 'sky',
                    'dying': 'die',
                    'lying': 'lie',
                    'tying': 'tie',
                    'idly': 'idl',
                    'gently': 'gentl',
                    'ugly': 'ugli',
                    'early': 'earli',
                    'only': 'onli',
                    'singly': 'singl',
                    'sky': 'sky',
                    'news': 'news',
                    'howe': 'howe',
                    'atlas': 'atlas',
                    'cosmos': 'cosmos',
                    'bias': 'bias',
                    'andes': 'andes'}

exceptional_early_exit_post_1a = frozenset(['inning', 'outing', 'canning', 'herring',
                                            'earring', 'proceed', 'exceed', 'succeed'])


def stem(word):
    if len(word) <= 2:
        return word
    word = remove_initial_apostrophe(word)

    # handle some exceptional forms
    if word in exceptional_forms:
        return exceptional_forms[word]

    word = capitalize_consonant_ys(word)
    r1 = get_r1(word)
    r2 = get_r2(word)
    word = step_0(word)
    word = step_1a(word)

    # handle some more exceptional forms
    if word in exceptional_early_exit_post_1a:
        return word

    word = step_1b(word, r1)
    word = step_1c(word)
    word = step_2(word, r1)
    word = step_3(word, r1, r2)
    word = step_4(word, r2)
    word = step_5(word, r1, r2)
    word = normalize_ys(word)

    return word