contrib/regular_languages/regex_parser.py

"""
Parser for parsing a regular expression.
Take a string representing a regular expression and return the root node of its
parse tree.

usage::

    root_node = parse_regex('(hello|world)')

Remarks:
- The regex parser processes multiline, it ignores all whitespace and supports
  multiple named groups with the same name and #-style comments.

Limitations:
- Lookahead is not supported.
"""
from __future__ import unicode_literals
import re

__all__ = (
    'Repeat',
    'Variable',
    'Regex',
    'Lookahead',

    'tokenize_regex',
    'parse_regex',
)


class Node(object):
    """
    Base class for all the grammar nodes.
    (You don't initialize this one.)
    """
    def __add__(self, other_node):
        return Sequence([self, other_node])

    def __or__(self, other_node):
        return Any([self, other_node])


class Any(Node):
    """
    Union operation (OR operation) between several grammars. You don't
    initialize this yourself, but it's a result of a "Grammar1 | Grammar2"
    operation.
    """
    def __init__(self, children):
        self.children = children

    def __or__(self, other_node):
        return Any(self.children + [other_node])

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.children)


class Sequence(Node):
    """
    Concatenation operation of several grammars. You don't initialize this
    yourself, but it's a result of a "Grammar1 + Grammar2" operation.
    """
    def __init__(self, children):
        self.children = children

    def __add__(self, other_node):
        return Sequence(self.children + [other_node])

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.children)


class Regex(Node):
    """
    Regular expression.
    """
    def __init__(self, regex):
        re.compile(regex)  # Validate

        self.regex = regex

    def __repr__(self):
        return '%s(/%s/)' % (self.__class__.__name__, self.regex)


class Lookahead(Node):
    """
    Lookahead expression.
    """
    def __init__(self, childnode, negative=False):
        self.childnode = childnode
        self.negative = negative

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.childnode)


class Variable(Node):
    """
    Mark a variable in the regular grammar. This will be translated into a
    named group. Each variable can have his own completer, validator, etc..

    :param childnode: The grammar which is wrapped inside this variable.
    :param varname: String.
    """
    def __init__(self, childnode, varname=None):
        self.childnode = childnode
        self.varname = varname

    def __repr__(self):
        return '%s(childnode=%r, varname=%r)' % (
            self.__class__.__name__, self.childnode, self.varname)


class Repeat(Node):
    def __init__(self, childnode, min_repeat=0, max_repeat=None, greedy=True):
        self.childnode = childnode
        self.min_repeat = min_repeat
        self.max_repeat = max_repeat
        self.greedy = greedy

    def __repr__(self):
        return '%s(childnode=%r)' % (self.__class__.__name__, self.childnode)


def tokenize_regex(input):
    """
    Takes a string, representing a regular expression as input, and tokenizes
    it.

    :param input: string, representing a regular expression.
    :returns: List of tokens.
    """
    # Regular expression for tokenizing other regular expressions.
    p = re.compile(r'''^(
        \(\?P\<[a-zA-Z0-9_-]+\>  | # Start of named group.
        \(\?#[^)]*\)             | # Comment
        \(\?=                    | # Start of lookahead assertion
        \(\?!                    | # Start of negative lookahead assertion
        \(\?<=                   | # If preceded by.
        \(\?<                    | # If not preceded by.
        \(?:                     | # Start of group. (non capturing.)
        \(                       | # Start of group.
        \(?[iLmsux]              | # Flags.
        \(?P=[a-zA-Z]+\)         | # Back reference to named group
        \)                       | # End of group.
        \{[^{}]*\}               | # Repetition
        \*\? | \+\? | \?\?\      | # Non greedy repetition.
        \* | \+ | \?             | # Repetition
        \#.*\n                   | # Comment
        \\. |

        # Character group.
        \[
            ( [^\]\\]  |  \\.)*
        \]                  |

        [^(){}]             |
        .
    )''', re.VERBOSE)

    tokens = []

    while input:
        m = p.match(input)
        if m:
            token, input = input[:m.end()], input[m.end():]
            if not token.isspace():
                tokens.append(token)
        else:
            raise Exception('Could not tokenize input regex.')

    return tokens


def parse_regex(regex_tokens):
    """
    Takes a list of tokens from the tokenizer, and returns a parse tree.
    """
    # We add a closing brace because that represents the final pop of the stack.
    tokens = [')'] + regex_tokens[::-1]

    def wrap(lst):
        """ Turn list into sequence when it contains several items. """
        if len(lst) == 1:
            return lst[0]
        else:
            return Sequence(lst)

    def _parse():
        or_list = []
        result = []

        def wrapped_result():
            if or_list == []:
                return wrap(result)
            else:
                or_list.append(result)
                return Any([wrap(i) for i in or_list])

        while tokens:
            t = tokens.pop()

            if t.startswith('(?P<'):
                variable = Variable(_parse(), varname=t[4:-1])
                result.append(variable)

            elif t in ('*', '*?'):
                greedy = (t == '*')
                result[-1] = Repeat(result[-1], greedy=greedy)

            elif t in ('+', '+?'):
                greedy = (t == '+')
                result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy)

            elif t in ('?', '??'):
                if result == []:
                    raise Exception('Nothing to repeat.' + repr(tokens))
                else:
                    greedy = (t == '?')
                    result[-1] = Repeat(result[-1], min_repeat=0, max_repeat=1, greedy=greedy)

            elif t == '|':
                or_list.append(result)
                result = []

            elif t in ('(', '(?:'):
                result.append(_parse())

            elif t == '(?!':
                result.append(Lookahead(_parse(), negative=True))

            elif t == '(?=':
                result.append(Lookahead(_parse(), negative=False))

            elif t == ')':
                return wrapped_result()

            elif t.startswith('#'):
                pass

            elif t.startswith('{'):
                # TODO: implement!
                raise Exception('{}-style repitition not yet supported' % t)

            elif t.startswith('(?'):
                raise Exception('%r not supported' % t)

            elif t.isspace():
                pass
            else:
                result.append(Regex(t))

        raise Exception("Expecting ')' token")

    result = _parse()

    if len(tokens) != 0:
        raise Exception("Unmatched parantheses.")
    else:
        return result