1"""
2Parser for parsing a regular expression.
3Take a string representing a regular expression and return the root node of its
4parse tree.
5
6usage::
7
8    root_node = parse_regex('(hello|world)')
9
10Remarks:
- The regex parser accepts multiline input: it ignores all whitespace, and it
  supports multiple named groups with the same name as well as #-style
  comments.
13
14Limitations:
15- Lookahead is not supported.
16"""
17from __future__ import unicode_literals
18import re
19
# Public API of this module. (The `Node`, `Any` and `Sequence` classes are
# intentionally not exported; they are produced by the `|` / `+` operators.)
__all__ = (
    'Repeat',
    'Variable',
    'Regex',
    'Lookahead',

    'tokenize_regex',
    'parse_regex',
)
29
30
class Node(object):
    """
    Abstract base class for all grammar nodes.
    (Not meant to be instantiated directly; concrete nodes are combined
    through the operators below.)
    """
    def __add__(self, other_node):
        """ ``a + b``: concatenate two grammars into a `Sequence`. """
        return Sequence([self, other_node])

    def __or__(self, other_node):
        """ ``a | b``: combine two grammars into an `Any` (union). """
        return Any([self, other_node])
41
42
class Any(Node):
    """
    Union (OR operation) of several grammars. Never created directly by the
    user; it is the result of a ``grammar1 | grammar2`` operation.
    """
    def __init__(self, children):
        self.children = children

    def __or__(self, other_node):
        # Flatten: extend this union rather than nesting Any inside Any.
        return Any(self.children + [other_node])

    def __repr__(self):
        return '{0}({1!r})'.format(self.__class__.__name__, self.children)
57
58
class Sequence(Node):
    """
    Concatenation of several grammars. Never created directly by the user;
    it is the result of a ``grammar1 + grammar2`` operation.
    """
    def __init__(self, children):
        self.children = children

    def __add__(self, other_node):
        # Flatten: extend this sequence rather than nesting Sequence inside
        # Sequence.
        return Sequence(self.children + [other_node])

    def __repr__(self):
        return '{0}({1!r})'.format(self.__class__.__name__, self.children)
72
73
class Regex(Node):
    """
    Leaf node holding a literal regular expression.

    :param regex: String containing the regular expression. It is compiled
        once at construction time, purely as a validity check.
    """
    def __init__(self, regex):
        re.compile(regex)  # Fail early on an invalid expression.
        self.regex = regex

    def __repr__(self):
        return '{0}(/{1}/)'.format(self.__class__.__name__, self.regex)
85
86
class Lookahead(Node):
    """
    Lookahead expression, "(?=...)" or — when negative — "(?!...)".

    :param childnode: The grammar that should (not) follow at this position.
    :param negative: Boolean; True for a negative lookahead.
    """
    def __init__(self, childnode, negative=False):
        self.childnode = childnode
        self.negative = negative

    def __repr__(self):
        return '{0}({1!r})'.format(self.__class__.__name__, self.childnode)
97
98
class Variable(Node):
    """
    Mark a variable in the regular grammar. This will be translated into a
    named group. Each variable can have its own completer, validator, etc..

    :param childnode: The grammar which is wrapped inside this variable.
    :param varname: String.
    """
    def __init__(self, childnode, varname=None):
        self.childnode = childnode
        self.varname = varname

    def __repr__(self):
        return '{0}(childnode={1!r}, varname={2!r})'.format(
            self.__class__.__name__, self.childnode, self.varname)
114
115
class Repeat(Node):
    """
    Repetition of a grammar, like the regex "*", "+" and "?" operators.

    :param childnode: The grammar to be repeated.
    :param min_repeat: Integer; minimum number of repetitions (default 0).
    :param max_repeat: Integer or None; None means unbounded.
    :param greedy: Boolean; False for the non-greedy variants ("*?", ...).
    """
    def __init__(self, childnode, min_repeat=0, max_repeat=None, greedy=True):
        self.childnode = childnode
        self.min_repeat = min_repeat
        self.max_repeat = max_repeat
        self.greedy = greedy

    def __repr__(self):
        return '{0}(childnode={1!r})'.format(
            self.__class__.__name__, self.childnode)
125
126
def tokenize_regex(input):
    """
    Takes a string, representing a regular expression as input, and tokenizes
    it.

    :param input: string, representing a regular expression.
    :returns: List of tokens (strings). Whitespace-only tokens are dropped.
    :raises Exception: when a piece of the input cannot be tokenized.
    """
    # Regular expression for tokenizing other regular expressions.
    # Notes:
    # - Alternatives are tried in order, so every multi-character "(?..."
    #   token must appear before the plain "\(" alternative.
    # - In re.VERBOSE mode an unescaped '#' starts a pattern comment, so the
    #   '#' inside the "(?#...)" alternative must be escaped.
    p = re.compile(r'''^(
        \(\?P\<[a-zA-Z0-9_-]+\>  | # Start of named group.
        \(\?P=[a-zA-Z]+\)        | # Back reference to named group.
        \(\?\#[^)]*\)            | # Comment.
        \(\?=                    | # Start of positive lookahead assertion.
        \(\?!                    | # Start of negative lookahead assertion.
        \(\?<=                   | # Start of positive lookbehind assertion.
        \(\?<                    | # Lookbehind prefix (e.g. of "(?<!").
        \(\?[iLmsux]             | # Flags.
        \(\?:                    | # Start of group. (non capturing.)
        \(                       | # Start of group.
        \)                       | # End of group.
        \{[^{}]*\}               | # Repetition.
        \*\? | \+\? | \?\?       | # Non greedy repetition.
        \* | \+ | \?             | # Repetition.
        \#.*\n                   | # Comment.
        \\.                      | # Escaped character.

        # Character group.
        \[
            ( [^\]\\]  |  \\.)*
        \]                  |

        [^(){}]             |
        .
    )''', re.VERBOSE)

    tokens = []

    # Repeatedly strip one token off the front of the remaining input.
    # (NOTE: the parameter name `input` shadows the builtin; kept for
    # backward compatibility with keyword callers.)
    while input:
        m = p.match(input)
        if m:
            token, input = input[:m.end()], input[m.end():]
            if not token.isspace():
                tokens.append(token)
        else:
            raise Exception('Could not tokenize input regex.')

    return tokens
175
176
def parse_regex(regex_tokens):
    """
    Takes a list of tokens from the tokenizer, and returns a parse tree.

    :param regex_tokens: List of token strings, as produced by
        `tokenize_regex`.
    :returns: The root `Node` of the parse tree.
    :raises Exception: on unbalanced parentheses, a repetition operator with
        nothing to repeat, or an unsupported construct.
    """
    # We add a closing brace because that represents the final pop of the stack.
    tokens = [')'] + regex_tokens[::-1]

    def wrap(lst):
        """ Turn list into sequence when it contains several items. """
        if len(lst) == 1:
            return lst[0]
        else:
            return Sequence(lst)

    def _parse():
        # Parse tokens up to (and including) the matching ')'.
        or_list = []    # Completed alternatives, separated by '|'.
        result = []     # Nodes of the alternative currently being built.

        def wrapped_result():
            if or_list == []:
                return wrap(result)
            else:
                or_list.append(result)
                return Any([wrap(i) for i in or_list])

        while tokens:
            t = tokens.pop()

            if t.startswith('(?P<'):
                # Named group: becomes a Variable node wrapping the group.
                variable = Variable(_parse(), varname=t[4:-1])
                result.append(variable)

            elif t in ('*', '*?'):
                if result == []:
                    raise Exception('Nothing to repeat.' + repr(tokens))
                greedy = (t == '*')
                result[-1] = Repeat(result[-1], greedy=greedy)

            elif t in ('+', '+?'):
                if result == []:
                    raise Exception('Nothing to repeat.' + repr(tokens))
                greedy = (t == '+')
                result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy)

            elif t in ('?', '??'):
                if result == []:
                    raise Exception('Nothing to repeat.' + repr(tokens))
                else:
                    greedy = (t == '?')
                    result[-1] = Repeat(result[-1], min_repeat=0,
                                        max_repeat=1, greedy=greedy)

            elif t == '|':
                # Alternation: stash the current sequence, start a new one.
                or_list.append(result)
                result = []

            elif t in ('(', '(?:'):
                result.append(_parse())

            elif t == '(?!':
                result.append(Lookahead(_parse(), negative=True))

            elif t == '(?=':
                result.append(Lookahead(_parse(), negative=False))

            elif t == ')':
                return wrapped_result()

            elif t.startswith('#'):
                # '#'-style comment: ignore.
                pass

            elif t.startswith('{'):
                # TODO: implement!
                raise Exception('%s-style repetition not yet supported' % t)

            elif t.startswith('(?'):
                raise Exception('%r not supported' % t)

            elif t.isspace():
                pass
            else:
                # Any other token is a literal regex fragment.
                result.append(Regex(t))

        raise Exception("Expecting ')' token")

    result = _parse()

    if len(tokens) != 0:
        raise Exception("Unmatched parentheses.")
    else:
        return result
263