1""" 2Parser for parsing a regular expression. 3Take a string representing a regular expression and return the root node of its 4parse tree. 5 6usage:: 7 8 root_node = parse_regex('(hello|world)') 9 10Remarks: 11- The regex parser processes multiline, it ignores all whitespace and supports 12 multiple named groups with the same name and #-style comments. 13 14Limitations: 15- Lookahead is not supported. 16""" 17from __future__ import unicode_literals 18import re 19 20__all__ = ( 21 'Repeat', 22 'Variable', 23 'Regex', 24 'Lookahead', 25 26 'tokenize_regex', 27 'parse_regex', 28) 29 30 31class Node(object): 32 """ 33 Base class for all the grammar nodes. 34 (You don't initialize this one.) 35 """ 36 def __add__(self, other_node): 37 return Sequence([self, other_node]) 38 39 def __or__(self, other_node): 40 return Any([self, other_node]) 41 42 43class Any(Node): 44 """ 45 Union operation (OR operation) between several grammars. You don't 46 initialize this yourself, but it's a result of a "Grammar1 | Grammar2" 47 operation. 48 """ 49 def __init__(self, children): 50 self.children = children 51 52 def __or__(self, other_node): 53 return Any(self.children + [other_node]) 54 55 def __repr__(self): 56 return '%s(%r)' % (self.__class__.__name__, self.children) 57 58 59class Sequence(Node): 60 """ 61 Concatenation operation of several grammars. You don't initialize this 62 yourself, but it's a result of a "Grammar1 + Grammar2" operation. 63 """ 64 def __init__(self, children): 65 self.children = children 66 67 def __add__(self, other_node): 68 return Sequence(self.children + [other_node]) 69 70 def __repr__(self): 71 return '%s(%r)' % (self.__class__.__name__, self.children) 72 73 74class Regex(Node): 75 """ 76 Regular expression. 77 """ 78 def __init__(self, regex): 79 re.compile(regex) # Validate 80 81 self.regex = regex 82 83 def __repr__(self): 84 return '%s(/%s/)' % (self.__class__.__name__, self.regex) 85 86 87class Lookahead(Node): 88 """ 89 Lookahead expression. 90 """ 91 def __init__(self, childnode, negative=False): 92 self.childnode = childnode 93 self.negative = negative 94 95 def __repr__(self): 96 return '%s(%r)' % (self.__class__.__name__, self.childnode) 97 98 99class Variable(Node): 100 """ 101 Mark a variable in the regular grammar. This will be translated into a 102 named group. Each variable can have his own completer, validator, etc.. 103 104 :param childnode: The grammar which is wrapped inside this variable. 105 :param varname: String. 106 """ 107 def __init__(self, childnode, varname=None): 108 self.childnode = childnode 109 self.varname = varname 110 111 def __repr__(self): 112 return '%s(childnode=%r, varname=%r)' % ( 113 self.__class__.__name__, self.childnode, self.varname) 114 115 116class Repeat(Node): 117 def __init__(self, childnode, min_repeat=0, max_repeat=None, greedy=True): 118 self.childnode = childnode 119 self.min_repeat = min_repeat 120 self.max_repeat = max_repeat 121 self.greedy = greedy 122 123 def __repr__(self): 124 return '%s(childnode=%r)' % (self.__class__.__name__, self.childnode) 125 126 127def tokenize_regex(input): 128 """ 129 Takes a string, representing a regular expression as input, and tokenizes 130 it. 131 132 :param input: string, representing a regular expression. 133 :returns: List of tokens. 134 """ 135 # Regular expression for tokenizing other regular expressions. 136 p = re.compile(r'''^( 137 \(\?P\<[a-zA-Z0-9_-]+\> | # Start of named group. 138 \(\?#[^)]*\) | # Comment 139 \(\?= | # Start of lookahead assertion 140 \(\?! | # Start of negative lookahead assertion 141 \(\?<= | # If preceded by. 142 \(\?< | # If not preceded by. 143 \(?: | # Start of group. (non capturing.) 144 \( | # Start of group. 145 \(?[iLmsux] | # Flags. 146 \(?P=[a-zA-Z]+\) | # Back reference to named group 147 \) | # End of group. 148 \{[^{}]*\} | # Repetition 149 \*\? | \+\? | \?\?\ | # Non greedy repetition. 150 \* | \+ | \? | # Repetition 151 \#.*\n | # Comment 152 \\. | 153 154 # Character group. 155 \[ 156 ( [^\]\\] | \\.)* 157 \] | 158 159 [^(){}] | 160 . 161 )''', re.VERBOSE) 162 163 tokens = [] 164 165 while input: 166 m = p.match(input) 167 if m: 168 token, input = input[:m.end()], input[m.end():] 169 if not token.isspace(): 170 tokens.append(token) 171 else: 172 raise Exception('Could not tokenize input regex.') 173 174 return tokens 175 176 177def parse_regex(regex_tokens): 178 """ 179 Takes a list of tokens from the tokenizer, and returns a parse tree. 180 """ 181 # We add a closing brace because that represents the final pop of the stack. 182 tokens = [')'] + regex_tokens[::-1] 183 184 def wrap(lst): 185 """ Turn list into sequence when it contains several items. """ 186 if len(lst) == 1: 187 return lst[0] 188 else: 189 return Sequence(lst) 190 191 def _parse(): 192 or_list = [] 193 result = [] 194 195 def wrapped_result(): 196 if or_list == []: 197 return wrap(result) 198 else: 199 or_list.append(result) 200 return Any([wrap(i) for i in or_list]) 201 202 while tokens: 203 t = tokens.pop() 204 205 if t.startswith('(?P<'): 206 variable = Variable(_parse(), varname=t[4:-1]) 207 result.append(variable) 208 209 elif t in ('*', '*?'): 210 greedy = (t == '*') 211 result[-1] = Repeat(result[-1], greedy=greedy) 212 213 elif t in ('+', '+?'): 214 greedy = (t == '+') 215 result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy) 216 217 elif t in ('?', '??'): 218 if result == []: 219 raise Exception('Nothing to repeat.' + repr(tokens)) 220 else: 221 greedy = (t == '?') 222 result[-1] = Repeat(result[-1], min_repeat=0, max_repeat=1, greedy=greedy) 223 224 elif t == '|': 225 or_list.append(result) 226 result = [] 227 228 elif t in ('(', '(?:'): 229 result.append(_parse()) 230 231 elif t == '(?!': 232 result.append(Lookahead(_parse(), negative=True)) 233 234 elif t == '(?=': 235 result.append(Lookahead(_parse(), negative=False)) 236 237 elif t == ')': 238 return wrapped_result() 239 240 elif t.startswith('#'): 241 pass 242 243 elif t.startswith('{'): 244 # TODO: implement! 245 raise Exception('{}-style repitition not yet supported' % t) 246 247 elif t.startswith('(?'): 248 raise Exception('%r not supported' % t) 249 250 elif t.isspace(): 251 pass 252 else: 253 result.append(Regex(t)) 254 255 raise Exception("Expecting ')' token") 256 257 result = _parse() 258 259 if len(tokens) != 0: 260 raise Exception("Unmatched parantheses.") 261 else: 262 return result 263