1# Copyright (c) Django Software Foundation and individual contributors. 2# All rights reserved. 3# 4# Redistribution and use in source and binary forms, with or without modification, 5# are permitted provided that the following conditions are met: 6# 7# 1. Redistributions of source code must retain the above copyright notice, 8# this list of conditions and the following disclaimer. 9# 10# 2. Redistributions in binary form must reproduce the above copyright 11# notice, this list of conditions and the following disclaimer in the 12# documentation and/or other materials provided with the distribution. 13# 14# 3. Neither the name of Django nor the names of its contributors may be used 15# to endorse or promote products derived from this software without 16# specific prior written permission. 17# 18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 29""" 30Functions for reversing a regular expression (used in reverse URL resolving). 31Used internally by Django and not intended for external use. 32 33This is not, and is not intended to be, a complete reg-exp decompiler. It 34should be good enough for a large class of URLS, however. 35""" 36 37from weboob.tools.compat import basestring 38 39# Mapping of an escape character to a representative of that class. So, e.g., 40# "\w" is replaced by "x" in a reverse URL. A value of None means to ignore 41# this sequence. Any missing key is mapped to itself. 42ESCAPE_MAPPINGS = { 43 "A": None, 44 "b": None, 45 "B": None, 46 "d": u"0", 47 "D": u"x", 48 "s": u" ", 49 "S": u"x", 50 "w": u"x", 51 "W": u"!", 52 "Z": None, 53} 54 55 56class Choice(list): 57 """ 58 Used to represent multiple possibilities at this point in a pattern string. 59 We use a distinguished type, rather than a list, so that the usage in the 60 code is clear. 61 """ 62 63 64class Group(list): 65 """ 66 Used to represent a capturing group in the pattern string. 67 """ 68 69 70class NonCapture(list): 71 """ 72 Used to represent a non-capturing group in the pattern string. 73 """ 74 75 76def normalize(pattern): 77 """ 78 Given a reg-exp pattern, normalizes it to a list of forms that suffice for 79 reverse matching. This does the following: 80 81 (1) For any repeating sections, keeps the minimum number of occurrences 82 permitted (this means zero for optional groups). 83 (2) If an optional group includes parameters, include one occurrence of 84 that group (along with the zero occurrence case from step (1)). 85 (3) Select the first (essentially an arbitrary) element from any character 86 class. Select an arbitrary character for any unordered class (e.g. '.' 87 or '\w') in the pattern. 88 (4) Ignore comments and any of the reg-exp flags that won't change 89 what we construct ("iLmsu"). "(?x)" is an error, however. 90 (5) Raise an error on all other non-capturing (?...) forms (e.g. 91 look-ahead and look-behind matches) and any disjunctive ('|') 92 constructs. 93 94 Django's URLs for forward resolving are either all positional arguments or 95 all keyword arguments. That is assumed here, as well. Although reverse 96 resolving can be done using positional args when keyword args are 97 specified, the two cannot be mixed in the same reverse() call. 98 """ 99 # Do a linear scan to work out the special features of this pattern. The 100 # idea is that we scan once here and collect all the information we need to 101 # make future decisions. 102 result = [] 103 non_capturing_groups = [] 104 consume_next = True 105 pattern_iter = next_char(iter(pattern)) 106 num_args = 0 107 108 # A "while" loop is used here because later on we need to be able to peek 109 # at the next character and possibly go around without consuming another 110 # one at the top of the loop. 111 try: 112 ch, escaped = next(pattern_iter) 113 except StopIteration: 114 return list(zip([u''], [[]])) 115 116 try: 117 while True: 118 if escaped: 119 result.append(ch) 120 elif ch == '.': 121 # Replace "any character" with an arbitrary representative. 122 result.append(u".") 123 elif ch == '|': 124 # FIXME: One day we'll should do this, but not in 1.0. 125 raise NotImplementedError() 126 elif ch == "^": 127 pass 128 elif ch == '$': 129 break 130 elif ch == ')': 131 # This can only be the end of a non-capturing group, since all 132 # other unescaped parentheses are handled by the grouping 133 # section later (and the full group is handled there). 134 # 135 # We regroup everything inside the capturing group so that it 136 # can be quantified, if necessary. 137 start = non_capturing_groups.pop() 138 inner = NonCapture(result[start:]) 139 result = result[:start] + [inner] 140 elif ch == '[': 141 # Replace ranges with the first character in the range. 142 ch, escaped = next(pattern_iter) 143 result.append(ch) 144 ch, escaped = next(pattern_iter) 145 while escaped or ch != ']': 146 ch, escaped = next(pattern_iter) 147 elif ch == '(': 148 # Some kind of group. 149 ch, escaped = next(pattern_iter) 150 if ch != '?' or escaped: 151 # A positional group 152 name = "_%d" % num_args 153 num_args += 1 154 result.append(Group(((u"%%(%s)s" % name), name))) 155 walk_to_end(ch, pattern_iter) 156 else: 157 ch, escaped = next(pattern_iter) 158 if ch in "iLmsu#": 159 # All of these are ignorable. Walk to the end of the 160 # group. 161 walk_to_end(ch, pattern_iter) 162 elif ch == ':': 163 # Non-capturing group 164 non_capturing_groups.append(len(result)) 165 elif ch != 'P': 166 # Anything else, other than a named group, is something 167 # we cannot reverse. 168 raise ValueError("Non-reversible reg-exp portion: '(?%s'" % ch) 169 else: 170 ch, escaped = next(pattern_iter) 171 if ch not in ('<', '='): 172 raise ValueError("Non-reversible reg-exp portion: '(?P%s'" % ch) 173 # We are in a named capturing group. Extra the name and 174 # then skip to the end. 175 if ch == '<': 176 terminal_char = '>' 177 # We are in a named backreference. 178 else: 179 terminal_char = ')' 180 name = [] 181 ch, escaped = next(pattern_iter) 182 while ch != terminal_char: 183 name.append(ch) 184 ch, escaped = next(pattern_iter) 185 param = ''.join(name) 186 # Named backreferences have already consumed the 187 # parenthesis. 188 if terminal_char != ')': 189 result.append(Group(((u"%%(%s)s" % param), param))) 190 walk_to_end(ch, pattern_iter) 191 else: 192 result.append(Group(((u"%%(%s)s" % param), None))) 193 elif ch in "*?+{": 194 # Quanitifers affect the previous item in the result list. 195 count, ch = get_quantifier(ch, pattern_iter) 196 if ch: 197 # We had to look ahead, but it wasn't need to compute the 198 # quanitifer, so use this character next time around the 199 # main loop. 200 consume_next = False 201 202 if count == 0: 203 if contains(result[-1], Group): 204 # If we are quantifying a capturing group (or 205 # something containing such a group) and the minimum is 206 # zero, we must also handle the case of one occurrence 207 # being present. All the quantifiers (except {0,0}, 208 # which we conveniently ignore) that have a 0 minimum 209 # also allow a single occurrence. 210 result[-1] = Choice([None, result[-1]]) 211 else: 212 result.pop() 213 elif count > 1: 214 result.extend([result[-1]] * (count - 1)) 215 else: 216 # Anything else is a literal. 217 result.append(ch) 218 219 if consume_next: 220 ch, escaped = next(pattern_iter) 221 else: 222 consume_next = True 223 except StopIteration: 224 pass 225 except NotImplementedError: 226 # A case of using the disjunctive form. No results for you! 227 return list(zip([u''], [[]])) 228 229 return list(zip(*flatten_result(result))) 230 231 232def next_char(input_iter): 233 """ 234 An iterator that yields the next character from "pattern_iter", respecting 235 escape sequences. An escaped character is replaced by a representative of 236 its class (e.g. \w -> "x"). If the escaped character is one that is 237 skipped, it is not returned (the next character is returned instead). 238 239 Yields the next character, along with a boolean indicating whether it is a 240 raw (unescaped) character or not. 241 """ 242 for ch in input_iter: 243 if ch != '\\': 244 yield ch, False 245 continue 246 ch = next(input_iter) 247 representative = ESCAPE_MAPPINGS.get(ch, ch) 248 if representative is None: 249 continue 250 yield representative, True 251 252 253def walk_to_end(ch, input_iter): 254 """ 255 The iterator is currently inside a capturing group. We want to walk to the 256 close of this group, skipping over any nested groups and handling escaped 257 parentheses correctly. 258 """ 259 if ch == '(': 260 nesting = 1 261 else: 262 nesting = 0 263 for ch, escaped in input_iter: 264 if escaped: 265 continue 266 elif ch == '(': 267 nesting += 1 268 elif ch == ')': 269 if not nesting: 270 return 271 nesting -= 1 272 273 274def get_quantifier(ch, input_iter): 275 """ 276 Parse a quantifier from the input, where "ch" is the first character in the 277 quantifier. 278 279 Returns the minimum number of occurences permitted by the quantifier and 280 either None or the next character from the input_iter if the next character 281 is not part of the quantifier. 282 """ 283 if ch in '*?+': 284 try: 285 ch2, escaped = next(input_iter) 286 except StopIteration: 287 ch2 = None 288 if ch2 == '?': 289 ch2 = None 290 if ch == '+': 291 return 1, ch2 292 return 0, ch2 293 294 quant = [] 295 while ch != '}': 296 ch, escaped = next(input_iter) 297 quant.append(ch) 298 quant = quant[:-1] 299 values = ''.join(quant).split(',') 300 301 # Consume the trailing '?', if necessary. 302 try: 303 ch, escaped = next(input_iter) 304 except StopIteration: 305 ch = None 306 if ch == '?': 307 ch = None 308 return int(values[0]), ch 309 310 311def contains(source, inst): 312 """ 313 Returns True if the "source" contains an instance of "inst". False, 314 otherwise. 315 """ 316 if isinstance(source, inst): 317 return True 318 if isinstance(source, NonCapture): 319 for elt in source: 320 if contains(elt, inst): 321 return True 322 return False 323 324 325def flatten_result(source): 326 """ 327 Turns the given source sequence into a list of reg-exp possibilities and 328 their arguments. Returns a list of strings and a list of argument lists. 329 Each of the two lists will be of the same length. 330 """ 331 if source is None: 332 return [u''], [[]] 333 if isinstance(source, Group): 334 if source[1] is None: 335 params = [] 336 else: 337 params = [source[1]] 338 return [source[0]], [params] 339 result = [u''] 340 result_args = [[]] 341 pos = last = 0 342 for pos, elt in enumerate(source): 343 if isinstance(elt, basestring): 344 continue 345 piece = u''.join(source[last:pos]) 346 if isinstance(elt, Group): 347 piece += elt[0] 348 param = elt[1] 349 else: 350 param = None 351 last = pos + 1 352 for i in range(len(result)): 353 result[i] += piece 354 if param: 355 result_args[i].append(param) 356 if isinstance(elt, (Choice, NonCapture)): 357 if isinstance(elt, NonCapture): 358 elt = [elt] 359 inner_result, inner_args = [], [] 360 for item in elt: 361 res, args = flatten_result(item) 362 inner_result.extend(res) 363 inner_args.extend(args) 364 new_result = [] 365 new_args = [] 366 for item, args in zip(result, result_args): 367 for i_item, i_args in zip(inner_result, inner_args): 368 new_result.append(item + i_item) 369 new_args.append(args[:] + i_args) 370 result = new_result 371 result_args = new_args 372 if pos >= last: 373 piece = u''.join(source[last:]) 374 for i in range(len(result)): 375 result[i] += piece 376 return result, result_args 377