1# Copyright (c) Django Software Foundation and individual contributors.
2# All rights reserved.
3#
4# Redistribution and use in source and binary forms, with or without modification,
5# are permitted provided that the following conditions are met:
6#
7#     1. Redistributions of source code must retain the above copyright notice,
8#        this list of conditions and the following disclaimer.
9#
10#     2. Redistributions in binary form must reproduce the above copyright
11#        notice, this list of conditions and the following disclaimer in the
12#        documentation and/or other materials provided with the distribution.
13#
14#     3. Neither the name of Django nor the names of its contributors may be used
15#        to endorse or promote products derived from this software without
16#        specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
22# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29"""
30Functions for reversing a regular expression (used in reverse URL resolving).
31Used internally by Django and not intended for external use.
32
33This is not, and is not intended to be, a complete reg-exp decompiler. It
34should be good enough for a large class of URLS, however.
35"""
36
37from weboob.tools.compat import basestring
38
39# Mapping of an escape character to a representative of that class. So, e.g.,
40# "\w" is replaced by "x" in a reverse URL. A value of None means to ignore
41# this sequence. Any missing key is mapped to itself.
42ESCAPE_MAPPINGS = {
43    "A": None,
44    "b": None,
45    "B": None,
46    "d": u"0",
47    "D": u"x",
48    "s": u" ",
49    "S": u"x",
50    "w": u"x",
51    "W": u"!",
52    "Z": None,
53}
54
55
56class Choice(list):
57    """
58    Used to represent multiple possibilities at this point in a pattern string.
59    We use a distinguished type, rather than a list, so that the usage in the
60    code is clear.
61    """
62
63
64class Group(list):
65    """
66    Used to represent a capturing group in the pattern string.
67    """
68
69
70class NonCapture(list):
71    """
72    Used to represent a non-capturing group in the pattern string.
73    """
74
75
76def normalize(pattern):
77    """
78    Given a reg-exp pattern, normalizes it to a list of forms that suffice for
79    reverse matching. This does the following:
80
81    (1) For any repeating sections, keeps the minimum number of occurrences
82        permitted (this means zero for optional groups).
83    (2) If an optional group includes parameters, include one occurrence of
84        that group (along with the zero occurrence case from step (1)).
85    (3) Select the first (essentially an arbitrary) element from any character
86        class. Select an arbitrary character for any unordered class (e.g. '.'
87        or '\w') in the pattern.
88    (4) Ignore comments and any of the reg-exp flags that won't change
89        what we construct ("iLmsu"). "(?x)" is an error, however.
90    (5) Raise an error on all other non-capturing (?...) forms (e.g.
91        look-ahead and look-behind matches) and any disjunctive ('|')
92        constructs.
93
94    Django's URLs for forward resolving are either all positional arguments or
95    all keyword arguments. That is assumed here, as well. Although reverse
96    resolving can be done using positional args when keyword args are
97    specified, the two cannot be mixed in the same reverse() call.
98    """
99    # Do a linear scan to work out the special features of this pattern. The
100    # idea is that we scan once here and collect all the information we need to
101    # make future decisions.
102    result = []
103    non_capturing_groups = []
104    consume_next = True
105    pattern_iter = next_char(iter(pattern))
106    num_args = 0
107
108    # A "while" loop is used here because later on we need to be able to peek
109    # at the next character and possibly go around without consuming another
110    # one at the top of the loop.
111    try:
112        ch, escaped = next(pattern_iter)
113    except StopIteration:
114        return list(zip([u''],  [[]]))
115
116    try:
117        while True:
118            if escaped:
119                result.append(ch)
120            elif ch == '.':
121                # Replace "any character" with an arbitrary representative.
122                result.append(u".")
123            elif ch == '|':
124                # FIXME: One day we'll should do this, but not in 1.0.
125                raise NotImplementedError()
126            elif ch == "^":
127                pass
128            elif ch == '$':
129                break
130            elif ch == ')':
131                # This can only be the end of a non-capturing group, since all
132                # other unescaped parentheses are handled by the grouping
133                # section later (and the full group is handled there).
134                #
135                # We regroup everything inside the capturing group so that it
136                # can be quantified, if necessary.
137                start = non_capturing_groups.pop()
138                inner = NonCapture(result[start:])
139                result = result[:start] + [inner]
140            elif ch == '[':
141                # Replace ranges with the first character in the range.
142                ch, escaped = next(pattern_iter)
143                result.append(ch)
144                ch, escaped = next(pattern_iter)
145                while escaped or ch != ']':
146                    ch, escaped = next(pattern_iter)
147            elif ch == '(':
148                # Some kind of group.
149                ch, escaped = next(pattern_iter)
150                if ch != '?' or escaped:
151                    # A positional group
152                    name = "_%d" % num_args
153                    num_args += 1
154                    result.append(Group(((u"%%(%s)s" % name), name)))
155                    walk_to_end(ch, pattern_iter)
156                else:
157                    ch, escaped = next(pattern_iter)
158                    if ch in "iLmsu#":
159                        # All of these are ignorable. Walk to the end of the
160                        # group.
161                        walk_to_end(ch, pattern_iter)
162                    elif ch == ':':
163                        # Non-capturing group
164                        non_capturing_groups.append(len(result))
165                    elif ch != 'P':
166                        # Anything else, other than a named group, is something
167                        # we cannot reverse.
168                        raise ValueError("Non-reversible reg-exp portion: '(?%s'" % ch)
169                    else:
170                        ch, escaped = next(pattern_iter)
171                        if ch not in ('<', '='):
172                            raise ValueError("Non-reversible reg-exp portion: '(?P%s'" % ch)
173                        # We are in a named capturing group. Extra the name and
174                        # then skip to the end.
175                        if ch == '<':
176                            terminal_char = '>'
177                        # We are in a named backreference.
178                        else:
179                            terminal_char = ')'
180                        name = []
181                        ch, escaped = next(pattern_iter)
182                        while ch != terminal_char:
183                            name.append(ch)
184                            ch, escaped = next(pattern_iter)
185                        param = ''.join(name)
186                        # Named backreferences have already consumed the
187                        # parenthesis.
188                        if terminal_char != ')':
189                            result.append(Group(((u"%%(%s)s" % param), param)))
190                            walk_to_end(ch, pattern_iter)
191                        else:
192                            result.append(Group(((u"%%(%s)s" % param), None)))
193            elif ch in "*?+{":
194                # Quanitifers affect the previous item in the result list.
195                count, ch = get_quantifier(ch, pattern_iter)
196                if ch:
197                    # We had to look ahead, but it wasn't need to compute the
198                    # quanitifer, so use this character next time around the
199                    # main loop.
200                    consume_next = False
201
202                if count == 0:
203                    if contains(result[-1], Group):
204                        # If we are quantifying a capturing group (or
205                        # something containing such a group) and the minimum is
206                        # zero, we must also handle the case of one occurrence
207                        # being present. All the quantifiers (except {0,0},
208                        # which we conveniently ignore) that have a 0 minimum
209                        # also allow a single occurrence.
210                        result[-1] = Choice([None, result[-1]])
211                    else:
212                        result.pop()
213                elif count > 1:
214                    result.extend([result[-1]] * (count - 1))
215            else:
216                # Anything else is a literal.
217                result.append(ch)
218
219            if consume_next:
220                ch, escaped = next(pattern_iter)
221            else:
222                consume_next = True
223    except StopIteration:
224        pass
225    except NotImplementedError:
226        # A case of using the disjunctive form. No results for you!
227        return list(zip([u''],  [[]]))
228
229    return list(zip(*flatten_result(result)))
230
231
232def next_char(input_iter):
233    """
234    An iterator that yields the next character from "pattern_iter", respecting
235    escape sequences. An escaped character is replaced by a representative of
236    its class (e.g. \w -> "x"). If the escaped character is one that is
237    skipped, it is not returned (the next character is returned instead).
238
239    Yields the next character, along with a boolean indicating whether it is a
240    raw (unescaped) character or not.
241    """
242    for ch in input_iter:
243        if ch != '\\':
244            yield ch, False
245            continue
246        ch = next(input_iter)
247        representative = ESCAPE_MAPPINGS.get(ch, ch)
248        if representative is None:
249            continue
250        yield representative, True
251
252
253def walk_to_end(ch, input_iter):
254    """
255    The iterator is currently inside a capturing group. We want to walk to the
256    close of this group, skipping over any nested groups and handling escaped
257    parentheses correctly.
258    """
259    if ch == '(':
260        nesting = 1
261    else:
262        nesting = 0
263    for ch, escaped in input_iter:
264        if escaped:
265            continue
266        elif ch == '(':
267            nesting += 1
268        elif ch == ')':
269            if not nesting:
270                return
271            nesting -= 1
272
273
274def get_quantifier(ch, input_iter):
275    """
276    Parse a quantifier from the input, where "ch" is the first character in the
277    quantifier.
278
279    Returns the minimum number of occurences permitted by the quantifier and
280    either None or the next character from the input_iter if the next character
281    is not part of the quantifier.
282    """
283    if ch in '*?+':
284        try:
285            ch2, escaped = next(input_iter)
286        except StopIteration:
287            ch2 = None
288        if ch2 == '?':
289            ch2 = None
290        if ch == '+':
291            return 1, ch2
292        return 0, ch2
293
294    quant = []
295    while ch != '}':
296        ch, escaped = next(input_iter)
297        quant.append(ch)
298    quant = quant[:-1]
299    values = ''.join(quant).split(',')
300
301    # Consume the trailing '?', if necessary.
302    try:
303        ch, escaped = next(input_iter)
304    except StopIteration:
305        ch = None
306    if ch == '?':
307        ch = None
308    return int(values[0]), ch
309
310
311def contains(source, inst):
312    """
313    Returns True if the "source" contains an instance of "inst". False,
314    otherwise.
315    """
316    if isinstance(source, inst):
317        return True
318    if isinstance(source, NonCapture):
319        for elt in source:
320            if contains(elt, inst):
321                return True
322    return False
323
324
325def flatten_result(source):
326    """
327    Turns the given source sequence into a list of reg-exp possibilities and
328    their arguments. Returns a list of strings and a list of argument lists.
329    Each of the two lists will be of the same length.
330    """
331    if source is None:
332        return [u''], [[]]
333    if isinstance(source, Group):
334        if source[1] is None:
335            params = []
336        else:
337            params = [source[1]]
338        return [source[0]], [params]
339    result = [u'']
340    result_args = [[]]
341    pos = last = 0
342    for pos, elt in enumerate(source):
343        if isinstance(elt, basestring):
344            continue
345        piece = u''.join(source[last:pos])
346        if isinstance(elt, Group):
347            piece += elt[0]
348            param = elt[1]
349        else:
350            param = None
351        last = pos + 1
352        for i in range(len(result)):
353            result[i] += piece
354            if param:
355                result_args[i].append(param)
356        if isinstance(elt, (Choice, NonCapture)):
357            if isinstance(elt, NonCapture):
358                elt = [elt]
359            inner_result, inner_args = [], []
360            for item in elt:
361                res, args = flatten_result(item)
362                inner_result.extend(res)
363                inner_args.extend(args)
364            new_result = []
365            new_args = []
366            for item, args in zip(result, result_args):
367                for i_item, i_args in zip(inner_result, inner_args):
368                    new_result.append(item + i_item)
369                    new_args.append(args[:] + i_args)
370            result = new_result
371            result_args = new_args
372    if pos >= last:
373        piece = u''.join(source[last:])
374        for i in range(len(result)):
375            result[i] += piece
376    return result, result_args
377