1# -*- coding: utf-8 -*-
2# Copyright 2009-2013, Peter A. Bigot
3# Copyright 2012, Jon Foster
4#
5# Licensed under the Apache License, Version 2.0 (the "License"); you may
6# not use this file except in compliance with the License. You may obtain a
7# copy of the License at:
8#
9#            http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14# License for the specific language governing permissions and limitations
15# under the License.
16
17"""Support for regular expressions conformant to the XML Schema specification.
18
19For the most part, XML regular expressions are similar to the POSIX
20ones, and can be handled by the Python C{re} module.  The exceptions
21are for multi-character (C{\w}) and category escapes (e.g., C{\p{N}} or
22C{\p{IPAExtensions}}) and the character set subtraction capability.
23This module supports those by scanning the regular expression,
24replacing the category escapes with equivalent charset expressions.
25It further detects the subtraction syntax and modifies the charset
26expression to remove the unwanted code points.
27
28The basic technique is to step through the characters of the regular
29expression, entering a recursive-descent parser when one of the
30translated constructs is encountered.
31
32There is a nice set of XML regular expressions at
33U{http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xsd},
34with a sample document at U{
35http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xml}"""
36
37import re
38import logging
39import pyxb.utils.unicode
40from pyxb.utils import six
41
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)

# _AllEsc maps every escape code and wildcard usable in an XML schema
# regular expression into the corresponding CodePointSet.  Keys are the
# text exactly as it appears in a pattern: the wildcard "." or a backslash
# followed by the escape identifier.  The dictionary starts out empty and
# is filled in by _InitializeAllEsc() when the module is loaded.
_AllEsc = { }
47
def _InitializeAllEsc ():
    """Fill in L{_AllEsc} from the escape tables of C{pyxb.utils.unicode}.

    The work is done inside a function so that the loop variables do not
    become module-level names.  The wildcard C{.} is registered as is;
    every key of the escape tables is registered with a leading backslash.
    Tables are processed in a fixed order, so if the same key occurs in
    more than one table the entry from the later table wins.
    """

    backslash = six.unichr(0x5c)
    _AllEsc[six.u('.')] = pyxb.utils.unicode.WildcardEsc
    escape_tables = (
        pyxb.utils.unicode.SingleCharEsc,
        pyxb.utils.unicode.MultiCharEsc,
        pyxb.utils.unicode.catEsc,
        pyxb.utils.unicode.complEsc,
        pyxb.utils.unicode.IsBlockEsc,
    )
    for table in escape_tables:
        for code, code_points in six.iteritems(table):
            _AllEsc[backslash + six.text_type(code)] = code_points
_InitializeAllEsc()
65
class RegularExpressionError (ValueError):
    """Raised when a regular expression cannot be processed.

    The exception message has the form C{At <position>: <description>}.

    @ivar position: The C{position} value that was passed to the
    constructor.
    """
    def __init__ (self, position, description):
        # Build the message first; it is the only argument handed to the
        # ValueError base class.
        message = 'At %d: %s' % (position, description)
        ValueError.__init__(self, message)
        self.position = position
71
# Matches a backslash followed by either a property escape (p or P, then a
# brace-enclosed name) or any single character other than p or P.
_CharClassEsc_re = re.compile(r'\\(?:(?P<cgProp>[pP]{(?P<charProp>[-A-Za-z0-9]+)})|(?P<cgClass>[^pP]))')
def _MatchCharClassEsc(text, position):
    r"""Parse a U{charClassEsc<http://www.w3.org/TR/xmlschema-2/#nt-charClassEsc>} term.

    The term starting at C{position} is expected to be one of:

      - a U{SingleCharEsc<http://www.w3.org/TR/xmlschema-2/#nt-SingleCharEsc>},
      i.e. an escaped single character such as C{E{\}n}

      - a U{MultiCharEsc<http://www.w3.org/TR/xmlschema-2/#nt-MultiCharEsc>},
      i.e. an escape code standing for a range of characters, such as
      C{E{\}s} for certain whitespace characters

      - a U{catEsc<http://www.w3.org/TR/xmlschema-2/#nt-catEsc>}, the
      C{E{\}pE{lb}...E{rb}} Unicode property escapes (categories and
      blocks)

      - a U{complEsc<http://www.w3.org/TR/xmlschema-2/#nt-complEsc>},
      the C{E{\}PE{lb}...E{rb}} inverted Unicode property escapes

    The matched escape text is looked up in L{_AllEsc}.

    @param text: The complete text of the regular expression being
    translated.

    @param position: The offset of the backslash that starts the escape.

    @return: A pair C{(cps, p)} where C{cps} is the
    L{pyxb.utils.unicode.CodePointSet} registered for the escape, and
    C{p} is the text offset immediately following the escape sequence.

    @raise RegularExpressionError: if the text at C{position} is not
    shaped like an escape, or the escape is not a known one.
    """

    match = _CharClassEsc_re.match(text, position)
    if match is None:
        raise RegularExpressionError(position, "Unrecognized escape identifier at %s" % (text[position:],))

    escape = match.group(0)
    code_points = _AllEsc.get(escape)
    if code_points is not None:
        return (code_points, match.end())

    # Unknown escape: choose the most specific diagnostic available.
    property_name = match.group('charProp')
    if property_name is None:
        raise RegularExpressionError(position, 'Unrecognized character class %s' % (escape,))
    if property_name.startswith('Is'):
        raise RegularExpressionError(position, 'Unrecognized Unicode block %s in %s' % (property_name[2:], escape))
    raise RegularExpressionError(position, 'Unrecognized character property %s' % (escape,))
116
def _MatchPosCharGroup(text, position):
    '''Parse a U{posCharGroup<http://www.w3.org/TR/xmlschema-2/#nt-posCharGroup>} term.

    Parsing happens in two passes: the group is first split into tokens
    (literal characters, code point sets coming from escapes, and plain
    dashes), then the tokens are combined into single characters, ranges
    and sets.

    @param text: The complete text of the regular expression being
    translated.

    @param position: The offset of the first character of the group, i.e.
    just past the opening C{[} (and the C{^}, if any).

    @return: A tuple C{(cps, fs, p)} where:
      - C{cps} is a L{pyxb.utils.unicode.CodePointSet} containing the code points associated with the group;
      - C{fs} is a C{bool} that is C{True} if the group is followed by the C{-[} of a U{charClassSub<http://www.w3.org/TR/xmlschema-2/#nt-charClassSub>} and C{False} if the group is not part of a charClassSub;
      - C{p} is the text offset at which parsing stopped: the offset of the closing C{]}, or of the C{-} that introduces the subtraction when C{fs} is C{True}.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    '''

    group_start = position
    text_length = len(text)

    open_bracket = six.u('[')
    close_bracket = six.u(']')
    backslash = six.unichr(0x5c)
    literal_dash = six.u('-')

    # Unique marker for an unescaped "-".  An escaped dash is tokenized as
    # the ordinary character u"-", which is how the two are told apart.
    DASH = object()

    # Pass 1: tokenize.
    items = []
    subtraction_follows = False
    while True:
        if position >= text_length:
            raise RegularExpressionError(position, "Incomplete character class expression, missing closing ']'")
        current = text[position]
        if current == close_bracket:
            # End of the group; the bracket itself is left unconsumed.
            break
        if current == open_bracket:
            # A nested "[" is legal only as the start of a subtraction,
            # i.e. directly after an unescaped dash.
            if not (items and items[-1] is DASH):
                raise RegularExpressionError(position, "'[' character not allowed in character class")
            # The "-[" does not belong to the posCharGroup: give the dash
            # back to the caller.
            items.pop()
            position -= 1
            subtraction_follows = True
            break
        if current == backslash:
            escaped_cps, position = _MatchCharClassEsc(text, position)
            literal = escaped_cps.asSingleCharacter()
            items.append(escaped_cps if literal is None else literal)
            continue
        items.append(DASH if current == literal_dash else current)
        position += 1

    if not items:
        raise RegularExpressionError(position, "Empty character class not allowed")

    # A dash at either end of the group can only be a literal dash.
    for edge in (0, -1):
        if items[edge] is DASH:
            items[edge] = literal_dash

    # Pass 2: assemble characters, ranges and sets.
    group_cps = pyxb.utils.unicode.CodePointSet()
    item_count = len(items)
    index = 0
    while index < item_count:
        low = items[index]
        is_range = index + 2 < item_count and items[index + 1] is DASH
        if not is_range:
            if low is DASH:
                raise RegularExpressionError(group_start, 'Dash without an initial character')
            if isinstance(low, six.text_type):
                group_cps.add(ord(low))
            else:
                group_cps.extend(low)
            index += 1
            continue

        high = items[index + 2]
        if not (isinstance(low, six.text_type) and isinstance(high, six.text_type)):
            if low is DASH or high is DASH:
                raise RegularExpressionError(group_start, 'Two dashes in a row is not allowed in the middle of a character class.')
            raise RegularExpressionError(group_start, 'Dashes must be surrounded by characters, not character class escapes. %r %r' %(low, high))
        if low > high:
            raise RegularExpressionError(group_start, 'Character ranges must have the lowest character first')
        group_cps.add((ord(low), ord(high)))
        index += 3

    return group_cps, subtraction_follows, position
205
def _MatchCharClassExpr(text, position):
    '''Parse a U{charClassExpr<http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}.

    These are XML regular expression classes such as C{[abc]}, C{[a-c]}, C{[^abc]}, or C{[a-z-[q]]}.

    A leading C{^} negates the group.  A trailing C{-[...]} is parsed
    recursively and its code points are removed from the (possibly
    negated) group.

    @param text: The complete text of the regular expression being
    translated.

    @param position: The offset of the C{[} that starts the character
    class.

    @return: A pair C{(cps, p)} where C{cps} is a
    L{pyxb.utils.unicode.CodePointSet} containing the code points
    associated with the class, and C{p} is the text offset
    immediately following the closing C{]}.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    '''
    text_length = len(text)
    if position >= text_length:
        raise RegularExpressionError(position, 'Missing character class expression')
    opener = text[position]
    if opener != six.u('['):
        raise RegularExpressionError(position, "Expected start of character class expression, got '%s'" % (opener,))
    position += 1
    if position >= text_length:
        raise RegularExpressionError(position, 'Missing character class expression')

    is_negated = text[position] == '^'
    if is_negated:
        position += 1

    group_cps, subtraction_follows, position = _MatchPosCharGroup(text, position)
    if is_negated:
        group_cps = group_cps.negate()

    if subtraction_follows:
        # _MatchPosCharGroup stops on the "-" of "-[": sanity-check that,
        # then parse the subtracted class starting at its "[".
        assert text[position] == six.u('-')
        assert text[position + 1] == six.u('[')
        removed_cps, position = _MatchCharClassExpr(text, position + 1)
        group_cps.subtract(removed_cps)

    if not (position < text_length and text[position] == six.u(']')):
        raise RegularExpressionError(position, "Expected ']' to end character class")
    return group_cps, position + 1
251
def MaybeMatchCharacterClass (text, position):
    """Attempt to match a U{character class expression
    <http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}.

    Three constructs are recognized, based on the character found at
    C{position}: the wildcard C{.}, a bracketed class starting with
    C{[}, and an escape starting with a backslash.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the start of the potential
    expression.

    @return: C{None} if C{position} is at or past the end of C{text} or
    does not begin a character class expression; otherwise a pair
    C{(cps, p)} where C{cps} is a L{pyxb.utils.unicode.CodePointSet}
    containing the code points associated with the expression, and C{p}
    is the text offset immediately following the expression.

    @raise RegularExpressionError: if a bracketed class or an escape
    starts at C{position} but is syntactically invalid."""
    if position >= len(text):
        return None
    lead = text[position]
    if lead == '[':
        return _MatchCharClassExpr(text, position)
    if lead == '\\':
        return _MatchCharClassEsc(text, position)
    if lead == '.':
        # The wildcard is a single character long.
        return (pyxb.utils.unicode.WildcardEsc, position + 1)
    return None
278
def XMLToPython (pattern):
    """Convert the given pattern to the format required for Python
    regular expressions.

    The pattern is scanned left to right.  Wherever
    L{MaybeMatchCharacterClass} recognizes a wildcard, a bracketed
    character class or an escape, that construct is replaced by the
    pattern text produced by the C{asPattern()} method of the resulting
    code point set.  The characters C{^} and C{$} are escaped with a
    backslash; every other character is copied unchanged.

    The result is wrapped in a group that is anchored at both ends, so
    that it only matches complete strings.

    @param pattern: A Unicode string defining a pattern consistent
    with U{XML regular
    expressions<http://www.w3.org/TR/xmlschema-2/index.html#regexs>}.

    @return: A Unicode string specifying a Python regular expression
    that matches the same language as C{pattern}.

    @raise RegularExpressionError: if a character class or escape in
    C{pattern} is syntactically invalid or not recognized."""
    assert isinstance(pattern, six.text_type)
    backslash = six.unichr(0x5c)
    python_only_anchors = (six.u('^'), six.u('$'))
    new_pattern_elts = ['^(']
    position = 0
    pattern_length = len(pattern)
    while position < pattern_length:
        cg = MaybeMatchCharacterClass(pattern, position)
        if cg is None:
            ch = pattern[position]
            if ch in python_only_anchors:
                # These characters have no special meaning in XSD.  But they
                # match start and end of string in Python, so they have to
                # be escaped.
                new_pattern_elts.append(backslash + ch)
            else:
                new_pattern_elts.append(ch)
            position += 1
        else:
            (cps, position) = cg
            new_pattern_elts.append(cps.asPattern())
    # Anchor with \Z rather than $: in Python, $ also matches just before
    # a newline at the end of the string, which would let a value with a
    # trailing newline match a pattern that does not allow one.
    new_pattern_elts.append(r')\Z')
    return ''.join(new_pattern_elts)
310