# -*- coding: utf-8 -*-
# Copyright 2009-2013, Peter A. Bigot
# Copyright 2012, Jon Foster
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain a
# copy of the License at:
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

r"""Support for regular expressions conformant to the XML Schema specification.

For the most part, XML regular expressions are similar to the POSIX
ones, and can be handled by the Python C{re} module.  The exceptions
are for multi-character (C{\w}) and category escapes (e.g., C{\p{N}} or
C{\p{IPAExtensions}}) and the character set subtraction capability.
This module supports those by scanning the regular expression,
replacing the category escapes with equivalent charset expressions.
It further detects the subtraction syntax and modifies the charset
expression to remove the unwanted code points.

The basic technique is to step through the characters of the regular
expression, entering a recursive-descent parser when one of the
translated constructs is encountered.

There is a nice set of XML regular expressions at
U{http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xsd},
with a sample document at U{
http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xml}"""

import re
import logging
import pyxb.utils.unicode
from pyxb.utils import six

_log = logging.getLogger(__name__)

# _AllEsc maps all the possible escape codes and wildcards in an XML schema
# regular expression into the corresponding CodePointSet.
_AllEsc = { }

def _InitializeAllEsc ():
    """Populate L{_AllEsc} without introducing loop variables into the
    module namespace.

    The wildcard C{.} is registered as-is.  Every key of the escape
    tables in L{pyxb.utils.unicode} is registered with a leading
    backslash, so that the full text of an escape sequence as it appears
    in a pattern can be used directly as the lookup key."""

    _AllEsc.update({ six.u('.'): pyxb.utils.unicode.WildcardEsc })
    bs = six.unichr(0x5c)
    # Tables are applied in this order; a later table overrides an earlier
    # one should they ever share a key.
    escape_tables = (
        pyxb.utils.unicode.SingleCharEsc,
        pyxb.utils.unicode.MultiCharEsc,
        pyxb.utils.unicode.catEsc,
        pyxb.utils.unicode.complEsc,
        pyxb.utils.unicode.IsBlockEsc,
    )
    for table in escape_tables:
        for k, v in six.iteritems(table):
            _AllEsc[bs + six.text_type(k)] = v
_InitializeAllEsc()

class RegularExpressionError (ValueError):
    """Raised when a regular expression cannot be processed.

    @ivar position: The offset within the pattern at which the problem
    was detected."""
    def __init__ (self, position, description):
        self.position = position
        ValueError.__init__(self, 'At %d: %s' % (position, description))

# A backslash followed by either a property escape (p{...} / P{...}) or any
# single character other than p/P.
_CharClassEsc_re = re.compile(r'\\(?:(?P<cgProp>[pP]{(?P<charProp>[-A-Za-z0-9]+)})|(?P<cgClass>[^pP]))')
def _MatchCharClassEsc(text, position):
    r"""Parse a U{charClassEsc<http://www.w3.org/TR/xmlschema-2/#nt-charClassEsc>} term.

    This is one of:

      - U{SingleCharEsc<http://www.w3.org/TR/xmlschema-2/#nt-SingleCharEsc>},
        an escaped single character such as C{E{\}n}

      - U{MultiCharEsc<http://www.w3.org/TR/xmlschema-2/#nt-MultiCharEsc>},
        an escape code that can match a range of characters,
        e.g. C{E{\}s} to match certain whitespace characters

      - U{catEsc<http://www.w3.org/TR/xmlschema-2/#nt-catEsc>}, the
        C{E{\}pE{lb}...E{rb}} Unicode property escapes including
        categories and blocks

      - U{complEsc<http://www.w3.org/TR/xmlschema-2/#nt-complEsc>},
        the C{E{\}PE{lb}...E{rb}} inverted Unicode property escapes

    @param text: The complete text of the regular expression being
    translated.

    @param position: The offset of the backslash that starts the escape.

    @return: A pair C{(cps, p)} where C{cps} is the value registered in
    L{_AllEsc} for the escape (a L{pyxb.utils.unicode.CodePointSet}
    containing the code points associated with the character class), and
    C{p} is the text offset immediately following the escape sequence.

    @raise RegularExpressionError: if the text at C{position} is not a
    well-formed escape, or is well-formed but names an unknown block,
    property or character class.
    """

    mo = _CharClassEsc_re.match(text, position)
    if mo:
        escape_code = mo.group(0)
        cps = _AllEsc.get(escape_code)
        if cps is not None:
            return (cps, mo.end())
        # Syntactically valid but unknown: pick the most specific message.
        char_prop = mo.group('charProp')
        if char_prop is not None:
            if char_prop.startswith('Is'):
                raise RegularExpressionError(position, 'Unrecognized Unicode block %s in %s' % (char_prop[2:], escape_code))
            raise RegularExpressionError(position, 'Unrecognized character property %s' % (escape_code,))
        raise RegularExpressionError(position, 'Unrecognized character class %s' % (escape_code,))
    raise RegularExpressionError(position, "Unrecognized escape identifier at %s" % (text[position:],))

def _MatchPosCharGroup(text, position):
    '''Parse a U{posCharGroup<http://www.w3.org/TR/xmlschema-2/#nt-posCharGroup>} term.

    @param text: The complete text of the regular expression being
    translated.

    @param position: The offset of the first character of the group, i.e.
    just past the opening C{[} (and past the C{^} of a negated group).

    @return: A tuple C{(cps, fs, p)} where:
      - C{cps} is a L{pyxb.utils.unicode.CodePointSet} containing the code points associated with the group;
      - C{fs} is a C{bool} that is C{True} if the next character is the C{-} in a U{charClassSub<http://www.w3.org/TR/xmlschema-2/#nt-charClassSub>} and C{False} if the group is not part of a charClassSub;
      - C{p} is the text offset of the character that terminated the
        group: the closing C{]} when C{fs} is C{False}, or the C{-} that
        introduces the subtraction when C{fs} is C{True}.  The terminator
        itself is not consumed.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    '''

    start_position = position

    # DASH is just some unique object, used as a marker for an unescaped
    # "-".  It can't be unicode or a CodePointSet.
    DASH = object()

    # We tokenize first, then go back and stick the ranges together.
    tokens = []
    has_following_subtraction = False
    while True:
        if position >= len(text):
            raise RegularExpressionError(position, "Incomplete character class expression, missing closing ']'")
        ch = text[position]
        if ch == six.u('['):
            # Only allowed if this is a subtraction
            if not tokens or tokens[-1] is not DASH:
                raise RegularExpressionError(position, "'[' character not allowed in character class")
            has_following_subtraction = True
            # For a character class subtraction, the "-[" are not part of the
            # posCharGroup, so undo reading the dash
            tokens.pop()
            position = position - 1
            break
        elif ch == six.u(']'):
            # End
            break
        elif ch == six.unichr(0x5c): # backslash
            cps, position = _MatchCharClassEsc(text, position)
            # An escape that denotes exactly one character is stored as that
            # character so it can take part in a range; anything else is
            # kept as a CodePointSet.
            single_char = cps.asSingleCharacter()
            if single_char is not None:
                tokens.append(single_char)
            else:
                tokens.append(cps)
        elif ch == six.u('-'):
            # We need to distinguish between "-" and "\\-".  So we use
            # DASH for a plain "-", and u"-" for a "\\-".
            tokens.append(DASH)
            position = position + 1
        else:
            tokens.append(ch)
            position = position + 1

    if not tokens:
        raise RegularExpressionError(position, "Empty character class not allowed")

    # At the start or end of the character group, a dash has to be a literal
    if tokens[0] is DASH:
        tokens[0] = six.u('-')
    if tokens[-1] is DASH:
        tokens[-1] = six.u('-')
    result_cps = pyxb.utils.unicode.CodePointSet()
    cur_token = 0
    while cur_token < len(tokens):
        start = tokens[cur_token]
        if cur_token + 2 < len(tokens) and tokens[cur_token + 1] is DASH:
            # "start DASH end": a character range.
            end = tokens[cur_token + 2]
            if not isinstance(start, six.text_type) or not isinstance(end, six.text_type):
                if start is DASH or end is DASH:
                    raise RegularExpressionError(start_position, 'Two dashes in a row is not allowed in the middle of a character class.')
                raise RegularExpressionError(start_position, 'Dashes must be surrounded by characters, not character class escapes. %r %r' %(start, end))
            if start > end:
                raise RegularExpressionError(start_position, 'Character ranges must have the lowest character first')
            result_cps.add((ord(start), ord(end)))
            cur_token = cur_token + 3
        else:
            if start is DASH:
                raise RegularExpressionError(start_position, 'Dash without an initial character')
            elif isinstance(start, six.text_type):
                result_cps.add(ord(start))
            else:
                result_cps.extend(start)
            cur_token = cur_token + 1

    return result_cps, has_following_subtraction, position

def _MatchCharClassExpr(text, position):
    '''Parse a U{charClassExpr<http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}.

    These are XML regular expression classes such as C{[abc]}, C{[a-c]}, C{[^abc]}, or C{[a-z-[q]]}.

    @param text: The complete text of the regular expression being
    translated.  The character at C{position} must be the C{[} starting a
    character class.

    @param position: The offset of the start of the character group.

    @return: A pair C{(cps, p)} where C{cps} is a
    L{pyxb.utils.unicode.CodePointSet} containing the code points
    associated with the expression, and C{p} is the text offset
    immediately following the closing bracket.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    '''
    if position >= len(text):
        raise RegularExpressionError(position, 'Missing character class expression')
    if six.u('[') != text[position]:
        raise RegularExpressionError(position, "Expected start of character class expression, got '%s'" % (text[position],))
    position = position + 1
    if position >= len(text):
        raise RegularExpressionError(position, 'Missing character class expression')
    negated = (text[position] == '^')
    if negated:
        position = position + 1

    result_cps, has_following_subtraction, position = _MatchPosCharGroup(text, position)

    # Negation applies to the group itself, before any subtraction.
    if negated:
        result_cps = result_cps.negate()

    if has_following_subtraction:
        # _MatchPosCharGroup leaves position on the "-" of "-[".
        assert text[position] == six.u('-')
        assert text[position + 1] == six.u('[')
        position = position + 1
        sub_cps, position = _MatchCharClassExpr(text, position)
        result_cps.subtract(sub_cps)

    if position >= len(text) or text[position] != six.u(']'):
        raise RegularExpressionError(position, "Expected ']' to end character class")
    return result_cps, position + 1

def MaybeMatchCharacterClass (text, position):
    """Attempt to match a U{character class expression
    <http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}.

    Three constructs are recognized: the wildcard C{.}, a bracketed
    character class starting with C{[}, and an escape starting with a
    backslash.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the start of the potential
    expression.

    @return: C{None} if C{position} does not begin a character class
    expression (including when it is at or past the end of C{text});
    otherwise a pair C{(cps, p)} where C{cps} is a
    L{pyxb.utils.unicode.CodePointSet} containing the code points
    associated with the expression, and C{p} is the text offset
    immediately following it.

    @raise RegularExpressionError: if a bracketed class or an escape
    starts at C{position} but is not valid."""
    if position >= len(text):
        return None
    c = text[position]
    np = position + 1
    if '.' == c:
        return (pyxb.utils.unicode.WildcardEsc, np)
    if '[' == c:
        return _MatchCharClassExpr(text, position)
    if '\\' == c:
        return _MatchCharClassEsc(text, position)
    return None

def XMLToPython (pattern):
    """Convert the given pattern to the format required for Python
    regular expressions.

    Wildcards, escapes and bracketed character classes are replaced by
    the C{asPattern()} rendering of their code point set; C{^} and C{$}
    are escaped; every other character is copied through unchanged.  The
    result is wrapped so that it only matches a complete string.

    @param pattern: A Unicode string defining a pattern consistent
    with U{XML regular
    expressions<http://www.w3.org/TR/xmlschema-2/index.html#regexs>}.

    @return: A Unicode string specifying a Python regular expression
    that matches the same language as C{pattern}.

    @raise RegularExpressionError: if C{pattern} contains an invalid
    escape or character class."""
    assert isinstance(pattern, six.text_type)
    new_pattern_elts = []
    new_pattern_elts.append('^(')
    position = 0
    while position < len(pattern):
        cg = MaybeMatchCharacterClass(pattern, position)
        if cg is None:
            ch = pattern[position]
            if ch == six.u('^') or ch == six.u('$'):
                # These characters have no special meaning in XSD.  But they
                # match start and end of string in Python, so they have to
                # be escaped.
                new_pattern_elts.append(six.unichr(0x5c) + ch)
            else:
                new_pattern_elts.append(ch)
            position += 1
        else:
            (cps, position) = cg
            new_pattern_elts.append(cps.asPattern())
    # Anchor with \Z rather than $: in Python "$" also matches just before
    # a trailing newline, which would let "abc\n" satisfy the pattern "abc".
    # An XSD pattern must match the entire value.
    new_pattern_elts.append(')\\Z')
    return ''.join(new_pattern_elts)