1# 2# Secret Labs' Regular Expression Engine 3# 4# convert re-style regular expression to sre pattern 5# 6# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 7# 8# See the sre.py file for information on usage and redistribution. 9# 10 11"""Internal support module for sre""" 12 13# XXX: show string offset and offending character for all errors 14 15from sre_constants import * 16 17SPECIAL_CHARS = ".\\[{()*+?^$|" 18REPEAT_CHARS = "*+?{" 19 20DIGITS = frozenset("0123456789") 21 22OCTDIGITS = frozenset("01234567") 23HEXDIGITS = frozenset("0123456789abcdefABCDEF") 24ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") 25 26WHITESPACE = frozenset(" \t\n\r\v\f") 27 28_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT}) 29_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) 30 31ESCAPES = { 32 r"\a": (LITERAL, ord("\a")), 33 r"\b": (LITERAL, ord("\b")), 34 r"\f": (LITERAL, ord("\f")), 35 r"\n": (LITERAL, ord("\n")), 36 r"\r": (LITERAL, ord("\r")), 37 r"\t": (LITERAL, ord("\t")), 38 r"\v": (LITERAL, ord("\v")), 39 r"\\": (LITERAL, ord("\\")) 40} 41 42CATEGORIES = { 43 r"\A": (AT, AT_BEGINNING_STRING), # start of string 44 r"\b": (AT, AT_BOUNDARY), 45 r"\B": (AT, AT_NON_BOUNDARY), 46 r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), 47 r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), 48 r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), 49 r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), 50 r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), 51 r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), 52 r"\Z": (AT, AT_END_STRING), # end of string 53} 54 55FLAGS = { 56 # standard flags 57 "i": SRE_FLAG_IGNORECASE, 58 "L": SRE_FLAG_LOCALE, 59 "m": SRE_FLAG_MULTILINE, 60 "s": SRE_FLAG_DOTALL, 61 "x": SRE_FLAG_VERBOSE, 62 # extensions 63 "a": SRE_FLAG_ASCII, 64 "t": SRE_FLAG_TEMPLATE, 65 "u": SRE_FLAG_UNICODE, 66} 67 68TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE 69GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE 70 71class Verbose(Exception): 72 pass 73 74class Pattern: 75 # master pattern object. keeps track of global attributes 76 def __init__(self): 77 self.flags = 0 78 self.groupdict = {} 79 self.groupwidths = [None] # group 0 80 self.lookbehindgroups = None 81 @property 82 def groups(self): 83 return len(self.groupwidths) 84 def opengroup(self, name=None): 85 gid = self.groups 86 self.groupwidths.append(None) 87 if self.groups > MAXGROUPS: 88 raise error("too many groups") 89 if name is not None: 90 ogid = self.groupdict.get(name, None) 91 if ogid is not None: 92 raise error("redefinition of group name %r as group %d; " 93 "was group %d" % (name, gid, ogid)) 94 self.groupdict[name] = gid 95 return gid 96 def closegroup(self, gid, p): 97 self.groupwidths[gid] = p.getwidth() 98 def checkgroup(self, gid): 99 return gid < self.groups and self.groupwidths[gid] is not None 100 101 def checklookbehindgroup(self, gid, source): 102 if self.lookbehindgroups is not None: 103 if not self.checkgroup(gid): 104 raise source.error('cannot refer to an open group') 105 if gid >= self.lookbehindgroups: 106 raise source.error('cannot refer to group defined in the same ' 107 'lookbehind subpattern') 108 109class SubPattern: 110 # a subpattern, in intermediate form 111 def __init__(self, pattern, data=None): 112 self.pattern = pattern 113 if data is None: 114 data = [] 115 self.data = data 116 self.width = None 117 118 def dump(self, level=0): 119 nl = True 120 seqtypes = (tuple, list) 121 for op, av in self.data: 122 print(level*" " + str(op), end='') 123 if op is IN: 124 # member sublanguage 125 print() 126 for op, a in av: 127 print((level+1)*" " + str(op), a) 128 elif op is BRANCH: 129 print() 130 for i, a in enumerate(av[1]): 131 if i: 132 print(level*" " + "OR") 133 a.dump(level+1) 134 elif op is GROUPREF_EXISTS: 135 condgroup, item_yes, item_no = av 136 print('', condgroup) 137 item_yes.dump(level+1) 138 if item_no: 139 print(level*" " + "ELSE") 140 item_no.dump(level+1) 141 elif isinstance(av, seqtypes): 142 nl = False 143 for a in av: 144 if isinstance(a, SubPattern): 145 if not nl: 146 print() 147 a.dump(level+1) 148 nl = True 149 else: 150 if not nl: 151 print(' ', end='') 152 print(a, end='') 153 nl = False 154 if not nl: 155 print() 156 else: 157 print('', av) 158 def __repr__(self): 159 return repr(self.data) 160 def __len__(self): 161 return len(self.data) 162 def __delitem__(self, index): 163 del self.data[index] 164 def __getitem__(self, index): 165 if isinstance(index, slice): 166 return SubPattern(self.pattern, self.data[index]) 167 return self.data[index] 168 def __setitem__(self, index, code): 169 self.data[index] = code 170 def insert(self, index, code): 171 self.data.insert(index, code) 172 def append(self, code): 173 self.data.append(code) 174 def getwidth(self): 175 # determine the width (min, max) for this subpattern 176 if self.width is not None: 177 return self.width 178 lo = hi = 0 179 for op, av in self.data: 180 if op is BRANCH: 181 i = MAXREPEAT - 1 182 j = 0 183 for av in av[1]: 184 l, h = av.getwidth() 185 i = min(i, l) 186 j = max(j, h) 187 lo = lo + i 188 hi = hi + j 189 elif op is CALL: 190 i, j = av.getwidth() 191 lo = lo + i 192 hi = hi + j 193 elif op is SUBPATTERN: 194 i, j = av[-1].getwidth() 195 lo = lo + i 196 hi = hi + j 197 elif op in _REPEATCODES: 198 i, j = av[2].getwidth() 199 lo = lo + i * av[0] 200 hi = hi + j * av[1] 201 elif op in _UNITCODES: 202 lo = lo + 1 203 hi = hi + 1 204 elif op is GROUPREF: 205 i, j = self.pattern.groupwidths[av] 206 lo = lo + i 207 hi = hi + j 208 elif op is GROUPREF_EXISTS: 209 i, j = av[1].getwidth() 210 if av[2] is not None: 211 l, h = av[2].getwidth() 212 i = min(i, l) 213 j = max(j, h) 214 else: 215 i = 0 216 lo = lo + i 217 hi = hi + j 218 elif op is SUCCESS: 219 break 220 self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) 221 return self.width 222 223class Tokenizer: 224 def __init__(self, string): 225 self.istext = isinstance(string, str) 226 self.string = string 227 if not self.istext: 228 string = str(string, 'latin1') 229 self.decoded_string = string 230 self.index = 0 231 self.next = None 232 self.__next() 233 def __next(self): 234 index = self.index 235 try: 236 char = self.decoded_string[index] 237 except IndexError: 238 self.next = None 239 return 240 if char == "\\": 241 index += 1 242 try: 243 char += self.decoded_string[index] 244 except IndexError: 245 raise error("bad escape (end of pattern)", 246 self.string, len(self.string) - 1) from None 247 self.index = index + 1 248 self.next = char 249 def match(self, char): 250 if char == self.next: 251 self.__next() 252 return True 253 return False 254 def get(self): 255 this = self.next 256 self.__next() 257 return this 258 def getwhile(self, n, charset): 259 result = '' 260 for _ in range(n): 261 c = self.next 262 if c not in charset: 263 break 264 result += c 265 self.__next() 266 return result 267 def getuntil(self, terminator): 268 result = '' 269 while True: 270 c = self.next 271 self.__next() 272 if c is None: 273 if not result: 274 raise self.error("missing group name") 275 raise self.error("missing %s, unterminated name" % terminator, 276 len(result)) 277 if c == terminator: 278 if not result: 279 raise self.error("missing group name", 1) 280 break 281 result += c 282 return result 283 @property 284 def pos(self): 285 return self.index - len(self.next or '') 286 def tell(self): 287 return self.index - len(self.next or '') 288 def seek(self, index): 289 self.index = index 290 self.__next() 291 292 def error(self, msg, offset=0): 293 return error(msg, self.string, self.tell() - offset) 294 295def _class_escape(source, escape): 296 # handle escape code inside character class 297 code = ESCAPES.get(escape) 298 if code: 299 return code 300 code = CATEGORIES.get(escape) 301 if code and code[0] is IN: 302 return code 303 try: 304 c = escape[1:2] 305 if c == "x": 306 # hexadecimal escape (exactly two digits) 307 escape += source.getwhile(2, HEXDIGITS) 308 if len(escape) != 4: 309 raise source.error("incomplete escape %s" % escape, len(escape)) 310 return LITERAL, int(escape[2:], 16) 311 elif c == "u" and source.istext: 312 # unicode escape (exactly four digits) 313 escape += source.getwhile(4, HEXDIGITS) 314 if len(escape) != 6: 315 raise source.error("incomplete escape %s" % escape, len(escape)) 316 return LITERAL, int(escape[2:], 16) 317 elif c == "U" and source.istext: 318 # unicode escape (exactly eight digits) 319 escape += source.getwhile(8, HEXDIGITS) 320 if len(escape) != 10: 321 raise source.error("incomplete escape %s" % escape, len(escape)) 322 c = int(escape[2:], 16) 323 chr(c) # raise ValueError for invalid code 324 return LITERAL, c 325 elif c in OCTDIGITS: 326 # octal escape (up to three digits) 327 escape += source.getwhile(2, OCTDIGITS) 328 c = int(escape[1:], 8) 329 if c > 0o377: 330 raise source.error('octal escape value %s outside of ' 331 'range 0-0o377' % escape, len(escape)) 332 return LITERAL, c 333 elif c in DIGITS: 334 raise ValueError 335 if len(escape) == 2: 336 if c in ASCIILETTERS: 337 raise source.error('bad escape %s' % escape, len(escape)) 338 return LITERAL, ord(escape[1]) 339 except ValueError: 340 pass 341 raise source.error("bad escape %s" % escape, len(escape)) 342 343def _escape(source, escape, state): 344 # handle escape code in expression 345 code = CATEGORIES.get(escape) 346 if code: 347 return code 348 code = ESCAPES.get(escape) 349 if code: 350 return code 351 try: 352 c = escape[1:2] 353 if c == "x": 354 # hexadecimal escape 355 escape += source.getwhile(2, HEXDIGITS) 356 if len(escape) != 4: 357 raise source.error("incomplete escape %s" % escape, len(escape)) 358 return LITERAL, int(escape[2:], 16) 359 elif c == "u" and source.istext: 360 # unicode escape (exactly four digits) 361 escape += source.getwhile(4, HEXDIGITS) 362 if len(escape) != 6: 363 raise source.error("incomplete escape %s" % escape, len(escape)) 364 return LITERAL, int(escape[2:], 16) 365 elif c == "U" and source.istext: 366 # unicode escape (exactly eight digits) 367 escape += source.getwhile(8, HEXDIGITS) 368 if len(escape) != 10: 369 raise source.error("incomplete escape %s" % escape, len(escape)) 370 c = int(escape[2:], 16) 371 chr(c) # raise ValueError for invalid code 372 return LITERAL, c 373 elif c == "0": 374 # octal escape 375 escape += source.getwhile(2, OCTDIGITS) 376 return LITERAL, int(escape[1:], 8) 377 elif c in DIGITS: 378 # octal escape *or* decimal group reference (sigh) 379 if source.next in DIGITS: 380 escape += source.get() 381 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and 382 source.next in OCTDIGITS): 383 # got three octal digits; this is an octal escape 384 escape += source.get() 385 c = int(escape[1:], 8) 386 if c > 0o377: 387 raise source.error('octal escape value %s outside of ' 388 'range 0-0o377' % escape, 389 len(escape)) 390 return LITERAL, c 391 # not an octal escape, so this is a group reference 392 group = int(escape[1:]) 393 if group < state.groups: 394 if not state.checkgroup(group): 395 raise source.error("cannot refer to an open group", 396 len(escape)) 397 state.checklookbehindgroup(group, source) 398 return GROUPREF, group 399 raise source.error("invalid group reference %d" % group, len(escape) - 1) 400 if len(escape) == 2: 401 if c in ASCIILETTERS: 402 raise source.error("bad escape %s" % escape, len(escape)) 403 return LITERAL, ord(escape[1]) 404 except ValueError: 405 pass 406 raise source.error("bad escape %s" % escape, len(escape)) 407 408def _uniq(items): 409 return list(dict.fromkeys(items)) 410 411def _parse_sub(source, state, verbose, nested): 412 # parse an alternation: a|b|c 413 414 items = [] 415 itemsappend = items.append 416 sourcematch = source.match 417 start = source.tell() 418 while True: 419 itemsappend(_parse(source, state, verbose, nested + 1, 420 not nested and not items)) 421 if not sourcematch("|"): 422 break 423 424 if len(items) == 1: 425 return items[0] 426 427 subpattern = SubPattern(state) 428 429 # check if all items share a common prefix 430 while True: 431 prefix = None 432 for item in items: 433 if not item: 434 break 435 if prefix is None: 436 prefix = item[0] 437 elif item[0] != prefix: 438 break 439 else: 440 # all subitems start with a common "prefix". 441 # move it out of the branch 442 for item in items: 443 del item[0] 444 subpattern.append(prefix) 445 continue # check next one 446 break 447 448 # check if the branch can be replaced by a character set 449 set = [] 450 for item in items: 451 if len(item) != 1: 452 break 453 op, av = item[0] 454 if op is LITERAL: 455 set.append((op, av)) 456 elif op is IN and av[0][0] is not NEGATE: 457 set.extend(av) 458 else: 459 break 460 else: 461 # we can store this as a character set instead of a 462 # branch (the compiler may optimize this even more) 463 subpattern.append((IN, _uniq(set))) 464 return subpattern 465 466 subpattern.append((BRANCH, (None, items))) 467 return subpattern 468 469def _parse(source, state, verbose, nested, first=False): 470 # parse a simple pattern 471 subpattern = SubPattern(state) 472 473 # precompute constants into local variables 474 subpatternappend = subpattern.append 475 sourceget = source.get 476 sourcematch = source.match 477 _len = len 478 _ord = ord 479 480 while True: 481 482 this = source.next 483 if this is None: 484 break # end of pattern 485 if this in "|)": 486 break # end of subpattern 487 sourceget() 488 489 if verbose: 490 # skip whitespace and comments 491 if this in WHITESPACE: 492 continue 493 if this == "#": 494 while True: 495 this = sourceget() 496 if this is None or this == "\n": 497 break 498 continue 499 500 if this[0] == "\\": 501 code = _escape(source, this, state) 502 subpatternappend(code) 503 504 elif this not in SPECIAL_CHARS: 505 subpatternappend((LITERAL, _ord(this))) 506 507 elif this == "[": 508 here = source.tell() - 1 509 # character set 510 set = [] 511 setappend = set.append 512## if sourcematch(":"): 513## pass # handle character classes 514 if source.next == '[': 515 import warnings 516 warnings.warn( 517 'Possible nested set at position %d' % source.tell(), 518 FutureWarning, stacklevel=nested + 6 519 ) 520 negate = sourcematch("^") 521 # check remaining characters 522 while True: 523 this = sourceget() 524 if this is None: 525 raise source.error("unterminated character set", 526 source.tell() - here) 527 if this == "]" and set: 528 break 529 elif this[0] == "\\": 530 code1 = _class_escape(source, this) 531 else: 532 if set and this in '-&~|' and source.next == this: 533 import warnings 534 warnings.warn( 535 'Possible set %s at position %d' % ( 536 'difference' if this == '-' else 537 'intersection' if this == '&' else 538 'symmetric difference' if this == '~' else 539 'union', 540 source.tell() - 1), 541 FutureWarning, stacklevel=nested + 6 542 ) 543 code1 = LITERAL, _ord(this) 544 if sourcematch("-"): 545 # potential range 546 that = sourceget() 547 if that is None: 548 raise source.error("unterminated character set", 549 source.tell() - here) 550 if that == "]": 551 if code1[0] is IN: 552 code1 = code1[1][0] 553 setappend(code1) 554 setappend((LITERAL, _ord("-"))) 555 break 556 if that[0] == "\\": 557 code2 = _class_escape(source, that) 558 else: 559 if that == '-': 560 import warnings 561 warnings.warn( 562 'Possible set difference at position %d' % ( 563 source.tell() - 2), 564 FutureWarning, stacklevel=nested + 6 565 ) 566 code2 = LITERAL, _ord(that) 567 if code1[0] != LITERAL or code2[0] != LITERAL: 568 msg = "bad character range %s-%s" % (this, that) 569 raise source.error(msg, len(this) + 1 + len(that)) 570 lo = code1[1] 571 hi = code2[1] 572 if hi < lo: 573 msg = "bad character range %s-%s" % (this, that) 574 raise source.error(msg, len(this) + 1 + len(that)) 575 setappend((RANGE, (lo, hi))) 576 else: 577 if code1[0] is IN: 578 code1 = code1[1][0] 579 setappend(code1) 580 581 set = _uniq(set) 582 # XXX: <fl> should move set optimization to compiler! 583 if _len(set) == 1 and set[0][0] is LITERAL: 584 # optimization 585 if negate: 586 subpatternappend((NOT_LITERAL, set[0][1])) 587 else: 588 subpatternappend(set[0]) 589 else: 590 if negate: 591 set.insert(0, (NEGATE, None)) 592 # charmap optimization can't be added here because 593 # global flags still are not known 594 subpatternappend((IN, set)) 595 596 elif this in REPEAT_CHARS: 597 # repeat previous item 598 here = source.tell() 599 if this == "?": 600 min, max = 0, 1 601 elif this == "*": 602 min, max = 0, MAXREPEAT 603 604 elif this == "+": 605 min, max = 1, MAXREPEAT 606 elif this == "{": 607 if source.next == "}": 608 subpatternappend((LITERAL, _ord(this))) 609 continue 610 611 min, max = 0, MAXREPEAT 612 lo = hi = "" 613 while source.next in DIGITS: 614 lo += sourceget() 615 if sourcematch(","): 616 while source.next in DIGITS: 617 hi += sourceget() 618 else: 619 hi = lo 620 if not sourcematch("}"): 621 subpatternappend((LITERAL, _ord(this))) 622 source.seek(here) 623 continue 624 625 if lo: 626 min = int(lo) 627 if min >= MAXREPEAT: 628 raise OverflowError("the repetition number is too large") 629 if hi: 630 max = int(hi) 631 if max >= MAXREPEAT: 632 raise OverflowError("the repetition number is too large") 633 if max < min: 634 raise source.error("min repeat greater than max repeat", 635 source.tell() - here) 636 else: 637 raise AssertionError("unsupported quantifier %r" % (char,)) 638 # figure out which item to repeat 639 if subpattern: 640 item = subpattern[-1:] 641 else: 642 item = None 643 if not item or item[0][0] is AT: 644 raise source.error("nothing to repeat", 645 source.tell() - here + len(this)) 646 if item[0][0] in _REPEATCODES: 647 raise source.error("multiple repeat", 648 source.tell() - here + len(this)) 649 if item[0][0] is SUBPATTERN: 650 group, add_flags, del_flags, p = item[0][1] 651 if group is None and not add_flags and not del_flags: 652 item = p 653 if sourcematch("?"): 654 subpattern[-1] = (MIN_REPEAT, (min, max, item)) 655 else: 656 subpattern[-1] = (MAX_REPEAT, (min, max, item)) 657 658 elif this == ".": 659 subpatternappend((ANY, None)) 660 661 elif this == "(": 662 start = source.tell() - 1 663 group = True 664 name = None 665 add_flags = 0 666 del_flags = 0 667 if sourcematch("?"): 668 # options 669 char = sourceget() 670 if char is None: 671 raise source.error("unexpected end of pattern") 672 if char == "P": 673 # python extensions 674 if sourcematch("<"): 675 # named group: skip forward to end of name 676 name = source.getuntil(">") 677 if not name.isidentifier(): 678 msg = "bad character in group name %r" % name 679 raise source.error(msg, len(name) + 1) 680 elif sourcematch("="): 681 # named backreference 682 name = source.getuntil(")") 683 if not name.isidentifier(): 684 msg = "bad character in group name %r" % name 685 raise source.error(msg, len(name) + 1) 686 gid = state.groupdict.get(name) 687 if gid is None: 688 msg = "unknown group name %r" % name 689 raise source.error(msg, len(name) + 1) 690 if not state.checkgroup(gid): 691 raise source.error("cannot refer to an open group", 692 len(name) + 1) 693 state.checklookbehindgroup(gid, source) 694 subpatternappend((GROUPREF, gid)) 695 continue 696 697 else: 698 char = sourceget() 699 if char is None: 700 raise source.error("unexpected end of pattern") 701 raise source.error("unknown extension ?P" + char, 702 len(char) + 2) 703 elif char == ":": 704 # non-capturing group 705 group = None 706 elif char == "#": 707 # comment 708 while True: 709 if source.next is None: 710 raise source.error("missing ), unterminated comment", 711 source.tell() - start) 712 if sourceget() == ")": 713 break 714 continue 715 716 elif char in "=!<": 717 # lookahead assertions 718 dir = 1 719 if char == "<": 720 char = sourceget() 721 if char is None: 722 raise source.error("unexpected end of pattern") 723 if char not in "=!": 724 raise source.error("unknown extension ?<" + char, 725 len(char) + 2) 726 dir = -1 # lookbehind 727 lookbehindgroups = state.lookbehindgroups 728 if lookbehindgroups is None: 729 state.lookbehindgroups = state.groups 730 p = _parse_sub(source, state, verbose, nested + 1) 731 if dir < 0: 732 if lookbehindgroups is None: 733 state.lookbehindgroups = None 734 if not sourcematch(")"): 735 raise source.error("missing ), unterminated subpattern", 736 source.tell() - start) 737 if char == "=": 738 subpatternappend((ASSERT, (dir, p))) 739 else: 740 subpatternappend((ASSERT_NOT, (dir, p))) 741 continue 742 743 elif char == "(": 744 # conditional backreference group 745 condname = source.getuntil(")") 746 if condname.isidentifier(): 747 condgroup = state.groupdict.get(condname) 748 if condgroup is None: 749 msg = "unknown group name %r" % condname 750 raise source.error(msg, len(condname) + 1) 751 else: 752 try: 753 condgroup = int(condname) 754 if condgroup < 0: 755 raise ValueError 756 except ValueError: 757 msg = "bad character in group name %r" % condname 758 raise source.error(msg, len(condname) + 1) from None 759 if not condgroup: 760 raise source.error("bad group number", 761 len(condname) + 1) 762 if condgroup >= MAXGROUPS: 763 msg = "invalid group reference %d" % condgroup 764 raise source.error(msg, len(condname) + 1) 765 state.checklookbehindgroup(condgroup, source) 766 item_yes = _parse(source, state, verbose, nested + 1) 767 if source.match("|"): 768 item_no = _parse(source, state, verbose, nested + 1) 769 if source.next == "|": 770 raise source.error("conditional backref with more than two branches") 771 else: 772 item_no = None 773 if not source.match(")"): 774 raise source.error("missing ), unterminated subpattern", 775 source.tell() - start) 776 subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) 777 continue 778 779 elif char in FLAGS or char == "-": 780 # flags 781 flags = _parse_flags(source, state, char) 782 if flags is None: # global flags 783 if not first or subpattern: 784 import warnings 785 warnings.warn( 786 'Flags not at the start of the expression %r%s' % ( 787 source.string[:20], # truncate long regexes 788 ' (truncated)' if len(source.string) > 20 else '', 789 ), 790 DeprecationWarning, stacklevel=nested + 6 791 ) 792 if (state.flags & SRE_FLAG_VERBOSE) and not verbose: 793 raise Verbose 794 continue 795 796 add_flags, del_flags = flags 797 group = None 798 else: 799 raise source.error("unknown extension ?" + char, 800 len(char) + 1) 801 802 # parse group contents 803 if group is not None: 804 try: 805 group = state.opengroup(name) 806 except error as err: 807 raise source.error(err.msg, len(name) + 1) from None 808 sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and 809 not (del_flags & SRE_FLAG_VERBOSE)) 810 p = _parse_sub(source, state, sub_verbose, nested + 1) 811 if not source.match(")"): 812 raise source.error("missing ), unterminated subpattern", 813 source.tell() - start) 814 if group is not None: 815 state.closegroup(group, p) 816 subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) 817 818 elif this == "^": 819 subpatternappend((AT, AT_BEGINNING)) 820 821 elif this == "$": 822 subpatternappend((AT, AT_END)) 823 824 else: 825 raise AssertionError("unsupported special character %r" % (char,)) 826 827 # unpack non-capturing groups 828 for i in range(len(subpattern))[::-1]: 829 op, av = subpattern[i] 830 if op is SUBPATTERN: 831 group, add_flags, del_flags, p = av 832 if group is None and not add_flags and not del_flags: 833 subpattern[i: i+1] = p 834 835 return subpattern 836 837def _parse_flags(source, state, char): 838 sourceget = source.get 839 add_flags = 0 840 del_flags = 0 841 if char != "-": 842 while True: 843 flag = FLAGS[char] 844 if source.istext: 845 if char == 'L': 846 msg = "bad inline flags: cannot use 'L' flag with a str pattern" 847 raise source.error(msg) 848 else: 849 if char == 'u': 850 msg = "bad inline flags: cannot use 'u' flag with a bytes pattern" 851 raise source.error(msg) 852 add_flags |= flag 853 if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag: 854 msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible" 855 raise source.error(msg) 856 char = sourceget() 857 if char is None: 858 raise source.error("missing -, : or )") 859 if char in ")-:": 860 break 861 if char not in FLAGS: 862 msg = "unknown flag" if char.isalpha() else "missing -, : or )" 863 raise source.error(msg, len(char)) 864 if char == ")": 865 state.flags |= add_flags 866 return None 867 if add_flags & GLOBAL_FLAGS: 868 raise source.error("bad inline flags: cannot turn on global flag", 1) 869 if char == "-": 870 char = sourceget() 871 if char is None: 872 raise source.error("missing flag") 873 if char not in FLAGS: 874 msg = "unknown flag" if char.isalpha() else "missing flag" 875 raise source.error(msg, len(char)) 876 while True: 877 flag = FLAGS[char] 878 if flag & TYPE_FLAGS: 879 msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'" 880 raise source.error(msg) 881 del_flags |= flag 882 char = sourceget() 883 if char is None: 884 raise source.error("missing :") 885 if char == ":": 886 break 887 if char not in FLAGS: 888 msg = "unknown flag" if char.isalpha() else "missing :" 889 raise source.error(msg, len(char)) 890 assert char == ":" 891 if del_flags & GLOBAL_FLAGS: 892 raise source.error("bad inline flags: cannot turn off global flag", 1) 893 if add_flags & del_flags: 894 raise source.error("bad inline flags: flag turned on and off", 1) 895 return add_flags, del_flags 896 897def fix_flags(src, flags): 898 # Check and fix flags according to the type of pattern (str or bytes) 899 if isinstance(src, str): 900 if flags & SRE_FLAG_LOCALE: 901 raise ValueError("cannot use LOCALE flag with a str pattern") 902 if not flags & SRE_FLAG_ASCII: 903 flags |= SRE_FLAG_UNICODE 904 elif flags & SRE_FLAG_UNICODE: 905 raise ValueError("ASCII and UNICODE flags are incompatible") 906 else: 907 if flags & SRE_FLAG_UNICODE: 908 raise ValueError("cannot use UNICODE flag with a bytes pattern") 909 if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII: 910 raise ValueError("ASCII and LOCALE flags are incompatible") 911 return flags 912 913def parse(str, flags=0, pattern=None): 914 # parse 're' pattern into list of (opcode, argument) tuples 915 916 source = Tokenizer(str) 917 918 if pattern is None: 919 pattern = Pattern() 920 pattern.flags = flags 921 pattern.str = str 922 923 try: 924 p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, 0) 925 except Verbose: 926 # the VERBOSE flag was switched on inside the pattern. to be 927 # on the safe side, we'll parse the whole thing again... 928 pattern = Pattern() 929 pattern.flags = flags | SRE_FLAG_VERBOSE 930 pattern.str = str 931 source.seek(0) 932 p = _parse_sub(source, pattern, True, 0) 933 934 p.pattern.flags = fix_flags(str, p.pattern.flags) 935 936 if source.next is not None: 937 assert source.next == ")" 938 raise source.error("unbalanced parenthesis") 939 940 if flags & SRE_FLAG_DEBUG: 941 p.dump() 942 943 return p 944 945def parse_template(source, pattern): 946 # parse 're' replacement string into list of literals and 947 # group references 948 s = Tokenizer(source) 949 sget = s.get 950 groups = [] 951 literals = [] 952 literal = [] 953 lappend = literal.append 954 def addgroup(index, pos): 955 if index > pattern.groups: 956 raise s.error("invalid group reference %d" % index, pos) 957 if literal: 958 literals.append(''.join(literal)) 959 del literal[:] 960 groups.append((len(literals), index)) 961 literals.append(None) 962 groupindex = pattern.groupindex 963 while True: 964 this = sget() 965 if this is None: 966 break # end of replacement string 967 if this[0] == "\\": 968 # group 969 c = this[1] 970 if c == "g": 971 name = "" 972 if not s.match("<"): 973 raise s.error("missing <") 974 name = s.getuntil(">") 975 if name.isidentifier(): 976 try: 977 index = groupindex[name] 978 except KeyError: 979 raise IndexError("unknown group name %r" % name) 980 else: 981 try: 982 index = int(name) 983 if index < 0: 984 raise ValueError 985 except ValueError: 986 raise s.error("bad character in group name %r" % name, 987 len(name) + 1) from None 988 if index >= MAXGROUPS: 989 raise s.error("invalid group reference %d" % index, 990 len(name) + 1) 991 addgroup(index, len(name) + 1) 992 elif c == "0": 993 if s.next in OCTDIGITS: 994 this += sget() 995 if s.next in OCTDIGITS: 996 this += sget() 997 lappend(chr(int(this[1:], 8) & 0xff)) 998 elif c in DIGITS: 999 isoctal = False 1000 if s.next in DIGITS: 1001 this += sget() 1002 if (c in OCTDIGITS and this[2] in OCTDIGITS and 1003 s.next in OCTDIGITS): 1004 this += sget() 1005 isoctal = True 1006 c = int(this[1:], 8) 1007 if c > 0o377: 1008 raise s.error('octal escape value %s outside of ' 1009 'range 0-0o377' % this, len(this)) 1010 lappend(chr(c)) 1011 if not isoctal: 1012 addgroup(int(this[1:]), len(this) - 1) 1013 else: 1014 try: 1015 this = chr(ESCAPES[this][1]) 1016 except KeyError: 1017 if c in ASCIILETTERS: 1018 raise s.error('bad escape %s' % this, len(this)) 1019 lappend(this) 1020 else: 1021 lappend(this) 1022 if literal: 1023 literals.append(''.join(literal)) 1024 if not isinstance(source, str): 1025 # The tokenizer implicitly decodes bytes objects as latin-1, we must 1026 # therefore re-encode the final representation. 1027 literals = [None if s is None else s.encode('latin-1') for s in literals] 1028 return groups, literals 1029 1030def expand_template(template, match): 1031 g = match.group 1032 empty = match.string[:0] 1033 groups, literals = template 1034 literals = literals[:] 1035 try: 1036 for index, group in groups: 1037 literals[index] = g(group) or empty 1038 except IndexError: 1039 raise error("invalid group reference %d" % index) 1040 return empty.join(literals) 1041