#
# Secret Labs' Regular Expression Engine core module
#
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
#
# This version of the SRE library can be redistributed under CNRI's
# Python 1.6 license. For any other use, please contact Secret Labs
# AB (info@pythonware.com).
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work.
#
# 2010-01-16 mrab Python front-end re-written and extended

import string
import sys
import unicodedata
from collections import defaultdict

import _regex

__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
  "Scanner"]

# The regex exception.
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        # Python 2: the pattern may be a str or a unicode string; pick a
        # newline literal of the matching type so .count/.rfind work.
        newline = u'\n' if isinstance(pattern, unicode) else '\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            # Derive the line/column of the failure from the offset.
            self.lineno = pattern.count(newline, 0, pos) + 1
            self.colno = pos - pattern.rfind(newline, 0, pos)

            message = "%s at position %d" % (message, pos)

            if newline in pattern:
                message += " (line %d, column %d)" % (self.lineno, self.colno)

        Exception.__init__(self, message)

# The exception for when a positional flag has been turned on in the old
# behaviour.
class _UnscopedFlagSet(Exception):
    pass

# The exception for when parsing fails and we want to try something else.
class ParseError(Exception):
    pass

# The exception for when there isn't a valid first set.
class _FirstSetError(Exception):
    pass

# Flags.
A = ASCII = 0x80          # Assume ASCII locale.
B = BESTMATCH = 0x1000    # Best fuzzy match.
D = DEBUG = 0x200         # Print parsed pattern.
E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first
                          # fuzzy match.
F = FULLCASE = 0x4000     # Unicode full case-folding.
I = IGNORECASE = 0x2      # Ignore case.
L = LOCALE = 0x4          # Assume current 8-bit locale.
M = MULTILINE = 0x8       # Make anchors look for newline.
P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
R = REVERSE = 0x400       # Search backwards.
S = DOTALL = 0x10         # Make dot match newline.
U = UNICODE = 0x20        # Assume Unicode locale.
V0 = VERSION0 = 0x2000    # Old legacy behaviour.
V1 = VERSION1 = 0x100     # New enhanced behaviour.
W = WORD = 0x800          # Default Unicode word breaks.
X = VERBOSE = 0x40        # Ignore whitespace and comments.
T = TEMPLATE = 0x1        # Template (present because re module has it).

DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags.
GLOBAL_FLAGS = (_ALL_ENCODINGS | _ALL_VERSIONS | BESTMATCH | DEBUG |
  ENHANCEMATCH | POSIX | REVERSE)
SCOPED_FLAGS = FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE

# Character classes used by the parser.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# "" is included so that end-of-pattern is treated as a special character.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# FULLCASE on its own (without IGNORECASE) has no effect.
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}

# The names of the opcodes.
141OPCODES = """ 142FAILURE 143SUCCESS 144ANY 145ANY_ALL 146ANY_ALL_REV 147ANY_REV 148ANY_U 149ANY_U_REV 150ATOMIC 151BOUNDARY 152BRANCH 153CALL_REF 154CHARACTER 155CHARACTER_IGN 156CHARACTER_IGN_REV 157CHARACTER_REV 158CONDITIONAL 159DEFAULT_BOUNDARY 160DEFAULT_END_OF_WORD 161DEFAULT_START_OF_WORD 162END 163END_OF_LINE 164END_OF_LINE_U 165END_OF_STRING 166END_OF_STRING_LINE 167END_OF_STRING_LINE_U 168END_OF_WORD 169FUZZY 170GRAPHEME_BOUNDARY 171GREEDY_REPEAT 172GROUP 173GROUP_CALL 174GROUP_EXISTS 175KEEP 176LAZY_REPEAT 177LOOKAROUND 178NEXT 179PROPERTY 180PROPERTY_IGN 181PROPERTY_IGN_REV 182PROPERTY_REV 183PRUNE 184RANGE 185RANGE_IGN 186RANGE_IGN_REV 187RANGE_REV 188REF_GROUP 189REF_GROUP_FLD 190REF_GROUP_FLD_REV 191REF_GROUP_IGN 192REF_GROUP_IGN_REV 193REF_GROUP_REV 194SEARCH_ANCHOR 195SET_DIFF 196SET_DIFF_IGN 197SET_DIFF_IGN_REV 198SET_DIFF_REV 199SET_INTER 200SET_INTER_IGN 201SET_INTER_IGN_REV 202SET_INTER_REV 203SET_SYM_DIFF 204SET_SYM_DIFF_IGN 205SET_SYM_DIFF_IGN_REV 206SET_SYM_DIFF_REV 207SET_UNION 208SET_UNION_IGN 209SET_UNION_IGN_REV 210SET_UNION_REV 211SKIP 212START_OF_LINE 213START_OF_LINE_U 214START_OF_STRING 215START_OF_WORD 216STRING 217STRING_FLD 218STRING_FLD_REV 219STRING_IGN 220STRING_IGN_REV 221STRING_REV 222FUZZY_EXT 223""" 224 225# Define the opcodes in a namespace. 226class Namespace(object): 227 pass 228 229OP = Namespace() 230for i, op in enumerate(OPCODES.split()): 231 setattr(OP, op, i) 232 233def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5): 234 """Make room in the given cache. 235 236 Args: 237 cache_dict: The cache dictionary to modify. 238 args_dict: The dictionary of named list args used by patterns. 239 max_length: Maximum # of entries in cache_dict before it is shrunk. 240 divisor: Cache will shrink to max_length - 1/divisor*max_length items. 241 """ 242 # Toss out a fraction of the entries at random to make room for new ones. 
243 # A random algorithm was chosen as opposed to simply cache_dict.popitem() 244 # as popitem could penalize the same regular expression repeatedly based 245 # on its internal hash value. Being random should spread the cache miss 246 # love around. 247 cache_keys = tuple(cache_dict.keys()) 248 overage = len(cache_keys) - max_length 249 if overage < 0: 250 # Cache is already within limits. Normally this should not happen 251 # but it could due to multithreading. 252 return 253 254 number_to_toss = max_length // divisor + overage 255 256 # The import is done here to avoid a circular dependency. 257 import random 258 if not hasattr(random, 'sample'): 259 # Do nothing while resolving the circular dependency: 260 # re->random->warnings->tokenize->string->re 261 return 262 263 for doomed_key in random.sample(cache_keys, number_to_toss): 264 try: 265 del cache_dict[doomed_key] 266 except KeyError: 267 # Ignore problems if the cache changed from another thread. 268 pass 269 270 # Rebuild the arguments and locale-sensitivity dictionaries. 271 args_dict.clear() 272 sensitivity_dict = {} 273 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict): 274 args_dict[pattern, pattern_type, flags, default_version, locale] = args 275 try: 276 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern] 277 except KeyError: 278 pass 279 280 locale_sensitive.clear() 281 locale_sensitive.update(sensitivity_dict) 282 283def _fold_case(info, string): 284 "Folds the case of a string." 285 flags = info.flags 286 if (flags & _ALL_ENCODINGS) == 0: 287 flags |= info.guess_encoding 288 289 return _regex.fold_case(flags, string) 290 291def is_cased_i(info, char): 292 "Checks whether a character is cased." 293 return len(_regex.get_all_cases(info.flags, char)) > 1 294 295def is_cased_f(flags, char): 296 "Checks whether a character is cased." 
297 return len(_regex.get_all_cases(flags, char)) > 1 298 299def _compile_firstset(info, fs): 300 "Compiles the firstset for the pattern." 301 reverse = bool(info.flags & REVERSE) 302 fs = _check_firstset(info, reverse, fs) 303 if not fs: 304 return [] 305 306 # Compile the firstset. 307 return fs.compile(reverse) 308 309def _check_firstset(info, reverse, fs): 310 "Checks the firstset for the pattern." 311 if not fs or None in fs: 312 return None 313 314 # If we ignore the case, for simplicity we won't build a firstset. 315 members = set() 316 case_flags = NOCASE 317 for i in fs: 318 if isinstance(i, Character) and not i.positive: 319 return None 320 321# if i.case_flags: 322# if isinstance(i, Character): 323# if is_cased_i(info, i.value): 324# return [] 325# elif isinstance(i, SetBase): 326# return [] 327 case_flags |= i.case_flags 328 members.add(i.with_flags(case_flags=NOCASE)) 329 330 if case_flags == (FULLCASE | IGNORECASE): 331 return None 332 333 # Build the firstset. 334 fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE, 335 zerowidth=True) 336 fs = fs.optimise(info, reverse, in_set=True) 337 338 return fs 339 340def _flatten_code(code): 341 "Flattens the code from a list of tuples." 342 flat_code = [] 343 for c in code: 344 flat_code.extend(c) 345 346 return flat_code 347 348def make_case_flags(info): 349 "Makes the case flags." 350 flags = info.flags & CASE_FLAGS 351 352 # Turn off FULLCASE if ASCII is turned on. 353 if info.flags & ASCII: 354 flags &= ~FULLCASE 355 356 return flags 357 358def make_character(info, value, in_set=False): 359 "Makes a character literal." 360 if in_set: 361 # A character set is built case-sensitively. 362 return Character(value) 363 364 return Character(value, case_flags=make_case_flags(info)) 365 366def make_ref_group(info, name, position): 367 "Makes a group reference." 
368 return RefGroup(info, name, position, case_flags=make_case_flags(info)) 369 370def make_string_set(info, name): 371 "Makes a string set." 372 return StringSet(info, name, case_flags=make_case_flags(info)) 373 374def make_property(info, prop, in_set): 375 "Makes a property." 376 if in_set: 377 return prop 378 379 return prop.with_flags(case_flags=make_case_flags(info)) 380 381def _parse_pattern(source, info): 382 "Parses a pattern, eg. 'a|b|c'." 383 branches = [parse_sequence(source, info)] 384 while source.match("|"): 385 branches.append(parse_sequence(source, info)) 386 387 if len(branches) == 1: 388 return branches[0] 389 return Branch(branches) 390 391def parse_sequence(source, info): 392 "Parses a sequence, eg. 'abc'." 393 sequence = [None] 394 case_flags = make_case_flags(info) 395 while True: 396 saved_pos = source.pos 397 ch = source.get() 398 if ch in SPECIAL_CHARS: 399 if ch in ")|": 400 # The end of a sequence. At the end of the pattern ch is "". 401 source.pos = saved_pos 402 break 403 elif ch == "\\": 404 # An escape sequence outside a set. 405 sequence.append(parse_escape(source, info, False)) 406 elif ch == "(": 407 # A parenthesised subpattern or a flag. 408 element = parse_paren(source, info) 409 if element is None: 410 case_flags = make_case_flags(info) 411 else: 412 sequence.append(element) 413 elif ch == ".": 414 # Any character. 415 if info.flags & DOTALL: 416 sequence.append(AnyAll()) 417 elif info.flags & WORD: 418 sequence.append(AnyU()) 419 else: 420 sequence.append(Any()) 421 elif ch == "[": 422 # A character set. 423 sequence.append(parse_set(source, info)) 424 elif ch == "^": 425 # The start of a line or the string. 426 if info.flags & MULTILINE: 427 if info.flags & WORD: 428 sequence.append(StartOfLineU()) 429 else: 430 sequence.append(StartOfLine()) 431 else: 432 sequence.append(StartOfString()) 433 elif ch == "$": 434 # The end of a line or the string. 
435 if info.flags & MULTILINE: 436 if info.flags & WORD: 437 sequence.append(EndOfLineU()) 438 else: 439 sequence.append(EndOfLine()) 440 else: 441 if info.flags & WORD: 442 sequence.append(EndOfStringLineU()) 443 else: 444 sequence.append(EndOfStringLine()) 445 elif ch in "?*+{": 446 # Looks like a quantifier. 447 counts = parse_quantifier(source, info, ch) 448 if counts: 449 # It _is_ a quantifier. 450 apply_quantifier(source, info, counts, case_flags, ch, 451 saved_pos, sequence) 452 sequence.append(None) 453 else: 454 # It's not a quantifier. Maybe it's a fuzzy constraint. 455 constraints = parse_fuzzy(source, info, ch) 456 if constraints: 457 # It _is_ a fuzzy constraint. 458 apply_constraint(source, info, constraints, case_flags, 459 saved_pos, sequence) 460 sequence.append(None) 461 else: 462 # The element was just a literal. 463 sequence.append(Character(ord(ch), 464 case_flags=case_flags)) 465 else: 466 # A literal. 467 sequence.append(Character(ord(ch), case_flags=case_flags)) 468 else: 469 # A literal. 470 sequence.append(Character(ord(ch), case_flags=case_flags)) 471 472 sequence = [item for item in sequence if item is not None] 473 return Sequence(sequence) 474 475def apply_quantifier(source, info, counts, case_flags, ch, saved_pos, 476 sequence): 477 element = sequence.pop() 478 if element is None: 479 if sequence: 480 raise error("multiple repeat", source.string, saved_pos) 481 raise error("nothing to repeat", source.string, saved_pos) 482 483 if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)): 484 raise error("multiple repeat", source.string, saved_pos) 485 486 min_count, max_count = counts 487 saved_pos = source.pos 488 ch = source.get() 489 if ch == "?": 490 # The "?" suffix that means it's a lazy repeat. 491 repeated = LazyRepeat 492 elif ch == "+": 493 # The "+" suffix that means it's a possessive repeat. 494 repeated = PossessiveRepeat 495 else: 496 # No suffix means that it's a greedy repeat. 
497 source.pos = saved_pos 498 repeated = GreedyRepeat 499 500 # Ignore the quantifier if it applies to a zero-width item or the number of 501 # repeats is fixed at 1. 502 if not element.is_empty() and (min_count != 1 or max_count != 1): 503 element = repeated(element, min_count, max_count) 504 505 sequence.append(element) 506 507def apply_constraint(source, info, constraints, case_flags, saved_pos, 508 sequence): 509 element = sequence.pop() 510 if element is None: 511 raise error("nothing for fuzzy constraint", source.string, saved_pos) 512 513 # If a group is marked as fuzzy then put all of the fuzzy part in the 514 # group. 515 if isinstance(element, Group): 516 element.subpattern = Fuzzy(element.subpattern, constraints) 517 sequence.append(element) 518 else: 519 sequence.append(Fuzzy(element, constraints)) 520 521_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)} 522 523def parse_quantifier(source, info, ch): 524 "Parses a quantifier." 525 q = _QUANTIFIERS.get(ch) 526 if q: 527 # It's a quantifier. 528 return q 529 530 if ch == "{": 531 # Looks like a limited repeated element, eg. 'a{2,3}'. 532 counts = parse_limited_quantifier(source) 533 if counts: 534 return counts 535 536 return None 537 538def is_above_limit(count): 539 "Checks whether a count is above the maximum." 540 return count is not None and count >= UNLIMITED 541 542def parse_limited_quantifier(source): 543 "Parses a limited quantifier." 544 saved_pos = source.pos 545 min_count = parse_count(source) 546 if source.match(","): 547 max_count = parse_count(source) 548 549 # No minimum means 0 and no maximum means unlimited. 
550 min_count = int(min_count or 0) 551 max_count = int(max_count) if max_count else None 552 else: 553 if not min_count: 554 source.pos = saved_pos 555 return None 556 557 min_count = max_count = int(min_count) 558 559 if not source.match ("}"): 560 source.pos = saved_pos 561 return None 562 563 if is_above_limit(min_count) or is_above_limit(max_count): 564 raise error("repeat count too big", source.string, saved_pos) 565 566 if max_count is not None and min_count > max_count: 567 raise error("min repeat greater than max repeat", source.string, 568 saved_pos) 569 570 return min_count, max_count 571 572def parse_fuzzy(source, info, ch): 573 "Parses a fuzzy setting, if present." 574 saved_pos = source.pos 575 576 if ch != "{": 577 return None 578 579 constraints = {} 580 try: 581 parse_fuzzy_item(source, constraints) 582 while source.match(","): 583 parse_fuzzy_item(source, constraints) 584 except ParseError: 585 source.pos = saved_pos 586 return None 587 588 if source.match(":"): 589 constraints["test"] = parse_fuzzy_test(source, info) 590 591 if not source.match("}"): 592 raise error("expected }", source.string, source.pos) 593 594 return constraints 595 596def parse_fuzzy_item(source, constraints): 597 "Parses a fuzzy setting item." 598 saved_pos = source.pos 599 try: 600 parse_cost_constraint(source, constraints) 601 except ParseError: 602 source.pos = saved_pos 603 604 parse_cost_equation(source, constraints) 605 606def parse_cost_constraint(source, constraints): 607 "Parses a cost constraint." 608 saved_pos = source.pos 609 ch = source.get() 610 if ch in ALPHA: 611 # Syntax: constraint [("<=" | "<") cost] 612 constraint = parse_constraint(source, constraints, ch) 613 614 max_inc = parse_fuzzy_compare(source) 615 616 if max_inc is None: 617 # No maximum cost. 618 constraints[constraint] = 0, None 619 else: 620 # There's a maximum cost. 621 cost_pos = source.pos 622 max_cost = parse_cost_limit(source) 623 624 # Inclusive or exclusive limit? 
625 if not max_inc: 626 max_cost -= 1 627 628 if max_cost < 0: 629 raise error("bad fuzzy cost limit", source.string, cost_pos) 630 631 constraints[constraint] = 0, max_cost 632 elif ch in DIGITS: 633 # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost 634 source.pos = saved_pos 635 636 # Minimum cost. 637 cost_pos = source.pos 638 min_cost = parse_cost_limit(source) 639 640 min_inc = parse_fuzzy_compare(source) 641 if min_inc is None: 642 raise ParseError() 643 644 constraint = parse_constraint(source, constraints, source.get()) 645 646 max_inc = parse_fuzzy_compare(source) 647 if max_inc is None: 648 raise ParseError() 649 650 # Maximum cost. 651 cost_pos = source.pos 652 max_cost = parse_cost_limit(source) 653 654 # Inclusive or exclusive limits? 655 if not min_inc: 656 min_cost += 1 657 if not max_inc: 658 max_cost -= 1 659 660 if not 0 <= min_cost <= max_cost: 661 raise error("bad fuzzy cost limit", source.string, cost_pos) 662 663 constraints[constraint] = min_cost, max_cost 664 else: 665 raise ParseError() 666 667def parse_cost_limit(source): 668 "Parses a cost limit." 669 cost_pos = source.pos 670 digits = parse_count(source) 671 672 try: 673 return int(digits) 674 except ValueError: 675 pass 676 677 raise error("bad fuzzy cost limit", source.string, cost_pos) 678 679def parse_constraint(source, constraints, ch): 680 "Parses a constraint." 681 if ch not in "deis": 682 raise ParseError() 683 684 if ch in constraints: 685 raise ParseError() 686 687 return ch 688 689def parse_fuzzy_compare(source): 690 "Parses a cost comparator." 691 if source.match("<="): 692 return True 693 elif source.match("<"): 694 return False 695 else: 696 return None 697 698def parse_cost_equation(source, constraints): 699 "Parses a cost equation." 
700 if "cost" in constraints: 701 raise error("more than one cost equation", source.string, source.pos) 702 703 cost = {} 704 705 parse_cost_term(source, cost) 706 while source.match("+"): 707 parse_cost_term(source, cost) 708 709 max_inc = parse_fuzzy_compare(source) 710 if max_inc is None: 711 raise ParseError() 712 713 max_cost = int(parse_count(source)) 714 715 if not max_inc: 716 max_cost -= 1 717 718 if max_cost < 0: 719 raise error("bad fuzzy cost limit", source.string, source.pos) 720 721 cost["max"] = max_cost 722 723 constraints["cost"] = cost 724 725def parse_cost_term(source, cost): 726 "Parses a cost equation term." 727 coeff = parse_count(source) 728 ch = source.get() 729 if ch not in "dis": 730 raise ParseError() 731 732 if ch in cost: 733 raise error("repeated fuzzy cost", source.string, source.pos) 734 735 cost[ch] = int(coeff or 1) 736 737def parse_fuzzy_test(source, info): 738 saved_pos = source.pos 739 ch = source.get() 740 if ch in SPECIAL_CHARS: 741 if ch == "\\": 742 # An escape sequence outside a set. 743 return parse_escape(source, info, False) 744 elif ch == ".": 745 # Any character. 746 if info.flags & DOTALL: 747 return AnyAll() 748 elif info.flags & WORD: 749 return AnyU() 750 else: 751 return Any() 752 elif ch == "[": 753 # A character set. 754 return parse_set(source, info) 755 else: 756 raise error("expected character set", source.string, saved_pos) 757 elif ch: 758 # A literal. 759 return Character(ord(ch), case_flags=case_flags) 760 else: 761 raise error("expected character set", source.string, saved_pos) 762 763def parse_count(source): 764 "Parses a quantifier's count, which can be empty." 765 return source.get_while(DIGITS) 766 767def parse_paren(source, info): 768 """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an 769 inline flag. 770 """ 771 saved_pos = source.pos 772 ch = source.get() 773 if ch == "?": 774 # (?... 775 saved_pos_2 = source.pos 776 ch = source.get() 777 if ch == "<": 778 # (?<... 
779 saved_pos_3 = source.pos 780 ch = source.get() 781 if ch in ("=", "!"): 782 # (?<=... or (?<!...: lookbehind. 783 return parse_lookaround(source, info, True, ch == "=") 784 785 # (?<...: a named capture group. 786 source.pos = saved_pos_3 787 name = parse_name(source) 788 group = info.open_group(name) 789 source.expect(">") 790 saved_flags = info.flags 791 try: 792 subpattern = _parse_pattern(source, info) 793 source.expect(")") 794 finally: 795 info.flags = saved_flags 796 source.ignore_space = bool(info.flags & VERBOSE) 797 798 info.close_group() 799 return Group(info, group, subpattern) 800 if ch in ("=", "!"): 801 # (?=... or (?!...: lookahead. 802 return parse_lookaround(source, info, False, ch == "=") 803 if ch == "P": 804 # (?P...: a Python extension. 805 return parse_extension(source, info) 806 if ch == "#": 807 # (?#...: a comment. 808 return parse_comment(source) 809 if ch == "(": 810 # (?(...: a conditional subpattern. 811 return parse_conditional(source, info) 812 if ch == ">": 813 # (?>...: an atomic subpattern. 814 return parse_atomic(source, info) 815 if ch == "|": 816 # (?|...: a common/reset groups branch. 817 return parse_common(source, info) 818 if ch == "R" or "0" <= ch <= "9": 819 # (?R...: probably a call to a group. 820 return parse_call_group(source, info, ch, saved_pos_2) 821 if ch == "&": 822 # (?&...: a call to a named group. 823 return parse_call_named_group(source, info, saved_pos_2) 824 825 # (?...: probably a flags subpattern. 826 source.pos = saved_pos_2 827 return parse_flags_subpattern(source, info) 828 829 if ch == "*": 830 # (*... 831 saved_pos_2 = source.pos 832 word = source.get_while(set(")>"), include=False) 833 if word[ : 1].isalpha(): 834 verb = VERBS.get(word) 835 if not verb: 836 raise error("unknown verb", source.string, saved_pos_2) 837 838 source.expect(")") 839 840 return verb 841 842 # (...: an unnamed capture group. 
843 source.pos = saved_pos 844 group = info.open_group() 845 saved_flags = info.flags 846 try: 847 subpattern = _parse_pattern(source, info) 848 source.expect(")") 849 finally: 850 info.flags = saved_flags 851 source.ignore_space = bool(info.flags & VERBOSE) 852 853 info.close_group() 854 855 return Group(info, group, subpattern) 856 857def parse_extension(source, info): 858 "Parses a Python extension." 859 saved_pos = source.pos 860 ch = source.get() 861 if ch == "<": 862 # (?P<...: a named capture group. 863 name = parse_name(source) 864 group = info.open_group(name) 865 source.expect(">") 866 saved_flags = info.flags 867 try: 868 subpattern = _parse_pattern(source, info) 869 source.expect(")") 870 finally: 871 info.flags = saved_flags 872 source.ignore_space = bool(info.flags & VERBOSE) 873 874 info.close_group() 875 876 return Group(info, group, subpattern) 877 if ch == "=": 878 # (?P=...: a named group reference. 879 name = parse_name(source, allow_numeric=True) 880 source.expect(")") 881 if info.is_open_group(name): 882 raise error("cannot refer to an open group", source.string, 883 saved_pos) 884 885 return make_ref_group(info, name, saved_pos) 886 if ch == ">" or ch == "&": 887 # (?P>...: a call to a group. 888 return parse_call_named_group(source, info, saved_pos) 889 890 source.pos = saved_pos 891 raise error("unknown extension", source.string, saved_pos) 892 893def parse_comment(source): 894 "Parses a comment." 895 while True: 896 saved_pos = source.pos 897 c = source.get() 898 899 if not c or c == ")": 900 break 901 902 if c == "\\": 903 c = source.get() 904 905 source.pos = saved_pos 906 source.expect(")") 907 908 return None 909 910def parse_lookaround(source, info, behind, positive): 911 "Parses a lookaround." 
912 saved_flags = info.flags 913 try: 914 subpattern = _parse_pattern(source, info) 915 source.expect(")") 916 finally: 917 info.flags = saved_flags 918 source.ignore_space = bool(info.flags & VERBOSE) 919 920 return LookAround(behind, positive, subpattern) 921 922def parse_conditional(source, info): 923 "Parses a conditional subpattern." 924 saved_flags = info.flags 925 saved_pos = source.pos 926 ch = source.get() 927 if ch == "?": 928 # (?(?... 929 ch = source.get() 930 if ch in ("=", "!"): 931 # (?(?=... or (?(?!...: lookahead conditional. 932 return parse_lookaround_conditional(source, info, False, ch == "=") 933 if ch == "<": 934 # (?(?<... 935 ch = source.get() 936 if ch in ("=", "!"): 937 # (?(?<=... or (?(?<!...: lookbehind conditional. 938 return parse_lookaround_conditional(source, info, True, ch == 939 "=") 940 941 source.pos = saved_pos 942 raise error("expected lookaround conditional", source.string, 943 source.pos) 944 945 source.pos = saved_pos 946 try: 947 group = parse_name(source, True) 948 source.expect(")") 949 yes_branch = parse_sequence(source, info) 950 if source.match("|"): 951 no_branch = parse_sequence(source, info) 952 else: 953 no_branch = Sequence() 954 955 source.expect(")") 956 finally: 957 info.flags = saved_flags 958 source.ignore_space = bool(info.flags & VERBOSE) 959 960 if yes_branch.is_empty() and no_branch.is_empty(): 961 return Sequence() 962 963 return Conditional(info, group, yes_branch, no_branch, saved_pos) 964 965def parse_lookaround_conditional(source, info, behind, positive): 966 saved_flags = info.flags 967 try: 968 subpattern = _parse_pattern(source, info) 969 source.expect(")") 970 finally: 971 info.flags = saved_flags 972 source.ignore_space = bool(info.flags & VERBOSE) 973 974 yes_branch = parse_sequence(source, info) 975 if source.match("|"): 976 no_branch = parse_sequence(source, info) 977 else: 978 no_branch = Sequence() 979 980 source.expect(")") 981 982 return LookAroundConditional(behind, positive, 
subpattern, yes_branch, 983 no_branch) 984 985def parse_atomic(source, info): 986 "Parses an atomic subpattern." 987 saved_flags = info.flags 988 try: 989 subpattern = _parse_pattern(source, info) 990 source.expect(")") 991 finally: 992 info.flags = saved_flags 993 source.ignore_space = bool(info.flags & VERBOSE) 994 995 return Atomic(subpattern) 996 997def parse_common(source, info): 998 "Parses a common groups branch." 999 # Capture group numbers in different branches can reuse the group numbers. 1000 initial_group_count = info.group_count 1001 branches = [parse_sequence(source, info)] 1002 final_group_count = info.group_count 1003 while source.match("|"): 1004 info.group_count = initial_group_count 1005 branches.append(parse_sequence(source, info)) 1006 final_group_count = max(final_group_count, info.group_count) 1007 1008 info.group_count = final_group_count 1009 source.expect(")") 1010 1011 if len(branches) == 1: 1012 return branches[0] 1013 return Branch(branches) 1014 1015def parse_call_group(source, info, ch, pos): 1016 "Parses a call to a group." 1017 if ch == "R": 1018 group = "0" 1019 else: 1020 group = ch + source.get_while(DIGITS) 1021 1022 source.expect(")") 1023 1024 return CallGroup(info, group, pos) 1025 1026def parse_call_named_group(source, info, pos): 1027 "Parses a call to a named group." 1028 group = parse_name(source) 1029 source.expect(")") 1030 1031 return CallGroup(info, group, pos) 1032 1033def parse_flag_set(source): 1034 "Parses a set of inline flags." 1035 flags = 0 1036 1037 try: 1038 while True: 1039 saved_pos = source.pos 1040 ch = source.get() 1041 if ch == "V": 1042 ch += source.get() 1043 flags |= REGEX_FLAGS[ch] 1044 except KeyError: 1045 source.pos = saved_pos 1046 1047 return flags 1048 1049def parse_flags(source, info): 1050 "Parses flags being turned on/off." 
1051 flags_on = parse_flag_set(source) 1052 if source.match("-"): 1053 flags_off = parse_flag_set(source) 1054 if not flags_off: 1055 raise error("bad inline flags: no flags after '-'", source.string, 1056 source.pos) 1057 else: 1058 flags_off = 0 1059 1060 if flags_on & LOCALE: 1061 # Remember that this pattern as an inline locale flag. 1062 info.inline_locale = True 1063 1064 return flags_on, flags_off 1065 1066def parse_subpattern(source, info, flags_on, flags_off): 1067 "Parses a subpattern with scoped flags." 1068 saved_flags = info.flags 1069 info.flags = (info.flags | flags_on) & ~flags_off 1070 source.ignore_space = bool(info.flags & VERBOSE) 1071 try: 1072 subpattern = _parse_pattern(source, info) 1073 source.expect(")") 1074 finally: 1075 info.flags = saved_flags 1076 source.ignore_space = bool(info.flags & VERBOSE) 1077 1078 return subpattern 1079 1080def parse_flags_subpattern(source, info): 1081 """Parses a flags subpattern. It could be inline flags or a subpattern 1082 possibly with local flags. If it's a subpattern, then that's returned; 1083 if it's a inline flags, then None is returned. 1084 """ 1085 flags_on, flags_off = parse_flags(source, info) 1086 1087 if flags_off & GLOBAL_FLAGS: 1088 raise error("bad inline flags: cannot turn off global flag", 1089 source.string, source.pos) 1090 1091 if flags_on & flags_off: 1092 raise error("bad inline flags: flag turned on and off", source.string, 1093 source.pos) 1094 1095 # Handle flags which are global in all regex behaviours. 1096 new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS 1097 if new_global_flags: 1098 info.global_flags |= new_global_flags 1099 1100 # A global has been turned on, so reparse the pattern. 1101 raise _UnscopedFlagSet(info.global_flags) 1102 1103 # Ensure that from now on we have only scoped flags. 
1104 flags_on &= ~GLOBAL_FLAGS 1105 1106 if source.match(":"): 1107 return parse_subpattern(source, info, flags_on, flags_off) 1108 1109 if source.match(")"): 1110 parse_positional_flags(source, info, flags_on, flags_off) 1111 return None 1112 1113 raise error("unknown extension", source.string, source.pos) 1114 1115def parse_positional_flags(source, info, flags_on, flags_off): 1116 "Parses positional flags." 1117 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION 1118 if version == VERSION0: 1119 # Positional flags are global and can only be turned on. 1120 if flags_off: 1121 raise error("bad inline flags: cannot turn flags off", 1122 source.string, source.pos) 1123 1124 new_global_flags = flags_on & ~info.global_flags 1125 if new_global_flags: 1126 info.global_flags |= new_global_flags 1127 1128 # A global has been turned on, so reparse the pattern. 1129 raise _UnscopedFlagSet(info.global_flags) 1130 else: 1131 info.flags = (info.flags | flags_on) & ~flags_off 1132 1133 source.ignore_space = bool(info.flags & VERBOSE) 1134 1135def parse_name(source, allow_numeric=False, allow_group_0=False): 1136 "Parses a name." 1137 name = source.get_while(set(")>"), include=False) 1138 1139 if not name: 1140 raise error("missing group name", source.string, source.pos) 1141 1142 if name.isdigit(): 1143 min_group = 0 if allow_group_0 else 1 1144 if not allow_numeric or int(name) < min_group: 1145 raise error("bad character in group name", source.string, 1146 source.pos) 1147 else: 1148 if not is_identifier(name): 1149 raise error("bad character in group name", source.string, 1150 source.pos) 1151 1152 return name 1153 1154def is_identifier(name): 1155 if not name: 1156 return False 1157 1158 if name[0] not in ALPHA and name[0] != "_": 1159 return False 1160 1161 name = name.replace("_", "") 1162 1163 return not name or all(c in ALNUM for c in name) 1164 1165def is_octal(string): 1166 "Checks whether a string is octal." 
    return all(ch in OCT_DIGITS for ch in string)

def is_decimal(string):
    "Checks whether a string is decimal."
    return all(ch in DIGITS for ch in string)

def is_hexadecimal(string):
    "Checks whether a string is hexadecimal."
    return all(ch in HEX_DIGITS for ch in string)

def parse_escape(source, info, in_set):
    "Parses an escape sequence."
    # Fetch the next character literally, even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        value = CHARSET_ESCAPES.get(ch)
        if value:
            return value

        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        # Not a recognised escape, so treat it as a literal.
        return make_character(info, ord(ch), in_set)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)

def parse_numeric_escape(source, info, ch, in_set):
    "Parses a numeric escape sequence."
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            if encoding == ASCII or encoding == LOCALE:
                # 8-bit encodings limit the codepoint to a single byte.
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)

def parse_octal_escape(source, info, digits, in_set):
    "Parses an octal escape sequence."
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # The last character fetched wasn't part of the escape; put it back.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # int() failed, so there must be a non-octal digit in there.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)

def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    "Parses a hex escape sequence."
    saved_pos = source.pos
    digits = []
    for i in range(expected_len):
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, saved_pos)
        digits.append(ch)

    try:
        value = int("".join(digits), 16)
    except ValueError:
        pass
    else:
        # Codepoints are limited to the Unicode range.
        if value < 0x110000:
            return make_character(info, value, in_set)

    # Bad hex escape.
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, saved_pos)

def parse_group_ref(source, info):
    "Parses a group reference."
    source.expect("<")
    saved_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")
    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, saved_pos)

def parse_string_set(source, info):
    "Parses a string set reference."
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")
    if name is None or name not in info.kwargs:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, name)

def parse_named_char(source, info, in_set):
    "Parses a named character."
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART)
        if source.match("}"):
            try:
                value = unicodedata.lookup(name)
                return make_character(info, ord(value), in_set)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a valid named character, so treat the "N" as a literal.
    source.pos = saved_pos
    return make_character(info, ord("N"), in_set)

def parse_property(source, info, positive, in_set):
    "Parses a Unicode property."
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        # A delimited property, possibly negated, eg \p{^name}.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            prop = lookup_property(prop_name, name, positive != negate, source)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        prop = lookup_property(None, ch, positive, source)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)

def parse_property_name(source):
    "Parses a property name, which may be qualified."
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    source.pos = saved_pos
    return prop_name, name

def parse_set(source, info):
    "Parses a character set."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    saved_ignore = source.ignore_space
    source.ignore_space = False
    # Negative set?
    negate = source.match("^")
    try:
        if version == VERSION0:
            # The old behaviour has no set operations, only implicit union.
            item = parse_set_imp_union(source, info)
        else:
            item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    item = item.with_flags(case_flags=make_case_flags(info))

    return item

def parse_set_union(source, info):
    "Parses a set union ([x||y])."
    items = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        items.append(parse_set_symm_diff(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)

def parse_set_symm_diff(source, info):
    "Parses a set symmetric difference ([x~~y])."
    items = [parse_set_inter(source, info)]
    while source.match("~~"):
        items.append(parse_set_inter(source, info))

    if len(items) == 1:
        return items[0]
    return SetSymDiff(info, items)

def parse_set_inter(source, info):
    "Parses a set intersection ([x&&y])."
    items = [parse_set_diff(source, info)]
    while source.match("&&"):
        items.append(parse_set_diff(source, info))

    if len(items) == 1:
        return items[0]
    return SetInter(info, items)

def parse_set_diff(source, info):
    "Parses a set difference ([x--y])."
    items = [parse_set_imp_union(source, info)]
    while source.match("--"):
        items.append(parse_set_imp_union(source, info))

    if len(items) == 1:
        return items[0]
    return SetDiff(info, items)

def parse_set_imp_union(source, info):
    "Parses a set implicit union ([xy])."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set.
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)

def parse_set_member(source, info):
    "Parses a member in a character set."
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A range of a single character is just that character.
        return start

    return Range(start.value, end.value)

def parse_set_item(source, info):
    "Parses an item in a character set."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    return Character(ord(ch))

def parse_posix_class(source, info):
    "Parses a POSIX character class."
    negate = source.match("^")
    prop_name, name = parse_property_name(source)
    if not source.match(":]"):
        # Raise ParseError so that the caller can backtrack and try to parse
        # it as something else.
        raise ParseError()

    return lookup_property(prop_name, name, not negate, source, posix=True)

def float_to_rational(flt):
    "Converts a float to a rational pair."
    # NOTE: the local name 'error' shadows the module's error exception class
    # within this function.
    int_part = int(flt)
    error = flt - int_part
    if abs(error) < 0.0001:
        # Close enough to a whole number.
        return int_part, 1

    # Recursively convert the remainder (continued-fraction expansion).
    den, num = float_to_rational(1.0 / error)

    return int_part * den + num, den

def numeric_to_rational(numeric):
    "Converts a numeric string to a rational string, if possible."
    if numeric[ : 1] == "-":
        sign, numeric = numeric[0], numeric[1 : ]
    else:
        sign = ""

    parts = numeric.split("/")
    if len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    elif len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    else:
        raise ValueError()

    result = "%s%s/%s" % (sign, num, den)
    if result.endswith("/1"):
        # A denominator of 1 can be dropped.
        return result[ : -2]

    return result

# NOTE: string.maketrans exists only under Python 2; this module targets the
# Python 2 string model (cf. the 'unicode' check in the error class).
upper_trans = string.maketrans(string.ascii_lowercase, string.ascii_uppercase)
def ascii_upper(s):
    "Uppercases a bytestring in a locale-insensitive way within the ASCII range."
    if isinstance(s, str):
        return s.translate(upper_trans)

    return s.upper()

def standardise_name(name):
    "Standardises a property or value name."
    try:
        return numeric_to_rational("".join(name))
    except (ValueError, ZeroDivisionError):
        # Not numeric: uppercase it and strip out separators.
        return ascii_upper("".join(ch for ch in name if ch not in "_- "))

# POSIX classes which are looked up under a "POSIX" prefix when matching in
# POSIX mode.
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The value names of a binary (yes/no) property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())

def lookup_property(property, value, positive, source=None, posix=False):
    "Looks up a property."
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        # "Assigned" is the negation of "Unassigned".
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and ascii_upper(value) in _POSIX_CLASSES:
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        # The property and value ids are packed into a single int.
        return Property((prop_id << 16) | val_id, positive)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            return Property((prop_id << 16) | 1, positive)

        return Property(prop_id << 16, not positive)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive)

    # It might be the name of a script or block starting with a prefix.
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)

def _compile_replacement(source, pattern, is_unicode):
    "Compiles a replacement template escape sequence."
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group reference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        # Unknown alphabetic escape: a literal backslash plus the character.
        return False, [ord("\\"), ord(ch)]

    # Under Python 2, str is a bytestring, so octal escapes in a bytes
    # template are limited to a single byte.
    if isinstance(source.sep, str):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]

def parse_repl_hex_escape(source, expected_len, type):
    "Parses a hex escape sequence in a replacement string."
    digits = []
    for i in range(expected_len):
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, source.pos)
        digits.append(ch)

    return int("".join(digits), 16)

def parse_repl_named_char(source):
    "Parses a named character in a replacement string."
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                value = unicodedata.lookup(name)
                return ord(value)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a named character after all.
    source.pos = saved_pos
    return None

def compile_repl_group(source, pattern):
    "Compiles a replacement template group reference."
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")
    if name.isdigit():
        # A numeric reference; group 0 means the whole match.
        index = int(name)
        if not 0 <= index <= pattern.groups:
            raise error("invalid group reference", source.string, source.pos)

        return index

    try:
        return pattern.groupindex[name]
    except KeyError:
        raise IndexError("unknown group")

# The regular expression is parsed into a syntax tree. The different types of
# node are defined below.

# Indentation used by the node dump methods.
INDENT = "  "
# Bit flags packed into the compiled opcodes.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10

# Text used by the node dump methods.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}

def make_sequence(items):
    # A sequence of a single item is just that item.
    if len(items) == 1:
        return items[0]
    return Sequence(items)

# Common base class for all nodes.
class RegexBase(object):
    """Common base class for all parse-tree nodes.

    Supplies default (mostly no-op) implementations of the operations the
    compiler performs on nodes; subclasses override the ones relevant to
    them.
    """

    def __init__(self):
        # _key is what hashing and equality comparisons are based on.
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        # Fill in each omitted argument from the node's current flags.
        positive = self.positive if positive is None else bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        zerowidth = self.zerowidth if zerowidth is None else bool(zerowidth)

        unchanged = (positive == self.positive and case_flags ==
          self.case_flags and zerowidth == self.zerowidth)
        if unchanged:
            # Nothing would change, so reuse this node.
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        # By default a node contains no group references to fix.
        pass

    def optimise(self, info, reverse):
        # By default a node cannot be optimised any further.
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # By default a node cannot provide a first-set.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not (self == other)

    def get_required_string(self, reverse):
        # By default there's no required string, just the node's width.
        return self.max_width(), None

# Base class for zero-width nodes.
class ZeroWidthBase(RegexBase):
    # Base class for nodes which match a position rather than any
    # characters (max_width is 0).
    def __init__(self, positive=True):
        RegexBase.__init__(self)
        self.positive = bool(positive)

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # A zero-width node consumes nothing, so any character can follow.
        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        # print with a single parenthesized argument produces the same output
        # under both Python 2 and Python 3.
        print("%s%s %s" % (INDENT * indent, self._op_name,
          POS_TEXT[self.positive]))

    def max_width(self):
        return 0

class Any(RegexBase):
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        # "." always consumes exactly one character.
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("%s%s" % (INDENT * indent, self._op_name))

    def max_width(self):
        return 1

class AnyAll(Any):
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"

class AnyU(Any):
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"

class Atomic(RegexBase):
    # An atomic subpattern: once matched, it can't be re-entered by
    # backtracking.
    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An empty atomic group is just empty.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

1983 def can_be_affix(self): 1984 return self.subpattern.can_be_affix() 1985 1986 def contains_group(self): 1987 return self.subpattern.contains_group() 1988 1989 def get_firstset(self, reverse): 1990 return self.subpattern.get_firstset(reverse) 1991 1992 def has_simple_start(self): 1993 return self.subpattern.has_simple_start() 1994 1995 def _compile(self, reverse, fuzzy): 1996 return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) + 1997 [(OP.END, )]) 1998 1999 def dump(self, indent, reverse): 2000 print "%sATOMIC" % (INDENT * indent) 2001 self.subpattern.dump(indent + 1, reverse) 2002 2003 def is_empty(self): 2004 return self.subpattern.is_empty() 2005 2006 def __eq__(self, other): 2007 return (type(self) is type(other) and self.subpattern == 2008 other.subpattern) 2009 2010 def max_width(self): 2011 return self.subpattern.max_width() 2012 2013 def get_required_string(self, reverse): 2014 return self.subpattern.get_required_string(reverse) 2015 2016class Boundary(ZeroWidthBase): 2017 _opcode = OP.BOUNDARY 2018 _op_name = "BOUNDARY" 2019 2020class Branch(RegexBase): 2021 def __init__(self, branches): 2022 RegexBase.__init__(self) 2023 self.branches = branches 2024 2025 def fix_groups(self, pattern, reverse, fuzzy): 2026 for b in self.branches: 2027 b.fix_groups(pattern, reverse, fuzzy) 2028 2029 def optimise(self, info, reverse): 2030 if not self.branches: 2031 return Sequence([]) 2032 2033 # Flatten branches within branches. 2034 branches = Branch._flatten_branches(info, reverse, self.branches) 2035 2036 # Move any common prefix or suffix out of the branches. 2037 if reverse: 2038 suffix, branches = Branch._split_common_suffix(info, branches) 2039 prefix = [] 2040 else: 2041 prefix, branches = Branch._split_common_prefix(info, branches) 2042 suffix = [] 2043 2044 # Try to reduce adjacent single-character branches to sets. 
2045 branches = Branch._reduce_to_set(info, reverse, branches) 2046 2047 if len(branches) > 1: 2048 sequence = [Branch(branches)] 2049 2050 if not prefix or not suffix: 2051 # We might be able to add a quick precheck before the branches. 2052 firstset = self._add_precheck(info, reverse, branches) 2053 2054 if firstset: 2055 if reverse: 2056 sequence.append(firstset) 2057 else: 2058 sequence.insert(0, firstset) 2059 else: 2060 sequence = branches 2061 2062 return make_sequence(prefix + sequence + suffix) 2063 2064 def _add_precheck(self, info, reverse, branches): 2065 charset = set() 2066 pos = -1 if reverse else 0 2067 2068 for branch in branches: 2069 if type(branch) is Literal and branch.case_flags == NOCASE: 2070 charset.add(branch.characters[pos]) 2071 else: 2072 return 2073 2074 if not charset: 2075 return None 2076 2077 return _check_firstset(info, reverse, [Character(c) for c in charset]) 2078 2079 def pack_characters(self, info): 2080 self.branches = [b.pack_characters(info) for b in self.branches] 2081 return self 2082 2083 def remove_captures(self): 2084 self.branches = [b.remove_captures() for b in self.branches] 2085 return self 2086 2087 def is_atomic(self): 2088 return all(b.is_atomic() for b in self.branches) 2089 2090 def can_be_affix(self): 2091 return all(b.can_be_affix() for b in self.branches) 2092 2093 def contains_group(self): 2094 return any(b.contains_group() for b in self.branches) 2095 2096 def get_firstset(self, reverse): 2097 fs = set() 2098 for b in self.branches: 2099 fs |= b.get_firstset(reverse) 2100 2101 return fs or set([None]) 2102 2103 def _compile(self, reverse, fuzzy): 2104 code = [(OP.BRANCH, )] 2105 for b in self.branches: 2106 code.extend(b.compile(reverse, fuzzy)) 2107 code.append((OP.NEXT, )) 2108 2109 code[-1] = (OP.END, ) 2110 2111 return code 2112 2113 def dump(self, indent, reverse): 2114 print "%sBRANCH" % (INDENT * indent) 2115 self.branches[0].dump(indent + 1, reverse) 2116 for b in self.branches[1 : ]: 2117 print 
"%sOR" % (INDENT * indent) 2118 b.dump(indent + 1, reverse) 2119 2120 @staticmethod 2121 def _flatten_branches(info, reverse, branches): 2122 # Flatten the branches so that there aren't branches of branches. 2123 new_branches = [] 2124 for b in branches: 2125 b = b.optimise(info, reverse) 2126 if isinstance(b, Branch): 2127 new_branches.extend(b.branches) 2128 else: 2129 new_branches.append(b) 2130 2131 return new_branches 2132 2133 @staticmethod 2134 def _split_common_prefix(info, branches): 2135 # Common leading items can be moved out of the branches. 2136 # Get the items in the branches. 2137 alternatives = [] 2138 for b in branches: 2139 if isinstance(b, Sequence): 2140 alternatives.append(b.items) 2141 else: 2142 alternatives.append([b]) 2143 2144 # What is the maximum possible length of the prefix? 2145 max_count = min(len(a) for a in alternatives) 2146 2147 # What is the longest common prefix? 2148 prefix = alternatives[0] 2149 pos = 0 2150 end_pos = max_count 2151 while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] == 2152 prefix[pos] for a in alternatives): 2153 pos += 1 2154 count = pos 2155 2156 if info.flags & UNICODE: 2157 # We need to check that we're not splitting a sequence of 2158 # characters which could form part of full case-folding. 2159 count = pos 2160 while count > 0 and not all(Branch._can_split(a, count) for a in 2161 alternatives): 2162 count -= 1 2163 2164 # No common prefix is possible. 2165 if count == 0: 2166 return [], branches 2167 2168 # Rebuild the branches. 2169 new_branches = [] 2170 for a in alternatives: 2171 new_branches.append(make_sequence(a[count : ])) 2172 2173 return prefix[ : count], new_branches 2174 2175 @staticmethod 2176 def _split_common_suffix(info, branches): 2177 # Common trailing items can be moved out of the branches. 2178 # Get the items in the branches. 
2179 alternatives = [] 2180 for b in branches: 2181 if isinstance(b, Sequence): 2182 alternatives.append(b.items) 2183 else: 2184 alternatives.append([b]) 2185 2186 # What is the maximum possible length of the suffix? 2187 max_count = min(len(a) for a in alternatives) 2188 2189 # What is the longest common suffix? 2190 suffix = alternatives[0] 2191 pos = -1 2192 end_pos = -1 - max_count 2193 while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] == 2194 suffix[pos] for a in alternatives): 2195 pos -= 1 2196 count = -1 - pos 2197 2198 if info.flags & UNICODE: 2199 # We need to check that we're not splitting a sequence of 2200 # characters which could form part of full case-folding. 2201 while count > 0 and not all(Branch._can_split_rev(a, count) for a 2202 in alternatives): 2203 count -= 1 2204 2205 # No common suffix is possible. 2206 if count == 0: 2207 return [], branches 2208 2209 # Rebuild the branches. 2210 new_branches = [] 2211 for a in alternatives: 2212 new_branches.append(make_sequence(a[ : -count])) 2213 2214 return suffix[-count : ], new_branches 2215 2216 @staticmethod 2217 def _can_split(items, count): 2218 # Check the characters either side of the proposed split. 2219 if not Branch._is_full_case(items, count - 1): 2220 return True 2221 2222 if not Branch._is_full_case(items, count): 2223 return True 2224 2225 # Check whether a 1-1 split would be OK. 2226 if Branch._is_folded(items[count - 1 : count + 1]): 2227 return False 2228 2229 # Check whether a 1-2 split would be OK. 2230 if (Branch._is_full_case(items, count + 2) and 2231 Branch._is_folded(items[count - 1 : count + 2])): 2232 return False 2233 2234 # Check whether a 2-1 split would be OK. 2235 if (Branch._is_full_case(items, count - 2) and 2236 Branch._is_folded(items[count - 2 : count + 1])): 2237 return False 2238 2239 return True 2240 2241 @staticmethod 2242 def _can_split_rev(items, count): 2243 end = len(items) 2244 2245 # Check the characters either side of the proposed split. 
2246 if not Branch._is_full_case(items, end - count): 2247 return True 2248 2249 if not Branch._is_full_case(items, end - count - 1): 2250 return True 2251 2252 # Check whether a 1-1 split would be OK. 2253 if Branch._is_folded(items[end - count - 1 : end - count + 1]): 2254 return False 2255 2256 # Check whether a 1-2 split would be OK. 2257 if (Branch._is_full_case(items, end - count + 2) and 2258 Branch._is_folded(items[end - count - 1 : end - count + 2])): 2259 return False 2260 2261 # Check whether a 2-1 split would be OK. 2262 if (Branch._is_full_case(items, end - count - 2) and 2263 Branch._is_folded(items[end - count - 2 : end - count + 1])): 2264 return False 2265 2266 return True 2267 2268 @staticmethod 2269 def _merge_common_prefixes(info, reverse, branches): 2270 # Branches with the same case-sensitive character prefix can be grouped 2271 # together if they are separated only by other branches with a 2272 # character prefix. 2273 prefixed = defaultdict(list) 2274 order = {} 2275 new_branches = [] 2276 for b in branches: 2277 if Branch._is_simple_character(b): 2278 # Branch starts with a simple character. 2279 prefixed[b.value].append([b]) 2280 order.setdefault(b.value, len(order)) 2281 elif (isinstance(b, Sequence) and b.items and 2282 Branch._is_simple_character(b.items[0])): 2283 # Branch starts with a simple character. 2284 prefixed[b.items[0].value].append(b.items) 2285 order.setdefault(b.items[0].value, len(order)) 2286 else: 2287 Branch._flush_char_prefix(info, reverse, prefixed, order, 2288 new_branches) 2289 2290 new_branches.append(b) 2291 2292 Branch._flush_char_prefix(info, prefixed, order, new_branches) 2293 2294 return new_branches 2295 2296 @staticmethod 2297 def _is_simple_character(c): 2298 return isinstance(c, Character) and c.positive and not c.case_flags 2299 2300 @staticmethod 2301 def _reduce_to_set(info, reverse, branches): 2302 # Can the branches be reduced to a set? 
2303 new_branches = [] 2304 items = set() 2305 case_flags = NOCASE 2306 for b in branches: 2307 if isinstance(b, (Character, Property, SetBase)): 2308 # Branch starts with a single character. 2309 if b.case_flags != case_flags: 2310 # Different case sensitivity, so flush. 2311 Branch._flush_set_members(info, reverse, items, case_flags, 2312 new_branches) 2313 2314 case_flags = b.case_flags 2315 2316 items.add(b.with_flags(case_flags=NOCASE)) 2317 else: 2318 Branch._flush_set_members(info, reverse, items, case_flags, 2319 new_branches) 2320 2321 new_branches.append(b) 2322 2323 Branch._flush_set_members(info, reverse, items, case_flags, 2324 new_branches) 2325 2326 return new_branches 2327 2328 @staticmethod 2329 def _flush_char_prefix(info, reverse, prefixed, order, new_branches): 2330 # Flush the prefixed branches. 2331 if not prefixed: 2332 return 2333 2334 for value, branches in sorted(prefixed.items(), key=lambda pair: 2335 order[pair[0]]): 2336 if len(branches) == 1: 2337 new_branches.append(make_sequence(branches[0])) 2338 else: 2339 subbranches = [] 2340 optional = False 2341 for b in branches: 2342 if len(b) > 1: 2343 subbranches.append(make_sequence(b[1 : ])) 2344 elif not optional: 2345 subbranches.append(Sequence()) 2346 optional = True 2347 2348 sequence = Sequence([Character(value), Branch(subbranches)]) 2349 new_branches.append(sequence.optimise(info, reverse)) 2350 2351 prefixed.clear() 2352 order.clear() 2353 2354 @staticmethod 2355 def _flush_set_members(info, reverse, items, case_flags, new_branches): 2356 # Flush the set members. 
2357 if not items: 2358 return 2359 2360 if len(items) == 1: 2361 item = list(items)[0] 2362 else: 2363 item = SetUnion(info, list(items)).optimise(info, reverse) 2364 2365 new_branches.append(item.with_flags(case_flags=case_flags)) 2366 2367 items.clear() 2368 2369 @staticmethod 2370 def _is_full_case(items, i): 2371 if not 0 <= i < len(items): 2372 return False 2373 2374 item = items[i] 2375 return (isinstance(item, Character) and item.positive and 2376 (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE) 2377 2378 @staticmethod 2379 def _is_folded(items): 2380 if len(items) < 2: 2381 return False 2382 2383 for i in items: 2384 if (not isinstance(i, Character) or not i.positive or not 2385 i.case_flags): 2386 return False 2387 2388 folded = u"".join(unichr(i.value) for i in items) 2389 folded = _regex.fold_case(FULL_CASE_FOLDING, folded) 2390 2391 # Get the characters which expand to multiple codepoints on folding. 2392 expanding_chars = _regex.get_expand_on_folding() 2393 2394 for c in expanding_chars: 2395 if folded == _regex.fold_case(FULL_CASE_FOLDING, c): 2396 return True 2397 2398 return False 2399 2400 def is_empty(self): 2401 return all(b.is_empty() for b in self.branches) 2402 2403 def __eq__(self, other): 2404 return type(self) is type(other) and self.branches == other.branches 2405 2406 def max_width(self): 2407 return max(b.max_width() for b in self.branches) 2408 2409class CallGroup(RegexBase): 2410 def __init__(self, info, group, position): 2411 RegexBase.__init__(self) 2412 self.info = info 2413 self.group = group 2414 self.position = position 2415 2416 self._key = self.__class__, self.group 2417 2418 def fix_groups(self, pattern, reverse, fuzzy): 2419 try: 2420 self.group = int(self.group) 2421 except ValueError: 2422 try: 2423 self.group = self.info.group_index[self.group] 2424 except KeyError: 2425 raise error("invalid group reference", pattern, self.position) 2426 2427 if not 0 <= self.group <= self.info.group_count: 2428 raise error("unknown 
group", pattern, self.position) 2429 2430 if self.group > 0 and self.info.open_group_count[self.group] > 1: 2431 raise error("ambiguous group reference", pattern, self.position) 2432 2433 self.info.group_calls.append((self, reverse, fuzzy)) 2434 2435 self._key = self.__class__, self.group 2436 2437 def remove_captures(self): 2438 raise error("group reference not allowed", pattern, self.position) 2439 2440 def _compile(self, reverse, fuzzy): 2441 return [(OP.GROUP_CALL, self.call_ref)] 2442 2443 def dump(self, indent, reverse): 2444 print "%sGROUP_CALL %s" % (INDENT * indent, self.group) 2445 2446 def __eq__(self, other): 2447 return type(self) is type(other) and self.group == other.group 2448 2449 def max_width(self): 2450 return UNLIMITED 2451 2452 def __del__(self): 2453 self.info = None 2454 2455class CallRef(RegexBase): 2456 def __init__(self, ref, parsed): 2457 self.ref = ref 2458 self.parsed = parsed 2459 2460 def _compile(self, reverse, fuzzy): 2461 return ([(OP.CALL_REF, self.ref)] + self.parsed._compile(reverse, 2462 fuzzy) + [(OP.END, )]) 2463 2464class Character(RegexBase): 2465 _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False): 2466 OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE, 2467 False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE, 2468 True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV, 2469 (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV} 2470 2471 def __init__(self, value, positive=True, case_flags=NOCASE, 2472 zerowidth=False): 2473 RegexBase.__init__(self) 2474 self.value = value 2475 self.positive = bool(positive) 2476 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] 2477 self.zerowidth = bool(zerowidth) 2478 2479 if (self.positive and (self.case_flags & FULLIGNORECASE) == 2480 FULLIGNORECASE): 2481 self.folded = _regex.fold_case(FULL_CASE_FOLDING, unichr(self.value)) 2482 else: 2483 self.folded = unichr(self.value) 2484 2485 self._key = (self.__class__, self.value, 
self.positive, 2486 self.case_flags, self.zerowidth) 2487 2488 def rebuild(self, positive, case_flags, zerowidth): 2489 return Character(self.value, positive, case_flags, zerowidth) 2490 2491 def optimise(self, info, reverse, in_set=False): 2492 return self 2493 2494 def get_firstset(self, reverse): 2495 return set([self]) 2496 2497 def has_simple_start(self): 2498 return True 2499 2500 def _compile(self, reverse, fuzzy): 2501 flags = 0 2502 if self.positive: 2503 flags |= POSITIVE_OP 2504 if self.zerowidth: 2505 flags |= ZEROWIDTH_OP 2506 if fuzzy: 2507 flags |= FUZZY_OP 2508 2509 code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags, 2510 self.value]) 2511 2512 if len(self.folded) > 1: 2513 # The character expands on full case-folding. 2514 code = Branch([code, String([ord(c) for c in self.folded], 2515 case_flags=self.case_flags)]) 2516 2517 return code.compile(reverse, fuzzy) 2518 2519 def dump(self, indent, reverse): 2520 display = repr(unichr(self.value)).lstrip("bu") 2521 print "%sCHARACTER %s %s%s" % (INDENT * indent, 2522 POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]) 2523 2524 def matches(self, ch): 2525 return (ch == self.value) == self.positive 2526 2527 def max_width(self): 2528 return len(self.folded) 2529 2530 def get_required_string(self, reverse): 2531 if not self.positive: 2532 return 1, None 2533 2534 self.folded_characters = tuple(ord(c) for c in self.folded) 2535 2536 return 0, self 2537 2538class Conditional(RegexBase): 2539 def __init__(self, info, group, yes_item, no_item, position): 2540 RegexBase.__init__(self) 2541 self.info = info 2542 self.group = group 2543 self.yes_item = yes_item 2544 self.no_item = no_item 2545 self.position = position 2546 2547 def fix_groups(self, pattern, reverse, fuzzy): 2548 try: 2549 self.group = int(self.group) 2550 except ValueError: 2551 try: 2552 self.group = self.info.group_index[self.group] 2553 except KeyError: 2554 if self.group == 'DEFINE': 2555 # 'DEFINE' is a special 
name unless there's a group with 2556 # that name. 2557 self.group = 0 2558 else: 2559 raise error("unknown group", pattern, self.position) 2560 2561 if not 0 <= self.group <= self.info.group_count: 2562 raise error("invalid group reference", pattern, self.position) 2563 2564 self.yes_item.fix_groups(pattern, reverse, fuzzy) 2565 self.no_item.fix_groups(pattern, reverse, fuzzy) 2566 2567 def optimise(self, info, reverse): 2568 yes_item = self.yes_item.optimise(info, reverse) 2569 no_item = self.no_item.optimise(info, reverse) 2570 2571 return Conditional(info, self.group, yes_item, no_item, self.position) 2572 2573 def pack_characters(self, info): 2574 self.yes_item = self.yes_item.pack_characters(info) 2575 self.no_item = self.no_item.pack_characters(info) 2576 return self 2577 2578 def remove_captures(self): 2579 self.yes_item = self.yes_item.remove_captures() 2580 self.no_item = self.no_item.remove_captures() 2581 2582 def is_atomic(self): 2583 return self.yes_item.is_atomic() and self.no_item.is_atomic() 2584 2585 def can_be_affix(self): 2586 return self.yes_item.can_be_affix() and self.no_item.can_be_affix() 2587 2588 def contains_group(self): 2589 return self.yes_item.contains_group() or self.no_item.contains_group() 2590 2591 def get_firstset(self, reverse): 2592 return (self.yes_item.get_firstset(reverse) | 2593 self.no_item.get_firstset(reverse)) 2594 2595 def _compile(self, reverse, fuzzy): 2596 code = [(OP.GROUP_EXISTS, self.group)] 2597 code.extend(self.yes_item.compile(reverse, fuzzy)) 2598 add_code = self.no_item.compile(reverse, fuzzy) 2599 if add_code: 2600 code.append((OP.NEXT, )) 2601 code.extend(add_code) 2602 2603 code.append((OP.END, )) 2604 2605 return code 2606 2607 def dump(self, indent, reverse): 2608 print "%sGROUP_EXISTS %s" % (INDENT * indent, self.group) 2609 self.yes_item.dump(indent + 1, reverse) 2610 if not self.no_item.is_empty(): 2611 print "%sOR" % (INDENT * indent) 2612 self.no_item.dump(indent + 1, reverse) 2613 2614 def 
    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

# Zero-width assertion nodes: each subclass only supplies its opcode and the
# name used when dumping the parsed pattern.
class DefaultBoundary(ZeroWidthBase):
    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"

class DefaultEndOfWord(ZeroWidthBase):
    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"

class DefaultStartOfWord(ZeroWidthBase):
    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"

class EndOfLine(ZeroWidthBase):
    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"

class EndOfLineU(EndOfLine):
    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"

class EndOfString(ZeroWidthBase):
    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"

class EndOfStringLine(ZeroWidthBase):
    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"

class EndOfStringLineU(EndOfStringLine):
    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"

class EndOfWord(ZeroWidthBase):
    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"

class Failure(ZeroWidthBase):
    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        return [(OP.FAILURE, )]

class Fuzzy(RegexBase):
    # A fuzzy (approximate) match of a subpattern. 'constraints' maps the
    # error types "d" (deletion), "i" (insertion), "s" (substitution) and
    # "e" (any error) to (min, max) pairs (max None => unlimited), plus a
    # "cost" equation mapping error types to coefficients with a "max".
    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its
        # maximum defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy section is itself compiled as fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print "%sFUZZY%s" % (INDENT * indent, constraints)
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        return UNLIMITED

    def _constraints_to_string(self):
        # Render the constraints the way they are written in a pattern,
        # e.g. "i<=2,s<=1,1i+1s<=3".
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "%s<=" % min

            con += name

            if max is not None:
                con += "<=%s" % max

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("%s%s" % (coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "%s<=%s" % ("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)

class Grapheme(RegexBase):
    # \X: matches a whole grapheme cluster.
    def _compile(self, reverse, fuzzy):
        # Match at least 1 character until a grapheme boundary is reached. Note
        # that this is the same whether matching forwards or backwards.
        grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
          GraphemeBoundary()]))

        return grapheme_matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print "%sGRAPHEME" % (INDENT * indent)

    def max_width(self):
        return UNLIMITED

class GraphemeBoundary:
    # Zero-width match at a grapheme-cluster boundary.
    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]

class GreedyRepeat(RegexBase):
    # A greedy repeat: subpattern{min,max}.
    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        # max_count is None for an unbounded repeat.
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) so that subclasses (LazyRepeat, PossessiveRepeat) keep
        # their own class.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat can match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        return ([tuple(repeat)] + subpattern + [(OP.END, )])
def dump(self, indent, reverse): 2877 if self.max_count is None: 2878 limit = "INF" 2879 else: 2880 limit = self.max_count 2881 print "%s%s %s %s" % (INDENT * indent, self._op_name, self.min_count, 2882 limit) 2883 2884 self.subpattern.dump(indent + 1, reverse) 2885 2886 def is_empty(self): 2887 return self.subpattern.is_empty() 2888 2889 def __eq__(self, other): 2890 return type(self) is type(other) and (self.subpattern, self.min_count, 2891 self.max_count) == (other.subpattern, other.min_count, 2892 other.max_count) 2893 2894 def max_width(self): 2895 if self.max_count is None: 2896 return UNLIMITED 2897 2898 return self.subpattern.max_width() * self.max_count 2899 2900 def get_required_string(self, reverse): 2901 max_count = UNLIMITED if self.max_count is None else self.max_count 2902 if self.min_count == 0: 2903 w = self.subpattern.max_width() * max_count 2904 return min(w, UNLIMITED), None 2905 2906 ofs, req = self.subpattern.get_required_string(reverse) 2907 if req: 2908 return ofs, req 2909 2910 w = self.subpattern.max_width() * max_count 2911 return min(w, UNLIMITED), None 2912 2913class PossessiveRepeat(GreedyRepeat): 2914 def is_atomic(self): 2915 return True 2916 2917 def _compile(self, reverse, fuzzy): 2918 subpattern = self.subpattern.compile(reverse, fuzzy) 2919 if not subpattern: 2920 return [] 2921 2922 repeat = [self._opcode, self.min_count] 2923 if self.max_count is None: 2924 repeat.append(UNLIMITED) 2925 else: 2926 repeat.append(self.max_count) 2927 2928 return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ), 2929 (OP.END, )]) 2930 2931 def dump(self, indent, reverse): 2932 print("%sATOMIC" % (INDENT * indent)) 2933 2934 if self.max_count is None: 2935 limit = "INF" 2936 else: 2937 limit = self.max_count 2938 print("%s%s %s %s" % (INDENT * (indent + 1), self._op_name, 2939 self.min_count, limit)) 2940 2941 self.subpattern.dump(indent + 2, reverse) 2942 2943class Group(RegexBase): 2944 def __init__(self, info, group, subpattern): 2945 
RegexBase.__init__(self) 2946 self.info = info 2947 self.group = group 2948 self.subpattern = subpattern 2949 2950 self.call_ref = None 2951 2952 def fix_groups(self, pattern, reverse, fuzzy): 2953 self.info.defined_groups[self.group] = (self, reverse, fuzzy) 2954 self.subpattern.fix_groups(pattern, reverse, fuzzy) 2955 2956 def optimise(self, info, reverse): 2957 subpattern = self.subpattern.optimise(info, reverse) 2958 2959 return Group(self.info, self.group, subpattern) 2960 2961 def pack_characters(self, info): 2962 self.subpattern = self.subpattern.pack_characters(info) 2963 return self 2964 2965 def remove_captures(self): 2966 return self.subpattern.remove_captures() 2967 2968 def is_atomic(self): 2969 return self.subpattern.is_atomic() 2970 2971 def can_be_affix(self): 2972 return False 2973 2974 def contains_group(self): 2975 return True 2976 2977 def get_firstset(self, reverse): 2978 return self.subpattern.get_firstset(reverse) 2979 2980 def has_simple_start(self): 2981 return self.subpattern.has_simple_start() 2982 2983 def _compile(self, reverse, fuzzy): 2984 code = [] 2985 2986 key = self.group, reverse, fuzzy 2987 ref = self.info.call_refs.get(key) 2988 if ref is not None: 2989 code += [(OP.CALL_REF, ref)] 2990 2991 public_group = private_group = self.group 2992 if private_group < 0: 2993 public_group = self.info.private_groups[private_group] 2994 private_group = self.info.group_count - private_group 2995 2996 code += ([(OP.GROUP, int(not reverse), private_group, public_group)] + 2997 self.subpattern.compile(reverse, fuzzy) + [(OP.END, )]) 2998 2999 if ref is not None: 3000 code += [(OP.END, )] 3001 3002 return code 3003 3004 def dump(self, indent, reverse): 3005 group = self.group 3006 if group < 0: 3007 group = private_groups[group] 3008 print "%sGROUP %s" % (INDENT * indent, group) 3009 self.subpattern.dump(indent + 1, reverse) 3010 3011 def __eq__(self, other): 3012 return (type(self) is type(other) and (self.group, self.subpattern) == 3013 
(other.group, other.subpattern)) 3014 3015 def max_width(self): 3016 return self.subpattern.max_width() 3017 3018 def get_required_string(self, reverse): 3019 return self.subpattern.get_required_string(reverse) 3020 3021 def __del__(self): 3022 self.info = None 3023 3024class Keep(ZeroWidthBase): 3025 _opcode = OP.KEEP 3026 _op_name = "KEEP" 3027 3028class LazyRepeat(GreedyRepeat): 3029 _opcode = OP.LAZY_REPEAT 3030 _op_name = "LAZY_REPEAT" 3031 3032class LookAround(RegexBase): 3033 _dir_text = {False: "AHEAD", True: "BEHIND"} 3034 3035 def __init__(self, behind, positive, subpattern): 3036 RegexBase.__init__(self) 3037 self.behind = bool(behind) 3038 self.positive = bool(positive) 3039 self.subpattern = subpattern 3040 3041 def fix_groups(self, pattern, reverse, fuzzy): 3042 self.subpattern.fix_groups(pattern, self.behind, fuzzy) 3043 3044 def optimise(self, info, reverse): 3045 subpattern = self.subpattern.optimise(info, self.behind) 3046 if self.positive and subpattern.is_empty(): 3047 return subpattern 3048 3049 return LookAround(self.behind, self.positive, subpattern) 3050 3051 def pack_characters(self, info): 3052 self.subpattern = self.subpattern.pack_characters(info) 3053 return self 3054 3055 def remove_captures(self): 3056 return self.subpattern.remove_captures() 3057 3058 def is_atomic(self): 3059 return self.subpattern.is_atomic() 3060 3061 def can_be_affix(self): 3062 return self.subpattern.can_be_affix() 3063 3064 def contains_group(self): 3065 return self.subpattern.contains_group() 3066 3067 def get_firstset(self, reverse): 3068 if self.positive and self.behind == reverse: 3069 return self.subpattern.get_firstset(reverse) 3070 3071 return set([None]) 3072 3073 def _compile(self, reverse, fuzzy): 3074 flags = 0 3075 if self.positive: 3076 flags |= POSITIVE_OP 3077 if fuzzy: 3078 flags |= FUZZY_OP 3079 if reverse: 3080 flags |= REVERSE_OP 3081 3082 return ([(OP.LOOKAROUND, flags, int(not self.behind))] + 3083 self.subpattern.compile(self.behind) + 
[(OP.END, )]) 3084 3085 def dump(self, indent, reverse): 3086 print "%sLOOK%s %s" % (INDENT * indent, self._dir_text[self.behind], 3087 POS_TEXT[self.positive]) 3088 self.subpattern.dump(indent + 1, self.behind) 3089 3090 def is_empty(self): 3091 return self.positive and self.subpattern.is_empty() 3092 3093 def __eq__(self, other): 3094 return type(self) is type(other) and (self.behind, self.positive, 3095 self.subpattern) == (other.behind, other.positive, other.subpattern) 3096 3097 def max_width(self): 3098 return 0 3099 3100class LookAroundConditional(RegexBase): 3101 _dir_text = {False: "AHEAD", True: "BEHIND"} 3102 3103 def __init__(self, behind, positive, subpattern, yes_item, no_item): 3104 RegexBase.__init__(self) 3105 self.behind = bool(behind) 3106 self.positive = bool(positive) 3107 self.subpattern = subpattern 3108 self.yes_item = yes_item 3109 self.no_item = no_item 3110 3111 def fix_groups(self, pattern, reverse, fuzzy): 3112 self.subpattern.fix_groups(pattern, reverse, fuzzy) 3113 self.yes_item.fix_groups(pattern, reverse, fuzzy) 3114 self.no_item.fix_groups(pattern, reverse, fuzzy) 3115 3116 def optimise(self, info, reverse): 3117 subpattern = self.subpattern.optimise(info, self.behind) 3118 yes_item = self.yes_item.optimise(info, self.behind) 3119 no_item = self.no_item.optimise(info, self.behind) 3120 3121 return LookAroundConditional(self.behind, self.positive, subpattern, 3122 yes_item, no_item) 3123 3124 def pack_characters(self, info): 3125 self.subpattern = self.subpattern.pack_characters(info) 3126 self.yes_item = self.yes_item.pack_characters(info) 3127 self.no_item = self.no_item.pack_characters(info) 3128 return self 3129 3130 def remove_captures(self): 3131 self.subpattern = self.subpattern.remove_captures() 3132 self.yes_item = self.yes_item.remove_captures() 3133 self.no_item = self.no_item.remove_captures() 3134 3135 def is_atomic(self): 3136 return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and 3137 
self.no_item.is_atomic()) 3138 3139 def can_be_affix(self): 3140 return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix() 3141 and self.no_item.can_be_affix()) 3142 3143 def contains_group(self): 3144 return (self.subpattern.contains_group() or 3145 self.yes_item.contains_group() or self.no_item.contains_group()) 3146 3147 def _compile(self, reverse, fuzzy): 3148 code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))] 3149 code.extend(self.subpattern.compile(self.behind, fuzzy)) 3150 code.append((OP.NEXT, )) 3151 code.extend(self.yes_item.compile(reverse, fuzzy)) 3152 add_code = self.no_item.compile(reverse, fuzzy) 3153 if add_code: 3154 code.append((OP.NEXT, )) 3155 code.extend(add_code) 3156 3157 code.append((OP.END, )) 3158 3159 return code 3160 3161 def dump(self, indent, reverse): 3162 print("%sCONDITIONAL %s %s" % (INDENT * indent, 3163 self._dir_text[self.behind], POS_TEXT[self.positive])) 3164 self.subpattern.dump(indent + 1, self.behind) 3165 print("%sEITHER" % (INDENT * indent)) 3166 self.yes_item.dump(indent + 1, reverse) 3167 if not self.no_item.is_empty(): 3168 print("%sOR" % (INDENT * indent)) 3169 self.no_item.dump(indent + 1, reverse) 3170 3171 def is_empty(self): 3172 return (self.subpattern.is_empty() and self.yes_item.is_empty() or 3173 self.no_item.is_empty()) 3174 3175 def __eq__(self, other): 3176 return type(self) is type(other) and (self.subpattern, self.yes_item, 3177 self.no_item) == (other.subpattern, other.yes_item, other.no_item) 3178 3179 def max_width(self): 3180 return max(self.yes_item.max_width(), self.no_item.max_width()) 3181 3182 def get_required_string(self, reverse): 3183 return self.max_width(), None 3184 3185class PrecompiledCode(RegexBase): 3186 def __init__(self, code): 3187 self.code = code 3188 3189 def _compile(self, reverse, fuzzy): 3190 return [tuple(self.code)] 3191 3192class Property(RegexBase): 3193 _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False): 3194 OP.PROPERTY_IGN, 
(FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False): 3195 OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True): 3196 OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE, 3197 True): OP.PROPERTY_IGN_REV} 3198 3199 def __init__(self, value, positive=True, case_flags=NOCASE, 3200 zerowidth=False): 3201 RegexBase.__init__(self) 3202 self.value = value 3203 self.positive = bool(positive) 3204 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] 3205 self.zerowidth = bool(zerowidth) 3206 3207 self._key = (self.__class__, self.value, self.positive, 3208 self.case_flags, self.zerowidth) 3209 3210 def rebuild(self, positive, case_flags, zerowidth): 3211 return Property(self.value, positive, case_flags, zerowidth) 3212 3213 def optimise(self, info, reverse, in_set=False): 3214 return self 3215 3216 def get_firstset(self, reverse): 3217 return set([self]) 3218 3219 def has_simple_start(self): 3220 return True 3221 3222 def _compile(self, reverse, fuzzy): 3223 flags = 0 3224 if self.positive: 3225 flags |= POSITIVE_OP 3226 if self.zerowidth: 3227 flags |= ZEROWIDTH_OP 3228 if fuzzy: 3229 flags |= FUZZY_OP 3230 return [(self._opcode[self.case_flags, reverse], flags, self.value)] 3231 3232 def dump(self, indent, reverse): 3233 prop = PROPERTY_NAMES[self.value >> 16] 3234 name, value = prop[0], prop[1][self.value & 0xFFFF] 3235 print "%sPROPERTY %s %s:%s%s" % (INDENT * indent, 3236 POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags]) 3237 3238 def matches(self, ch): 3239 return _regex.has_property_value(self.value, ch) == self.positive 3240 3241 def max_width(self): 3242 return 1 3243 3244class Prune(ZeroWidthBase): 3245 _op_name = "PRUNE" 3246 3247 def _compile(self, reverse, fuzzy): 3248 return [(OP.PRUNE, )] 3249 3250class Range(RegexBase): 3251 _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN, 3252 (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN, 3253 (NOCASE, True): OP.RANGE_REV, 
(IGNORECASE, True): OP.RANGE_IGN_REV, 3254 (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV} 3255 _op_name = "RANGE" 3256 3257 def __init__(self, lower, upper, positive=True, case_flags=NOCASE, 3258 zerowidth=False): 3259 RegexBase.__init__(self) 3260 self.lower = lower 3261 self.upper = upper 3262 self.positive = bool(positive) 3263 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] 3264 self.zerowidth = bool(zerowidth) 3265 3266 self._key = (self.__class__, self.lower, self.upper, self.positive, 3267 self.case_flags, self.zerowidth) 3268 3269 def rebuild(self, positive, case_flags, zerowidth): 3270 return Range(self.lower, self.upper, positive, case_flags, zerowidth) 3271 3272 def optimise(self, info, reverse, in_set=False): 3273 # Is the range case-sensitive? 3274 if not self.positive or not (self.case_flags & IGNORECASE) or in_set: 3275 return self 3276 3277 # Is full case-folding possible? 3278 if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) != 3279 FULLIGNORECASE): 3280 return self 3281 3282 # Get the characters which expand to multiple codepoints on folding. 3283 expanding_chars = _regex.get_expand_on_folding() 3284 3285 # Get the folded characters in the range. 3286 items = [] 3287 for ch in expanding_chars: 3288 if self.lower <= ord(ch) <= self.upper: 3289 folded = _regex.fold_case(FULL_CASE_FOLDING, ch) 3290 items.append(String([ord(c) for c in folded], 3291 case_flags=self.case_flags)) 3292 3293 if not items: 3294 # We can fall back to simple case-folding. 3295 return self 3296 3297 if len(items) < self.upper - self.lower + 1: 3298 # Not all the characters are covered by the full case-folding. 
3299 items.insert(0, self) 3300 3301 return Branch(items) 3302 3303 def _compile(self, reverse, fuzzy): 3304 flags = 0 3305 if self.positive: 3306 flags |= POSITIVE_OP 3307 if self.zerowidth: 3308 flags |= ZEROWIDTH_OP 3309 if fuzzy: 3310 flags |= FUZZY_OP 3311 return [(self._opcode[self.case_flags, reverse], flags, self.lower, 3312 self.upper)] 3313 3314 def dump(self, indent, reverse): 3315 display_lower = repr(unichr(self.lower)).lstrip("bu") 3316 display_upper = repr(unichr(self.upper)).lstrip("bu") 3317 print "%sRANGE %s %s %s%s" % (INDENT * indent, POS_TEXT[self.positive], 3318 display_lower, display_upper, CASE_TEXT[self.case_flags]) 3319 3320 def matches(self, ch): 3321 return (self.lower <= ch <= self.upper) == self.positive 3322 3323 def max_width(self): 3324 return 1 3325 3326class RefGroup(RegexBase): 3327 _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False): 3328 OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE, 3329 False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE, 3330 True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV, 3331 (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV} 3332 3333 def __init__(self, info, group, position, case_flags=NOCASE): 3334 RegexBase.__init__(self) 3335 self.info = info 3336 self.group = group 3337 self.position = position 3338 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] 3339 3340 self._key = self.__class__, self.group, self.case_flags 3341 3342 def fix_groups(self, pattern, reverse, fuzzy): 3343 try: 3344 self.group = int(self.group) 3345 except ValueError: 3346 try: 3347 self.group = self.info.group_index[self.group] 3348 except KeyError: 3349 raise error("unknown group", pattern, self.position) 3350 3351 if not 1 <= self.group <= self.info.group_count: 3352 raise error("invalid group reference", pattern, self.position) 3353 3354 self._key = self.__class__, self.group, self.case_flags 3355 3356 def remove_captures(self): 3357 raise error("group reference 
not allowed", pattern, self.position) 3358 3359 def _compile(self, reverse, fuzzy): 3360 flags = 0 3361 if fuzzy: 3362 flags |= FUZZY_OP 3363 return [(self._opcode[self.case_flags, reverse], flags, self.group)] 3364 3365 def dump(self, indent, reverse): 3366 print "%sREF_GROUP %s%s" % (INDENT * indent, self.group, 3367 CASE_TEXT[self.case_flags]) 3368 3369 def max_width(self): 3370 return UNLIMITED 3371 3372 def __del__(self): 3373 self.info = None 3374 3375class SearchAnchor(ZeroWidthBase): 3376 _opcode = OP.SEARCH_ANCHOR 3377 _op_name = "SEARCH_ANCHOR" 3378 3379class Sequence(RegexBase): 3380 def __init__(self, items=None): 3381 RegexBase.__init__(self) 3382 if items is None: 3383 items = [] 3384 3385 self.items = items 3386 3387 def fix_groups(self, pattern, reverse, fuzzy): 3388 for s in self.items: 3389 s.fix_groups(pattern, reverse, fuzzy) 3390 3391 def optimise(self, info, reverse): 3392 # Flatten the sequences. 3393 items = [] 3394 for s in self.items: 3395 s = s.optimise(info, reverse) 3396 if isinstance(s, Sequence): 3397 items.extend(s.items) 3398 else: 3399 items.append(s) 3400 3401 return make_sequence(items) 3402 3403 def pack_characters(self, info): 3404 "Packs sequences of characters into strings." 3405 items = [] 3406 characters = [] 3407 case_flags = NOCASE 3408 for s in self.items: 3409 if type(s) is Character and s.positive and not s.zerowidth: 3410 if s.case_flags != case_flags: 3411 # Different case sensitivity, so flush, unless neither the 3412 # previous nor the new character are cased. 3413 if s.case_flags or is_cased_i(info, s.value): 3414 Sequence._flush_characters(info, characters, 3415 case_flags, items) 3416 3417 case_flags = s.case_flags 3418 3419 characters.append(s.value) 3420 elif type(s) is String or type(s) is Literal: 3421 if s.case_flags != case_flags: 3422 # Different case sensitivity, so flush, unless the neither 3423 # the previous nor the new string are cased. 
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                    case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # Not a plain character/string node: flush what we've
                # gathered, then recurse into the item.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def contains_group(self):
        return any(s.contains_group() for s in self.items)

    def get_firstset(self, reverse):
        # Accumulate first sets item by item; None in a first set means the
        # item can match an empty string, so the next item must be included
        # too.
        fs = set()
        items = self.items
        if reverse:
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        # Every item could match an empty string, so the sequence can too.
        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        "Turns the gathered characters into a Character/String node, emptying the list."
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, u''.join(unichr(c)
          for c in characters)).lower()
        chunks = []

        # Record the span of every occurrence of an expanding character's
        # folded form within the folded literal.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Alternate simple-folding literals with full-folding ones.
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        "Merges overlapping (start, end) chunks into disjoint spans."
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        # Returns (offset, node) for the first required string found, or
        # (offset, None) if there isn't one.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None

class SetBase(RegexBase):
    "Base class for the character set operations (union, difference, etc.)."
    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        "Makes a copy of this node with different flags."
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print "%s%s %s%s" % (INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags])
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        # The widest a match can be is the longest folded expansion.
        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None

class SetDiff(SetBase):
    "Set difference: matches the first item but not the second."
    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = self.items
        if len(items) > 2:
            # A difference of more than 2 items is the first item minus the
            # union of the rest.
            items = [items[0], SetUnion(info, items[1 : ])]

        if len(items) == 1:
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(m.optimise(info, reverse, in_set=True) for m in
          items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = self.items[0].matches(ch) and not self.items[1].matches(ch)
        return m == self.positive

class SetInter(SetBase):
    "Set intersection: matches only if all of the items match."
    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetInter) and m.positive:
                # Intersection in intersection.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = all(i.matches(ch) for i in self.items)
        return m == self.positive

class SetSymDiff(SetBase):
    "Set symmetric difference: matches if an odd number of the items match."
    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetSymDiff) and m.positive:
                # Symmetric difference in symmetric difference.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR across the items: true if an odd number of items match.
        m = False
        for i in self.items:
            m = m != i.matches(ch)

        return m == self.positive

class SetUnion(SetBase):
    "Set union: matches if any of the items match."
    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Gather the single-character members per polarity so they can be
        # emitted as one CHARACTER/STRING member each.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive

class Skip(ZeroWidthBase):
    "The SKIP backtracking control verb (see the VERBS table)."
    _op_name = "SKIP"
    _opcode = OP.SKIP

class StartOfLine(ZeroWidthBase):
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"

class StartOfLineU(StartOfLine):
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"

class StartOfString(ZeroWidthBase):
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"

class StartOfWord(ZeroWidthBase):
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"

class String(RegexBase):
    "A string of codepoints, stored along with its case-folded form."
    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-compute the full case-folded form of the string.
            folded_characters = []
            for char in self.characters:
                folded = _regex.fold_case(FULL_CASE_FOLDING, unichr(char))
                folded_characters.extend(ord(c) for c in folded)
        else:
            folded_characters = self.characters

        self.folded_characters = tuple(folded_characters)
        # Set to True later if this turns out to be the pattern's required
        # string (see _get_required_string).
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        if reverse:
            pos = -1
        else:
            pos = 0
        return set([Character(self.characters[pos],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
          len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        display = repr("".join(unichr(c) for c in self.characters)).lstrip("bu")
        print "%sSTRING %s%s" % (INDENT * indent, display,
          CASE_TEXT[self.case_flags])

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        return 0, self

class Literal(String):
    "A literal chunk produced when splitting a string for full case-folding."
    def dump(self, indent, reverse):
        literal = ''.join(unichr(c) for c in self.characters)
        display = repr(literal).lstrip("bu")
        print "%sLITERAL MATCH %s%s" % (INDENT * indent, display,
          CASE_TEXT[self.case_flags])

class StringSet(Branch):
    "A named list of strings, matched as a branch of alternatives."
    def __init__(self, info, name,
      case_flags=NOCASE):
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        # Assign this named list a stable index within the compiled pattern.
        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        index = self.info.named_lists_used[self.set_key]
        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        encoding = self.info.flags & _ALL_ENCODINGS
        fold_flags = encoding | case_flags

        choices = []
        for string in items:
            choices.append([Character(ord(c), case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print "%sSTRING_SET %s%s" % (INDENT * indent, self.name,
          CASE_TEXT[self.case_flags])

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None

class Source(object):
    "Scanner for the regular expression source string."
    def __init__(self, string):
        if isinstance(string, unicode):
            self.string = string
            self.char_type = unichr
        else:
            self.string = string
            self.char_type = chr

        self.pos = 0
        # When True (VERBOSE mode), whitespace and #-comments are skipped.
        self.ignore_space = False
        # An empty string of the same type as the source string.
        self.sep = string[ : 0]

    def get(self):
        "Returns the next character, or an empty string at the end."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        "Returns up to count characters."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True):
        "Returns characters while their membership of test_set equals include."
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        "Skips characters while their membership of test_set equals include."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        "Tries to match substring here, advancing past it on success."
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
4158 return False 4159 else: 4160 if not string.startswith(substring, pos): 4161 return False 4162 4163 self.pos = pos + len(substring) 4164 4165 return True 4166 4167 def expect(self, substring): 4168 if not self.match(substring): 4169 raise error("missing %s" % substring, self.string, self.pos) 4170 4171 def at_end(self): 4172 string = self.string 4173 pos = self.pos 4174 4175 try: 4176 if self.ignore_space: 4177 while True: 4178 if string[pos].isspace(): 4179 pos += 1 4180 elif string[pos] == "#": 4181 pos = string.index("\n", pos) 4182 else: 4183 break 4184 4185 return pos >= len(string) 4186 except IndexError: 4187 # We've reached the end of the string. 4188 return True 4189 except ValueError: 4190 # The comment extended to the end of the string. 4191 return True 4192 4193class Info(object): 4194 "Info about the regular expression." 4195 4196 def __init__(self, flags=0, char_type=None, kwargs={}): 4197 flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION] 4198 self.flags = flags 4199 self.global_flags = flags 4200 self.inline_locale = False 4201 4202 self.kwargs = kwargs 4203 4204 self.group_count = 0 4205 self.group_index = {} 4206 self.group_name = {} 4207 self.char_type = char_type 4208 self.named_lists_used = {} 4209 self.open_groups = [] 4210 self.open_group_count = {} 4211 self.defined_groups = {} 4212 self.group_calls = [] 4213 self.private_groups = {} 4214 4215 def open_group(self, name=None): 4216 group = self.group_index.get(name) 4217 if group is None: 4218 while True: 4219 self.group_count += 1 4220 if name is None or self.group_count not in self.group_name: 4221 break 4222 4223 group = self.group_count 4224 if name: 4225 self.group_index[name] = group 4226 self.group_name[group] = name 4227 4228 if group in self.open_groups: 4229 # We have a nested named group. We'll assign it a private group 4230 # number, initially negative until we can assign a proper 4231 # (positive) number. 
4232 group_alias = -(len(self.private_groups) + 1) 4233 self.private_groups[group_alias] = group 4234 group = group_alias 4235 4236 self.open_groups.append(group) 4237 self.open_group_count[group] = self.open_group_count.get(group, 0) + 1 4238 4239 return group 4240 4241 def close_group(self): 4242 self.open_groups.pop() 4243 4244 def is_open_group(self, name): 4245 # In version 1, a group reference can refer to an open group. We'll 4246 # just pretend the group isn't open. 4247 version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION 4248 if version == VERSION1: 4249 return False 4250 4251 if name.isdigit(): 4252 group = int(name) 4253 else: 4254 group = self.group_index.get(name) 4255 4256 return group in self.open_groups 4257 4258def _check_group_features(info, parsed): 4259 """Checks whether the reverse and fuzzy features of the group calls match 4260 the groups which they call. 4261 """ 4262 call_refs = {} 4263 additional_groups = [] 4264 for call, reverse, fuzzy in info.group_calls: 4265 # Look up the reference of this group call. 4266 key = (call.group, reverse, fuzzy) 4267 ref = call_refs.get(key) 4268 if ref is None: 4269 # This group doesn't have a reference yet, so look up its features. 4270 if call.group == 0: 4271 # Calling the pattern as a whole. 4272 rev = bool(info.flags & REVERSE) 4273 fuz = isinstance(parsed, Fuzzy) 4274 if (rev, fuz) != (reverse, fuzzy): 4275 # The pattern as a whole doesn't have the features we want, 4276 # so we'll need to make a copy of it with the desired 4277 # features. 4278 additional_groups.append((CallRef(len(call_refs), parsed), 4279 reverse, fuzzy)) 4280 else: 4281 # Calling a capture group. 4282 def_info = info.defined_groups[call.group] 4283 group = def_info[0] 4284 if def_info[1 : ] != (reverse, fuzzy): 4285 # The group doesn't have the features we want, so we'll 4286 # need to make a copy of it with the desired features. 
                    additional_groups.append((group, reverse, fuzzy))

            ref = len(call_refs)
            call_refs[key] = ref

        call.call_ref = ref

    info.call_refs = call_refs
    info.additional_groups = additional_groups

def _get_required_string(parsed, flags):
    "Gets the required string and related info of a parsed pattern."

    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
    if required:
        # Tell the node that it's the pattern's required string.
        required.required = True
        if req_offset >= UNLIMITED:
            req_offset = -1

        req_flags = required.case_flags
        if not (flags & UNICODE):
            req_flags &= ~UNICODE

        req_chars = required.folded_characters
    else:
        # No required string.
        req_offset = 0
        req_chars = ()
        req_flags = 0

    return req_offset, req_chars, req_flags

class Scanner:
    "A lexical scanner built from a lexicon of (pattern, action) pairs."
    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern.
        info = Info(flags)
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(patterns))

    def scan(self, string):
        "Scans string, returning the actions' results and the unmatched tail."
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # A zero-width match would loop forever, so stop.
                break
            action = self.lexicon[m.lastindex - 1][1]
            if hasattr(action, '__call__'):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]

# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict: for each property id, keep its
# longest name plus a dict mapping each value id to its longest value name.
PROPERTY_NAMES = {}
for prop_alias, (prop_ident, value_map) in PROPERTIES.items():
    longest_name, value_names = PROPERTY_NAMES.get(prop_ident, ("", {}))
    if len(prop_alias) > len(longest_name):
        longest_name = prop_alias
    PROPERTY_NAMES[prop_ident] = longest_name, value_names

    for value_alias, value_ident in value_map.items():
        if len(value_alias) > len(value_names.get(value_ident, "")):
            value_names[value_ident] = value_alias

# Character escape sequences.
CHARACTER_ESCAPES = {
    "a": "\a",
    "b": "\b",
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
    "v": "\v",
}

# Predefined character set escape sequences.
CHARSET_ESCAPES = {
    "d": lookup_property(None, "Digit", True),
    "D": lookup_property(None, "Digit", False),
    "h": lookup_property(None, "Blank", True),
    "s": lookup_property(None, "Space", True),
    "S": lookup_property(None, "Space", False),
    "w": lookup_property(None, "Word", True),
    "W": lookup_property(None, "Word", False),
}

# Positional escape sequences.
POSITION_ESCAPES = {
    "A": StartOfString(),
    "b": Boundary(),
    "B": Boundary(False),
    "K": Keep(),
    "m": StartOfWord(),
    "M": EndOfWord(),
    "Z": EndOfString(),
}

# Positional escape sequences when WORD flag set.
WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES)
WORD_POSITION_ESCAPES.update({
    "b": DefaultBoundary(),
    "B": DefaultBoundary(False),
    "m": DefaultStartOfWord(),
    "M": DefaultEndOfWord(),
})

# Regex control verbs.
VERBS = {
    "FAIL": Failure(),
    "F": Failure(),
    "PRUNE": Prune(),
    "SKIP": Skip(),
}