1#
2# Secret Labs' Regular Expression Engine core module
3#
4# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
5#
6# This version of the SRE library can be redistributed under CNRI's
7# Python 1.6 license.  For any other use, please contact Secret Labs
8# AB (info@pythonware.com).
9#
10# Portions of this engine have been developed in cooperation with
11# CNRI.  Hewlett-Packard provided funding for 1.6 integration and
12# other compatibility work.
13#
14# 2010-01-16 mrab Python front-end re-written and extended
15
16import string
17import sys
18import unicodedata
19from collections import defaultdict
20
21import _regex
22
23__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
24  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
25  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
26  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
27  "Scanner"]
28
# The regex exception.
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        # The pattern may be a text string or a bytes string; pick the
        # matching newline so .count/.rfind below don't mix string types.
        # (Testing against str works on both Python 2 and 3; the original
        # tested against the `unicode` builtin, which Python 3 lacks.)
        newline = '\n' if isinstance(pattern, str) else b'\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            # Line numbers are 1-based.  rfind returns -1 when pos is on
            # the first line, which conveniently makes the column 1-based
            # as well.
            self.lineno = pattern.count(newline, 0, pos) + 1
            self.colno = pos - pattern.rfind(newline, 0, pos)

            message = "%s at position %d" % (message, pos)

            # Only mention line/column when the pattern is multi-line.
            if newline in pattern:
                message += " (line %d, column %d)" % (self.lineno, self.colno)

        Exception.__init__(self, message)
57
# The exception for when a positional flag has been turned on in the old
# behaviour.
class _UnscopedFlagSet(Exception):
    """Internal: a global-scope flag was set inline under old (V0) behaviour."""
    pass
62
# The exception for when parsing fails and we want to try something else.
class ParseError(Exception):
    """Internal: the current parse attempt failed; the caller backtracks and
    tries an alternative interpretation."""
    pass
66
# The exception for when there isn't a valid first set.
class _FirstSetError(Exception):
    """Internal: a valid firstset could not be built for the pattern."""
    pass
70
# Flags.  Each flag is a distinct bit so that flags can be combined with
# "|" (see GLOBAL_FLAGS/SCOPED_FLAGS below).
A = ASCII = 0x80          # Assume ASCII locale.
B = BESTMATCH = 0x1000    # Best fuzzy match.
D = DEBUG = 0x200         # Print parsed pattern.
E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first
                          # fuzzy match.
F = FULLCASE = 0x4000     # Unicode full case-folding.
I = IGNORECASE = 0x2      # Ignore case.
L = LOCALE = 0x4          # Assume current 8-bit locale.
M = MULTILINE = 0x8       # Make anchors look for newline.
P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
R = REVERSE = 0x400       # Search backwards.
S = DOTALL = 0x10         # Make dot match newline.
U = UNICODE = 0x20        # Assume Unicode locale.
V0 = VERSION0 = 0x2000    # Old legacy behaviour.
V1 = VERSION1 = 0x100     # New enhanced behaviour.
W = WORD = 0x800          # Default Unicode word breaks.
X = VERBOSE = 0x40        # Ignore whitespace and comments.
T = TEMPLATE = 0x1        # Template (present because re module has it).

DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}
98
# The mask for the flags.
GLOBAL_FLAGS = (_ALL_ENCODINGS | _ALL_VERSIONS | BESTMATCH | DEBUG |
  ENHANCEMATCH | POSIX | REVERSE)
SCOPED_FLAGS = FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE

# Character classes used by the parser.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# The empty string is included so that end-of-pattern (source.get()
# returning "") is treated as a special character too.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# FULLCASE on its own maps to 0: it has an effect only combined with
# IGNORECASE.
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
139
# The names of the opcodes.  Each name's position in this list becomes its
# numeric opcode value (see the enumerate() loop that builds OP below).
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
FUZZY_EXT
"""
224
# Define the opcodes in a namespace.
class Namespace(object):
    # An empty class used only as an attribute container.
    pass

OP = Namespace()
for i, op in enumerate(OPCODES.split()):
    # Each opcode's value is its index in the OPCODES list, eg. OP.FAILURE == 0.
    setattr(OP, op, i)
232
def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
    """Make room in the given cache.

    Args:
        cache_dict: The cache dictionary to modify.
        args_dict: The dictionary of named list args used by patterns.
        locale_sensitive: The dictionary of locale-sensitivity entries, keyed
            by (pattern_type, pattern); rebuilt alongside args_dict so that
            both only keep entries for patterns still in the cache.
        max_length: Maximum # of entries in cache_dict before it is shrunk.
        divisor: Cache will shrink to max_length - 1/divisor*max_length items.
    """
    # Toss out a fraction of the entries at random to make room for new ones.
    # A random algorithm was chosen as opposed to simply cache_dict.popitem()
    # as popitem could penalize the same regular expression repeatedly based
    # on its internal hash value.  Being random should spread the cache miss
    # love around.
    cache_keys = tuple(cache_dict.keys())
    overage = len(cache_keys) - max_length
    if overage < 0:
        # Cache is already within limits.  Normally this should not happen
        # but it could due to multithreading.
        return

    number_to_toss = max_length // divisor + overage

    # The import is done here to avoid a circular dependency.
    import random
    if not hasattr(random, 'sample'):
        # Do nothing while resolving the circular dependency:
        #  re->random->warnings->tokenize->string->re
        return

    for doomed_key in random.sample(cache_keys, number_to_toss):
        try:
            del cache_dict[doomed_key]
        except KeyError:
            # Ignore problems if the cache changed from another thread.
            pass

    # Rebuild the arguments and locale-sensitivity dictionaries from the
    # surviving entries.  Each cache key is a 6-tuple.
    args_dict.clear()
    sensitivity_dict = {}
    for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
        args_dict[pattern, pattern_type, flags, default_version, locale] = args
        try:
            sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
        except KeyError:
            pass

    locale_sensitive.clear()
    locale_sensitive.update(sensitivity_dict)
282
def _fold_case(info, string):
    "Folds the case of a string."
    flags = info.flags
    # When none of ASCII/LOCALE/UNICODE was given explicitly, fall back
    # to the encoding guessed for the pattern.
    if not flags & _ALL_ENCODINGS:
        flags |= info.guess_encoding
    return _regex.fold_case(flags, string)
290
def is_cased_i(info, char):
    "Checks whether a character is cased."
    # A cased character has more than one case variant.
    all_cases = _regex.get_all_cases(info.flags, char)
    return len(all_cases) > 1
294
def is_cased_f(flags, char):
    "Checks whether a character is cased."
    # Like is_cased_i, but takes the flags directly rather than an info.
    all_cases = _regex.get_all_cases(flags, char)
    return len(all_cases) > 1
298
def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)
    if checked:
        # Compile the firstset.
        return checked.compile(reverse)
    # No usable firstset.
    return []
308
def _check_firstset(info, reverse, fs):
    "Checks the firstset for the pattern."
    # A None member means some branch's first item is unknown, so no
    # firstset can be built.
    if not fs or None in fs:
        return None

    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for i in fs:
        if isinstance(i, Character) and not i.positive:
            # A negated character can't be used in a firstset.
            return None

#        if i.case_flags:
#            if isinstance(i, Character):
#                if is_cased_i(info, i.value):
#                    return []
#            elif isinstance(i, SetBase):
#                return []
        case_flags |= i.case_flags
        members.add(i.with_flags(case_flags=NOCASE))

    # Full case-folding present: skip the firstset for simplicity.
    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    fs = fs.optimise(info, reverse, in_set=True)

    return fs
339
340def _flatten_code(code):
341    "Flattens the code from a list of tuples."
342    flat_code = []
343    for c in code:
344        flat_code.extend(c)
345
346    return flat_code
347
def make_case_flags(info):
    "Makes the case flags."
    case_flags = info.flags & CASE_FLAGS
    if info.flags & ASCII:
        # Full case-folding is a Unicode notion; turn FULLCASE off when
        # the ASCII locale is in force.
        case_flags &= ~FULLCASE
    return case_flags
357
def make_character(info, value, in_set=False):
    "Makes a character literal."
    if not in_set:
        return Character(value, case_flags=make_case_flags(info))
    # A character set is built case-sensitively.
    return Character(value)
365
def make_ref_group(info, name, position):
    "Makes a group reference."
    case_flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=case_flags)
369
def make_string_set(info, name):
    "Makes a string set."
    case_flags = make_case_flags(info)
    return StringSet(info, name, case_flags=case_flags)
373
def make_property(info, prop, in_set):
    "Makes a property."
    if not in_set:
        return prop.with_flags(case_flags=make_case_flags(info))
    # Inside a set the property is used as-is.
    return prop
380
def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    # Collect the "|"-separated alternatives.
    branches = [parse_sequence(source, info)]
    while source.match("|"):
        branches.append(parse_sequence(source, info))
    return branches[0] if len(branches) == 1 else Branch(branches)
390
def parse_sequence(source, info):
    "Parses a sequence, eg. 'abc'."
    # None entries are sentinels: one marks the start of the sequence, and
    # one is appended after each applied quantifier/constraint so that a
    # following quantifier has no element to attach to (see
    # apply_quantifier).  They are stripped out before returning.
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # Inline flags (or a comment); the case flags may have
                    # changed.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch)
                    if constraints:
                        # It _is_ a fuzzy constraint.
                        apply_constraint(source, info, constraints, case_flags,
                          saved_pos, sequence)
                        sequence.append(None)
                    else:
                        # The element was just a literal.
                        sequence.append(Character(ord(ch),
                          case_flags=case_flags))
            else:
                # A literal.
                sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    # Drop the None sentinels.
    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)
474
def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    # Applies the (min_count, max_count) pair in `counts` to the last
    # element of `sequence`, consuming an optional "?" (lazy) or "+"
    # (possessive) suffix.  `case_flags` and the incoming `ch` are unused
    # here.
    element = sequence.pop()
    if element is None:
        # A None sentinel: either the sequence just started (nothing to
        # repeat) or the previous item was itself a quantifier/constraint.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeated = LazyRepeat
    elif ch == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeated = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat.
        source.pos = saved_pos
        repeated = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number of
    # repeats is fixed at 1.
    if not element.is_empty() and (min_count != 1 or max_count != 1):
        element = repeated(element, min_count, max_count)

    sequence.append(element)
506
def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    element = sequence.pop()
    if element is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    if isinstance(element, Group):
        # A group marked as fuzzy keeps all of the fuzziness inside it.
        element.subpattern = Fuzzy(element.subpattern, constraints)
    else:
        element = Fuzzy(element, constraints)

    sequence.append(element)
520
# Quantifier shorthands mapped to (min_count, max_count); None means unlimited.
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
522
def parse_quantifier(source, info, ch):
    "Parses a quantifier."
    counts = _QUANTIFIERS.get(ch)
    if counts:
        # One of "?", "*" or "+".
        return counts

    if ch == "{":
        # Might be a bounded repeat, eg. 'a{2,3}'.
        counts = parse_limited_quantifier(source)
        if counts:
            return counts

    return None
537
def is_above_limit(count):
    "Checks whether a count is above the maximum."
    if count is None:
        # No limit given.
        return False
    return count >= UNLIMITED
541
def parse_limited_quantifier(source):
    "Parses a limited quantifier, eg. the '{2,3}' part of 'a{2,3}'."
    start = source.pos
    min_str = parse_count(source)
    if source.match(","):
        max_str = parse_count(source)

        # An absent minimum means 0; an absent maximum means unlimited.
        min_count = int(min_str or 0)
        max_count = int(max_str) if max_str else None
    else:
        if not min_str:
            # Not a quantifier after all; backtrack.
            source.pos = start
            return None

        min_count = max_count = int(min_str)

    if not source.match("}"):
        source.pos = start
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, start)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          start)

    return min_count, max_count
571
def parse_fuzzy(source, info, ch):
    "Parses a fuzzy setting, if present."
    # `ch` has already been consumed by the caller; fuzzy settings are
    # written between braces, eg. '{e<=1}'.
    saved_pos = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all; backtrack.
        source.pos = saved_pos
        return None

    if source.match(":"):
        constraints["test"] = parse_fuzzy_test(source, info)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints
595
def parse_fuzzy_item(source, constraints):
    "Parses a fuzzy setting item."
    start = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a simple cost constraint; back up and try a cost equation.
        source.pos = start

        parse_cost_equation(source, constraints)
605
def parse_cost_constraint(source, constraints):
    "Parses a cost constraint."
    # On success, stores an inclusive (min_cost, max_cost) pair in
    # `constraints`; a max_cost of None means unlimited.
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits?  Normalise both to inclusive.
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()
666
def parse_cost_limit(source):
    "Parses a cost limit."
    start = source.pos
    digits = parse_count(source)
    # parse_count returns a (possibly empty) string of digits, so a
    # non-empty result is always a valid int.
    if digits:
        return int(digits)

    raise error("bad fuzzy cost limit", source.string, start)
678
def parse_constraint(source, constraints, ch):
    "Parses a constraint."
    # A constraint is a single letter from "deis" and may appear only once.
    if ch not in "deis" or ch in constraints:
        raise ParseError()

    return ch
688
def parse_fuzzy_compare(source):
    "Parses a cost comparator."
    # True = inclusive ("<="), False = exclusive ("<"), None = absent.
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None
697
def parse_cost_equation(source, constraints):
    "Parses a cost equation."
    # Syntax: term ("+" term)* ("<=" | "<") cost.
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}

    parse_cost_term(source, cost)
    while source.match("+"):
        parse_cost_term(source, cost)

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    max_cost = int(parse_count(source))

    # An exclusive limit ("<") is normalised to an inclusive one.
    if not max_inc:
        max_cost -= 1

    if max_cost < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = max_cost

    constraints["cost"] = cost
724
def parse_cost_term(source, cost):
    "Parses a cost equation term, eg. '2i'."
    coefficient = parse_count(source)
    term = source.get()
    if term not in "dis":
        raise ParseError()

    if term in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient defaults to 1.
    cost[term] = int(coefficient or 1)
736
def parse_fuzzy_test(source, info):
    "Parses a fuzzy test: an escape, '.', a character set or a literal."
    saved_pos = source.pos
    ch = source.get()
    if ch in SPECIAL_CHARS:
        if ch == "\\":
            # An escape sequence outside a set.
            return parse_escape(source, info, False)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                return AnyAll()
            elif info.flags & WORD:
                return AnyU()
            else:
                return Any()
        elif ch == "[":
            # A character set.
            return parse_set(source, info)
        else:
            raise error("expected character set", source.string, saved_pos)
    elif ch:
        # A literal.  BUG FIX: the original referenced an undefined local
        # `case_flags` here, raising NameError; derive the case flags from
        # the current flags, as parse_sequence does for literals.
        return Character(ord(ch), case_flags=make_case_flags(info))
    else:
        raise error("expected character set", source.string, saved_pos)
762
def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the (possibly empty) run of ASCII digits as a string.
    return source.get_while(DIGITS)
766
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.  (parse_comment returns None; parse_sequence treats a None
    result as a possible flags change.)
    """
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get()
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Inline flags are scoped to the group; restore on exit.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...: a verb, but only if the word is alphabetic.
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)
856
def parse_extension(source, info):
    "Parses a Python extension: (?P<name>...), (?P=name) or (?P>name)."
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Inline flags are scoped to the group; restore on exit.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)
892
def parse_comment(source):
    "Parses a comment."
    # Consume characters up to the closing ")"; a backslash escapes the
    # next character.  The position is then rewound so that expect(")")
    # validates and consumes the terminator, raising if the comment is
    # unterminated.
    while True:
        saved_pos = source.pos
        c = source.get()

        if not c or c == ")":
            break

        if c == "\\":
            c = source.get()

    source.pos = saved_pos
    source.expect(")")

    # Comments contribute nothing to the parsed pattern.
    return None
909
def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround."
    # `behind` selects lookbehind vs lookahead; `positive` selects "=" vs
    # "!".  Scoped inline flags must not leak out of the subpattern, hence
    # the save/restore.
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, subpattern)
921
def parse_conditional(source, info):
    "Parses a conditional subpattern."
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

        source.pos = saved_pos
        raise error("expected lookaround conditional", source.string,
          source.pos)

    source.pos = saved_pos
    try:
        # (?(group)...: conditional on a group.
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            # No alternative branch; default to an empty sequence.
            no_branch = Sequence()

        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    if yes_branch.is_empty() and no_branch.is_empty():
        # Both branches are empty, so the conditional is a no-op.
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)
964
def parse_lookaround_conditional(source, info, behind, positive):
    "Parses a lookaround conditional, eg. '(?(?=...)...|...)'."
    # Scoped inline flags must not leak out of the lookaround subpattern.
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    if source.match("|"):
        no_branch = parse_sequence(source, info)
    else:
        # No alternative branch; default to an empty sequence.
        no_branch = Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, subpattern, yes_branch,
      no_branch)
984
def parse_atomic(source, info):
    "Parses an atomic subpattern, eg (?>...)."
    saved_flags = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Any flags set inside the group end with it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(body)
996
def parse_common(source, info):
    "Parses a common groups branch."
    # Capture group numbers in different branches can reuse the group numbers.
    # Each branch restarts numbering from the count at the start of the
    # construct; afterwards the count is the maximum over all branches.
    initial_group_count = info.group_count
    branches = [parse_sequence(source, info)]
    final_group_count = info.group_count
    while source.match("|"):
        info.group_count = initial_group_count
        branches.append(parse_sequence(source, info))
        final_group_count = max(final_group_count, info.group_count)

    info.group_count = final_group_count
    source.expect(")")

    if len(branches) == 1:
        return branches[0]
    return Branch(branches)
1014
def parse_call_group(source, info, ch, pos):
    "Parses a call to a group, eg (?1) or (?R)."
    # "R" calls the entire pattern, which is group 0.
    group = "0" if ch == "R" else ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)
1025
def parse_call_named_group(source, info, pos):
    "Parses a call to a named group, eg (?&name)."
    name = parse_name(source)
    source.expect(")")

    return CallGroup(info, name, pos)
1032
def parse_flag_set(source):
    "Parses a set of inline flags."
    flags = 0

    try:
        while True:
            saved_pos = source.pos
            ch = source.get()
            if ch == "V":
                # The version flags are 2 characters long: "V0" or "V1".
                ch += source.get()
            flags |= REGEX_FLAGS[ch]
    except KeyError:
        # Not a flag character; rewind to just before it. The KeyError is
        # the loop's only exit.
        source.pos = saved_pos

    return flags
1048
def parse_flags(source, info):
    "Parses flags being turned on/off."
    flags_on = parse_flag_set(source)
    if source.match("-"):
        # A "-" must be followed by flags being turned off.
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)
    else:
        flags_off = 0

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off
1065
def parse_subpattern(source, info, flags_on, flags_off):
    "Parses a subpattern with scoped flags, eg (?i:...)."
    previous_flags = info.flags
    scoped_flags = (previous_flags | flags_on) & ~flags_off
    info.flags = scoped_flags
    source.ignore_space = bool(scoped_flags & VERBOSE)
    try:
        parsed = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # The scoped flags end at the close of the group.
        info.flags = previous_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return parsed
1079
def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's inline flags, then None is returned.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # A subpattern with scoped flags: (?flags:...).
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # Inline positional flags: (?flags).
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)
1114
def parse_positional_flags(source, info, flags_on, flags_off):
    "Parses positional flags."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
    if version == VERSION0:
        # Positional flags are global and can only be turned on.
        if flags_off:
            raise error("bad inline flags: cannot turn flags off",
              source.string, source.pos)

        new_global_flags = flags_on & ~info.global_flags
        if new_global_flags:
            info.global_flags |= new_global_flags

            # A global has been turned on, so reparse the pattern.
            raise _UnscopedFlagSet(info.global_flags)
    else:
        # In VERSION1 behaviour the flags apply from this point onwards.
        info.flags = (info.flags | flags_on) & ~flags_off

    source.ignore_space = bool(info.flags & VERBOSE)
1134
def parse_name(source, allow_numeric=False, allow_group_0=False):
    "Parses a name."
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        # A numeric reference; permitted only when allowed, and group 0
        # only when explicitly allowed.
        min_group = 0 if allow_group_0 else 1
        valid = allow_numeric and int(name) >= min_group
    else:
        valid = is_identifier(name)

    if not valid:
        raise error("bad character in group name", source.string,
          source.pos)

    return name
1153
def is_identifier(name):
    "Checks whether a string is a valid group name."
    if not name:
        return False

    first = name[0]
    if first != "_" and first not in ALPHA:
        return False

    # Underscores may appear anywhere in the remainder.
    rest = name.replace("_", "")
    return all(ch in ALNUM for ch in rest)
1164
def is_octal(string):
    "Checks whether a string is octal."
    for ch in string:
        if ch not in OCT_DIGITS:
            return False
    return True
1168
def is_decimal(string):
    "Checks whether a string is decimal."
    for ch in string:
        if ch not in DIGITS:
            return False
    return True
1172
def is_hexadecimal(string):
    "Checks whether a string is hexadecimal."
    for ch in string:
        if ch not in HEX_DIGITS:
            return False
    return True
1176
def parse_escape(source, info, in_set):
    "Parses an escape sequence."
    # Read the character after the backslash with whitespace significant,
    # even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            # Positional escapes depend on whether the WORD flag is set.
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # A character-class escape, eg \d.
        value = CHARSET_ESCAPES.get(ch)
        if value:
            return value

        # A single-character escape, eg \n.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        # Not a recognised escape; treat it as the literal character.
        return make_character(info, ord(ch), in_set)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
1241
def parse_numeric_escape(source, info, ch, in_set):
    "Parses a numeric escape sequence."
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            # The value is masked to a byte for byte-oriented encodings and
            # to 9 bits otherwise.
            encoding = info.flags & _ALL_ENCODINGS
            if encoding == ASCII or encoding == LOCALE:
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference.
    # Push back the last character read; it wasn't part of the reference.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)
1274
def parse_octal_escape(source, info, digits, in_set):
    "Parses an octal escape sequence."
    # Gather up to 3 octal digits in total, including any already passed in.
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Push back the first character that wasn't an octal digit.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # int() failed, so the first "digit" wasn't actually a digit.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)
1295
def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    "Parses a hex escape sequence."
    start_pos = source.pos
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, start_pos)
        digits.append(ch)

    try:
        value = int("".join(digits), 16)
    except ValueError:
        pass
    else:
        # Codepoints are limited to the Unicode range.
        if value < 0x110000:
            return make_character(info, value, in_set)

    # Bad hex escape.
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, start_pos)
1318
def parse_group_ref(source, info):
    "Parses a group reference, eg \\g<name>."
    source.expect("<")
    name_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")
    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, name_pos)
1329
def parse_string_set(source, info):
    "Parses a string set reference, eg \\L<name>."
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")
    known = name is not None and name in info.kwargs
    if not known:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, name)
1339
def parse_named_char(source, info, in_set):
    "Parses a named character, eg \\N{DEGREE SIGN}."
    start = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART)
        if source.match("}"):
            try:
                char = unicodedata.lookup(name)
                return make_character(info, ord(char), in_set)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character; treat the "N" as a literal.
    source.pos = start
    return make_character(info, ord("N"), in_set)
1355
def parse_property(source, info, positive, in_set):
    "Parses a Unicode property."
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        # \p{name} or \p{^name}; a "^" inside the braces negates.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            prop = lookup_property(prop_name, name, positive != negate, source)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        # The "ch and" guard matters: "" in "CLMNPSZ" would be True.
        prop = lookup_property(None, ch, positive, source)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)
1376
def parse_property_name(source):
    "Parses a property name, which may be qualified."
    # Returns (prop_name, name); prop_name is None for unqualified names.
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        # Possibly a qualified name, eg "script=latin".
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Rewind past anything not consumed as part of the name.
    source.pos = saved_pos
    return prop_name, name
1398
def parse_set(source, info):
    "Parses a character set, eg [a-z]."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    saved_ignore = source.ignore_space
    # Whitespace is always significant inside a set, even in verbose mode.
    source.ignore_space = False
    # Negative set?
    negate = source.match("^")
    # VERSION0 sets have no set operators; VERSION1 sets do.
    parser = parse_set_imp_union if version == VERSION0 else parse_set_union
    try:
        item = parser(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    return item.with_flags(case_flags=make_case_flags(info))
1424
def parse_set_union(source, info):
    "Parses a set union ([x||y])."
    operands = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        operands.append(parse_set_symm_diff(source, info))

    return operands[0] if len(operands) == 1 else SetUnion(info, operands)
1434
def parse_set_symm_diff(source, info):
    "Parses a set symmetric difference ([x~~y])."
    operands = [parse_set_inter(source, info)]
    while source.match("~~"):
        operands.append(parse_set_inter(source, info))

    return operands[0] if len(operands) == 1 else SetSymDiff(info, operands)
1444
def parse_set_inter(source, info):
    "Parses a set intersection ([x&&y])."
    operands = [parse_set_diff(source, info)]
    while source.match("&&"):
        operands.append(parse_set_diff(source, info))

    return operands[0] if len(operands) == 1 else SetInter(info, operands)
1454
def parse_set_diff(source, info):
    "Parses a set difference ([x--y])."
    operands = [parse_set_imp_union(source, info)]
    while source.match("--"):
        operands.append(parse_set_imp_union(source, info))

    return operands[0] if len(operands) == 1 else SetDiff(info, operands)
1464
def parse_set_imp_union(source, info):
    "Parses a set implicit union ([xy])."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set; rewind so the caller consumes the "]".
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators, which end the implicit
            # union; rewind so the caller can parse the operator.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)
1487
def parse_set_member(source, info):
    "Parses a member in a character set."
    # A member is either a single item or a range like a-z.
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A degenerate range of a single character.
        return start

    return Range(start.value, end.value)
1528
def parse_set_item(source, info):
    "Parses an item in a character set."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set (a VERSION1 feature).

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    # A literal character.
    return Character(ord(ch))
1566
def parse_posix_class(source, info):
    "Parses a POSIX character class, eg [[:alpha:]]."
    positive = not source.match("^")
    prop_name, name = parse_property_name(source)
    if not source.match(":]"):
        # Not terminated by ":]", so it isn't a POSIX class after all.
        raise ParseError()

    return lookup_property(prop_name, name, positive, source, posix=True)
1575
def float_to_rational(flt):
    "Converts a float to a rational pair (numerator, denominator)."
    int_part = int(flt)
    frac = flt - int_part
    if abs(frac) < 0.0001:
        # Close enough to a whole number.
        return int_part, 1

    # Continued-fraction step: recurse on the reciprocal of the remainder.
    den, num = float_to_rational(1.0 / frac)

    return int_part * den + num, den
1586
def numeric_to_rational(numeric):
    "Converts a numeric string to a rational string, if possible."
    sign = ""
    if numeric[ : 1] == "-":
        sign, numeric = numeric[0], numeric[1 : ]

    parts = numeric.split("/")
    if len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    elif len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    else:
        # More than one "/" isn't a valid rational.
        raise ValueError()

    result = "%s%s/%s" % (sign, num, den)
    if result.endswith("/1"):
        # A denominator of 1 means it's really an integer.
        result = result[ : -2]

    return result
1607
# Translation table mapping the ASCII lowercase letters to uppercase.
# (string.maketrans is the Python 2 bytestring API.)
upper_trans = string.maketrans(string.ascii_lowercase, string.ascii_uppercase)
def ascii_upper(s):
    "Uppercases a bytestring in a locale-insensitive way within the ASCII range."
    # Under Python 2, str is the bytestring type; translate via the fixed
    # ASCII table so the current locale can't affect the result.
    if isinstance(s, str):
        return s.translate(upper_trans)

    # A unicode string; .upper() is locale-independent for it.
    return s.upper()
1615
def standardise_name(name):
    "Standardises a property or value name."
    text = "".join(name)
    try:
        return numeric_to_rational(text)
    except (ValueError, ZeroDivisionError):
        # Not numeric: uppercase it and drop underscores, hyphens and
        # spaces.
        return ascii_upper("".join(ch for ch in text if ch not in "_- "))
1622
# POSIX class names that clash with Unicode property names; they get a
# "POSIX" prefix before lookup (see lookup_property).
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The complete value set of a binary (yes/no) property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
1626
def lookup_property(property, value, positive, source=None, posix=False):
    "Looks up a property."
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        # "Assigned" is the complement of "Unassigned".
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and ascii_upper(value) in _POSIX_CLASSES:
        # These POSIX classes clash with Unicode property names, so they're
        # stored with a "POSIX" prefix.
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        # The property id and value id are packed into a single int.
        return Property((prop_id << 16) | val_id, positive)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            return Property((prop_id << 16) | 1, positive)

        return Property(prop_id << 16, not positive)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive)

    # It might be the name of a script or block starting with a prefix.
    # (Both prefixes are 2 characters, hence value[2 : ] below.)
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)
1696
def _compile_replacement(source, pattern, is_unicode):
    "Compiles a replacement template escape sequence."
    # Returns (is_group_reference, list-of-codepoints-or-group-number).
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group reference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        # Not a recognised escape; keep the backslash and the literal.
        return False, [ord("\\"), ord(ch)]

    # Octal values are masked to a byte for bytestring templates and to
    # 9 bits for unicode templates.
    if isinstance(source.sep, str):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
1768
def parse_repl_hex_escape(source, expected_len, type):
    "Parses a hex escape sequence in a replacement string."
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, source.pos)
        digits.append(ch)

    return int("".join(digits), 16)
1780
def parse_repl_named_char(source):
    "Parses a named character in a replacement string."
    start = source.pos
    if source.match("{"):
        name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                return ord(unicodedata.lookup(name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Malformed; rewind and let the caller treat "\\N" as a literal.
    source.pos = start
    return None
1797
def compile_repl_group(source, pattern):
    "Compiles a replacement template group reference."
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")
    if not name.isdigit():
        # A named group.
        try:
            return pattern.groupindex[name]
        except KeyError:
            raise IndexError("unknown group")

    # A numeric group; group 0 is the whole match.
    index = int(name)
    if not 0 <= index <= pattern.groups:
        raise error("invalid group reference", source.string, source.pos)

    return index
1815
# The regular expression is parsed into a syntax tree. The different types of
# node are defined below.

# Indent unit used by the dump() methods.
INDENT = "  "
# Bit flags attached to opcodes in the compiled pattern.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10

# Texts used by the dump() methods.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
1829
def make_sequence(items):
    "Wraps the items in a Sequence unless there's exactly one of them."
    if len(items) != 1:
        return Sequence(items)
    return items[0]
1834
# Common base class for all nodes.
class RegexBase(object):
    def __init__(self):
        # _key is used for hashing and equality; subclasses extend it with
        # their distinguishing attributes.
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        "Returns this node with the given flags changed; self if unchanged."
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        # Default: nothing to fix; overridden by nodes containing groups.
        pass

    def optimise(self, info, reverse):
        # Default: no optimisation possible.
        return self

    def pack_characters(self, info):
        # Default: nothing to pack.
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # Subclasses which can compute a first-set override this.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        # Needed under Python 2, which doesn't derive != from ==.
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        # Default: no required string, just the node's width.
        return self.max_width(), None
1904
1905# Base class for zero-width nodes.
1906class ZeroWidthBase(RegexBase):
1907    def __init__(self, positive=True):
1908        RegexBase.__init__(self)
1909        self.positive = bool(positive)
1910
1911        self._key = self.__class__, self.positive
1912
1913    def get_firstset(self, reverse):
1914        return set([None])
1915
1916    def _compile(self, reverse, fuzzy):
1917        flags = 0
1918        if self.positive:
1919            flags |= POSITIVE_OP
1920        if fuzzy:
1921            flags |= FUZZY_OP
1922        if reverse:
1923            flags |= REVERSE_OP
1924        return [(self._opcode, flags)]
1925
1926    def dump(self, indent, reverse):
1927        print "%s%s %s" % (INDENT * indent, self._op_name,
1928          POS_TEXT[self.positive])
1929
1930    def max_width(self):
1931        return 0
1932
1933class Any(RegexBase):
1934    _opcode = {False: OP.ANY, True: OP.ANY_REV}
1935    _op_name = "ANY"
1936
1937    def has_simple_start(self):
1938        return True
1939
1940    def _compile(self, reverse, fuzzy):
1941        flags = 0
1942        if fuzzy:
1943            flags |= FUZZY_OP
1944        return [(self._opcode[reverse], flags)]
1945
1946    def dump(self, indent, reverse):
1947        print "%s%s" % (INDENT * indent, self._op_name)
1948
1949    def max_width(self):
1950        return 1
1951
class AnyAll(Any):
    # As Any, but compiled to the ANY_ALL opcode — presumably "." with
    # DOTALL, i.e. line endings also match; TODO confirm against the engine.
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"
1955
class AnyU(Any):
    # As Any, but compiled to the ANY_U opcode — presumably "." with
    # Unicode line-ending semantics; TODO confirm against the engine.
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
1959
1960class Atomic(RegexBase):
1961    def __init__(self, subpattern):
1962        RegexBase.__init__(self)
1963        self.subpattern = subpattern
1964
1965    def fix_groups(self, pattern, reverse, fuzzy):
1966        self.subpattern.fix_groups(pattern, reverse, fuzzy)
1967
1968    def optimise(self, info, reverse):
1969        self.subpattern = self.subpattern.optimise(info, reverse)
1970
1971        if self.subpattern.is_empty():
1972            return self.subpattern
1973        return self
1974
1975    def pack_characters(self, info):
1976        self.subpattern = self.subpattern.pack_characters(info)
1977        return self
1978
1979    def remove_captures(self):
1980        self.subpattern = self.subpattern.remove_captures()
1981        return self
1982
1983    def can_be_affix(self):
1984        return self.subpattern.can_be_affix()
1985
1986    def contains_group(self):
1987        return self.subpattern.contains_group()
1988
1989    def get_firstset(self, reverse):
1990        return self.subpattern.get_firstset(reverse)
1991
1992    def has_simple_start(self):
1993        return self.subpattern.has_simple_start()
1994
1995    def _compile(self, reverse, fuzzy):
1996        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
1997          [(OP.END, )])
1998
1999    def dump(self, indent, reverse):
2000        print "%sATOMIC" % (INDENT * indent)
2001        self.subpattern.dump(indent + 1, reverse)
2002
2003    def is_empty(self):
2004        return self.subpattern.is_empty()
2005
2006    def __eq__(self, other):
2007        return (type(self) is type(other) and self.subpattern ==
2008          other.subpattern)
2009
2010    def max_width(self):
2011        return self.subpattern.max_width()
2012
2013    def get_required_string(self, reverse):
2014        return self.subpattern.get_required_string(reverse)
2015
class Boundary(ZeroWidthBase):
    # Zero-width word-boundary assertion — presumably \b; TODO confirm.
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
2019
2020class Branch(RegexBase):
2021    def __init__(self, branches):
2022        RegexBase.__init__(self)
2023        self.branches = branches
2024
2025    def fix_groups(self, pattern, reverse, fuzzy):
2026        for b in self.branches:
2027            b.fix_groups(pattern, reverse, fuzzy)
2028
2029    def optimise(self, info, reverse):
2030        if not self.branches:
2031            return Sequence([])
2032
2033        # Flatten branches within branches.
2034        branches = Branch._flatten_branches(info, reverse, self.branches)
2035
2036        # Move any common prefix or suffix out of the branches.
2037        if reverse:
2038            suffix, branches = Branch._split_common_suffix(info, branches)
2039            prefix = []
2040        else:
2041            prefix, branches = Branch._split_common_prefix(info, branches)
2042            suffix = []
2043
2044        # Try to reduce adjacent single-character branches to sets.
2045        branches = Branch._reduce_to_set(info, reverse, branches)
2046
2047        if len(branches) > 1:
2048            sequence = [Branch(branches)]
2049
2050            if not prefix or not suffix:
2051                # We might be able to add a quick precheck before the branches.
2052                firstset = self._add_precheck(info, reverse, branches)
2053
2054                if firstset:
2055                    if reverse:
2056                        sequence.append(firstset)
2057                    else:
2058                        sequence.insert(0, firstset)
2059        else:
2060            sequence = branches
2061
2062        return make_sequence(prefix + sequence + suffix)
2063
2064    def _add_precheck(self, info, reverse, branches):
2065        charset = set()
2066        pos = -1 if reverse else 0
2067
2068        for branch in branches:
2069            if type(branch) is Literal and branch.case_flags == NOCASE:
2070                charset.add(branch.characters[pos])
2071            else:
2072                return
2073
2074        if not charset:
2075            return None
2076
2077        return _check_firstset(info, reverse, [Character(c) for c in charset])
2078
2079    def pack_characters(self, info):
2080        self.branches = [b.pack_characters(info) for b in self.branches]
2081        return self
2082
2083    def remove_captures(self):
2084        self.branches = [b.remove_captures() for b in self.branches]
2085        return self
2086
2087    def is_atomic(self):
2088        return all(b.is_atomic() for b in self.branches)
2089
2090    def can_be_affix(self):
2091        return all(b.can_be_affix() for b in self.branches)
2092
2093    def contains_group(self):
2094        return any(b.contains_group() for b in self.branches)
2095
2096    def get_firstset(self, reverse):
2097        fs = set()
2098        for b in self.branches:
2099            fs |= b.get_firstset(reverse)
2100
2101        return fs or set([None])
2102
2103    def _compile(self, reverse, fuzzy):
2104        code = [(OP.BRANCH, )]
2105        for b in self.branches:
2106            code.extend(b.compile(reverse, fuzzy))
2107            code.append((OP.NEXT, ))
2108
2109        code[-1] = (OP.END, )
2110
2111        return code
2112
2113    def dump(self, indent, reverse):
2114        print "%sBRANCH" % (INDENT * indent)
2115        self.branches[0].dump(indent + 1, reverse)
2116        for b in self.branches[1 : ]:
2117            print "%sOR" % (INDENT * indent)
2118            b.dump(indent + 1, reverse)
2119
2120    @staticmethod
2121    def _flatten_branches(info, reverse, branches):
2122        # Flatten the branches so that there aren't branches of branches.
2123        new_branches = []
2124        for b in branches:
2125            b = b.optimise(info, reverse)
2126            if isinstance(b, Branch):
2127                new_branches.extend(b.branches)
2128            else:
2129                new_branches.append(b)
2130
2131        return new_branches
2132
2133    @staticmethod
2134    def _split_common_prefix(info, branches):
2135        # Common leading items can be moved out of the branches.
2136        # Get the items in the branches.
2137        alternatives = []
2138        for b in branches:
2139            if isinstance(b, Sequence):
2140                alternatives.append(b.items)
2141            else:
2142                alternatives.append([b])
2143
2144        # What is the maximum possible length of the prefix?
2145        max_count = min(len(a) for a in alternatives)
2146
2147        # What is the longest common prefix?
2148        prefix = alternatives[0]
2149        pos = 0
2150        end_pos = max_count
2151        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
2152          prefix[pos] for a in alternatives):
2153            pos += 1
2154        count = pos
2155
2156        if info.flags & UNICODE:
2157            # We need to check that we're not splitting a sequence of
2158            # characters which could form part of full case-folding.
2159            count = pos
2160            while count > 0 and not all(Branch._can_split(a, count) for a in
2161              alternatives):
2162                count -= 1
2163
2164        # No common prefix is possible.
2165        if count == 0:
2166            return [], branches
2167
2168        # Rebuild the branches.
2169        new_branches = []
2170        for a in alternatives:
2171            new_branches.append(make_sequence(a[count : ]))
2172
2173        return prefix[ : count], new_branches
2174
2175    @staticmethod
2176    def _split_common_suffix(info, branches):
2177        # Common trailing items can be moved out of the branches.
2178        # Get the items in the branches.
2179        alternatives = []
2180        for b in branches:
2181            if isinstance(b, Sequence):
2182                alternatives.append(b.items)
2183            else:
2184                alternatives.append([b])
2185
2186        # What is the maximum possible length of the suffix?
2187        max_count = min(len(a) for a in alternatives)
2188
2189        # What is the longest common suffix?
2190        suffix = alternatives[0]
2191        pos = -1
2192        end_pos = -1 - max_count
2193        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
2194          suffix[pos] for a in alternatives):
2195            pos -= 1
2196        count = -1 - pos
2197
2198        if info.flags & UNICODE:
2199            # We need to check that we're not splitting a sequence of
2200            # characters which could form part of full case-folding.
2201            while count > 0 and not all(Branch._can_split_rev(a, count) for a
2202              in alternatives):
2203                count -= 1
2204
2205        # No common suffix is possible.
2206        if count == 0:
2207            return [], branches
2208
2209        # Rebuild the branches.
2210        new_branches = []
2211        for a in alternatives:
2212            new_branches.append(make_sequence(a[ : -count]))
2213
2214        return suffix[-count : ], new_branches
2215
2216    @staticmethod
2217    def _can_split(items, count):
2218        # Check the characters either side of the proposed split.
2219        if not Branch._is_full_case(items, count - 1):
2220            return True
2221
2222        if not Branch._is_full_case(items, count):
2223            return True
2224
2225        # Check whether a 1-1 split would be OK.
2226        if Branch._is_folded(items[count - 1 : count + 1]):
2227            return False
2228
2229        # Check whether a 1-2 split would be OK.
2230        if (Branch._is_full_case(items, count + 2) and
2231          Branch._is_folded(items[count - 1 : count + 2])):
2232            return False
2233
2234        # Check whether a 2-1 split would be OK.
2235        if (Branch._is_full_case(items, count - 2) and
2236          Branch._is_folded(items[count - 2 : count + 1])):
2237            return False
2238
2239        return True
2240
2241    @staticmethod
2242    def _can_split_rev(items, count):
2243        end = len(items)
2244
2245        # Check the characters either side of the proposed split.
2246        if not Branch._is_full_case(items, end - count):
2247            return True
2248
2249        if not Branch._is_full_case(items, end - count - 1):
2250            return True
2251
2252        # Check whether a 1-1 split would be OK.
2253        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
2254            return False
2255
2256        # Check whether a 1-2 split would be OK.
2257        if (Branch._is_full_case(items, end - count + 2) and
2258          Branch._is_folded(items[end - count - 1 : end - count + 2])):
2259            return False
2260
2261        # Check whether a 2-1 split would be OK.
2262        if (Branch._is_full_case(items, end - count - 2) and
2263          Branch._is_folded(items[end - count - 2 : end - count + 1])):
2264            return False
2265
2266        return True
2267
2268    @staticmethod
2269    def _merge_common_prefixes(info, reverse, branches):
2270        # Branches with the same case-sensitive character prefix can be grouped
2271        # together if they are separated only by other branches with a
2272        # character prefix.
2273        prefixed = defaultdict(list)
2274        order = {}
2275        new_branches = []
2276        for b in branches:
2277            if Branch._is_simple_character(b):
2278                # Branch starts with a simple character.
2279                prefixed[b.value].append([b])
2280                order.setdefault(b.value, len(order))
2281            elif (isinstance(b, Sequence) and b.items and
2282              Branch._is_simple_character(b.items[0])):
2283                # Branch starts with a simple character.
2284                prefixed[b.items[0].value].append(b.items)
2285                order.setdefault(b.items[0].value, len(order))
2286            else:
2287                Branch._flush_char_prefix(info, reverse, prefixed, order,
2288                  new_branches)
2289
2290                new_branches.append(b)
2291
2292        Branch._flush_char_prefix(info, prefixed, order, new_branches)
2293
2294        return new_branches
2295
2296    @staticmethod
2297    def _is_simple_character(c):
2298        return isinstance(c, Character) and c.positive and not c.case_flags
2299
2300    @staticmethod
2301    def _reduce_to_set(info, reverse, branches):
2302        # Can the branches be reduced to a set?
2303        new_branches = []
2304        items = set()
2305        case_flags = NOCASE
2306        for b in branches:
2307            if isinstance(b, (Character, Property, SetBase)):
2308                # Branch starts with a single character.
2309                if b.case_flags != case_flags:
2310                    # Different case sensitivity, so flush.
2311                    Branch._flush_set_members(info, reverse, items, case_flags,
2312                      new_branches)
2313
2314                    case_flags = b.case_flags
2315
2316                items.add(b.with_flags(case_flags=NOCASE))
2317            else:
2318                Branch._flush_set_members(info, reverse, items, case_flags,
2319                  new_branches)
2320
2321                new_branches.append(b)
2322
2323        Branch._flush_set_members(info, reverse, items, case_flags,
2324          new_branches)
2325
2326        return new_branches
2327
2328    @staticmethod
2329    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
2330        # Flush the prefixed branches.
2331        if not prefixed:
2332            return
2333
2334        for value, branches in sorted(prefixed.items(), key=lambda pair:
2335          order[pair[0]]):
2336            if len(branches) == 1:
2337                new_branches.append(make_sequence(branches[0]))
2338            else:
2339                subbranches = []
2340                optional = False
2341                for b in branches:
2342                    if len(b) > 1:
2343                        subbranches.append(make_sequence(b[1 : ]))
2344                    elif not optional:
2345                        subbranches.append(Sequence())
2346                        optional = True
2347
2348                sequence = Sequence([Character(value), Branch(subbranches)])
2349                new_branches.append(sequence.optimise(info, reverse))
2350
2351        prefixed.clear()
2352        order.clear()
2353
2354    @staticmethod
2355    def _flush_set_members(info, reverse, items, case_flags, new_branches):
2356        # Flush the set members.
2357        if not items:
2358            return
2359
2360        if len(items) == 1:
2361            item = list(items)[0]
2362        else:
2363            item = SetUnion(info, list(items)).optimise(info, reverse)
2364
2365        new_branches.append(item.with_flags(case_flags=case_flags))
2366
2367        items.clear()
2368
2369    @staticmethod
2370    def _is_full_case(items, i):
2371        if not 0 <= i < len(items):
2372            return False
2373
2374        item = items[i]
2375        return (isinstance(item, Character) and item.positive and
2376          (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
2377
2378    @staticmethod
2379    def _is_folded(items):
2380        if len(items) < 2:
2381            return False
2382
2383        for i in items:
2384            if (not isinstance(i, Character) or not i.positive or not
2385              i.case_flags):
2386                return False
2387
2388        folded = u"".join(unichr(i.value) for i in items)
2389        folded = _regex.fold_case(FULL_CASE_FOLDING, folded)
2390
2391        # Get the characters which expand to multiple codepoints on folding.
2392        expanding_chars = _regex.get_expand_on_folding()
2393
2394        for c in expanding_chars:
2395            if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
2396                return True
2397
2398        return False
2399
2400    def is_empty(self):
2401        return all(b.is_empty() for b in self.branches)
2402
2403    def __eq__(self, other):
2404        return type(self) is type(other) and self.branches == other.branches
2405
2406    def max_width(self):
2407        return max(b.max_width() for b in self.branches)
2408
2409class CallGroup(RegexBase):
2410    def __init__(self, info, group, position):
2411        RegexBase.__init__(self)
2412        self.info = info
2413        self.group = group
2414        self.position = position
2415
2416        self._key = self.__class__, self.group
2417
2418    def fix_groups(self, pattern, reverse, fuzzy):
2419        try:
2420            self.group = int(self.group)
2421        except ValueError:
2422            try:
2423                self.group = self.info.group_index[self.group]
2424            except KeyError:
2425                raise error("invalid group reference", pattern, self.position)
2426
2427        if not 0 <= self.group <= self.info.group_count:
2428            raise error("unknown group", pattern, self.position)
2429
2430        if self.group > 0 and self.info.open_group_count[self.group] > 1:
2431            raise error("ambiguous group reference", pattern, self.position)
2432
2433        self.info.group_calls.append((self, reverse, fuzzy))
2434
2435        self._key = self.__class__, self.group
2436
2437    def remove_captures(self):
2438        raise error("group reference not allowed", pattern, self.position)
2439
2440    def _compile(self, reverse, fuzzy):
2441        return [(OP.GROUP_CALL, self.call_ref)]
2442
2443    def dump(self, indent, reverse):
2444        print "%sGROUP_CALL %s" % (INDENT * indent, self.group)
2445
2446    def __eq__(self, other):
2447        return type(self) is type(other) and self.group == other.group
2448
2449    def max_width(self):
2450        return UNLIMITED
2451
2452    def __del__(self):
2453        self.info = None
2454
class CallRef(RegexBase):
    # Wraps a parsed subpattern so it can be the target of a group call.
    # NOTE(review): RegexBase.__init__ is not called, so this node has no
    # _key; presumably it is never hashed or compared — verify.
    def __init__(self, ref, parsed):
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        body = self.parsed._compile(reverse, fuzzy)
        return [(OP.CALL_REF, self.ref)] + body + [(OP.END, )]
2463
class Character(RegexBase):
    """A node which matches a single character (codepoint)."""
    # Opcode keyed on (case_flags, reverse).  Note that FULLCASE without
    # IGNORECASE selects the case-sensitive opcode.
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Pre-compute the full case-folding; it may expand to more than one
        # codepoint (e.g. ligatures), which _compile handles specially.
        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, unichr(self.value))
        else:
            self.folded = unichr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding, so also try to
            # match the expanded string as an alternative.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = repr(unichr(self.value)).lstrip("bu")
        print "%sCHARACTER %s %s%s" % (INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags])

    def matches(self, ch):
        # True if ch matches, honouring the positive/negative sense.
        return (ch == self.value) == self.positive

    def max_width(self):
        # More than 1 only when full case-folding expands the character.
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self
2537
2538class Conditional(RegexBase):
2539    def __init__(self, info, group, yes_item, no_item, position):
2540        RegexBase.__init__(self)
2541        self.info = info
2542        self.group = group
2543        self.yes_item = yes_item
2544        self.no_item = no_item
2545        self.position = position
2546
2547    def fix_groups(self, pattern, reverse, fuzzy):
2548        try:
2549            self.group = int(self.group)
2550        except ValueError:
2551            try:
2552                self.group = self.info.group_index[self.group]
2553            except KeyError:
2554                if self.group == 'DEFINE':
2555                    # 'DEFINE' is a special name unless there's a group with
2556                    # that name.
2557                    self.group = 0
2558                else:
2559                    raise error("unknown group", pattern, self.position)
2560
2561        if not 0 <= self.group <= self.info.group_count:
2562            raise error("invalid group reference", pattern, self.position)
2563
2564        self.yes_item.fix_groups(pattern, reverse, fuzzy)
2565        self.no_item.fix_groups(pattern, reverse, fuzzy)
2566
2567    def optimise(self, info, reverse):
2568        yes_item = self.yes_item.optimise(info, reverse)
2569        no_item = self.no_item.optimise(info, reverse)
2570
2571        return Conditional(info, self.group, yes_item, no_item, self.position)
2572
2573    def pack_characters(self, info):
2574        self.yes_item = self.yes_item.pack_characters(info)
2575        self.no_item = self.no_item.pack_characters(info)
2576        return self
2577
2578    def remove_captures(self):
2579        self.yes_item = self.yes_item.remove_captures()
2580        self.no_item = self.no_item.remove_captures()
2581
2582    def is_atomic(self):
2583        return self.yes_item.is_atomic() and self.no_item.is_atomic()
2584
2585    def can_be_affix(self):
2586        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()
2587
2588    def contains_group(self):
2589        return self.yes_item.contains_group() or self.no_item.contains_group()
2590
2591    def get_firstset(self, reverse):
2592        return (self.yes_item.get_firstset(reverse) |
2593          self.no_item.get_firstset(reverse))
2594
2595    def _compile(self, reverse, fuzzy):
2596        code = [(OP.GROUP_EXISTS, self.group)]
2597        code.extend(self.yes_item.compile(reverse, fuzzy))
2598        add_code = self.no_item.compile(reverse, fuzzy)
2599        if add_code:
2600            code.append((OP.NEXT, ))
2601            code.extend(add_code)
2602
2603        code.append((OP.END, ))
2604
2605        return code
2606
2607    def dump(self, indent, reverse):
2608        print "%sGROUP_EXISTS %s" % (INDENT * indent, self.group)
2609        self.yes_item.dump(indent + 1, reverse)
2610        if not self.no_item.is_empty():
2611            print "%sOR" % (INDENT * indent)
2612            self.no_item.dump(indent + 1, reverse)
2613
2614    def is_empty(self):
2615        return self.yes_item.is_empty() and self.no_item.is_empty()
2616
2617    def __eq__(self, other):
2618        return type(self) is type(other) and (self.group, self.yes_item,
2619          self.no_item) == (other.group, other.yes_item, other.no_item)
2620
2621    def max_width(self):
2622        return max(self.yes_item.max_width(), self.no_item.max_width())
2623
2624    def __del__(self):
2625        self.info = None
2626
class DefaultBoundary(ZeroWidthBase):
    # Word boundary using the default (non-Unicode) word rules — TODO confirm.
    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"

class DefaultEndOfWord(ZeroWidthBase):
    # End of word using the default word rules — TODO confirm.
    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"

class DefaultStartOfWord(ZeroWidthBase):
    # Start of word using the default word rules — TODO confirm.
    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"

class EndOfLine(ZeroWidthBase):
    # End-of-line assertion (presumably "$" in MULTILINE mode).
    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"

class EndOfLineU(EndOfLine):
    # As EndOfLine, with Unicode line-ending semantics — TODO confirm.
    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"

class EndOfString(ZeroWidthBase):
    # End-of-string assertion (presumably \Z).
    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"

class EndOfStringLine(ZeroWidthBase):
    # End of string or before a final line ending (presumably "$").
    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"

class EndOfStringLineU(EndOfStringLine):
    # As EndOfStringLine, with Unicode line-ending semantics — TODO confirm.
    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"

class EndOfWord(ZeroWidthBase):
    # End-of-word assertion (presumably \m/\M-style word edge).
    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"
2662
class Failure(ZeroWidthBase):
    # A node which always fails to match (e.g. "(?!)").
    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        # FAILURE takes no flags; positive/fuzzy/reverse are irrelevant.
        return [(OP.FAILURE, )]
2668
class Fuzzy(RegexBase):
    """A fuzzy (approximate) match of a subpattern.

    `constraints` maps each error type — "d", "i", "s" (presumably
    deletions, insertions, substitutions) and "e" (any error) — to a
    (min, max) pair, where max is None for unlimited.  "cost" maps each
    error type to its cost coefficient, with "max" as the total cost limit.
    The constructor fills in defaults, mutating the dict passed in.
    """
    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is itself fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        # An optional test pattern precedes the subpattern when given.
        test = self.constraints.get("test")

        if test:
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print "%sFUZZY%s" % (INDENT * indent, constraints)
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        # Insertions make the width unbounded.
        return UNLIMITED

    def _constraints_to_string(self):
        # Render the constraints in the source syntax, e.g. "1<=i<=2,s<=3".
        constraints = []

        for name in "ids":
            # (Note: 'min'/'max' shadow the builtins within this loop.)
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "%s<=" % min

            con += name

            if max is not None:
                con += "<=%s" % max

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("%s%s" % (coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "%s<=%s" % ("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)
2801
2802class Grapheme(RegexBase):
2803    def _compile(self, reverse, fuzzy):
2804        # Match at least 1 character until a grapheme boundary is reached. Note
2805        # that this is the same whether matching forwards or backwards.
2806        grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
2807          GraphemeBoundary()]))
2808
2809        return grapheme_matcher.compile(reverse, fuzzy)
2810
2811    def dump(self, indent, reverse):
2812        print "%sGRAPHEME" % (INDENT * indent)
2813
2814    def max_width(self):
2815        return UNLIMITED
2816
class GraphemeBoundary:
    # Zero-width check compiled to the GRAPHEME_BOUNDARY opcode.
    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]
2820
2821class GreedyRepeat(RegexBase):
2822    _opcode = OP.GREEDY_REPEAT
2823    _op_name = "GREEDY_REPEAT"
2824
2825    def __init__(self, subpattern, min_count, max_count):
2826        RegexBase.__init__(self)
2827        self.subpattern = subpattern
2828        self.min_count = min_count
2829        self.max_count = max_count
2830
2831    def fix_groups(self, pattern, reverse, fuzzy):
2832        self.subpattern.fix_groups(pattern, reverse, fuzzy)
2833
2834    def optimise(self, info, reverse):
2835        subpattern = self.subpattern.optimise(info, reverse)
2836
2837        return type(self)(subpattern, self.min_count, self.max_count)
2838
2839    def pack_characters(self, info):
2840        self.subpattern = self.subpattern.pack_characters(info)
2841        return self
2842
2843    def remove_captures(self):
2844        self.subpattern = self.subpattern.remove_captures()
2845        return self
2846
2847    def is_atomic(self):
2848        return self.min_count == self.max_count and self.subpattern.is_atomic()
2849
2850    def can_be_affix(self):
2851        return False
2852
2853    def contains_group(self):
2854        return self.subpattern.contains_group()
2855
2856    def get_firstset(self, reverse):
2857        fs = self.subpattern.get_firstset(reverse)
2858        if self.min_count == 0:
2859            fs.add(None)
2860
2861        return fs
2862
2863    def _compile(self, reverse, fuzzy):
2864        repeat = [self._opcode, self.min_count]
2865        if self.max_count is None:
2866            repeat.append(UNLIMITED)
2867        else:
2868            repeat.append(self.max_count)
2869
2870        subpattern = self.subpattern.compile(reverse, fuzzy)
2871        if not subpattern:
2872            return []
2873
2874        return ([tuple(repeat)] + subpattern + [(OP.END, )])
2875
2876    def dump(self, indent, reverse):
2877        if self.max_count is None:
2878            limit = "INF"
2879        else:
2880            limit = self.max_count
2881        print "%s%s %s %s" % (INDENT * indent, self._op_name, self.min_count,
2882          limit)
2883
2884        self.subpattern.dump(indent + 1, reverse)
2885
2886    def is_empty(self):
2887        return self.subpattern.is_empty()
2888
2889    def __eq__(self, other):
2890        return type(self) is type(other) and (self.subpattern, self.min_count,
2891          self.max_count) == (other.subpattern, other.min_count,
2892          other.max_count)
2893
2894    def max_width(self):
2895        if self.max_count is None:
2896            return UNLIMITED
2897
2898        return self.subpattern.max_width() * self.max_count
2899
2900    def get_required_string(self, reverse):
2901        max_count = UNLIMITED if self.max_count is None else self.max_count
2902        if self.min_count == 0:
2903            w = self.subpattern.max_width() * max_count
2904            return min(w, UNLIMITED), None
2905
2906        ofs, req = self.subpattern.get_required_string(reverse)
2907        if req:
2908            return ofs, req
2909
2910        w = self.subpattern.max_width() * max_count
2911        return min(w, UNLIMITED), None
2912
class PossessiveRepeat(GreedyRepeat):
    def is_atomic(self):
        # A possessive repeat never gives back what it has matched.
        return True

    def _compile(self, reverse, fuzzy):
        body = self.subpattern.compile(reverse, fuzzy)
        if not body:
            return []

        limit = UNLIMITED if self.max_count is None else self.max_count

        # A possessive repeat is a greedy repeat inside an ATOMIC group;
        # each of the two opcodes needs its own terminating END.
        return ([(OP.ATOMIC, ), (self._opcode, self.min_count, limit)] +
          body + [(OP.END, ), (OP.END, )])

    def dump(self, indent, reverse):
        print("%sATOMIC" % (INDENT * indent))

        limit = "INF" if self.max_count is None else self.max_count
        print("%s%s %s %s" % (INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)
2942
2943class Group(RegexBase):
2944    def __init__(self, info, group, subpattern):
2945        RegexBase.__init__(self)
2946        self.info = info
2947        self.group = group
2948        self.subpattern = subpattern
2949
2950        self.call_ref = None
2951
2952    def fix_groups(self, pattern, reverse, fuzzy):
2953        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
2954        self.subpattern.fix_groups(pattern, reverse, fuzzy)
2955
2956    def optimise(self, info, reverse):
2957        subpattern = self.subpattern.optimise(info, reverse)
2958
2959        return Group(self.info, self.group, subpattern)
2960
2961    def pack_characters(self, info):
2962        self.subpattern = self.subpattern.pack_characters(info)
2963        return self
2964
2965    def remove_captures(self):
2966        return self.subpattern.remove_captures()
2967
2968    def is_atomic(self):
2969        return self.subpattern.is_atomic()
2970
2971    def can_be_affix(self):
2972        return False
2973
2974    def contains_group(self):
2975        return True
2976
2977    def get_firstset(self, reverse):
2978        return self.subpattern.get_firstset(reverse)
2979
2980    def has_simple_start(self):
2981        return self.subpattern.has_simple_start()
2982
2983    def _compile(self, reverse, fuzzy):
2984        code = []
2985
2986        key = self.group, reverse, fuzzy
2987        ref = self.info.call_refs.get(key)
2988        if ref is not None:
2989            code += [(OP.CALL_REF, ref)]
2990
2991        public_group = private_group = self.group
2992        if private_group < 0:
2993            public_group = self.info.private_groups[private_group]
2994            private_group = self.info.group_count - private_group
2995
2996        code += ([(OP.GROUP, int(not reverse), private_group, public_group)] +
2997          self.subpattern.compile(reverse, fuzzy) + [(OP.END, )])
2998
2999        if ref is not None:
3000            code += [(OP.END, )]
3001
3002        return code
3003
3004    def dump(self, indent, reverse):
3005        group = self.group
3006        if group < 0:
3007            group = private_groups[group]
3008        print "%sGROUP %s" % (INDENT * indent, group)
3009        self.subpattern.dump(indent + 1, reverse)
3010
3011    def __eq__(self, other):
3012        return (type(self) is type(other) and (self.group, self.subpattern) ==
3013          (other.group, other.subpattern))
3014
3015    def max_width(self):
3016        return self.subpattern.max_width()
3017
3018    def get_required_string(self, reverse):
3019        return self.subpattern.get_required_string(reverse)
3020
3021    def __del__(self):
3022        self.info = None
3023
class Keep(ZeroWidthBase):
    # Zero-width node compiled to the KEEP opcode (presumably the \K verb —
    # confirm against the parser).
    _opcode = OP.KEEP
    _op_name = "KEEP"
3027
class LazyRepeat(GreedyRepeat):
    # Identical structure to GreedyRepeat; only the opcode differs.
    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"
3031
3032class LookAround(RegexBase):
3033    _dir_text = {False: "AHEAD", True: "BEHIND"}
3034
3035    def __init__(self, behind, positive, subpattern):
3036        RegexBase.__init__(self)
3037        self.behind = bool(behind)
3038        self.positive = bool(positive)
3039        self.subpattern = subpattern
3040
3041    def fix_groups(self, pattern, reverse, fuzzy):
3042        self.subpattern.fix_groups(pattern, self.behind, fuzzy)
3043
3044    def optimise(self, info, reverse):
3045        subpattern = self.subpattern.optimise(info, self.behind)
3046        if self.positive and subpattern.is_empty():
3047            return subpattern
3048
3049        return LookAround(self.behind, self.positive, subpattern)
3050
3051    def pack_characters(self, info):
3052        self.subpattern = self.subpattern.pack_characters(info)
3053        return self
3054
3055    def remove_captures(self):
3056        return self.subpattern.remove_captures()
3057
3058    def is_atomic(self):
3059        return self.subpattern.is_atomic()
3060
3061    def can_be_affix(self):
3062        return self.subpattern.can_be_affix()
3063
3064    def contains_group(self):
3065        return self.subpattern.contains_group()
3066
3067    def get_firstset(self, reverse):
3068        if self.positive and self.behind == reverse:
3069            return self.subpattern.get_firstset(reverse)
3070
3071        return set([None])
3072
3073    def _compile(self, reverse, fuzzy):
3074        flags = 0
3075        if self.positive:
3076            flags |= POSITIVE_OP
3077        if fuzzy:
3078            flags |= FUZZY_OP
3079        if reverse:
3080            flags |= REVERSE_OP
3081
3082        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
3083          self.subpattern.compile(self.behind) + [(OP.END, )])
3084
3085    def dump(self, indent, reverse):
3086        print "%sLOOK%s %s" % (INDENT * indent, self._dir_text[self.behind],
3087          POS_TEXT[self.positive])
3088        self.subpattern.dump(indent + 1, self.behind)
3089
3090    def is_empty(self):
3091        return self.positive and self.subpattern.is_empty()
3092
3093    def __eq__(self, other):
3094        return type(self) is type(other) and (self.behind, self.positive,
3095          self.subpattern) == (other.behind, other.positive, other.subpattern)
3096
3097    def max_width(self):
3098        return 0
3099
class LookAroundConditional(RegexBase):
    """A conditional on a lookaround assertion: if the assertion matches,
    try yes_item, otherwise no_item.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # NOTE(review): yes_item/no_item are optimised with self.behind as
        # the direction although they are compiled with 'reverse' — confirm
        # this asymmetry is intended.
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # Bug fix: this method previously fell off the end and returned
        # None, so callers that rebind the result (the convention used by
        # every other remove_captures) would replace this node with None.
        return self

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        # Layout: CONDITIONAL, assertion, NEXT, yes-branch [, NEXT,
        # no-branch], END. The no-branch is omitted when it compiles to
        # nothing.
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("%sCONDITIONAL %s %s" % (INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("%sEITHER" % (INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("%sOR" % (INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # NOTE(review): "and" binds tighter than "or", so this reads
        # (subpattern and yes_item) or no_item — possibly all three were
        # meant to be "and"ed (cf. is_atomic); confirm before changing.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None
3184
class PrecompiledCode(RegexBase):
    # Wraps an already-built code sequence so it can be spliced into the
    # compiled output unchanged.
    def __init__(self, code):
        # NOTE(review): unlike the other nodes, RegexBase.__init__ is not
        # called here — confirm this is intentional.
        self.code = code

    def _compile(self, reverse, fuzzy):
        return [tuple(self.code)]
3191
3192class Property(RegexBase):
3193    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
3194      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
3195      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
3196      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
3197      True): OP.PROPERTY_IGN_REV}
3198
3199    def __init__(self, value, positive=True, case_flags=NOCASE,
3200      zerowidth=False):
3201        RegexBase.__init__(self)
3202        self.value = value
3203        self.positive = bool(positive)
3204        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3205        self.zerowidth = bool(zerowidth)
3206
3207        self._key = (self.__class__, self.value, self.positive,
3208          self.case_flags, self.zerowidth)
3209
3210    def rebuild(self, positive, case_flags, zerowidth):
3211        return Property(self.value, positive, case_flags, zerowidth)
3212
3213    def optimise(self, info, reverse, in_set=False):
3214        return self
3215
3216    def get_firstset(self, reverse):
3217        return set([self])
3218
3219    def has_simple_start(self):
3220        return True
3221
3222    def _compile(self, reverse, fuzzy):
3223        flags = 0
3224        if self.positive:
3225            flags |= POSITIVE_OP
3226        if self.zerowidth:
3227            flags |= ZEROWIDTH_OP
3228        if fuzzy:
3229            flags |= FUZZY_OP
3230        return [(self._opcode[self.case_flags, reverse], flags, self.value)]
3231
3232    def dump(self, indent, reverse):
3233        prop = PROPERTY_NAMES[self.value >> 16]
3234        name, value = prop[0], prop[1][self.value & 0xFFFF]
3235        print "%sPROPERTY %s %s:%s%s" % (INDENT * indent,
3236          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags])
3237
3238    def matches(self, ch):
3239        return _regex.has_property_value(self.value, ch) == self.positive
3240
3241    def max_width(self):
3242        return 1
3243
class Prune(ZeroWidthBase):
    # Zero-width node compiled to a bare PRUNE opcode (presumably the
    # (*PRUNE) backtracking verb — confirm against the parser).
    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        return [(OP.PRUNE, )]
3249
3250class Range(RegexBase):
3251    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
3252      (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
3253      (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
3254      (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
3255    _op_name = "RANGE"
3256
3257    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
3258      zerowidth=False):
3259        RegexBase.__init__(self)
3260        self.lower = lower
3261        self.upper = upper
3262        self.positive = bool(positive)
3263        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3264        self.zerowidth = bool(zerowidth)
3265
3266        self._key = (self.__class__, self.lower, self.upper, self.positive,
3267          self.case_flags, self.zerowidth)
3268
3269    def rebuild(self, positive, case_flags, zerowidth):
3270        return Range(self.lower, self.upper, positive, case_flags, zerowidth)
3271
3272    def optimise(self, info, reverse, in_set=False):
3273        # Is the range case-sensitive?
3274        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
3275            return self
3276
3277        # Is full case-folding possible?
3278        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
3279          FULLIGNORECASE):
3280            return self
3281
3282        # Get the characters which expand to multiple codepoints on folding.
3283        expanding_chars = _regex.get_expand_on_folding()
3284
3285        # Get the folded characters in the range.
3286        items = []
3287        for ch in expanding_chars:
3288            if self.lower <= ord(ch) <= self.upper:
3289                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
3290                items.append(String([ord(c) for c in folded],
3291                  case_flags=self.case_flags))
3292
3293        if not items:
3294            # We can fall back to simple case-folding.
3295            return self
3296
3297        if len(items) < self.upper - self.lower + 1:
3298            # Not all the characters are covered by the full case-folding.
3299            items.insert(0, self)
3300
3301        return Branch(items)
3302
3303    def _compile(self, reverse, fuzzy):
3304        flags = 0
3305        if self.positive:
3306            flags |= POSITIVE_OP
3307        if self.zerowidth:
3308            flags |= ZEROWIDTH_OP
3309        if fuzzy:
3310            flags |= FUZZY_OP
3311        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
3312          self.upper)]
3313
3314    def dump(self, indent, reverse):
3315        display_lower = repr(unichr(self.lower)).lstrip("bu")
3316        display_upper = repr(unichr(self.upper)).lstrip("bu")
3317        print "%sRANGE %s %s %s%s" % (INDENT * indent, POS_TEXT[self.positive],
3318          display_lower, display_upper, CASE_TEXT[self.case_flags])
3319
3320    def matches(self, ch):
3321        return (self.lower <= ch <= self.upper) == self.positive
3322
3323    def max_width(self):
3324        return 1
3325
3326class RefGroup(RegexBase):
3327    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
3328      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
3329      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
3330      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
3331      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}
3332
3333    def __init__(self, info, group, position, case_flags=NOCASE):
3334        RegexBase.__init__(self)
3335        self.info = info
3336        self.group = group
3337        self.position = position
3338        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3339
3340        self._key = self.__class__, self.group, self.case_flags
3341
3342    def fix_groups(self, pattern, reverse, fuzzy):
3343        try:
3344            self.group = int(self.group)
3345        except ValueError:
3346            try:
3347                self.group = self.info.group_index[self.group]
3348            except KeyError:
3349                raise error("unknown group", pattern, self.position)
3350
3351        if not 1 <= self.group <= self.info.group_count:
3352            raise error("invalid group reference", pattern, self.position)
3353
3354        self._key = self.__class__, self.group, self.case_flags
3355
3356    def remove_captures(self):
3357        raise error("group reference not allowed", pattern, self.position)
3358
3359    def _compile(self, reverse, fuzzy):
3360        flags = 0
3361        if fuzzy:
3362            flags |= FUZZY_OP
3363        return [(self._opcode[self.case_flags, reverse], flags, self.group)]
3364
3365    def dump(self, indent, reverse):
3366        print "%sREF_GROUP %s%s" % (INDENT * indent, self.group,
3367          CASE_TEXT[self.case_flags])
3368
3369    def max_width(self):
3370        return UNLIMITED
3371
3372    def __del__(self):
3373        self.info = None
3374
class SearchAnchor(ZeroWidthBase):
    # Zero-width node compiled to the SEARCH_ANCHOR opcode (presumably the
    # \G anchor — confirm against the parser).
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"
3378
class Sequence(RegexBase):
    """A sequence of regex nodes, matched one after another."""

    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        items = []
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def contains_group(self):
        return any(s.contains_group() for s in self.items)

    def get_firstset(self, reverse):
        fs = set()
        items = self.items
        if reverse:
            # Bug fix: iterate over a reversed copy. The previous in-place
            # items.reverse() permanently reversed self.items as a side
            # effect (cf. _compile and get_required_string, which copy).
            items = items[::-1]
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            # This item can match empty, so the next item also contributes.
            fs.discard(None)

        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # When matching backwards the items are emitted in reverse order.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        """Turns the pending characters into Character/String nodes and
        appends them to 'items', clearing 'characters'.
        """
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, u''.join(unichr(c)
          for c in characters)).lower()
        chunks = []

        # Record every occurrence of an expanding folding in the folded
        # literal.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Alternate simple-folding spans with full-folding spans.
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        # Merge overlapping (start, end) spans into maximal disjoint spans.
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        # The first item that yields a required string wins; widths of the
        # items before it accumulate into the offset.
        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None
3591
3592class SetBase(RegexBase):
3593    def __init__(self, info, items, positive=True, case_flags=NOCASE,
3594      zerowidth=False):
3595        RegexBase.__init__(self)
3596        self.info = info
3597        self.items = tuple(items)
3598        self.positive = bool(positive)
3599        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3600        self.zerowidth = bool(zerowidth)
3601
3602        self.char_width = 1
3603
3604        self._key = (self.__class__, self.items, self.positive,
3605          self.case_flags, self.zerowidth)
3606
3607    def rebuild(self, positive, case_flags, zerowidth):
3608        return type(self)(self.info, self.items, positive, case_flags,
3609          zerowidth).optimise(self.info, False)
3610
3611    def get_firstset(self, reverse):
3612        return set([self])
3613
3614    def has_simple_start(self):
3615        return True
3616
3617    def _compile(self, reverse, fuzzy):
3618        flags = 0
3619        if self.positive:
3620            flags |= POSITIVE_OP
3621        if self.zerowidth:
3622            flags |= ZEROWIDTH_OP
3623        if fuzzy:
3624            flags |= FUZZY_OP
3625        code = [(self._opcode[self.case_flags, reverse], flags)]
3626        for m in self.items:
3627            code.extend(m.compile())
3628
3629        code.append((OP.END, ))
3630
3631        return code
3632
3633    def dump(self, indent, reverse):
3634        print "%s%s %s%s" % (INDENT * indent, self._op_name,
3635          POS_TEXT[self.positive], CASE_TEXT[self.case_flags])
3636        for i in self.items:
3637            i.dump(indent + 1, reverse)
3638
3639    def _handle_case_folding(self, info, in_set):
3640        # Is the set case-sensitive?
3641        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
3642            return self
3643
3644        # Is full case-folding possible?
3645        if (not (self.info.flags & UNICODE) or (self.case_flags &
3646          FULLIGNORECASE) != FULLIGNORECASE):
3647            return self
3648
3649        # Get the characters which expand to multiple codepoints on folding.
3650        expanding_chars = _regex.get_expand_on_folding()
3651
3652        # Get the folded characters in the set.
3653        items = []
3654        seen = set()
3655        for ch in expanding_chars:
3656            if self.matches(ord(ch)):
3657                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
3658                if folded not in seen:
3659                    items.append(String([ord(c) for c in folded],
3660                      case_flags=self.case_flags))
3661                    seen.add(folded)
3662
3663        if not items:
3664            # We can fall back to simple case-folding.
3665            return self
3666
3667        return Branch([self] + items)
3668
3669    def max_width(self):
3670        # Is the set case-sensitive?
3671        if not self.positive or not (self.case_flags & IGNORECASE):
3672            return 1
3673
3674        # Is full case-folding possible?
3675        if (not (self.info.flags & UNICODE) or (self.case_flags &
3676          FULLIGNORECASE) != FULLIGNORECASE):
3677            return 1
3678
3679        # Get the characters which expand to multiple codepoints on folding.
3680        expanding_chars = _regex.get_expand_on_folding()
3681
3682        # Get the folded characters in the set.
3683        seen = set()
3684        for ch in expanding_chars:
3685            if self.matches(ord(ch)):
3686                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
3687                seen.add(folded)
3688
3689        if not seen:
3690            return 1
3691
3692        return max(len(folded) for folded in seen)
3693
    def __del__(self):
        # Drop the reference to the Info object to help break the reference
        # cycle between the parsed pattern tree and its Info.
        self.info = None
3696
class SetDiff(SetBase):
    """A set difference: matches the first item but not the second."""
    _opcode = {
        (NOCASE, False): OP.SET_DIFF,
        (IGNORECASE, False): OP.SET_DIFF_IGN,
        (FULLCASE, False): OP.SET_DIFF,
        (FULLIGNORECASE, False): OP.SET_DIFF_IGN,
        (NOCASE, True): OP.SET_DIFF_REV,
        (IGNORECASE, True): OP.SET_DIFF_IGN_REV,
        (FULLCASE, True): OP.SET_DIFF_REV,
        (FULLIGNORECASE, True): OP.SET_DIFF_IGN_REV,
    }
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        members = self.items

        # Subtracting several items is the same as subtracting their union.
        if len(members) > 2:
            members = [members[0], SetUnion(info, members[1 : ])]

        # A single item isn't a difference at all; optimise it directly.
        if len(members) == 1:
            only = members[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth)
            return only.optimise(info, reverse, in_set)

        self.items = tuple(member.optimise(info, reverse, in_set=True) for
          member in members)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        included = self.items[0].matches(ch) and not self.items[1].matches(ch)
        return included == self.positive
3722
class SetInter(SetBase):
    """A set intersection: matches only if every item matches."""
    _opcode = {
        (NOCASE, False): OP.SET_INTER,
        (IGNORECASE, False): OP.SET_INTER_IGN,
        (FULLCASE, False): OP.SET_INTER,
        (FULLIGNORECASE, False): OP.SET_INTER_IGN,
        (NOCASE, True): OP.SET_INTER_REV,
        (IGNORECASE, True): OP.SET_INTER_IGN_REV,
        (FULLCASE, True): OP.SET_INTER_REV,
        (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV,
    }
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        # Optimise each member, flattening positive nested intersections
        # into this one.
        flattened = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetInter) and member.positive:
                # Intersection in intersection.
                flattened.extend(member.items)
            else:
                flattened.append(member)

        # An intersection of one item is just that item.
        if len(flattened) == 1:
            only = flattened[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth)
            return only.optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        in_all = all(member.matches(ch) for member in self.items)
        return in_all == self.positive
3752
class SetSymDiff(SetBase):
    """A set symmetric difference: matches if an odd number of items match."""
    _opcode = {
        (NOCASE, False): OP.SET_SYM_DIFF,
        (IGNORECASE, False): OP.SET_SYM_DIFF_IGN,
        (FULLCASE, False): OP.SET_SYM_DIFF,
        (FULLIGNORECASE, False): OP.SET_SYM_DIFF_IGN,
        (NOCASE, True): OP.SET_SYM_DIFF_REV,
        (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV,
        (FULLCASE, True): OP.SET_SYM_DIFF_REV,
        (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV,
    }
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        # Optimise each member, flattening positive nested symmetric
        # differences into this one.
        flattened = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetSymDiff) and member.positive:
                # Symmetric difference in symmetric difference.
                flattened.extend(member.items)
            else:
                flattened.append(member)

        # A symmetric difference of one item is just that item.
        if len(flattened) == 1:
            only = flattened[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth)
            return only.optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR together the membership of every item.
        parity = False
        for member in self.items:
            parity = parity != member.matches(ch)

        return parity == self.positive
3785
class SetUnion(SetBase):
    """A set union: matches if any of its items matches."""
    _opcode = {
        (NOCASE, False): OP.SET_UNION,
        (IGNORECASE, False): OP.SET_UNION_IGN,
        (FULLCASE, False): OP.SET_UNION,
        (FULLIGNORECASE, False): OP.SET_UNION_IGN,
        (NOCASE, True): OP.SET_UNION_REV,
        (IGNORECASE, True): OP.SET_UNION_IGN_REV,
        (FULLCASE, True): OP.SET_UNION_REV,
        (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV,
    }
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        # Optimise each member, flattening positive nested unions into this
        # one.
        flattened = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetUnion) and member.positive:
                # Union in union.
                flattened.extend(member.items)
            else:
                flattened.append(member)

        # A union of one item is just that item, with the polarities merged.
        if len(flattened) == 1:
            only = flattened[0]
            merged = only.with_flags(positive=only.positive == self.positive,
              case_flags=self.case_flags, zerowidth=self.zerowidth)
            return merged.optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        op_flags = 0
        if self.positive:
            op_flags |= POSITIVE_OP
        if self.zerowidth:
            op_flags |= ZEROWIDTH_OP
        if fuzzy:
            op_flags |= FUZZY_OP

        # Plain characters can be merged into CHARACTER/STRING operations;
        # any other member is compiled as-is. Characters are grouped by
        # their polarity.
        by_polarity = defaultdict(list)
        non_characters = []
        for member in self.items:
            if isinstance(member, Character):
                by_polarity[member.positive].append(member.value)
            else:
                non_characters.append(member)

        code = [(self._opcode[self.case_flags, reverse], op_flags)]

        for positive, values in by_polarity.items():
            member_flags = 0
            if positive:
                member_flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, member_flags, values[0]))
            else:
                code.append((OP.STRING, member_flags, len(values)) +
                  tuple(values))

        for member in non_characters:
            code.extend(member.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        in_any = any(member.matches(ch) for member in self.items)
        return in_any == self.positive
3851
class Skip(ZeroWidthBase):
    """The (*SKIP) regex control verb; zero-width."""
    _op_name = "SKIP"
    _opcode = OP.SKIP
3855
class StartOfLine(ZeroWidthBase):
    """Zero-width assertion for the start of a line."""
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"
3859
class StartOfLineU(StartOfLine):
    """Variant of StartOfLine compiled to the *_U opcode."""
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"
3863
class StartOfString(ZeroWidthBase):
    """Zero-width assertion for the start of the string (\\A)."""
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"
3867
class StartOfWord(ZeroWidthBase):
    """Zero-width assertion for the start of a word (\\m)."""
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"
3871
3872class String(RegexBase):
3873    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
3874      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
3875      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
3876      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
3877      OP.STRING_FLD_REV}
3878
3879    def __init__(self, characters, case_flags=NOCASE):
3880        self.characters = tuple(characters)
3881        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3882
3883        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
3884            folded_characters = []
3885            for char in self.characters:
3886                folded = _regex.fold_case(FULL_CASE_FOLDING, unichr(char))
3887                folded_characters.extend(ord(c) for c in folded)
3888        else:
3889            folded_characters = self.characters
3890
3891        self.folded_characters = tuple(folded_characters)
3892        self.required = False
3893
3894        self._key = self.__class__, self.characters, self.case_flags
3895
3896    def get_firstset(self, reverse):
3897        if reverse:
3898            pos = -1
3899        else:
3900            pos = 0
3901        return set([Character(self.characters[pos],
3902          case_flags=self.case_flags)])
3903
3904    def has_simple_start(self):
3905        return True
3906
3907    def _compile(self, reverse, fuzzy):
3908        flags = 0
3909        if fuzzy:
3910            flags |= FUZZY_OP
3911        if self.required:
3912            flags |= REQUIRED_OP
3913        return [(self._opcode[self.case_flags, reverse], flags,
3914          len(self.folded_characters)) + self.folded_characters]
3915
3916    def dump(self, indent, reverse):
3917        display = repr("".join(unichr(c) for c in self.characters)).lstrip("bu")
3918        print "%sSTRING %s%s" % (INDENT * indent, display,
3919          CASE_TEXT[self.case_flags])
3920
3921    def max_width(self):
3922        return len(self.folded_characters)
3923
3924    def get_required_string(self, reverse):
3925        return 0, self
3926
3927class Literal(String):
3928    def dump(self, indent, reverse):
3929        literal = ''.join(unichr(c) for c in self.characters)
3930        display = repr(literal).lstrip("bu")
3931        print "%sLITERAL MATCH %s%s" % (INDENT * indent, display,
3932          CASE_TEXT[self.case_flags])
3933
3934class StringSet(Branch):
3935    def __init__(self, info, name, case_flags=NOCASE):
3936        self.info = info
3937        self.name = name
3938        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3939
3940        self._key = self.__class__, self.name, self.case_flags
3941
3942        self.set_key = (name, self.case_flags)
3943        if self.set_key not in info.named_lists_used:
3944            info.named_lists_used[self.set_key] = len(info.named_lists_used)
3945
3946        index = self.info.named_lists_used[self.set_key]
3947        items = self.info.kwargs[self.name]
3948
3949        case_flags = self.case_flags
3950
3951        encoding = self.info.flags & _ALL_ENCODINGS
3952        fold_flags = encoding | case_flags
3953
3954        choices = []
3955        for string in items:
3956            choices.append([Character(ord(c), case_flags=case_flags) for c in
3957              string])
3958
3959        # Sort from longest to shortest.
3960        choices.sort(key=len, reverse=True)
3961
3962        self.branches = [Sequence(choice) for choice in choices]
3963
3964    def dump(self, indent, reverse):
3965        print "%sSTRING_SET %s%s" % (INDENT * indent, self.name,
3966          CASE_TEXT[self.case_flags])
3967
3968    def __del__(self):
3969        self.info = None
3970
class Source(object):
    "Scanner for the regular expression source string."
    def __init__(self, string):
        # Record the character-building function matching the pattern's
        # string type, so characters built via char_type have the same type.
        if isinstance(string, unicode):
            self.string = string
            self.char_type = unichr
        else:
            self.string = string
            self.char_type = chr

        self.pos = 0
        # When True (VERBOSE mode), whitespace and "#" comments in the
        # pattern are skipped.
        self.ignore_space = False
        # An empty string of the same type as the pattern ('' or u'').
        self.sep = string[ : 0]

    def get(self):
        """Returns the next character, or an empty string at the end.

        In ignore_space mode, whitespace and comments ("#" to the end of
        the line) are skipped first.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        """Returns up to the next 'count' characters as a string.

        In ignore_space mode, whitespace and comments are skipped between
        the characters. Fewer than 'count' characters are returned at the
        end of the string.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True):
        """Returns the run of characters which are in (or, if include is
        False, not in) test_set, honouring ignore_space mode.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        """Advances past the characters which are in (or, if include is
        False, not in) test_set, honouring ignore_space mode.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        """Tries to match 'substring' at the current position.

        Returns True and advances past it on success; returns False and
        leaves the position unchanged on failure. In ignore_space mode,
        whitespace and comments may appear between the characters of
        'substring'.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        # Matches 'substring' or raises a parse error at this position.
        if not self.match(substring):
            raise error("missing %s" % substring, self.string, self.pos)

    def at_end(self):
        """Returns whether only whitespace/comments (in ignore_space mode)
        or nothing at all remains in the source string.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        pos += 1
                    elif string[pos] == "#":
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True
4192
class Info(object):
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs=None):
        # Use a None sentinel instead of a mutable default argument: a
        # shared default dict could leak state between patterns.
        if kwargs is None:
            kwargs = {}

        # Fill in the default flags for the chosen behaviour version.
        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = kwargs

        self.group_count = 0        # Number of capture groups created so far.
        self.group_index = {}       # Group name -> group number.
        self.group_name = {}        # Group number -> group name.
        self.char_type = char_type
        self.named_lists_used = {}  # (name, case_flags) -> list index.
        self.open_groups = []       # Stack of currently open group numbers.
        self.open_group_count = {}  # Group number -> times opened.
        self.defined_groups = {}
        self.group_calls = []
        self.private_groups = {}

    def open_group(self, name=None):
        """Opens a capture group and returns its number.

        A nested occurrence of an already-open group gets a negative
        private alias which is resolved to a real number later.
        """
        group = self.group_index.get(name)
        if group is None:
            # Allocate the next group number, skipping any number already
            # taken by a named group.
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        # Closes the most recently opened group.
        self.open_groups.pop()

    def is_open_group(self, name):
        """Returns whether the group (by name or number string) is
        currently open.
        """
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups
4257
def _check_group_features(info, parsed):
    """Checks whether the reverse and fuzzy features of the group calls match
    the groups which they call.

    Stores the resulting call-reference table in info.call_refs and any
    feature-adjusted copies of groups in info.additional_groups. Each
    distinct (group, reverse, fuzzy) combination gets its own reference id.
    """
    call_refs = {}
    additional_groups = []
    for call, reverse, fuzzy in info.group_calls:
        # Look up the reference of this group call.
        key = (call.group, reverse, fuzzy)
        ref = call_refs.get(key)
        if ref is None:
            # This group doesn't have a reference yet, so look up its features.
            if call.group == 0:
                # Calling the pattern as a whole.
                rev = bool(info.flags & REVERSE)
                fuz = isinstance(parsed, Fuzzy)
                if (rev, fuz) != (reverse, fuzzy):
                    # The pattern as a whole doesn't have the features we want,
                    # so we'll need to make a copy of it with the desired
                    # features.
                    additional_groups.append((CallRef(len(call_refs), parsed),
                      reverse, fuzzy))
            else:
                # Calling a capture group.
                def_info = info.defined_groups[call.group]
                group = def_info[0]
                if def_info[1 : ] != (reverse, fuzzy):
                    # The group doesn't have the features we want, so we'll
                    # need to make a copy of it with the desired features.
                    additional_groups.append((group, reverse, fuzzy))

            # The reference id is simply the next table index.
            ref = len(call_refs)
            call_refs[key] = ref

        call.call_ref = ref

    info.call_refs = call_refs
    info.additional_groups = additional_groups
4296
def _get_required_string(parsed, flags):
    """Gets the required string and related info of a parsed pattern.

    Returns (offset, codepoints, case_flags); the offset is -1 when it is
    unbounded, and the result is (0, (), 0) when there is no required
    string.
    """
    reverse = bool(flags & REVERSE)
    req_offset, required = parsed.get_required_string(reverse)

    if not required:
        return 0, (), 0

    required.required = True

    # Report an unbounded offset as -1.
    if req_offset >= UNLIMITED:
        req_offset = -1

    req_flags = required.case_flags
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags
4317
class Scanner:
    """A simple lexer: matches a lexicon of (pattern, action) pairs against
    a string, applying the action of whichever phrase matches.
    """
    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern: each phrase becomes
        # a numbered group so m.lastindex identifies which phrase matched.
        info = Info(flags)
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible, and prepend it to the code.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(patterns))

    def scan(self, string):
        """Scans 'string', returning (results, remainder).

        Each match appends its action's result (the action itself if it's
        not callable, otherwise action(self, matched_text)) unless that
        result is None. Scanning stops at the first position with no match
        or with an empty match.
        """
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # An empty match would loop forever; stop here.
                break
            # m.lastindex is the 1-based number of the phrase that matched.
            action = self.lexicon[m.lastindex - 1][1]
            if hasattr(action, '__call__'):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]
4413
# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict: property id -> (name, values),
# preferring the longest (most descriptive) name for each id and value id.
PROPERTY_NAMES = {}
for prop_name, (prop_id, values) in PROPERTIES.items():
    name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {}))
    name = max(name, prop_name, key=len)
    PROPERTY_NAMES[prop_id] = name, prop_values

    for val_name, val_id in values.items():
        prop_values[val_id] = max(prop_values.get(val_id, ""), val_name,
          key=len)

# Character escape sequences.
CHARACTER_ESCAPES = {
    "a": "\a",
    "b": "\b",
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
    "v": "\v",
}

# Predefined character set escape sequences, mapped to property lookups.
CHARSET_ESCAPES = {
    "d": lookup_property(None, "Digit", True),
    "D": lookup_property(None, "Digit", False),
    "h": lookup_property(None, "Blank", True),
    "s": lookup_property(None, "Space", True),
    "S": lookup_property(None, "Space", False),
    "w": lookup_property(None, "Word", True),
    "W": lookup_property(None, "Word", False),
}

# Positional escape sequences.
POSITION_ESCAPES = {
    "A": StartOfString(),
    "b": Boundary(),
    "B": Boundary(False),
    "K": Keep(),
    "m": StartOfWord(),
    "M": EndOfWord(),
    "Z": EndOfString(),
}

# Positional escape sequences when WORD flag set; the default (Unicode
# word-break) variants replace the plain ones.
WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES)
WORD_POSITION_ESCAPES.update({
    "b": DefaultBoundary(),
    "B": DefaultBoundary(False),
    "m": DefaultStartOfWord(),
    "M": DefaultEndOfWord(),
})

# Regex control verbs.
VERBS = {
    "FAIL": Failure(),
    "F": Failure(),
    "PRUNE": Prune(),
    "SKIP": Skip(),
}
4477