1#
2# Secret Labs' Regular Expression Engine
3#
4# convert re-style regular expression to sre pattern
5#
6# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
7#
8# See the sre.py file for information on usage and redistribution.
9#
10
11"""Internal support module for sre"""
12
13# XXX: show string offset and offending character for all errors
14
15from sre_constants import *
16
17SPECIAL_CHARS = ".\\[{()*+?^$|"
18REPEAT_CHARS = "*+?{"
19
20DIGITS = frozenset("0123456789")
21
22OCTDIGITS = frozenset("01234567")
23HEXDIGITS = frozenset("0123456789abcdefABCDEF")
24ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
25
26WHITESPACE = frozenset(" \t\n\r\v\f")
27
28_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
29_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
30
31ESCAPES = {
32    r"\a": (LITERAL, ord("\a")),
33    r"\b": (LITERAL, ord("\b")),
34    r"\f": (LITERAL, ord("\f")),
35    r"\n": (LITERAL, ord("\n")),
36    r"\r": (LITERAL, ord("\r")),
37    r"\t": (LITERAL, ord("\t")),
38    r"\v": (LITERAL, ord("\v")),
39    r"\\": (LITERAL, ord("\\"))
40}
41
42CATEGORIES = {
43    r"\A": (AT, AT_BEGINNING_STRING), # start of string
44    r"\b": (AT, AT_BOUNDARY),
45    r"\B": (AT, AT_NON_BOUNDARY),
46    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
47    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
48    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
49    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
50    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
51    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
52    r"\Z": (AT, AT_END_STRING), # end of string
53}
54
55FLAGS = {
56    # standard flags
57    "i": SRE_FLAG_IGNORECASE,
58    "L": SRE_FLAG_LOCALE,
59    "m": SRE_FLAG_MULTILINE,
60    "s": SRE_FLAG_DOTALL,
61    "x": SRE_FLAG_VERBOSE,
62    # extensions
63    "a": SRE_FLAG_ASCII,
64    "t": SRE_FLAG_TEMPLATE,
65    "u": SRE_FLAG_UNICODE,
66}
67
68TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
69GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
70
71class Verbose(Exception):
72    pass
73
74class Pattern:
75    # master pattern object.  keeps track of global attributes
76    def __init__(self):
77        self.flags = 0
78        self.groupdict = {}
79        self.groupwidths = [None]  # group 0
80        self.lookbehindgroups = None
81    @property
82    def groups(self):
83        return len(self.groupwidths)
84    def opengroup(self, name=None):
85        gid = self.groups
86        self.groupwidths.append(None)
87        if self.groups > MAXGROUPS:
88            raise error("too many groups")
89        if name is not None:
90            ogid = self.groupdict.get(name, None)
91            if ogid is not None:
92                raise error("redefinition of group name %r as group %d; "
93                            "was group %d" % (name, gid,  ogid))
94            self.groupdict[name] = gid
95        return gid
96    def closegroup(self, gid, p):
97        self.groupwidths[gid] = p.getwidth()
98    def checkgroup(self, gid):
99        return gid < self.groups and self.groupwidths[gid] is not None
100
101    def checklookbehindgroup(self, gid, source):
102        if self.lookbehindgroups is not None:
103            if not self.checkgroup(gid):
104                raise source.error('cannot refer to an open group')
105            if gid >= self.lookbehindgroups:
106                raise source.error('cannot refer to group defined in the same '
107                                   'lookbehind subpattern')
108
109class SubPattern:
110    # a subpattern, in intermediate form
111    def __init__(self, pattern, data=None):
112        self.pattern = pattern
113        if data is None:
114            data = []
115        self.data = data
116        self.width = None
117
118    def dump(self, level=0):
119        nl = True
120        seqtypes = (tuple, list)
121        for op, av in self.data:
122            print(level*"  " + str(op), end='')
123            if op is IN:
124                # member sublanguage
125                print()
126                for op, a in av:
127                    print((level+1)*"  " + str(op), a)
128            elif op is BRANCH:
129                print()
130                for i, a in enumerate(av[1]):
131                    if i:
132                        print(level*"  " + "OR")
133                    a.dump(level+1)
134            elif op is GROUPREF_EXISTS:
135                condgroup, item_yes, item_no = av
136                print('', condgroup)
137                item_yes.dump(level+1)
138                if item_no:
139                    print(level*"  " + "ELSE")
140                    item_no.dump(level+1)
141            elif isinstance(av, seqtypes):
142                nl = False
143                for a in av:
144                    if isinstance(a, SubPattern):
145                        if not nl:
146                            print()
147                        a.dump(level+1)
148                        nl = True
149                    else:
150                        if not nl:
151                            print(' ', end='')
152                        print(a, end='')
153                        nl = False
154                if not nl:
155                    print()
156            else:
157                print('', av)
158    def __repr__(self):
159        return repr(self.data)
160    def __len__(self):
161        return len(self.data)
162    def __delitem__(self, index):
163        del self.data[index]
164    def __getitem__(self, index):
165        if isinstance(index, slice):
166            return SubPattern(self.pattern, self.data[index])
167        return self.data[index]
168    def __setitem__(self, index, code):
169        self.data[index] = code
170    def insert(self, index, code):
171        self.data.insert(index, code)
172    def append(self, code):
173        self.data.append(code)
174    def getwidth(self):
175        # determine the width (min, max) for this subpattern
176        if self.width is not None:
177            return self.width
178        lo = hi = 0
179        for op, av in self.data:
180            if op is BRANCH:
181                i = MAXREPEAT - 1
182                j = 0
183                for av in av[1]:
184                    l, h = av.getwidth()
185                    i = min(i, l)
186                    j = max(j, h)
187                lo = lo + i
188                hi = hi + j
189            elif op is CALL:
190                i, j = av.getwidth()
191                lo = lo + i
192                hi = hi + j
193            elif op is SUBPATTERN:
194                i, j = av[-1].getwidth()
195                lo = lo + i
196                hi = hi + j
197            elif op in _REPEATCODES:
198                i, j = av[2].getwidth()
199                lo = lo + i * av[0]
200                hi = hi + j * av[1]
201            elif op in _UNITCODES:
202                lo = lo + 1
203                hi = hi + 1
204            elif op is GROUPREF:
205                i, j = self.pattern.groupwidths[av]
206                lo = lo + i
207                hi = hi + j
208            elif op is GROUPREF_EXISTS:
209                i, j = av[1].getwidth()
210                if av[2] is not None:
211                    l, h = av[2].getwidth()
212                    i = min(i, l)
213                    j = max(j, h)
214                else:
215                    i = 0
216                lo = lo + i
217                hi = hi + j
218            elif op is SUCCESS:
219                break
220        self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
221        return self.width
222
223class Tokenizer:
224    def __init__(self, string):
225        self.istext = isinstance(string, str)
226        self.string = string
227        if not self.istext:
228            string = str(string, 'latin1')
229        self.decoded_string = string
230        self.index = 0
231        self.next = None
232        self.__next()
233    def __next(self):
234        index = self.index
235        try:
236            char = self.decoded_string[index]
237        except IndexError:
238            self.next = None
239            return
240        if char == "\\":
241            index += 1
242            try:
243                char += self.decoded_string[index]
244            except IndexError:
245                raise error("bad escape (end of pattern)",
246                            self.string, len(self.string) - 1) from None
247        self.index = index + 1
248        self.next = char
249    def match(self, char):
250        if char == self.next:
251            self.__next()
252            return True
253        return False
254    def get(self):
255        this = self.next
256        self.__next()
257        return this
258    def getwhile(self, n, charset):
259        result = ''
260        for _ in range(n):
261            c = self.next
262            if c not in charset:
263                break
264            result += c
265            self.__next()
266        return result
267    def getuntil(self, terminator):
268        result = ''
269        while True:
270            c = self.next
271            self.__next()
272            if c is None:
273                if not result:
274                    raise self.error("missing group name")
275                raise self.error("missing %s, unterminated name" % terminator,
276                                 len(result))
277            if c == terminator:
278                if not result:
279                    raise self.error("missing group name", 1)
280                break
281            result += c
282        return result
283    @property
284    def pos(self):
285        return self.index - len(self.next or '')
286    def tell(self):
287        return self.index - len(self.next or '')
288    def seek(self, index):
289        self.index = index
290        self.__next()
291
292    def error(self, msg, offset=0):
293        return error(msg, self.string, self.tell() - offset)
294
295def _class_escape(source, escape):
296    # handle escape code inside character class
297    code = ESCAPES.get(escape)
298    if code:
299        return code
300    code = CATEGORIES.get(escape)
301    if code and code[0] is IN:
302        return code
303    try:
304        c = escape[1:2]
305        if c == "x":
306            # hexadecimal escape (exactly two digits)
307            escape += source.getwhile(2, HEXDIGITS)
308            if len(escape) != 4:
309                raise source.error("incomplete escape %s" % escape, len(escape))
310            return LITERAL, int(escape[2:], 16)
311        elif c == "u" and source.istext:
312            # unicode escape (exactly four digits)
313            escape += source.getwhile(4, HEXDIGITS)
314            if len(escape) != 6:
315                raise source.error("incomplete escape %s" % escape, len(escape))
316            return LITERAL, int(escape[2:], 16)
317        elif c == "U" and source.istext:
318            # unicode escape (exactly eight digits)
319            escape += source.getwhile(8, HEXDIGITS)
320            if len(escape) != 10:
321                raise source.error("incomplete escape %s" % escape, len(escape))
322            c = int(escape[2:], 16)
323            chr(c) # raise ValueError for invalid code
324            return LITERAL, c
325        elif c in OCTDIGITS:
326            # octal escape (up to three digits)
327            escape += source.getwhile(2, OCTDIGITS)
328            c = int(escape[1:], 8)
329            if c > 0o377:
330                raise source.error('octal escape value %s outside of '
331                                   'range 0-0o377' % escape, len(escape))
332            return LITERAL, c
333        elif c in DIGITS:
334            raise ValueError
335        if len(escape) == 2:
336            if c in ASCIILETTERS:
337                raise source.error('bad escape %s' % escape, len(escape))
338            return LITERAL, ord(escape[1])
339    except ValueError:
340        pass
341    raise source.error("bad escape %s" % escape, len(escape))
342
343def _escape(source, escape, state):
344    # handle escape code in expression
345    code = CATEGORIES.get(escape)
346    if code:
347        return code
348    code = ESCAPES.get(escape)
349    if code:
350        return code
351    try:
352        c = escape[1:2]
353        if c == "x":
354            # hexadecimal escape
355            escape += source.getwhile(2, HEXDIGITS)
356            if len(escape) != 4:
357                raise source.error("incomplete escape %s" % escape, len(escape))
358            return LITERAL, int(escape[2:], 16)
359        elif c == "u" and source.istext:
360            # unicode escape (exactly four digits)
361            escape += source.getwhile(4, HEXDIGITS)
362            if len(escape) != 6:
363                raise source.error("incomplete escape %s" % escape, len(escape))
364            return LITERAL, int(escape[2:], 16)
365        elif c == "U" and source.istext:
366            # unicode escape (exactly eight digits)
367            escape += source.getwhile(8, HEXDIGITS)
368            if len(escape) != 10:
369                raise source.error("incomplete escape %s" % escape, len(escape))
370            c = int(escape[2:], 16)
371            chr(c) # raise ValueError for invalid code
372            return LITERAL, c
373        elif c == "0":
374            # octal escape
375            escape += source.getwhile(2, OCTDIGITS)
376            return LITERAL, int(escape[1:], 8)
377        elif c in DIGITS:
378            # octal escape *or* decimal group reference (sigh)
379            if source.next in DIGITS:
380                escape += source.get()
381                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
382                    source.next in OCTDIGITS):
383                    # got three octal digits; this is an octal escape
384                    escape += source.get()
385                    c = int(escape[1:], 8)
386                    if c > 0o377:
387                        raise source.error('octal escape value %s outside of '
388                                           'range 0-0o377' % escape,
389                                           len(escape))
390                    return LITERAL, c
391            # not an octal escape, so this is a group reference
392            group = int(escape[1:])
393            if group < state.groups:
394                if not state.checkgroup(group):
395                    raise source.error("cannot refer to an open group",
396                                       len(escape))
397                state.checklookbehindgroup(group, source)
398                return GROUPREF, group
399            raise source.error("invalid group reference %d" % group, len(escape) - 1)
400        if len(escape) == 2:
401            if c in ASCIILETTERS:
402                raise source.error("bad escape %s" % escape, len(escape))
403            return LITERAL, ord(escape[1])
404    except ValueError:
405        pass
406    raise source.error("bad escape %s" % escape, len(escape))
407
408def _uniq(items):
409    return list(dict.fromkeys(items))
410
411def _parse_sub(source, state, verbose, nested):
412    # parse an alternation: a|b|c
413
414    items = []
415    itemsappend = items.append
416    sourcematch = source.match
417    start = source.tell()
418    while True:
419        itemsappend(_parse(source, state, verbose, nested + 1,
420                           not nested and not items))
421        if not sourcematch("|"):
422            break
423
424    if len(items) == 1:
425        return items[0]
426
427    subpattern = SubPattern(state)
428
429    # check if all items share a common prefix
430    while True:
431        prefix = None
432        for item in items:
433            if not item:
434                break
435            if prefix is None:
436                prefix = item[0]
437            elif item[0] != prefix:
438                break
439        else:
440            # all subitems start with a common "prefix".
441            # move it out of the branch
442            for item in items:
443                del item[0]
444            subpattern.append(prefix)
445            continue # check next one
446        break
447
448    # check if the branch can be replaced by a character set
449    set = []
450    for item in items:
451        if len(item) != 1:
452            break
453        op, av = item[0]
454        if op is LITERAL:
455            set.append((op, av))
456        elif op is IN and av[0][0] is not NEGATE:
457            set.extend(av)
458        else:
459            break
460    else:
461        # we can store this as a character set instead of a
462        # branch (the compiler may optimize this even more)
463        subpattern.append((IN, _uniq(set)))
464        return subpattern
465
466    subpattern.append((BRANCH, (None, items)))
467    return subpattern
468
469def _parse(source, state, verbose, nested, first=False):
470    # parse a simple pattern
471    subpattern = SubPattern(state)
472
473    # precompute constants into local variables
474    subpatternappend = subpattern.append
475    sourceget = source.get
476    sourcematch = source.match
477    _len = len
478    _ord = ord
479
480    while True:
481
482        this = source.next
483        if this is None:
484            break # end of pattern
485        if this in "|)":
486            break # end of subpattern
487        sourceget()
488
489        if verbose:
490            # skip whitespace and comments
491            if this in WHITESPACE:
492                continue
493            if this == "#":
494                while True:
495                    this = sourceget()
496                    if this is None or this == "\n":
497                        break
498                continue
499
500        if this[0] == "\\":
501            code = _escape(source, this, state)
502            subpatternappend(code)
503
504        elif this not in SPECIAL_CHARS:
505            subpatternappend((LITERAL, _ord(this)))
506
507        elif this == "[":
508            here = source.tell() - 1
509            # character set
510            set = []
511            setappend = set.append
512##          if sourcematch(":"):
513##              pass # handle character classes
514            if source.next == '[':
515                import warnings
516                warnings.warn(
517                    'Possible nested set at position %d' % source.tell(),
518                    FutureWarning, stacklevel=nested + 6
519                )
520            negate = sourcematch("^")
521            # check remaining characters
522            while True:
523                this = sourceget()
524                if this is None:
525                    raise source.error("unterminated character set",
526                                       source.tell() - here)
527                if this == "]" and set:
528                    break
529                elif this[0] == "\\":
530                    code1 = _class_escape(source, this)
531                else:
532                    if set and this in '-&~|' and source.next == this:
533                        import warnings
534                        warnings.warn(
535                            'Possible set %s at position %d' % (
536                                'difference' if this == '-' else
537                                'intersection' if this == '&' else
538                                'symmetric difference' if this == '~' else
539                                'union',
540                                source.tell() - 1),
541                            FutureWarning, stacklevel=nested + 6
542                        )
543                    code1 = LITERAL, _ord(this)
544                if sourcematch("-"):
545                    # potential range
546                    that = sourceget()
547                    if that is None:
548                        raise source.error("unterminated character set",
549                                           source.tell() - here)
550                    if that == "]":
551                        if code1[0] is IN:
552                            code1 = code1[1][0]
553                        setappend(code1)
554                        setappend((LITERAL, _ord("-")))
555                        break
556                    if that[0] == "\\":
557                        code2 = _class_escape(source, that)
558                    else:
559                        if that == '-':
560                            import warnings
561                            warnings.warn(
562                                'Possible set difference at position %d' % (
563                                    source.tell() - 2),
564                                FutureWarning, stacklevel=nested + 6
565                            )
566                        code2 = LITERAL, _ord(that)
567                    if code1[0] != LITERAL or code2[0] != LITERAL:
568                        msg = "bad character range %s-%s" % (this, that)
569                        raise source.error(msg, len(this) + 1 + len(that))
570                    lo = code1[1]
571                    hi = code2[1]
572                    if hi < lo:
573                        msg = "bad character range %s-%s" % (this, that)
574                        raise source.error(msg, len(this) + 1 + len(that))
575                    setappend((RANGE, (lo, hi)))
576                else:
577                    if code1[0] is IN:
578                        code1 = code1[1][0]
579                    setappend(code1)
580
581            set = _uniq(set)
582            # XXX: <fl> should move set optimization to compiler!
583            if _len(set) == 1 and set[0][0] is LITERAL:
584                # optimization
585                if negate:
586                    subpatternappend((NOT_LITERAL, set[0][1]))
587                else:
588                    subpatternappend(set[0])
589            else:
590                if negate:
591                    set.insert(0, (NEGATE, None))
592                # charmap optimization can't be added here because
593                # global flags still are not known
594                subpatternappend((IN, set))
595
596        elif this in REPEAT_CHARS:
597            # repeat previous item
598            here = source.tell()
599            if this == "?":
600                min, max = 0, 1
601            elif this == "*":
602                min, max = 0, MAXREPEAT
603
604            elif this == "+":
605                min, max = 1, MAXREPEAT
606            elif this == "{":
607                if source.next == "}":
608                    subpatternappend((LITERAL, _ord(this)))
609                    continue
610
611                min, max = 0, MAXREPEAT
612                lo = hi = ""
613                while source.next in DIGITS:
614                    lo += sourceget()
615                if sourcematch(","):
616                    while source.next in DIGITS:
617                        hi += sourceget()
618                else:
619                    hi = lo
620                if not sourcematch("}"):
621                    subpatternappend((LITERAL, _ord(this)))
622                    source.seek(here)
623                    continue
624
625                if lo:
626                    min = int(lo)
627                    if min >= MAXREPEAT:
628                        raise OverflowError("the repetition number is too large")
629                if hi:
630                    max = int(hi)
631                    if max >= MAXREPEAT:
632                        raise OverflowError("the repetition number is too large")
633                    if max < min:
634                        raise source.error("min repeat greater than max repeat",
635                                           source.tell() - here)
636            else:
637                raise AssertionError("unsupported quantifier %r" % (char,))
638            # figure out which item to repeat
639            if subpattern:
640                item = subpattern[-1:]
641            else:
642                item = None
643            if not item or item[0][0] is AT:
644                raise source.error("nothing to repeat",
645                                   source.tell() - here + len(this))
646            if item[0][0] in _REPEATCODES:
647                raise source.error("multiple repeat",
648                                   source.tell() - here + len(this))
649            if item[0][0] is SUBPATTERN:
650                group, add_flags, del_flags, p = item[0][1]
651                if group is None and not add_flags and not del_flags:
652                    item = p
653            if sourcematch("?"):
654                subpattern[-1] = (MIN_REPEAT, (min, max, item))
655            else:
656                subpattern[-1] = (MAX_REPEAT, (min, max, item))
657
658        elif this == ".":
659            subpatternappend((ANY, None))
660
661        elif this == "(":
662            start = source.tell() - 1
663            group = True
664            name = None
665            add_flags = 0
666            del_flags = 0
667            if sourcematch("?"):
668                # options
669                char = sourceget()
670                if char is None:
671                    raise source.error("unexpected end of pattern")
672                if char == "P":
673                    # python extensions
674                    if sourcematch("<"):
675                        # named group: skip forward to end of name
676                        name = source.getuntil(">")
677                        if not name.isidentifier():
678                            msg = "bad character in group name %r" % name
679                            raise source.error(msg, len(name) + 1)
680                    elif sourcematch("="):
681                        # named backreference
682                        name = source.getuntil(")")
683                        if not name.isidentifier():
684                            msg = "bad character in group name %r" % name
685                            raise source.error(msg, len(name) + 1)
686                        gid = state.groupdict.get(name)
687                        if gid is None:
688                            msg = "unknown group name %r" % name
689                            raise source.error(msg, len(name) + 1)
690                        if not state.checkgroup(gid):
691                            raise source.error("cannot refer to an open group",
692                                               len(name) + 1)
693                        state.checklookbehindgroup(gid, source)
694                        subpatternappend((GROUPREF, gid))
695                        continue
696
697                    else:
698                        char = sourceget()
699                        if char is None:
700                            raise source.error("unexpected end of pattern")
701                        raise source.error("unknown extension ?P" + char,
702                                           len(char) + 2)
703                elif char == ":":
704                    # non-capturing group
705                    group = None
706                elif char == "#":
707                    # comment
708                    while True:
709                        if source.next is None:
710                            raise source.error("missing ), unterminated comment",
711                                               source.tell() - start)
712                        if sourceget() == ")":
713                            break
714                    continue
715
716                elif char in "=!<":
717                    # lookahead assertions
718                    dir = 1
719                    if char == "<":
720                        char = sourceget()
721                        if char is None:
722                            raise source.error("unexpected end of pattern")
723                        if char not in "=!":
724                            raise source.error("unknown extension ?<" + char,
725                                               len(char) + 2)
726                        dir = -1 # lookbehind
727                        lookbehindgroups = state.lookbehindgroups
728                        if lookbehindgroups is None:
729                            state.lookbehindgroups = state.groups
730                    p = _parse_sub(source, state, verbose, nested + 1)
731                    if dir < 0:
732                        if lookbehindgroups is None:
733                            state.lookbehindgroups = None
734                    if not sourcematch(")"):
735                        raise source.error("missing ), unterminated subpattern",
736                                           source.tell() - start)
737                    if char == "=":
738                        subpatternappend((ASSERT, (dir, p)))
739                    else:
740                        subpatternappend((ASSERT_NOT, (dir, p)))
741                    continue
742
743                elif char == "(":
744                    # conditional backreference group
745                    condname = source.getuntil(")")
746                    if condname.isidentifier():
747                        condgroup = state.groupdict.get(condname)
748                        if condgroup is None:
749                            msg = "unknown group name %r" % condname
750                            raise source.error(msg, len(condname) + 1)
751                    else:
752                        try:
753                            condgroup = int(condname)
754                            if condgroup < 0:
755                                raise ValueError
756                        except ValueError:
757                            msg = "bad character in group name %r" % condname
758                            raise source.error(msg, len(condname) + 1) from None
759                        if not condgroup:
760                            raise source.error("bad group number",
761                                               len(condname) + 1)
762                        if condgroup >= MAXGROUPS:
763                            msg = "invalid group reference %d" % condgroup
764                            raise source.error(msg, len(condname) + 1)
765                    state.checklookbehindgroup(condgroup, source)
766                    item_yes = _parse(source, state, verbose, nested + 1)
767                    if source.match("|"):
768                        item_no = _parse(source, state, verbose, nested + 1)
769                        if source.next == "|":
770                            raise source.error("conditional backref with more than two branches")
771                    else:
772                        item_no = None
773                    if not source.match(")"):
774                        raise source.error("missing ), unterminated subpattern",
775                                           source.tell() - start)
776                    subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
777                    continue
778
779                elif char in FLAGS or char == "-":
780                    # flags
781                    flags = _parse_flags(source, state, char)
782                    if flags is None:  # global flags
783                        if not first or subpattern:
784                            import warnings
785                            warnings.warn(
786                                'Flags not at the start of the expression %r%s' % (
787                                    source.string[:20],  # truncate long regexes
788                                    ' (truncated)' if len(source.string) > 20 else '',
789                                ),
790                                DeprecationWarning, stacklevel=nested + 6
791                            )
792                        if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
793                            raise Verbose
794                        continue
795
796                    add_flags, del_flags = flags
797                    group = None
798                else:
799                    raise source.error("unknown extension ?" + char,
800                                       len(char) + 1)
801
802            # parse group contents
803            if group is not None:
804                try:
805                    group = state.opengroup(name)
806                except error as err:
807                    raise source.error(err.msg, len(name) + 1) from None
808            sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
809                           not (del_flags & SRE_FLAG_VERBOSE))
810            p = _parse_sub(source, state, sub_verbose, nested + 1)
811            if not source.match(")"):
812                raise source.error("missing ), unterminated subpattern",
813                                   source.tell() - start)
814            if group is not None:
815                state.closegroup(group, p)
816            subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
817
818        elif this == "^":
819            subpatternappend((AT, AT_BEGINNING))
820
821        elif this == "$":
822            subpatternappend((AT, AT_END))
823
824        else:
825            raise AssertionError("unsupported special character %r" % (char,))
826
827    # unpack non-capturing groups
828    for i in range(len(subpattern))[::-1]:
829        op, av = subpattern[i]
830        if op is SUBPATTERN:
831            group, add_flags, del_flags, p = av
832            if group is None and not add_flags and not del_flags:
833                subpattern[i: i+1] = p
834
835    return subpattern
836
837def _parse_flags(source, state, char):
838    sourceget = source.get
839    add_flags = 0
840    del_flags = 0
841    if char != "-":
842        while True:
843            flag = FLAGS[char]
844            if source.istext:
845                if char == 'L':
846                    msg = "bad inline flags: cannot use 'L' flag with a str pattern"
847                    raise source.error(msg)
848            else:
849                if char == 'u':
850                    msg = "bad inline flags: cannot use 'u' flag with a bytes pattern"
851                    raise source.error(msg)
852            add_flags |= flag
853            if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag:
854                msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible"
855                raise source.error(msg)
856            char = sourceget()
857            if char is None:
858                raise source.error("missing -, : or )")
859            if char in ")-:":
860                break
861            if char not in FLAGS:
862                msg = "unknown flag" if char.isalpha() else "missing -, : or )"
863                raise source.error(msg, len(char))
864    if char == ")":
865        state.flags |= add_flags
866        return None
867    if add_flags & GLOBAL_FLAGS:
868        raise source.error("bad inline flags: cannot turn on global flag", 1)
869    if char == "-":
870        char = sourceget()
871        if char is None:
872            raise source.error("missing flag")
873        if char not in FLAGS:
874            msg = "unknown flag" if char.isalpha() else "missing flag"
875            raise source.error(msg, len(char))
876        while True:
877            flag = FLAGS[char]
878            if flag & TYPE_FLAGS:
879                msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'"
880                raise source.error(msg)
881            del_flags |= flag
882            char = sourceget()
883            if char is None:
884                raise source.error("missing :")
885            if char == ":":
886                break
887            if char not in FLAGS:
888                msg = "unknown flag" if char.isalpha() else "missing :"
889                raise source.error(msg, len(char))
890    assert char == ":"
891    if del_flags & GLOBAL_FLAGS:
892        raise source.error("bad inline flags: cannot turn off global flag", 1)
893    if add_flags & del_flags:
894        raise source.error("bad inline flags: flag turned on and off", 1)
895    return add_flags, del_flags
896
897def fix_flags(src, flags):
898    # Check and fix flags according to the type of pattern (str or bytes)
899    if isinstance(src, str):
900        if flags & SRE_FLAG_LOCALE:
901            raise ValueError("cannot use LOCALE flag with a str pattern")
902        if not flags & SRE_FLAG_ASCII:
903            flags |= SRE_FLAG_UNICODE
904        elif flags & SRE_FLAG_UNICODE:
905            raise ValueError("ASCII and UNICODE flags are incompatible")
906    else:
907        if flags & SRE_FLAG_UNICODE:
908            raise ValueError("cannot use UNICODE flag with a bytes pattern")
909        if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
910            raise ValueError("ASCII and LOCALE flags are incompatible")
911    return flags
912
913def parse(str, flags=0, pattern=None):
914    # parse 're' pattern into list of (opcode, argument) tuples
915
916    source = Tokenizer(str)
917
918    if pattern is None:
919        pattern = Pattern()
920    pattern.flags = flags
921    pattern.str = str
922
923    try:
924        p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, 0)
925    except Verbose:
926        # the VERBOSE flag was switched on inside the pattern.  to be
927        # on the safe side, we'll parse the whole thing again...
928        pattern = Pattern()
929        pattern.flags = flags | SRE_FLAG_VERBOSE
930        pattern.str = str
931        source.seek(0)
932        p = _parse_sub(source, pattern, True, 0)
933
934    p.pattern.flags = fix_flags(str, p.pattern.flags)
935
936    if source.next is not None:
937        assert source.next == ")"
938        raise source.error("unbalanced parenthesis")
939
940    if flags & SRE_FLAG_DEBUG:
941        p.dump()
942
943    return p
944
945def parse_template(source, pattern):
946    # parse 're' replacement string into list of literals and
947    # group references
948    s = Tokenizer(source)
949    sget = s.get
950    groups = []
951    literals = []
952    literal = []
953    lappend = literal.append
954    def addgroup(index, pos):
955        if index > pattern.groups:
956            raise s.error("invalid group reference %d" % index, pos)
957        if literal:
958            literals.append(''.join(literal))
959            del literal[:]
960        groups.append((len(literals), index))
961        literals.append(None)
962    groupindex = pattern.groupindex
963    while True:
964        this = sget()
965        if this is None:
966            break # end of replacement string
967        if this[0] == "\\":
968            # group
969            c = this[1]
970            if c == "g":
971                name = ""
972                if not s.match("<"):
973                    raise s.error("missing <")
974                name = s.getuntil(">")
975                if name.isidentifier():
976                    try:
977                        index = groupindex[name]
978                    except KeyError:
979                        raise IndexError("unknown group name %r" % name)
980                else:
981                    try:
982                        index = int(name)
983                        if index < 0:
984                            raise ValueError
985                    except ValueError:
986                        raise s.error("bad character in group name %r" % name,
987                                      len(name) + 1) from None
988                    if index >= MAXGROUPS:
989                        raise s.error("invalid group reference %d" % index,
990                                      len(name) + 1)
991                addgroup(index, len(name) + 1)
992            elif c == "0":
993                if s.next in OCTDIGITS:
994                    this += sget()
995                    if s.next in OCTDIGITS:
996                        this += sget()
997                lappend(chr(int(this[1:], 8) & 0xff))
998            elif c in DIGITS:
999                isoctal = False
1000                if s.next in DIGITS:
1001                    this += sget()
1002                    if (c in OCTDIGITS and this[2] in OCTDIGITS and
1003                        s.next in OCTDIGITS):
1004                        this += sget()
1005                        isoctal = True
1006                        c = int(this[1:], 8)
1007                        if c > 0o377:
1008                            raise s.error('octal escape value %s outside of '
1009                                          'range 0-0o377' % this, len(this))
1010                        lappend(chr(c))
1011                if not isoctal:
1012                    addgroup(int(this[1:]), len(this) - 1)
1013            else:
1014                try:
1015                    this = chr(ESCAPES[this][1])
1016                except KeyError:
1017                    if c in ASCIILETTERS:
1018                        raise s.error('bad escape %s' % this, len(this))
1019                lappend(this)
1020        else:
1021            lappend(this)
1022    if literal:
1023        literals.append(''.join(literal))
1024    if not isinstance(source, str):
1025        # The tokenizer implicitly decodes bytes objects as latin-1, we must
1026        # therefore re-encode the final representation.
1027        literals = [None if s is None else s.encode('latin-1') for s in literals]
1028    return groups, literals
1029
1030def expand_template(template, match):
1031    g = match.group
1032    empty = match.string[:0]
1033    groups, literals = template
1034    literals = literals[:]
1035    try:
1036        for index, group in groups:
1037            literals[index] = g(group) or empty
1038    except IndexError:
1039        raise error("invalid group reference %d" % index)
1040    return empty.join(literals)
1041