1import re
2import sys
3
4# Reason last stmt is continued (or C_NONE if it's not).
5(C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
6 C_STRING_NEXT_LINES, C_BRACKET) = range(5)
7
8if 0:   # for throwaway debugging output
9    def dump(*stuff):
10        sys.__stdout__.write(" ".join(map(str, stuff)) + "\n")
11
12# Find what looks like the start of a popular stmt.
13
14_synchre = re.compile(r"""
15    ^
16    [ \t]*
17    (?: while
18    |   else
19    |   def
20    |   return
21    |   assert
22    |   break
23    |   class
24    |   continue
25    |   elif
26    |   try
27    |   except
28    |   raise
29    |   import
30    |   yield
31    )
32    \b
33""", re.VERBOSE | re.MULTILINE).search
34
35# Match blank line or non-indenting comment line.
36
37_junkre = re.compile(r"""
38    [ \t]*
39    (?: \# \S .* )?
40    \n
41""", re.VERBOSE).match
42
43# Match any flavor of string; the terminating quote is optional
44# so that we're robust in the face of incomplete program text.
45
46_match_stringre = re.compile(r"""
47    \""" [^"\\]* (?:
48                     (?: \\. | "(?!"") )
49                     [^"\\]*
50                 )*
51    (?: \""" )?
52
53|   " [^"\\\n]* (?: \\. [^"\\\n]* )* "?
54
55|   ''' [^'\\]* (?:
56                   (?: \\. | '(?!'') )
57                   [^'\\]*
58                )*
59    (?: ''' )?
60
61|   ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
62""", re.VERBOSE | re.DOTALL).match
63
64# Match a line that starts with something interesting;
65# used to find the first item of a bracket structure.
66
67_itemre = re.compile(r"""
68    [ \t]*
69    [^\s#\\]    # if we match, m.end()-1 is the interesting char
70""", re.VERBOSE).match
71
72# Match start of stmts that should be followed by a dedent.
73
74_closere = re.compile(r"""
75    \s*
76    (?: return
77    |   break
78    |   continue
79    |   raise
80    |   pass
81    )
82    \b
83""", re.VERBOSE).match
84
85# Chew up non-special chars as quickly as possible.  If match is
86# successful, m.end() less 1 is the index of the last boring char
87# matched.  If match is unsuccessful, the string starts with an
88# interesting char.
89
90_chew_ordinaryre = re.compile(r"""
91    [^[\](){}#'"\\]+
92""", re.VERBOSE).match
93
94# Build translation table to map uninteresting chars to "x", open
95# brackets to "(", and close brackets to ")".
96
97_tran = ['x'] * 256
98for ch in "({[":
99    _tran[ord(ch)] = '('
100for ch in ")}]":
101    _tran[ord(ch)] = ')'
102for ch in "\"'\\\n#":
103    _tran[ord(ch)] = ch
104_tran = ''.join(_tran)
105del ch
106
107try:
108    UnicodeType = type(unicode(""))
109except NameError:
110    UnicodeType = None
111
112class Parser:
113
114    def __init__(self, indentwidth, tabwidth):
115        self.indentwidth = indentwidth
116        self.tabwidth = tabwidth
117
118    def set_str(self, str):
119        assert len(str) == 0 or str[-1] == '\n'
120        if type(str) is UnicodeType:
121            # The parse functions have no idea what to do with Unicode, so
122            # replace all Unicode characters with "x".  This is "safe"
123            # so long as the only characters germane to parsing the structure
124            # of Python are 7-bit ASCII.  It's *necessary* because Unicode
125            # strings don't have a .translate() method that supports
126            # deletechars.
127            uniphooey = str
128            str = []
129            push = str.append
130            for raw in map(ord, uniphooey):
131                push(raw < 127 and chr(raw) or "x")
132            str = "".join(str)
133        self.str = str
134        self.study_level = 0
135
136    # Return index of a good place to begin parsing, as close to the
137    # end of the string as possible.  This will be the start of some
138    # popular stmt like "if" or "def".  Return None if none found:
139    # the caller should pass more prior context then, if possible, or
140    # if not (the entire program text up until the point of interest
141    # has already been tried) pass 0 to set_lo.
142    #
143    # This will be reliable iff given a reliable is_char_in_string
144    # function, meaning that when it says "no", it's absolutely
145    # guaranteed that the char is not in a string.
146
147    def find_good_parse_start(self, is_char_in_string=None,
148                              _synchre=_synchre):
149        str, pos = self.str, None
150
151        if not is_char_in_string:
152            # no clue -- make the caller pass everything
153            return None
154
155        # Peek back from the end for a good place to start,
156        # but don't try too often; pos will be left None, or
157        # bumped to a legitimate synch point.
158        limit = len(str)
159        for tries in range(5):
160            i = str.rfind(":\n", 0, limit)
161            if i < 0:
162                break
163            i = str.rfind('\n', 0, i) + 1  # start of colon line
164            m = _synchre(str, i, limit)
165            if m and not is_char_in_string(m.start()):
166                pos = m.start()
167                break
168            limit = i
169        if pos is None:
170            # Nothing looks like a block-opener, or stuff does
171            # but is_char_in_string keeps returning true; most likely
172            # we're in or near a giant string, the colorizer hasn't
173            # caught up enough to be helpful, or there simply *aren't*
174            # any interesting stmts.  In any of these cases we're
175            # going to have to parse the whole thing to be sure, so
176            # give it one last try from the start, but stop wasting
177            # time here regardless of the outcome.
178            m = _synchre(str)
179            if m and not is_char_in_string(m.start()):
180                pos = m.start()
181            return pos
182
183        # Peeking back worked; look forward until _synchre no longer
184        # matches.
185        i = pos + 1
186        while 1:
187            m = _synchre(str, i)
188            if m:
189                s, i = m.span()
190                if not is_char_in_string(s):
191                    pos = s
192            else:
193                break
194        return pos
195
196    # Throw away the start of the string.  Intended to be called with
197    # find_good_parse_start's result.
198
199    def set_lo(self, lo):
200        assert lo == 0 or self.str[lo-1] == '\n'
201        if lo > 0:
202            self.str = self.str[lo:]
203
204    # As quickly as humanly possible <wink>, find the line numbers (0-
205    # based) of the non-continuation lines.
206    # Creates self.{goodlines, continuation}.
207
208    def _study1(self):
209        if self.study_level >= 1:
210            return
211        self.study_level = 1
212
213        # Map all uninteresting characters to "x", all open brackets
214        # to "(", all close brackets to ")", then collapse runs of
215        # uninteresting characters.  This can cut the number of chars
216        # by a factor of 10-40, and so greatly speed the following loop.
217        str = self.str
218        str = str.translate(_tran)
219        str = str.replace('xxxxxxxx', 'x')
220        str = str.replace('xxxx', 'x')
221        str = str.replace('xx', 'x')
222        str = str.replace('xx', 'x')
223        str = str.replace('\nx', '\n')
224        # note that replacing x\n with \n would be incorrect, because
225        # x may be preceded by a backslash
226
227        # March over the squashed version of the program, accumulating
228        # the line numbers of non-continued stmts, and determining
229        # whether & why the last stmt is a continuation.
230        continuation = C_NONE
231        level = lno = 0     # level is nesting level; lno is line number
232        self.goodlines = goodlines = [0]
233        push_good = goodlines.append
234        i, n = 0, len(str)
235        while i < n:
236            ch = str[i]
237            i = i+1
238
239            # cases are checked in decreasing order of frequency
240            if ch == 'x':
241                continue
242
243            if ch == '\n':
244                lno = lno + 1
245                if level == 0:
246                    push_good(lno)
247                    # else we're in an unclosed bracket structure
248                continue
249
250            if ch == '(':
251                level = level + 1
252                continue
253
254            if ch == ')':
255                if level:
256                    level = level - 1
257                    # else the program is invalid, but we can't complain
258                continue
259
260            if ch == '"' or ch == "'":
261                # consume the string
262                quote = ch
263                if str[i-1:i+2] == quote * 3:
264                    quote = quote * 3
265                firstlno = lno
266                w = len(quote) - 1
267                i = i+w
268                while i < n:
269                    ch = str[i]
270                    i = i+1
271
272                    if ch == 'x':
273                        continue
274
275                    if str[i-1:i+w] == quote:
276                        i = i+w
277                        break
278
279                    if ch == '\n':
280                        lno = lno + 1
281                        if w == 0:
282                            # unterminated single-quoted string
283                            if level == 0:
284                                push_good(lno)
285                            break
286                        continue
287
288                    if ch == '\\':
289                        assert i < n
290                        if str[i] == '\n':
291                            lno = lno + 1
292                        i = i+1
293                        continue
294
295                    # else comment char or paren inside string
296
297                else:
298                    # didn't break out of the loop, so we're still
299                    # inside a string
300                    if (lno - 1) == firstlno:
301                        # before the previous \n in str, we were in the first
302                        # line of the string
303                        continuation = C_STRING_FIRST_LINE
304                    else:
305                        continuation = C_STRING_NEXT_LINES
306                continue    # with outer loop
307
308            if ch == '#':
309                # consume the comment
310                i = str.find('\n', i)
311                assert i >= 0
312                continue
313
314            assert ch == '\\'
315            assert i < n
316            if str[i] == '\n':
317                lno = lno + 1
318                if i+1 == n:
319                    continuation = C_BACKSLASH
320            i = i+1
321
322        # The last stmt may be continued for all 3 reasons.
323        # String continuation takes precedence over bracket
324        # continuation, which beats backslash continuation.
325        if (continuation != C_STRING_FIRST_LINE
326            and continuation != C_STRING_NEXT_LINES and level > 0):
327            continuation = C_BRACKET
328        self.continuation = continuation
329
330        # Push the final line number as a sentinel value, regardless of
331        # whether it's continued.
332        assert (continuation == C_NONE) == (goodlines[-1] == lno)
333        if goodlines[-1] != lno:
334            push_good(lno)
335
336    def get_continuation_type(self):
337        self._study1()
338        return self.continuation
339
340    # study1 was sufficient to determine the continuation status,
341    # but doing more requires looking at every character.  study2
342    # does this for the last interesting statement in the block.
343    # Creates:
344    #     self.stmt_start, stmt_end
345    #         slice indices of last interesting stmt
346    #     self.stmt_bracketing
347    #         the bracketing structure of the last interesting stmt;
348    #         for example, for the statement "say(boo) or die", stmt_bracketing
349    #         will be [(0, 0), (3, 1), (8, 0)]. Strings and comments are
350    #         treated as brackets, for the matter.
351    #     self.lastch
352    #         last non-whitespace character before optional trailing
353    #         comment
354    #     self.lastopenbracketpos
355    #         if continuation is C_BRACKET, index of last open bracket
356
357    def _study2(self):
358        if self.study_level >= 2:
359            return
360        self._study1()
361        self.study_level = 2
362
363        # Set p and q to slice indices of last interesting stmt.
364        str, goodlines = self.str, self.goodlines
365        i = len(goodlines) - 1
366        p = len(str)    # index of newest line
367        while i:
368            assert p
369            # p is the index of the stmt at line number goodlines[i].
370            # Move p back to the stmt at line number goodlines[i-1].
371            q = p
372            for nothing in range(goodlines[i-1], goodlines[i]):
373                # tricky: sets p to 0 if no preceding newline
374                p = str.rfind('\n', 0, p-1) + 1
375            # The stmt str[p:q] isn't a continuation, but may be blank
376            # or a non-indenting comment line.
377            if  _junkre(str, p):
378                i = i-1
379            else:
380                break
381        if i == 0:
382            # nothing but junk!
383            assert p == 0
384            q = p
385        self.stmt_start, self.stmt_end = p, q
386
387        # Analyze this stmt, to find the last open bracket (if any)
388        # and last interesting character (if any).
389        lastch = ""
390        stack = []  # stack of open bracket indices
391        push_stack = stack.append
392        bracketing = [(p, 0)]
393        while p < q:
394            # suck up all except ()[]{}'"#\\
395            m = _chew_ordinaryre(str, p, q)
396            if m:
397                # we skipped at least one boring char
398                newp = m.end()
399                # back up over totally boring whitespace
400                i = newp - 1    # index of last boring char
401                while i >= p and str[i] in " \t\n":
402                    i = i-1
403                if i >= p:
404                    lastch = str[i]
405                p = newp
406                if p >= q:
407                    break
408
409            ch = str[p]
410
411            if ch in "([{":
412                push_stack(p)
413                bracketing.append((p, len(stack)))
414                lastch = ch
415                p = p+1
416                continue
417
418            if ch in ")]}":
419                if stack:
420                    del stack[-1]
421                lastch = ch
422                p = p+1
423                bracketing.append((p, len(stack)))
424                continue
425
426            if ch == '"' or ch == "'":
427                # consume string
428                # Note that study1 did this with a Python loop, but
429                # we use a regexp here; the reason is speed in both
430                # cases; the string may be huge, but study1 pre-squashed
431                # strings to a couple of characters per line.  study1
432                # also needed to keep track of newlines, and we don't
433                # have to.
434                bracketing.append((p, len(stack)+1))
435                lastch = ch
436                p = _match_stringre(str, p, q).end()
437                bracketing.append((p, len(stack)))
438                continue
439
440            if ch == '#':
441                # consume comment and trailing newline
442                bracketing.append((p, len(stack)+1))
443                p = str.find('\n', p, q) + 1
444                assert p > 0
445                bracketing.append((p, len(stack)))
446                continue
447
448            assert ch == '\\'
449            p = p+1     # beyond backslash
450            assert p < q
451            if str[p] != '\n':
452                # the program is invalid, but can't complain
453                lastch = ch + str[p]
454            p = p+1     # beyond escaped char
455
456        # end while p < q:
457
458        self.lastch = lastch
459        if stack:
460            self.lastopenbracketpos = stack[-1]
461        self.stmt_bracketing = tuple(bracketing)
462
463    # Assuming continuation is C_BRACKET, return the number
464    # of spaces the next line should be indented.
465
466    def compute_bracket_indent(self):
467        self._study2()
468        assert self.continuation == C_BRACKET
469        j = self.lastopenbracketpos
470        str = self.str
471        n = len(str)
472        origi = i = str.rfind('\n', 0, j) + 1
473        j = j+1     # one beyond open bracket
474        # find first list item; set i to start of its line
475        while j < n:
476            m = _itemre(str, j)
477            if m:
478                j = m.end() - 1     # index of first interesting char
479                extra = 0
480                break
481            else:
482                # this line is junk; advance to next line
483                i = j = str.find('\n', j) + 1
484        else:
485            # nothing interesting follows the bracket;
486            # reproduce the bracket line's indentation + a level
487            j = i = origi
488            while str[j] in " \t":
489                j = j+1
490            extra = self.indentwidth
491        return len(str[i:j].expandtabs(self.tabwidth)) + extra
492
493    # Return number of physical lines in last stmt (whether or not
494    # it's an interesting stmt!  this is intended to be called when
495    # continuation is C_BACKSLASH).
496
497    def get_num_lines_in_stmt(self):
498        self._study1()
499        goodlines = self.goodlines
500        return goodlines[-1] - goodlines[-2]
501
502    # Assuming continuation is C_BACKSLASH, return the number of spaces
503    # the next line should be indented.  Also assuming the new line is
504    # the first one following the initial line of the stmt.
505
506    def compute_backslash_indent(self):
507        self._study2()
508        assert self.continuation == C_BACKSLASH
509        str = self.str
510        i = self.stmt_start
511        while str[i] in " \t":
512            i = i+1
513        startpos = i
514
515        # See whether the initial line starts an assignment stmt; i.e.,
516        # look for an = operator
517        endpos = str.find('\n', startpos) + 1
518        found = level = 0
519        while i < endpos:
520            ch = str[i]
521            if ch in "([{":
522                level = level + 1
523                i = i+1
524            elif ch in ")]}":
525                if level:
526                    level = level - 1
527                i = i+1
528            elif ch == '"' or ch == "'":
529                i = _match_stringre(str, i, endpos).end()
530            elif ch == '#':
531                break
532            elif level == 0 and ch == '=' and \
533                   (i == 0 or str[i-1] not in "=<>!") and \
534                   str[i+1] != '=':
535                found = 1
536                break
537            else:
538                i = i+1
539
540        if found:
541            # found a legit =, but it may be the last interesting
542            # thing on the line
543            i = i+1     # move beyond the =
544            found = re.match(r"\s*\\", str[i:endpos]) is None
545
546        if not found:
547            # oh well ... settle for moving beyond the first chunk
548            # of non-whitespace chars
549            i = startpos
550            while str[i] not in " \t\n":
551                i = i+1
552
553        return len(str[self.stmt_start:i].expandtabs(\
554                                     self.tabwidth)) + 1
555
556    # Return the leading whitespace on the initial line of the last
557    # interesting stmt.
558
559    def get_base_indent_string(self):
560        self._study2()
561        i, n = self.stmt_start, self.stmt_end
562        j = i
563        str = self.str
564        while j < n and str[j] in " \t":
565            j = j + 1
566        return str[i:j]
567
568    # Did the last interesting stmt open a block?
569
570    def is_block_opener(self):
571        self._study2()
572        return self.lastch == ':'
573
574    # Did the last interesting stmt close a block?
575
576    def is_block_closer(self):
577        self._study2()
578        return _closere(self.str, self.stmt_start) is not None
579
580    # index of last open bracket ({[, or None if none
581    lastopenbracketpos = None
582
583    def get_last_open_bracket_pos(self):
584        self._study2()
585        return self.lastopenbracketpos
586
587    # the structure of the bracketing of the last interesting statement,
588    # in the format defined in _study2, or None if the text didn't contain
589    # anything
590    stmt_bracketing = None
591
592    def get_last_stmt_bracketing(self):
593        self._study2()
594        return self.stmt_bracketing
595