1"""Provide advanced parsing abilities for ParenMatch and other extensions.
2
3HyperParser uses PyParser.  PyParser mostly gives information on the
4proper indentation of code.  HyperParser gives additional information on
5the structure of code.
6"""
7from keyword import iskeyword
8import string
9
10from idlelib import pyparse
11
# Every ASCII character that may appear anywhere in an identifier.
_ASCII_ID_CHARS = frozenset("_" + string.digits + string.ascii_letters)
# Every ASCII character that may begin an identifier (digits excluded).
_ASCII_ID_FIRST_CHARS = frozenset("_" + string.ascii_letters)

# Table indexed by code point (0..127): True when that 7-bit ASCII
# character is valid inside a Python identifier.
_IS_ASCII_ID_CHAR = [
    char in _ASCII_ID_CHARS for char in map(chr, range(128))
]
# Table indexed by code point (0..127): True when that 7-bit ASCII
# character is valid as the first character of a Python identifier.
_IS_ASCII_ID_FIRST_CHAR = [
    char in _ASCII_ID_FIRST_CHARS for char in map(chr, range(128))
]
23
24
25class HyperParser:
26    def __init__(self, editwin, index):
27        "To initialize, analyze the surroundings of the given index."
28
29        self.editwin = editwin
30        self.text = text = editwin.text
31
32        parser = pyparse.Parser(editwin.indentwidth, editwin.tabwidth)
33
34        def index2line(index):
35            return int(float(index))
36        lno = index2line(text.index(index))
37
38        if not editwin.prompt_last_line:
39            for context in editwin.num_context_lines:
40                startat = max(lno - context, 1)
41                startatindex = repr(startat) + ".0"
42                stopatindex = "%d.end" % lno
43                # We add the newline because PyParse requires a newline
44                # at end. We add a space so that index won't be at end
45                # of line, so that its status will be the same as the
46                # char before it, if should.
47                parser.set_code(text.get(startatindex, stopatindex)+' \n')
48                bod = parser.find_good_parse_start(
49                          editwin._build_char_in_string_func(startatindex))
50                if bod is not None or startat == 1:
51                    break
52            parser.set_lo(bod or 0)
53        else:
54            r = text.tag_prevrange("console", index)
55            if r:
56                startatindex = r[1]
57            else:
58                startatindex = "1.0"
59            stopatindex = "%d.end" % lno
60            # We add the newline because PyParse requires it. We add a
61            # space so that index won't be at end of line, so that its
62            # status will be the same as the char before it, if should.
63            parser.set_code(text.get(startatindex, stopatindex)+' \n')
64            parser.set_lo(0)
65
66        # We want what the parser has, minus the last newline and space.
67        self.rawtext = parser.code[:-2]
68        # Parser.code apparently preserves the statement we are in, so
69        # that stopatindex can be used to synchronize the string with
70        # the text box indices.
71        self.stopatindex = stopatindex
72        self.bracketing = parser.get_last_stmt_bracketing()
73        # find which pairs of bracketing are openers. These always
74        # correspond to a character of rawtext.
75        self.isopener = [i>0 and self.bracketing[i][1] >
76                         self.bracketing[i-1][1]
77                         for i in range(len(self.bracketing))]
78
79        self.set_index(index)
80
81    def set_index(self, index):
82        """Set the index to which the functions relate.
83
84        The index must be in the same statement.
85        """
86        indexinrawtext = (len(self.rawtext) -
87                          len(self.text.get(index, self.stopatindex)))
88        if indexinrawtext < 0:
89            raise ValueError("Index %s precedes the analyzed statement"
90                             % index)
91        self.indexinrawtext = indexinrawtext
92        # find the rightmost bracket to which index belongs
93        self.indexbracket = 0
94        while (self.indexbracket < len(self.bracketing)-1 and
95               self.bracketing[self.indexbracket+1][0] < self.indexinrawtext):
96            self.indexbracket += 1
97        if (self.indexbracket < len(self.bracketing)-1 and
98            self.bracketing[self.indexbracket+1][0] == self.indexinrawtext and
99           not self.isopener[self.indexbracket+1]):
100            self.indexbracket += 1
101
102    def is_in_string(self):
103        """Is the index given to the HyperParser in a string?"""
104        # The bracket to which we belong should be an opener.
105        # If it's an opener, it has to have a character.
106        return (self.isopener[self.indexbracket] and
107                self.rawtext[self.bracketing[self.indexbracket][0]]
108                in ('"', "'"))
109
110    def is_in_code(self):
111        """Is the index given to the HyperParser in normal code?"""
112        return (not self.isopener[self.indexbracket] or
113                self.rawtext[self.bracketing[self.indexbracket][0]]
114                not in ('#', '"', "'"))
115
116    def get_surrounding_brackets(self, openers='([{', mustclose=False):
117        """Return bracket indexes or None.
118
119        If the index given to the HyperParser is surrounded by a
120        bracket defined in openers (or at least has one before it),
121        return the indices of the opening bracket and the closing
122        bracket (or the end of line, whichever comes first).
123
124        If it is not surrounded by brackets, or the end of line comes
125        before the closing bracket and mustclose is True, returns None.
126        """
127
128        bracketinglevel = self.bracketing[self.indexbracket][1]
129        before = self.indexbracket
130        while (not self.isopener[before] or
131              self.rawtext[self.bracketing[before][0]] not in openers or
132              self.bracketing[before][1] > bracketinglevel):
133            before -= 1
134            if before < 0:
135                return None
136            bracketinglevel = min(bracketinglevel, self.bracketing[before][1])
137        after = self.indexbracket + 1
138        while (after < len(self.bracketing) and
139              self.bracketing[after][1] >= bracketinglevel):
140            after += 1
141
142        beforeindex = self.text.index("%s-%dc" %
143            (self.stopatindex, len(self.rawtext)-self.bracketing[before][0]))
144        if (after >= len(self.bracketing) or
145           self.bracketing[after][0] > len(self.rawtext)):
146            if mustclose:
147                return None
148            afterindex = self.stopatindex
149        else:
150            # We are after a real char, so it is a ')' and we give the
151            # index before it.
152            afterindex = self.text.index(
153                "%s-%dc" % (self.stopatindex,
154                 len(self.rawtext)-(self.bracketing[after][0]-1)))
155
156        return beforeindex, afterindex
157
158    # the set of built-in identifiers which are also keywords,
159    # i.e. keyword.iskeyword() returns True for them
160    _ID_KEYWORDS = frozenset({"True", "False", "None"})
161
162    @classmethod
163    def _eat_identifier(cls, str, limit, pos):
164        """Given a string and pos, return the number of chars in the
165        identifier which ends at pos, or 0 if there is no such one.
166
167        This ignores non-identifier eywords are not identifiers.
168        """
169        is_ascii_id_char = _IS_ASCII_ID_CHAR
170
171        # Start at the end (pos) and work backwards.
172        i = pos
173
174        # Go backwards as long as the characters are valid ASCII
175        # identifier characters. This is an optimization, since it
176        # is faster in the common case where most of the characters
177        # are ASCII.
178        while i > limit and (
179                ord(str[i - 1]) < 128 and
180                is_ascii_id_char[ord(str[i - 1])]
181        ):
182            i -= 1
183
184        # If the above loop ended due to reaching a non-ASCII
185        # character, continue going backwards using the most generic
186        # test for whether a string contains only valid identifier
187        # characters.
188        if i > limit and ord(str[i - 1]) >= 128:
189            while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier():
190                i -= 4
191            if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier():
192                i -= 2
193            if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier():
194                i -= 1
195
196            # The identifier candidate starts here. If it isn't a valid
197            # identifier, don't eat anything. At this point that is only
198            # possible if the first character isn't a valid first
199            # character for an identifier.
200            if not str[i:pos].isidentifier():
201                return 0
202        elif i < pos:
203            # All characters in str[i:pos] are valid ASCII identifier
204            # characters, so it is enough to check that the first is
205            # valid as the first character of an identifier.
206            if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]:
207                return 0
208
209        # All keywords are valid identifiers, but should not be
210        # considered identifiers here, except for True, False and None.
211        if i < pos and (
212                iskeyword(str[i:pos]) and
213                str[i:pos] not in cls._ID_KEYWORDS
214        ):
215            return 0
216
217        return pos - i
218
    # Characters treated as whitespace when scanning backwards; besides
    # space, tab and newline this includes the backslash.
220    _whitespace_chars = " \t\n\\"
221
    def get_expression(self):
        """Return a string with the Python expression which ends at the
        given index, which is empty if there is no real one.

        The expression is found by scanning rawtext backwards from the
        index: identifiers joined by dots are collected, a closed
        bracketing group directly before the current position is
        included, and the scan continues past it only for "(" and "["
        groups.  For a string literal, preceding prefix letters are
        included too.

        Raises ValueError if the index is not in code (see is_in_code()).
        """
        if not self.is_in_code():
            raise ValueError("get_expression should only be called "
                             "if index is inside a code.")

        rawtext = self.rawtext
        bracketing = self.bracketing

        # brck_limit is the rawtext offset of the current bracketing
        # entry; the scan position may not go below it without first
        # moving to an earlier entry.
        brck_index = self.indexbracket
        brck_limit = bracketing[brck_index][0]
        pos = self.indexinrawtext

        # Start of the expression found so far.
        last_identifier_pos = pos
        # True while an identifier (or bracket) is expected next; False
        # right after an identifier, when a dot is required to continue.
        postdot_phase = True

        while 1:
            # Eat whitespaces, comments, and if postdot_phase is False - a dot
            while 1:
                if pos>brck_limit and rawtext[pos-1] in self._whitespace_chars:
                    # Eat a whitespace
                    pos -= 1
                elif (not postdot_phase and
                      pos > brck_limit and rawtext[pos-1] == '.'):
                    # Eat a dot
                    pos -= 1
                    postdot_phase = True
                # The next line will fail if we are *inside* a comment,
                # but we shouldn't be.
                elif (pos == brck_limit and brck_index > 0 and
                      rawtext[bracketing[brck_index-1][0]] == '#'):
                    # Eat a comment: step back over its two bracketing
                    # entries and resume at the position where it starts.
                    brck_index -= 2
                    brck_limit = bracketing[brck_index][0]
                    pos = bracketing[brck_index+1][0]
                else:
                    # If we didn't eat anything, quit.
                    break

            if not postdot_phase:
                # We didn't find a dot, so the expression ends at the
                # last identifier pos.
                break

            ret = self._eat_identifier(rawtext, brck_limit, pos)
            if ret:
                # There is an identifier to eat
                pos = pos - ret
                last_identifier_pos = pos
                # Now, to continue the search, we must find a dot.
                postdot_phase = False
                # (the loop continues now)

            elif pos == brck_limit:
                # We are at a bracketing limit. If it is a closing
                # bracket, eat the bracket, otherwise, stop the search.
                level = bracketing[brck_index][1]
                # Skip back over every entry nested deeper than the
                # current level.
                while brck_index > 0 and bracketing[brck_index-1][1] > level:
                    brck_index -= 1
                if bracketing[brck_index][0] == brck_limit:
                    # We were not at the end of a closing bracket
                    break
                # Jump to the start of the bracketed group and make the
                # entry before it the new limit.
                pos = bracketing[brck_index][0]
                brck_index -= 1
                brck_limit = bracketing[brck_index][0]
                last_identifier_pos = pos
                if rawtext[pos] in "([":
                    # [] and () may be used after an identifier, so we
                    # continue. postdot_phase is True, so we don't allow a dot.
                    pass
                else:
                    # We can't continue after other types of brackets
                    if rawtext[pos] in "'\"":
                        # Scan a string prefix
                        while pos > 0 and rawtext[pos - 1] in "rRbBuU":
                            pos -= 1
                        last_identifier_pos = pos
                    break

            else:
                # We've found an operator or something.
                break

        return rawtext[last_identifier_pos:self.indexinrawtext]
308
309
if __name__ == '__main__':
    # Run this module's unit tests when it is executed as a script.
    from unittest import main as run_tests
    run_tests(module='idlelib.idle_test.test_hyperparser', verbosity=2)
313