1"""
2Lexer interface and implementation.
3Used for syntax highlighting.
4"""
5from __future__ import unicode_literals
6from abc import ABCMeta, abstractmethod
7from six import with_metaclass
8from six.moves import range
9
10from prompt_toolkit.token import Token
11from prompt_toolkit.filters import to_cli_filter
12from .utils import split_lines
13
14import re
15import six
16
17__all__ = (
18    'Lexer',
19    'SimpleLexer',
20    'PygmentsLexer',
21    'SyntaxSync',
22    'SyncFromStart',
23    'RegexSync',
24)
25
26
class Lexer(with_metaclass(ABCMeta, object)):
    """
    Abstract base class for all lexers.

    A lexer maps a document to per-line lists of (Token, text) tuples,
    which are used for syntax highlighting.
    """
    @abstractmethod
    def lex_document(self, cli, document):
        """
        Return a callable that, given a line number, returns the tokens for
        that line of the given :class:`~prompt_toolkit.document.Document`.
        """
38
class SimpleLexer(Lexer):
    """
    Trivial lexer: no tokenizing at all; every line becomes one single token.

    :param token: The `Token` for this lexer.
    """
    def __init__(self, token=Token, default_token=None):
        # NOTE: the `default_token` parameter is deprecated, but when given,
        #       it still takes precedence over `token`.
        self.token = default_token if default_token is not None else token

    def lex_document(self, cli, document):
        document_lines = document.lines

        def get_line(lineno):
            " Return the tokens for the given line. "
            # EAFP: an out-of-range line number yields no tokens. (A negative
            # line number still resolves, like ordinary list indexing.)
            try:
                return [(self.token, document_lines[lineno])]
            except IndexError:
                return []

        return get_line
62
63
class SyntaxSync(with_metaclass(ABCMeta, object)):
    """
    Base class for syntax synchronisers.

    A synchroniser finds a safe position from which the lexer can start
    running. This matters for big documents: restarting the highlighter from
    the very beginning of the file on every edit would be far too slow.
    """
    @abstractmethod
    def get_sync_start_position(self, document, lineno):
        """
        Return a (row, column) tuple: the position from where lexing can
        safely start.

        :param document: `Document` instance that contains all the lines.
        :param lineno: The line that we want to highlight. (The returned
            position must be this line or an earlier one.)
        """
class SyncFromStart(SyntaxSync):
    """
    Simplest synchroniser: highlighting always restarts at the very top
    of the document.
    """
    def get_sync_start_position(self, document, lineno):
        # Row 0, column 0: the beginning of the document.
        return (0, 0)
89
class RegexSync(SyntaxSync):
    """
    Synchronise by scanning backwards for a line that matches the given
    regex pattern, and starting the lexer there.
    """
    # Never go more than this amount of lines backwards for synchronisation.
    # That would be too CPU intensive.
    MAX_BACKWARDS = 500

    # Start lexing at the start, if we are in the first 'n' lines and no
    # synchronisation position was found.
    FROM_START_IF_NO_SYNC_POS_FOUND = 100

    def __init__(self, pattern):
        assert isinstance(pattern, six.text_type)
        self._compiled_pattern = re.compile(pattern)

    def get_sync_start_position(self, document, lineno):
        " Scan backwards, and find a possible position to start. "
        match_line = self._compiled_pattern.match
        all_lines = document.lines

        # Walk upwards from `lineno`, going back at most MAX_BACKWARDS lines,
        # until a line matches the synchronisation pattern.
        lower_bound = max(-1, lineno - self.MAX_BACKWARDS)
        for row in range(lineno, lower_bound, -1):
            found = match_line(all_lines[row])
            if found:
                return row, found.start()

        # Nothing matched. Close enough to the top of the file, we simply
        # restart at the beginning; otherwise, give up and start at the
        # requested line itself.
        if lineno < self.FROM_START_IF_NO_SYNC_POS_FOUND:
            return 0, 0
        return lineno, 0

    @classmethod
    def from_pygments_lexer_cls(cls, lexer_cls):
        """
        Create a :class:`.RegexSync` instance for this Pygments lexer class.
        """
        patterns = {
            # For Python, start highlighting at any class/def block.
            'Python':   r'^\s*(class|def)\s+',
            'Python 3': r'^\s*(class|def)\s+',

            # For HTML, start at any open/close tag definition.
            'HTML': r'<[/a-zA-Z]',

            # For javascript, start at a function.
            'JavaScript': r'\bfunction\b'

            # TODO: Add definitions for other languages.
            #       By default, we start at every possible line.
        }
        return cls(patterns.get(lexer_cls.name, '^'))
147
148
class PygmentsLexer(Lexer):
    """
    Lexer that calls a pygments lexer.

    Example::

        from pygments.lexers import HtmlLexer
        lexer = PygmentsLexer(HtmlLexer)

    Note: Don't forget to also load a Pygments compatible style. E.g.::

        from prompt_toolkit.styles.from_pygments import style_from_pygments
        from pygments.styles import get_style_by_name
        style = style_from_pygments(get_style_by_name('monokai'))

    :param pygments_lexer_cls: A `Lexer` from Pygments.
    :param sync_from_start: Start lexing at the start of the document. This
        will always give the best results, but it will be slow for bigger
        documents. (When the last part of the document is displayed, then the
        whole document will be lexed by Pygments on every key stroke.) It is
        recommended to disable this for inputs that are expected to be more
        than 1,000 lines.
    :param syntax_sync: `SyntaxSync` object.
    """
    # Minimum amount of lines to go backwards when starting the parser.
    # This is important when the lines are retrieved in reverse order, or when
    # scrolling upwards. (Due to the complexity of calculating the vertical
    # scroll offset in the `Window` class, lines are not always retrieved in
    # order.)
    MIN_LINES_BACKWARDS = 50

    # When a parser was started this amount of lines back, read the parser
    # until we get the current line. Otherwise, start a new parser.
    # (This should probably be bigger than MIN_LINES_BACKWARDS.)
    REUSE_GENERATOR_MAX_DISTANCE = 100

    def __init__(self, pygments_lexer_cls, sync_from_start=True, syntax_sync=None):
        assert syntax_sync is None or isinstance(syntax_sync, SyntaxSync)

        self.pygments_lexer_cls = pygments_lexer_cls
        # `sync_from_start` may be a bool or a filter; normalise it to a
        # filter that gets evaluated per CommandLineInterface.
        self.sync_from_start = to_cli_filter(sync_from_start)

        # Instantiate the Pygments lexer. All strip/ensure options are
        # disabled, so that Pygments returns exactly the text it was given.
        self.pygments_lexer = pygments_lexer_cls(
            stripnl=False,
            stripall=False,
            ensurenl=False)

        # Create syntax sync instance.
        self.syntax_sync = syntax_sync or RegexSync.from_pygments_lexer_cls(pygments_lexer_cls)

    @classmethod
    def from_filename(cls, filename, sync_from_start=True):
        """
        Create a `Lexer` from a filename.

        Falls back to a plain :class:`.SimpleLexer` when Pygments has no
        lexer registered for this filename.
        """
        # Inline imports: the Pygments dependency is optional!
        from pygments.util import ClassNotFound
        from pygments.lexers import get_lexer_for_filename

        try:
            pygments_lexer = get_lexer_for_filename(filename)
        except ClassNotFound:
            return SimpleLexer()
        else:
            return cls(pygments_lexer.__class__, sync_from_start=sync_from_start)

    def lex_document(self, cli, document):
        """
        Create a lexer function that takes a line number and returns the list
        of (Token, text) tuples as the Pygments lexer returns for that line.
        """
        # Cache of already lexed lines. (Maps line number -> token list.)
        cache = {}

        # Pygments generators that are currently lexing.
        line_generators = {}  # Map lexer generator to the line number.

        def get_syntax_sync():
            " The Syntax synchronisation object that we currently use. "
            # When the `sync_from_start` filter is active, always restart
            # lexing from the very beginning of the document.
            if self.sync_from_start(cli):
                return SyncFromStart()
            else:
                return self.syntax_sync

        def find_closest_generator(i):
            " Return a generator close to line 'i', or None if none was found. "
            # Only a generator that is strictly before line 'i' and within
            # REUSE_GENERATOR_MAX_DISTANCE lines of it can be reused.
            for generator, lineno in line_generators.items():
                if lineno < i and i - lineno < self.REUSE_GENERATOR_MAX_DISTANCE:
                    return generator

        def create_line_generator(start_lineno, column=0):
            """
            Create a generator that yields the lexed lines.
            Each iteration it yields a (line_number, [(token, text), ...]) tuple.
            """
            def get_tokens():
                # Lex everything from (start_lineno, column) until the end of
                # the document as one chunk of text.
                text = '\n'.join(document.lines[start_lineno:])[column:]

                # We call `get_tokens_unprocessed`, because `get_tokens` will
                # still replace \r\n and \r by \n.  (We don't want that,
                # Pygments should return exactly the same amount of text, as we
                # have given as input.)
                for _, t, v in self.pygments_lexer.get_tokens_unprocessed(text):
                    yield t, v

            # `split_lines` regroups the flat token stream per line again.
            return enumerate(split_lines(get_tokens()), start_lineno)

        def get_generator(i):
            """
            Find an already started generator that is close, or create a new one.
            """
            # Find closest line generator.
            generator = find_closest_generator(i)
            if generator:
                return generator

            # No generator found. Determine starting point for the syntax
            # synchronisation first.

            # Go at least x lines back. (Make scrolling upwards more
            # efficient.)
            i = max(0, i - self.MIN_LINES_BACKWARDS)

            if i == 0:
                row = 0
                column = 0
            else:
                row, column = get_syntax_sync().get_sync_start_position(document, i)

            # Find generator close to this point, or otherwise create a new one.
            generator = find_closest_generator(i)
            if generator:
                return generator
            else:
                generator = create_line_generator(row, column)

            # If the column is not 0, ignore the first line. (Which is
            # incomplete. This happens when the synchronisation algorithm tells
            # us to start parsing in the middle of a line.)
            if column:
                next(generator)
                row += 1

            line_generators[generator] = row
            return generator

        def get_line(i):
            " Return the tokens for a given line number. "
            try:
                return cache[i]
            except KeyError:
                generator = get_generator(i)

                # Exhaust the generator, until we find the requested line.
                for num, line in generator:
                    cache[num] = line
                    if num == i:
                        line_generators[generator] = i

                        # Remove the next item from the cache.
                        # (It could happen that it's already there, because of
                        # another generator that started filling these lines,
                        # but we want to synchronise these lines with the
                        # current lexer's state.)
                        if num + 1 in cache:
                            del cache[num + 1]

                        return cache[num]
            # The generator got exhausted before reaching line 'i': the line
            # number lies past the end of the document.
            return []

        return get_line
321