1""" 2Lexer interface and implementation. 3Used for syntax highlighting. 4""" 5from __future__ import unicode_literals 6from abc import ABCMeta, abstractmethod 7from six import with_metaclass 8from six.moves import range 9 10from prompt_toolkit.token import Token 11from prompt_toolkit.filters import to_cli_filter 12from .utils import split_lines 13 14import re 15import six 16 17__all__ = ( 18 'Lexer', 19 'SimpleLexer', 20 'PygmentsLexer', 21 'SyntaxSync', 22 'SyncFromStart', 23 'RegexSync', 24) 25 26 27class Lexer(with_metaclass(ABCMeta, object)): 28 """ 29 Base class for all lexers. 30 """ 31 @abstractmethod 32 def lex_document(self, cli, document): 33 """ 34 Takes a :class:`~prompt_toolkit.document.Document` and returns a 35 callable that takes a line number and returns the tokens for that line. 36 """ 37 38 39class SimpleLexer(Lexer): 40 """ 41 Lexer that doesn't do any tokenizing and returns the whole input as one token. 42 43 :param token: The `Token` for this lexer. 44 """ 45 # `default_token` parameter is deprecated! 46 def __init__(self, token=Token, default_token=None): 47 self.token = token 48 49 if default_token is not None: 50 self.token = default_token 51 52 def lex_document(self, cli, document): 53 lines = document.lines 54 55 def get_line(lineno): 56 " Return the tokens for the given line. " 57 try: 58 return [(self.token, lines[lineno])] 59 except IndexError: 60 return [] 61 return get_line 62 63 64class SyntaxSync(with_metaclass(ABCMeta, object)): 65 """ 66 Syntax synchroniser. This is a tool that finds a start position for the 67 lexer. This is especially important when editing big documents; we don't 68 want to start the highlighting by running the lexer from the beginning of 69 the file. That is very slow when editing. 70 """ 71 @abstractmethod 72 def get_sync_start_position(self, document, lineno): 73 """ 74 Return the position from where we can start lexing as a (row, column) 75 tuple. 76 77 :param document: `Document` instance that contains all the lines. 78 :param lineno: The line that we want to highlight. (We need to return 79 this line, or an earlier position.) 80 """ 81 82class SyncFromStart(SyntaxSync): 83 """ 84 Always start the syntax highlighting from the beginning. 85 """ 86 def get_sync_start_position(self, document, lineno): 87 return 0, 0 88 89 90class RegexSync(SyntaxSync): 91 """ 92 Synchronize by starting at a line that matches the given regex pattern. 93 """ 94 # Never go more than this amount of lines backwards for synchronisation. 95 # That would be too CPU intensive. 96 MAX_BACKWARDS = 500 97 98 # Start lexing at the start, if we are in the first 'n' lines and no 99 # synchronisation position was found. 100 FROM_START_IF_NO_SYNC_POS_FOUND = 100 101 102 def __init__(self, pattern): 103 assert isinstance(pattern, six.text_type) 104 self._compiled_pattern = re.compile(pattern) 105 106 def get_sync_start_position(self, document, lineno): 107 " Scan backwards, and find a possible position to start. " 108 pattern = self._compiled_pattern 109 lines = document.lines 110 111 # Scan upwards, until we find a point where we can start the syntax 112 # synchronisation. 113 for i in range(lineno, max(-1, lineno - self.MAX_BACKWARDS), -1): 114 match = pattern.match(lines[i]) 115 if match: 116 return i, match.start() 117 118 # No synchronisation point found. If we aren't that far from the 119 # beginning, start at the very beginning, otherwise, just try to start 120 # at the current line. 121 if lineno < self.FROM_START_IF_NO_SYNC_POS_FOUND: 122 return 0, 0 123 else: 124 return lineno, 0 125 126 @classmethod 127 def from_pygments_lexer_cls(cls, lexer_cls): 128 """ 129 Create a :class:`.RegexSync` instance for this Pygments lexer class. 130 """ 131 patterns = { 132 # For Python, start highlighting at any class/def block. 133 'Python': r'^\s*(class|def)\s+', 134 'Python 3': r'^\s*(class|def)\s+', 135 136 # For HTML, start at any open/close tag definition. 137 'HTML': r'<[/a-zA-Z]', 138 139 # For javascript, start at a function. 140 'JavaScript': r'\bfunction\b' 141 142 # TODO: Add definitions for other languages. 143 # By default, we start at every possible line. 144 } 145 p = patterns.get(lexer_cls.name, '^') 146 return cls(p) 147 148 149class PygmentsLexer(Lexer): 150 """ 151 Lexer that calls a pygments lexer. 152 153 Example:: 154 155 from pygments.lexers import HtmlLexer 156 lexer = PygmentsLexer(HtmlLexer) 157 158 Note: Don't forget to also load a Pygments compatible style. E.g.:: 159 160 from prompt_toolkit.styles.from_pygments import style_from_pygments 161 from pygments.styles import get_style_by_name 162 style = style_from_pygments(get_style_by_name('monokai')) 163 164 :param pygments_lexer_cls: A `Lexer` from Pygments. 165 :param sync_from_start: Start lexing at the start of the document. This 166 will always give the best results, but it will be slow for bigger 167 documents. (When the last part of the document is display, then the 168 whole document will be lexed by Pygments on every key stroke.) It is 169 recommended to disable this for inputs that are expected to be more 170 than 1,000 lines. 171 :param syntax_sync: `SyntaxSync` object. 172 """ 173 # Minimum amount of lines to go backwards when starting the parser. 174 # This is important when the lines are retrieved in reverse order, or when 175 # scrolling upwards. (Due to the complexity of calculating the vertical 176 # scroll offset in the `Window` class, lines are not always retrieved in 177 # order.) 178 MIN_LINES_BACKWARDS = 50 179 180 # When a parser was started this amount of lines back, read the parser 181 # until we get the current line. Otherwise, start a new parser. 182 # (This should probably be bigger than MIN_LINES_BACKWARDS.) 183 REUSE_GENERATOR_MAX_DISTANCE = 100 184 185 def __init__(self, pygments_lexer_cls, sync_from_start=True, syntax_sync=None): 186 assert syntax_sync is None or isinstance(syntax_sync, SyntaxSync) 187 188 self.pygments_lexer_cls = pygments_lexer_cls 189 self.sync_from_start = to_cli_filter(sync_from_start) 190 191 # Instantiate the Pygments lexer. 192 self.pygments_lexer = pygments_lexer_cls( 193 stripnl=False, 194 stripall=False, 195 ensurenl=False) 196 197 # Create syntax sync instance. 198 self.syntax_sync = syntax_sync or RegexSync.from_pygments_lexer_cls(pygments_lexer_cls) 199 200 @classmethod 201 def from_filename(cls, filename, sync_from_start=True): 202 """ 203 Create a `Lexer` from a filename. 204 """ 205 # Inline imports: the Pygments dependency is optional! 206 from pygments.util import ClassNotFound 207 from pygments.lexers import get_lexer_for_filename 208 209 try: 210 pygments_lexer = get_lexer_for_filename(filename) 211 except ClassNotFound: 212 return SimpleLexer() 213 else: 214 return cls(pygments_lexer.__class__, sync_from_start=sync_from_start) 215 216 def lex_document(self, cli, document): 217 """ 218 Create a lexer function that takes a line number and returns the list 219 of (Token, text) tuples as the Pygments lexer returns for that line. 220 """ 221 # Cache of already lexed lines. 222 cache = {} 223 224 # Pygments generators that are currently lexing. 225 line_generators = {} # Map lexer generator to the line number. 226 227 def get_syntax_sync(): 228 " The Syntax synchronisation objcet that we currently use. " 229 if self.sync_from_start(cli): 230 return SyncFromStart() 231 else: 232 return self.syntax_sync 233 234 def find_closest_generator(i): 235 " Return a generator close to line 'i', or None if none was fonud. " 236 for generator, lineno in line_generators.items(): 237 if lineno < i and i - lineno < self.REUSE_GENERATOR_MAX_DISTANCE: 238 return generator 239 240 def create_line_generator(start_lineno, column=0): 241 """ 242 Create a generator that yields the lexed lines. 243 Each iteration it yields a (line_number, [(token, text), ...]) tuple. 244 """ 245 def get_tokens(): 246 text = '\n'.join(document.lines[start_lineno:])[column:] 247 248 # We call `get_tokens_unprocessed`, because `get_tokens` will 249 # still replace \r\n and \r by \n. (We don't want that, 250 # Pygments should return exactly the same amount of text, as we 251 # have given as input.) 252 for _, t, v in self.pygments_lexer.get_tokens_unprocessed(text): 253 yield t, v 254 255 return enumerate(split_lines(get_tokens()), start_lineno) 256 257 def get_generator(i): 258 """ 259 Find an already started generator that is close, or create a new one. 260 """ 261 # Find closest line generator. 262 generator = find_closest_generator(i) 263 if generator: 264 return generator 265 266 # No generator found. Determine starting point for the syntax 267 # synchronisation first. 268 269 # Go at least x lines back. (Make scrolling upwards more 270 # efficient.) 271 i = max(0, i - self.MIN_LINES_BACKWARDS) 272 273 if i == 0: 274 row = 0 275 column = 0 276 else: 277 row, column = get_syntax_sync().get_sync_start_position(document, i) 278 279 # Find generator close to this point, or otherwise create a new one. 280 generator = find_closest_generator(i) 281 if generator: 282 return generator 283 else: 284 generator = create_line_generator(row, column) 285 286 # If the column is not 0, ignore the first line. (Which is 287 # incomplete. This happens when the synchronisation algorithm tells 288 # us to start parsing in the middle of a line.) 289 if column: 290 next(generator) 291 row += 1 292 293 line_generators[generator] = row 294 return generator 295 296 def get_line(i): 297 " Return the tokens for a given line number. " 298 try: 299 return cache[i] 300 except KeyError: 301 generator = get_generator(i) 302 303 # Exhaust the generator, until we find the requested line. 304 for num, line in generator: 305 cache[num] = line 306 if num == i: 307 line_generators[generator] = i 308 309 # Remove the next item from the cache. 310 # (It could happen that it's already there, because of 311 # another generator that started filling these lines, 312 # but we want to synchronise these lines with the 313 # current lexer's state.) 314 if num + 1 in cache: 315 del cache[num + 1] 316 317 return cache[num] 318 return [] 319 320 return get_line 321