# -*- coding: utf-8 -*-
"""
    pygments.scanner
    ~~~~~~~~~~~~~~~~

    This library implements a regex based scanner. Some languages
    like Pascal are easy to parse but have some keywords that
    depend on the context. Because of this it's impossible to lex
    that just by using a regular expression lexer like the
    `RegexLexer`.

    Have a look at the `DelphiLexer` to get an idea of how to use
    this scanner.

    :copyright: Copyright 2006-2020 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""
import re


class EndOfText(RuntimeError):
    """
    Raised if the end of the text is reached and the user
    tried to call a match function.
    """


class Scanner:
    """
    Simple scanner

    All method patterns are regular expression strings (not
    compiled expressions!). Each distinct pattern string is compiled
    once with the scanner's default flags and cached on the instance.
    """

    def __init__(self, text, flags=0):
        """
        :param text:    The text which should be scanned
        :param flags:   default regular expression flags
        """
        self.data = text
        self.data_length = len(text)
        # Position of the pointer before the most recent successful scan.
        self.start_pos = 0
        # Current position of the pointer in ``data``.
        self.pos = 0
        self.flags = flags
        # Value ``match`` held before the most recent ``scan`` call.
        self.last = None
        # Text matched by the most recent successful ``scan`` call.
        self.match = None
        # Maps pattern string -> compiled pattern.
        self._re_cache = {}

    @property
    def eos(self):
        """`True` if the scanner reached the end of text."""
        return self.pos >= self.data_length

    def _compile(self, pattern):
        """Return the compiled form of `pattern`, compiling it on first use."""
        try:
            return self._re_cache[pattern]
        except KeyError:
            compiled = re.compile(pattern, self.flags)
            self._re_cache[pattern] = compiled
            return compiled

    def check(self, pattern):
        """
        Apply `pattern` on the current position and return
        the match object, or `None` if it does not match.
        (Doesn't touch pos). Use this for lookahead.

        :raises EndOfText: if the scanner is already at the end of text.
        """
        if self.eos:
            raise EndOfText()
        return self._compile(pattern).match(self.data, self.pos)

    def test(self, pattern):
        """Apply a pattern on the current position and check
        if it matches. Doesn't touch pos.

        :raises EndOfText: if the scanner is already at the end of text.
        """
        return self.check(pattern) is not None

    def scan(self, pattern):
        """
        Scan the text for the given pattern and update pos/match
        and related fields. The return value is a boolean that
        indicates if the pattern matched. The matched value is
        stored on the instance as ``match``, the last value is
        stored as ``last``. ``start_pos`` is the position of the
        pointer before the pattern was matched, ``pos`` is the
        end position.

        Note that ``last`` is overwritten with the current ``match``
        on every call, even when the pattern does not match; in that
        case ``match``, ``start_pos`` and ``pos`` are left untouched.

        :raises EndOfText: if the scanner is already at the end of text.
        """
        if self.eos:
            raise EndOfText()
        # Compile first so an invalid pattern leaves the state untouched.
        regex = self._compile(pattern)
        self.last = self.match
        m = regex.match(self.data, self.pos)
        if m is None:
            return False
        self.start_pos = m.start()
        self.pos = m.end()
        self.match = m.group()
        return True

    def get_char(self):
        """Scan exactly one char.

        The character is consumed through `scan`, so ``match``,
        ``last``, ``start_pos`` and ``pos`` are updated accordingly.

        :raises EndOfText: if the scanner is already at the end of text.
        """
        # A bare '.' does not match a newline unless the scanner was created
        # with re.DOTALL; the inline (?s) flag makes any single char match.
        self.scan(r'(?s).')

    def __repr__(self):
        return '<%s %d/%d>' % (
            self.__class__.__name__,
            self.pos,
            self.data_length
        )