1# -*- coding: utf-8 -*- 2# Copyright (C) 2017 by Juancarlo Añez 3# Copyright (C) 2012-2016 by Juancarlo Añez and Thomas Bragg 4""" 5The Buffer class provides the functionality required by a parser-driven lexer. 6 7Line analysis and caching are done so the parser can freely move with goto(p) 8to any position in the parsed text, and still recover accurate information 9about source lines and content. 10""" 11from __future__ import (absolute_import, division, print_function, 12 unicode_literals) 13 14import os 15from itertools import takewhile, repeat 16 17from grako.util import identity, imap, ustr, strtype 18from grako.util import extend_list, contains_sublist 19from grako.util import re as regexp 20from grako.util import WHITESPACE_RE, RE_FLAGS 21from grako.exceptions import ParseError 22from grako.infos import PosLine, LineIndexInfo, LineInfo, CommentInfo 23 24RETYPE = type(regexp.compile('.')) 25 26# for backwards compatibility with existing parsers 27LineIndexEntry = LineIndexInfo 28 29 30class Buffer(object): 31 def __init__(self, 32 text, 33 filename=None, 34 whitespace=None, 35 comments_re=None, 36 eol_comments_re=None, 37 ignorecase=False, 38 nameguard=None, 39 comment_recovery=False, 40 namechars='', 41 **kwargs): 42 text = ustr(text) 43 self.text = self.original_text = text 44 self.filename = filename or '' 45 46 self.whitespace = whitespace 47 48 self.comments_re = comments_re 49 self.eol_comments_re = eol_comments_re 50 self.ignorecase = ignorecase 51 self.nameguard = (nameguard 52 if nameguard is not None 53 else bool(self.whitespace_re)) 54 self.comment_recovery = comment_recovery 55 self.namechars = namechars 56 self._namechar_set = set(namechars) 57 if namechars: 58 self.nameguard = True 59 60 self._pos = 0 61 self._len = 0 62 self._linecount = 0 63 self._lines = [] 64 self._line_index = [] 65 self._line_cache = [] 66 self._comment_index = [] 67 self._re_cache = {} 68 69 self._preprocess() 70 self._postprocess() 71 72 @property 73 def whitespace(self): 74 return self._whitespace 75 76 @whitespace.setter 77 def whitespace(self, value): 78 self._whitespace = value 79 self.whitespace_re = self.build_whitespace_re(value) 80 81 @staticmethod 82 def build_whitespace_re(whitespace): 83 if whitespace is None: 84 return WHITESPACE_RE 85 elif isinstance(whitespace, RETYPE): 86 return whitespace 87 elif whitespace: 88 if not isinstance(whitespace, strtype): 89 # a list or a set? 90 whitespace = ''.join(c for c in whitespace) 91 return regexp.compile( 92 '[%s]+' % regexp.escape(whitespace), RE_FLAGS 93 ) 94 else: 95 return None 96 97 def _preprocess(self, *args, **kwargs): 98 lines, index = self._preprocess_block(self.filename, self.text) 99 self._lines = lines 100 self._line_index = index 101 self.text = self.join_block_lines(lines) 102 103 def _postprocess(self): 104 cache, count = PosLine.build_line_cache(self._lines) 105 self._line_cache = cache 106 self._linecount = count 107 self._len = len(self.text) 108 109 def _preprocess_block(self, name, block, **kwargs): 110 lines = self.split_block_lines(block) 111 index = LineIndexInfo.block_index(name, len(lines)) 112 return self.process_block(name, lines, index, **kwargs) 113 114 def split_block_lines(self, block): 115 return block.splitlines(True) 116 117 def join_block_lines(self, lines): 118 return ''.join(lines) 119 120 def process_block(self, name, lines, index, **kwargs): 121 return lines, index 122 123 def include(self, lines, index, i, j, name, block, **kwargs): 124 blines, bindex = self._preprocess_block(name, block, **kwargs) 125 assert len(blines) == len(bindex) 126 lines[i:j] = blines 127 index[i:j] = bindex 128 assert len(lines) == len(index) 129 return j + len(blines) - 1 130 131 def include_file(self, source, name, lines, index, i, j): 132 text, filename = self.get_include(source, name) 133 return self.include(lines, index, i, j, filename, text) 134 135 def get_include(self, source, filename): 136 source = os.path.abspath(source) 137 base = os.path.dirname(source) 138 include = os.path.join(base, filename) 139 try: 140 with open(include) as f: 141 return f.read(), include 142 except IOError: 143 raise ParseError('include not found: %s' % include) 144 145 def replace_lines(self, i, j, name, block): 146 lines = self.split_block_lines(self.text) 147 index = list(self._line_index) 148 149 endline = self.include(lines, index, i, j, name, block) 150 151 self.text = self.join_block_lines(lines) 152 self._line_index = index 153 self._postprocess() 154 155 newtext = self.join_block_lines(lines[j + 1:endline + 2]) 156 return endline, newtext 157 158 @property 159 def pos(self): 160 return self._pos 161 162 @pos.setter 163 def pos(self, p): 164 self.goto(p) 165 166 @property 167 def line(self): 168 return self.posline() 169 170 @property 171 def col(self): 172 return self.poscol() 173 174 def posline(self, pos=None): 175 if pos is None: 176 pos = self._pos 177 return self._line_cache[pos].line 178 179 def poscol(self, pos=None): 180 if pos is None: 181 pos = self._pos 182 start = self._line_cache[pos].start 183 return pos - start 184 185 def atend(self): 186 return self._pos >= self._len 187 188 def ateol(self): 189 return self.atend() or self.current() in '\r\n' 190 191 def current(self): 192 if self._pos >= self._len: 193 return None 194 return self.text[self._pos] 195 196 def at(self, p): 197 if p >= self._len: 198 return None 199 return self.text[p] 200 201 def peek(self, n=1): 202 return self.at(self._pos + n) 203 204 def next(self): 205 if self._pos >= self._len: 206 return None 207 c = self.text[self._pos] 208 self._pos += 1 209 return c 210 211 def goto(self, p): 212 self._pos = max(0, min(len(self.text), p)) 213 214 def move(self, n): 215 self.goto(self.pos + n) 216 217 def comments(self, p, clear=False): 218 if not self.comment_recovery or not self._comment_index: 219 return CommentInfo([], []) 220 221 n = self.posline(p) 222 if n >= len(self._comment_index): 223 return CommentInfo([], []) 224 225 eolcmm = [] 226 if n < len(self._comment_index): 227 eolcmm = self._comment_index[n].eol 228 if clear: 229 self._comment_index[n].eol = [] 230 231 cmm = [] 232 while n >= 0 and self._comment_index[n].inline: 233 cmm.insert(0, self._comment_index[n].inline) 234 if clear: 235 self._comment_index[n].inline = [] 236 n -= 1 237 238 return CommentInfo(cmm, eolcmm) 239 240 def _index_comments(self, comments, selector): 241 if comments and self.comment_recovery: 242 n = self.line 243 extend_list(self._comment_index, n, default=CommentInfo.new_comment) 244 previous = selector(self._comment_index[n]) 245 if not contains_sublist(previous, comments): # FIXME: will discard repeated comments 246 previous.extend(comments) 247 248 def _eat_regex(self, regex): 249 if regex is not None: 250 return list(takewhile(identity, imap(self.matchre, repeat(regex)))) 251 252 def eat_whitespace(self): 253 return self._eat_regex(self.whitespace_re) 254 255 def eat_comments(self): 256 comments = self._eat_regex(self.comments_re) 257 self._index_comments(comments, lambda x: x.inline) 258 259 def eat_eol_comments(self): 260 comments = self._eat_regex(self.eol_comments_re) 261 self._index_comments(comments, lambda x: x.eol) 262 263 def next_token(self): 264 p = None 265 while self._pos != p: 266 p = self._pos 267 self.eat_eol_comments() 268 self.eat_comments() 269 self.eat_whitespace() 270 271 def skip_to(self, c): 272 p = self._pos 273 le = self._len 274 while p < le and self.text[p] != c: 275 p += 1 276 self.goto(p) 277 return self.pos 278 279 def skip_past(self, c): 280 self.skip_to(c) 281 self.next() 282 return self.pos 283 284 def skip_to_eol(self): 285 return self.skip_to('\n') 286 287 def scan_space(self, offset=0): 288 return ( 289 self.whitespace_re and 290 self._scanre(self.whitespace_re, offset=offset) is not None 291 ) 292 293 def is_space(self): 294 return self.scan_space() 295 296 def is_name_char(self, c): 297 return c is not None and c.isalnum() or c in self._namechar_set 298 299 def match(self, token, ignorecase=None): 300 ignorecase = ignorecase if ignorecase is not None else self.ignorecase 301 302 if token is None: 303 return self.atend() 304 305 p = self.pos 306 if ignorecase: 307 is_match = self.text[p:p + len(token)].lower() == token.lower() 308 else: 309 is_match = self.text[p:p + len(token)] == token 310 311 if is_match: 312 self.move(len(token)) 313 if not self.nameguard: 314 return token 315 316 partial_match = ( 317 token.isalnum() and 318 token[0].isalpha() and 319 self.is_name_char(self.current()) 320 ) 321 if not partial_match: 322 return token 323 self.goto(p) 324 325 def matchre(self, pattern, ignorecase=None): 326 matched = self._scanre(pattern, ignorecase=ignorecase) 327 if matched: 328 token = matched.group() 329 self.move(len(token)) 330 return token 331 332 def _scanre(self, pattern, ignorecase=None, offset=0): 333 ignorecase = ignorecase if ignorecase is not None else self.ignorecase 334 335 if isinstance(pattern, RETYPE): 336 re = pattern 337 elif pattern in self._re_cache: 338 re = self._re_cache[pattern] 339 else: 340 flags = RE_FLAGS | (regexp.IGNORECASE if ignorecase else 0) 341 re = regexp.compile( 342 pattern, 343 flags 344 ) 345 self._re_cache[pattern] = re 346 return re.match(self.text, self.pos + offset) 347 348 @property 349 def linecount(self): 350 return self._linecount 351 352 def line_info(self, pos=None): 353 if pos is None: 354 pos = self._pos 355 356 if pos >= len(self._line_cache): 357 return LineInfo(self.filename, self.linecount, 0, self._len, self._len, '') 358 359 start, line, length = self._line_cache[pos] 360 end = start + length 361 col = pos - start 362 363 text = self.text[start:end] 364 n = min(len(self._line_index) - 1, line) 365 filename, line = self._line_index[n] 366 367 return LineInfo(filename, line, col, start, end, text) 368 369 def lookahead(self): 370 if self.atend(): 371 return '' 372 info = self.line_info() 373 text = info.text[info.col:info.col + 1 + 80] 374 text = self.split_block_lines(text)[0].rstrip() 375 return '<%d:%d>%s' % (info.line + 1, info.col + 1, text) 376 377 def get_line(self, n=None): 378 if n is None: 379 n = self.line 380 return self._lines[n] 381 382 def get_lines(self, start=None, end=None): 383 if start is None: 384 start = 0 385 if end is None: 386 end = len(self._lines) 387 return self._lines[start:end + 1] 388 389 def line_index(self, start=0, end=None): 390 if end is None: 391 end = len(self._line_index) 392 return self._line_index[start:1 + end] 393 394 def __repr__(self): 395 return '%s@%d' % (type(self).__name__, self.pos) 396 397 def __json__(self): 398 return None 399