# -*- test-case-name: twisted.words.test.test_xpath -*-
# Copyright (c) Twisted Matrix Laboratories.
# See LICENSE for details.

# pylint: disable=W9401,W9402

# DO NOT EDIT xpathparser.py!
#
# It is generated from xpathparser.g using Yapps. Make needed changes there.
# This also means that the generated Python may not conform to Twisted's coding
# standards, so it is wrapped in exec to prevent automated checkers from
# complaining.

# HOWTO Generate me:
#
# 1.) Grab a copy of yapps2:
#         https://github.com/smurfix/yapps
#
#     Note: Do NOT use the package in debian/ubuntu as it has incompatible
#     modifications. The original at http://theory.stanford.edu/~amitp/yapps/
#     hasn't been touched since 2003 and has not been updated to work with
#     Python 3.
#
# 2.) Generate the grammar:
#
#         yapps2 xpathparser.g xpathparser.py.proto
#
# 3.) Edit the output to depend on the embedded runtime, and remove extraneous
#     imports:
#
#         sed -e '/^# Begin/,${/^[^ ].*mport/d}' -e 's/runtime\.//g' \
#             -e "s/^\(from __future\)/exec(r'''\n\1/" -e"\$a''')" \
#             xpathparser.py.proto > xpathparser.py

"""
XPath Parser.

Besides the parser code produced by Yapps, this module also defines the
parse-time exception classes, a scanner class, a base class for parsers
produced by Yapps, and a context class that keeps track of the parse stack.
These have been copied from the Yapps runtime module.
"""

from __future__ import print_function
import sys, re

MIN_WINDOW=4096
# File lookup window

class SyntaxError(Exception):
    """When we run into an unexpected token, this is the exception to use"""
    def __init__(self, pos=None, msg="Bad Token", context=None):
        """Record where the error happened and what was being parsed.

        Parameters:
          pos     : position as returned by Scanner.get_pos(), or None
          msg     : human readable description of the problem
          context : Context object active when the error was raised, or None
        """
        Exception.__init__(self)
        self.pos = pos
        self.msg = msg
        self.context = context

    def __str__(self):
        """Return a short description; the position is included when known."""
        if not self.pos: return 'SyntaxError'
        else: return 'SyntaxError@%s(%s)' % (repr(self.pos), self.msg)

class NoMoreTokens(Exception):
    """Another exception object, for when we run out of tokens"""
    pass

class Token:
    """Yapps token.

    This is a container for a scanned token.
    """

    def __init__(self, type,value, pos=None):
        """Initialize a token.

        Parameters:
          type  : terminal name the token was matched as
          value : the matched text
          pos   : position as returned by Scanner.get_pos(), or None
        """
        self.type = type
        self.value = value
        self.pos = pos

    def __repr__(self):
        """Return '<type: value @ file:line.col>'; the position part is
        only present when a position was recorded."""
        output = '<%s: %s' % (self.type, repr(self.value))
        if self.pos:
            output += " @ "
            if self.pos[0]:
                output += "%s:" % self.pos[0]
            if self.pos[1]:
                output += "%d" % self.pos[1]
            if self.pos[2] is not None:
                output += ".%d" % self.pos[2]
        output += ">"
        return output

# Counter used by Scanner.__init__ to make up a file name ("<f.N>") for
# scanners that are created without one.
in_name=0
class Scanner:
    """Yapps scanner.

    The Yapps scanner can work in context sensitive or context
    insensitive modes.  The token(i) method is used to retrieve the
    i-th token.  It takes a restrict set that limits the set of tokens
    it is allowed to return.  In context sensitive mode, this restrict
    set guides the scanner.  In context insensitive mode, there is no
    restriction (the set is always the full set of tokens).

    """

    def __init__(self, patterns, ignore, input="",
            file=None,filename=None,stacked=False):
        """Initialize the scanner.

        Parameters:
          patterns : [(terminal, uncompiled regex), ...] or None
          ignore : {terminal:None, ...}
          input : string
          file : object with a read(size) method from which more input
                 is pulled on demand, or None
          filename : name reported in positions; generated if not given
          stacked : True when this scanner was created by stack_input()

        If patterns is None, we assume that the subclass has
        defined self.patterns : [(terminal, compiled regex), ...].
        Note that the patterns parameter expects uncompiled regexes,
        whereas the self.patterns field expects compiled regexes.

        The 'ignore' value is either None or a callable, which is called
        with the scanner and the to-be-ignored match object; this can
        be used for include file or comment handling.
        """

        if not filename:
            global in_name
            filename="<f.%d>" % in_name
            in_name += 1

        self.input = input
        self.ignore = ignore
        self.file = file
        self.filename = filename
        self.pos = 0
        self.del_pos = 0 # skipped
        self.line = 1
        self.del_line = 0 # skipped
        self.col = 0
        self.tokens = []
        self.stack = None
        self.stacked = stacked

        self.last_read_token = None
        self.last_token = None
        self.last_types = None

        if patterns is not None:
            # Compile the regex strings into regex objects
            self.patterns = []
            for terminal, regex in patterns:
                self.patterns.append( (terminal, re.compile(regex)) )

    def stack_input(self, input="", file=None, filename=None):
        """Temporarily parse from a second file.

        The new input is handled by a nested scanner stored in self.stack;
        token() reads from it until it is exhausted.
        """

        # Already reading from somewhere else: Go on top of that, please.
        if self.stack:
            # autogenerate a recursion-level-identifying filename
            if not filename:
                filename = 1
            else:
                try:
                    filename += 1
                except TypeError:
                    pass
                # now pass off to the include file
            self.stack.stack_input(input,file,filename)
        else:

            # A numeric filename is a recursion level handed down by an
            # outer scanner; turn it into a printable name.  Anything else
            # (a string or None) is passed through unchanged.
            try:
                filename += 0
            except TypeError:
                pass
            else:
                filename = "<str_%d>" % filename

#            self.stack = object.__new__(self.__class__)
#            Scanner.__init__(self.stack,self.patterns,self.ignore,input,file,filename, stacked=True)

            # Note that the pattern+ignore are added by the generated
            # scanner code
            self.stack = self.__class__(input,file,filename, stacked=True)

    def get_pos(self):
        """Return a file/line/char tuple."""
        if self.stack: return self.stack.get_pos()

        return (self.filename, self.line+self.del_line, self.col)

#    def __repr__(self):
#        """Print the last few tokens that have been scanned in"""
#        output = ''
#        for t in self.tokens:
#            output += '%s\n' % (repr(t),)
#        return output

    def print_line_with_pointer(self, pos, length=0, out=sys.stderr):
        """Print the line of 'text' that includes position 'p',
        along with a second line with a single caret (^) at position p

        Parameters:
          pos    : (file, line, column) tuple as returned by get_pos()
          length : added to the column before the caret is placed
          out    : file-like object the two lines are written to
        """

        file,line,p = pos
        if file != self.filename:
            if self.stack: return self.stack.print_line_with_pointer(pos,length=length,out=out)
            # print_function is in effect for this module, so the
            # function-call form with file= must be used here.
            print("(%s: not in input buffer)" % file, file=out)
            return

        text = self.input
        p += length-1 # starts at pos 1

        origline=line
        line -= self.del_line
        spos=0
        if line > 0:
            while 1:
                line = line - 1
                try:
                    cr = text.index("\n",spos)
                except ValueError:
                    if line:
                        # The requested line is beyond the buffered input.
                        text = ""
                    else:
                        # The requested line is the last one and has no
                        # trailing newline: it runs from spos to the end.
                        text = text[spos:]
                    break
                if line == 0:
                    text = text[spos:cr]
                    break
                spos = cr+1
        else:
            print("(%s:%d not in input buffer)" % (file,origline), file=out)
            return

        # Now try printing part of the line
        text = text[max(p-80, 0):p+80]
        p = p - max(p-80, 0)

        # Strip to the left
        i = text[:p].rfind('\n')
        j = text[:p].rfind('\r')
        if i < 0 or (0 <= j < i): i = j
        if 0 <= i < p:
            p = p - i - 1
            text = text[i+1:]

        # Strip to the right
        i = text.find('\n', p)
        j = text.find('\r', p)
        if i < 0 or (0 <= j < i): i = j
        if i >= 0:
            text = text[:i]

        # Now shorten the text
        while len(text) > 70 and p > 60:
            # Cut off 10 chars
            text = "..." + text[10:]
            p = p - 7

        # Now print the string, along with an indicator
        print('> ', text, file=out)
        print('> ', ' '*p + '^', file=out)

    def grab_input(self):
        """Get more input if possible.

        Does nothing when there is no file to read from or when at least
        MIN_WINDOW characters are still unread in the buffer.
        """
        if not self.file: return
        if len(self.input) - self.pos >= MIN_WINDOW: return

        data = self.file.read(MIN_WINDOW)
        if data is None or data == "":
            # End of input: stop asking the file for more.  Normalise None
            # to an empty string so the concatenations below cannot fail.
            self.file = None
            data = ""

        # Drop bytes from the start, if necessary.
        if self.pos > 2*MIN_WINDOW:
            self.del_pos += MIN_WINDOW
            self.del_line += self.input[:MIN_WINDOW].count("\n")
            self.pos -= MIN_WINDOW
            self.input = self.input[MIN_WINDOW:] + data
        else:
            self.input = self.input + data

    def getchar(self):
        """Return the next character."""
        self.grab_input()

        c = self.input[self.pos]
        self.pos += 1
        return c

    def token(self, restrict, context=None):
        """Scan for another token.

        Parameters:
          restrict : collection of terminal names that may be returned;
                     an empty collection means no restriction
          context  : Context object attached to a SyntaxError, if raised

        Returns the next Token that is not in self.ignore.  Raises
        SyntaxError when no pattern matches at the current position, and
        StopIteration when a stacked scanner has consumed all its input.
        """

        while 1:
            if self.stack:
                try:
                    return self.stack.token(restrict, context)
                except StopIteration:
                    # The stacked input is exhausted; resume our own.
                    self.stack = None

            # Keep looking for a token, ignoring any in self.ignore
            self.grab_input()

            # special handling for end-of-file
            if self.stacked and self.pos==len(self.input):
                raise StopIteration

            # Search the patterns for the longest match, with earlier
            # tokens in the list having preference
            best_match = -1
            best_pat = '(error)'
            best_m = None
            for p, regexp in self.patterns:
                # First check to see if we're ignoring this token
                if restrict and p not in restrict and p not in self.ignore:
                    continue
                m = regexp.match(self.input, self.pos)
                if m and m.end()-m.start() > best_match:
                    # We got a match that's better than the previous one
                    best_pat = p
                    best_match = m.end()-m.start()
                    best_m = m

            # If we didn't find anything, raise an error
            if best_pat == '(error)' and best_match < 0:
                msg = 'Bad Token'
                if restrict:
                    msg = 'Trying to find one of '+', '.join(restrict)
                raise SyntaxError(self.get_pos(), msg, context=context)

            ignore = best_pat in self.ignore
            value = self.input[self.pos:self.pos+best_match]
            if not ignore:
                # Build the token before advancing so that its position
                # is the position where the match started.
                tok=Token(type=best_pat, value=value, pos=self.get_pos())

            self.pos += best_match

            npos = value.rfind("\n")
            if npos > -1:
                self.col = best_match-npos
                self.line += value.count("\n")
            else:
                self.col += best_match

            # If we found something that isn't to be ignored, return it
            if not ignore:
                # Only the ten most recent tokens are kept.
                if len(self.tokens) >= 10:
                    del self.tokens[0]
                self.tokens.append(tok)
                self.last_read_token = tok
                # print repr(tok)
                return tok
            else:
                ignore = self.ignore[best_pat]
                if ignore:
                    ignore(self, best_m)

    def peek(self, *types, **kw):
        """Returns the token type for lookahead; if there are any args
        then the list of args is the set of token types to allow"""
        context = kw.get("context",None)
        if self.last_token is None:
            self.last_types = types
            self.last_token = self.token(types,context)
        elif self.last_types:
            # A lookahead token is already buffered; it was scanned under
            # self.last_types, so only the same restrictions are accepted.
            for t in types:
                if t not in self.last_types:
                    raise NotImplementedError("Unimplemented: restriction set changed")
        return self.last_token.type

    def scan(self, type, **kw):
        """Returns the matched text, and moves to the next token"""
        context = kw.get("context",None)

        if self.last_token is None:
            tok = self.token([type],context)
        else:
            if self.last_types and type not in self.last_types:
                raise NotImplementedError("Unimplemented: restriction set changed")

            # Consume the token buffered by peek().
            tok = self.last_token
            self.last_token = None
        if tok.type != type:
            if not self.last_types: self.last_types=[]
            raise SyntaxError(tok.pos, 'Trying to find '+type+': '+ ', '.join(self.last_types)+", got "+tok.type, context=context)
        return tok.value

class Parser:
    """Base class for Yapps-generated parsers.

    """

    def __init__(self, scanner):
        """Remember the Scanner object that tokens are read from."""
        self._scanner = scanner

    def _stack(self, input="",file=None,filename=None):
        """Temporarily read from someplace else"""
        self._scanner.stack_input(input,file,filename)
        self._tok = None

    def _peek(self, *types, **kw):
        """Returns the token type for lookahead; if there are any args
        then the list of args is the set of token types to allow"""
        return self._scanner.peek(*types, **kw)

    def _scan(self, type, **kw):
        """Returns the matched text, and moves to the next token"""
        return self._scanner.scan(type, **kw)

class Context:
    """Class to represent the parser's call stack.

    Every rule creates a Context that links to its parent rule.  The
    contexts can be used for debugging.

    """

    def __init__(self, parent, scanner, rule, args=()):
        """Create a new context.

        Args:
        parent: Context object or None
        scanner: Scanner object
        rule: string (name of the rule)
        args: tuple listing parameters to the rule

        """
        self.parent = parent
        self.scanner = scanner
        self.rule = rule
        self.args = args
        # Follow the chain of stacked scanners to the innermost one and
        # remember the token it read last.
        while scanner.stack: scanner = scanner.stack
        self.token = scanner.last_read_token

    def __str__(self):
        """Return the rule names from the outermost context down to this
        one, joined by ' > '."""
        output = ''
        if self.parent: output = str(self.parent) + ' > '
        output += self.rule
        return output

def print_error(err, scanner, max_ctx=None):
    """Print error messages, the parser stack, and the input text -- for human-readable error messages.

    Parameters:
      err     : the SyntaxError to report
      scanner : Scanner used to look up positions and input lines
      max_ctx : maximum number of contexts to print; None or 0 prints all
    """
    # NOTE: this function assumes 80 columns :-(
    # Figure out the line number
    pos = err.pos
    if not pos:
        pos = scanner.get_pos()

    file_name, line_number, column_number = pos
    print('%s:%d:%d: %s' % (file_name, line_number, column_number, err.msg), file=sys.stderr)

    scanner.print_line_with_pointer(pos)

    context = err.context
    token = None
    while context:
        print('while parsing %s%s:' % (context.rule, tuple(context.args)), file=sys.stderr)
        # A context without a token of its own reuses the token found in
        # the previously printed (inner) context.
        if context.token:
            token = context.token
        if token:
            scanner.print_line_with_pointer(token.pos, length=len(token.value))
        context = context.parent
        if max_ctx:
            max_ctx = max_ctx-1
            if not max_ctx:
                break

def wrap_error_reporter(parser, rule, *args,**kw):
    """Call the named rule method of parser with the given arguments and
    return its result.

    SyntaxError and NoMoreTokens are reported on sys.stderr instead of
    being propagated; in that case None is returned.
    """
    try:
        return getattr(parser, rule)(*args,**kw)
    except SyntaxError as e:
        print_error(e, parser._scanner)
    except NoMoreTokens:
        print('Could not complete parsing; stopped around here:', file=sys.stderr)
        print(parser._scanner, file=sys.stderr)

from twisted.words.xish.xpath import AttribValue, BooleanValue, CompareValue
from twisted.words.xish.xpath import Function, IndexValue, LiteralValue
from twisted.words.xish.xpath import _AnyLocation, _Location

%%
parser XPathParser:
        ignore:             "\\s+"
        token INDEX:        "[0-9]+"
        token WILDCARD:     "\*"
        token IDENTIFIER:   "[a-zA-Z][a-zA-Z0-9_\-]*"
        token ATTRIBUTE:    "\@[a-zA-Z][a-zA-Z0-9_\-]*"
        token FUNCNAME:     "[a-zA-Z][a-zA-Z0-9_]*"
        token CMP_EQ:       "\="
        token CMP_NE:       "\!\="
        token STR_DQ:       '"([^"]|(\\"))*?"'
        token STR_SQ:       "'([^']|(\\'))*?'"
        token OP_AND:       "and"
        token OP_OR:        "or"
        token END:          "$"

        # An XPath is one or more PATH steps; each further step is linked
        # to the previous one through its childLocation attribute.
        rule XPATH:      PATH {{ result = PATH; current = result }}
                           ( PATH {{ current.childLocation = PATH; current = current.childLocation }} ) * END
                           {{ return result }}

        # A step is a location marker, an element name or wildcard, and
        # any number of bracketed predicates.
        rule PATH:       ("/" {{ result = _Location() }} | "//" {{ result = _AnyLocation() }} )
                           ( IDENTIFIER {{ result.elementName = IDENTIFIER }} | WILDCARD {{ result.elementName = None }} )
                           ( "\[" PREDICATE {{ result.predicates.append(PREDICATE) }} "\]")*
                           {{ return result }}

        rule PREDICATE:  EXPR  {{ return EXPR }} |
                         INDEX {{ return IndexValue(INDEX) }}

        rule EXPR:       FACTOR {{ e = FACTOR }}
                           ( BOOLOP FACTOR {{ e = BooleanValue(e, BOOLOP, FACTOR) }} )*
                           {{ return e }}

        rule BOOLOP:     ( OP_AND {{ return OP_AND }} | OP_OR {{ return OP_OR }} )

        rule FACTOR:     TERM {{ return TERM }}
                           | "\(" EXPR "\)" {{ return EXPR }}

        rule TERM:       VALUE            {{ t = VALUE }}
                           [ CMP VALUE  {{ t = CompareValue(t, CMP, VALUE) }} ]
                                          {{ return t }}

        rule VALUE:      "@" IDENTIFIER   {{ return AttribValue(IDENTIFIER) }} |
                         FUNCNAME         {{ f = Function(FUNCNAME); args = [] }}
                           "\(" [ VALUE      {{ args.append(VALUE) }}
                             (
                               "," VALUE     {{ args.append(VALUE) }}
                             )*
                           ] "\)"           {{ f.setParams(*args); return f }} |
                         STR              {{ return LiteralValue(STR[1:len(STR)-1]) }}

        rule CMP: (CMP_EQ  {{ return CMP_EQ }} | CMP_NE {{ return CMP_NE }})
        rule STR: (STR_DQ  {{ return STR_DQ }} | STR_SQ {{ return STR_SQ }})