1# -*- test-case-name: twisted.words.test.test_xpath -*-
2# Copyright (c) Twisted Matrix Laboratories.
3# See LICENSE for details.
4
5# pylint: disable=W9401,W9402
6
7# DO NOT EDIT xpathparser.py!
8#
9# It is generated from xpathparser.g using Yapps. Make needed changes there.
10# This also means that the generated Python may not conform to Twisted's coding
11# standards, so it is wrapped in exec to prevent automated checkers from
12# complaining.
13
14# HOWTO Generate me:
15#
16# 1.) Grab a copy of yapps2:
17#         https://github.com/smurfix/yapps
18#
19#     Note: Do NOT use the package in debian/ubuntu as it has incompatible
20#     modifications. The original at http://theory.stanford.edu/~amitp/yapps/
21#     hasn't been touched since 2003 and has not been updated to work with
22#     Python 3.
23#
24# 2.) Generate the grammar:
25#
26#         yapps2 xpathparser.g xpathparser.py.proto
27#
28# 3.) Edit the output to depend on the embedded runtime, and remove extraneous
29#     imports:
30#
31#         sed -e '/^# Begin/,${/^[^ ].*mport/d}' -e 's/runtime\.//g' \
32#             -e "s/^\(from __future\)/exec(r'''\n\1/" -e"\$a''')"
33#             xpathparser.py.proto > xpathparser.py
34
35"""
36XPath Parser.
37
38Besides the parser code produced by Yapps, this module also defines the
39parse-time exception classes, a scanner class, a base class for parsers
40produced by Yapps, and a context class that keeps track of the parse stack.
41These have been copied from the Yapps runtime module.
42"""
43
44from __future__ import print_function
45import sys, re
46
MIN_WINDOW=4096
# Minimum number of characters the scanner tries to keep buffered ahead of
# the current position when reading from a file (see Scanner.grab_input).
49
class SyntaxError(Exception):
    """Raised when the scanner meets a token it cannot accept.

    Note: this deliberately shadows the builtin ``SyntaxError`` within
    this module; parse errors carry an optional position, a message,
    and the parse Context active at the time.
    """

    def __init__(self, pos=None, msg="Bad Token", context=None):
        Exception.__init__(self)
        self.pos = pos          # (filename, line, column) triple, or None
        self.msg = msg          # human-readable description of the failure
        self.context = context  # Context object for the parse stack, or None

    def __str__(self):
        if self.pos:
            return 'SyntaxError@%s(%s)' % (repr(self.pos), self.msg)
        return 'SyntaxError'
61
class NoMoreTokens(Exception):
    """Raised when the scanner has exhausted its input and cannot
    produce another token."""
65
class Token:
    """A single token produced by the Yapps scanner.

    Carries the terminal name (``type``), the matched text (``value``)
    and an optional ``(filename, line, column)`` position triple.
    """

    def __init__(self, type,value, pos=None):
        """Store terminal name, matched text, and optional position."""
        self.type = type
        self.value = value
        self.pos = pos

    def __repr__(self):
        # Build the representation piecewise, then join once.
        pieces = ['<%s: %s' % (self.type, repr(self.value))]
        if self.pos:
            pieces.append(" @ ")
            if self.pos[0]:
                pieces.append("%s:" % self.pos[0])
            if self.pos[1]:
                pieces.append("%d" % self.pos[1])
            if self.pos[2] is not None:
                pieces.append(".%d" % self.pos[2])
        pieces.append(">")
        return ''.join(pieces)
90
91in_name=0
class Scanner:
    """Yapps scanner.

    The Yapps scanner can work in context sensitive or context
    insensitive modes.  The token(i) method is used to retrieve the
    i-th token.  It takes a restrict set that limits the set of tokens
    it is allowed to return.  In context sensitive mode, this restrict
    set guides the scanner.  In context insensitive mode, there is no
    restriction (the set is always the full set of tokens).

    """

    def __init__(self, patterns, ignore, input="",
            file=None,filename=None,stacked=False):
        """Initialize the scanner.

        Parameters:
          patterns : [(terminal, uncompiled regex), ...] or None
          ignore : {terminal:None, ...}
          input : string
          file : optional file-like object to read further input from
          filename : name reported in positions/diagnostics; when not
            given, autogenerated as "<f.N>" from a module counter
          stacked : True when this scanner was created by stack_input()
            to handle an include file

        If patterns is None, we assume that the subclass has
        defined self.patterns : [(terminal, compiled regex), ...].
        Note that the patterns parameter expects uncompiled regexes,
        whereas the self.patterns field expects compiled regexes.

        The values of the 'ignore' mapping are either None or a
        callable, which is called with the scanner and the
        to-be-ignored match object; this can be used for include file
        or comment handling.
        """

        if not filename:
            # Autogenerate a distinct "<f.N>" name for anonymous input.
            global in_name
            filename="<f.%d>" % in_name
            in_name += 1

        self.input = input
        self.ignore = ignore
        self.file = file
        self.filename = filename
        self.pos = 0
        self.del_pos = 0 # characters already dropped from the buffer front
        self.line = 1
        self.del_line = 0 # lines already dropped from the buffer front
        self.col = 0
        self.tokens = []
        self.stack = None
        self.stacked = stacked

        self.last_read_token = None
        self.last_token = None
        self.last_types = None

        if patterns is not None:
            # Compile the regex strings into regex objects
            self.patterns = []
            for terminal, regex in patterns:
                self.patterns.append( (terminal, re.compile(regex)) )

    def stack_input(self, input="", file=None, filename=None):
        """Temporarily parse from a second file."""

        # Already reading from somewhere else: Go on top of that, please.
        if self.stack:
            # autogenerate a recursion-level-identifying filename
            if not filename:
                filename = 1
            else:
                try:
                    filename += 1
                except TypeError:
                    pass
                # now pass off to the include file
            self.stack.stack_input(input,file,filename)
        else:

            try:
                filename += 0
            except TypeError:
                pass
            else:
                filename = "<str_%d>" % filename

            # Note that the pattern+ignore are added by the generated
            # scanner code
            self.stack = self.__class__(input,file,filename, stacked=True)

    def get_pos(self):
        """Return a file/line/char tuple."""
        # Delegate to the innermost stacked (include) scanner, if any.
        if self.stack: return self.stack.get_pos()

        return (self.filename, self.line+self.del_line, self.col)

    def print_line_with_pointer(self, pos, length=0, out=sys.stderr):
        """Print the line of 'text' that includes position 'p',
        along with a second line with a single caret (^) at position p"""

        file,line,p = pos
        if file != self.filename:
            # Position belongs to a stacked (include) scanner, if any.
            if self.stack: return self.stack.print_line_with_pointer(pos,length=length,out=out)
            # Fixed: was Python-2 `print >>out, ...`, which under
            # `from __future__ import print_function` evaluates as a
            # tuple expression and raises TypeError at runtime.
            print("(%s: not in input buffer)" % file, file=out)
            return

        text = self.input
        p += length-1 # starts at pos 1

        origline=line
        line -= self.del_line
        spos=0
        if line > 0:
            # Walk forward through the buffer to isolate the target line.
            while 1:
                line = line - 1
                try:
                    cr = text.index("\n",spos)
                except ValueError:
                    if line:
                        text = ""
                    break
                if line == 0:
                    text = text[spos:cr]
                    break
                spos = cr+1
        else:
            # Line was already dropped from the front of the buffer.
            print("(%s:%d not in input buffer)" % (file,origline), file=out)
            return

        # Now try printing part of the line
        text = text[max(p-80, 0):p+80]
        p = p - max(p-80, 0)

        # Strip to the left
        i = text[:p].rfind('\n')
        j = text[:p].rfind('\r')
        if i < 0 or (0 <= j < i): i = j
        if 0 <= i < p:
            p = p - i - 1
            text = text[i+1:]

        # Strip to the right
        i = text.find('\n', p)
        j = text.find('\r', p)
        if i < 0 or (0 <= j < i): i = j
        if i >= 0:
            text = text[:i]

        # Now shorten the text
        while len(text) > 70 and p > 60:
            # Cut off 10 chars
            text = "..." + text[10:]
            p = p - 7

        # Now print the string, along with an indicator
        print('> ',text, file=out)
        print('> ',' '*p + '^', file=out)

    def grab_input(self):
        """Get more input if possible."""
        if not self.file: return
        if len(self.input) - self.pos >= MIN_WINDOW: return

        data = self.file.read(MIN_WINDOW)
        if not data:
            # EOF (or a file object that returned None): stop reading,
            # and never concatenate a non-string onto the buffer.
            self.file = None
            data = ""

        # Drop bytes from the start, if necessary.
        if self.pos > 2*MIN_WINDOW:
            self.del_pos += MIN_WINDOW
            self.del_line += self.input[:MIN_WINDOW].count("\n")
            self.pos -= MIN_WINDOW
            self.input = self.input[MIN_WINDOW:] + data
        else:
            self.input = self.input + data

    def getchar(self):
        """Return the next character."""
        self.grab_input()

        # NOTE(review): raises IndexError at end of input — callers are
        # expected to handle that; confirm against generated scanners.
        c = self.input[self.pos]
        self.pos += 1
        return c

    def token(self, restrict, context=None):
        """Scan for another token.

        'restrict' is a sequence of terminal names the caller will
        accept here (falsy means "any token"); terminals in self.ignore
        are always scanned past.  Raises SyntaxError when nothing
        matches, and StopIteration when a stacked scanner's input is
        exhausted.
        """

        while 1:
            # If reading from an include file, delegate until exhausted.
            if self.stack:
                try:
                    return self.stack.token(restrict, context)
                except StopIteration:
                    self.stack = None

            # Keep looking for a token, ignoring any in self.ignore
            self.grab_input()

            # special handling for end-of-file
            if self.stacked and self.pos==len(self.input):
                raise StopIteration

            # Search the patterns for the longest match, with earlier
            # tokens in the list having preference
            best_match = -1
            best_pat = '(error)'
            best_m = None
            for p, regexp in self.patterns:
                # First check to see if we're ignoring this token
                if restrict and p not in restrict and p not in self.ignore:
                    continue
                m = regexp.match(self.input, self.pos)
                if m and m.end()-m.start() > best_match:
                    # We got a match that's better than the previous one
                    best_pat = p
                    best_match = m.end()-m.start()
                    best_m = m

            # If we didn't find anything, raise an error
            if best_pat == '(error)' and best_match < 0:
                msg = 'Bad Token'
                if restrict:
                    msg = 'Trying to find one of '+', '.join(restrict)
                raise SyntaxError(self.get_pos(), msg, context=context)

            ignore = best_pat in self.ignore
            value = self.input[self.pos:self.pos+best_match]
            if not ignore:
                tok=Token(type=best_pat, value=value, pos=self.get_pos())

            self.pos += best_match

            # Update line/column bookkeeping for the consumed text.
            npos = value.rfind("\n")
            if npos > -1:
                self.col = best_match-npos
                self.line += value.count("\n")
            else:
                self.col += best_match

            # If we found something that isn't to be ignored, return it
            if not ignore:
                if len(self.tokens) >= 10:
                    # Keep only a short history of recent tokens.
                    del self.tokens[0]
                self.tokens.append(tok)
                self.last_read_token = tok
                return tok
            else:
                # Ignored token: run its registered handler, if any
                # (used for comments / include files), then keep going.
                ignore = self.ignore[best_pat]
                if ignore:
                    ignore(self, best_m)

    def peek(self, *types, **kw):
        """Returns the token type for lookahead; if there are any args
        then the list of args is the set of token types to allow"""
        context = kw.get("context",None)
        if self.last_token is None:
            self.last_types = types
            self.last_token = self.token(types,context)
        elif self.last_types:
            # A token was already read under a restriction set; widening
            # it after the fact is not supported.
            for t in types:
                if t not in self.last_types:
                    raise NotImplementedError("Unimplemented: restriction set changed")
        return self.last_token.type

    def scan(self, type, **kw):
        """Returns the matched text, and moves to the next token"""
        context = kw.get("context",None)

        if self.last_token is None:
            tok = self.token([type],context)
        else:
            if self.last_types and type not in self.last_types:
                raise NotImplementedError("Unimplemented: restriction set changed")

            # Consume the token buffered by a previous peek().
            tok = self.last_token
            self.last_token = None
        if tok.type != type:
            if not self.last_types: self.last_types=[]
            raise SyntaxError(tok.pos, 'Trying to find '+type+': '+ ', '.join(self.last_types)+", got "+tok.type, context=context)
        return tok.value
379
class Parser:
    """Base class for Yapps-generated parsers.

    Holds the scanner and forwards lookahead/consumption requests to
    it; generated subclasses implement the grammar rules on top.
    """

    def __init__(self, scanner):
        # The scanner supplying tokens for this parser.
        self._scanner = scanner

    def _stack(self, input="",file=None,filename=None):
        """Temporarily read from someplace else"""
        self._scanner.stack_input(input, file, filename)
        self._tok = None

    def _peek(self, *token_types, **kwargs):
        """Returns the token type for lookahead; if there are any args
        then the list of args is the set of token types to allow"""
        return self._scanner.peek(*token_types, **kwargs)

    def _scan(self, type, **kwargs):
        """Returns the matched text, and moves to the next token"""
        return self._scanner.scan(type, **kwargs)
401
class Context:
    """One frame of the parser's rule-call stack.

    Every rule creates a Context linked to its parent rule's Context;
    the chain is used for error reporting and debugging.
    """

    def __init__(self, parent, scanner, rule, args=()):
        """Create a new context.

        Args:
        parent: Context object or None
        scanner: Scanner object
        rule: string (name of the rule)
        args: tuple listing parameters to the rule

        """
        self.parent = parent
        self.scanner = scanner
        self.rule = rule
        self.args = args
        # Record the last token read by the innermost stacked scanner.
        innermost = scanner
        while innermost.stack:
            innermost = innermost.stack
        self.token = innermost.last_read_token

    def __str__(self):
        prefix = '%s > ' % self.parent if self.parent else ''
        return prefix + self.rule
432
def print_error(err, scanner, max_ctx=None):
    """Print error messages, the parser stack, and the input text -- for human-readable error messages."""
    # NOTE: this function assumes 80 columns :-(
    # Fall back to the scanner's current position when the error has none.
    pos = err.pos or scanner.get_pos()

    fname, lineno, colno = pos
    print('%s:%d:%d: %s' % (fname, lineno, colno, err.msg), file=sys.stderr)

    scanner.print_line_with_pointer(pos)

    # Walk up the rule-call stack, printing each enclosing rule and the
    # most recent token seen at (or below) that level.
    ctx = err.context
    tok = None
    remaining = max_ctx
    while ctx:
        print('while parsing %s%s:' % (ctx.rule, tuple(ctx.args)), file=sys.stderr)
        if ctx.token:
            tok = ctx.token
        if tok:
            scanner.print_line_with_pointer(tok.pos, length=len(tok.value))
        ctx = ctx.parent
        if remaining:
            remaining = remaining-1
            if not remaining:
                break
459
def wrap_error_reporter(parser, rule, *args,**kw):
    """Invoke the named rule on ``parser``, reporting parse failures to
    stderr instead of propagating them; returns the rule's result on
    success, None on a reported failure."""
    try:
        rule_method = getattr(parser, rule)
        return rule_method(*args, **kw)
    except SyntaxError as e:
        print_error(e, parser._scanner)
    except NoMoreTokens:
        print('Could not complete parsing; stopped around here:', file=sys.stderr)
        print(parser._scanner, file=sys.stderr)
468
469from twisted.words.xish.xpath import AttribValue, BooleanValue, CompareValue
470from twisted.words.xish.xpath import Function, IndexValue, LiteralValue
471from twisted.words.xish.xpath import _AnyLocation, _Location
472
473%%
474parser XPathParser:
475        ignore:             "\\s+"
476        token INDEX:        "[0-9]+"
477        token WILDCARD:     "\*"
478        token IDENTIFIER:   "[a-zA-Z][a-zA-Z0-9_\-]*"
479        token ATTRIBUTE:    "\@[a-zA-Z][a-zA-Z0-9_\-]*"
480        token FUNCNAME:     "[a-zA-Z][a-zA-Z0-9_]*"
481        token CMP_EQ:       "\="
482        token CMP_NE:       "\!\="
483        token STR_DQ:       '"([^"]|(\\"))*?"'
484        token STR_SQ:       "'([^']|(\\'))*?'"
485        token OP_AND:       "and"
486        token OP_OR:        "or"
487        token END:          "$"
488
489        rule XPATH:      PATH {{ result = PATH; current = result }}
490                           ( PATH {{ current.childLocation = PATH; current = current.childLocation }} ) * END
491                           {{ return  result }}
492
493        rule PATH:       ("/" {{ result = _Location() }} | "//" {{ result = _AnyLocation() }} )
494                           ( IDENTIFIER {{ result.elementName = IDENTIFIER }} | WILDCARD {{ result.elementName = None }} )
495                           ( "\[" PREDICATE {{ result.predicates.append(PREDICATE) }} "\]")*
496                           {{ return result }}
497
498        rule PREDICATE:  EXPR  {{ return EXPR }} |
499                         INDEX {{ return IndexValue(INDEX) }}
500
501        rule EXPR:       FACTOR {{ e = FACTOR }}
502                           ( BOOLOP FACTOR {{ e = BooleanValue(e, BOOLOP, FACTOR) }} )*
503                           {{ return e }}
504
505        rule BOOLOP:     ( OP_AND {{ return OP_AND }} | OP_OR {{ return OP_OR }} )
506
507        rule FACTOR:     TERM {{ return TERM }}
508                           | "\(" EXPR "\)" {{ return EXPR }}
509
510        rule TERM:       VALUE            {{ t = VALUE }}
511                           [ CMP VALUE  {{ t = CompareValue(t, CMP, VALUE) }} ]
512                                          {{ return t }}
513
514        rule VALUE:      "@" IDENTIFIER   {{ return AttribValue(IDENTIFIER) }} |
515                         FUNCNAME         {{ f = Function(FUNCNAME); args = [] }}
516                           "\(" [ VALUE      {{ args.append(VALUE) }}
517                             (
518                               "," VALUE     {{ args.append(VALUE) }}
519                             )*
520                           ] "\)"           {{ f.setParams(*args); return f }} |
521                         STR              {{ return LiteralValue(STR[1:len(STR)-1]) }}
522
523        rule CMP: (CMP_EQ  {{ return CMP_EQ }} | CMP_NE {{ return CMP_NE }})
524        rule STR: (STR_DQ  {{ return STR_DQ }} | STR_SQ {{ return STR_SQ }})
525