1# -*- coding: utf-8 -*-
2"""
3    jinja2.lexer
4    ~~~~~~~~~~~~
5
6    This module implements a Jinja / Python combination lexer. The
7    `Lexer` class provided by this module is used to do some preprocessing
8    for Jinja.
9
10    On the one hand it filters out invalid operators like the bitshift
11    operators we don't allow in templates. On the other hand it separates
12    template code and python code in expressions.
13
14    :copyright: (c) 2010 by the Jinja Team.
15    :license: BSD, see LICENSE for more details.
16"""
17import re
18from operator import itemgetter
19from collections import deque
20from jinja2.exceptions import TemplateSyntaxError
21from jinja2.utils import LRUCache, next
22
23
24# cache for the lexers. Exists in order to be able to have multiple
25# environments with the same lexer
26_lexer_cache = LRUCache(50)
27
28# static regular expressions
29whitespace_re = re.compile(r'\s+', re.U)
30string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
31                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
32integer_re = re.compile(r'\d+')
33
34# we use the unicode identifier rule if this python version is able
35# to handle unicode identifiers, otherwise the standard ASCII one.
36try:
37    compile('föö', '<unknown>', 'eval')
38except SyntaxError:
39    name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
40else:
41    from jinja2 import _stringdefs
42    name_re = re.compile(r'[%s][%s]*' % (_stringdefs.xid_start,
43                                         _stringdefs.xid_continue))
44
45float_re = re.compile(r'(?<!\.)\d+\.\d+')
46newline_re = re.compile(r'(\r\n|\r|\n)')
47
48# internal the tokens and keep references to them
49TOKEN_ADD = intern('add')
50TOKEN_ASSIGN = intern('assign')
51TOKEN_COLON = intern('colon')
52TOKEN_COMMA = intern('comma')
53TOKEN_DIV = intern('div')
54TOKEN_DOT = intern('dot')
55TOKEN_EQ = intern('eq')
56TOKEN_FLOORDIV = intern('floordiv')
57TOKEN_GT = intern('gt')
58TOKEN_GTEQ = intern('gteq')
59TOKEN_LBRACE = intern('lbrace')
60TOKEN_LBRACKET = intern('lbracket')
61TOKEN_LPAREN = intern('lparen')
62TOKEN_LT = intern('lt')
63TOKEN_LTEQ = intern('lteq')
64TOKEN_MOD = intern('mod')
65TOKEN_MUL = intern('mul')
66TOKEN_NE = intern('ne')
67TOKEN_PIPE = intern('pipe')
68TOKEN_POW = intern('pow')
69TOKEN_RBRACE = intern('rbrace')
70TOKEN_RBRACKET = intern('rbracket')
71TOKEN_RPAREN = intern('rparen')
72TOKEN_SEMICOLON = intern('semicolon')
73TOKEN_SUB = intern('sub')
74TOKEN_TILDE = intern('tilde')
75TOKEN_WHITESPACE = intern('whitespace')
76TOKEN_FLOAT = intern('float')
77TOKEN_INTEGER = intern('integer')
78TOKEN_NAME = intern('name')
79TOKEN_STRING = intern('string')
80TOKEN_OPERATOR = intern('operator')
81TOKEN_BLOCK_BEGIN = intern('block_begin')
82TOKEN_BLOCK_END = intern('block_end')
83TOKEN_VARIABLE_BEGIN = intern('variable_begin')
84TOKEN_VARIABLE_END = intern('variable_end')
85TOKEN_RAW_BEGIN = intern('raw_begin')
86TOKEN_RAW_END = intern('raw_end')
87TOKEN_COMMENT_BEGIN = intern('comment_begin')
88TOKEN_COMMENT_END = intern('comment_end')
89TOKEN_COMMENT = intern('comment')
90TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin')
91TOKEN_LINESTATEMENT_END = intern('linestatement_end')
92TOKEN_LINECOMMENT_BEGIN = intern('linecomment_begin')
93TOKEN_LINECOMMENT_END = intern('linecomment_end')
94TOKEN_LINECOMMENT = intern('linecomment')
95TOKEN_DATA = intern('data')
96TOKEN_INITIAL = intern('initial')
97TOKEN_EOF = intern('eof')
98
99# bind operators to token types
100operators = {
101    '+':            TOKEN_ADD,
102    '-':            TOKEN_SUB,
103    '/':            TOKEN_DIV,
104    '//':           TOKEN_FLOORDIV,
105    '*':            TOKEN_MUL,
106    '%':            TOKEN_MOD,
107    '**':           TOKEN_POW,
108    '~':            TOKEN_TILDE,
109    '[':            TOKEN_LBRACKET,
110    ']':            TOKEN_RBRACKET,
111    '(':            TOKEN_LPAREN,
112    ')':            TOKEN_RPAREN,
113    '{':            TOKEN_LBRACE,
114    '}':            TOKEN_RBRACE,
115    '==':           TOKEN_EQ,
116    '!=':           TOKEN_NE,
117    '>':            TOKEN_GT,
118    '>=':           TOKEN_GTEQ,
119    '<':            TOKEN_LT,
120    '<=':           TOKEN_LTEQ,
121    '=':            TOKEN_ASSIGN,
122    '.':            TOKEN_DOT,
123    ':':            TOKEN_COLON,
124    '|':            TOKEN_PIPE,
125    ',':            TOKEN_COMMA,
126    ';':            TOKEN_SEMICOLON
127}
128
129reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
130assert len(operators) == len(reverse_operators), 'operators dropped'
131operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
132                         sorted(operators, key=lambda x: -len(x))))
133
134ignored_tokens = frozenset([TOKEN_COMMENT_BEGIN, TOKEN_COMMENT,
135                            TOKEN_COMMENT_END, TOKEN_WHITESPACE,
136                            TOKEN_WHITESPACE, TOKEN_LINECOMMENT_BEGIN,
137                            TOKEN_LINECOMMENT_END, TOKEN_LINECOMMENT])
138ignore_if_empty = frozenset([TOKEN_WHITESPACE, TOKEN_DATA,
139                             TOKEN_COMMENT, TOKEN_LINECOMMENT])
140
141
142def _describe_token_type(token_type):
143    if token_type in reverse_operators:
144        return reverse_operators[token_type]
145    return {
146        TOKEN_COMMENT_BEGIN:        'begin of comment',
147        TOKEN_COMMENT_END:          'end of comment',
148        TOKEN_COMMENT:              'comment',
149        TOKEN_LINECOMMENT:          'comment',
150        TOKEN_BLOCK_BEGIN:          'begin of statement block',
151        TOKEN_BLOCK_END:            'end of statement block',
152        TOKEN_VARIABLE_BEGIN:       'begin of print statement',
153        TOKEN_VARIABLE_END:         'end of print statement',
154        TOKEN_LINESTATEMENT_BEGIN:  'begin of line statement',
155        TOKEN_LINESTATEMENT_END:    'end of line statement',
156        TOKEN_DATA:                 'template data / text',
157        TOKEN_EOF:                  'end of template'
158    }.get(token_type, token_type)
159
160
161def describe_token(token):
162    """Returns a description of the token."""
163    if token.type == 'name':
164        return token.value
165    return _describe_token_type(token.type)
166
167
168def describe_token_expr(expr):
169    """Like `describe_token` but for token expressions."""
170    if ':' in expr:
171        type, value = expr.split(':', 1)
172        if type == 'name':
173            return value
174    else:
175        type = expr
176    return _describe_token_type(type)
177
178
179def count_newlines(value):
180    """Count the number of newline characters in the string.  This is
181    useful for extensions that filter a stream.
182    """
183    return len(newline_re.findall(value))
184
185
186def compile_rules(environment):
187    """Compiles all the rules from the environment into a list of rules."""
188    e = re.escape
189    rules = [
190        (len(environment.comment_start_string), 'comment',
191         e(environment.comment_start_string)),
192        (len(environment.block_start_string), 'block',
193         e(environment.block_start_string)),
194        (len(environment.variable_start_string), 'variable',
195         e(environment.variable_start_string))
196    ]
197
198    if environment.line_statement_prefix is not None:
199        rules.append((len(environment.line_statement_prefix), 'linestatement',
200                      r'^\s*' + e(environment.line_statement_prefix)))
201    if environment.line_comment_prefix is not None:
202        rules.append((len(environment.line_comment_prefix), 'linecomment',
203                      r'(?:^|(?<=\S))[^\S\r\n]*' +
204                      e(environment.line_comment_prefix)))
205
206    return [x[1:] for x in sorted(rules, reverse=True)]
207
208
209class Failure(object):
210    """Class that raises a `TemplateSyntaxError` if called.
211    Used by the `Lexer` to specify known errors.
212    """
213
214    def __init__(self, message, cls=TemplateSyntaxError):
215        self.message = message
216        self.error_class = cls
217
218    def __call__(self, lineno, filename):
219        raise self.error_class(self.message, lineno, filename)
220
221
222class Token(tuple):
223    """Token class."""
224    __slots__ = ()
225    lineno, type, value = (property(itemgetter(x)) for x in range(3))
226
227    def __new__(cls, lineno, type, value):
228        return tuple.__new__(cls, (lineno, intern(str(type)), value))
229
230    def __str__(self):
231        if self.type in reverse_operators:
232            return reverse_operators[self.type]
233        elif self.type == 'name':
234            return self.value
235        return self.type
236
237    def test(self, expr):
238        """Test a token against a token expression.  This can either be a
239        token type or ``'token_type:token_value'``.  This can only test
240        against string values and types.
241        """
242        # here we do a regular string equality check as test_any is usually
243        # passed an iterable of not interned strings.
244        if self.type == expr:
245            return True
246        elif ':' in expr:
247            return expr.split(':', 1) == [self.type, self.value]
248        return False
249
250    def test_any(self, *iterable):
251        """Test against multiple token expressions."""
252        for expr in iterable:
253            if self.test(expr):
254                return True
255        return False
256
257    def __repr__(self):
258        return 'Token(%r, %r, %r)' % (
259            self.lineno,
260            self.type,
261            self.value
262        )
263
264
265class TokenStreamIterator(object):
266    """The iterator for tokenstreams.  Iterate over the stream
267    until the eof token is reached.
268    """
269
270    def __init__(self, stream):
271        self.stream = stream
272
273    def __iter__(self):
274        return self
275
276    def next(self):
277        token = self.stream.current
278        if token.type is TOKEN_EOF:
279            self.stream.close()
280            raise StopIteration()
281        next(self.stream)
282        return token
283
284
285class TokenStream(object):
286    """A token stream is an iterable that yields :class:`Token`\s.  The
287    parser however does not iterate over it but calls :meth:`next` to go
288    one token ahead.  The current active token is stored as :attr:`current`.
289    """
290
291    def __init__(self, generator, name, filename):
292        self._next = iter(generator).next
293        self._pushed = deque()
294        self.name = name
295        self.filename = filename
296        self.closed = False
297        self.current = Token(1, TOKEN_INITIAL, '')
298        next(self)
299
300    def __iter__(self):
301        return TokenStreamIterator(self)
302
303    def __nonzero__(self):
304        return bool(self._pushed) or self.current.type is not TOKEN_EOF
305
306    eos = property(lambda x: not x, doc="Are we at the end of the stream?")
307
308    def push(self, token):
309        """Push a token back to the stream."""
310        self._pushed.append(token)
311
312    def look(self):
313        """Look at the next token."""
314        old_token = next(self)
315        result = self.current
316        self.push(result)
317        self.current = old_token
318        return result
319
320    def skip(self, n=1):
321        """Got n tokens ahead."""
322        for x in xrange(n):
323            next(self)
324
325    def next_if(self, expr):
326        """Perform the token test and return the token if it matched.
327        Otherwise the return value is `None`.
328        """
329        if self.current.test(expr):
330            return next(self)
331
332    def skip_if(self, expr):
333        """Like :meth:`next_if` but only returns `True` or `False`."""
334        return self.next_if(expr) is not None
335
336    def next(self):
337        """Go one token ahead and return the old one"""
338        rv = self.current
339        if self._pushed:
340            self.current = self._pushed.popleft()
341        elif self.current.type is not TOKEN_EOF:
342            try:
343                self.current = self._next()
344            except StopIteration:
345                self.close()
346        return rv
347
348    def close(self):
349        """Close the stream."""
350        self.current = Token(self.current.lineno, TOKEN_EOF, '')
351        self._next = None
352        self.closed = True
353
354    def expect(self, expr):
355        """Expect a given token type and return it.  This accepts the same
356        argument as :meth:`jinja2.lexer.Token.test`.
357        """
358        if not self.current.test(expr):
359            expr = describe_token_expr(expr)
360            if self.current.type is TOKEN_EOF:
361                raise TemplateSyntaxError('unexpected end of template, '
362                                          'expected %r.' % expr,
363                                          self.current.lineno,
364                                          self.name, self.filename)
365            raise TemplateSyntaxError("expected token %r, got %r" %
366                                      (expr, describe_token(self.current)),
367                                      self.current.lineno,
368                                      self.name, self.filename)
369        try:
370            return self.current
371        finally:
372            next(self)
373
374
375def get_lexer(environment):
376    """Return a lexer which is probably cached."""
377    key = (environment.block_start_string,
378           environment.block_end_string,
379           environment.variable_start_string,
380           environment.variable_end_string,
381           environment.comment_start_string,
382           environment.comment_end_string,
383           environment.line_statement_prefix,
384           environment.line_comment_prefix,
385           environment.trim_blocks,
386           environment.newline_sequence)
387    lexer = _lexer_cache.get(key)
388    if lexer is None:
389        lexer = Lexer(environment)
390        _lexer_cache[key] = lexer
391    return lexer
392
393
394class Lexer(object):
395    """Class that implements a lexer for a given environment. Automatically
396    created by the environment class, usually you don't have to do that.
397
398    Note that the lexer is not automatically bound to an environment.
399    Multiple environments can share the same lexer.
400    """
401
402    def __init__(self, environment):
403        # shortcuts
404        c = lambda x: re.compile(x, re.M | re.S)
405        e = re.escape
406
407        # lexing rules for tags
408        tag_rules = [
409            (whitespace_re, TOKEN_WHITESPACE, None),
410            (float_re, TOKEN_FLOAT, None),
411            (integer_re, TOKEN_INTEGER, None),
412            (name_re, TOKEN_NAME, None),
413            (string_re, TOKEN_STRING, None),
414            (operator_re, TOKEN_OPERATOR, None)
415        ]
416
417        # assamble the root lexing rule. because "|" is ungreedy
418        # we have to sort by length so that the lexer continues working
419        # as expected when we have parsing rules like <% for block and
420        # <%= for variables. (if someone wants asp like syntax)
421        # variables are just part of the rules if variable processing
422        # is required.
423        root_tag_rules = compile_rules(environment)
424
425        # block suffix if trimming is enabled
426        block_suffix_re = environment.trim_blocks and '\\n?' or ''
427
428        self.newline_sequence = environment.newline_sequence
429
430        # global lexing rules
431        self.rules = {
432            'root': [
433                # directives
434                (c('(.*?)(?:%s)' % '|'.join(
435                    [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
436                        e(environment.block_start_string),
437                        e(environment.block_start_string),
438                        e(environment.block_end_string)
439                    )] + [
440                        r'(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
441                        for n, r in root_tag_rules
442                    ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
443                # data
444                (c('.+'), TOKEN_DATA, None)
445            ],
446            # comments
447            TOKEN_COMMENT_BEGIN: [
448                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
449                    e(environment.comment_end_string),
450                    e(environment.comment_end_string),
451                    block_suffix_re
452                )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'),
453                (c('(.)'), (Failure('Missing end of comment tag'),), None)
454            ],
455            # blocks
456            TOKEN_BLOCK_BEGIN: [
457                (c('(?:\-%s\s*|%s)%s' % (
458                    e(environment.block_end_string),
459                    e(environment.block_end_string),
460                    block_suffix_re
461                )), TOKEN_BLOCK_END, '#pop'),
462            ] + tag_rules,
463            # variables
464            TOKEN_VARIABLE_BEGIN: [
465                (c('\-%s\s*|%s' % (
466                    e(environment.variable_end_string),
467                    e(environment.variable_end_string)
468                )), TOKEN_VARIABLE_END, '#pop')
469            ] + tag_rules,
470            # raw block
471            TOKEN_RAW_BEGIN: [
472                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
473                    e(environment.block_start_string),
474                    e(environment.block_start_string),
475                    e(environment.block_end_string),
476                    e(environment.block_end_string),
477                    block_suffix_re
478                )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
479                (c('(.)'), (Failure('Missing end of raw directive'),), None)
480            ],
481            # line statements
482            TOKEN_LINESTATEMENT_BEGIN: [
483                (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop')
484            ] + tag_rules,
485            # line comments
486            TOKEN_LINECOMMENT_BEGIN: [
487                (c(r'(.*?)()(?=\n|$)'), (TOKEN_LINECOMMENT,
488                 TOKEN_LINECOMMENT_END), '#pop')
489            ]
490        }
491
492    def _normalize_newlines(self, value):
493        """Called for strings and template data to normlize it to unicode."""
494        return newline_re.sub(self.newline_sequence, value)
495
496    def tokenize(self, source, name=None, filename=None, state=None):
497        """Calls tokeniter + tokenize and wraps it in a token stream.
498        """
499        stream = self.tokeniter(source, name, filename, state)
500        return TokenStream(self.wrap(stream, name, filename), name, filename)
501
502    def wrap(self, stream, name=None, filename=None):
503        """This is called with the stream as returned by `tokenize` and wraps
504        every token in a :class:`Token` and converts the value.
505        """
506        for lineno, token, value in stream:
507            if token in ignored_tokens:
508                continue
509            elif token == 'linestatement_begin':
510                token = 'block_begin'
511            elif token == 'linestatement_end':
512                token = 'block_end'
513            # we are not interested in those tokens in the parser
514            elif token in ('raw_begin', 'raw_end'):
515                continue
516            elif token == 'data':
517                value = self._normalize_newlines(value)
518            elif token == 'keyword':
519                token = value
520            elif token == 'name':
521                value = str(value)
522            elif token == 'string':
523                # try to unescape string
524                try:
525                    value = self._normalize_newlines(value[1:-1]) \
526                        .encode('ascii', 'backslashreplace') \
527                        .decode('unicode-escape')
528                except Exception, e:
529                    msg = str(e).split(':')[-1].strip()
530                    raise TemplateSyntaxError(msg, lineno, name, filename)
531                # if we can express it as bytestring (ascii only)
532                # we do that for support of semi broken APIs
533                # as datetime.datetime.strftime.  On python 3 this
534                # call becomes a noop thanks to 2to3
535                try:
536                    value = str(value)
537                except UnicodeError:
538                    pass
539            elif token == 'integer':
540                value = int(value)
541            elif token == 'float':
542                value = float(value)
543            elif token == 'operator':
544                token = operators[value]
545            yield Token(lineno, token, value)
546
547    def tokeniter(self, source, name, filename=None, state=None):
548        """This method tokenizes the text and returns the tokens in a
549        generator.  Use this method if you just want to tokenize a template.
550        """
551        source = '\n'.join(unicode(source).splitlines())
552        pos = 0
553        lineno = 1
554        stack = ['root']
555        if state is not None and state != 'root':
556            assert state in ('variable', 'block'), 'invalid state'
557            stack.append(state + '_begin')
558        else:
559            state = 'root'
560        statetokens = self.rules[stack[-1]]
561        source_length = len(source)
562
563        balancing_stack = []
564
565        while 1:
566            # tokenizer loop
567            for regex, tokens, new_state in statetokens:
568                m = regex.match(source, pos)
569                # if no match we try again with the next rule
570                if m is None:
571                    continue
572
573                # we only match blocks and variables if brances / parentheses
574                # are balanced. continue parsing with the lower rule which
575                # is the operator rule. do this only if the end tags look
576                # like operators
577                if balancing_stack and \
578                   tokens in ('variable_end', 'block_end',
579                              'linestatement_end'):
580                    continue
581
582                # tuples support more options
583                if isinstance(tokens, tuple):
584                    for idx, token in enumerate(tokens):
585                        # failure group
586                        if token.__class__ is Failure:
587                            raise token(lineno, filename)
588                        # bygroup is a bit more complex, in that case we
589                        # yield for the current token the first named
590                        # group that matched
591                        elif token == '#bygroup':
592                            for key, value in m.groupdict().iteritems():
593                                if value is not None:
594                                    yield lineno, key, value
595                                    lineno += value.count('\n')
596                                    break
597                            else:
598                                raise RuntimeError('%r wanted to resolve '
599                                                   'the token dynamically'
600                                                   ' but no group matched'
601                                                   % regex)
602                        # normal group
603                        else:
604                            data = m.group(idx + 1)
605                            if data or token not in ignore_if_empty:
606                                yield lineno, token, data
607                            lineno += data.count('\n')
608
609                # strings as token just are yielded as it.
610                else:
611                    data = m.group()
612                    # update brace/parentheses balance
613                    if tokens == 'operator':
614                        if data == '{':
615                            balancing_stack.append('}')
616                        elif data == '(':
617                            balancing_stack.append(')')
618                        elif data == '[':
619                            balancing_stack.append(']')
620                        elif data in ('}', ')', ']'):
621                            if not balancing_stack:
622                                raise TemplateSyntaxError('unexpected \'%s\'' %
623                                                          data, lineno, name,
624                                                          filename)
625                            expected_op = balancing_stack.pop()
626                            if expected_op != data:
627                                raise TemplateSyntaxError('unexpected \'%s\', '
628                                                          'expected \'%s\'' %
629                                                          (data, expected_op),
630                                                          lineno, name,
631                                                          filename)
632                    # yield items
633                    if data or tokens not in ignore_if_empty:
634                        yield lineno, tokens, data
635                    lineno += data.count('\n')
636
637                # fetch new position into new variable so that we can check
638                # if there is a internal parsing error which would result
639                # in an infinite loop
640                pos2 = m.end()
641
642                # handle state changes
643                if new_state is not None:
644                    # remove the uppermost state
645                    if new_state == '#pop':
646                        stack.pop()
647                    # resolve the new state by group checking
648                    elif new_state == '#bygroup':
649                        for key, value in m.groupdict().iteritems():
650                            if value is not None:
651                                stack.append(key)
652                                break
653                        else:
654                            raise RuntimeError('%r wanted to resolve the '
655                                               'new state dynamically but'
656                                               ' no group matched' %
657                                               regex)
658                    # direct state name given
659                    else:
660                        stack.append(new_state)
661                    statetokens = self.rules[stack[-1]]
662                # we are still at the same position and no stack change.
663                # this means a loop without break condition, avoid that and
664                # raise error
665                elif pos2 == pos:
666                    raise RuntimeError('%r yielded empty string without '
667                                       'stack change' % regex)
668                # publish new function and start again
669                pos = pos2
670                break
671            # if loop terminated without break we havn't found a single match
672            # either we are at the end of the file or we have a problem
673            else:
674                # end of text
675                if pos >= source_length:
676                    return
677                # something went wrong
678                raise TemplateSyntaxError('unexpected char %r at %d' %
679                                          (source[pos], pos), lineno,
680                                          name, filename)
681