1# -*- coding: utf-8 -*- 2""" 3 jinja2.lexer 4 ~~~~~~~~~~~~ 5 6 This module implements a Jinja / Python combination lexer. The 7 `Lexer` class provided by this module is used to do some preprocessing 8 for Jinja. 9 10 On the one hand it filters out invalid operators like the bitshift 11 operators we don't allow in templates. On the other hand it separates 12 template code and python code in expressions. 13 14 :copyright: (c) 2010 by the Jinja Team. 15 :license: BSD, see LICENSE for more details. 16""" 17import re 18from operator import itemgetter 19from collections import deque 20from jinja2.exceptions import TemplateSyntaxError 21from jinja2.utils import LRUCache, next 22 23 24# cache for the lexers. Exists in order to be able to have multiple 25# environments with the same lexer 26_lexer_cache = LRUCache(50) 27 28# static regular expressions 29whitespace_re = re.compile(r'\s+', re.U) 30string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'" 31 r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S) 32integer_re = re.compile(r'\d+') 33 34# we use the unicode identifier rule if this python version is able 35# to handle unicode identifiers, otherwise the standard ASCII one. 36try: 37 compile('föö', '<unknown>', 'eval') 38except SyntaxError: 39 name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b') 40else: 41 from jinja2 import _stringdefs 42 name_re = re.compile(r'[%s][%s]*' % (_stringdefs.xid_start, 43 _stringdefs.xid_continue)) 44 45float_re = re.compile(r'(?<!\.)\d+\.\d+') 46newline_re = re.compile(r'(\r\n|\r|\n)') 47 48# internal the tokens and keep references to them 49TOKEN_ADD = intern('add') 50TOKEN_ASSIGN = intern('assign') 51TOKEN_COLON = intern('colon') 52TOKEN_COMMA = intern('comma') 53TOKEN_DIV = intern('div') 54TOKEN_DOT = intern('dot') 55TOKEN_EQ = intern('eq') 56TOKEN_FLOORDIV = intern('floordiv') 57TOKEN_GT = intern('gt') 58TOKEN_GTEQ = intern('gteq') 59TOKEN_LBRACE = intern('lbrace') 60TOKEN_LBRACKET = intern('lbracket') 61TOKEN_LPAREN = intern('lparen') 62TOKEN_LT = intern('lt') 63TOKEN_LTEQ = intern('lteq') 64TOKEN_MOD = intern('mod') 65TOKEN_MUL = intern('mul') 66TOKEN_NE = intern('ne') 67TOKEN_PIPE = intern('pipe') 68TOKEN_POW = intern('pow') 69TOKEN_RBRACE = intern('rbrace') 70TOKEN_RBRACKET = intern('rbracket') 71TOKEN_RPAREN = intern('rparen') 72TOKEN_SEMICOLON = intern('semicolon') 73TOKEN_SUB = intern('sub') 74TOKEN_TILDE = intern('tilde') 75TOKEN_WHITESPACE = intern('whitespace') 76TOKEN_FLOAT = intern('float') 77TOKEN_INTEGER = intern('integer') 78TOKEN_NAME = intern('name') 79TOKEN_STRING = intern('string') 80TOKEN_OPERATOR = intern('operator') 81TOKEN_BLOCK_BEGIN = intern('block_begin') 82TOKEN_BLOCK_END = intern('block_end') 83TOKEN_VARIABLE_BEGIN = intern('variable_begin') 84TOKEN_VARIABLE_END = intern('variable_end') 85TOKEN_RAW_BEGIN = intern('raw_begin') 86TOKEN_RAW_END = intern('raw_end') 87TOKEN_COMMENT_BEGIN = intern('comment_begin') 88TOKEN_COMMENT_END = intern('comment_end') 89TOKEN_COMMENT = intern('comment') 90TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin') 91TOKEN_LINESTATEMENT_END = intern('linestatement_end') 92TOKEN_LINECOMMENT_BEGIN = intern('linecomment_begin') 93TOKEN_LINECOMMENT_END = intern('linecomment_end') 94TOKEN_LINECOMMENT = intern('linecomment') 95TOKEN_DATA = intern('data') 96TOKEN_INITIAL = intern('initial') 97TOKEN_EOF = intern('eof') 98 99# bind operators to token types 100operators = { 101 '+': TOKEN_ADD, 102 '-': TOKEN_SUB, 103 '/': TOKEN_DIV, 104 '//': TOKEN_FLOORDIV, 105 '*': TOKEN_MUL, 106 '%': TOKEN_MOD, 107 '**': TOKEN_POW, 108 '~': TOKEN_TILDE, 109 '[': TOKEN_LBRACKET, 110 ']': TOKEN_RBRACKET, 111 '(': TOKEN_LPAREN, 112 ')': TOKEN_RPAREN, 113 '{': TOKEN_LBRACE, 114 '}': TOKEN_RBRACE, 115 '==': TOKEN_EQ, 116 '!=': TOKEN_NE, 117 '>': TOKEN_GT, 118 '>=': TOKEN_GTEQ, 119 '<': TOKEN_LT, 120 '<=': TOKEN_LTEQ, 121 '=': TOKEN_ASSIGN, 122 '.': TOKEN_DOT, 123 ':': TOKEN_COLON, 124 '|': TOKEN_PIPE, 125 ',': TOKEN_COMMA, 126 ';': TOKEN_SEMICOLON 127} 128 129reverse_operators = dict([(v, k) for k, v in operators.iteritems()]) 130assert len(operators) == len(reverse_operators), 'operators dropped' 131operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in 132 sorted(operators, key=lambda x: -len(x)))) 133 134ignored_tokens = frozenset([TOKEN_COMMENT_BEGIN, TOKEN_COMMENT, 135 TOKEN_COMMENT_END, TOKEN_WHITESPACE, 136 TOKEN_WHITESPACE, TOKEN_LINECOMMENT_BEGIN, 137 TOKEN_LINECOMMENT_END, TOKEN_LINECOMMENT]) 138ignore_if_empty = frozenset([TOKEN_WHITESPACE, TOKEN_DATA, 139 TOKEN_COMMENT, TOKEN_LINECOMMENT]) 140 141 142def _describe_token_type(token_type): 143 if token_type in reverse_operators: 144 return reverse_operators[token_type] 145 return { 146 TOKEN_COMMENT_BEGIN: 'begin of comment', 147 TOKEN_COMMENT_END: 'end of comment', 148 TOKEN_COMMENT: 'comment', 149 TOKEN_LINECOMMENT: 'comment', 150 TOKEN_BLOCK_BEGIN: 'begin of statement block', 151 TOKEN_BLOCK_END: 'end of statement block', 152 TOKEN_VARIABLE_BEGIN: 'begin of print statement', 153 TOKEN_VARIABLE_END: 'end of print statement', 154 TOKEN_LINESTATEMENT_BEGIN: 'begin of line statement', 155 TOKEN_LINESTATEMENT_END: 'end of line statement', 156 TOKEN_DATA: 'template data / text', 157 TOKEN_EOF: 'end of template' 158 }.get(token_type, token_type) 159 160 161def describe_token(token): 162 """Returns a description of the token.""" 163 if token.type == 'name': 164 return token.value 165 return _describe_token_type(token.type) 166 167 168def describe_token_expr(expr): 169 """Like `describe_token` but for token expressions.""" 170 if ':' in expr: 171 type, value = expr.split(':', 1) 172 if type == 'name': 173 return value 174 else: 175 type = expr 176 return _describe_token_type(type) 177 178 179def count_newlines(value): 180 """Count the number of newline characters in the string. This is 181 useful for extensions that filter a stream. 182 """ 183 return len(newline_re.findall(value)) 184 185 186def compile_rules(environment): 187 """Compiles all the rules from the environment into a list of rules.""" 188 e = re.escape 189 rules = [ 190 (len(environment.comment_start_string), 'comment', 191 e(environment.comment_start_string)), 192 (len(environment.block_start_string), 'block', 193 e(environment.block_start_string)), 194 (len(environment.variable_start_string), 'variable', 195 e(environment.variable_start_string)) 196 ] 197 198 if environment.line_statement_prefix is not None: 199 rules.append((len(environment.line_statement_prefix), 'linestatement', 200 r'^\s*' + e(environment.line_statement_prefix))) 201 if environment.line_comment_prefix is not None: 202 rules.append((len(environment.line_comment_prefix), 'linecomment', 203 r'(?:^|(?<=\S))[^\S\r\n]*' + 204 e(environment.line_comment_prefix))) 205 206 return [x[1:] for x in sorted(rules, reverse=True)] 207 208 209class Failure(object): 210 """Class that raises a `TemplateSyntaxError` if called. 211 Used by the `Lexer` to specify known errors. 212 """ 213 214 def __init__(self, message, cls=TemplateSyntaxError): 215 self.message = message 216 self.error_class = cls 217 218 def __call__(self, lineno, filename): 219 raise self.error_class(self.message, lineno, filename) 220 221 222class Token(tuple): 223 """Token class.""" 224 __slots__ = () 225 lineno, type, value = (property(itemgetter(x)) for x in range(3)) 226 227 def __new__(cls, lineno, type, value): 228 return tuple.__new__(cls, (lineno, intern(str(type)), value)) 229 230 def __str__(self): 231 if self.type in reverse_operators: 232 return reverse_operators[self.type] 233 elif self.type == 'name': 234 return self.value 235 return self.type 236 237 def test(self, expr): 238 """Test a token against a token expression. This can either be a 239 token type or ``'token_type:token_value'``. This can only test 240 against string values and types. 241 """ 242 # here we do a regular string equality check as test_any is usually 243 # passed an iterable of not interned strings. 244 if self.type == expr: 245 return True 246 elif ':' in expr: 247 return expr.split(':', 1) == [self.type, self.value] 248 return False 249 250 def test_any(self, *iterable): 251 """Test against multiple token expressions.""" 252 for expr in iterable: 253 if self.test(expr): 254 return True 255 return False 256 257 def __repr__(self): 258 return 'Token(%r, %r, %r)' % ( 259 self.lineno, 260 self.type, 261 self.value 262 ) 263 264 265class TokenStreamIterator(object): 266 """The iterator for tokenstreams. Iterate over the stream 267 until the eof token is reached. 268 """ 269 270 def __init__(self, stream): 271 self.stream = stream 272 273 def __iter__(self): 274 return self 275 276 def next(self): 277 token = self.stream.current 278 if token.type is TOKEN_EOF: 279 self.stream.close() 280 raise StopIteration() 281 next(self.stream) 282 return token 283 284 285class TokenStream(object): 286 """A token stream is an iterable that yields :class:`Token`\s. The 287 parser however does not iterate over it but calls :meth:`next` to go 288 one token ahead. The current active token is stored as :attr:`current`. 289 """ 290 291 def __init__(self, generator, name, filename): 292 self._next = iter(generator).next 293 self._pushed = deque() 294 self.name = name 295 self.filename = filename 296 self.closed = False 297 self.current = Token(1, TOKEN_INITIAL, '') 298 next(self) 299 300 def __iter__(self): 301 return TokenStreamIterator(self) 302 303 def __nonzero__(self): 304 return bool(self._pushed) or self.current.type is not TOKEN_EOF 305 306 eos = property(lambda x: not x, doc="Are we at the end of the stream?") 307 308 def push(self, token): 309 """Push a token back to the stream.""" 310 self._pushed.append(token) 311 312 def look(self): 313 """Look at the next token.""" 314 old_token = next(self) 315 result = self.current 316 self.push(result) 317 self.current = old_token 318 return result 319 320 def skip(self, n=1): 321 """Got n tokens ahead.""" 322 for x in xrange(n): 323 next(self) 324 325 def next_if(self, expr): 326 """Perform the token test and return the token if it matched. 327 Otherwise the return value is `None`. 328 """ 329 if self.current.test(expr): 330 return next(self) 331 332 def skip_if(self, expr): 333 """Like :meth:`next_if` but only returns `True` or `False`.""" 334 return self.next_if(expr) is not None 335 336 def next(self): 337 """Go one token ahead and return the old one""" 338 rv = self.current 339 if self._pushed: 340 self.current = self._pushed.popleft() 341 elif self.current.type is not TOKEN_EOF: 342 try: 343 self.current = self._next() 344 except StopIteration: 345 self.close() 346 return rv 347 348 def close(self): 349 """Close the stream.""" 350 self.current = Token(self.current.lineno, TOKEN_EOF, '') 351 self._next = None 352 self.closed = True 353 354 def expect(self, expr): 355 """Expect a given token type and return it. This accepts the same 356 argument as :meth:`jinja2.lexer.Token.test`. 357 """ 358 if not self.current.test(expr): 359 expr = describe_token_expr(expr) 360 if self.current.type is TOKEN_EOF: 361 raise TemplateSyntaxError('unexpected end of template, ' 362 'expected %r.' % expr, 363 self.current.lineno, 364 self.name, self.filename) 365 raise TemplateSyntaxError("expected token %r, got %r" % 366 (expr, describe_token(self.current)), 367 self.current.lineno, 368 self.name, self.filename) 369 try: 370 return self.current 371 finally: 372 next(self) 373 374 375def get_lexer(environment): 376 """Return a lexer which is probably cached.""" 377 key = (environment.block_start_string, 378 environment.block_end_string, 379 environment.variable_start_string, 380 environment.variable_end_string, 381 environment.comment_start_string, 382 environment.comment_end_string, 383 environment.line_statement_prefix, 384 environment.line_comment_prefix, 385 environment.trim_blocks, 386 environment.newline_sequence) 387 lexer = _lexer_cache.get(key) 388 if lexer is None: 389 lexer = Lexer(environment) 390 _lexer_cache[key] = lexer 391 return lexer 392 393 394class Lexer(object): 395 """Class that implements a lexer for a given environment. Automatically 396 created by the environment class, usually you don't have to do that. 397 398 Note that the lexer is not automatically bound to an environment. 399 Multiple environments can share the same lexer. 400 """ 401 402 def __init__(self, environment): 403 # shortcuts 404 c = lambda x: re.compile(x, re.M | re.S) 405 e = re.escape 406 407 # lexing rules for tags 408 tag_rules = [ 409 (whitespace_re, TOKEN_WHITESPACE, None), 410 (float_re, TOKEN_FLOAT, None), 411 (integer_re, TOKEN_INTEGER, None), 412 (name_re, TOKEN_NAME, None), 413 (string_re, TOKEN_STRING, None), 414 (operator_re, TOKEN_OPERATOR, None) 415 ] 416 417 # assamble the root lexing rule. because "|" is ungreedy 418 # we have to sort by length so that the lexer continues working 419 # as expected when we have parsing rules like <% for block and 420 # <%= for variables. (if someone wants asp like syntax) 421 # variables are just part of the rules if variable processing 422 # is required. 423 root_tag_rules = compile_rules(environment) 424 425 # block suffix if trimming is enabled 426 block_suffix_re = environment.trim_blocks and '\\n?' or '' 427 428 self.newline_sequence = environment.newline_sequence 429 430 # global lexing rules 431 self.rules = { 432 'root': [ 433 # directives 434 (c('(.*?)(?:%s)' % '|'.join( 435 [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % ( 436 e(environment.block_start_string), 437 e(environment.block_start_string), 438 e(environment.block_end_string) 439 )] + [ 440 r'(?P<%s_begin>\s*%s\-|%s)' % (n, r, r) 441 for n, r in root_tag_rules 442 ])), (TOKEN_DATA, '#bygroup'), '#bygroup'), 443 # data 444 (c('.+'), TOKEN_DATA, None) 445 ], 446 # comments 447 TOKEN_COMMENT_BEGIN: [ 448 (c(r'(.*?)((?:\-%s\s*|%s)%s)' % ( 449 e(environment.comment_end_string), 450 e(environment.comment_end_string), 451 block_suffix_re 452 )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'), 453 (c('(.)'), (Failure('Missing end of comment tag'),), None) 454 ], 455 # blocks 456 TOKEN_BLOCK_BEGIN: [ 457 (c('(?:\-%s\s*|%s)%s' % ( 458 e(environment.block_end_string), 459 e(environment.block_end_string), 460 block_suffix_re 461 )), TOKEN_BLOCK_END, '#pop'), 462 ] + tag_rules, 463 # variables 464 TOKEN_VARIABLE_BEGIN: [ 465 (c('\-%s\s*|%s' % ( 466 e(environment.variable_end_string), 467 e(environment.variable_end_string) 468 )), TOKEN_VARIABLE_END, '#pop') 469 ] + tag_rules, 470 # raw block 471 TOKEN_RAW_BEGIN: [ 472 (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % ( 473 e(environment.block_start_string), 474 e(environment.block_start_string), 475 e(environment.block_end_string), 476 e(environment.block_end_string), 477 block_suffix_re 478 )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'), 479 (c('(.)'), (Failure('Missing end of raw directive'),), None) 480 ], 481 # line statements 482 TOKEN_LINESTATEMENT_BEGIN: [ 483 (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop') 484 ] + tag_rules, 485 # line comments 486 TOKEN_LINECOMMENT_BEGIN: [ 487 (c(r'(.*?)()(?=\n|$)'), (TOKEN_LINECOMMENT, 488 TOKEN_LINECOMMENT_END), '#pop') 489 ] 490 } 491 492 def _normalize_newlines(self, value): 493 """Called for strings and template data to normlize it to unicode.""" 494 return newline_re.sub(self.newline_sequence, value) 495 496 def tokenize(self, source, name=None, filename=None, state=None): 497 """Calls tokeniter + tokenize and wraps it in a token stream. 498 """ 499 stream = self.tokeniter(source, name, filename, state) 500 return TokenStream(self.wrap(stream, name, filename), name, filename) 501 502 def wrap(self, stream, name=None, filename=None): 503 """This is called with the stream as returned by `tokenize` and wraps 504 every token in a :class:`Token` and converts the value. 505 """ 506 for lineno, token, value in stream: 507 if token in ignored_tokens: 508 continue 509 elif token == 'linestatement_begin': 510 token = 'block_begin' 511 elif token == 'linestatement_end': 512 token = 'block_end' 513 # we are not interested in those tokens in the parser 514 elif token in ('raw_begin', 'raw_end'): 515 continue 516 elif token == 'data': 517 value = self._normalize_newlines(value) 518 elif token == 'keyword': 519 token = value 520 elif token == 'name': 521 value = str(value) 522 elif token == 'string': 523 # try to unescape string 524 try: 525 value = self._normalize_newlines(value[1:-1]) \ 526 .encode('ascii', 'backslashreplace') \ 527 .decode('unicode-escape') 528 except Exception, e: 529 msg = str(e).split(':')[-1].strip() 530 raise TemplateSyntaxError(msg, lineno, name, filename) 531 # if we can express it as bytestring (ascii only) 532 # we do that for support of semi broken APIs 533 # as datetime.datetime.strftime. On python 3 this 534 # call becomes a noop thanks to 2to3 535 try: 536 value = str(value) 537 except UnicodeError: 538 pass 539 elif token == 'integer': 540 value = int(value) 541 elif token == 'float': 542 value = float(value) 543 elif token == 'operator': 544 token = operators[value] 545 yield Token(lineno, token, value) 546 547 def tokeniter(self, source, name, filename=None, state=None): 548 """This method tokenizes the text and returns the tokens in a 549 generator. Use this method if you just want to tokenize a template. 550 """ 551 source = '\n'.join(unicode(source).splitlines()) 552 pos = 0 553 lineno = 1 554 stack = ['root'] 555 if state is not None and state != 'root': 556 assert state in ('variable', 'block'), 'invalid state' 557 stack.append(state + '_begin') 558 else: 559 state = 'root' 560 statetokens = self.rules[stack[-1]] 561 source_length = len(source) 562 563 balancing_stack = [] 564 565 while 1: 566 # tokenizer loop 567 for regex, tokens, new_state in statetokens: 568 m = regex.match(source, pos) 569 # if no match we try again with the next rule 570 if m is None: 571 continue 572 573 # we only match blocks and variables if brances / parentheses 574 # are balanced. continue parsing with the lower rule which 575 # is the operator rule. do this only if the end tags look 576 # like operators 577 if balancing_stack and \ 578 tokens in ('variable_end', 'block_end', 579 'linestatement_end'): 580 continue 581 582 # tuples support more options 583 if isinstance(tokens, tuple): 584 for idx, token in enumerate(tokens): 585 # failure group 586 if token.__class__ is Failure: 587 raise token(lineno, filename) 588 # bygroup is a bit more complex, in that case we 589 # yield for the current token the first named 590 # group that matched 591 elif token == '#bygroup': 592 for key, value in m.groupdict().iteritems(): 593 if value is not None: 594 yield lineno, key, value 595 lineno += value.count('\n') 596 break 597 else: 598 raise RuntimeError('%r wanted to resolve ' 599 'the token dynamically' 600 ' but no group matched' 601 % regex) 602 # normal group 603 else: 604 data = m.group(idx + 1) 605 if data or token not in ignore_if_empty: 606 yield lineno, token, data 607 lineno += data.count('\n') 608 609 # strings as token just are yielded as it. 610 else: 611 data = m.group() 612 # update brace/parentheses balance 613 if tokens == 'operator': 614 if data == '{': 615 balancing_stack.append('}') 616 elif data == '(': 617 balancing_stack.append(')') 618 elif data == '[': 619 balancing_stack.append(']') 620 elif data in ('}', ')', ']'): 621 if not balancing_stack: 622 raise TemplateSyntaxError('unexpected \'%s\'' % 623 data, lineno, name, 624 filename) 625 expected_op = balancing_stack.pop() 626 if expected_op != data: 627 raise TemplateSyntaxError('unexpected \'%s\', ' 628 'expected \'%s\'' % 629 (data, expected_op), 630 lineno, name, 631 filename) 632 # yield items 633 if data or tokens not in ignore_if_empty: 634 yield lineno, tokens, data 635 lineno += data.count('\n') 636 637 # fetch new position into new variable so that we can check 638 # if there is a internal parsing error which would result 639 # in an infinite loop 640 pos2 = m.end() 641 642 # handle state changes 643 if new_state is not None: 644 # remove the uppermost state 645 if new_state == '#pop': 646 stack.pop() 647 # resolve the new state by group checking 648 elif new_state == '#bygroup': 649 for key, value in m.groupdict().iteritems(): 650 if value is not None: 651 stack.append(key) 652 break 653 else: 654 raise RuntimeError('%r wanted to resolve the ' 655 'new state dynamically but' 656 ' no group matched' % 657 regex) 658 # direct state name given 659 else: 660 stack.append(new_state) 661 statetokens = self.rules[stack[-1]] 662 # we are still at the same position and no stack change. 663 # this means a loop without break condition, avoid that and 664 # raise error 665 elif pos2 == pos: 666 raise RuntimeError('%r yielded empty string without ' 667 'stack change' % regex) 668 # publish new function and start again 669 pos = pos2 670 break 671 # if loop terminated without break we havn't found a single match 672 # either we are at the end of the file or we have a problem 673 else: 674 # end of text 675 if pos >= source_length: 676 return 677 # something went wrong 678 raise TemplateSyntaxError('unexpected char %r at %d' % 679 (source[pos], pos), lineno, 680 name, filename) 681