<?php

/*
 * This file is part of Twig.
 *
 * (c) Fabien Potencier
 * (c) Armin Ronacher
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace Twig;

use Twig\Error\SyntaxError;

/**
 * Lexes a template string.
 *
 * The lexer is a small state machine: it starts in STATE_DATA (plain template
 * text) and switches to STATE_BLOCK / STATE_VAR when it meets a block or
 * variable opening tag, and to STATE_STRING / STATE_INTERPOLATION while it is
 * inside a double-quoted string. Previous states are kept on a stack so that
 * nested constructs return to the right state when they end.
 *
 * @author Fabien Potencier <fabien@symfony.com>
 */
class Lexer implements \Twig_LexerInterface
{
    // Token objects produced so far for the template being lexed.
    protected $tokens;
    // Template code with "\r\n" and "\r" normalized to "\n".
    protected $code;
    // Byte offset of the next character to lex in $code.
    protected $cursor;
    // Current line number; starts at 1 and can be reset by a "line" block.
    protected $lineno;
    // Byte length of $code.
    protected $end;
    // Current state (one of the STATE_* constants).
    protected $state;
    // Stack of the states that were active before the current one.
    protected $states;
    // Stack of [opening delimiter, line number] pairs still waiting to be closed.
    protected $brackets;
    protected $env;
    // to be renamed to $name in 2.0 (where it is private)
    protected $filename;
    protected $options;
    // Regular expressions built once in the constructor from $options.
    protected $regexes;
    // Index into $positions of the last tag start that was consumed (-1 before the first one).
    protected $position;
    // Result of preg_match_all() for "lex_tokens_start" (PREG_OFFSET_CAPTURE).
    protected $positions;
    // Line on which the block/variable currently being lexed was opened (used in error messages).
    protected $currentVarBlockLine;

    private $source;

    const STATE_DATA = 0;
    const STATE_BLOCK = 1;
    const STATE_VAR = 2;
    const STATE_STRING = 3;
    const STATE_INTERPOLATION = 4;

    const REGEX_NAME = '/[a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*/A';
    const REGEX_NUMBER = '/[0-9]+(?:\.[0-9]+)?([Ee][\+\-][0-9]+)?/A';
    const REGEX_STRING = '/"([^#"\\\\]*(?:\\\\.[^#"\\\\]*)*)"|\'([^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'/As';
    const REGEX_DQ_STRING_DELIM = '/"/A';
    const REGEX_DQ_STRING_PART = '/[^#"\\\\]*(?:(?:\\\\.|#(?!\{))[^#"\\\\]*)*/As';
    const PUNCTUATION = '()[]{}?:.,|';

    /**
     * Stores the environment and pre-compiles every regular expression the
     * lexer needs from the (possibly customized) delimiter options.
     *
     * @param Environment $env     Environment providing the unary/binary operators
     * @param array       $options Overrides for the default delimiters and trim characters
     */
    public function __construct(Environment $env, array $options = [])
    {
        $this->env = $env;

        $this->options = array_merge([
            'tag_comment' => ['{#', '#}'],
            'tag_block' => ['{%', '%}'],
            'tag_variable' => ['{{', '}}'],
            'whitespace_trim' => '-',
            'whitespace_line_trim' => '~',
            // kept as regex escape sequences: this string is injected verbatim into a character class
            'whitespace_line_chars' => ' \t\0\x0B',
            'interpolation' => ['#{', '}'],
        ], $options);

        // when PHP 7.3 is the min version, we will be able to remove the '#' part in preg_quote as it's part of the default
        $this->regexes = [
            // }}
            'lex_var' => '{
                \s*
                (?:'.
                    preg_quote($this->options['whitespace_trim'].$this->options['tag_variable'][1], '#').'\s*'. // -}}\s*
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_variable'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~}}[ \t\0\x0B]*
                    '|'.
                    preg_quote($this->options['tag_variable'][1], '#'). // }}
                ')
            }Ax',

            // %}
            'lex_block' => '{
                \s*
                (?:'.
                    preg_quote($this->options['whitespace_trim'].$this->options['tag_block'][1], '#').'\s*\n?'. // -%}\s*\n?
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_block'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~%}[ \t\0\x0B]*
                    '|'.
                    preg_quote($this->options['tag_block'][1], '#').'\n?'. // %}\n?
                ')
            }Ax',

            // {% endverbatim %}
            // The trim characters are quoted like everywhere else so that a
            // custom trim character that is a regex metacharacter cannot
            // corrupt the pattern.
            'lex_raw_data' => '{'.
                preg_quote($this->options['tag_block'][0], '#'). // {%
                '('.
                    preg_quote($this->options['whitespace_trim'], '#'). // -
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'], '#'). // ~
                ')?\s*'.
                '(?:end%s)'. // endraw or endverbatim
                '\s*'.
                '(?:'.
                    preg_quote($this->options['whitespace_trim'].$this->options['tag_block'][1], '#').'\s*'. // -%}
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_block'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~%}[ \t\0\x0B]*
                    '|'.
                    preg_quote($this->options['tag_block'][1], '#'). // %}
                ')
            }sx',

            'operator' => $this->getOperatorRegex(),

            // #}
            'lex_comment' => '{
                (?:'.
                    preg_quote($this->options['whitespace_trim'].$this->options['tag_comment'][1], '#').'\s*\n?'. // -#}\s*\n?
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_comment'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~#}[ \t\0\x0B]*
                    '|'.
                    preg_quote($this->options['tag_comment'][1], '#').'\n?'. // #}\n?
                ')
            }sx',

            // verbatim %}
            'lex_block_raw' => '{
                \s*
                (raw|verbatim)
                \s*
                (?:'.
                    preg_quote($this->options['whitespace_trim'].$this->options['tag_block'][1], '#').'\s*'. // -%}\s*
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_block'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~%}[ \t\0\x0B]*
                    '|'.
                    preg_quote($this->options['tag_block'][1], '#'). // %}
                ')
            }Asx',

            'lex_block_line' => '{\s*line\s+(\d+)\s*'.preg_quote($this->options['tag_block'][1], '#').'}As',

            // {{ or {% or {#
            'lex_tokens_start' => '{
                ('.
                    preg_quote($this->options['tag_variable'][0], '#'). // {{
                    '|'.
                    preg_quote($this->options['tag_block'][0], '#'). // {%
                    '|'.
                    preg_quote($this->options['tag_comment'][0], '#'). // {#
                ')('.
                    preg_quote($this->options['whitespace_trim'], '#'). // -
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'], '#'). // ~
                ')?
            }sx',
            'interpolation_start' => '{'.preg_quote($this->options['interpolation'][0], '#').'\s*}A',
            'interpolation_end' => '{\s*'.preg_quote($this->options['interpolation'][1], '#').'}A',
        ];
    }

    /**
     * Tokenizes a template.
     *
     * @param Source|string $code The template source (passing a string is deprecated)
     * @param string|null   $name The template name, only used when $code is a string
     *
     * @return TokenStream
     *
     * @throws SyntaxError When the template contains an unclosed or unexpected construct
     */
    public function tokenize($code, $name = null)
    {
        if (!$code instanceof Source) {
            @trigger_error(sprintf('Passing a string as the $code argument of %s() is deprecated since version 1.27 and will be removed in 2.0. Pass a \Twig\Source instance instead.', __METHOD__), E_USER_DEPRECATED);
            $this->source = new Source($code, $name);
        } else {
            $this->source = $code;
        }

        if (((int) ini_get('mbstring.func_overload')) & 2) {
            @trigger_error('Support for having "mbstring.func_overload" different from 0 is deprecated version 1.29 and will be removed in 2.0.', E_USER_DEPRECATED);
        }

        // When string functions are overloaded by mbstring, force a single-byte
        // encoding so that strlen()/substr() keep working on byte offsets.
        if (\function_exists('mb_internal_encoding') && ((int) ini_get('mbstring.func_overload')) & 2) {
            $mbEncoding = mb_internal_encoding();
            mb_internal_encoding('ASCII');
        } else {
            $mbEncoding = null;
        }

        // The internal encoding is process-wide state: it must be restored even
        // when lexing fails, otherwise a SyntaxError would leave it set to ASCII.
        // Catch-and-rethrow is used instead of "finally" to stay parseable on
        // older PHP versions.
        try {
            $this->code = str_replace(["\r\n", "\r"], "\n", $this->source->getCode());
            $this->filename = $this->source->getName();
            $this->cursor = 0;
            $this->lineno = 1;
            $this->end = \strlen($this->code);
            $this->tokens = [];
            $this->state = self::STATE_DATA;
            $this->states = [];
            $this->brackets = [];
            $this->position = -1;

            // find all token starts in one go
            preg_match_all($this->regexes['lex_tokens_start'], $this->code, $matches, PREG_OFFSET_CAPTURE);
            $this->positions = $matches;

            while ($this->cursor < $this->end) {
                // dispatch to the lexing functions depending
                // on the current state
                switch ($this->state) {
                    case self::STATE_DATA:
                        $this->lexData();
                        break;

                    case self::STATE_BLOCK:
                        $this->lexBlock();
                        break;

                    case self::STATE_VAR:
                        $this->lexVar();
                        break;

                    case self::STATE_STRING:
                        $this->lexString();
                        break;

                    case self::STATE_INTERPOLATION:
                        $this->lexInterpolation();
                        break;
                }
            }

            $this->pushToken(Token::EOF_TYPE);

            if (!empty($this->brackets)) {
                list($expect, $lineno) = array_pop($this->brackets);
                throw new SyntaxError(sprintf('Unclosed "%s".', $expect), $lineno, $this->source);
            }
        } catch (\Exception $e) {
            if ($mbEncoding) {
                mb_internal_encoding($mbEncoding);
            }

            throw $e;
        } catch (\Throwable $e) {
            if ($mbEncoding) {
                mb_internal_encoding($mbEncoding);
            }

            throw $e;
        }

        if ($mbEncoding) {
            mb_internal_encoding($mbEncoding);
        }

        return new TokenStream($this->tokens, $this->source);
    }

    /**
     * Lexes plain template text up to the next tag start, then handles that
     * tag start (comment, block or variable).
     *
     * @throws SyntaxError When a comment or a raw/verbatim block is not closed
     */
    protected function lexData()
    {
        // if no matches are left we return the rest of the template as simple text token
        if ($this->position == \count($this->positions[0]) - 1) {
            $this->pushToken(Token::TEXT_TYPE, substr($this->code, $this->cursor));
            $this->cursor = $this->end;

            return;
        }

        // Find the first token after the current cursor
        $position = $this->positions[0][++$this->position];
        while ($position[1] < $this->cursor) {
            if ($this->position == \count($this->positions[0]) - 1) {
                // every remaining tag start lies behind the cursor; the next
                // call emits the rest of the template as text
                return;
            }
            $position = $this->positions[0][++$this->position];
        }

        // push the template text first
        // ($textContent keeps the untrimmed text so that the cursor and line
        // number advance by what was really consumed)
        $text = $textContent = substr($this->code, $this->cursor, $position[1] - $this->cursor);

        // trim?
        if (isset($this->positions[2][$this->position][0])) {
            if ($this->options['whitespace_trim'] === $this->positions[2][$this->position][0]) {
                // whitespace_trim detected ({%-, {{- or {#-)
                $text = rtrim($text);
            } elseif ($this->options['whitespace_line_trim'] === $this->positions[2][$this->position][0]) {
                // whitespace_line_trim detected ({%~, {{~ or {#~)
                // don't trim \r and \n
                $text = rtrim($text, " \t\0\x0B");
            }
        }
        $this->pushToken(Token::TEXT_TYPE, $text);
        $this->moveCursor($textContent.$position[0]);

        switch ($this->positions[1][$this->position][0]) {
            case $this->options['tag_comment'][0]:
                $this->lexComment();
                break;

            case $this->options['tag_block'][0]:
                // raw data?
                if (preg_match($this->regexes['lex_block_raw'], $this->code, $match, 0, $this->cursor)) {
                    $this->moveCursor($match[0]);
                    $this->lexRawData($match[1]);
                // {% line \d+ %}
                } elseif (preg_match($this->regexes['lex_block_line'], $this->code, $match, 0, $this->cursor)) {
                    $this->moveCursor($match[0]);
                    $this->lineno = (int) $match[1];
                } else {
                    $this->pushToken(Token::BLOCK_START_TYPE);
                    $this->pushState(self::STATE_BLOCK);
                    $this->currentVarBlockLine = $this->lineno;
                }
                break;

            case $this->options['tag_variable'][0]:
                $this->pushToken(Token::VAR_START_TYPE);
                $this->pushState(self::STATE_VAR);
                $this->currentVarBlockLine = $this->lineno;
                break;
        }
    }

    /**
     * Lexes the inside of a block tag: either its end delimiter (only when no
     * bracket is open) or the next piece of an expression.
     */
    protected function lexBlock()
    {
        if (empty($this->brackets) && preg_match($this->regexes['lex_block'], $this->code, $match, 0, $this->cursor)) {
            $this->pushToken(Token::BLOCK_END_TYPE);
            $this->moveCursor($match[0]);
            $this->popState();
        } else {
            $this->lexExpression();
        }
    }

    /**
     * Lexes the inside of a variable tag: either its end delimiter (only when
     * no bracket is open) or the next piece of an expression.
     */
    protected function lexVar()
    {
        if (empty($this->brackets) && preg_match($this->regexes['lex_var'], $this->code, $match, 0, $this->cursor)) {
            $this->pushToken(Token::VAR_END_TYPE);
            $this->moveCursor($match[0]);
            $this->popState();
        } else {
            $this->lexExpression();
        }
    }

    /**
     * Lexes a single expression token (arrow, operator, name, number,
     * punctuation, string or the start of a double-quoted string) after
     * skipping leading whitespace.
     *
     * @throws SyntaxError On an unclosed tag, a mismatched bracket or an unlexable character
     */
    protected function lexExpression()
    {
        // whitespace
        if (preg_match('/\s+/A', $this->code, $match, 0, $this->cursor)) {
            $this->moveCursor($match[0]);

            if ($this->cursor >= $this->end) {
                throw new SyntaxError(sprintf('Unclosed "%s".', self::STATE_BLOCK === $this->state ? 'block' : 'variable'), $this->currentVarBlockLine, $this->source);
            }
        }

        // arrow function
        // (the bounds check avoids reading past the end of the code when "="
        // is the very last character of the template)
        if ('=' === $this->code[$this->cursor] && ($this->cursor + 1 < $this->end) && '>' === $this->code[$this->cursor + 1]) {
            $this->pushToken(Token::ARROW_TYPE, '=>');
            $this->moveCursor('=>');
        }
        // operators
        elseif (preg_match($this->regexes['operator'], $this->code, $match, 0, $this->cursor)) {
            $this->pushToken(Token::OPERATOR_TYPE, preg_replace('/\s+/', ' ', $match[0]));
            $this->moveCursor($match[0]);
        }
        // names
        elseif (preg_match(self::REGEX_NAME, $this->code, $match, 0, $this->cursor)) {
            $this->pushToken(Token::NAME_TYPE, $match[0]);
            $this->moveCursor($match[0]);
        }
        // numbers
        elseif (preg_match(self::REGEX_NUMBER, $this->code, $match, 0, $this->cursor)) {
            $number = (float) $match[0];  // floats
            if (ctype_digit($match[0]) && $number <= PHP_INT_MAX) {
                $number = (int) $match[0]; // integers lower than the maximum
            }
            $this->pushToken(Token::NUMBER_TYPE, $number);
            $this->moveCursor($match[0]);
        }
        // punctuation
        elseif (false !== strpos(self::PUNCTUATION, $this->code[$this->cursor])) {
            // opening bracket
            if (false !== strpos('([{', $this->code[$this->cursor])) {
                $this->brackets[] = [$this->code[$this->cursor], $this->lineno];
            }
            // closing bracket
            elseif (false !== strpos(')]}', $this->code[$this->cursor])) {
                if (empty($this->brackets)) {
                    throw new SyntaxError(sprintf('Unexpected "%s".', $this->code[$this->cursor]), $this->lineno, $this->source);
                }

                list($expect, $lineno) = array_pop($this->brackets);
                if ($this->code[$this->cursor] != strtr($expect, '([{', ')]}')) {
                    throw new SyntaxError(sprintf('Unclosed "%s".', $expect), $lineno, $this->source);
                }
            }

            $this->pushToken(Token::PUNCTUATION_TYPE, $this->code[$this->cursor]);
            ++$this->cursor;
        }
        // strings
        elseif (preg_match(self::REGEX_STRING, $this->code, $match, 0, $this->cursor)) {
            $this->pushToken(Token::STRING_TYPE, stripcslashes(substr($match[0], 1, -1)));
            $this->moveCursor($match[0]);
        }
        // opening double quoted string
        elseif (preg_match(self::REGEX_DQ_STRING_DELIM, $this->code, $match, 0, $this->cursor)) {
            $this->brackets[] = ['"', $this->lineno];
            $this->pushState(self::STATE_STRING);
            $this->moveCursor($match[0]);
        }
        // unlexable
        else {
            throw new SyntaxError(sprintf('Unexpected character "%s".', $this->code[$this->cursor]), $this->lineno, $this->source);
        }
    }

    /**
     * Consumes everything up to the matching "end<tag>" block and emits it as
     * a single text token.
     *
     * @param string $tag The opening tag name ("raw" or "verbatim")
     *
     * @throws SyntaxError When the closing tag is missing
     */
    protected function lexRawData($tag)
    {
        if ('raw' === $tag) {
            @trigger_error(sprintf('Twig Tag "raw" is deprecated since version 1.21. Use "verbatim" instead in %s at line %d.', $this->filename, $this->lineno), E_USER_DEPRECATED);
        }

        if (!preg_match(str_replace('%s', $tag, $this->regexes['lex_raw_data']), $this->code, $match, PREG_OFFSET_CAPTURE, $this->cursor)) {
            throw new SyntaxError(sprintf('Unexpected end of file: Unclosed "%s" block.', $tag), $this->lineno, $this->source);
        }

        $text = substr($this->code, $this->cursor, $match[0][1] - $this->cursor);
        // advance over the raw text and the closing tag before trimming, so
        // that cursor and line number reflect what was really consumed
        $this->moveCursor($text.$match[0][0]);

        // trim?
        if (isset($match[1][0])) {
            if ($this->options['whitespace_trim'] === $match[1][0]) {
                // whitespace_trim detected ({%-, {{- or {#-)
                $text = rtrim($text);
            } else {
                // whitespace_line_trim detected ({%~, {{~ or {#~)
                // don't trim \r and \n
                $text = rtrim($text, " \t\0\x0B");
            }
        }

        $this->pushToken(Token::TEXT_TYPE, $text);
    }

    /**
     * Skips a comment: moves the cursor past the comment end delimiter
     * without emitting any token.
     *
     * @throws SyntaxError When the comment is not closed
     */
    protected function lexComment()
    {
        if (!preg_match($this->regexes['lex_comment'], $this->code, $match, PREG_OFFSET_CAPTURE, $this->cursor)) {
            throw new SyntaxError('Unclosed comment.', $this->lineno, $this->source);
        }

        $this->moveCursor(substr($this->code, $this->cursor, $match[0][1] - $this->cursor).$match[0][0]);
    }

    /**
     * Lexes the inside of a double-quoted string: an interpolation start, a
     * literal string part, or the closing quote.
     *
     * @throws SyntaxError On an unlexable character
     */
    protected function lexString()
    {
        if (preg_match($this->regexes['interpolation_start'], $this->code, $match, 0, $this->cursor)) {
            $this->brackets[] = [$this->options['interpolation'][0], $this->lineno];
            $this->pushToken(Token::INTERPOLATION_START_TYPE);
            $this->moveCursor($match[0]);
            $this->pushState(self::STATE_INTERPOLATION);
        } elseif (preg_match(self::REGEX_DQ_STRING_PART, $this->code, $match, 0, $this->cursor) && \strlen($match[0]) > 0) {
            $this->pushToken(Token::STRING_TYPE, stripcslashes($match[0]));
            $this->moveCursor($match[0]);
        } elseif (preg_match(self::REGEX_DQ_STRING_DELIM, $this->code, $match, 0, $this->cursor)) {
            list($expect, $lineno) = array_pop($this->brackets);
            // the delimiter regex has just matched at the cursor, so this
            // check is purely defensive
            if ('"' != $this->code[$this->cursor]) {
                throw new SyntaxError(sprintf('Unclosed "%s".', $expect), $lineno, $this->source);
            }

            $this->popState();
            ++$this->cursor;
        } else {
            // unlexable
            throw new SyntaxError(sprintf('Unexpected character "%s".', $this->code[$this->cursor]), $this->lineno, $this->source);
        }
    }

    /**
     * Lexes the inside of a string interpolation: either its end delimiter
     * (only when the innermost open bracket is the interpolation start) or the
     * next piece of an expression.
     */
    protected function lexInterpolation()
    {
        $bracket = end($this->brackets);
        if ($this->options['interpolation'][0] === $bracket[0] && preg_match($this->regexes['interpolation_end'], $this->code, $match, 0, $this->cursor)) {
            array_pop($this->brackets);
            $this->pushToken(Token::INTERPOLATION_END_TYPE);
            $this->moveCursor($match[0]);
            $this->popState();
        } else {
            $this->lexExpression();
        }
    }

    /**
     * Appends a token carrying the current line number to the token list.
     *
     * @param int    $type  One of the Token::*_TYPE constants
     * @param string $value The token value
     */
    protected function pushToken($type, $value = '')
    {
        // do not push empty text tokens
        if (Token::TEXT_TYPE === $type && '' === $value) {
            return;
        }

        $this->tokens[] = new Token($type, $value, $this->lineno);
    }

    /**
     * Advances the cursor by the byte length of the given text and the line
     * number by the count of "\n" it contains.
     *
     * @param string $text The text that has just been consumed
     */
    protected function moveCursor($text)
    {
        $this->cursor += \strlen($text);
        $this->lineno += substr_count($text, "\n");
    }

    /**
     * Builds the anchored regular expression matching "=" and every unary and
     * binary operator registered in the environment.
     *
     * @return string
     */
    protected function getOperatorRegex()
    {
        $operators = array_merge(
            ['='],
            array_keys($this->env->getUnaryOperators()),
            array_keys($this->env->getBinaryOperators())
        );

        // sort by descending length so that the longest operators come first
        // in the alternation
        $operators = array_combine($operators, array_map('strlen', $operators));
        arsort($operators);

        $regex = [];
        foreach ($operators as $operator => $length) {
            // an operator that ends with a character must be followed by
            // a whitespace or a parenthesis
            if (ctype_alpha($operator[$length - 1])) {
                $r = preg_quote($operator, '/').'(?=[\s()])';
            } else {
                $r = preg_quote($operator, '/');
            }

            // an operator with a space can be any amount of whitespaces
            $r = preg_replace('/\s+/', '\s+', $r);

            $regex[] = $r;
        }

        return '/'.implode('|', $regex).'/A';
    }

    /**
     * Saves the current state on the stack and switches to a new one.
     *
     * @param int $state One of the STATE_* constants
     */
    protected function pushState($state)
    {
        $this->states[] = $this->state;
        $this->state = $state;
    }

    /**
     * Restores the state that was active before the last pushState() call.
     *
     * @throws \LogicException When the state stack is empty
     */
    protected function popState()
    {
        if (0 === \count($this->states)) {
            throw new \LogicException('Cannot pop state without a previous state.');
        }

        $this->state = array_pop($this->states);
    }
}

class_alias('Twig\Lexer', 'Twig_Lexer');