1<?php
2
3/*
4 * This file is part of Twig.
5 *
6 * (c) Fabien Potencier
7 * (c) Armin Ronacher
8 *
9 * For the full copyright and license information, please view the LICENSE
10 * file that was distributed with this source code.
11 */
12
13namespace Twig;
14
15use Twig\Error\SyntaxError;
16
17/**
18 * Lexes a template string.
19 *
20 * @author Fabien Potencier <fabien@symfony.com>
21 */
22class Lexer implements \Twig_LexerInterface
23{
    // Token objects produced so far for the template being lexed.
    protected $tokens;
    // Template code with "\r\n" and "\r" line endings normalised to "\n".
    protected $code;
    // Current byte offset into $code.
    protected $cursor;
    // Current line number; starts at 1 and can be reset by a "line" tag.
    protected $lineno;
    // Byte length of $code.
    protected $end;
    // Current lexer state (one of the STATE_* constants).
    protected $state;
    // Stack of the states to return to when the current one is popped.
    protected $states;
    // Stack of [opening bracket, line number] pairs that are still open.
    protected $brackets;
    // Environment the operators are read from.
    protected $env;
    // to be renamed to $name in 2.0 (where it is private)
    protected $filename;
    // Delimiter options: the defaults merged with the constructor overrides.
    protected $options;
    // Regular expressions built once in the constructor, keyed by purpose.
    protected $regexes;
    // Index into $positions of the last tag start visited; -1 before the first.
    protected $position;
    // preg_match_all() result (with offsets) listing every tag start in $code.
    protected $positions;
    // Line on which the block or variable tag currently being lexed was opened.
    protected $currentVarBlockLine;

    // Source instance currently being tokenized.
    private $source;

    // Lexer states dispatched on by tokenize().
    const STATE_DATA = 0;
    const STATE_BLOCK = 1;
    const STATE_VAR = 2;
    const STATE_STRING = 3;
    const STATE_INTERPOLATION = 4;

    // Identifier: a letter, underscore or byte >= 0x7f, then the same plus digits.
    const REGEX_NAME = '/[a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*/A';
    // Integer or decimal number; an exponent is only recognised with an explicit sign.
    const REGEX_NUMBER = '/[0-9]+(?:\.[0-9]+)?([Ee][\+\-][0-9]+)?/A';
    // Complete string literal: double-quoted without any "#", or single-quoted.
    const REGEX_STRING = '/"([^#"\\\\]*(?:\\\\.[^#"\\\\]*)*)"|\'([^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'/As';
    // Delimiter of a double-quoted string.
    const REGEX_DQ_STRING_DELIM = '/"/A';
    // Literal chunk of a double-quoted string, stopping before a '"' or a "#{".
    const REGEX_DQ_STRING_PART = '/[^#"\\\\]*(?:(?:\\\\.|#(?!\{))[^#"\\\\]*)*/As';
    // Single characters lexed as punctuation tokens.
    const PUNCTUATION = '()[]{}?:.,|';
55
56    public function __construct(Environment $env, array $options = [])
57    {
58        $this->env = $env;
59
60        $this->options = array_merge([
61            'tag_comment' => ['{#', '#}'],
62            'tag_block' => ['{%', '%}'],
63            'tag_variable' => ['{{', '}}'],
64            'whitespace_trim' => '-',
65            'whitespace_line_trim' => '~',
66            'whitespace_line_chars' => ' \t\0\x0B',
67            'interpolation' => ['#{', '}'],
68        ], $options);
69
70        // when PHP 7.3 is the min version, we will be able to remove the '#' part in preg_quote as it's part of the default
71        $this->regexes = [
72            // }}
73            'lex_var' => '{
74                \s*
75                (?:'.
76                    preg_quote($this->options['whitespace_trim'].$this->options['tag_variable'][1], '#').'\s*'. // -}}\s*
77                    '|'.
78                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_variable'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~}}[ \t\0\x0B]*
79                    '|'.
80                    preg_quote($this->options['tag_variable'][1], '#'). // }}
81                ')
82            }Ax',
83
84            // %}
85            'lex_block' => '{
86                \s*
87                (?:'.
88                    preg_quote($this->options['whitespace_trim'].$this->options['tag_block'][1], '#').'\s*\n?'. // -%}\s*\n?
89                    '|'.
90                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_block'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~%}[ \t\0\x0B]*
91                    '|'.
92                    preg_quote($this->options['tag_block'][1], '#').'\n?'. // %}\n?
93                ')
94            }Ax',
95
96            // {% endverbatim %}
97            'lex_raw_data' => '{'.
98                preg_quote($this->options['tag_block'][0], '#'). // {%
99                '('.
100                    $this->options['whitespace_trim']. // -
101                    '|'.
102                    $this->options['whitespace_line_trim']. // ~
103                ')?\s*'.
104                '(?:end%s)'. // endraw or endverbatim
105                '\s*'.
106                '(?:'.
107                    preg_quote($this->options['whitespace_trim'].$this->options['tag_block'][1], '#').'\s*'. // -%}
108                    '|'.
109                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_block'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~%}[ \t\0\x0B]*
110                    '|'.
111                    preg_quote($this->options['tag_block'][1], '#'). // %}
112                ')
113            }sx',
114
115            'operator' => $this->getOperatorRegex(),
116
117            // #}
118            'lex_comment' => '{
119                (?:'.
120                    preg_quote($this->options['whitespace_trim']).preg_quote($this->options['tag_comment'][1], '#').'\s*\n?'. // -#}\s*\n?
121                    '|'.
122                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_comment'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~#}[ \t\0\x0B]*
123                    '|'.
124                    preg_quote($this->options['tag_comment'][1], '#').'\n?'. // #}\n?
125                ')
126            }sx',
127
128            // verbatim %}
129            'lex_block_raw' => '{
130                \s*
131                (raw|verbatim)
132                \s*
133                (?:'.
134                    preg_quote($this->options['whitespace_trim'].$this->options['tag_block'][1], '#').'\s*'. // -%}\s*
135                    '|'.
136                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_block'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~%}[ \t\0\x0B]*
137                    '|'.
138                    preg_quote($this->options['tag_block'][1], '#'). // %}
139                ')
140            }Asx',
141
142            'lex_block_line' => '{\s*line\s+(\d+)\s*'.preg_quote($this->options['tag_block'][1], '#').'}As',
143
144            // {{ or {% or {#
145            'lex_tokens_start' => '{
146                ('.
147                    preg_quote($this->options['tag_variable'][0], '#'). // {{
148                    '|'.
149                    preg_quote($this->options['tag_block'][0], '#'). // {%
150                    '|'.
151                    preg_quote($this->options['tag_comment'][0], '#'). // {#
152                ')('.
153                    preg_quote($this->options['whitespace_trim'], '#'). // -
154                    '|'.
155                    preg_quote($this->options['whitespace_line_trim'], '#'). // ~
156                ')?
157            }sx',
158            'interpolation_start' => '{'.preg_quote($this->options['interpolation'][0], '#').'\s*}A',
159            'interpolation_end' => '{\s*'.preg_quote($this->options['interpolation'][1], '#').'}A',
160        ];
161    }
162
163    public function tokenize($code, $name = null)
164    {
165        if (!$code instanceof Source) {
166            @trigger_error(sprintf('Passing a string as the $code argument of %s() is deprecated since version 1.27 and will be removed in 2.0. Pass a \Twig\Source instance instead.', __METHOD__), E_USER_DEPRECATED);
167            $this->source = new Source($code, $name);
168        } else {
169            $this->source = $code;
170        }
171
172        if (((int) ini_get('mbstring.func_overload')) & 2) {
173            @trigger_error('Support for having "mbstring.func_overload" different from 0 is deprecated version 1.29 and will be removed in 2.0.', E_USER_DEPRECATED);
174        }
175
176        if (\function_exists('mb_internal_encoding') && ((int) ini_get('mbstring.func_overload')) & 2) {
177            $mbEncoding = mb_internal_encoding();
178            mb_internal_encoding('ASCII');
179        } else {
180            $mbEncoding = null;
181        }
182
183        $this->code = str_replace(["\r\n", "\r"], "\n", $this->source->getCode());
184        $this->filename = $this->source->getName();
185        $this->cursor = 0;
186        $this->lineno = 1;
187        $this->end = \strlen($this->code);
188        $this->tokens = [];
189        $this->state = self::STATE_DATA;
190        $this->states = [];
191        $this->brackets = [];
192        $this->position = -1;
193
194        // find all token starts in one go
195        preg_match_all($this->regexes['lex_tokens_start'], $this->code, $matches, PREG_OFFSET_CAPTURE);
196        $this->positions = $matches;
197
198        while ($this->cursor < $this->end) {
199            // dispatch to the lexing functions depending
200            // on the current state
201            switch ($this->state) {
202                case self::STATE_DATA:
203                    $this->lexData();
204                    break;
205
206                case self::STATE_BLOCK:
207                    $this->lexBlock();
208                    break;
209
210                case self::STATE_VAR:
211                    $this->lexVar();
212                    break;
213
214                case self::STATE_STRING:
215                    $this->lexString();
216                    break;
217
218                case self::STATE_INTERPOLATION:
219                    $this->lexInterpolation();
220                    break;
221            }
222        }
223
224        $this->pushToken(Token::EOF_TYPE);
225
226        if (!empty($this->brackets)) {
227            list($expect, $lineno) = array_pop($this->brackets);
228            throw new SyntaxError(sprintf('Unclosed "%s".', $expect), $lineno, $this->source);
229        }
230
231        if ($mbEncoding) {
232            mb_internal_encoding($mbEncoding);
233        }
234
235        return new TokenStream($this->tokens, $this->source);
236    }
237
238    protected function lexData()
239    {
240        // if no matches are left we return the rest of the template as simple text token
241        if ($this->position == \count($this->positions[0]) - 1) {
242            $this->pushToken(Token::TEXT_TYPE, substr($this->code, $this->cursor));
243            $this->cursor = $this->end;
244
245            return;
246        }
247
248        // Find the first token after the current cursor
249        $position = $this->positions[0][++$this->position];
250        while ($position[1] < $this->cursor) {
251            if ($this->position == \count($this->positions[0]) - 1) {
252                return;
253            }
254            $position = $this->positions[0][++$this->position];
255        }
256
257        // push the template text first
258        $text = $textContent = substr($this->code, $this->cursor, $position[1] - $this->cursor);
259
260        // trim?
261        if (isset($this->positions[2][$this->position][0])) {
262            if ($this->options['whitespace_trim'] === $this->positions[2][$this->position][0]) {
263                // whitespace_trim detected ({%-, {{- or {#-)
264                $text = rtrim($text);
265            } elseif ($this->options['whitespace_line_trim'] === $this->positions[2][$this->position][0]) {
266                // whitespace_line_trim detected ({%~, {{~ or {#~)
267                // don't trim \r and \n
268                $text = rtrim($text, " \t\0\x0B");
269            }
270        }
271        $this->pushToken(Token::TEXT_TYPE, $text);
272        $this->moveCursor($textContent.$position[0]);
273
274        switch ($this->positions[1][$this->position][0]) {
275            case $this->options['tag_comment'][0]:
276                $this->lexComment();
277                break;
278
279            case $this->options['tag_block'][0]:
280                // raw data?
281                if (preg_match($this->regexes['lex_block_raw'], $this->code, $match, 0, $this->cursor)) {
282                    $this->moveCursor($match[0]);
283                    $this->lexRawData($match[1]);
284                // {% line \d+ %}
285                } elseif (preg_match($this->regexes['lex_block_line'], $this->code, $match, 0, $this->cursor)) {
286                    $this->moveCursor($match[0]);
287                    $this->lineno = (int) $match[1];
288                } else {
289                    $this->pushToken(Token::BLOCK_START_TYPE);
290                    $this->pushState(self::STATE_BLOCK);
291                    $this->currentVarBlockLine = $this->lineno;
292                }
293                break;
294
295            case $this->options['tag_variable'][0]:
296                $this->pushToken(Token::VAR_START_TYPE);
297                $this->pushState(self::STATE_VAR);
298                $this->currentVarBlockLine = $this->lineno;
299                break;
300        }
301    }
302
303    protected function lexBlock()
304    {
305        if (empty($this->brackets) && preg_match($this->regexes['lex_block'], $this->code, $match, 0, $this->cursor)) {
306            $this->pushToken(Token::BLOCK_END_TYPE);
307            $this->moveCursor($match[0]);
308            $this->popState();
309        } else {
310            $this->lexExpression();
311        }
312    }
313
314    protected function lexVar()
315    {
316        if (empty($this->brackets) && preg_match($this->regexes['lex_var'], $this->code, $match, 0, $this->cursor)) {
317            $this->pushToken(Token::VAR_END_TYPE);
318            $this->moveCursor($match[0]);
319            $this->popState();
320        } else {
321            $this->lexExpression();
322        }
323    }
324
325    protected function lexExpression()
326    {
327        // whitespace
328        if (preg_match('/\s+/A', $this->code, $match, 0, $this->cursor)) {
329            $this->moveCursor($match[0]);
330
331            if ($this->cursor >= $this->end) {
332                throw new SyntaxError(sprintf('Unclosed "%s".', self::STATE_BLOCK === $this->state ? 'block' : 'variable'), $this->currentVarBlockLine, $this->source);
333            }
334        }
335
336        // arrow function
337        if ('=' === $this->code[$this->cursor] && '>' === $this->code[$this->cursor + 1]) {
338            $this->pushToken(Token::ARROW_TYPE, '=>');
339            $this->moveCursor('=>');
340        }
341        // operators
342        elseif (preg_match($this->regexes['operator'], $this->code, $match, 0, $this->cursor)) {
343            $this->pushToken(Token::OPERATOR_TYPE, preg_replace('/\s+/', ' ', $match[0]));
344            $this->moveCursor($match[0]);
345        }
346        // names
347        elseif (preg_match(self::REGEX_NAME, $this->code, $match, 0, $this->cursor)) {
348            $this->pushToken(Token::NAME_TYPE, $match[0]);
349            $this->moveCursor($match[0]);
350        }
351        // numbers
352        elseif (preg_match(self::REGEX_NUMBER, $this->code, $match, 0, $this->cursor)) {
353            $number = (float) $match[0];  // floats
354            if (ctype_digit($match[0]) && $number <= PHP_INT_MAX) {
355                $number = (int) $match[0]; // integers lower than the maximum
356            }
357            $this->pushToken(Token::NUMBER_TYPE, $number);
358            $this->moveCursor($match[0]);
359        }
360        // punctuation
361        elseif (false !== strpos(self::PUNCTUATION, $this->code[$this->cursor])) {
362            // opening bracket
363            if (false !== strpos('([{', $this->code[$this->cursor])) {
364                $this->brackets[] = [$this->code[$this->cursor], $this->lineno];
365            }
366            // closing bracket
367            elseif (false !== strpos(')]}', $this->code[$this->cursor])) {
368                if (empty($this->brackets)) {
369                    throw new SyntaxError(sprintf('Unexpected "%s".', $this->code[$this->cursor]), $this->lineno, $this->source);
370                }
371
372                list($expect, $lineno) = array_pop($this->brackets);
373                if ($this->code[$this->cursor] != strtr($expect, '([{', ')]}')) {
374                    throw new SyntaxError(sprintf('Unclosed "%s".', $expect), $lineno, $this->source);
375                }
376            }
377
378            $this->pushToken(Token::PUNCTUATION_TYPE, $this->code[$this->cursor]);
379            ++$this->cursor;
380        }
381        // strings
382        elseif (preg_match(self::REGEX_STRING, $this->code, $match, 0, $this->cursor)) {
383            $this->pushToken(Token::STRING_TYPE, stripcslashes(substr($match[0], 1, -1)));
384            $this->moveCursor($match[0]);
385        }
386        // opening double quoted string
387        elseif (preg_match(self::REGEX_DQ_STRING_DELIM, $this->code, $match, 0, $this->cursor)) {
388            $this->brackets[] = ['"', $this->lineno];
389            $this->pushState(self::STATE_STRING);
390            $this->moveCursor($match[0]);
391        }
392        // unlexable
393        else {
394            throw new SyntaxError(sprintf('Unexpected character "%s".', $this->code[$this->cursor]), $this->lineno, $this->source);
395        }
396    }
397
398    protected function lexRawData($tag)
399    {
400        if ('raw' === $tag) {
401            @trigger_error(sprintf('Twig Tag "raw" is deprecated since version 1.21. Use "verbatim" instead in %s at line %d.', $this->filename, $this->lineno), E_USER_DEPRECATED);
402        }
403
404        if (!preg_match(str_replace('%s', $tag, $this->regexes['lex_raw_data']), $this->code, $match, PREG_OFFSET_CAPTURE, $this->cursor)) {
405            throw new SyntaxError(sprintf('Unexpected end of file: Unclosed "%s" block.', $tag), $this->lineno, $this->source);
406        }
407
408        $text = substr($this->code, $this->cursor, $match[0][1] - $this->cursor);
409        $this->moveCursor($text.$match[0][0]);
410
411        // trim?
412        if (isset($match[1][0])) {
413            if ($this->options['whitespace_trim'] === $match[1][0]) {
414                // whitespace_trim detected ({%-, {{- or {#-)
415                $text = rtrim($text);
416            } else {
417                // whitespace_line_trim detected ({%~, {{~ or {#~)
418                // don't trim \r and \n
419                $text = rtrim($text, " \t\0\x0B");
420            }
421        }
422
423        $this->pushToken(Token::TEXT_TYPE, $text);
424    }
425
426    protected function lexComment()
427    {
428        if (!preg_match($this->regexes['lex_comment'], $this->code, $match, PREG_OFFSET_CAPTURE, $this->cursor)) {
429            throw new SyntaxError('Unclosed comment.', $this->lineno, $this->source);
430        }
431
432        $this->moveCursor(substr($this->code, $this->cursor, $match[0][1] - $this->cursor).$match[0][0]);
433    }
434
435    protected function lexString()
436    {
437        if (preg_match($this->regexes['interpolation_start'], $this->code, $match, 0, $this->cursor)) {
438            $this->brackets[] = [$this->options['interpolation'][0], $this->lineno];
439            $this->pushToken(Token::INTERPOLATION_START_TYPE);
440            $this->moveCursor($match[0]);
441            $this->pushState(self::STATE_INTERPOLATION);
442        } elseif (preg_match(self::REGEX_DQ_STRING_PART, $this->code, $match, 0, $this->cursor) && \strlen($match[0]) > 0) {
443            $this->pushToken(Token::STRING_TYPE, stripcslashes($match[0]));
444            $this->moveCursor($match[0]);
445        } elseif (preg_match(self::REGEX_DQ_STRING_DELIM, $this->code, $match, 0, $this->cursor)) {
446            list($expect, $lineno) = array_pop($this->brackets);
447            if ('"' != $this->code[$this->cursor]) {
448                throw new SyntaxError(sprintf('Unclosed "%s".', $expect), $lineno, $this->source);
449            }
450
451            $this->popState();
452            ++$this->cursor;
453        } else {
454            // unlexable
455            throw new SyntaxError(sprintf('Unexpected character "%s".', $this->code[$this->cursor]), $this->lineno, $this->source);
456        }
457    }
458
459    protected function lexInterpolation()
460    {
461        $bracket = end($this->brackets);
462        if ($this->options['interpolation'][0] === $bracket[0] && preg_match($this->regexes['interpolation_end'], $this->code, $match, 0, $this->cursor)) {
463            array_pop($this->brackets);
464            $this->pushToken(Token::INTERPOLATION_END_TYPE);
465            $this->moveCursor($match[0]);
466            $this->popState();
467        } else {
468            $this->lexExpression();
469        }
470    }
471
472    protected function pushToken($type, $value = '')
473    {
474        // do not push empty text tokens
475        if (Token::TEXT_TYPE === $type && '' === $value) {
476            return;
477        }
478
479        $this->tokens[] = new Token($type, $value, $this->lineno);
480    }
481
482    protected function moveCursor($text)
483    {
484        $this->cursor += \strlen($text);
485        $this->lineno += substr_count($text, "\n");
486    }
487
488    protected function getOperatorRegex()
489    {
490        $operators = array_merge(
491            ['='],
492            array_keys($this->env->getUnaryOperators()),
493            array_keys($this->env->getBinaryOperators())
494        );
495
496        $operators = array_combine($operators, array_map('strlen', $operators));
497        arsort($operators);
498
499        $regex = [];
500        foreach ($operators as $operator => $length) {
501            // an operator that ends with a character must be followed by
502            // a whitespace or a parenthesis
503            if (ctype_alpha($operator[$length - 1])) {
504                $r = preg_quote($operator, '/').'(?=[\s()])';
505            } else {
506                $r = preg_quote($operator, '/');
507            }
508
509            // an operator with a space can be any amount of whitespaces
510            $r = preg_replace('/\s+/', '\s+', $r);
511
512            $regex[] = $r;
513        }
514
515        return '/'.implode('|', $regex).'/A';
516    }
517
518    protected function pushState($state)
519    {
520        $this->states[] = $this->state;
521        $this->state = $state;
522    }
523
524    protected function popState()
525    {
526        if (0 === \count($this->states)) {
527            throw new \LogicException('Cannot pop state without a previous state.');
528        }
529
530        $this->state = array_pop($this->states);
531    }
532}
533
// Make the class also available under the un-namespaced name "Twig_Lexer".
class_alias('Twig\Lexer', 'Twig_Lexer');
535