1<?php
2
3/*
4 * This file is part of Mustache.php.
5 *
6 * (c) 2010-2017 Justin Hileman
7 *
8 * For the full copyright and license information, please view the LICENSE
9 * file that was distributed with this source code.
10 */
11
/**
 * Mustache Tokenizer class.
 *
 * This class is responsible for turning raw template source into a set of Mustache tokens.
 *
 * The tokenizer is a small finite state machine that walks the template one byte at a time:
 * it accumulates plain text until it sees the opening delimiter, works out the tag type from
 * the character following that delimiter, then accumulates the tag name until it sees the
 * closing delimiter. Tokens are plain arrays keyed by the "Token properties" constants below.
 */
class Mustache_Tokenizer
{
    // Finite state machine states
    const IN_TEXT     = 0;
    const IN_TAG_TYPE = 1;
    const IN_TAG      = 2;

    // Token types
    const T_SECTION      = '#';
    const T_INVERTED     = '^';
    const T_END_SECTION  = '/';
    const T_COMMENT      = '!';
    const T_PARTIAL      = '>';
    const T_PARENT       = '<';
    const T_DELIM_CHANGE = '=';
    const T_ESCAPED      = '_v';
    const T_UNESCAPED    = '{';
    const T_UNESCAPED_2  = '&';
    const T_TEXT         = '_t';
    const T_PRAGMA       = '%';
    const T_BLOCK_VAR    = '$';
    const T_BLOCK_ARG    = '$arg';

    // Valid token types.
    //
    // Used as a lookup table for the single character following an opening delimiter; any
    // character that is not a key here makes the tag a plain escaped variable (T_ESCAPED).
    private static $tagTypes = array(
        self::T_SECTION      => true,
        self::T_INVERTED     => true,
        self::T_END_SECTION  => true,
        self::T_COMMENT      => true,
        self::T_PARTIAL      => true,
        self::T_PARENT       => true,
        self::T_DELIM_CHANGE => true,
        self::T_ESCAPED      => true,
        self::T_UNESCAPED    => true,
        self::T_UNESCAPED_2  => true,
        self::T_PRAGMA       => true,
        self::T_BLOCK_VAR    => true,
    );

    // Token properties
    const TYPE    = 'type';
    const NAME    = 'name';
    const OTAG    = 'otag';
    const CTAG    = 'ctag';
    const LINE    = 'line';
    const INDEX   = 'index';
    const END     = 'end';
    const INDENT  = 'indent';
    const NODES   = 'nodes';
    const VALUE   = 'value';
    const FILTERS = 'filters';

    // Current state machine state (one of the IN_* constants).
    private $state;
    // Type of the tag currently being scanned (one of the T_* constants).
    private $tagType;
    // Text (in IN_TEXT) or tag name (in IN_TAG) accumulated since the last token was emitted.
    private $buffer;
    // Tokens emitted so far by the current scan.
    private $tokens;
    // Scanner offset recorded once the most recent tag's type had been determined.
    private $seenTag;
    // Zero-based line counter; incremented for every "\n" seen in plain text.
    private $line;

    // Current opening delimiter, its first byte, and its length in bytes.
    private $otag;
    private $otagChar;
    private $otagLen;

    // Current closing delimiter, its first byte, and its length in bytes.
    private $ctag;
    private $ctagChar;
    private $ctagLen;

    /**
     * Scan and tokenize template source.
     *
     * @throws Mustache_Exception_SyntaxException when a tag is never closed, when the delimiters of an
     *                                             unescaped `{{{ tag }}}` do not match, or when a
     *                                             delimiter-change tag contains invalid delimiters
     * @throws Mustache_Exception_InvalidArgumentException when $delimiters string is invalid
     *
     * @param string $text       Mustache template source to tokenize
     * @param string $delimiters Optionally, pass initial opening and closing delimiters (default: empty string)
     *
     * @return array Set of Mustache tokens
     */
    public function scan($text, $delimiters = '')
    {
        // Setting mbstring.func_overload makes things *really* slow.
        // Let's do everyone a favor and scan this string as ASCII instead.
        //
        // The INI directive was removed in PHP 8.0 so we don't need to check there (and can drop it
        // when we remove support for older versions of PHP).
        //
        // @codeCoverageIgnoreStart
        $encoding = null;
        if (version_compare(PHP_VERSION, '8.0.0', '<')) {
            if (function_exists('mb_internal_encoding') && ini_get('mbstring.func_overload') & 2) {
                $encoding = mb_internal_encoding();
                mb_internal_encoding('ASCII');
            }
        }
        // @codeCoverageIgnoreEnd

        try {
            $tokens = $this->tokenize($text, $delimiters);
        } catch (Exception $e) {
            // An invalid template must not leave the caller's internal encoding switched to ASCII.
            // @codeCoverageIgnoreStart
            if ($encoding) {
                mb_internal_encoding($encoding);
            }
            // @codeCoverageIgnoreEnd

            throw $e;
        }

        // Restore the user's encoding...
        // @codeCoverageIgnoreStart
        if ($encoding) {
            mb_internal_encoding($encoding);
        }
        // @codeCoverageIgnoreEnd

        return $tokens;
    }

    /**
     * Run the tokenizer state machine over the template source.
     *
     * This is the body of `scan()`, split out so that `scan()` can restore the internal
     * encoding whether or not tokenizing succeeds.
     *
     * @throws Mustache_Exception_SyntaxException when a tag is unclosed or its delimiters are mismatched
     * @throws Mustache_Exception_InvalidArgumentException when $delimiters string is invalid
     *
     * @param string $text       Mustache template source to tokenize
     * @param string $delimiters Initial opening and closing delimiters (may be empty)
     *
     * @return array Set of Mustache tokens
     */
    private function tokenize($text, $delimiters)
    {
        $this->reset();

        if (is_string($delimiters) && $delimiters = trim($delimiters)) {
            $this->setDelimiters($delimiters);
        }

        // Offset of the opening delimiter of the tag being scanned (only used for error reporting).
        $tagStart = 0;

        $len = strlen($text);
        for ($i = 0; $i < $len; $i++) {
            switch ($this->state) {
                case self::IN_TEXT:
                    $char = $text[$i];
                    // Test whether it's time to change tags.
                    if ($char === $this->otagChar && substr($text, $i, $this->otagLen) === $this->otag) {
                        // Step back so the next iteration re-reads this offset in the IN_TAG_TYPE state.
                        $i--;
                        $this->flushBuffer();
                        $this->state = self::IN_TAG_TYPE;
                    } else {
                        $this->buffer .= $char;
                        // Text tokens never span lines: emit one token per line of text.
                        if ($char === "\n") {
                            $this->flushBuffer();
                            $this->line++;
                        }
                    }
                    break;

                case self::IN_TAG_TYPE:
                    $tagStart = $i;

                    // Move to the last byte of the opening delimiter.
                    $i += $this->otagLen - 1;

                    // The template may end right after the opening delimiter, in which case there
                    // is no type character to read; the unclosed tag is reported after the loop.
                    $char = ($i + 1 < $len) ? $text[$i + 1] : '';
                    if (isset(self::$tagTypes[$char])) {
                        $tag = $char;
                        $this->tagType = $tag;
                    } else {
                        $tag = null;
                        $this->tagType = self::T_ESCAPED;
                    }

                    if ($this->tagType === self::T_DELIM_CHANGE) {
                        $i = $this->changeDelimiters($text, $i);
                        $this->state = self::IN_TEXT;
                    } elseif ($this->tagType === self::T_PRAGMA) {
                        $i = $this->addPragma($text, $i);
                        $this->state = self::IN_TEXT;
                    } else {
                        // Skip the type character, if there was one.
                        if ($tag !== null) {
                            $i++;
                        }
                        $this->state = self::IN_TAG;
                    }
                    $this->seenTag = $i;
                    break;

                default:
                    $char = $text[$i];
                    // Test whether it's time to change tags.
                    if ($char === $this->ctagChar && substr($text, $i, $this->ctagLen) === $this->ctag) {
                        $token = array(
                            self::TYPE  => $this->tagType,
                            self::NAME  => trim($this->buffer),
                            self::OTAG  => $this->otag,
                            self::CTAG  => $this->ctag,
                            self::LINE  => $this->line,
                            // Closing section tags record where the tag starts; every other tag
                            // records the offset just past its closing delimiter.
                            self::INDEX => ($this->tagType === self::T_END_SECTION) ? $this->seenTag - $this->otagLen : $i + $this->ctagLen,
                        );

                        if ($this->tagType === self::T_UNESCAPED) {
                            // Clean up `{{{ tripleStache }}}` style tokens.
                            if ($this->ctag === '}}') {
                                // With default delimiters the extra `}` follows the closing tag.
                                if (($i + 2 < $len) && $text[$i + 2] === '}') {
                                    $i++;
                                } else {
                                    $this->throwMismatchedDelimitersException($token);
                                }
                            } else {
                                // With custom delimiters the extra `}` ends up at the end of the name.
                                $lastName = $token[self::NAME];
                                if (substr($lastName, -1) === '}') {
                                    $token[self::NAME] = trim(substr($lastName, 0, -1));
                                } else {
                                    $this->throwMismatchedDelimitersException($token);
                                }
                            }
                        }

                        $this->buffer = '';
                        $i += $this->ctagLen - 1;
                        $this->state = self::IN_TEXT;
                        $this->tokens[] = $token;
                    } else {
                        $this->buffer .= $char;
                    }
                    break;
            }
        }

        // Running out of input while still inside a tag means the tag was never closed. Without
        // this check the partial tag name would be flushed below as if it were plain text.
        if ($this->state !== self::IN_TEXT) {
            $this->throwUnclosedTagException($tagStart);
        }

        $this->flushBuffer();

        return $this->tokens;
    }

    /**
     * Helper function to reset tokenizer internal state.
     *
     * Clears everything left over from a previous scan and restores the default `{{` and `}}` delimiters.
     */
    private function reset()
    {
        $this->state    = self::IN_TEXT;
        $this->tagType  = null;
        $this->buffer   = '';
        $this->tokens   = array();
        $this->seenTag  = false;
        $this->line     = 0;

        $this->otag     = '{{';
        $this->otagChar = '{';
        $this->otagLen  = 2;

        $this->ctag     = '}}';
        $this->ctagChar = '}';
        $this->ctagLen  = 2;
    }

    /**
     * Flush the current buffer to a token.
     *
     * Emits a text token for the buffered characters and empties the buffer. Does nothing when
     * the buffer is empty.
     */
    private function flushBuffer()
    {
        if (strlen($this->buffer) > 0) {
            $this->tokens[] = array(
                self::TYPE  => self::T_TEXT,
                self::LINE  => $this->line,
                self::VALUE => $this->buffer,
            );
            $this->buffer   = '';
        }
    }

    /**
     * Change the current Mustache delimiters. Set new `otag` and `ctag` values.
     *
     * @throws Mustache_Exception_SyntaxException when the tag is never closed or the delimiter string is invalid
     *
     * @param string $text  Mustache template source
     * @param int    $index Current tokenizer index (the last byte of the opening delimiter)
     *
     * @return int New index value (the last byte of the delimiter-change tag)
     */
    private function changeDelimiters($text, $index)
    {
        $startIndex = strpos($text, '=', $index) + 1;
        $close      = '=' . $this->ctag;
        $closeIndex = strpos($text, $close, $index);

        // strpos() returns false when the tag is never closed. Using that value in the
        // arithmetic below would yield a bogus delimiter string and send the scanner backwards.
        if ($closeIndex === false) {
            $this->throwUnclosedTagException($index - $this->otagLen + 1);
        }

        $token = array(
            self::TYPE => self::T_DELIM_CHANGE,
            self::LINE => $this->line,
        );

        try {
            $this->setDelimiters(trim(substr($text, $startIndex, $closeIndex - $startIndex)));
        } catch (Mustache_Exception_InvalidArgumentException $e) {
            // Inside a template, bad delimiters are a syntax error rather than a bad argument.
            throw new Mustache_Exception_SyntaxException($e->getMessage(), $token);
        }

        $this->tokens[] = $token;

        return $closeIndex + strlen($close) - 1;
    }

    /**
     * Set the current Mustache `otag` and `ctag` delimiters.
     *
     * The delimiter string must consist of exactly two runs of non-whitespace characters
     * separated by whitespace, e.g. `<% %>`.
     *
     * @throws Mustache_Exception_InvalidArgumentException when delimiter string is invalid
     *
     * @param string $delimiters
     */
    private function setDelimiters($delimiters)
    {
        if (!preg_match('/^\s*(\S+)\s+(\S+)\s*$/', $delimiters, $matches)) {
            throw new Mustache_Exception_InvalidArgumentException(sprintf('Invalid delimiters: %s', $delimiters));
        }

        $otag = $matches[1];
        $ctag = $matches[2];

        $this->otag     = $otag;
        $this->otagChar = $otag[0];
        $this->otagLen  = strlen($otag);

        $this->ctag     = $ctag;
        $this->ctagChar = $ctag[0];
        $this->ctagLen  = strlen($ctag);
    }

    /**
     * Add pragma token.
     *
     * Pragmas are hoisted to the front of the template, so all pragma tokens
     * will appear at the front of the token list.
     *
     * @throws Mustache_Exception_SyntaxException when the pragma tag is never closed
     *
     * @param string $text  Mustache template source
     * @param int    $index Current tokenizer index (the last byte of the opening delimiter)
     *
     * @return int New index value (the last byte of the pragma tag)
     */
    private function addPragma($text, $index)
    {
        $end = strpos($text, $this->ctag, $index);

        // strpos() returns false when the tag is never closed. Using that value in the
        // arithmetic below would yield a garbage pragma name and send the scanner backwards.
        if ($end === false) {
            $this->throwUnclosedTagException($index - $this->otagLen + 1);
        }

        // Skip the last byte of the opening delimiter and the `%` type character.
        $pragma = trim(substr($text, $index + 2, $end - $index - 2));

        // Pragmas are hoisted to the front of the template.
        array_unshift($this->tokens, array(
            self::TYPE => self::T_PRAGMA,
            self::NAME => $pragma,
            self::LINE => 0,
        ));

        return $end + $this->ctagLen - 1;
    }

    /**
     * Throw the syntax exception for an unescaped tag whose extra `}` is missing.
     *
     * @throws Mustache_Exception_SyntaxException always
     *
     * @param array $token The token built for the offending tag
     */
    private function throwMismatchedDelimitersException($token)
    {
        $msg = sprintf(
            'Mismatched tag delimiters: %s on line %d',
            $token[self::NAME],
            $token[self::LINE]
        );

        throw new Mustache_Exception_SyntaxException($msg, $token);
    }

    /**
     * Throw the syntax exception for a tag which is missing its closing delimiter.
     *
     * The exception carries a token describing the tag as far as it was scanned.
     *
     * @throws Mustache_Exception_SyntaxException always
     *
     * @param int $index Offset of the unclosed tag's opening delimiter
     */
    private function throwUnclosedTagException($index)
    {
        $name = trim($this->buffer);
        if ($name !== '') {
            $msg = sprintf('Unclosed tag: %s on line %d', $name, $this->line);
        } else {
            $msg = sprintf('Unclosed tag on line %d', $this->line);
        }

        throw new Mustache_Exception_SyntaxException($msg, array(
            self::TYPE  => $this->tagType,
            self::NAME  => $name,
            self::OTAG  => $this->otag,
            self::CTAG  => $this->ctag,
            self::LINE  => $this->line,
            self::INDEX => $index,
        ));
    }
}
348