1<?php
2
3declare(strict_types=1);
4
5namespace Doctrine\Common\Lexer;
6
7use ReflectionClass;
8use const PREG_SPLIT_DELIM_CAPTURE;
9use const PREG_SPLIT_NO_EMPTY;
10use const PREG_SPLIT_OFFSET_CAPTURE;
11use function implode;
12use function in_array;
13use function preg_split;
14use function sprintf;
15use function substr;
16
/**
 * Foundation for small hand-written lexers (typically for tiny DSLs).
 *
 * A concrete lexer supplies the regular expression fragments and the
 * value-to-type mapping; this class splits the input into tokens and offers
 * lookahead/peek navigation over the resulting token list.
 */
abstract class AbstractLexer
{
    /**
     * The string most recently handed to setInput().
     *
     * @var string
     */
    private $input;

    /**
     * Tokens produced by scan(), in input order.
     *
     * Every entry is an associative array with exactly these keys:
     *  - 'value'    : the matched text, possibly rewritten by getType()
     *  - 'type'     : whatever getType() returned for the matched text
     *  - 'position' : offset of the match inside the input string
     *
     * @var array
     */
    private $tokens = [];

    /**
     * Index into $tokens of the token that the next moveNext() will load.
     *
     * @var int
     */
    private $position = 0;

    /**
     * Number of tokens already handed out by peek() since the last reset.
     *
     * @var int
     */
    private $peek = 0;

    /**
     * The upcoming token, or NULL when nothing has been loaded / input is exhausted.
     *
     * @var array|null
     */
    public $lookahead;

    /**
     * The token that was the lookahead before the latest moveNext() call.
     *
     * @var array|null
     */
    public $token;

    /**
     * Lazily built splitting expression; NULL until the first scan().
     *
     * @var string
     */
    private $regex;

    /**
     * Replaces the lexer input and tokenizes it right away.
     *
     * Previously scanned tokens are discarded and all pointers are reset
     * before the new input is scanned.
     *
     * @param string $input The input to be tokenized.
     *
     * @return void
     */
    public function setInput($input)
    {
        $this->tokens = [];
        $this->input  = $input;

        $this->reset();
        $this->scan($input);
    }

    /**
     * Puts the lexer back into its initial navigation state.
     *
     * Clears both public tokens and rewinds the position and peek pointers;
     * the scanned token list itself is kept.
     *
     * @return void
     */
    public function reset()
    {
        $this->position  = 0;
        $this->peek      = 0;
        $this->token     = null;
        $this->lookahead = null;
    }

    /**
     * Rewinds only the peek pointer.
     *
     * @return void
     */
    public function resetPeek()
    {
        $this->peek = 0;
    }

    /**
     * Moves the token pointer to the given index.
     *
     * Neither the current token nor the lookahead are touched.
     *
     * @param int $position Index of the token the next moveNext() should load.
     *
     * @return void
     */
    public function resetPosition($position = 0)
    {
        $this->position = $position;
    }

    /**
     * Returns the beginning of the original input, cut off at the given offset.
     *
     * The cut is made with substr(), so the offset is counted in bytes.
     *
     * @param int $position
     *
     * @return string
     */
    public function getInputUntilPosition($position)
    {
        return substr($this->input, 0, $position);
    }

    /**
     * Tells whether the lookahead token exists and has exactly the given type.
     *
     * @param int|string $token
     *
     * @return bool
     */
    public function isNextToken($token)
    {
        if ($this->lookahead === null) {
            return false;
        }

        return $this->lookahead['type'] === $token;
    }

    /**
     * Tells whether the lookahead token exists and its type is one of the given types.
     *
     * The comparison is strict.
     *
     * @param array $tokens
     *
     * @return bool
     */
    public function isNextTokenAny(array $tokens)
    {
        if ($this->lookahead === null) {
            return false;
        }

        return in_array($this->lookahead['type'], $tokens, true);
    }

    /**
     * Advances by one token.
     *
     * The previous lookahead becomes the current token, the peek pointer is
     * rewound, and the token at the current position (if any) becomes the new
     * lookahead.
     *
     * @return bool TRUE when a new lookahead token was loaded, FALSE at the end of the input.
     */
    public function moveNext()
    {
        $this->peek  = 0;
        $this->token = $this->lookahead;

        if (! isset($this->tokens[$this->position])) {
            $this->lookahead = null;

            return false;
        }

        $this->lookahead = $this->tokens[$this->position];
        $this->position++;

        return true;
    }

    /**
     * Keeps advancing until the lookahead has the given type or no lookahead is left.
     *
     * @param string $type The token type to skip until.
     *
     * @return void
     */
    public function skipUntil($type)
    {
        while ($this->lookahead !== null) {
            if ($this->lookahead['type'] === $type) {
                return;
            }

            $this->moveNext();
        }
    }

    /**
     * Tells whether getType() classifies the given value as the given token type.
     *
     * @param mixed      $value
     * @param int|string $token
     *
     * @return bool
     */
    public function isA($value, $token)
    {
        $detectedType = $this->getType($value);

        return $detectedType === $token;
    }

    /**
     * Returns the next not-yet-peeked token and advances the peek pointer.
     *
     * The lookahead and the position pointer are left alone.
     *
     * @return array|null The next token or NULL if there are no more tokens ahead.
     */
    public function peek()
    {
        $index = $this->position + $this->peek;

        if (! isset($this->tokens[$index])) {
            return null;
        }

        $this->peek++;

        return $this->tokens[$index];
    }

    /**
     * Performs a single peek() and rewinds the peek pointer afterwards.
     *
     * @return array|null The next token or NULL if there are no more tokens ahead.
     */
    public function glimpse()
    {
        $next = $this->peek();

        $this->peek = 0;

        return $next;
    }

    /**
     * Splits the input into tokens and appends them to the token list.
     *
     * The splitting expression is assembled once from the catchable and
     * non-catchable patterns and reused on later calls.
     *
     * @param string $input A query string.
     *
     * @return void
     */
    protected function scan($input)
    {
        if ($this->regex === null) {
            $catchable    = implode(')|(', $this->getCatchablePatterns());
            $nonCatchable = implode('|', $this->getNonCatchablePatterns());
            $modifiers    = $this->getModifiers();

            $this->regex = sprintf('/(%s)|%s/%s', $catchable, $nonCatchable, $modifiers);
        }

        $matches = preg_split(
            $this->regex,
            $input,
            -1,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE
        );

        // Work around https://bugs.php.net/78122: on failure the whole input
        // is treated as one single token starting at offset 0.
        if ($matches === false) {
            $matches = [[$input, 0]];
        }

        foreach ($matches as $match) {
            $value  = $match[0];
            $offset = $match[1];

            // getType() receives the value by reference and may rewrite it,
            // so it has to run before the token array is assembled.
            $type = $this->getType($value);

            $this->tokens[] = [
                'value'    => $value,
                'type'     => $type,
                'position' => $offset,
            ];
        }
    }

    /**
     * Looks up the name of the class constant whose value is identical to the given token.
     *
     * @param int|string $token
     *
     * @return int|string "ClassName::CONSTANT" for the first identical constant,
     *                    otherwise the token itself.
     */
    public function getLiteral($token)
    {
        $className = static::class;
        $reflClass = new ReflectionClass($className);

        foreach ($reflClass->getConstants() as $constantName => $constantValue) {
            if ($constantValue !== $token) {
                continue;
            }

            return $className . '::' . $constantName;
        }

        return $token;
    }

    /**
     * Modifiers appended to the composed splitting expression.
     *
     * @return string
     */
    protected function getModifiers()
    {
        return 'iu';
    }

    /**
     * Patterns whose matches are kept as tokens.
     *
     * Each pattern ends up in its own capturing group of the composed expression.
     *
     * @return array
     */
    abstract protected function getCatchablePatterns();

    /**
     * Patterns that only separate tokens; they are not wrapped in a capturing group.
     *
     * @return array
     */
    abstract protected function getNonCatchablePatterns();

    /**
     * Determines the type of a token value.
     *
     * The value is passed by reference, so implementations may also
     * normalize or convert it.
     *
     * @param string $value
     *
     * @return int|string|null
     */
    abstract protected function getType(&$value);
}
329