<?php

declare(strict_types=1);

namespace Doctrine\Common\Lexer;

use ReflectionClass;
use const PREG_SPLIT_DELIM_CAPTURE;
use const PREG_SPLIT_NO_EMPTY;
use const PREG_SPLIT_OFFSET_CAPTURE;
use function implode;
use function in_array;
use function preg_split;
use function sprintf;
use function substr;

/**
 * Foundation for small hand-written lexers (e.g. for tiny DSLs).
 *
 * A concrete lexer supplies the regex fragments and the token classification;
 * this class splits the input into tokens and offers cursor-style navigation
 * (current token, lookahead, peek) over the resulting token list.
 */
abstract class AbstractLexer
{
    /**
     * The string most recently handed to setInput().
     *
     * @var string
     */
    private $input;

    /**
     * Tokens produced by scan(), in input order.
     *
     * Every entry is an associative array with exactly these keys:
     *  - 'value'    : the matched text, possibly rewritten by getType()
     *  - 'type'     : whatever getType() returned for that text
     *  - 'position' : the offset preg_split() reported for the match
     *
     * @var array
     */
    private $tokens = [];

    /**
     * Index into $tokens of the token that the next moveNext() will load.
     *
     * @var int
     */
    private $position = 0;

    /**
     * How many tokens past $position the next peek() will look.
     *
     * @var int
     */
    private $peek = 0;

    /**
     * The upcoming token, or null when none has been loaded / none is left.
     *
     * @var array|null
     */
    public $lookahead;

    /**
     * The token that was the lookahead before the latest moveNext().
     *
     * @var array|null
     */
    public $token;

    /**
     * Splitting regex, assembled lazily on the first scan() and then reused.
     *
     * @var string
     */
    private $regex;

    /**
     * Replaces the lexer input and tokenizes it right away.
     *
     * Previously scanned tokens are discarded and the cursor state is reset
     * before the new input is scanned.
     *
     * @param string $input The input to be tokenized.
     *
     * @return void
     */
    public function setInput($input)
    {
        $this->input  = $input;
        $this->tokens = [];

        $this->reset();
        $this->scan($input);
    }

    /**
     * Rewinds the cursor state (position, peek, current token and lookahead).
     *
     * The scanned token list itself is kept.
     *
     * @return void
     */
    public function reset()
    {
        $this->position  = 0;
        $this->peek      = 0;
        $this->token     = null;
        $this->lookahead = null;
    }

    /**
     * Rewinds only the peek pointer, so the next peek() starts at the lexer position again.
     *
     * @return void
     */
    public function resetPeek()
    {
        $this->peek = 0;
    }

    /**
     * Moves the token cursor to the given index.
     *
     * Only the position is changed; the current token and lookahead are left as they are.
     *
     * @param int $position Index to continue reading tokens from.
     *
     * @return void
     */
    public function resetPosition($position = 0)
    {
        $this->position = $position;
    }

    /**
     * Returns the leading part of the original input, cut off at the given position.
     *
     * @param int $position Length of the prefix, as understood by substr().
     *
     * @return string
     */
    public function getInputUntilPosition($position)
    {
        $prefix = substr($this->input, 0, $position);

        return $prefix;
    }

    /**
     * Tells whether the lookahead token has the given type.
     *
     * @param int|string $token Token type to compare against (strict comparison).
     *
     * @return bool False when there is no lookahead.
     */
    public function isNextToken($token)
    {
        if ($this->lookahead === null) {
            return false;
        }

        return $this->lookahead['type'] === $token;
    }

    /**
     * Tells whether the lookahead token's type is one of the given types.
     *
     * @param array $tokens Candidate token types (strict comparison).
     *
     * @return bool False when there is no lookahead.
     */
    public function isNextTokenAny(array $tokens)
    {
        if ($this->lookahead === null) {
            return false;
        }

        return in_array($this->lookahead['type'], $tokens, true);
    }

    /**
     * Advances by one token: the lookahead becomes the current token and the
     * token at the cursor (if any) becomes the new lookahead.
     *
     * The peek pointer is reset as part of the move.
     *
     * @return bool True when a new lookahead token is available.
     */
    public function moveNext()
    {
        $this->peek  = 0;
        $this->token = $this->lookahead;

        if (isset($this->tokens[$this->position])) {
            $this->lookahead = $this->tokens[$this->position];
            $this->position++;

            return true;
        }

        $this->lookahead = null;

        return false;
    }

    /**
     * Keeps advancing until the lookahead has the given type or the tokens run out.
     *
     * @param string $type The token type to stop at.
     *
     * @return void
     */
    public function skipUntil($type)
    {
        while ($this->lookahead !== null) {
            if ($this->lookahead['type'] === $type) {
                break;
            }

            $this->moveNext();
        }
    }

    /**
     * Classifies a raw value with getType() and compares the result to a token type.
     *
     * @param mixed      $value Raw value to classify.
     * @param int|string $token Expected token type (strict comparison).
     *
     * @return bool
     */
    public function isA($value, $token)
    {
        // getType() takes its argument by reference; only the local copy can be altered here.
        $detectedType = $this->getType($value);

        return $detectedType === $token;
    }

    /**
     * Returns the next not-yet-peeked token after the lexer position and
     * advances the peek pointer past it.
     *
     * @return array|null The peeked token, or NULL when there is none at that offset
     *                    (the peek pointer is not advanced in that case).
     */
    public function peek()
    {
        $index = $this->position + $this->peek;

        if (! isset($this->tokens[$index])) {
            return null;
        }

        $this->peek++;

        return $this->tokens[$index];
    }

    /**
     * Performs a single peek() and then rewinds the peek pointer.
     *
     * @return array|null The peeked token, or NULL when there is none.
     */
    public function glimpse()
    {
        $peeked     = $this->peek();
        $this->peek = 0;

        return $peeked;
    }

    /**
     * Splits the input into tokens and appends them to the token list.
     *
     * @param string $input The string to tokenize.
     *
     * @return void
     */
    protected function scan($input)
    {
        if ($this->regex === null) {
            // Catchable patterns each get their own capture group so that
            // PREG_SPLIT_DELIM_CAPTURE returns their matches as pieces.
            $catchable    = implode(')|(', $this->getCatchablePatterns());
            $nonCatchable = implode('|', $this->getNonCatchablePatterns());

            $this->regex = sprintf('/(%s)|%s/%s', $catchable, $nonCatchable, $this->getModifiers());
        }

        $pieces = preg_split(
            $this->regex,
            $input,
            -1,
            PREG_SPLIT_OFFSET_CAPTURE | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        if ($pieces === false) {
            // Work around https://bugs.php.net/78122
            $pieces = [[$input, 0]];
        }

        foreach ($pieces as [$value, $offset]) {
            // getType() may rewrite $value through its reference parameter,
            // so it has to run before the token entry is assembled.
            $type = $this->getType($value);

            $this->tokens[] = [
                'value'    => $value,
                'type'     => $type,
                'position' => $offset,
            ];
        }
    }

    /**
     * Looks up a readable name for a token type among the lexer class constants.
     *
     * @param int|string $token Token type to look up.
     *
     * @return int|string "ClassName::CONSTANT" for the first constant strictly equal
     *                    to the token, otherwise the token itself.
     */
    public function getLiteral($token)
    {
        $lexerClass = static::class;
        $reflection = new ReflectionClass($lexerClass);

        foreach ($reflection->getConstants() as $constantName => $constantValue) {
            if ($constantValue !== $token) {
                continue;
            }

            return $lexerClass . '::' . $constantName;
        }

        return $token;
    }

    /**
     * Modifiers appended to the composed splitting regex.
     *
     * @return string
     */
    protected function getModifiers()
    {
        return 'iu';
    }

    /**
     * Regex fragments whose matches are kept as tokens.
     *
     * @return array
     */
    abstract protected function getCatchablePatterns();

    /**
     * Regex fragments that separate tokens without being captured themselves.
     *
     * @return array
     */
    abstract protected function getNonCatchablePatterns();

    /**
     * Determines the type of a token; may also rewrite the value through the reference.
     *
     * @param string $value
     *
     * @return int|string|null
     */
    abstract protected function getType(&$value);
}