1 /*
2 This file is part of KDevelop
3 SPDX-FileCopyrightText: 2008 Niko Sams <niko.sams@gmail.com>
4
5 SPDX-License-Identifier: LGPL-2.0-or-later
6 */
7
8 #include "phplexer.h"
9
10 #include "phpparser.h"
11 #include "tokenstream.h"
12
13 #include <QString>
14 #include <QStringList>
15 #include <QRegExp>
16 #include <QDebug>
17
18 #include "parserdebug.h"
19
20 namespace Php
21 {
22
Lexer(TokenStream * tokenStream,const QString & content,int initialState)23 Lexer::Lexer(TokenStream* tokenStream, const QString& content, int initialState):
24 m_content(content), m_tokenStream(tokenStream),
25 m_curpos(0), m_contentSize(m_content.size()),
26 m_tokenBegin(0), m_tokenEnd(0), m_haltCompiler(0)
27 {
28 pushState(ErrorState);
29 if (initialState == DefaultState) {
30 pushState(HtmlState);
31 }
32 pushState(initialState);
33 }
34
state(int deepness) const35 int Lexer::state(int deepness) const
36 {
37 return m_state.at(m_state.size() - deepness - 1);
38 }
printState()39 void Lexer::printState()
40 {
41 int s = state();
42 if (s == ErrorState)
43 qDebug() << "ErrorState";
44 else if (s == HtmlState)
45 qDebug() << "HtmlState";
46 else if (s == DefaultState)
47 qDebug() << "DefaultState";
48 else if (s == String)
49 qDebug() << "String";
50 else if (s == StringVariable)
51 qDebug() << "StringVariable";
52 else if (s == StringVariableBracket)
53 qDebug() << "StringVariableBracket";
54 else if (s == StringVariableObjectOperator)
55 qDebug() << "StringVariableObjectOperator";
56 else if (s == StringVariableCurly)
57 qDebug() << "StringVariableCurly";
58 else if (s == StringVarname)
59 qDebug() << "StringVarname";
60 else if (s == StringHeredoc)
61 qDebug() << "StringHeredoc";
62 else if (s == StringBacktick)
63 qDebug() << "StringBacktick";
64 }
65
pushState(int state)66 void Lexer::pushState(int state)
67 {
68 m_state.push(state);
69 }
70
popState()71 void Lexer::popState()
72 {
73 m_state.pop();
74 }
75
nextTokenKind()76 int Lexer::nextTokenKind()
77 {
78 int token = Parser::Token_INVALID;
79 if (m_curpos >= m_contentSize) {
80 m_tokenBegin = -1;
81 m_tokenEnd = -1;
82 createNewline(m_curpos);
83 return 0;
84 }
85
86 const QChar* it = m_content.constData();
87 it += m_curpos;
88 m_tokenBegin = m_curpos;
89 switch (state()) {
90 case HtmlState:
91 if (it->unicode() == '<' && (it + 1)->unicode() == '?'
92 ///TODO: per-project configuration to set whether we use shortags
93 /// or not. In the former case we'd need to rise an error here
94 && !( (it + 2)->toLower().unicode() == 'x'
95 && (it + 3)->toLower().unicode() == 'm'
96 && (it + 4)->toLower().unicode() == 'l' ) )
97 {
98 token = Parser::Token_OPEN_TAG;
99 if ((it + 2)->unicode() == '=') {
100 token = Parser::Token_OPEN_TAG_WITH_ECHO;
101 m_curpos++;
102 it++;
103 } else if ((it + 2)->toLower().unicode() == 'p'
104 && (it + 3)->toLower().unicode() == 'h'
105 && (it + 4)->toLower().unicode() == 'p'
106 && (it + 5)->isSpace()) {
107 m_curpos += 4;
108 if ((it + 5)->unicode() == '\n') createNewline(m_curpos + 1);
109 }
110 m_curpos++;
111 pushState(DefaultState);
112 } else {
113 token = Parser::Token_INLINE_HTML;
114 while (m_curpos < m_contentSize) {
115 if (it->unicode() == '\n') createNewline(m_curpos);
116 if ((it + 1)->unicode() == '<' && (it + 2)->unicode() == '?') {
117 break;
118 }
119 it++;
120 m_curpos++;
121 }
122 }
123 break;
124 case DefaultState:
125 case StringVariableCurly: {
126 if (it->isSpace()) {
127 token = Parser::Token_WHITESPACE;
128 while (m_curpos < m_contentSize && it->isSpace()) {
129 if (it->unicode() == '\n') createNewline(m_curpos);
130 it++;
131 m_curpos++;
132 }
133 m_curpos--;
134 } else if (it->isDigit() || (it->unicode() == '.' && (it + 1)->isDigit())) {
135 QString num;bool hasPoint = false;
136 bool hex = false;
137 bool bin = false;
138 if (it->unicode() == '0' && (it + 1)->toLower() == 'x') {
139 it += 2;
140 m_curpos += 2;
141 hex = true;
142 }
143 if (it->unicode() == '0' && (it + 1)->toLower() == 'b') {
144 it += 2;
145 m_curpos += 2;
146 bin = true;
147 }
148 while (m_curpos < m_contentSize && (
149 it->isDigit()
150 || (!hex && !hasPoint && it->unicode() == '.')
151 || (bin && (it->unicode() == '0' || it->unicode() == '1'))
152 || (hex && (it->toLower() == 'a' || it->toLower() == 'b' ||
153 it->toLower() == 'c' || it->toLower() == 'd' ||
154 it->toLower() == 'e' || it->toLower() == 'f')))) {
155 if (it->unicode() == '.') hasPoint = true;
156 num.append(*it);
157 it++;
158 m_curpos++;
159 }
160 if (!hex && !bin && it->toLower() == 'e' &&
161 ((it + 1)->isDigit() ||
162 (((it + 1)->unicode() == '-' || (it + 1)->unicode() == '+') && (it + 2)->isDigit()))) {
163 //exponential number
164 token = Parser::Token_DNUMBER;
165 m_curpos++;
166 it++;
167 if (it->unicode() == '-' || it->unicode() == '+') {
168 it++;
169 m_curpos++;
170 }
171 while (m_curpos < m_contentSize && (it->isDigit())) {
172 it++;
173 m_curpos++;
174 }
175 m_curpos--;
176 } else {
177 m_curpos--;
178 if (hasPoint) {
179 token = Parser::Token_DNUMBER;
180 } else {
181 bool ok;
182 //check if string can be converted to long
183 //if we get an overflow use double
184 num.toLong(&ok, hex ? 16 : 10);
185 if (ok) {
186 token = Parser::Token_LNUMBER;
187 } else {
188 token = Parser::Token_DNUMBER;
189 }
190 }
191 }
192
193 } else if (processVariable(it)) {
194 token = Parser::Token_VARIABLE;
195 } else if (it->unicode() == '$') {
196 //when it was not recognized as variable
197 token = Parser::Token_DOLLAR;
198 } else if (it->unicode() == '}') {
199 token = Parser::Token_RBRACE;
200 if (state() == StringVariableCurly) {
201 popState();
202 }
203 } else if (it->unicode() == '{') {
204 token = Parser::Token_LBRACE;
205 if (state() == StringVariableCurly) {
206 pushState(StringVariableCurly);
207 }
208 } else if (it->unicode() == ')') {
209 token = Parser::Token_RPAREN;
210 } else if (it->unicode() == '(') {
211 it++;
212 int pos = m_curpos + 1;
213 while (pos < m_contentSize && it->isSpace()) {
214 it++;
215 pos++;
216 }
217 const int nameStart = pos;
218 while (pos < m_contentSize && it->isLetter()) {
219 it++;
220 pos++;
221 }
222 const auto name = m_content.midRef(nameStart, pos - nameStart);
223 while (pos < m_contentSize && it->isSpace()) {
224 it++;
225 pos++;
226 }
227 if (it->unicode() == ')') {
228 if (name.compare(QLatin1String("int"), Qt::CaseInsensitive) == 0
229 || name.compare(QLatin1String("integer"), Qt::CaseInsensitive) == 0)
230 {
231 token = Parser::Token_INT_CAST;
232 } else if (name.compare(QLatin1String("real"), Qt::CaseInsensitive) == 0
233 || name.compare(QLatin1String("double"), Qt::CaseInsensitive) == 0
234 || name.compare(QLatin1String("float"), Qt::CaseInsensitive) == 0)
235 {
236 token = Parser::Token_DOUBLE_CAST;
237 } else if (name.compare(QLatin1String("string"), Qt::CaseInsensitive) == 0) {
238 token = Parser::Token_STRING_CAST;
239 } else if (name.compare(QLatin1String("binary"), Qt::CaseInsensitive) == 0) {
240 //as in php
241 token = Parser::Token_STRING_CAST;
242 } else if (name.compare(QLatin1String("array"), Qt::CaseInsensitive) == 0) {
243 token = Parser::Token_ARRAY_CAST;
244 } else if (name.compare(QLatin1String("object"), Qt::CaseInsensitive) == 0) {
245 token = Parser::Token_OBJECT_CAST;
246 } else if (name.compare(QLatin1String("bool"), Qt::CaseInsensitive) == 0
247 || name.compare(QLatin1String("boolean"), Qt::CaseInsensitive) == 0)
248 {
249 token = Parser::Token_BOOL_CAST;
250 } else if (name.compare(QLatin1String("unset"), Qt::CaseInsensitive) == 0) {
251 token = Parser::Token_UNSET_CAST;
252 } else {
253 token = Parser::Token_LPAREN;
254 }
255
256 if (token != Parser::Token_LPAREN) {
257 m_curpos = pos;
258 }
259 } else {
260 token = Parser::Token_LPAREN;
261 }
262 } else if (it->unicode() == ']') {
263 token = Parser::Token_RBRACKET;
264 } else if (it->unicode() == '[') {
265 token = Parser::Token_LBRACKET;
266 } else if (it->unicode() == ',') {
267 token = Parser::Token_COMMA;
268 } else if (it->unicode() == '@') {
269 token = Parser::Token_AT;
270 } else if (it->unicode() == '!') {
271 if ((it + 1)->unicode() == '=') {
272 m_curpos++;
273 if ((it + 2)->unicode() == '=') {
274 m_curpos++;
275 token = Parser::Token_IS_NOT_IDENTICAL;
276 } else {
277 token = Parser::Token_IS_NOT_EQUAL;
278 }
279 } else {
280 token = Parser::Token_BANG;
281 }
282 } else if (it->unicode() == '<') {
283 if ((it + 1)->unicode() == '<') {
284 m_curpos++;
285 if ((it + 2)->unicode() == '<' && state() != StringVariableCurly) {
286 //HEREDOC string (<<< EOD\nfoo\nEOD;\n)
287 int pos = 3;
288 while (m_curpos + pos < m_contentSize &&
289 ((it + pos)->unicode() == ' ' || (it + pos)->unicode() == '\t')) {
290 pos++;
291 }
292 bool isNowdoc = (it + pos)->unicode() == '\'';
293 bool foundQuote = isNowdoc || (it + pos)->unicode() == '"';
294 if (foundQuote) {
295 ++pos;
296 }
297 if ((it + pos)->isLetter() || (it + pos)->unicode() == '_') { //identifier must start with a letter
298 m_hereNowDocIdentifier.clear();
299 while (m_curpos + pos < m_contentSize &&
300 ((it + pos)->isDigit() || (it + pos)->isLetter() || (it + pos)->unicode() == '_')) {
301 m_hereNowDocIdentifier.append(*(it + pos));
302 pos++;
303 }
304 if (foundQuote && (m_curpos + pos) < m_contentSize) {
305 if (isNowdoc && (it+pos)->unicode() == '\'') {
306 ++pos;
307 } else if ((it+pos)->unicode() == '"') {
308 ++pos;
309 }
310 }
311 if (m_curpos + pos < m_contentSize && (it + pos)->unicode() == '\n') {
312 //identifier must be followed by newline, newline is part of HEREDOC token
313 if (isNowdoc) {
314 token = Parser::Token_START_NOWDOC;
315 pushState(StringNowdoc);
316 } else {
317 token = Parser::Token_START_HEREDOC;
318 pushState(StringHeredoc);
319 }
320 m_curpos += pos - 1;
321 createNewline(m_curpos);
322 }
323 }
324 }
325
326 if (token != Parser::Token_START_HEREDOC && token != Parser::Token_START_NOWDOC) {
327 if ((it + 2)->unicode() == '=') {
328 m_curpos++;
329 token = Parser::Token_SL_ASSIGN;
330 } else {
331 token = Parser::Token_SL;
332 }
333 }
334 } else if ((it + 1)->unicode() == '=') {
335 m_curpos++;
336 token = Parser::Token_IS_SMALLER_OR_EQUAL;
337 } else if ((it + 1)->unicode() == '>') {
338 m_curpos++;
339 token = Parser::Token_IS_NOT_EQUAL;
340 } else {
341 token = Parser::Token_IS_SMALLER;
342 }
343 } else if (it->unicode() == '>') {
344 if ((it + 1)->unicode() == '>') {
345 m_curpos++;
346 if ((it + 2)->unicode() == '=') {
347 m_curpos++;
348 token = Parser::Token_SR_ASSIGN;
349 } else {
350 token = Parser::Token_SR;
351 }
352 } else if ((it + 1)->unicode() == '=') {
353 m_curpos++;
354 token = Parser::Token_IS_GREATER_OR_EQUAL;
355 } else {
356 token = Parser::Token_IS_GREATER;
357 }
358 } else if (it->unicode() == '~') {
359 token = Parser::Token_TILDE;
360 } else if (it->unicode() == ':') {
361 if ((it + 1)->unicode() == ':') {
362 m_curpos++;
363 token = Parser::Token_PAAMAYIM_NEKUDOTAYIM;
364 } else {
365 token = Parser::Token_COLON;
366 }
367 } else if (it->unicode() == '?') {
368 if ((it + 1)->unicode() == '>') {
369 //accept CLOSE_TAG inside StringVariableCurly too, as php does
370 token = Parser::Token_CLOSE_TAG;
371 m_curpos++;
372 while (state() != HtmlState) popState();
373 } else {
374 token = Parser::Token_QUESTION;
375 }
376 } else if (it->unicode() == '-' && (it + 1)->unicode() == '>') {
377 m_curpos++;
378 token = Parser::Token_OBJECT_OPERATOR;
379 if (isValidVariableIdentifier(it + 2)) {
380 pushState(StringVariableObjectOperator);
381 }
382 } else if (it->unicode() == '%') {
383 if ((it + 1)->unicode() == '=') {
384 m_curpos++;
385 token = Parser::Token_MOD_ASSIGN;
386 } else {
387 token = Parser::Token_MOD;
388 }
389 } else if (it->unicode() == '/') {
390 if ((it + 1)->unicode() == '=') {
391 m_curpos++;
392 token = Parser::Token_DIV_ASSIGN;
393 } else if ((it + 1)->unicode() == '/') {
394 //accept COMMENT inside StringVariableCurly too, as php does
395 if ((it + 2)->unicode() == '/') {
396 token = Parser::Token_DOC_COMMENT;
397 } else {
398 token = Parser::Token_COMMENT;
399 }
400 while (m_curpos < m_contentSize) {
401 if (m_curpos + 1 < m_contentSize && it->unicode() == '?' && (it + 1)->unicode() == '>') {
402 --it;
403 --m_curpos;
404 break;
405 }
406 if ( it->unicode() == '\n' ) {
407 createNewline(m_curpos);
408 if ( token == Parser::Token_COMMENT ) {
409 break;
410 } else {
411 // lookahead to check whether this doc comment spans multiple lines
412 const QChar* it2 = it + 1;
413 int pos = m_curpos + 1;
414 while ( pos < m_contentSize && (it2)->isSpace() && (it2)->unicode() != '\n' ) {
415 ++it2;
416 ++pos;
417 }
418 if ( it2->unicode() == '/' && (it2 + 1)->unicode() == '/'
419 && (it2 + 2)->unicode() == '/' ) {
420 // seems to be a multi-line doc-comment
421 it = it2 + 2;
422 m_curpos = pos + 2;
423 continue;
424 } else {
425 // not a multi-line doc-comment
426 break;
427 }
428 }
429 }
430 it++;
431 m_curpos++;
432 }
433 } else if ((it + 1)->unicode() == '*') {
434 //accept COMMENT inside StringVariableCurly too, as php does
435 if ((it + 2)->unicode() == '*' && (it + 3)->isSpace()) {
436 token = Parser::Token_DOC_COMMENT;
437 } else {
438 token = Parser::Token_COMMENT;
439 }
440 it += 2;
441 m_curpos += 2;
442 while (m_curpos < m_contentSize && !(it->unicode() == '*' && (it + 1)->unicode() == '/')) {
443 if (it->unicode() == '\n') {
444 createNewline(m_curpos);
445 }
446 it++;
447 m_curpos++;
448 }
449 m_curpos++;
450 } else {
451 token = Parser::Token_DIV;
452 }
453 } else if (it->unicode() == '#') {
454 //accept COMMENT inside StringVariableCurly too, as php does
455 token = Parser::Token_COMMENT;
456 while (m_curpos < m_contentSize) {
457 if (m_curpos + 1 < m_contentSize && it->unicode() == '?' && (it + 1)->unicode() == '>') {
458 --it;
459 --m_curpos;
460 break;
461 }
462 if (it->unicode() == '\n') {
463 createNewline(m_curpos);
464 break;
465 }
466 it++;
467 m_curpos++;
468 }
469 } else if (it->unicode() == '^') {
470 if ((it + 1)->unicode() == '=') {
471 m_curpos++;
472 token = Parser::Token_XOR_ASSIGN;
473 } else {
474 token = Parser::Token_BIT_XOR;
475 }
476 } else if (it->unicode() == '*') {
477 if ((it + 1)->unicode() == '=') {
478 m_curpos++;
479 token = Parser::Token_MUL_ASSIGN;
480 } else if ((it + 1)->unicode() == '*') {
481 m_curpos++;
482 if ((it + 2)->unicode() == '=') {
483 m_curpos++;
484 token = Parser::Token_EXP_ASSIGN;
485 } else {
486 token = Parser::Token_EXP;
487 }
488 } else {
489 token = Parser::Token_MUL;
490 }
491 } else if (it->unicode() == '|') {
492 if ((it + 1)->unicode() == '|') {
493 m_curpos++;
494 token = Parser::Token_BOOLEAN_OR;
495 } else if ((it + 1)->unicode() == '=') {
496 m_curpos++;
497 token = Parser::Token_OR_ASSIGN;
498 } else {
499 token = Parser::Token_BIT_OR;
500 }
501 } else if (it->unicode() == '&') {
502 if ((it + 1)->unicode() == '&') {
503 m_curpos++;
504 token = Parser::Token_BOOLEAN_AND;
505 } else if ((it + 1)->unicode() == '=') {
506 m_curpos++;
507 token = Parser::Token_AND_ASSIGN;
508 } else {
509 token = Parser::Token_BIT_AND;
510 }
511 } else if (it->unicode() == '+') {
512 if ((it + 1)->unicode() == '+') {
513 m_curpos++;
514 token = Parser::Token_INC;
515 } else if ((it + 1)->unicode() == '=') {
516 m_curpos++;
517 token = Parser::Token_PLUS_ASSIGN;
518 } else {
519 token = Parser::Token_PLUS;
520 }
521 } else if (it->unicode() == '-') {
522 if ((it + 1)->unicode() == '-') {
523 m_curpos++;
524 token = Parser::Token_DEC;
525 } else if ((it + 1)->unicode() == '=') {
526 m_curpos++;
527 token = Parser::Token_MINUS_ASSIGN;
528 } else {
529 token = Parser::Token_MINUS;
530 }
531 } else if (it->unicode() == '.') {
532 if ((it + 1)->unicode() == '=') {
533 m_curpos++;
534 token = Parser::Token_CONCAT_ASSIGN;
535 } else {
536 token = Parser::Token_CONCAT;
537 }
538 } else if (it->unicode() == '\\') {
539 token = Parser::Token_BACKSLASH;
540 } else if (it->unicode() == ';') {
541 token = Parser::Token_SEMICOLON;
542 } else if (it->unicode() == '\'') {
543 token = Parser::Token_CONSTANT_ENCAPSED_STRING;
544 it++;
545 m_curpos++;
546 int startPos = m_curpos;
547 while (m_curpos < m_contentSize
548 && (it->unicode() != '\'' || isEscapedWithBackslash(it, m_curpos, startPos))) {
549 if (it->unicode() == '\n') createNewline(m_curpos);
550 it++;
551 m_curpos++;
552 }
553 // if the string is never terminated, make sure we don't overflow the boundaries
554 if ( m_curpos == m_contentSize ) {
555 --m_curpos;
556 }
557 } else if (it->unicode() == '"') {
558 it++;
559 m_curpos++;
560 int stringSize = 0;
561 bool foundVar = false;
562 while (m_curpos + stringSize < m_contentSize
563 && (it->unicode() != '"' || isEscapedWithBackslash(it, m_curpos + stringSize, m_curpos)))
564 {
565 if (it->unicode() == '$' && !isEscapedWithBackslash(it, m_curpos + stringSize, m_curpos)
566 && ((it + 1)->unicode() == '{'
567 || (isValidVariableIdentifier(it + 1) && !(it + 1)->isDigit()))) {
568 foundVar = true;
569 break;
570 }
571 it++;
572 stringSize++;
573 }
574 if (!foundVar) {
575 // if the string is never terminated, make sure we don't overflow the boundaries
576 if ( m_curpos + stringSize == m_contentSize ) {
577 m_curpos--;
578 }
579 token = Parser::Token_CONSTANT_ENCAPSED_STRING;
580 it -= stringSize;
581 for (int j = 0; j < stringSize; j++) {
582 if (it->unicode() == '\n') {
583 createNewline(m_curpos + j);
584 }
585 it++;
586 }
587 m_curpos += stringSize;
588 } else {
589 // properly set the token pos to the starting double quote
590 m_curpos--;
591 token = Parser::Token_DOUBLE_QUOTE;
592 pushState(String);
593 }
594 } else if (it->unicode() == '`') {
595 token = Parser::Token_BACKTICK;
596 pushState(StringBacktick);
597 } else if (it->unicode() == '=') {
598 if ((it + 1)->unicode() == '=') {
599 m_curpos++;
600 if ((it + 2)->unicode() == '=') {
601 m_curpos++;
602 token = Parser::Token_IS_IDENTICAL;
603 } else {
604 token = Parser::Token_IS_EQUAL;
605 }
606 } else if ((it + 1)->unicode() == '>') {
607 m_curpos++;
608 token = Parser::Token_DOUBLE_ARROW;
609 } else {
610 token = Parser::Token_ASSIGN;
611 }
612 } else if (isValidVariableIdentifier(it) && !it->isDigit()) {
613 const int from = m_curpos;
614 while (m_curpos < m_contentSize && (isValidVariableIdentifier(it))) {
615 it++;
616 m_curpos++;
617 }
618 const QStringRef name = m_content.midRef(from, m_curpos - from);
619 m_curpos--;
620 if (name.compare(QLatin1String("echo"), Qt::CaseInsensitive) == 0) {
621 token = Parser::Token_ECHO;
622 } else if (name.compare(QLatin1String("include"), Qt::CaseInsensitive) == 0) {
623 token = Parser::Token_INCLUDE;
624 } else if (name.compare(QLatin1String("include_once"), Qt::CaseInsensitive) == 0) {
625 token = Parser::Token_INCLUDE_ONCE;
626 } else if (name.compare(QLatin1String("require"), Qt::CaseInsensitive) == 0) {
627 token = Parser::Token_REQUIRE;
628 } else if (name.compare(QLatin1String("require_once"), Qt::CaseInsensitive) == 0) {
629 token = Parser::Token_REQUIRE_ONCE;
630 } else if (name.compare(QLatin1String("eval"), Qt::CaseInsensitive) == 0) {
631 token = Parser::Token_EVAL;
632 } else if (name.compare(QLatin1String("print"), Qt::CaseInsensitive) == 0) {
633 token = Parser::Token_PRINT;
634 } else if (name.compare(QLatin1String("abstract"), Qt::CaseInsensitive) == 0) {
635 token = Parser::Token_ABSTRACT;
636 } else if (name.compare(QLatin1String("break"), Qt::CaseInsensitive) == 0) {
637 token = Parser::Token_BREAK;
638 } else if (name.compare(QLatin1String("case"), Qt::CaseInsensitive) == 0) {
639 token = Parser::Token_CASE;
640 } else if (name.compare(QLatin1String("catch"), Qt::CaseInsensitive) == 0) {
641 token = Parser::Token_CATCH;
642 } else if (name.compare(QLatin1String("class"), Qt::CaseInsensitive) == 0) {
643 token = Parser::Token_CLASS;
644 } else if (name.compare(QLatin1String("const"), Qt::CaseInsensitive) == 0) {
645 token = Parser::Token_CONST;
646 } else if (name.compare(QLatin1String("continue"), Qt::CaseInsensitive) == 0) {
647 token = Parser::Token_CONTINUE;
648 } else if (name.compare(QLatin1String("default"), Qt::CaseInsensitive) == 0) {
649 token = Parser::Token_DEFAULT;
650 } else if (name.compare(QLatin1String("do"), Qt::CaseInsensitive) == 0) {
651 token = Parser::Token_DO;
652 } else if (name.compare(QLatin1String("else"), Qt::CaseInsensitive) == 0) {
653 token = Parser::Token_ELSE;
654 } else if (name.compare(QLatin1String("extends"), Qt::CaseInsensitive) == 0) {
655 token = Parser::Token_EXTENDS;
656 } else if (name.compare(QLatin1String("final"), Qt::CaseInsensitive) == 0) {
657 token = Parser::Token_FINAL;
658 } else if (name.compare(QLatin1String("for"), Qt::CaseInsensitive) == 0) {
659 token = Parser::Token_FOR;
660 } else if (name.compare(QLatin1String("if"), Qt::CaseInsensitive) == 0) {
661 token = Parser::Token_IF;
662 } else if (name.compare(QLatin1String("implements"), Qt::CaseInsensitive) == 0) {
663 token = Parser::Token_IMPLEMENTS;
664 } else if (name.compare(QLatin1String("instanceof"), Qt::CaseInsensitive) == 0) {
665 token = Parser::Token_INSTANCEOF;
666 } else if (name.compare(QLatin1String("insteadof"), Qt::CaseInsensitive) == 0) {
667 token = Parser::Token_INSTEADOF;
668 } else if (name.compare(QLatin1String("interface"), Qt::CaseInsensitive) == 0) {
669 token = Parser::Token_INTERFACE;
670 } else if (name.compare(QLatin1String("trait"), Qt::CaseInsensitive) == 0) {
671 token = Parser::Token_TRAIT;
672 } else if (name.compare(QLatin1String("new"), Qt::CaseInsensitive) == 0) {
673 token = Parser::Token_NEW;
674 } else if (name.compare(QLatin1String("private"), Qt::CaseInsensitive) == 0) {
675 token = Parser::Token_PRIVATE;
676 } else if (name.compare(QLatin1String("protected"), Qt::CaseInsensitive) == 0) {
677 token = Parser::Token_PROTECTED;
678 } else if (name.compare(QLatin1String("public"), Qt::CaseInsensitive) == 0) {
679 token = Parser::Token_PUBLIC;
680 } else if (name.compare(QLatin1String("return"), Qt::CaseInsensitive) == 0) {
681 token = Parser::Token_RETURN;
682 } else if (name.compare(QLatin1String("static"), Qt::CaseInsensitive) == 0) {
683 const QChar* lookAhead = it;
684 int pos = m_curpos;
685 while (pos < m_contentSize && lookAhead->isSpace()) {
686 ++lookAhead;
687 ++pos;
688 }
689 if (pos + 1 < m_contentSize && lookAhead->unicode() == ':' && (++lookAhead)->unicode() == ':') {
690 // PHP 5.3 - late static
691 token = Parser::Token_STRING;
692 } else {
693 token = Parser::Token_STATIC;
694 }
695 } else if (name.compare(QLatin1String("switch"), Qt::CaseInsensitive) == 0) {
696 token = Parser::Token_SWITCH;
697 } else if (name.compare(QLatin1String("throw"), Qt::CaseInsensitive) == 0) {
698 token = Parser::Token_THROW;
699 } else if (name.compare(QLatin1String("try"), Qt::CaseInsensitive) == 0) {
700 token = Parser::Token_TRY;
701 } else if (name.compare(QLatin1String("finally"), Qt::CaseInsensitive) == 0) {
702 token = Parser::Token_FINALLY;
703 } else if (name.compare(QLatin1String("while"), Qt::CaseInsensitive) == 0) {
704 token = Parser::Token_WHILE;
705 } else if (name.compare(QLatin1String("clone"), Qt::CaseInsensitive) == 0) {
706 token = Parser::Token_CLONE;
707 } else if (name.compare(QLatin1String("exit"), Qt::CaseInsensitive) == 0 || name.compare(QLatin1String("die"), Qt::CaseInsensitive) == 0) {
708 token = Parser::Token_EXIT;
709 } else if (name.compare(QLatin1String("elseif"), Qt::CaseInsensitive) == 0) {
710 token = Parser::Token_ELSEIF;
711 } else if (name.compare(QLatin1String("endif"), Qt::CaseInsensitive) == 0) {
712 token = Parser::Token_ENDIF;
713 } else if (name.compare(QLatin1String("endwhile"), Qt::CaseInsensitive) == 0) {
714 token = Parser::Token_ENDWHILE;
715 } else if (name.compare(QLatin1String("endfor"), Qt::CaseInsensitive) == 0) {
716 token = Parser::Token_ENDFOR;
717 } else if (name.compare(QLatin1String("foreach"), Qt::CaseInsensitive) == 0) {
718 token = Parser::Token_FOREACH;
719 } else if (name.compare(QLatin1String("endforeach"), Qt::CaseInsensitive) == 0) {
720 token = Parser::Token_ENDFOREACH;
721 } else if (name.compare(QLatin1String("declare"), Qt::CaseInsensitive) == 0) {
722 token = Parser::Token_DECLARE;
723 } else if (name.compare(QLatin1String("enddeclare"), Qt::CaseInsensitive) == 0) {
724 token = Parser::Token_ENDDECLARE;
725 } else if (name.compare(QLatin1String("as"), Qt::CaseInsensitive) == 0) {
726 token = Parser::Token_AS;
727 } else if (name.compare(QLatin1String("endswitch"), Qt::CaseInsensitive) == 0) {
728 token = Parser::Token_ENDSWITCH;
729 } else if (name.compare(QLatin1String("function"), Qt::CaseInsensitive) == 0) {
730 token = Parser::Token_FUNCTION;
731 } else if (name.compare(QLatin1String("use"), Qt::CaseInsensitive) == 0) {
732 token = Parser::Token_USE;
733 } else if (name.compare(QLatin1String("goto"), Qt::CaseInsensitive) == 0) {
734 token = Parser::Token_GOTO;
735 } else if (name.compare(QLatin1String("global"), Qt::CaseInsensitive) == 0) {
736 token = Parser::Token_GLOBAL;
737 } else if (name.compare(QLatin1String("var"), Qt::CaseInsensitive) == 0) {
738 token = Parser::Token_VAR;
739 } else if (name.compare(QLatin1String("unset"), Qt::CaseInsensitive) == 0) {
740 token = Parser::Token_UNSET;
741 } else if (name.compare(QLatin1String("isset"), Qt::CaseInsensitive) == 0) {
742 token = Parser::Token_ISSET;
743 } else if (name.compare(QLatin1String("empty"), Qt::CaseInsensitive) == 0) {
744 token = Parser::Token_EMPTY;
745 } else if (name.compare(QLatin1String("__halt_compiler"), Qt::CaseInsensitive) == 0) {
746 token = Parser::Token_HALT_COMPILER;
747 } else if (name.compare(QLatin1String("list"), Qt::CaseInsensitive) == 0) {
748 token = Parser::Token_LIST;
749 } else if (name.compare(QLatin1String("array"), Qt::CaseInsensitive) == 0) {
750 token = Parser::Token_ARRAY;
751 } else if (name.compare(QLatin1String("__class__"), Qt::CaseInsensitive) == 0) {
752 token = Parser::Token_CLASS_C;
753 } else if (name.compare(QLatin1String("__method__"), Qt::CaseInsensitive) == 0) {
754 token = Parser::Token_METHOD_C;
755 } else if (name.compare(QLatin1String("__function__"), Qt::CaseInsensitive) == 0) {
756 token = Parser::Token_FUNC_C;
757 } else if (name.compare(QLatin1String("__line__"), Qt::CaseInsensitive) == 0) {
758 token = Parser::Token_LINE;
759 } else if (name.compare(QLatin1String("__file__"), Qt::CaseInsensitive) == 0) {
760 token = Parser::Token_FILE;
761 } else if (name.compare(QLatin1String("or"), Qt::CaseInsensitive) == 0) {
762 token = Parser::Token_LOGICAL_OR;
763 } else if (name.compare(QLatin1String("and"), Qt::CaseInsensitive) == 0) {
764 token = Parser::Token_LOGICAL_AND;
765 } else if (name.compare(QLatin1String("xor"), Qt::CaseInsensitive) == 0) {
766 token = Parser::Token_LOGICAL_XOR;
767 } else if (name.compare(QLatin1String("namespace"), Qt::CaseInsensitive) == 0) {
768 token = Parser::Token_NAMESPACE;
769 } else if (name.compare(QLatin1String("__namespace__"), Qt::CaseInsensitive) == 0) {
770 token = Parser::Token_NAMESPACE_C;
771 } else if (name.compare(QLatin1String("callable"), Qt::CaseInsensitive) == 0) {
772 token = Parser::Token_CALLABLE;
773 } else {
774 token = Parser::Token_STRING;
775 }
776 }
777 break;
778 }
779
780 case StringVariable:
781 case String:
782 case StringHeredoc:
783 case StringBacktick:
784 if ((state() == String || state(1) == String) && it->unicode() == '"') {
785 token = Parser::Token_DOUBLE_QUOTE;
786 if (state() == StringVariable) popState();
787 popState();
788 } else if ((state() == StringBacktick || state(1) == StringBacktick) && it->unicode() == '`') {
789 token = Parser::Token_BACKTICK;
790 if (state() == StringVariable) popState();
791 popState();
792 } else if ((state() == StringHeredoc || state(1) == StringHeredoc) && isHereNowDocEnd(it)) {
793 token = Parser::Token_END_HEREDOC;
794 m_curpos += m_hereNowDocIdentifier.length() - 1;
795 if (state() == StringVariable) popState();
796 popState();
797 } else if (processVariable(it)) {
798 token = Parser::Token_VARIABLE;
799 if (state() != StringVariable) pushState(StringVariable);
800 } else if (state() != StringVariable && it->unicode() == '$' && (it + 1)->unicode() == '{') {
801 token = Parser::Token_DOLLAR_OPEN_CURLY_BRACES;
802 m_curpos++;
803 it += 2;
804 //check if a valid variable follows
805 if ((isValidVariableIdentifier(it) && !it->isDigit())) {
806 pushState(StringVarname);
807 }
808
809 } else if (state() == StringVariable && it->unicode() == '[') {
810 token = Parser::Token_LBRACKET;
811 pushState(StringVariableBracket);
812 } else if (state() != StringVariable && it->unicode() == '{' && (it + 1)->unicode() == '$'
813 && ((isValidVariableIdentifier(it + 2) && !(it + 2)->isDigit()) || (it + 2)->unicode() == '{')) {
814 token = Parser::Token_CURLY_OPEN;
815 pushState(StringVariableCurly);
816 } else if (state() == StringVariable
817 && it->unicode() == '-' && (it + 1)->unicode() == '>'
818 && isValidVariableIdentifier(it + 2) && !(it + 2)->isDigit()) {
819 token = Parser::Token_OBJECT_OPERATOR;
820 m_curpos++;
821 pushState(StringVariableObjectOperator);
822 } else {
823 if (state() == StringVariable) popState();
824 token = Parser::Token_ENCAPSED_AND_WHITESPACE;
825 int startPos = m_curpos;
826 while (m_curpos < m_contentSize) {
827 if (!isEscapedWithBackslash(it, m_curpos, startPos) &&
828 ((it->unicode() == '$' && (it + 1)->unicode() == '{') ||
829 (it->unicode() == '{' && (it + 1)->unicode() == '$' && isValidVariableIdentifier(it + 2)) ||
830 (it->unicode() == '$' && isValidVariableIdentifier(it + 1) && !(it + 1)->isDigit()))) {
831 //variable is next ${var} or {$var}
832 break;
833 }
834 if (state() == String && it->unicode() == '"'
835 && !isEscapedWithBackslash(it, m_curpos, startPos)) {
836 //end of string
837 break;
838 }
839 if (state() == StringBacktick && it->unicode() == '`'
840 && !isEscapedWithBackslash(it, m_curpos, startPos)) {
841 //end of string
842 break;
843 }
844
845 if (it->unicode() == '\n') createNewline(m_curpos);
846 m_curpos++;
847 it++;
848
849 if (state() == StringHeredoc && (it - 1)->unicode() == '\n') {
850 //check for end of heredoc (\nEOD;\n)
851 if (state() == StringHeredoc && isHereNowDocEnd(it)) {
852 break;
853 }
854 }
855 }
856 m_curpos--;
857 }
858 break;
859 case StringNowdoc:
860 if (isHereNowDocEnd(it)) {
861 token = Parser::Token_END_NOWDOC;
862 m_curpos += m_hereNowDocIdentifier.length() - 1;
863 popState();
864 } else {
865 token = Parser::Token_STRING;
866 while (m_curpos < m_contentSize) {
867 if (it->unicode() == '\n') createNewline(m_curpos);
868 m_curpos++;
869 it++;
870
871 if ((it - 1)->unicode() == '\n' && isHereNowDocEnd(it)) {
872 //check for end of nowdoc (\nEOD;\n)
873 break;
874 }
875 }
876 m_curpos--;
877 }
878 break;
879 case StringVariableBracket:
880 if (it->unicode() == ']') {
881 token = Parser::Token_RBRACKET;
882 popState();
883 popState();
884 } else if (it->isDigit()) {
885 token = Parser::Token_NUM_STRING;
886 while (m_curpos < m_contentSize && it->isDigit()) {
887 it++;
888 m_curpos++;
889 }
890 m_curpos--;
891 } else {
892 token = Parser::Token_STRING;
893 while (m_curpos < m_contentSize && (it->unicode() != ']')) {
894 if (it->unicode() == '\n') createNewline(m_curpos);
895 it++;
896 m_curpos++;
897 }
898 m_curpos--;
899 }
900 break;
901 case StringVariableObjectOperator:
902 token = Parser::Token_STRING;
903 while (m_curpos < m_contentSize && isValidVariableIdentifier(it)) {
904 it++;
905 m_curpos++;
906 }
907 m_curpos--;
908 popState();
909 if (state() == StringVariable) popState();
910 break;
911 case StringVarname:
912 popState();
913 pushState(StringVariableCurly);
914 token = Parser::Token_STRING_VARNAME;
915 while (m_curpos < m_contentSize && isValidVariableIdentifier(it)) {
916 it++;
917 m_curpos++;
918 }
919 m_curpos--;
920 break;
921 default:
922 token = Parser::Token_INVALID;
923 break;
924 }
925 if (m_curpos > m_contentSize) {
926 m_tokenBegin = -1;
927 m_tokenEnd = -1;
928 return 0;
929 }
930 m_tokenEnd = m_curpos;
931 m_curpos++;
932
933 if (m_haltCompiler) {
934 //look for __halt_compiler(); and stop lexer there
935 if (m_haltCompiler == 4) {
936 token = 0; //EOF
937 } else if (token == Parser::Token_WHITESPACE || token == Parser::Token_COMMENT || token == Parser::Token_DOC_COMMENT) {
938 //ignore
939 } else if (m_haltCompiler == 1 && token == Parser::Token_LPAREN) {
940 m_haltCompiler++;
941 } else if (m_haltCompiler == 2 && token == Parser::Token_RPAREN) {
942 m_haltCompiler++;
943 } else if (m_haltCompiler == 3 && token == Parser::Token_SEMICOLON) {
944 m_haltCompiler++;
945 } else {
946 m_haltCompiler = 0;
947 }
948 }
949 if (token == Parser::Token_HALT_COMPILER && !m_haltCompiler) {
950 m_haltCompiler = 1;
951 }
952 return token;
953 }
954
tokenBegin() const955 qint64 Lexer::tokenBegin() const
956 {
957 return m_tokenBegin;
958 }
959
tokenEnd() const960 qint64 Lexer::tokenEnd() const
961 {
962 return m_tokenEnd;
963 }
964
isHereNowDocEnd(const QChar * it)965 bool Lexer::isHereNowDocEnd(const QChar* it)
966 {
967 int identiferLen = m_hereNowDocIdentifier.length();
968 QString lineStart;
969 for (int i = 0; i < identiferLen; i++) {
970 if (m_curpos + i >= m_contentSize) break;
971 lineStart.append(*(it + i));
972 }
973 if (lineStart == m_hereNowDocIdentifier &&
974 ((it + identiferLen)->unicode() == '\n'
975 || ((it + identiferLen)->unicode() == ';' &&
976 (it + identiferLen + 1)->unicode() == '\n'))) {
977 return true;
978 }
979 return false;
980 }
981
982 //used for strings, to check if " is escaped (\" is, \\" not)
isEscapedWithBackslash(const QChar * it,int curPos,int startPos)983 bool Lexer::isEscapedWithBackslash(const QChar* it, int curPos, int startPos)
984 {
985 int cnt = 0;
986 it--;
987 while (curPos > startPos && it->unicode() == '\\') {
988 cnt++;
989 it--;
990 }
991 return (cnt % 2) == 1;
992 }
993
processVariable(const QChar * it)994 bool Lexer::processVariable(const QChar* it)
995 {
996 const QChar* c2 = it + 1;
997 if (it->unicode() == '$' && (isValidVariableIdentifier(c2) && !c2->isDigit())) {
998 it++;
999 m_curpos++;
1000 while (m_curpos < m_contentSize
1001 && (isValidVariableIdentifier(it))) {
1002 it++;
1003 m_curpos++;
1004 }
1005 m_curpos--;
1006 return true;
1007 } else {
1008 return false;
1009 }
1010 }
isValidVariableIdentifier(const QChar * it)1011 bool Lexer::isValidVariableIdentifier(const QChar* it)
1012 {
1013 return it->isLetter() || it->isDigit() || it->unicode() == '_' || it->unicode() > 0x7f;
1014 }
1015
createNewline(int pos)1016 void Lexer::createNewline(int pos)
1017 {
1018 if (m_tokenStream) m_tokenStream->locationTable()->newline(pos);
1019 }
1020
1021 }
1022
1023