1 /*
2     This file is part of KDevelop
3     SPDX-FileCopyrightText: 2008 Niko Sams <niko.sams@gmail.com>
4 
5     SPDX-License-Identifier: LGPL-2.0-or-later
6 */
7 
8 #include "phplexer.h"
9 
10 #include "phpparser.h"
11 #include "tokenstream.h"
12 
13 #include <QString>
14 #include <QStringList>
15 #include <QRegExp>
16 #include <QDebug>
17 
18 #include "parserdebug.h"
19 
20 namespace Php
21 {
22 
Lexer(TokenStream * tokenStream,const QString & content,int initialState)23 Lexer::Lexer(TokenStream* tokenStream, const QString& content, int initialState):
24         m_content(content), m_tokenStream(tokenStream),
25         m_curpos(0), m_contentSize(m_content.size()),
26         m_tokenBegin(0), m_tokenEnd(0), m_haltCompiler(0)
27 {
28     pushState(ErrorState);
29     if (initialState == DefaultState) {
30         pushState(HtmlState);
31     }
32     pushState(initialState);
33 }
34 
state(int deepness) const35 int Lexer::state(int deepness) const
36 {
37     return m_state.at(m_state.size() - deepness - 1);
38 }
printState()39 void Lexer::printState()
40 {
41     int s = state();
42     if (s == ErrorState)
43         qDebug() << "ErrorState";
44     else if (s == HtmlState)
45         qDebug() << "HtmlState";
46     else if (s == DefaultState)
47         qDebug() << "DefaultState";
48     else if (s == String)
49         qDebug() << "String";
50     else if (s == StringVariable)
51         qDebug() << "StringVariable";
52     else if (s == StringVariableBracket)
53         qDebug() << "StringVariableBracket";
54     else if (s == StringVariableObjectOperator)
55         qDebug() << "StringVariableObjectOperator";
56     else if (s == StringVariableCurly)
57         qDebug() << "StringVariableCurly";
58     else if (s == StringVarname)
59         qDebug() << "StringVarname";
60     else if (s == StringHeredoc)
61         qDebug() << "StringHeredoc";
62     else if (s == StringBacktick)
63         qDebug() << "StringBacktick";
64 }
65 
pushState(int state)66 void Lexer::pushState(int state)
67 {
68     m_state.push(state);
69 }
70 
popState()71 void Lexer::popState()
72 {
73     m_state.pop();
74 }
75 
nextTokenKind()76 int Lexer::nextTokenKind()
77 {
78     int token = Parser::Token_INVALID;
79     if (m_curpos >= m_contentSize) {
80         m_tokenBegin = -1;
81         m_tokenEnd = -1;
82         createNewline(m_curpos);
83         return 0;
84     }
85 
86     const QChar* it = m_content.constData();
87     it += m_curpos;
88     m_tokenBegin = m_curpos;
89     switch (state()) {
90     case HtmlState:
91         if (it->unicode() == '<' && (it + 1)->unicode() == '?'
92             ///TODO: per-project configuration to set whether we use shortags
93             ///      or not. In the former case we'd need to rise an error here
94             && !( (it + 2)->toLower().unicode() == 'x'
95                  && (it + 3)->toLower().unicode() == 'm'
96                  && (it + 4)->toLower().unicode() == 'l' ) )
97         {
98             token = Parser::Token_OPEN_TAG;
99             if ((it + 2)->unicode() == '=') {
100                 token = Parser::Token_OPEN_TAG_WITH_ECHO;
101                 m_curpos++;
102                 it++;
103             } else if ((it + 2)->toLower().unicode() == 'p'
104                     && (it + 3)->toLower().unicode() == 'h'
105                     && (it + 4)->toLower().unicode() == 'p'
106                     && (it + 5)->isSpace()) {
107                 m_curpos += 4;
108                 if ((it + 5)->unicode() == '\n') createNewline(m_curpos + 1);
109             }
110             m_curpos++;
111             pushState(DefaultState);
112         } else {
113             token = Parser::Token_INLINE_HTML;
114             while (m_curpos < m_contentSize) {
115                 if (it->unicode() == '\n') createNewline(m_curpos);
116                 if ((it + 1)->unicode() == '<' && (it + 2)->unicode() == '?') {
117                     break;
118                 }
119                 it++;
120                 m_curpos++;
121             }
122         }
123         break;
124     case DefaultState:
125     case StringVariableCurly: {
126         if (it->isSpace()) {
127             token = Parser::Token_WHITESPACE;
128             while (m_curpos < m_contentSize && it->isSpace()) {
129                 if (it->unicode() == '\n') createNewline(m_curpos);
130                 it++;
131                 m_curpos++;
132             }
133             m_curpos--;
134         } else if (it->isDigit() || (it->unicode() == '.' && (it + 1)->isDigit())) {
135             QString num;bool hasPoint = false;
136             bool hex = false;
137             bool bin = false;
138             if (it->unicode() == '0' && (it + 1)->toLower() == 'x') {
139                 it += 2;
140                 m_curpos += 2;
141                 hex = true;
142             }
143             if (it->unicode() == '0' && (it + 1)->toLower() == 'b') {
144                 it += 2;
145                 m_curpos += 2;
146                 bin = true;
147             }
148             while (m_curpos < m_contentSize && (
149                         it->isDigit()
150                         || (!hex && !hasPoint && it->unicode() == '.')
151                         || (bin && (it->unicode() == '0' || it->unicode() == '1'))
152                         || (hex && (it->toLower() == 'a' || it->toLower() == 'b' ||
153                                     it->toLower() == 'c' || it->toLower() == 'd' ||
154                                     it->toLower() == 'e' || it->toLower() == 'f')))) {
155                 if (it->unicode() == '.') hasPoint = true;
156                 num.append(*it);
157                 it++;
158                 m_curpos++;
159             }
160             if (!hex && !bin && it->toLower() == 'e' &&
161                     ((it + 1)->isDigit() ||
162                      (((it + 1)->unicode() == '-' || (it + 1)->unicode() == '+') && (it + 2)->isDigit()))) {
163                 //exponential number
164                 token = Parser::Token_DNUMBER;
165                 m_curpos++;
166                 it++;
167                 if (it->unicode() == '-' || it->unicode() == '+') {
168                     it++;
169                     m_curpos++;
170                 }
171                 while (m_curpos < m_contentSize && (it->isDigit())) {
172                     it++;
173                     m_curpos++;
174                 }
175                 m_curpos--;
176             } else {
177                 m_curpos--;
178                 if (hasPoint) {
179                     token = Parser::Token_DNUMBER;
180                 } else {
181                     bool ok;
182                     //check if string can be converted to long
183                     //if we get an overflow use double
184                     num.toLong(&ok, hex ? 16 : 10);
185                     if (ok) {
186                         token = Parser::Token_LNUMBER;
187                     } else {
188                         token = Parser::Token_DNUMBER;
189                     }
190                 }
191             }
192 
193         } else if (processVariable(it)) {
194             token = Parser::Token_VARIABLE;
195         } else if (it->unicode() == '$') {
196             //when it was not recognized as variable
197             token = Parser::Token_DOLLAR;
198         } else if (it->unicode() == '}') {
199             token = Parser::Token_RBRACE;
200             if (state() == StringVariableCurly) {
201                 popState();
202             }
203         } else if (it->unicode() == '{') {
204             token = Parser::Token_LBRACE;
205             if (state() == StringVariableCurly) {
206                 pushState(StringVariableCurly);
207             }
208         } else if (it->unicode() == ')') {
209             token = Parser::Token_RPAREN;
210         } else if (it->unicode() == '(') {
211             it++;
212             int pos = m_curpos + 1;
213             while (pos < m_contentSize && it->isSpace()) {
214                 it++;
215                 pos++;
216             }
217             const int nameStart = pos;
218             while (pos < m_contentSize && it->isLetter()) {
219                 it++;
220                 pos++;
221             }
222             const auto name = m_content.midRef(nameStart, pos - nameStart);
223             while (pos < m_contentSize && it->isSpace()) {
224                 it++;
225                 pos++;
226             }
227             if (it->unicode() == ')') {
228                 if (name.compare(QLatin1String("int"), Qt::CaseInsensitive) == 0
229                     || name.compare(QLatin1String("integer"), Qt::CaseInsensitive) == 0)
230                 {
231                     token = Parser::Token_INT_CAST;
232                 } else if (name.compare(QLatin1String("real"), Qt::CaseInsensitive) == 0
233                     || name.compare(QLatin1String("double"), Qt::CaseInsensitive) == 0
234                     || name.compare(QLatin1String("float"), Qt::CaseInsensitive) == 0)
235                 {
236                     token = Parser::Token_DOUBLE_CAST;
237                 } else if (name.compare(QLatin1String("string"), Qt::CaseInsensitive) == 0) {
238                     token = Parser::Token_STRING_CAST;
239                 } else if (name.compare(QLatin1String("binary"), Qt::CaseInsensitive) == 0) {
240                     //as in php
241                     token = Parser::Token_STRING_CAST;
242                 } else if (name.compare(QLatin1String("array"), Qt::CaseInsensitive) == 0) {
243                     token = Parser::Token_ARRAY_CAST;
244                 } else if (name.compare(QLatin1String("object"), Qt::CaseInsensitive) == 0) {
245                     token = Parser::Token_OBJECT_CAST;
246                 } else if (name.compare(QLatin1String("bool"), Qt::CaseInsensitive) == 0
247                     || name.compare(QLatin1String("boolean"), Qt::CaseInsensitive) == 0)
248                 {
249                     token = Parser::Token_BOOL_CAST;
250                 } else if (name.compare(QLatin1String("unset"), Qt::CaseInsensitive) == 0) {
251                     token = Parser::Token_UNSET_CAST;
252                 } else {
253                     token = Parser::Token_LPAREN;
254                 }
255 
256                 if (token != Parser::Token_LPAREN) {
257                     m_curpos = pos;
258                 }
259             } else {
260                 token = Parser::Token_LPAREN;
261             }
262         } else if (it->unicode() == ']') {
263             token = Parser::Token_RBRACKET;
264         } else if (it->unicode() == '[') {
265             token = Parser::Token_LBRACKET;
266         } else if (it->unicode() == ',') {
267             token = Parser::Token_COMMA;
268         } else if (it->unicode() == '@') {
269             token = Parser::Token_AT;
270         } else if (it->unicode() == '!') {
271             if ((it + 1)->unicode() == '=') {
272                 m_curpos++;
273                 if ((it + 2)->unicode() == '=') {
274                     m_curpos++;
275                     token = Parser::Token_IS_NOT_IDENTICAL;
276                 } else {
277                     token = Parser::Token_IS_NOT_EQUAL;
278                 }
279             } else {
280                 token = Parser::Token_BANG;
281             }
282         } else if (it->unicode() == '<') {
283             if ((it + 1)->unicode() == '<') {
284                 m_curpos++;
285                 if ((it + 2)->unicode() == '<' && state() != StringVariableCurly) {
286                     //HEREDOC string (<<< EOD\nfoo\nEOD;\n)
287                     int pos = 3;
288                     while (m_curpos + pos < m_contentSize &&
289                             ((it + pos)->unicode() == ' ' || (it + pos)->unicode() == '\t')) {
290                         pos++;
291                     }
292                     bool isNowdoc = (it + pos)->unicode() == '\'';
293                     bool foundQuote = isNowdoc || (it + pos)->unicode() == '"';
294                     if (foundQuote) {
295                         ++pos;
296                     }
297                     if ((it + pos)->isLetter() || (it + pos)->unicode() == '_') { //identifier must start with a letter
298                         m_hereNowDocIdentifier.clear();
299                         while (m_curpos + pos < m_contentSize &&
300                                 ((it + pos)->isDigit() || (it + pos)->isLetter() || (it + pos)->unicode() == '_')) {
301                             m_hereNowDocIdentifier.append(*(it + pos));
302                             pos++;
303                         }
304                         if (foundQuote && (m_curpos + pos) < m_contentSize) {
305                             if (isNowdoc && (it+pos)->unicode() == '\'') {
306                                 ++pos;
307                             } else if ((it+pos)->unicode() == '"') {
308                                 ++pos;
309                             }
310                         }
311                         if (m_curpos + pos < m_contentSize && (it + pos)->unicode() == '\n') {
312                             //identifier must be followed by newline, newline is part of HEREDOC token
313                             if (isNowdoc) {
314                                 token = Parser::Token_START_NOWDOC;
315                                 pushState(StringNowdoc);
316                             } else {
317                                 token = Parser::Token_START_HEREDOC;
318                                 pushState(StringHeredoc);
319                             }
320                             m_curpos += pos - 1;
321                             createNewline(m_curpos);
322                         }
323                     }
324                 }
325 
326                 if (token != Parser::Token_START_HEREDOC && token != Parser::Token_START_NOWDOC) {
327                     if ((it + 2)->unicode() == '=') {
328                         m_curpos++;
329                         token = Parser::Token_SL_ASSIGN;
330                     } else {
331                         token = Parser::Token_SL;
332                     }
333                 }
334             } else if ((it + 1)->unicode() == '=') {
335                 m_curpos++;
336                 token = Parser::Token_IS_SMALLER_OR_EQUAL;
337             } else if ((it + 1)->unicode() == '>') {
338                 m_curpos++;
339                 token = Parser::Token_IS_NOT_EQUAL;
340             } else {
341                 token = Parser::Token_IS_SMALLER;
342             }
343         } else if (it->unicode() == '>') {
344             if ((it + 1)->unicode() == '>') {
345                 m_curpos++;
346                 if ((it + 2)->unicode() == '=') {
347                     m_curpos++;
348                     token = Parser::Token_SR_ASSIGN;
349                 } else {
350                     token = Parser::Token_SR;
351                 }
352             } else if ((it + 1)->unicode() == '=') {
353                 m_curpos++;
354                 token = Parser::Token_IS_GREATER_OR_EQUAL;
355             } else {
356                 token = Parser::Token_IS_GREATER;
357             }
358         } else if (it->unicode() == '~') {
359             token = Parser::Token_TILDE;
360         } else if (it->unicode() == ':') {
361             if ((it + 1)->unicode() == ':') {
362                 m_curpos++;
363                 token = Parser::Token_PAAMAYIM_NEKUDOTAYIM;
364             } else {
365                 token = Parser::Token_COLON;
366             }
367         } else if (it->unicode() == '?') {
368             if ((it + 1)->unicode() == '>') {
369                 //accept CLOSE_TAG inside StringVariableCurly too, as php does
370                 token = Parser::Token_CLOSE_TAG;
371                 m_curpos++;
372                 while (state() != HtmlState) popState();
373             } else {
374                 token = Parser::Token_QUESTION;
375             }
376         } else if (it->unicode() == '-' && (it + 1)->unicode() == '>') {
377             m_curpos++;
378             token = Parser::Token_OBJECT_OPERATOR;
379             if (isValidVariableIdentifier(it + 2)) {
380                 pushState(StringVariableObjectOperator);
381             }
382         } else if (it->unicode() == '%') {
383             if ((it + 1)->unicode() == '=') {
384                 m_curpos++;
385                 token = Parser::Token_MOD_ASSIGN;
386             } else {
387                 token = Parser::Token_MOD;
388             }
389         } else if (it->unicode() == '/') {
390             if ((it + 1)->unicode() == '=') {
391                 m_curpos++;
392                 token = Parser::Token_DIV_ASSIGN;
393             } else if ((it + 1)->unicode() == '/') {
394                 //accept COMMENT inside StringVariableCurly too, as php does
395                 if ((it + 2)->unicode() == '/') {
396                     token = Parser::Token_DOC_COMMENT;
397                 } else {
398                     token = Parser::Token_COMMENT;
399                 }
400                 while (m_curpos < m_contentSize) {
401                     if (m_curpos + 1 < m_contentSize && it->unicode() == '?' && (it + 1)->unicode() == '>') {
402                         --it;
403                         --m_curpos;
404                         break;
405                     }
406                     if ( it->unicode() == '\n' ) {
407                         createNewline(m_curpos);
408                         if ( token == Parser::Token_COMMENT ) {
409                             break;
410                         } else {
411                             // lookahead to check whether this doc comment spans multiple lines
412                             const QChar* it2 = it + 1;
413                             int pos = m_curpos + 1;
414                             while ( pos < m_contentSize && (it2)->isSpace() && (it2)->unicode() != '\n' ) {
415                                 ++it2;
416                                 ++pos;
417                             }
418                             if ( it2->unicode() == '/' && (it2 + 1)->unicode() == '/'
419                                  && (it2 + 2)->unicode() == '/' ) {
420                                 // seems to be a multi-line doc-comment
421                                 it = it2 + 2;
422                                 m_curpos = pos + 2;
423                                 continue;
424                             } else {
425                                 // not a multi-line doc-comment
426                                 break;
427                             }
428                         }
429                     }
430                     it++;
431                     m_curpos++;
432                 }
433             } else if ((it + 1)->unicode() == '*') {
434                 //accept COMMENT inside StringVariableCurly too, as php does
435                 if ((it + 2)->unicode() == '*' && (it + 3)->isSpace()) {
436                     token = Parser::Token_DOC_COMMENT;
437                 } else {
438                     token = Parser::Token_COMMENT;
439                 }
440                 it += 2;
441                 m_curpos += 2;
442                 while (m_curpos < m_contentSize && !(it->unicode() == '*' && (it + 1)->unicode() == '/')) {
443                     if (it->unicode() == '\n') {
444                         createNewline(m_curpos);
445                     }
446                     it++;
447                     m_curpos++;
448                 }
449                 m_curpos++;
450             } else {
451                 token = Parser::Token_DIV;
452             }
453         } else if (it->unicode() == '#') {
454             //accept COMMENT inside StringVariableCurly too, as php does
455             token = Parser::Token_COMMENT;
456             while (m_curpos < m_contentSize) {
457                 if (m_curpos + 1 < m_contentSize && it->unicode() == '?' && (it + 1)->unicode() == '>') {
458                     --it;
459                     --m_curpos;
460                     break;
461                 }
462                 if (it->unicode() == '\n') {
463                     createNewline(m_curpos);
464                     break;
465                 }
466                 it++;
467                 m_curpos++;
468             }
469         } else if (it->unicode() == '^') {
470             if ((it + 1)->unicode() == '=') {
471                 m_curpos++;
472                 token = Parser::Token_XOR_ASSIGN;
473             } else {
474                 token = Parser::Token_BIT_XOR;
475             }
476         } else if (it->unicode() == '*') {
477             if ((it + 1)->unicode() == '=') {
478                 m_curpos++;
479                 token = Parser::Token_MUL_ASSIGN;
480             } else if ((it + 1)->unicode() == '*') {
481                 m_curpos++;
482                 if ((it + 2)->unicode() == '=') {
483                     m_curpos++;
484                     token = Parser::Token_EXP_ASSIGN;
485                 } else {
486                     token = Parser::Token_EXP;
487                 }
488             } else {
489                 token = Parser::Token_MUL;
490             }
491         } else if (it->unicode() == '|') {
492             if ((it + 1)->unicode() == '|') {
493                 m_curpos++;
494                 token = Parser::Token_BOOLEAN_OR;
495             } else if ((it + 1)->unicode() == '=') {
496                 m_curpos++;
497                 token = Parser::Token_OR_ASSIGN;
498             } else {
499                 token = Parser::Token_BIT_OR;
500             }
501         } else if (it->unicode() == '&') {
502             if ((it + 1)->unicode() == '&') {
503                 m_curpos++;
504                 token = Parser::Token_BOOLEAN_AND;
505             } else if ((it + 1)->unicode() == '=') {
506                 m_curpos++;
507                 token = Parser::Token_AND_ASSIGN;
508             } else {
509                 token = Parser::Token_BIT_AND;
510             }
511         } else if (it->unicode() == '+') {
512             if ((it + 1)->unicode() == '+') {
513                 m_curpos++;
514                 token = Parser::Token_INC;
515             } else if ((it + 1)->unicode() == '=') {
516                 m_curpos++;
517                 token = Parser::Token_PLUS_ASSIGN;
518             } else {
519                 token = Parser::Token_PLUS;
520             }
521         } else if (it->unicode() == '-') {
522             if ((it + 1)->unicode() == '-') {
523                 m_curpos++;
524                 token = Parser::Token_DEC;
525             } else if ((it + 1)->unicode() == '=') {
526                 m_curpos++;
527                 token = Parser::Token_MINUS_ASSIGN;
528             } else {
529                 token = Parser::Token_MINUS;
530             }
531         } else if (it->unicode() == '.') {
532             if ((it + 1)->unicode() == '=') {
533                 m_curpos++;
534                 token = Parser::Token_CONCAT_ASSIGN;
535             } else {
536                 token = Parser::Token_CONCAT;
537             }
538         } else if (it->unicode() == '\\') {
539             token = Parser::Token_BACKSLASH;
540         } else if (it->unicode() == ';') {
541             token = Parser::Token_SEMICOLON;
542         } else if (it->unicode() == '\'') {
543             token = Parser::Token_CONSTANT_ENCAPSED_STRING;
544             it++;
545             m_curpos++;
546             int startPos = m_curpos;
547             while (m_curpos < m_contentSize
548                     && (it->unicode() != '\'' || isEscapedWithBackslash(it, m_curpos, startPos))) {
549                 if (it->unicode() == '\n') createNewline(m_curpos);
550                 it++;
551                 m_curpos++;
552             }
553             // if the string is never terminated, make sure we don't overflow the boundaries
554             if ( m_curpos == m_contentSize ) {
555                 --m_curpos;
556             }
557         } else if (it->unicode() == '"') {
558             it++;
559             m_curpos++;
560             int stringSize = 0;
561             bool foundVar = false;
562             while (m_curpos + stringSize < m_contentSize
563                     && (it->unicode() != '"' || isEscapedWithBackslash(it, m_curpos + stringSize, m_curpos)))
564             {
565                 if (it->unicode() == '$'  && !isEscapedWithBackslash(it, m_curpos + stringSize, m_curpos)
566                         && ((it + 1)->unicode() == '{'
567                             || (isValidVariableIdentifier(it + 1) && !(it + 1)->isDigit()))) {
568                     foundVar = true;
569                     break;
570                 }
571                 it++;
572                 stringSize++;
573             }
574             if (!foundVar) {
575                 // if the string is never terminated, make sure we don't overflow the boundaries
576                 if ( m_curpos + stringSize == m_contentSize ) {
577                     m_curpos--;
578                 }
579                 token = Parser::Token_CONSTANT_ENCAPSED_STRING;
580                 it -= stringSize;
581                 for (int j = 0; j < stringSize; j++) {
582                     if (it->unicode() == '\n') {
583                         createNewline(m_curpos + j);
584                     }
585                     it++;
586                 }
587                 m_curpos += stringSize;
588             } else {
589                 // properly set the token pos to the starting double quote
590                 m_curpos--;
591                 token = Parser::Token_DOUBLE_QUOTE;
592                 pushState(String);
593             }
594         } else if (it->unicode() == '`') {
595             token = Parser::Token_BACKTICK;
596             pushState(StringBacktick);
597         } else if (it->unicode() == '=') {
598             if ((it + 1)->unicode() == '=') {
599                 m_curpos++;
600                 if ((it + 2)->unicode() == '=') {
601                     m_curpos++;
602                     token = Parser::Token_IS_IDENTICAL;
603                 } else {
604                     token = Parser::Token_IS_EQUAL;
605                 }
606             } else if ((it + 1)->unicode() == '>') {
607                 m_curpos++;
608                 token = Parser::Token_DOUBLE_ARROW;
609             } else {
610                 token = Parser::Token_ASSIGN;
611             }
612         } else if (isValidVariableIdentifier(it) && !it->isDigit()) {
613             const int from = m_curpos;
614             while (m_curpos < m_contentSize && (isValidVariableIdentifier(it))) {
615                 it++;
616                 m_curpos++;
617             }
618             const QStringRef name = m_content.midRef(from, m_curpos - from);
619             m_curpos--;
620             if (name.compare(QLatin1String("echo"), Qt::CaseInsensitive) == 0) {
621                 token = Parser::Token_ECHO;
622             } else if (name.compare(QLatin1String("include"), Qt::CaseInsensitive) == 0) {
623                 token = Parser::Token_INCLUDE;
624             } else if (name.compare(QLatin1String("include_once"), Qt::CaseInsensitive) == 0) {
625                 token = Parser::Token_INCLUDE_ONCE;
626             } else if (name.compare(QLatin1String("require"), Qt::CaseInsensitive) == 0) {
627                 token = Parser::Token_REQUIRE;
628             } else if (name.compare(QLatin1String("require_once"), Qt::CaseInsensitive) == 0) {
629                 token = Parser::Token_REQUIRE_ONCE;
630             } else if (name.compare(QLatin1String("eval"), Qt::CaseInsensitive) == 0) {
631                 token = Parser::Token_EVAL;
632             } else if (name.compare(QLatin1String("print"), Qt::CaseInsensitive) == 0) {
633                 token = Parser::Token_PRINT;
634             } else if (name.compare(QLatin1String("abstract"), Qt::CaseInsensitive) == 0) {
635                 token = Parser::Token_ABSTRACT;
636             } else if (name.compare(QLatin1String("break"), Qt::CaseInsensitive) == 0) {
637                 token = Parser::Token_BREAK;
638             } else if (name.compare(QLatin1String("case"), Qt::CaseInsensitive) == 0) {
639                 token = Parser::Token_CASE;
640             } else if (name.compare(QLatin1String("catch"), Qt::CaseInsensitive) == 0) {
641                 token = Parser::Token_CATCH;
642             } else if (name.compare(QLatin1String("class"), Qt::CaseInsensitive) == 0) {
643                 token = Parser::Token_CLASS;
644             } else if (name.compare(QLatin1String("const"), Qt::CaseInsensitive) == 0) {
645                 token = Parser::Token_CONST;
646             } else if (name.compare(QLatin1String("continue"), Qt::CaseInsensitive) == 0) {
647                 token = Parser::Token_CONTINUE;
648             } else if (name.compare(QLatin1String("default"), Qt::CaseInsensitive) == 0) {
649                 token = Parser::Token_DEFAULT;
650             } else if (name.compare(QLatin1String("do"), Qt::CaseInsensitive) == 0) {
651                 token = Parser::Token_DO;
652             } else if (name.compare(QLatin1String("else"), Qt::CaseInsensitive) == 0) {
653                 token = Parser::Token_ELSE;
654             } else if (name.compare(QLatin1String("extends"), Qt::CaseInsensitive) == 0) {
655                 token = Parser::Token_EXTENDS;
656             } else if (name.compare(QLatin1String("final"), Qt::CaseInsensitive) == 0) {
657                 token = Parser::Token_FINAL;
658             } else if (name.compare(QLatin1String("for"), Qt::CaseInsensitive) == 0) {
659                 token = Parser::Token_FOR;
660             } else if (name.compare(QLatin1String("if"), Qt::CaseInsensitive) == 0) {
661                 token = Parser::Token_IF;
662             } else if (name.compare(QLatin1String("implements"), Qt::CaseInsensitive) == 0) {
663                 token = Parser::Token_IMPLEMENTS;
664             } else if (name.compare(QLatin1String("instanceof"), Qt::CaseInsensitive) == 0) {
665                 token = Parser::Token_INSTANCEOF;
666             } else if (name.compare(QLatin1String("insteadof"), Qt::CaseInsensitive) == 0) {
667                 token = Parser::Token_INSTEADOF;
668             } else if (name.compare(QLatin1String("interface"), Qt::CaseInsensitive) == 0) {
669                 token = Parser::Token_INTERFACE;
670             } else if (name.compare(QLatin1String("trait"), Qt::CaseInsensitive) == 0) {
671                 token = Parser::Token_TRAIT;
672             } else if (name.compare(QLatin1String("new"), Qt::CaseInsensitive) == 0) {
673                 token = Parser::Token_NEW;
674             } else if (name.compare(QLatin1String("private"), Qt::CaseInsensitive) == 0) {
675                 token = Parser::Token_PRIVATE;
676             } else if (name.compare(QLatin1String("protected"), Qt::CaseInsensitive) == 0) {
677                 token = Parser::Token_PROTECTED;
678             } else if (name.compare(QLatin1String("public"), Qt::CaseInsensitive) == 0) {
679                 token = Parser::Token_PUBLIC;
680             } else if (name.compare(QLatin1String("return"), Qt::CaseInsensitive) == 0) {
681                 token = Parser::Token_RETURN;
682             } else if (name.compare(QLatin1String("static"), Qt::CaseInsensitive) == 0) {
683                 const QChar* lookAhead = it;
684                 int pos = m_curpos;
685                 while (pos < m_contentSize && lookAhead->isSpace()) {
686                     ++lookAhead;
687                     ++pos;
688                 }
689                 if (pos + 1 < m_contentSize && lookAhead->unicode() == ':' && (++lookAhead)->unicode() == ':') {
690                     // PHP 5.3 - late static
691                     token = Parser::Token_STRING;
692                 } else {
693                     token = Parser::Token_STATIC;
694                 }
695             } else if (name.compare(QLatin1String("switch"), Qt::CaseInsensitive) == 0) {
696                 token = Parser::Token_SWITCH;
697             } else if (name.compare(QLatin1String("throw"), Qt::CaseInsensitive) == 0) {
698                 token = Parser::Token_THROW;
699             } else if (name.compare(QLatin1String("try"), Qt::CaseInsensitive) == 0) {
700                 token = Parser::Token_TRY;
701             } else if (name.compare(QLatin1String("finally"), Qt::CaseInsensitive) == 0) {
702                 token = Parser::Token_FINALLY;
703             } else if (name.compare(QLatin1String("while"), Qt::CaseInsensitive) == 0) {
704                 token = Parser::Token_WHILE;
705             } else if (name.compare(QLatin1String("clone"), Qt::CaseInsensitive) == 0) {
706                 token = Parser::Token_CLONE;
707             } else if (name.compare(QLatin1String("exit"), Qt::CaseInsensitive) == 0 || name.compare(QLatin1String("die"), Qt::CaseInsensitive) == 0) {
708                 token = Parser::Token_EXIT;
709             } else if (name.compare(QLatin1String("elseif"), Qt::CaseInsensitive) == 0) {
710                 token = Parser::Token_ELSEIF;
711             } else if (name.compare(QLatin1String("endif"), Qt::CaseInsensitive) == 0) {
712                 token = Parser::Token_ENDIF;
713             } else if (name.compare(QLatin1String("endwhile"), Qt::CaseInsensitive) == 0) {
714                 token = Parser::Token_ENDWHILE;
715             } else if (name.compare(QLatin1String("endfor"), Qt::CaseInsensitive) == 0) {
716                 token = Parser::Token_ENDFOR;
717             } else if (name.compare(QLatin1String("foreach"), Qt::CaseInsensitive) == 0) {
718                 token = Parser::Token_FOREACH;
719             } else if (name.compare(QLatin1String("endforeach"), Qt::CaseInsensitive) == 0) {
720                 token = Parser::Token_ENDFOREACH;
721             } else if (name.compare(QLatin1String("declare"), Qt::CaseInsensitive) == 0) {
722                 token = Parser::Token_DECLARE;
723             } else if (name.compare(QLatin1String("enddeclare"), Qt::CaseInsensitive) == 0) {
724                 token = Parser::Token_ENDDECLARE;
725             } else if (name.compare(QLatin1String("as"), Qt::CaseInsensitive) == 0) {
726                 token = Parser::Token_AS;
727             } else if (name.compare(QLatin1String("endswitch"), Qt::CaseInsensitive) == 0) {
728                 token = Parser::Token_ENDSWITCH;
729             } else if (name.compare(QLatin1String("function"), Qt::CaseInsensitive) == 0) {
730                 token = Parser::Token_FUNCTION;
731             } else if (name.compare(QLatin1String("use"), Qt::CaseInsensitive) == 0) {
732                 token = Parser::Token_USE;
733             } else if (name.compare(QLatin1String("goto"), Qt::CaseInsensitive) == 0) {
734                 token = Parser::Token_GOTO;
735             } else if (name.compare(QLatin1String("global"), Qt::CaseInsensitive) == 0) {
736                 token = Parser::Token_GLOBAL;
737             } else if (name.compare(QLatin1String("var"), Qt::CaseInsensitive) == 0) {
738                 token = Parser::Token_VAR;
739             } else if (name.compare(QLatin1String("unset"), Qt::CaseInsensitive) == 0) {
740                 token = Parser::Token_UNSET;
741             } else if (name.compare(QLatin1String("isset"), Qt::CaseInsensitive) == 0) {
742                 token = Parser::Token_ISSET;
743             } else if (name.compare(QLatin1String("empty"), Qt::CaseInsensitive) == 0) {
744                 token = Parser::Token_EMPTY;
745             } else if (name.compare(QLatin1String("__halt_compiler"), Qt::CaseInsensitive) == 0) {
746                 token = Parser::Token_HALT_COMPILER;
747             } else if (name.compare(QLatin1String("list"), Qt::CaseInsensitive) == 0) {
748                 token = Parser::Token_LIST;
749             } else if (name.compare(QLatin1String("array"), Qt::CaseInsensitive) == 0) {
750                 token = Parser::Token_ARRAY;
751             } else if (name.compare(QLatin1String("__class__"), Qt::CaseInsensitive) == 0) {
752                 token = Parser::Token_CLASS_C;
753             } else if (name.compare(QLatin1String("__method__"), Qt::CaseInsensitive) == 0) {
754                 token = Parser::Token_METHOD_C;
755             } else if (name.compare(QLatin1String("__function__"), Qt::CaseInsensitive) == 0) {
756                 token = Parser::Token_FUNC_C;
757             } else if (name.compare(QLatin1String("__line__"), Qt::CaseInsensitive) == 0) {
758                 token = Parser::Token_LINE;
759             } else if (name.compare(QLatin1String("__file__"), Qt::CaseInsensitive) == 0) {
760                 token = Parser::Token_FILE;
761             } else if (name.compare(QLatin1String("or"), Qt::CaseInsensitive) == 0) {
762                 token = Parser::Token_LOGICAL_OR;
763             } else if (name.compare(QLatin1String("and"), Qt::CaseInsensitive) == 0) {
764                 token = Parser::Token_LOGICAL_AND;
765             } else if (name.compare(QLatin1String("xor"), Qt::CaseInsensitive) == 0) {
766                 token = Parser::Token_LOGICAL_XOR;
767             } else if (name.compare(QLatin1String("namespace"), Qt::CaseInsensitive) == 0) {
768                 token = Parser::Token_NAMESPACE;
769             } else if (name.compare(QLatin1String("__namespace__"), Qt::CaseInsensitive) == 0) {
770                 token = Parser::Token_NAMESPACE_C;
771             } else if (name.compare(QLatin1String("callable"), Qt::CaseInsensitive) == 0) {
772                 token = Parser::Token_CALLABLE;
773             } else {
774                 token = Parser::Token_STRING;
775             }
776         }
777         break;
778     }
779 
780     case StringVariable:
781     case String:
782     case StringHeredoc:
783     case StringBacktick:
784         if ((state() == String || state(1) == String) && it->unicode() == '"') {
785             token = Parser::Token_DOUBLE_QUOTE;
786             if (state() == StringVariable) popState();
787             popState();
788         } else if ((state() == StringBacktick || state(1) == StringBacktick) && it->unicode() == '`') {
789             token = Parser::Token_BACKTICK;
790             if (state() == StringVariable) popState();
791             popState();
792         } else if ((state() == StringHeredoc || state(1) == StringHeredoc) && isHereNowDocEnd(it)) {
793             token = Parser::Token_END_HEREDOC;
794             m_curpos += m_hereNowDocIdentifier.length() - 1;
795             if (state() == StringVariable) popState();
796             popState();
797         } else if (processVariable(it)) {
798             token = Parser::Token_VARIABLE;
799             if (state() != StringVariable) pushState(StringVariable);
800         } else if (state() != StringVariable  && it->unicode() == '$' && (it + 1)->unicode() == '{') {
801             token = Parser::Token_DOLLAR_OPEN_CURLY_BRACES;
802             m_curpos++;
803             it += 2;
804             //check if a valid variable follows
805             if ((isValidVariableIdentifier(it) && !it->isDigit())) {
806                 pushState(StringVarname);
807             }
808 
809         } else if (state() == StringVariable && it->unicode() == '[') {
810             token = Parser::Token_LBRACKET;
811             pushState(StringVariableBracket);
812         } else if (state() != StringVariable && it->unicode() == '{' && (it + 1)->unicode() == '$'
813                    && ((isValidVariableIdentifier(it + 2) && !(it + 2)->isDigit()) || (it + 2)->unicode() == '{')) {
814             token = Parser::Token_CURLY_OPEN;
815             pushState(StringVariableCurly);
816         } else if (state() == StringVariable
817                    && it->unicode() == '-' && (it + 1)->unicode() == '>'
818                    && isValidVariableIdentifier(it + 2) && !(it + 2)->isDigit()) {
819             token = Parser::Token_OBJECT_OPERATOR;
820             m_curpos++;
821             pushState(StringVariableObjectOperator);
822         } else {
823             if (state() == StringVariable) popState();
824             token = Parser::Token_ENCAPSED_AND_WHITESPACE;
825             int startPos = m_curpos;
826             while (m_curpos < m_contentSize) {
827                 if (!isEscapedWithBackslash(it, m_curpos, startPos) &&
828                         ((it->unicode() == '$' && (it + 1)->unicode() == '{') ||
829                          (it->unicode() == '{' && (it + 1)->unicode() == '$' && isValidVariableIdentifier(it + 2)) ||
830                          (it->unicode() == '$' && isValidVariableIdentifier(it + 1) && !(it + 1)->isDigit()))) {
831                     //variable is next ${var} or {$var}
832                     break;
833                 }
834                 if (state() == String && it->unicode() == '"'
835                         && !isEscapedWithBackslash(it, m_curpos, startPos)) {
836                     //end of string
837                     break;
838                 }
839                 if (state() == StringBacktick && it->unicode() == '`'
840                         && !isEscapedWithBackslash(it, m_curpos, startPos)) {
841                     //end of string
842                     break;
843                 }
844 
845                 if (it->unicode() == '\n') createNewline(m_curpos);
846                 m_curpos++;
847                 it++;
848 
849                 if (state() == StringHeredoc && (it - 1)->unicode() == '\n') {
850                     //check for end of heredoc (\nEOD;\n)
851                     if (state() == StringHeredoc && isHereNowDocEnd(it)) {
852                         break;
853                     }
854                 }
855             }
856             m_curpos--;
857         }
858         break;
859     case StringNowdoc:
860         if (isHereNowDocEnd(it)) {
861             token = Parser::Token_END_NOWDOC;
862             m_curpos += m_hereNowDocIdentifier.length() - 1;
863             popState();
864         } else {
865             token = Parser::Token_STRING;
866             while (m_curpos < m_contentSize) {
867                 if (it->unicode() == '\n') createNewline(m_curpos);
868                 m_curpos++;
869                 it++;
870 
871                 if ((it - 1)->unicode() == '\n' && isHereNowDocEnd(it)) {
872                     //check for end of nowdoc (\nEOD;\n)
873                     break;
874                 }
875             }
876             m_curpos--;
877         }
878         break;
879     case StringVariableBracket:
880         if (it->unicode() == ']') {
881             token = Parser::Token_RBRACKET;
882             popState();
883             popState();
884         } else if (it->isDigit()) {
885             token = Parser::Token_NUM_STRING;
886             while (m_curpos < m_contentSize && it->isDigit()) {
887                 it++;
888                 m_curpos++;
889             }
890             m_curpos--;
891         } else {
892             token = Parser::Token_STRING;
893             while (m_curpos < m_contentSize && (it->unicode() != ']')) {
894                 if (it->unicode() == '\n') createNewline(m_curpos);
895                 it++;
896                 m_curpos++;
897             }
898             m_curpos--;
899         }
900         break;
901     case StringVariableObjectOperator:
902         token = Parser::Token_STRING;
903         while (m_curpos < m_contentSize && isValidVariableIdentifier(it)) {
904             it++;
905             m_curpos++;
906         }
907         m_curpos--;
908         popState();
909         if (state() == StringVariable) popState();
910         break;
911     case StringVarname:
912         popState();
913         pushState(StringVariableCurly);
914         token = Parser::Token_STRING_VARNAME;
915         while (m_curpos < m_contentSize && isValidVariableIdentifier(it)) {
916             it++;
917             m_curpos++;
918         }
919         m_curpos--;
920         break;
921     default:
922         token = Parser::Token_INVALID;
923         break;
924     }
925     if (m_curpos > m_contentSize) {
926         m_tokenBegin = -1;
927         m_tokenEnd = -1;
928         return 0;
929     }
930     m_tokenEnd = m_curpos;
931     m_curpos++;
932 
933     if (m_haltCompiler) {
934         //look for __halt_compiler(); and stop lexer there
935         if (m_haltCompiler == 4) {
936             token = 0; //EOF
937         } else if (token == Parser::Token_WHITESPACE || token == Parser::Token_COMMENT || token == Parser::Token_DOC_COMMENT) {
938             //ignore
939         } else if (m_haltCompiler == 1 && token == Parser::Token_LPAREN) {
940             m_haltCompiler++;
941         } else if (m_haltCompiler == 2 && token == Parser::Token_RPAREN) {
942             m_haltCompiler++;
943         } else if (m_haltCompiler == 3 && token == Parser::Token_SEMICOLON) {
944             m_haltCompiler++;
945         } else {
946             m_haltCompiler = 0;
947         }
948     }
949     if (token == Parser::Token_HALT_COMPILER && !m_haltCompiler) {
950         m_haltCompiler = 1;
951     }
952     return token;
953 }
954 
// Returns the value of m_tokenBegin for the most recently scanned token.
// The scanner resets it to -1 once the read position has moved past the end
// of the content. NOTE(review): presumably this is the index of the token's
// first character; the regular assignment lies outside this part of the file.
qint64 Lexer::tokenBegin() const
{
    return m_tokenBegin;
}
959 
// Returns the value of m_tokenEnd for the most recently scanned token.
// The scanner stores the current read position in it *before* advancing to
// the next character, so the value is the index of the token's last character
// (inclusive). It is reset to -1 once the read position has moved past the
// end of the content.
qint64 Lexer::tokenEnd() const
{
    return m_tokenEnd;
}
964 
isHereNowDocEnd(const QChar * it)965 bool Lexer::isHereNowDocEnd(const QChar* it)
966 {
967     int identiferLen = m_hereNowDocIdentifier.length();
968     QString lineStart;
969     for (int i = 0; i < identiferLen; i++) {
970         if (m_curpos + i >= m_contentSize) break;
971         lineStart.append(*(it + i));
972     }
973     if (lineStart == m_hereNowDocIdentifier &&
974             ((it + identiferLen)->unicode() == '\n'
975              || ((it + identiferLen)->unicode() == ';' &&
976                  (it + identiferLen + 1)->unicode() == '\n'))) {
977         return true;
978     }
979     return false;
980 }
981 
982 //used for strings, to check if " is escaped (\" is, \\" not)
isEscapedWithBackslash(const QChar * it,int curPos,int startPos)983 bool Lexer::isEscapedWithBackslash(const QChar* it, int curPos, int startPos)
984 {
985     int cnt = 0;
986     it--;
987     while (curPos > startPos && it->unicode() == '\\') {
988         cnt++;
989         it--;
990     }
991     return (cnt % 2) == 1;
992 }
993 
processVariable(const QChar * it)994 bool Lexer::processVariable(const QChar* it)
995 {
996     const QChar* c2 = it + 1;
997     if (it->unicode() == '$' && (isValidVariableIdentifier(c2) && !c2->isDigit())) {
998         it++;
999         m_curpos++;
1000         while (m_curpos < m_contentSize
1001                 && (isValidVariableIdentifier(it))) {
1002             it++;
1003             m_curpos++;
1004         }
1005         m_curpos--;
1006         return true;
1007     } else {
1008         return false;
1009     }
1010 }
isValidVariableIdentifier(const QChar * it)1011 bool Lexer::isValidVariableIdentifier(const QChar* it)
1012 {
1013     return it->isLetter() || it->isDigit() || it->unicode() == '_' || it->unicode() > 0x7f;
1014 }
1015 
createNewline(int pos)1016 void Lexer::createNewline(int pos)
1017 {
1018     if (m_tokenStream) m_tokenStream->locationTable()->newline(pos);
1019 }
1020 
1021 }
1022 
1023