1 
2 /*
3  * File CParser.cpp.
4  *
5  * This file is part of the source code of the software program
6  * Vampire. It is protected by applicable
7  * copyright laws.
8  *
9  * This source code is distributed under the licence found here
10  * https://vprover.github.io/license.html
11  * and in the source directory
12  *
13  * In summary, you are allowed to use Vampire for non-commercial
14  * purposes but not allowed to distribute, modify, copy, create derivatives,
15  * or use in competitions.
16  * For other uses of Vampire please contact developers for a different
17  * licence, which we will make an effort to provide.
18  */
19 /**
20  * Implements class CParser
21  *
22  * @since 13/01/2011 Manchester
23  */
24 
25 #include <cstring>
26 
27 #include "Debug/Assertion.hpp"
28 #include "Debug/Tracer.hpp"
29 
30 #include "Lib/Int.hpp"
31 #include "Lib/Exception.hpp"
32 
33 #include "CParser.hpp"
34 
35 using namespace Shell;
36 using namespace Lib;
37 
38 /**
39  * Initialise a parser.
40  */
CParser(const char * input)41 CParser::CParser(const char* input)
42   : _input(input)
43 {
44 } // CParser::CParser
45 
~CParser()46 CParser::~CParser()
47 {
48 } // CParser::~CParser
49 
50 
51 /**
52  * Read an integer type suffix identified by the following grammar:
53  *   ('u'|'U')? ('l'|'L')
54  * | ('u'|'U')  ('l'|'L')?
55  * Save in @b tt its token type.
56  * @return the end position, if recognized and 0 if not
57  */
integerTypeSuffix(unsigned pos,LTType & tt)58 unsigned CParser::integerTypeSuffix(unsigned pos,LTType& tt)
59 {
60   CALL("CParser::integerTypeSuffix");
61   switch (_input[pos]) {
62   case 'u':
63   case 'U': {
64     switch (_input[pos+1]) {
65     case 'l':
66     case 'L':
67       tt = LT_ULONG_CONST;
68       return pos+2;
69     default:
70       tt = LT_UINT_CONST;
71       return pos+1;
72     }
73   }
74   case 'l':
75   case 'L':
76     tt = LT_LONG_CONST;
77     return pos+1;
78   default:
79     tt = LT_INT_CONST;
80     return 0;
81   }
82 } // integerTypeSuffix
83 
84 /**
85  * Read a decimal literal and saving its type in @b tt.
86  * Decimal literals use the following grammar.
87  * DECIMAL_LITERAL : ('0' | '1'..'9' '0'..'9'*) IntegerTypeSuffix?
88  * @return the end position, if recognized and 0 if not
89  */
decimalLiteral(unsigned pos,LTType & tt)90 unsigned CParser::decimalLiteral(unsigned pos,LTType& tt)
91 {
92   CALL("CParser::decimalLiteral");
93 
94   switch (_input[pos]) {
95   case '0':
96     pos++;
97     break;
98 
99   case '1':
100   case '2':
101   case '3':
102   case '4':
103   case '5':
104   case '6':
105   case '7':
106   case '8':
107   case '9':
108     for (;;) {
109       char n = _input[++pos];
110       if (n < '0' || n > '9') {
111 	break;
112       }
113     }
114     break;
115   default:
116     return 0;
117   }
118   unsigned pos1 = integerTypeSuffix(pos,tt);
119   return pos1 ? pos1 : pos;
120 } // decimalLiteral
121 
122 /**
123  * Read an octal literal identified by the following grammar:
124  * OCTAL_LITERAL : '0' ('0'..'7')+ IntegerTypeSuffix? ;
125  * Save in @b tt its token type
126  * @return the end position, if recognized and 0 if not
127  */
octalLiteral(unsigned pos,LTType & tt)128 unsigned CParser::octalLiteral(unsigned pos,LTType& tt)
129 {
130   CALL("CParser::octalLiteral");
131 
132   if (_input[pos] != '0') {
133     return 0;
134   }
135   for (;;) {
136     char n = _input[++pos];
137     if (n < '0' || n > '7') {
138       break;
139     }
140   }
141   unsigned pos1 = integerTypeSuffix(pos,tt);
142   return pos1 ? pos1 : pos;
143 } // octalLiteral
144 
145 /**
146  * True if c is a character that can occur in a hex literal.
147  * HexDigit : ('0'..'9'|'a'..'f'|'A'..'F') ;
148  */
hexDigit(char c)149 bool CParser::hexDigit(char c)
150 {
151   CALL("CParser::hexDigit");
152 
153   switch (c) {
154   case '0':
155   case '1':
156   case '2':
157   case '3':
158   case '4':
159   case '5':
160   case '6':
161   case '7':
162   case '8':
163   case '9':
164   case 'a':
165   case 'b':
166   case 'c':
167   case 'd':
168   case 'e':
169   case 'f':
170   case 'A':
171   case 'B':
172   case 'C':
173   case 'D':
174   case 'E':
175   case 'F':
176     return true;
177   default:
178     return false;
179   }
180 } // hexDigit
181 
182 /**
183  * Read an octal literal identified by the following grammar:
184  * HEX_LITERAL : '0' ('x'|'X') HexDigit+ IntegerTypeSuffix? ;
185  * Save in @b tt its token type.
186  * @return the end position, if recognized and 0 if not
187  */
hexLiteral(unsigned pos,LTType & tt)188 unsigned CParser::hexLiteral(unsigned pos,LTType& tt)
189 {
190   CALL("CParser::hexLiteral");
191 
192   if (_input[pos] != '0' ||
193       _input[pos+1] != 'x' ||
194       !hexDigit(_input[pos+2])) {
195     return 0;
196   }
197   pos += 2;
198   while (hexDigit(_input[++pos])) {}
199   unsigned pos1 = integerTypeSuffix(pos,tt);
200   return pos1 ? pos1 : pos;
201 } // hexLiteral
202 
203 /**
204  * Read an exponent.
205  * Exponent : ('e'|'E') ('+'|'-')? ('0'..'9')+ ;
206  * @return the end position, if recognized and 0 if not
207  */
exponent(unsigned pos)208 unsigned CParser::exponent(unsigned pos)
209 {
210   CALL("CParser::exponent");
211 
212   switch (_input[pos]) {
213   case 'e':
214   case 'E':
215     pos++;
216     break;
217   default:
218     return 0;
219   }
220   switch (_input[++pos]) {
221   case '+':
222   case '-':
223     pos++;
224     break;
225   default:
226     break;
227   }
228   if (!digit(_input[pos])) {
229     return 0;
230   }
231   while (digit(_input[++pos])) {}
232   return pos;
233 } // exponent
234 
235 /** True if the character is a float type suffix and set @b tt
236  * to the appropriate type.
237  * FloatTypeSuffix : ('f'|'F'|'d'|'D') ;
238  */
floatTypeSuffix(char c,LTType & tt)239 bool CParser::floatTypeSuffix(char c,LTType& tt)
240 {
241   CALL("CParser::floatTypeSuffix");
242 
243   switch (c) {
244   case 'f':
245   case 'F':
246     tt = LT_FLOAT_CONST;
247     return true;
248   case 'd':
249   case 'D':
250     tt = LT_DOUBLE_CONST;
251     return true;
252   default:
253     tt = LT_FLOAT_CONST;
254     return false;
255   }
256 } // floatTypeSuffix
257 
258 /**
259  * Read a floating point literal.
260  * FLOATING_POINT_LITERAL
261  *  :   ('0'..'9')+ '.' ('0'..'9')* Exponent? FloatTypeSuffix?
262  *  |   '.' ('0'..'9')+ Exponent? FloatTypeSuffix?
263  *  |   ('0'..'9')+ Exponent FloatTypeSuffix?
264  *  |   ('0'..'9')+ FloatTypeSuffix;
265  * @return the end position, if recognized, and 0 if not
266  */
floatingPointLiteral(unsigned pos,LTType & tt)267 unsigned CParser::floatingPointLiteral(unsigned pos,LTType& tt)
268 {
269   CALL("CParser::floatingPointLiteral");
270 
271   if (_input[pos] == '.') { // case 2
272     if (!digit(_input[++pos])) {
273       return 0;
274     }
275     while (digit(_input[++pos])) {}
276     unsigned pos1 = exponent(pos);
277     if (pos1) {
278       pos = pos1;
279     }
280     return floatTypeSuffix(_input[pos],tt) ? pos+1 : pos;
281   }
282   // cases 1,3,4
283   if (!digit(_input[pos])) {
284     return 0;
285   }
286   while (digit(_input[++pos])) {}
287   if (floatTypeSuffix(_input[pos],tt)) { // case 4
288     return pos+1;
289   }
290   if (_input[pos] != '.') { // case 3
291     pos = exponent(pos);
292     if (!pos) {
293       return 0;
294     }
295     return floatTypeSuffix(_input[pos],tt) ? pos+1 : pos;
296   }
297   // case 1
298   while (digit(_input[++pos])) {}
299   unsigned pos1 = exponent(pos);
300   if (pos1) {
301     pos = pos1;
302   }
303   return floatTypeSuffix(_input[pos],tt) ? pos+1 : pos;
304 } // floatingPointLiteral
305 
306 /** true of c is a letter: $, _, A-Z, a-z */
letter(char c)307 bool CParser::letter(char c)
308 {
309   return c == '$'
310     || c == '_'
311     || (c >= 'A' && c <= 'Z')
312     || (c >= 'a' && c <= 'z');
313 } // letter
314 
315 /**
316  * Read an identifier.
317  * IDENTIFIER : LETTER (LETTER|'0'..'9')*
318  * @return the end position, if recognized, and 0 if not
319  */
identifier(unsigned pos)320 unsigned CParser::identifier(unsigned pos)
321 {
322   CALL("CParser::identifier");
323 
324   if (!letter(_input[pos])) {
325     return 0;
326   }
327   for (;;) {
328     char c = _input[++pos];
329     if (letter(c) || digit(c)) continue;
330     return pos;
331   }
332 } // identifier
333 
334 /**
335  * Skip whitespace characters, comments, and lines having # as the first
336  * non-whitespace character.
337  * @return the position of first character after.
338  */
skipWhiteSpacesAndComments(unsigned pos)339 unsigned CParser::skipWhiteSpacesAndComments(unsigned pos)
340 {
341   CALL("CParser::skipWhiteSpacesAndComments");
342 
343   bool newLine = (pos == 0);
344   for (;;) {
345     switch (_input[pos]) {
346     case ' ':
347     case '\t':
348       pos++;
349       break;
350     case '\r':
351     case '\n':
352       newLine = true;
353       pos++;
354       break;
355     case '#':
356       if (!newLine) {
357 	throw LexerException(*this,pos,"character # not in the beginning of a line");
358       }
359       pos = skipToEndOfLine(pos+1);
360       break;
361     case '/':
362       switch (_input[pos+1]) {
363       case '*':
364 	newLine = false;
365 	// skip to the end of comments
366 	pos = skipToEndOfComment(pos+2);
367 	break;
368       case '/':
369 	pos = skipToEndOfLine(pos+2);
370 	newLine = true;
371 	break;
372       default:
373 	return pos;
374       }
375       break;
376     default:
377       return pos;
378     }
379   }
380 } // CParser::skipWhiteSpacesAndComments
381 
382 /**
383  * Skip characters to the end of line.
384  * @return the position of first character after.
385  */
skipToEndOfLine(unsigned pos)386 unsigned CParser::skipToEndOfLine(unsigned pos)
387 {
388   CALL("CParser::skipToEndOfLine");
389 
390   for (;;) {
391     char c = _input[pos];
392     switch (c) {
393     case 0:
394       return pos;
395     case '\n':
396       return pos+1;
397     case '\r':
398       return _input[pos+1] == '\n' ? pos+2 : pos+1;
399     default:
400       pos++;
401       break;
402     }
403   }
404 } // CParser::skipToEndOfLine
405 
406 /**
407  * Skip characters to the end of comment.
408  * @return the position of first character after.
409  */
skipToEndOfComment(unsigned pos)410 unsigned CParser::skipToEndOfComment(unsigned pos)
411 {
412   CALL("CParser::skipToEndOfComment");
413 
414   unsigned start = pos;
415   for (;;) {
416     char c = _input[pos];
417     switch (c) {
418     case 0:
419       throw LexerException(*this,start-2,"non-terminated comment");
420     case '*':
421       if (_input[pos+1] == '/') {
422 	return pos+2;
423       }
424       pos++;
425       break;
426     default:
427       pos++;
428       break;
429     }
430   }
431 } // CParser::skipToEndOfComment
432 
433 /**
434  * Create a new lexer exception.
435  */
LexerException(const CParser & parser,unsigned pos,vstring message)436 CParser::LexerException::LexerException (const CParser& parser,unsigned pos,vstring message)
437 {
438   _message = message + " at position " + Int::toString(pos) + "\n>>>" + Int::toString(int(parser.input()[pos])) + "<<<" +
439     "\n---------------" + (parser.input()+pos) + "\n---------------\n";
440 } // LexerException::LexerException
441 
442 /**
443  * Write itself to an ostream.
444  */
cry(ostream & out)445 void CParser::LexerException::cry (ostream& out)
446 {
447   out << "C lexer exception: " << _message << '\n';
448 } // CParser::LexerException::cry
449 
450 /**
451  * Create a new parser exception.
452  */
ParserException(const CParser & parser,unsigned pos,vstring message)453 CParser::ParserException::ParserException (const CParser& parser,unsigned pos,vstring message)
454 {
455   _message = message + " at position " + Int::toString(pos) + "\n>>>" + Int::toString(int(parser.input()[pos])) + "<<<" +
456     "\n---------------" + (parser.input()+pos) + "\n---------------\n";
457 } // ParserException::ParserException
458 
459 /**
460  * Write itself to an ostream.
461  */
cry(ostream & out)462 void CParser::ParserException::cry (ostream& out)
463 {
464   out << "C parser exception: " << _message << '\n';
465 } // CParser::ParserException::cry
466 
tokenize()467 void CParser::tokenize()
468 {
469   CALL("CParser::parse");
470 
471   unsigned pos = 0;
472   unsigned end;
473   Token token;
474 
475   for (;;) {
476     pos = skipWhiteSpacesAndComments(pos);
477     if (! _input[pos]) {
478       token.type = LT_EOF;
479       token.start = pos;
480       token.end = pos;
481       _tokens.push_back(token);
482       return;
483     }
484 
485     token.start = pos;
486     switch (_input[pos]) {
487     case 'a':
488     case 'b':
489     case 'c':
490     case 'd':
491     case 'e':
492     case 'f':
493     case 'g':
494     case 'i':
495     case 'l':
496     case 'r':
497     case 's':
498     case 't':
499     case 'u':
500     case 'v':
501     case 'w':
502       end = identifier(pos);
503       token.type = keyword(pos,end);
504       break;
505 
506     case 'h':
507     case 'j':
508     case 'k':
509     case 'm':
510     case 'n':
511     case 'o':
512     case 'p':
513     case 'q':
514     case 'x':
515     case 'y':
516     case 'z':
517     case 'A':
518     case 'B':
519     case 'C':
520     case 'D':
521     case 'E':
522     case 'F':
523     case 'G':
524     case 'H':
525     case 'I':
526     case 'J':
527     case 'K':
528     case 'L':
529     case 'M':
530     case 'N':
531     case 'O':
532     case 'P':
533     case 'Q':
534     case 'R':
535     case 'S':
536     case 'T':
537     case 'U':
538     case 'V':
539     case 'W':
540     case 'X':
541     case 'Y':
542     case 'Z':
543     case '$':
544     case '_':
545       end = identifier(pos);
546       token.type = LT_IDENTIFIER;
547       break;
548 
549     case '{':
550       end = pos+1;
551       token.type = LT_LBRACE;
552       break;
553     case '}':
554       end = pos+1;
555       token.type = LT_RBRACE;
556       break;
557     case '(':
558       end = pos+1;
559       token.type = LT_LPAR;
560       break;
561     case ')':
562       end = pos+1;
563       token.type = LT_RPAR;
564       break;
565     case ';':
566       end = pos+1;
567       token.type = LT_SEMICOLON;
568       break;
569 
570     case '=':
571       end = pos+1;
572       if (_input[end] == '=') {
573 	end++;
574 	token.type = LT_EQ_OP;
575 	break;
576       }
577       token.type = LT_ASSIGN;
578       break;
579 
580     case '+':
581       end = pos+1;
582       if (_input[end] == '=') {
583 	end++;
584 	token.type = LT_ADD_ASSIGN;
585 	break;
586       }
587       if (_input[end] == '+') {
588 	end++;
589 	token.type = LT_INC_OP;
590 	break;
591       }
592       token.type = LT_ADD;
593       break;
594 
595     case '*':
596       end = pos+1;
597       if (_input[end] == '=') {
598 	end++;
599 	token.type = LT_MULT_ASSIGN;
600 	break;
601       }
602       token.type = LT_MULT;
603       break;
604 
605     case '.':
606       end = pos+1;
607       if (_input[end] == '.') {
608 	if (_input[end+1] != '.') {
609 	  throw LexerException(*this,pos,"bad expression");
610 	}
611 	end += 2;
612 	token.type = LT_ELLIPSIS;
613 	break;
614       }
615       token.type = LT_DOT;
616       break;
617 
618     case '>':
619       end = pos+1;
620       if (_input[end] == '=') {
621 	end++;
622 	token.type = LT_GE_OP;
623 	break;
624       }
625       if (_input[end] != '>') {
626 	token.type = LT_GREATER;
627 	break;
628       }
629       if (_input[end+1] == '=') {
630 	end += 2;
631 	token.type = LT_RIGHT_ASSIGN;
632 	break;
633       }
634       end++;
635       token.type = LT_RIGHT_OP;
636       break;
637 
638     case '<':
639       end = pos+1;
640       if (_input[end] == '=') {
641 	end++;
642 	token.type = LT_LE_OP;
643 	break;
644       }
645       if (_input[end] == ':') {
646 	end++;
647 	token.type = LT_LBRACKET;
648 	break;
649       }
650       if (_input[end] == '%') {
651 	end++;
652 	token.type = LT_LBRACE;
653 	break;
654       }
655       if (_input[end] != '<') {
656 	token.type = LT_LESS;
657 	break;
658       }
659       if (_input[end+1] == '=') {
660 	end += 2;
661 	token.type = LT_LEFT_ASSIGN;
662 	break;
663       }
664       end++;
665       token.type = LT_LEFT_OP;
666       break;
667 
668     case '-':
669       end = pos+1;
670       if (_input[end] == '=') {
671 	end++;
672 	token.type = LT_SUB_ASSIGN;
673 	break;
674       }
675       if (_input[end] == '-') {
676 	end++;
677 	token.type = LT_DEC_OP;
678 	break;
679       }
680       if (_input[end] == '>') {
681 	end++;
682 	token.type = LT_PTR_OP;
683 	break;
684       }
685       token.type = LT_MINUS;
686       break;
687 
688     case '/':
689       end = pos+1;
690       if (_input[end] == '=') {
691 	end++;
692 	token.type = LT_DIV_ASSIGN;
693 	break;
694       }
695       token.type = LT_DIV;
696       break;
697 
698     case '%':
699       end = pos+1;
700       if (_input[end] == '=') {
701 	end++;
702 	token.type = LT_MOD_ASSIGN;
703 	break;
704       }
705       if (_input[end] == '>') {
706 	end++;
707 	token.type = LT_RBRACE;
708 	break;
709       }
710       token.type = LT_MOD;
711       break;
712 
713     case '&':
714       end = pos+1;
715       if (_input[end] == '=') {
716 	end++;
717 	token.type = LT_AND_ASSIGN;
718 	break;
719       }
720       if (_input[end] == '&') {
721 	end++;
722 	token.type = LT_AND_OP;
723 	break;
724       }
725       token.type = LT_AMP;
726       break;
727 
728     case '|':
729       end = pos+1;
730       if (_input[end] == '=') {
731 	end++;
732 	token.type = LT_OR_ASSIGN;
733 	break;
734       }
735       if (_input[end] == '|') {
736 	end++;
737 	token.type = LT_OR_OP;
738 	break;
739       }
740       token.type = LT_BAR;
741       break;
742 
743     case '^':
744       end = pos+1;
745       if (_input[end] == '=') {
746 	end++;
747 	token.type = LT_XOR_ASSIGN;
748 	break;
749       }
750       token.type = LT_XOR;
751       break;
752 
753     case '!':
754       end = pos+1;
755       if (_input[end] == '=') {
756 	end++;
757 	token.type = LT_NE_OP;
758 	break;
759       }
760       token.type = LT_EXCLAMATION;
761       break;
762 
763     case ':':
764       end = pos+1;
765       if (_input[end] == '>') {
766 	end++;
767 	token.type = LT_RBRACKET;
768 	break;
769       }
770       token.type = LT_COLON;
771       break;
772 
773     case '[':
774       end = pos+1;
775       token.type = LT_LBRACKET;
776       break;
777 
778     case ']':
779       end = pos+1;
780       token.type = LT_RBRACKET;
781       break;
782 
783     case ',':
784       end = pos+1;
785       token.type = LT_COMMA;
786       break;
787 
788     case '~':
789       end = pos+1;
790       token.type = LT_TILDE;
791       break;
792 
793     case '?':
794       end = pos+1;
795       token.type = LT_QUESTION;
796       break;
797 
798     case '0':
799     case '1':
800     case '2':
801     case '3':
802     case '4':
803     case '5':
804     case '6':
805     case '7':
806     case '8':
807     case '9':
808       end = numericConstant(pos,token.type);
809       if (!end) {
810 	throw LexerException(*this,pos,"bad numeric constant");
811       }
812       break;
813 
814     case '"':
815       end = stringConstant(pos);
816       if (!end) {
817 	throw LexerException(*this,pos,"non-terminated vstring constant");
818       }
819       token.type = LT_STRING_CONST;
820       break;
821 
822     case '\'':
823       end = charConstant(pos);
824       if (!end) {
825 	throw LexerException(*this,pos,"bad character constant");
826       }
827       token.type = LT_CHAR_CONST;
828       break;
829 
830     default:
831       throw LexerException(*this,pos,(vstring)"unrecognized symbol " + _input[pos] + " in the input");
832     }
833     // if (end <= pos) {
834     //   cout << "ERRRRRRRRRRRRRRRRRRRRRRR\n" << pos << ':' << end << ':' << _input[pos];
835     // }
836     ASS(end > pos);
837     token.end = end;
838     _tokens.push_back(token);
839     pos = end;
840   }
841 } // CParser::tokenize
842 
843 /**
844  * Return the token type of the identifier substring between the positions
845  * pos and end.
846  */
keyword(unsigned pos,unsigned end)847 CParser::LTType CParser::keyword(unsigned pos,unsigned end)
848 {
849   CALL("CParser::keyword");
850 
851   const char* text = _input+pos;
852   switch (*text) {
853   case 'a':
854     if (keyword(text+1,"uto",end-pos-1)) return LT_AUTO;
855     break;
856   case 'b':
857     if (keyword(text+1,"reak",end-pos-1)) return LT_BREAK;
858     break;
859   case 'c':
860     switch (text[1]) {
861     case 'a':
862       if (keyword(text+2,"se",end-pos-2)) return LT_CASE;
863       break;
864     case 'h':
865       if (keyword(text+2,"ar",end-pos-2)) return LT_CHAR;
866     case 'o':
867       if (keyword(text+2,"nst",end-pos-2)) return LT_CONST;
868       if (keyword(text+2,"ntinue",end-pos-2)) return LT_CONTINUE;
869       break;
870     default:
871       break;
872     }
873     break;
874 
875   case 'd':
876     switch (text[1]) {
877     case 'e':
878       if (keyword(text+2,"fault",end-pos-2)) return LT_DEFAULT;
879       break;
880     case 'o':
881       if (keyword(text+2,"",end-pos-2)) return LT_DO;
882       if (keyword(text+2,"uble",end-pos-2)) return LT_DOUBLE;
883       break;
884     default:
885       break;
886     }
887     break;
888 
889   case 'e':
890     if (keyword(text+1,"lse",end-pos-1)) return LT_ELSE;
891     if (keyword(text+1,"num",end-pos-1)) return LT_ENUM;
892     if (keyword(text+1,"xtern",end-pos-1)) return LT_EXTERN;
893     break;
894 
895   case 'f':
896     if (keyword(text+1,"loat",end-pos-1)) return LT_FLOAT;
897     if (keyword(text+1,"or",end-pos-1)) return LT_FOR;
898     break;
899 
900   case 'g':
901     if (keyword(text+1,"oto",end-pos-1)) return LT_GOTO;
902     break;
903 
904   case 'i':
905     if (keyword(text+1,"f",end-pos-1)) return LT_IF;
906     if (keyword(text+1,"nline",end-pos-1)) return LT_INLINE;
907     if (keyword(text+1,"nt",end-pos-1)) return LT_INT;
908     break;
909 
910   case 'l':
911     if (keyword(text+1,"ong",end-pos-1)) return LT_LONG;
912     break;
913 
914   case 'r':
915     if (keyword(text+1,"egister",end-pos-1)) return LT_REGISTER;
916     if (keyword(text+1,"estrict",end-pos-1)) return LT_RESTRICT;
917     if (keyword(text+1,"eturn",end-pos-1)) return LT_RETURN;
918     break;
919 
920   case 's':
921     if (keyword(text+1,"hort",end-pos-1)) return LT_SHORT;
922     if (keyword(text+1,"igned",end-pos-1)) return LT_SIGNED;
923     if (keyword(text+1,"izeof",end-pos-1)) return LT_SIZEOF;
924     if (keyword(text+1,"truct",end-pos-1)) return LT_STRUCT;
925     if (keyword(text+1,"witch",end-pos-1)) return LT_SWITCH;
926     break;
927 
928   case 't':
929     if (keyword(text+1,"ypedef",end-pos-1)) return LT_TYPEDEF;
930     break;
931 
932   case 'u':
933     if (keyword(text+1,"nion",end-pos-1)) return LT_UNION;
934     if (keyword(text+1,"nsigned",end-pos-1)) return LT_UNSIGNED;
935     break;
936 
937   case 'v':
938     if (keyword(text+1,"oid",end-pos-1)) return LT_VOID;
939     if (keyword(text+1,"olatile",end-pos-1)) return LT_VOLATILE;
940     break;
941 
942   case 'w':
943     if (keyword(text+1,"hile",end-pos-1)) return LT_WHILE;
944     break;
945 
946   default:
947     ASS(false);
948   }
949   return LT_IDENTIFIER;
950 } // CParser::keyword
951 
952 /**
953  * True if the first chars characters of txt coincides with the
954  * first chars characters of word
955  * @pre the first chars characters of txt are non-zero
956  */
keyword(const char * txt,const char * word,int chars)957 bool CParser::keyword(const char* txt,const char* word,int chars)
958 {
959   CALL("CParser::keyword/3");
960 
961   ASS(chars >= 0);
962 
963   while (chars--) {
964     ASS(*txt);
965     if (*txt++ != *word++) return false;
966   }
967   return !*word;
968 } // CParser::keyword/3
969 
970 /**
971  * Read a numeric constant and save the tag to @b tt.
972  * NUMERIC_CONSTANT
973  * : HEX_LITERAL
974  * | OCTAL_LITERAL
975  * | DECIMAL_LITERAL
976  * | FLOATING_POINT_LITERAL;
977  * @warning can be optimised by (1) putting float and integer recognition together and (2)
978  * @return the end position, if recognized, and 0 if not
979  */
numericConstant(unsigned pos,LTType & tt)980 unsigned CParser::numericConstant(unsigned pos,LTType& tt)
981 {
982   CALL("CParser::numericConstant");
983 
984   if (_input[pos] != '0') {
985     unsigned end = floatingPointLiteral(pos,tt);
986     if (end) {
987       return end;
988     }
989     return decimalLiteral(pos,tt);
990   }
991   switch (_input[pos+1]) {
992   case 'x':
993   case 'X':
994     return hexLiteral(pos,tt);
995   case '0':
996   case '1':
997   case '2':
998   case '3':
999   case '4':
1000   case '5':
1001   case '6':
1002   case '7':
1003   case '8':
1004   case '9':
1005     return octalLiteral(pos,tt);
1006   default:
1007     return decimalLiteral(pos,tt);
1008   }
1009 } // CParser::numericConstant
1010 
1011 /**
1012  * Read a string constant.
1013  * @return the end position, if recognized, and 0 if not
1014  */
stringConstant(unsigned pos)1015 unsigned CParser::stringConstant(unsigned pos)
1016 {
1017   CALL("CParser::stringConstant");
1018 
1019   ASS(_input[pos] == '"');
1020 
1021   for (;;) {
1022     switch (_input[++pos]) {
1023     case '"':
1024       return pos+1;
1025     case '\\':
1026       pos++;
1027       break;
1028     case 0:
1029       return 0;
1030     default:
1031       break;
1032     }
1033   }
1034 } // CParser::stringConstant
1035 
1036 /**
1037  * Read a character constant.
1038  * @return the end position, if recognized, and 0 if not
1039  */
charConstant(unsigned pos)1040 unsigned CParser::charConstant(unsigned pos)
1041 {
1042   CALL("CParser::charConstant");
1043   ASS(_input[pos] == '\'');
1044 
1045   switch (_input[++pos]) {
1046   case '\'':
1047     return 0;
1048   case '\\':
1049     pos++;
1050     break;
1051   case 0:
1052     return 0;
1053   default:
1054     break;
1055   }
1056   if (_input[++pos] == '\'') {
1057     return pos+1;
1058   }
1059   return 0;
1060 } // CParser::charConstant
1061 
1062 #if VDEBUG
output(ostream & str)1063 void CParser::output(ostream& str)
1064 {
1065   CALL("CParser::output");
1066 
1067   for (unsigned i = 0;;i++) {
1068     Token& tok = _tokens[i];
1069     if (tok.type == LT_EOF) {
1070       return;
1071     }
1072     for (unsigned p = tok.start;p < tok.end;p++) {
1073       str << _input[p];
1074     }
1075     str << '\n';
1076   }
1077 }
1078 #endif
1079 
1080 /**
1081  * Parse a primary expression using the following grammar.
1082  *  primary_expression
1083  *   : IDENTIFIER
1084  *   | CONSTANT
1085  *   | STRING_LITERAL
1086  *   | '(' expression ')';
1087  */
primaryExpression(unsigned pos,bool backtrack)1088 unsigned CParser::primaryExpression(unsigned pos,bool backtrack)
1089 {
1090   CALL("CParser::primaryExpression");
1091 
1092   Token& tok = _tokens[pos];
1093   switch (tok.type) {
1094   case LT_IDENTIFIER:
1095     _units.push_back(new Identifier(pos,pos+1));
1096     return pos+1;
1097 
1098   case LT_INT_CONST:
1099   case LT_LONG_CONST:
1100   case LT_UINT_CONST:
1101   case LT_ULONG_CONST:
1102   case LT_FLOAT_CONST:
1103   case LT_DOUBLE_CONST:
1104   case LT_STRING_CONST:
1105   case LT_CHAR_CONST:
1106     _units.push_back(new ConstantExpression(pos,pos+1));
1107     return pos+1;
1108 
1109   case LT_LPAR: {
1110     unsigned end = expression(pos,backtrack);
1111     if (!end) return 0;
1112     if (!consumeToken(LT_RPAR,end+1,backtrack)) return 0;
1113     return end+2;
1114   }
1115   default:
1116     if (backtrack) return 0;
1117     throw ParserException(*this,pos,"primary expression expected");
1118   }
1119 } // primaryExpression
1120 
1121 
1122 /**
1123  * Parse a postfix expression using the following grammar.
1124  * postfix_expression
1125  *	: primary_expression
1126  *	| postfix_expression '[' expression ']'
1127  *	| postfix_expression '(' ')'
1128  *	| postfix_expression '(' argument_expression_list ')'
1129  *	| postfix_expression '.' IDENTIFIER
1130  *	| postfix_expression PTR_OP IDENTIFIER
1131  *	| postfix_expression INC_OP
1132  *	| postfix_expression DEC_OP
1133  *	| '(' type_name ')' '{' initializer_list '}'
1134  *	| '(' type_name ')' '{' initializer_list ',' '}'
1135  * @warning the last two cases are not implemented
1136 */
postfixExpression(unsigned pos,bool backtrack)1137 unsigned CParser::postfixExpression(unsigned pos,bool backtrack)
1138 {
1139   CALL("CParser::postfixExpression");
1140   unsigned end = primaryExpression(pos,backtrack);
1141   if (!end) return 0;
1142 
1143   for (;;) {
1144     Token& tok = _tokens[end];
1145     switch (tok.type) {
1146     case LT_LBRACKET: {
1147       pos = expression(end+1,backtrack);
1148       if (!pos) return 0;
1149       if (!consumeToken(LT_RBRACKET,pos+1,backtrack)) return 0;
1150       pos += 2;
1151       Unit* rhs = _units.back();
1152       _units.pop_back();
1153       Unit* lhs = _units.back();
1154       _units.pop_back();
1155       _units.push_back(new ArrayApplication(lhs->start(),pos,lhs,rhs));
1156       break;
1157     }
1158 
1159     case LT_DOT:
1160     case LT_PTR_OP:
1161       if (!consumeToken(LT_IDENTIFIER,end+1,backtrack)) return 0;
1162       pos = end+2;
1163       break;
1164 
1165     case LT_INC_OP:
1166     case LT_DEC_OP:
1167       pos = end+1;
1168       break;
1169 
1170     case LT_LPAR:
1171       if (_tokens[end+1].type == LT_RPAR) return end+2;
1172       pos = argumentExpressionList(pos,backtrack);
1173       if (!pos) return 0;
1174       if (!consumeToken(LT_RPAR,pos,backtrack)) return 0;
1175       pos++;
1176       break;
1177 
1178     default:
1179       return end;
1180     }
1181   }
1182 } //
1183 
1184 /**
1185  * If the token type at position pos is t, return true.
1186  * Otherwise, if backtrack is true, then return 0, otherwise
1187  * raise an exception.
1188  */
consumeToken(LTType t,unsigned pos,bool backtrack)1189 bool CParser::consumeToken(LTType t,unsigned pos,bool backtrack)
1190 {
1191   if (_tokens[pos].type == t) return true;
1192   if (backtrack) return false;
1193   const char* what;
1194   switch (t) {
1195   case LT_RPAR:
1196     what = "}";
1197     break;
1198   case LT_IDENTIFIER:
1199     what = "identifier";
1200   default:
1201     ASS(false);
1202   }
1203   throw ParserException(*this,pos,(vstring)what + " expected");
1204 } // consumeToken
1205 
1206 /**
1207  * Parse a unary expression using the following grammar.
1208  *   unary_expression
1209  *	: postfix_expression
1210  *	| INC_OP unary_expression
1211  *	| DEC_OP unary_expression
1212  *	| unary_operator cast_expression
1213  *	| SIZEOF unary_expression
1214  *	| SIZEOF '(' type_name ')'
1215  *	;
1216  *  unary_operator : '&' | '*' | '+' | '-' | '~' | '!'
1217  * @warning two sizeof cases are not implemented and cast_expression is replaced by unary_expression
1218 */
unaryExpression(unsigned pos,bool backtrack)1219 unsigned CParser::unaryExpression(unsigned pos,bool backtrack)
1220 {
1221   CALL("CParser::unaryExpression");
1222 
1223   for (;;) {
1224     bool done = false;
1225     switch (_tokens[pos].type) {
1226     case LT_INC_OP:
1227     case LT_DEC_OP:
1228     case LT_AMP:
1229     case LT_MULT:
1230     case LT_ADD:
1231     case LT_MINUS:
1232     case LT_TILDE:
1233     case LT_EXCLAMATION:
1234       pos++;
1235       break;
1236     default:
1237       done = true;
1238       break;
1239     }
1240     if (done) {
1241       break;
1242     }
1243   }
1244   return postfixExpression(pos,backtrack);
1245 } // unaryExpression
1246 
1247 /**
1248  * Parse a multiplicative expression using the following grammar.
1249  * multiplicative_expression
1250  *	: cast_expression
1251  *	| multiplicative_expression '*' cast_expression
1252  *	| multiplicative_expression '/' cast_expression
1253  *	| multiplicative_expression '%' cast_expression
1254  * @warning since cast_expression is replaced by unary_expression
1255 */
multiplicativeExpression(unsigned pos,bool backtrack)1256 unsigned CParser::multiplicativeExpression(unsigned pos,bool backtrack)
1257 {
1258   CALL("CParser::multiplicativeExpression");
1259 
1260   for (;;) {
1261     unsigned end = unaryExpression(pos,backtrack);
1262     if (!end) return 0;
1263     switch (_tokens[end].type) {
1264     case LT_MULT:
1265     case LT_DIV:
1266     case LT_MOD:
1267       pos = end+1;
1268       break;
1269     default:
1270       return end;
1271     }
1272   }
1273 } // CParser::multiplicativeExpression
1274 
1275 /**
1276  * Parse a additive expression using the following grammar.
1277  * additive_expression
1278  *	: multiplicative_expression
1279  *	| additive_expression '+' multiplicative_expression
1280  *	| additive_expression '-' multiplicative_expression
1281 */
additiveExpression(unsigned pos,bool backtrack)1282 unsigned CParser::additiveExpression(unsigned pos,bool backtrack)
1283 {
1284   CALL("CParser::additiveExpression");
1285 
1286   for (;;) {
1287     unsigned end = multiplicativeExpression(pos,backtrack);
1288     if (!end) return 0;
1289     switch (_tokens[end].type) {
1290     case LT_ADD:
1291     case LT_MINUS:
1292       pos = end+1;
1293       break;
1294     default:
1295       return end;
1296     }
1297   }
1298 } // CParser::additiveExpression
1299 
1300 /**
1301  * Parse a shift expression using the following grammar.
1302  * shift_expression
1303  *	: additive_expression
1304  *	| shift_expression LEFT_OP additive_expression
1305  *	| shift_expression RIGHT_OP additive_expression
1306  */
shiftExpression(unsigned pos,bool backtrack)1307 unsigned CParser::shiftExpression(unsigned pos,bool backtrack)
1308 {
1309   CALL("CParser::shiftExpression");
1310 
1311   for (;;) {
1312     unsigned end = additiveExpression(pos,backtrack);
1313     if (!end) return 0;
1314     switch (_tokens[end].type) {
1315     case LT_LEFT_OP:
1316     case LT_RIGHT_OP:
1317       pos = end+1;
1318       break;
1319     default:
1320       return end;
1321     }
1322   }
1323 } // CParser::shiftExpression
1324 
1325 
1326 /**
1327  * Parse a relational expression using the following grammar.
1328  * relational_expression
1329  *	: shift_expression
1330  *	| relational_expression '<' shift_expression
1331  *	| relational_expression '>' shift_expression
1332  *	| relational_expression LE_OP shift_expression
1333  *	| relational_expression GE_OP shift_expression
1334  */
relationalExpression(unsigned pos,bool backtrack)1335 unsigned CParser::relationalExpression(unsigned pos,bool backtrack)
1336 {
1337   CALL("CParser::relationalExpression");
1338 
1339   for (;;) {
1340     unsigned end = shiftExpression(pos,backtrack);
1341     if (!end) return 0;
1342     switch (_tokens[end].type) {
1343     case LT_LESS:
1344     case LT_GREATER:
1345     case LT_LE_OP:
1346     case LT_GE_OP:
1347       pos = end+1;
1348       break;
1349     default:
1350       return end;
1351     }
1352   }
1353 } // CParser::relationalExpression
1354 
1355 /**
1356  * Parse an equality expression using the following grammar.
1357  * equality_expression
1358  *	: relational_expression
1359  *	| equality_expression EQ_OP relational_expression
1360  *	| equality_expression NE_OP relational_expression
1361  */
equalityExpression(unsigned pos,bool backtrack)1362 unsigned CParser::equalityExpression(unsigned pos,bool backtrack)
1363 {
1364   CALL("CParser::equalityExpression");
1365 
1366   for (;;) {
1367     unsigned end = relationalExpression(pos,backtrack);
1368     if (!end) return 0;
1369     switch (_tokens[end].type) {
1370     case LT_EQ_OP:
1371     case LT_NE_OP:
1372       pos = end+1;
1373       break;
1374     default:
1375       return end;
1376     }
1377   }
1378 } // CParser::equalityExpression
1379 
1380 /**
1381  * Parse an and expression using the following grammar.
1382  * and_expression
1383  * and_expression
1384  *	: equality_expression
1385  *	| and_expression '&' equality_expression
1386  */
andExpression(unsigned pos,bool backtrack)1387 unsigned CParser::andExpression(unsigned pos,bool backtrack)
1388 {
1389   CALL("CParser::andExpression");
1390 
1391   for (;;) {
1392     unsigned end = equalityExpression(pos,backtrack);
1393     if (!end) return 0;
1394     switch (_tokens[end].type) {
1395     case LT_AMP:
1396       pos = end+1;
1397       break;
1398     default:
1399       return end;
1400     }
1401   }
1402 } // CParser::andExpression
1403 
1404 
1405 /**
1406  * Parse an exclusive or expression using the following grammar.
1407  * exclusive_or_expression
1408  *	: and_expression
1409  *	| exclusive_or_expression '^' and_expression
1410  */
xorExpression(unsigned pos,bool backtrack)1411 unsigned CParser::xorExpression(unsigned pos,bool backtrack)
1412 {
1413   CALL("CParser::xorExpression");
1414 
1415   for (;;) {
1416     unsigned end = andExpression(pos,backtrack);
1417     if (!end) return 0;
1418     switch (_tokens[end].type) {
1419     case LT_XOR:
1420       pos = end+1;
1421       break;
1422     default:
1423       return end;
1424     }
1425   }
1426 } // CParser::xorExpression
1427 
1428 /**
1429  * Parse an inclusive or expression using the following grammar.
1430  * inclusive_or_expression
1431  *	: exclusive_or_expression
1432  *	| inclusive_or_expression '|' exclusive_or_expression
1433  */
orExpression(unsigned pos,bool backtrack)1434 unsigned CParser::orExpression(unsigned pos,bool backtrack)
1435 {
1436   CALL("CParser::orExpression");
1437 
1438   for (;;) {
1439     unsigned end = xorExpression(pos,backtrack);
1440     if (!end) return 0;
1441     switch (_tokens[end].type) {
1442     case LT_BAR:
1443       pos = end+1;
1444       break;
1445     default:
1446       return end;
1447     }
1448   }
1449 } // CParser::orExpression
1450 
1451 /**
1452  * Parse a logical and expression using the following grammar.
1453  * logical_and_expression
1454  *	: inclusive_or_expression
1455  *	| logical_and_expression AND_OP inclusive_or_expression
1456  */
logicalAndExpression(unsigned pos,bool backtrack)1457 unsigned CParser::logicalAndExpression(unsigned pos,bool backtrack)
1458 {
1459   CALL("CParser::logicalAndExpression");
1460 
1461   for (;;) {
1462     unsigned end = orExpression(pos,backtrack);
1463     if (!end) return 0;
1464     switch (_tokens[end].type) {
1465     case LT_AND_OP:
1466       pos = end+1;
1467       break;
1468     default:
1469       return end;
1470     }
1471   }
1472 } // CParser::logicalAndExpression
1473 
1474 /**
1475  * Parse a logical or expression using the following grammar.
1476  * logical_or_expression
1477  *	: logical_and_expression
1478  *	| logical_or_expression OR_OP logical_and_expression
1479  */
logicalOrExpression(unsigned pos,bool backtrack)1480 unsigned CParser::logicalOrExpression(unsigned pos,bool backtrack)
1481 {
1482   CALL("CParser::logicalOrExpression");
1483 
1484   for (;;) {
1485     unsigned end = logicalAndExpression(pos,backtrack);
1486     if (!end) return 0;
1487     switch (_tokens[end].type) {
1488     case LT_OR_OP:
1489       pos = end+1;
1490       break;
1491     default:
1492       return end;
1493     }
1494   }
1495 } // CParser::logicalOrExpression
1496 
1497 /**
1498  * Parse a conditional expression using the following grammar.
1499  * conditional_expression
1500  *	: logical_or_expression
1501  *	| logical_or_expression '?' expression ':' conditional_expression
1502  */
conditionalExpression(unsigned pos,bool backtrack)1503 unsigned CParser::conditionalExpression(unsigned pos,bool backtrack)
1504 {
1505   CALL("CParser::conditionalExpression");
1506 
1507   for (;;) {
1508     unsigned end = logicalOrExpression(pos,backtrack);
1509     if (!end) return 0;
1510     switch (_tokens[end].type) {
1511     case LT_QUESTION:
1512       pos = end+1;
1513       end = expression(pos,backtrack);
1514       if (!end) return 0;
1515       consumeToken(LT_COLON,end,backtrack);
1516       pos = end+1;
1517       break;
1518     default:
1519       return end;
1520     }
1521   }
1522 } // CParser::conditionalExpression
1523 
1524 /**
1525  * Parse an assignment expression using the following grammar.
1526  * assignment_expression
1527  *	: conditional_expression
1528  *	| unary_expression assignment_operator assignment_expression
1529  */
assignmentExpression(unsigned pos,bool backtrack)1530 unsigned CParser::assignmentExpression(unsigned pos,bool backtrack)
1531 {
1532   CALL("CParser::assignmentExpression");
1533   NOT_IMPLEMENTED;
1534 }
1535 
1536 /*
1537 assignment_operator
1538 	: '='
1539 	| MUL_ASSIGN
1540 	| DIV_ASSIGN
1541 	| MOD_ASSIGN
1542 	| ADD_ASSIGN
1543 	| SUB_ASSIGN
1544 	| LEFT_ASSIGN
1545 	| RIGHT_ASSIGN
1546 	| AND_ASSIGN
1547 	| XOR_ASSIGN
1548 	| OR_ASSIGN
1549 	;
1550 
1551 expression
1552 	: assignment_expression
1553 	| expression ',' assignment_expression
1554 	;
1555 
1556 */
1557 
expression(unsigned pos,bool backtrack)1558 unsigned CParser::expression(unsigned pos,bool backtrack)
1559 {
1560   return 0;
1561 }
1562 
argumentExpressionList(unsigned int,bool)1563 unsigned CParser::argumentExpressionList(unsigned int, bool)
1564 {
1565   return 0;
1566 }
1567 
Identifier(unsigned start,unsigned end)1568 CParser::Identifier::Identifier(unsigned start, unsigned end)
1569   : Unit(PT_IDENTIFIER,start,end)
1570 {
1571 }
1572 
ConstantExpression(unsigned start,unsigned end)1573 CParser::ConstantExpression::ConstantExpression(unsigned start, unsigned end)
1574   : Unit(PT_CONSTANT_EXPRESSION,start,end)
1575 {
1576 }
1577