1
2 /*
3 * File CParser.cpp.
4 *
5 * This file is part of the source code of the software program
6 * Vampire. It is protected by applicable
7 * copyright laws.
8 *
9 * This source code is distributed under the licence found here
10 * https://vprover.github.io/license.html
11 * and in the source directory
12 *
13 * In summary, you are allowed to use Vampire for non-commercial
14 * purposes but not allowed to distribute, modify, copy, create derivatives,
15 * or use in competitions.
16 * For other uses of Vampire please contact developers for a different
17 * licence, which we will make an effort to provide.
18 */
19 /**
20 * Implements class CParser
21 *
22 * @since 13/01/2011 Manchester
23 */
24
25 #include <cstring>
26
27 #include "Debug/Assertion.hpp"
28 #include "Debug/Tracer.hpp"
29
30 #include "Lib/Int.hpp"
31 #include "Lib/Exception.hpp"
32
33 #include "CParser.hpp"
34
35 using namespace Shell;
36 using namespace Lib;
37
38 /**
39 * Initialise a parser.
40 */
CParser(const char * input)41 CParser::CParser(const char* input)
42 : _input(input)
43 {
44 } // CParser::CParser
45
~CParser()46 CParser::~CParser()
47 {
48 } // CParser::~CParser
49
50
51 /**
52 * Read an integer type suffix identified by the following grammar:
53 * ('u'|'U')? ('l'|'L')
54 * | ('u'|'U') ('l'|'L')?
55 * Save in @b tt its token type.
56 * @return the end position, if recognized and 0 if not
57 */
integerTypeSuffix(unsigned pos,LTType & tt)58 unsigned CParser::integerTypeSuffix(unsigned pos,LTType& tt)
59 {
60 CALL("CParser::integerTypeSuffix");
61 switch (_input[pos]) {
62 case 'u':
63 case 'U': {
64 switch (_input[pos+1]) {
65 case 'l':
66 case 'L':
67 tt = LT_ULONG_CONST;
68 return pos+2;
69 default:
70 tt = LT_UINT_CONST;
71 return pos+1;
72 }
73 }
74 case 'l':
75 case 'L':
76 tt = LT_LONG_CONST;
77 return pos+1;
78 default:
79 tt = LT_INT_CONST;
80 return 0;
81 }
82 } // integerTypeSuffix
83
84 /**
85 * Read a decimal literal and saving its type in @b tt.
86 * Decimal literals use the following grammar.
87 * DECIMAL_LITERAL : ('0' | '1'..'9' '0'..'9'*) IntegerTypeSuffix?
88 * @return the end position, if recognized and 0 if not
89 */
decimalLiteral(unsigned pos,LTType & tt)90 unsigned CParser::decimalLiteral(unsigned pos,LTType& tt)
91 {
92 CALL("CParser::decimalLiteral");
93
94 switch (_input[pos]) {
95 case '0':
96 pos++;
97 break;
98
99 case '1':
100 case '2':
101 case '3':
102 case '4':
103 case '5':
104 case '6':
105 case '7':
106 case '8':
107 case '9':
108 for (;;) {
109 char n = _input[++pos];
110 if (n < '0' || n > '9') {
111 break;
112 }
113 }
114 break;
115 default:
116 return 0;
117 }
118 unsigned pos1 = integerTypeSuffix(pos,tt);
119 return pos1 ? pos1 : pos;
120 } // decimalLiteral
121
122 /**
123 * Read an octal literal identified by the following grammar:
124 * OCTAL_LITERAL : '0' ('0'..'7')+ IntegerTypeSuffix? ;
125 * Save in @b tt its token type
126 * @return the end position, if recognized and 0 if not
127 */
octalLiteral(unsigned pos,LTType & tt)128 unsigned CParser::octalLiteral(unsigned pos,LTType& tt)
129 {
130 CALL("CParser::octalLiteral");
131
132 if (_input[pos] != '0') {
133 return 0;
134 }
135 for (;;) {
136 char n = _input[++pos];
137 if (n < '0' || n > '7') {
138 break;
139 }
140 }
141 unsigned pos1 = integerTypeSuffix(pos,tt);
142 return pos1 ? pos1 : pos;
143 } // octalLiteral
144
145 /**
146 * True if c is a character that can occur in a hex literal.
147 * HexDigit : ('0'..'9'|'a'..'f'|'A'..'F') ;
148 */
hexDigit(char c)149 bool CParser::hexDigit(char c)
150 {
151 CALL("CParser::hexDigit");
152
153 switch (c) {
154 case '0':
155 case '1':
156 case '2':
157 case '3':
158 case '4':
159 case '5':
160 case '6':
161 case '7':
162 case '8':
163 case '9':
164 case 'a':
165 case 'b':
166 case 'c':
167 case 'd':
168 case 'e':
169 case 'f':
170 case 'A':
171 case 'B':
172 case 'C':
173 case 'D':
174 case 'E':
175 case 'F':
176 return true;
177 default:
178 return false;
179 }
180 } // hexDigit
181
182 /**
183 * Read an octal literal identified by the following grammar:
184 * HEX_LITERAL : '0' ('x'|'X') HexDigit+ IntegerTypeSuffix? ;
185 * Save in @b tt its token type.
186 * @return the end position, if recognized and 0 if not
187 */
hexLiteral(unsigned pos,LTType & tt)188 unsigned CParser::hexLiteral(unsigned pos,LTType& tt)
189 {
190 CALL("CParser::hexLiteral");
191
192 if (_input[pos] != '0' ||
193 _input[pos+1] != 'x' ||
194 !hexDigit(_input[pos+2])) {
195 return 0;
196 }
197 pos += 2;
198 while (hexDigit(_input[++pos])) {}
199 unsigned pos1 = integerTypeSuffix(pos,tt);
200 return pos1 ? pos1 : pos;
201 } // hexLiteral
202
203 /**
204 * Read an exponent.
205 * Exponent : ('e'|'E') ('+'|'-')? ('0'..'9')+ ;
206 * @return the end position, if recognized and 0 if not
207 */
exponent(unsigned pos)208 unsigned CParser::exponent(unsigned pos)
209 {
210 CALL("CParser::exponent");
211
212 switch (_input[pos]) {
213 case 'e':
214 case 'E':
215 pos++;
216 break;
217 default:
218 return 0;
219 }
220 switch (_input[++pos]) {
221 case '+':
222 case '-':
223 pos++;
224 break;
225 default:
226 break;
227 }
228 if (!digit(_input[pos])) {
229 return 0;
230 }
231 while (digit(_input[++pos])) {}
232 return pos;
233 } // exponent
234
235 /** True if the character is a float type suffix and set @b tt
236 * to the appropriate type.
237 * FloatTypeSuffix : ('f'|'F'|'d'|'D') ;
238 */
floatTypeSuffix(char c,LTType & tt)239 bool CParser::floatTypeSuffix(char c,LTType& tt)
240 {
241 CALL("CParser::floatTypeSuffix");
242
243 switch (c) {
244 case 'f':
245 case 'F':
246 tt = LT_FLOAT_CONST;
247 return true;
248 case 'd':
249 case 'D':
250 tt = LT_DOUBLE_CONST;
251 return true;
252 default:
253 tt = LT_FLOAT_CONST;
254 return false;
255 }
256 } // floatTypeSuffix
257
258 /**
259 * Read a floating point literal.
260 * FLOATING_POINT_LITERAL
261 * : ('0'..'9')+ '.' ('0'..'9')* Exponent? FloatTypeSuffix?
262 * | '.' ('0'..'9')+ Exponent? FloatTypeSuffix?
263 * | ('0'..'9')+ Exponent FloatTypeSuffix?
264 * | ('0'..'9')+ FloatTypeSuffix;
265 * @return the end position, if recognized, and 0 if not
266 */
floatingPointLiteral(unsigned pos,LTType & tt)267 unsigned CParser::floatingPointLiteral(unsigned pos,LTType& tt)
268 {
269 CALL("CParser::floatingPointLiteral");
270
271 if (_input[pos] == '.') { // case 2
272 if (!digit(_input[++pos])) {
273 return 0;
274 }
275 while (digit(_input[++pos])) {}
276 unsigned pos1 = exponent(pos);
277 if (pos1) {
278 pos = pos1;
279 }
280 return floatTypeSuffix(_input[pos],tt) ? pos+1 : pos;
281 }
282 // cases 1,3,4
283 if (!digit(_input[pos])) {
284 return 0;
285 }
286 while (digit(_input[++pos])) {}
287 if (floatTypeSuffix(_input[pos],tt)) { // case 4
288 return pos+1;
289 }
290 if (_input[pos] != '.') { // case 3
291 pos = exponent(pos);
292 if (!pos) {
293 return 0;
294 }
295 return floatTypeSuffix(_input[pos],tt) ? pos+1 : pos;
296 }
297 // case 1
298 while (digit(_input[++pos])) {}
299 unsigned pos1 = exponent(pos);
300 if (pos1) {
301 pos = pos1;
302 }
303 return floatTypeSuffix(_input[pos],tt) ? pos+1 : pos;
304 } // floatingPointLiteral
305
306 /** true of c is a letter: $, _, A-Z, a-z */
letter(char c)307 bool CParser::letter(char c)
308 {
309 return c == '$'
310 || c == '_'
311 || (c >= 'A' && c <= 'Z')
312 || (c >= 'a' && c <= 'z');
313 } // letter
314
315 /**
316 * Read an identifier.
317 * IDENTIFIER : LETTER (LETTER|'0'..'9')*
318 * @return the end position, if recognized, and 0 if not
319 */
identifier(unsigned pos)320 unsigned CParser::identifier(unsigned pos)
321 {
322 CALL("CParser::identifier");
323
324 if (!letter(_input[pos])) {
325 return 0;
326 }
327 for (;;) {
328 char c = _input[++pos];
329 if (letter(c) || digit(c)) continue;
330 return pos;
331 }
332 } // identifier
333
334 /**
335 * Skip whitespace characters, comments, and lines having # as the first
336 * non-whitespace character.
337 * @return the position of first character after.
338 */
skipWhiteSpacesAndComments(unsigned pos)339 unsigned CParser::skipWhiteSpacesAndComments(unsigned pos)
340 {
341 CALL("CParser::skipWhiteSpacesAndComments");
342
343 bool newLine = (pos == 0);
344 for (;;) {
345 switch (_input[pos]) {
346 case ' ':
347 case '\t':
348 pos++;
349 break;
350 case '\r':
351 case '\n':
352 newLine = true;
353 pos++;
354 break;
355 case '#':
356 if (!newLine) {
357 throw LexerException(*this,pos,"character # not in the beginning of a line");
358 }
359 pos = skipToEndOfLine(pos+1);
360 break;
361 case '/':
362 switch (_input[pos+1]) {
363 case '*':
364 newLine = false;
365 // skip to the end of comments
366 pos = skipToEndOfComment(pos+2);
367 break;
368 case '/':
369 pos = skipToEndOfLine(pos+2);
370 newLine = true;
371 break;
372 default:
373 return pos;
374 }
375 break;
376 default:
377 return pos;
378 }
379 }
380 } // CParser::skipWhiteSpacesAndComments
381
382 /**
383 * Skip characters to the end of line.
384 * @return the position of first character after.
385 */
skipToEndOfLine(unsigned pos)386 unsigned CParser::skipToEndOfLine(unsigned pos)
387 {
388 CALL("CParser::skipToEndOfLine");
389
390 for (;;) {
391 char c = _input[pos];
392 switch (c) {
393 case 0:
394 return pos;
395 case '\n':
396 return pos+1;
397 case '\r':
398 return _input[pos+1] == '\n' ? pos+2 : pos+1;
399 default:
400 pos++;
401 break;
402 }
403 }
404 } // CParser::skipToEndOfLine
405
406 /**
407 * Skip characters to the end of comment.
408 * @return the position of first character after.
409 */
skipToEndOfComment(unsigned pos)410 unsigned CParser::skipToEndOfComment(unsigned pos)
411 {
412 CALL("CParser::skipToEndOfComment");
413
414 unsigned start = pos;
415 for (;;) {
416 char c = _input[pos];
417 switch (c) {
418 case 0:
419 throw LexerException(*this,start-2,"non-terminated comment");
420 case '*':
421 if (_input[pos+1] == '/') {
422 return pos+2;
423 }
424 pos++;
425 break;
426 default:
427 pos++;
428 break;
429 }
430 }
431 } // CParser::skipToEndOfComment
432
433 /**
434 * Create a new lexer exception.
435 */
LexerException(const CParser & parser,unsigned pos,vstring message)436 CParser::LexerException::LexerException (const CParser& parser,unsigned pos,vstring message)
437 {
438 _message = message + " at position " + Int::toString(pos) + "\n>>>" + Int::toString(int(parser.input()[pos])) + "<<<" +
439 "\n---------------" + (parser.input()+pos) + "\n---------------\n";
440 } // LexerException::LexerException
441
442 /**
443 * Write itself to an ostream.
444 */
cry(ostream & out)445 void CParser::LexerException::cry (ostream& out)
446 {
447 out << "C lexer exception: " << _message << '\n';
448 } // CParser::LexerException::cry
449
450 /**
451 * Create a new parser exception.
452 */
ParserException(const CParser & parser,unsigned pos,vstring message)453 CParser::ParserException::ParserException (const CParser& parser,unsigned pos,vstring message)
454 {
455 _message = message + " at position " + Int::toString(pos) + "\n>>>" + Int::toString(int(parser.input()[pos])) + "<<<" +
456 "\n---------------" + (parser.input()+pos) + "\n---------------\n";
457 } // ParserException::ParserException
458
459 /**
460 * Write itself to an ostream.
461 */
cry(ostream & out)462 void CParser::ParserException::cry (ostream& out)
463 {
464 out << "C parser exception: " << _message << '\n';
465 } // CParser::ParserException::cry
466
tokenize()467 void CParser::tokenize()
468 {
469 CALL("CParser::parse");
470
471 unsigned pos = 0;
472 unsigned end;
473 Token token;
474
475 for (;;) {
476 pos = skipWhiteSpacesAndComments(pos);
477 if (! _input[pos]) {
478 token.type = LT_EOF;
479 token.start = pos;
480 token.end = pos;
481 _tokens.push_back(token);
482 return;
483 }
484
485 token.start = pos;
486 switch (_input[pos]) {
487 case 'a':
488 case 'b':
489 case 'c':
490 case 'd':
491 case 'e':
492 case 'f':
493 case 'g':
494 case 'i':
495 case 'l':
496 case 'r':
497 case 's':
498 case 't':
499 case 'u':
500 case 'v':
501 case 'w':
502 end = identifier(pos);
503 token.type = keyword(pos,end);
504 break;
505
506 case 'h':
507 case 'j':
508 case 'k':
509 case 'm':
510 case 'n':
511 case 'o':
512 case 'p':
513 case 'q':
514 case 'x':
515 case 'y':
516 case 'z':
517 case 'A':
518 case 'B':
519 case 'C':
520 case 'D':
521 case 'E':
522 case 'F':
523 case 'G':
524 case 'H':
525 case 'I':
526 case 'J':
527 case 'K':
528 case 'L':
529 case 'M':
530 case 'N':
531 case 'O':
532 case 'P':
533 case 'Q':
534 case 'R':
535 case 'S':
536 case 'T':
537 case 'U':
538 case 'V':
539 case 'W':
540 case 'X':
541 case 'Y':
542 case 'Z':
543 case '$':
544 case '_':
545 end = identifier(pos);
546 token.type = LT_IDENTIFIER;
547 break;
548
549 case '{':
550 end = pos+1;
551 token.type = LT_LBRACE;
552 break;
553 case '}':
554 end = pos+1;
555 token.type = LT_RBRACE;
556 break;
557 case '(':
558 end = pos+1;
559 token.type = LT_LPAR;
560 break;
561 case ')':
562 end = pos+1;
563 token.type = LT_RPAR;
564 break;
565 case ';':
566 end = pos+1;
567 token.type = LT_SEMICOLON;
568 break;
569
570 case '=':
571 end = pos+1;
572 if (_input[end] == '=') {
573 end++;
574 token.type = LT_EQ_OP;
575 break;
576 }
577 token.type = LT_ASSIGN;
578 break;
579
580 case '+':
581 end = pos+1;
582 if (_input[end] == '=') {
583 end++;
584 token.type = LT_ADD_ASSIGN;
585 break;
586 }
587 if (_input[end] == '+') {
588 end++;
589 token.type = LT_INC_OP;
590 break;
591 }
592 token.type = LT_ADD;
593 break;
594
595 case '*':
596 end = pos+1;
597 if (_input[end] == '=') {
598 end++;
599 token.type = LT_MULT_ASSIGN;
600 break;
601 }
602 token.type = LT_MULT;
603 break;
604
605 case '.':
606 end = pos+1;
607 if (_input[end] == '.') {
608 if (_input[end+1] != '.') {
609 throw LexerException(*this,pos,"bad expression");
610 }
611 end += 2;
612 token.type = LT_ELLIPSIS;
613 break;
614 }
615 token.type = LT_DOT;
616 break;
617
618 case '>':
619 end = pos+1;
620 if (_input[end] == '=') {
621 end++;
622 token.type = LT_GE_OP;
623 break;
624 }
625 if (_input[end] != '>') {
626 token.type = LT_GREATER;
627 break;
628 }
629 if (_input[end+1] == '=') {
630 end += 2;
631 token.type = LT_RIGHT_ASSIGN;
632 break;
633 }
634 end++;
635 token.type = LT_RIGHT_OP;
636 break;
637
638 case '<':
639 end = pos+1;
640 if (_input[end] == '=') {
641 end++;
642 token.type = LT_LE_OP;
643 break;
644 }
645 if (_input[end] == ':') {
646 end++;
647 token.type = LT_LBRACKET;
648 break;
649 }
650 if (_input[end] == '%') {
651 end++;
652 token.type = LT_LBRACE;
653 break;
654 }
655 if (_input[end] != '<') {
656 token.type = LT_LESS;
657 break;
658 }
659 if (_input[end+1] == '=') {
660 end += 2;
661 token.type = LT_LEFT_ASSIGN;
662 break;
663 }
664 end++;
665 token.type = LT_LEFT_OP;
666 break;
667
668 case '-':
669 end = pos+1;
670 if (_input[end] == '=') {
671 end++;
672 token.type = LT_SUB_ASSIGN;
673 break;
674 }
675 if (_input[end] == '-') {
676 end++;
677 token.type = LT_DEC_OP;
678 break;
679 }
680 if (_input[end] == '>') {
681 end++;
682 token.type = LT_PTR_OP;
683 break;
684 }
685 token.type = LT_MINUS;
686 break;
687
688 case '/':
689 end = pos+1;
690 if (_input[end] == '=') {
691 end++;
692 token.type = LT_DIV_ASSIGN;
693 break;
694 }
695 token.type = LT_DIV;
696 break;
697
698 case '%':
699 end = pos+1;
700 if (_input[end] == '=') {
701 end++;
702 token.type = LT_MOD_ASSIGN;
703 break;
704 }
705 if (_input[end] == '>') {
706 end++;
707 token.type = LT_RBRACE;
708 break;
709 }
710 token.type = LT_MOD;
711 break;
712
713 case '&':
714 end = pos+1;
715 if (_input[end] == '=') {
716 end++;
717 token.type = LT_AND_ASSIGN;
718 break;
719 }
720 if (_input[end] == '&') {
721 end++;
722 token.type = LT_AND_OP;
723 break;
724 }
725 token.type = LT_AMP;
726 break;
727
728 case '|':
729 end = pos+1;
730 if (_input[end] == '=') {
731 end++;
732 token.type = LT_OR_ASSIGN;
733 break;
734 }
735 if (_input[end] == '|') {
736 end++;
737 token.type = LT_OR_OP;
738 break;
739 }
740 token.type = LT_BAR;
741 break;
742
743 case '^':
744 end = pos+1;
745 if (_input[end] == '=') {
746 end++;
747 token.type = LT_XOR_ASSIGN;
748 break;
749 }
750 token.type = LT_XOR;
751 break;
752
753 case '!':
754 end = pos+1;
755 if (_input[end] == '=') {
756 end++;
757 token.type = LT_NE_OP;
758 break;
759 }
760 token.type = LT_EXCLAMATION;
761 break;
762
763 case ':':
764 end = pos+1;
765 if (_input[end] == '>') {
766 end++;
767 token.type = LT_RBRACKET;
768 break;
769 }
770 token.type = LT_COLON;
771 break;
772
773 case '[':
774 end = pos+1;
775 token.type = LT_LBRACKET;
776 break;
777
778 case ']':
779 end = pos+1;
780 token.type = LT_RBRACKET;
781 break;
782
783 case ',':
784 end = pos+1;
785 token.type = LT_COMMA;
786 break;
787
788 case '~':
789 end = pos+1;
790 token.type = LT_TILDE;
791 break;
792
793 case '?':
794 end = pos+1;
795 token.type = LT_QUESTION;
796 break;
797
798 case '0':
799 case '1':
800 case '2':
801 case '3':
802 case '4':
803 case '5':
804 case '6':
805 case '7':
806 case '8':
807 case '9':
808 end = numericConstant(pos,token.type);
809 if (!end) {
810 throw LexerException(*this,pos,"bad numeric constant");
811 }
812 break;
813
814 case '"':
815 end = stringConstant(pos);
816 if (!end) {
817 throw LexerException(*this,pos,"non-terminated vstring constant");
818 }
819 token.type = LT_STRING_CONST;
820 break;
821
822 case '\'':
823 end = charConstant(pos);
824 if (!end) {
825 throw LexerException(*this,pos,"bad character constant");
826 }
827 token.type = LT_CHAR_CONST;
828 break;
829
830 default:
831 throw LexerException(*this,pos,(vstring)"unrecognized symbol " + _input[pos] + " in the input");
832 }
833 // if (end <= pos) {
834 // cout << "ERRRRRRRRRRRRRRRRRRRRRRR\n" << pos << ':' << end << ':' << _input[pos];
835 // }
836 ASS(end > pos);
837 token.end = end;
838 _tokens.push_back(token);
839 pos = end;
840 }
841 } // CParser::tokenize
842
843 /**
844 * Return the token type of the identifier substring between the positions
845 * pos and end.
846 */
keyword(unsigned pos,unsigned end)847 CParser::LTType CParser::keyword(unsigned pos,unsigned end)
848 {
849 CALL("CParser::keyword");
850
851 const char* text = _input+pos;
852 switch (*text) {
853 case 'a':
854 if (keyword(text+1,"uto",end-pos-1)) return LT_AUTO;
855 break;
856 case 'b':
857 if (keyword(text+1,"reak",end-pos-1)) return LT_BREAK;
858 break;
859 case 'c':
860 switch (text[1]) {
861 case 'a':
862 if (keyword(text+2,"se",end-pos-2)) return LT_CASE;
863 break;
864 case 'h':
865 if (keyword(text+2,"ar",end-pos-2)) return LT_CHAR;
866 case 'o':
867 if (keyword(text+2,"nst",end-pos-2)) return LT_CONST;
868 if (keyword(text+2,"ntinue",end-pos-2)) return LT_CONTINUE;
869 break;
870 default:
871 break;
872 }
873 break;
874
875 case 'd':
876 switch (text[1]) {
877 case 'e':
878 if (keyword(text+2,"fault",end-pos-2)) return LT_DEFAULT;
879 break;
880 case 'o':
881 if (keyword(text+2,"",end-pos-2)) return LT_DO;
882 if (keyword(text+2,"uble",end-pos-2)) return LT_DOUBLE;
883 break;
884 default:
885 break;
886 }
887 break;
888
889 case 'e':
890 if (keyword(text+1,"lse",end-pos-1)) return LT_ELSE;
891 if (keyword(text+1,"num",end-pos-1)) return LT_ENUM;
892 if (keyword(text+1,"xtern",end-pos-1)) return LT_EXTERN;
893 break;
894
895 case 'f':
896 if (keyword(text+1,"loat",end-pos-1)) return LT_FLOAT;
897 if (keyword(text+1,"or",end-pos-1)) return LT_FOR;
898 break;
899
900 case 'g':
901 if (keyword(text+1,"oto",end-pos-1)) return LT_GOTO;
902 break;
903
904 case 'i':
905 if (keyword(text+1,"f",end-pos-1)) return LT_IF;
906 if (keyword(text+1,"nline",end-pos-1)) return LT_INLINE;
907 if (keyword(text+1,"nt",end-pos-1)) return LT_INT;
908 break;
909
910 case 'l':
911 if (keyword(text+1,"ong",end-pos-1)) return LT_LONG;
912 break;
913
914 case 'r':
915 if (keyword(text+1,"egister",end-pos-1)) return LT_REGISTER;
916 if (keyword(text+1,"estrict",end-pos-1)) return LT_RESTRICT;
917 if (keyword(text+1,"eturn",end-pos-1)) return LT_RETURN;
918 break;
919
920 case 's':
921 if (keyword(text+1,"hort",end-pos-1)) return LT_SHORT;
922 if (keyword(text+1,"igned",end-pos-1)) return LT_SIGNED;
923 if (keyword(text+1,"izeof",end-pos-1)) return LT_SIZEOF;
924 if (keyword(text+1,"truct",end-pos-1)) return LT_STRUCT;
925 if (keyword(text+1,"witch",end-pos-1)) return LT_SWITCH;
926 break;
927
928 case 't':
929 if (keyword(text+1,"ypedef",end-pos-1)) return LT_TYPEDEF;
930 break;
931
932 case 'u':
933 if (keyword(text+1,"nion",end-pos-1)) return LT_UNION;
934 if (keyword(text+1,"nsigned",end-pos-1)) return LT_UNSIGNED;
935 break;
936
937 case 'v':
938 if (keyword(text+1,"oid",end-pos-1)) return LT_VOID;
939 if (keyword(text+1,"olatile",end-pos-1)) return LT_VOLATILE;
940 break;
941
942 case 'w':
943 if (keyword(text+1,"hile",end-pos-1)) return LT_WHILE;
944 break;
945
946 default:
947 ASS(false);
948 }
949 return LT_IDENTIFIER;
950 } // CParser::keyword
951
952 /**
953 * True if the first chars characters of txt coincides with the
954 * first chars characters of word
955 * @pre the first chars characters of txt are non-zero
956 */
keyword(const char * txt,const char * word,int chars)957 bool CParser::keyword(const char* txt,const char* word,int chars)
958 {
959 CALL("CParser::keyword/3");
960
961 ASS(chars >= 0);
962
963 while (chars--) {
964 ASS(*txt);
965 if (*txt++ != *word++) return false;
966 }
967 return !*word;
968 } // CParser::keyword/3
969
970 /**
971 * Read a numeric constant and save the tag to @b tt.
972 * NUMERIC_CONSTANT
973 * : HEX_LITERAL
974 * | OCTAL_LITERAL
975 * | DECIMAL_LITERAL
976 * | FLOATING_POINT_LITERAL;
977 * @warning can be optimised by (1) putting float and integer recognition together and (2)
978 * @return the end position, if recognized, and 0 if not
979 */
numericConstant(unsigned pos,LTType & tt)980 unsigned CParser::numericConstant(unsigned pos,LTType& tt)
981 {
982 CALL("CParser::numericConstant");
983
984 if (_input[pos] != '0') {
985 unsigned end = floatingPointLiteral(pos,tt);
986 if (end) {
987 return end;
988 }
989 return decimalLiteral(pos,tt);
990 }
991 switch (_input[pos+1]) {
992 case 'x':
993 case 'X':
994 return hexLiteral(pos,tt);
995 case '0':
996 case '1':
997 case '2':
998 case '3':
999 case '4':
1000 case '5':
1001 case '6':
1002 case '7':
1003 case '8':
1004 case '9':
1005 return octalLiteral(pos,tt);
1006 default:
1007 return decimalLiteral(pos,tt);
1008 }
1009 } // CParser::numericConstant
1010
1011 /**
1012 * Read a string constant.
1013 * @return the end position, if recognized, and 0 if not
1014 */
stringConstant(unsigned pos)1015 unsigned CParser::stringConstant(unsigned pos)
1016 {
1017 CALL("CParser::stringConstant");
1018
1019 ASS(_input[pos] == '"');
1020
1021 for (;;) {
1022 switch (_input[++pos]) {
1023 case '"':
1024 return pos+1;
1025 case '\\':
1026 pos++;
1027 break;
1028 case 0:
1029 return 0;
1030 default:
1031 break;
1032 }
1033 }
1034 } // CParser::stringConstant
1035
1036 /**
1037 * Read a character constant.
1038 * @return the end position, if recognized, and 0 if not
1039 */
charConstant(unsigned pos)1040 unsigned CParser::charConstant(unsigned pos)
1041 {
1042 CALL("CParser::charConstant");
1043 ASS(_input[pos] == '\'');
1044
1045 switch (_input[++pos]) {
1046 case '\'':
1047 return 0;
1048 case '\\':
1049 pos++;
1050 break;
1051 case 0:
1052 return 0;
1053 default:
1054 break;
1055 }
1056 if (_input[++pos] == '\'') {
1057 return pos+1;
1058 }
1059 return 0;
1060 } // CParser::charConstant
1061
1062 #if VDEBUG
output(ostream & str)1063 void CParser::output(ostream& str)
1064 {
1065 CALL("CParser::output");
1066
1067 for (unsigned i = 0;;i++) {
1068 Token& tok = _tokens[i];
1069 if (tok.type == LT_EOF) {
1070 return;
1071 }
1072 for (unsigned p = tok.start;p < tok.end;p++) {
1073 str << _input[p];
1074 }
1075 str << '\n';
1076 }
1077 }
1078 #endif
1079
1080 /**
1081 * Parse a primary expression using the following grammar.
1082 * primary_expression
1083 * : IDENTIFIER
1084 * | CONSTANT
1085 * | STRING_LITERAL
1086 * | '(' expression ')';
1087 */
primaryExpression(unsigned pos,bool backtrack)1088 unsigned CParser::primaryExpression(unsigned pos,bool backtrack)
1089 {
1090 CALL("CParser::primaryExpression");
1091
1092 Token& tok = _tokens[pos];
1093 switch (tok.type) {
1094 case LT_IDENTIFIER:
1095 _units.push_back(new Identifier(pos,pos+1));
1096 return pos+1;
1097
1098 case LT_INT_CONST:
1099 case LT_LONG_CONST:
1100 case LT_UINT_CONST:
1101 case LT_ULONG_CONST:
1102 case LT_FLOAT_CONST:
1103 case LT_DOUBLE_CONST:
1104 case LT_STRING_CONST:
1105 case LT_CHAR_CONST:
1106 _units.push_back(new ConstantExpression(pos,pos+1));
1107 return pos+1;
1108
1109 case LT_LPAR: {
1110 unsigned end = expression(pos,backtrack);
1111 if (!end) return 0;
1112 if (!consumeToken(LT_RPAR,end+1,backtrack)) return 0;
1113 return end+2;
1114 }
1115 default:
1116 if (backtrack) return 0;
1117 throw ParserException(*this,pos,"primary expression expected");
1118 }
1119 } // primaryExpression
1120
1121
1122 /**
1123 * Parse a postfix expression using the following grammar.
1124 * postfix_expression
1125 * : primary_expression
1126 * | postfix_expression '[' expression ']'
1127 * | postfix_expression '(' ')'
1128 * | postfix_expression '(' argument_expression_list ')'
1129 * | postfix_expression '.' IDENTIFIER
1130 * | postfix_expression PTR_OP IDENTIFIER
1131 * | postfix_expression INC_OP
1132 * | postfix_expression DEC_OP
1133 * | '(' type_name ')' '{' initializer_list '}'
1134 * | '(' type_name ')' '{' initializer_list ',' '}'
1135 * @warning the last two cases are not implemented
1136 */
postfixExpression(unsigned pos,bool backtrack)1137 unsigned CParser::postfixExpression(unsigned pos,bool backtrack)
1138 {
1139 CALL("CParser::postfixExpression");
1140 unsigned end = primaryExpression(pos,backtrack);
1141 if (!end) return 0;
1142
1143 for (;;) {
1144 Token& tok = _tokens[end];
1145 switch (tok.type) {
1146 case LT_LBRACKET: {
1147 pos = expression(end+1,backtrack);
1148 if (!pos) return 0;
1149 if (!consumeToken(LT_RBRACKET,pos+1,backtrack)) return 0;
1150 pos += 2;
1151 Unit* rhs = _units.back();
1152 _units.pop_back();
1153 Unit* lhs = _units.back();
1154 _units.pop_back();
1155 _units.push_back(new ArrayApplication(lhs->start(),pos,lhs,rhs));
1156 break;
1157 }
1158
1159 case LT_DOT:
1160 case LT_PTR_OP:
1161 if (!consumeToken(LT_IDENTIFIER,end+1,backtrack)) return 0;
1162 pos = end+2;
1163 break;
1164
1165 case LT_INC_OP:
1166 case LT_DEC_OP:
1167 pos = end+1;
1168 break;
1169
1170 case LT_LPAR:
1171 if (_tokens[end+1].type == LT_RPAR) return end+2;
1172 pos = argumentExpressionList(pos,backtrack);
1173 if (!pos) return 0;
1174 if (!consumeToken(LT_RPAR,pos,backtrack)) return 0;
1175 pos++;
1176 break;
1177
1178 default:
1179 return end;
1180 }
1181 }
1182 } //
1183
1184 /**
1185 * If the token type at position pos is t, return true.
1186 * Otherwise, if backtrack is true, then return 0, otherwise
1187 * raise an exception.
1188 */
consumeToken(LTType t,unsigned pos,bool backtrack)1189 bool CParser::consumeToken(LTType t,unsigned pos,bool backtrack)
1190 {
1191 if (_tokens[pos].type == t) return true;
1192 if (backtrack) return false;
1193 const char* what;
1194 switch (t) {
1195 case LT_RPAR:
1196 what = "}";
1197 break;
1198 case LT_IDENTIFIER:
1199 what = "identifier";
1200 default:
1201 ASS(false);
1202 }
1203 throw ParserException(*this,pos,(vstring)what + " expected");
1204 } // consumeToken
1205
1206 /**
1207 * Parse a unary expression using the following grammar.
1208 * unary_expression
1209 * : postfix_expression
1210 * | INC_OP unary_expression
1211 * | DEC_OP unary_expression
1212 * | unary_operator cast_expression
1213 * | SIZEOF unary_expression
1214 * | SIZEOF '(' type_name ')'
1215 * ;
1216 * unary_operator : '&' | '*' | '+' | '-' | '~' | '!'
1217 * @warning two sizeof cases are not implemented and cast_expression is replaced by unary_expression
1218 */
unaryExpression(unsigned pos,bool backtrack)1219 unsigned CParser::unaryExpression(unsigned pos,bool backtrack)
1220 {
1221 CALL("CParser::unaryExpression");
1222
1223 for (;;) {
1224 bool done = false;
1225 switch (_tokens[pos].type) {
1226 case LT_INC_OP:
1227 case LT_DEC_OP:
1228 case LT_AMP:
1229 case LT_MULT:
1230 case LT_ADD:
1231 case LT_MINUS:
1232 case LT_TILDE:
1233 case LT_EXCLAMATION:
1234 pos++;
1235 break;
1236 default:
1237 done = true;
1238 break;
1239 }
1240 if (done) {
1241 break;
1242 }
1243 }
1244 return postfixExpression(pos,backtrack);
1245 } // unaryExpression
1246
1247 /**
1248 * Parse a multiplicative expression using the following grammar.
1249 * multiplicative_expression
1250 * : cast_expression
1251 * | multiplicative_expression '*' cast_expression
1252 * | multiplicative_expression '/' cast_expression
1253 * | multiplicative_expression '%' cast_expression
1254 * @warning since cast_expression is replaced by unary_expression
1255 */
multiplicativeExpression(unsigned pos,bool backtrack)1256 unsigned CParser::multiplicativeExpression(unsigned pos,bool backtrack)
1257 {
1258 CALL("CParser::multiplicativeExpression");
1259
1260 for (;;) {
1261 unsigned end = unaryExpression(pos,backtrack);
1262 if (!end) return 0;
1263 switch (_tokens[end].type) {
1264 case LT_MULT:
1265 case LT_DIV:
1266 case LT_MOD:
1267 pos = end+1;
1268 break;
1269 default:
1270 return end;
1271 }
1272 }
1273 } // CParser::multiplicativeExpression
1274
1275 /**
1276 * Parse a additive expression using the following grammar.
1277 * additive_expression
1278 * : multiplicative_expression
1279 * | additive_expression '+' multiplicative_expression
1280 * | additive_expression '-' multiplicative_expression
1281 */
additiveExpression(unsigned pos,bool backtrack)1282 unsigned CParser::additiveExpression(unsigned pos,bool backtrack)
1283 {
1284 CALL("CParser::additiveExpression");
1285
1286 for (;;) {
1287 unsigned end = multiplicativeExpression(pos,backtrack);
1288 if (!end) return 0;
1289 switch (_tokens[end].type) {
1290 case LT_ADD:
1291 case LT_MINUS:
1292 pos = end+1;
1293 break;
1294 default:
1295 return end;
1296 }
1297 }
1298 } // CParser::additiveExpression
1299
1300 /**
1301 * Parse a shift expression using the following grammar.
1302 * shift_expression
1303 * : additive_expression
1304 * | shift_expression LEFT_OP additive_expression
1305 * | shift_expression RIGHT_OP additive_expression
1306 */
shiftExpression(unsigned pos,bool backtrack)1307 unsigned CParser::shiftExpression(unsigned pos,bool backtrack)
1308 {
1309 CALL("CParser::shiftExpression");
1310
1311 for (;;) {
1312 unsigned end = additiveExpression(pos,backtrack);
1313 if (!end) return 0;
1314 switch (_tokens[end].type) {
1315 case LT_LEFT_OP:
1316 case LT_RIGHT_OP:
1317 pos = end+1;
1318 break;
1319 default:
1320 return end;
1321 }
1322 }
1323 } // CParser::shiftExpression
1324
1325
1326 /**
1327 * Parse a relational expression using the following grammar.
1328 * relational_expression
1329 * : shift_expression
1330 * | relational_expression '<' shift_expression
1331 * | relational_expression '>' shift_expression
1332 * | relational_expression LE_OP shift_expression
1333 * | relational_expression GE_OP shift_expression
1334 */
relationalExpression(unsigned pos,bool backtrack)1335 unsigned CParser::relationalExpression(unsigned pos,bool backtrack)
1336 {
1337 CALL("CParser::relationalExpression");
1338
1339 for (;;) {
1340 unsigned end = shiftExpression(pos,backtrack);
1341 if (!end) return 0;
1342 switch (_tokens[end].type) {
1343 case LT_LESS:
1344 case LT_GREATER:
1345 case LT_LE_OP:
1346 case LT_GE_OP:
1347 pos = end+1;
1348 break;
1349 default:
1350 return end;
1351 }
1352 }
1353 } // CParser::relationalExpression
1354
1355 /**
1356 * Parse an equality expression using the following grammar.
1357 * equality_expression
1358 * : relational_expression
1359 * | equality_expression EQ_OP relational_expression
1360 * | equality_expression NE_OP relational_expression
1361 */
equalityExpression(unsigned pos,bool backtrack)1362 unsigned CParser::equalityExpression(unsigned pos,bool backtrack)
1363 {
1364 CALL("CParser::equalityExpression");
1365
1366 for (;;) {
1367 unsigned end = relationalExpression(pos,backtrack);
1368 if (!end) return 0;
1369 switch (_tokens[end].type) {
1370 case LT_EQ_OP:
1371 case LT_NE_OP:
1372 pos = end+1;
1373 break;
1374 default:
1375 return end;
1376 }
1377 }
1378 } // CParser::equalityExpression
1379
1380 /**
1381 * Parse an and expression using the following grammar.
1382 * and_expression
1383 * and_expression
1384 * : equality_expression
1385 * | and_expression '&' equality_expression
1386 */
andExpression(unsigned pos,bool backtrack)1387 unsigned CParser::andExpression(unsigned pos,bool backtrack)
1388 {
1389 CALL("CParser::andExpression");
1390
1391 for (;;) {
1392 unsigned end = equalityExpression(pos,backtrack);
1393 if (!end) return 0;
1394 switch (_tokens[end].type) {
1395 case LT_AMP:
1396 pos = end+1;
1397 break;
1398 default:
1399 return end;
1400 }
1401 }
1402 } // CParser::andExpression
1403
1404
1405 /**
1406 * Parse an exclusive or expression using the following grammar.
1407 * exclusive_or_expression
1408 * : and_expression
1409 * | exclusive_or_expression '^' and_expression
1410 */
xorExpression(unsigned pos,bool backtrack)1411 unsigned CParser::xorExpression(unsigned pos,bool backtrack)
1412 {
1413 CALL("CParser::xorExpression");
1414
1415 for (;;) {
1416 unsigned end = andExpression(pos,backtrack);
1417 if (!end) return 0;
1418 switch (_tokens[end].type) {
1419 case LT_XOR:
1420 pos = end+1;
1421 break;
1422 default:
1423 return end;
1424 }
1425 }
1426 } // CParser::xorExpression
1427
1428 /**
1429 * Parse an inclusive or expression using the following grammar.
1430 * inclusive_or_expression
1431 * : exclusive_or_expression
1432 * | inclusive_or_expression '|' exclusive_or_expression
1433 */
orExpression(unsigned pos,bool backtrack)1434 unsigned CParser::orExpression(unsigned pos,bool backtrack)
1435 {
1436 CALL("CParser::orExpression");
1437
1438 for (;;) {
1439 unsigned end = xorExpression(pos,backtrack);
1440 if (!end) return 0;
1441 switch (_tokens[end].type) {
1442 case LT_BAR:
1443 pos = end+1;
1444 break;
1445 default:
1446 return end;
1447 }
1448 }
1449 } // CParser::orExpression
1450
1451 /**
1452 * Parse a logical and expression using the following grammar.
1453 * logical_and_expression
1454 * : inclusive_or_expression
1455 * | logical_and_expression AND_OP inclusive_or_expression
1456 */
logicalAndExpression(unsigned pos,bool backtrack)1457 unsigned CParser::logicalAndExpression(unsigned pos,bool backtrack)
1458 {
1459 CALL("CParser::logicalAndExpression");
1460
1461 for (;;) {
1462 unsigned end = orExpression(pos,backtrack);
1463 if (!end) return 0;
1464 switch (_tokens[end].type) {
1465 case LT_AND_OP:
1466 pos = end+1;
1467 break;
1468 default:
1469 return end;
1470 }
1471 }
1472 } // CParser::logicalAndExpression
1473
1474 /**
1475 * Parse a logical or expression using the following grammar.
1476 * logical_or_expression
1477 * : logical_and_expression
1478 * | logical_or_expression OR_OP logical_and_expression
1479 */
logicalOrExpression(unsigned pos,bool backtrack)1480 unsigned CParser::logicalOrExpression(unsigned pos,bool backtrack)
1481 {
1482 CALL("CParser::logicalOrExpression");
1483
1484 for (;;) {
1485 unsigned end = logicalAndExpression(pos,backtrack);
1486 if (!end) return 0;
1487 switch (_tokens[end].type) {
1488 case LT_OR_OP:
1489 pos = end+1;
1490 break;
1491 default:
1492 return end;
1493 }
1494 }
1495 } // CParser::logicalOrExpression
1496
1497 /**
1498 * Parse a conditional expression using the following grammar.
1499 * conditional_expression
1500 * : logical_or_expression
1501 * | logical_or_expression '?' expression ':' conditional_expression
1502 */
conditionalExpression(unsigned pos,bool backtrack)1503 unsigned CParser::conditionalExpression(unsigned pos,bool backtrack)
1504 {
1505 CALL("CParser::conditionalExpression");
1506
1507 for (;;) {
1508 unsigned end = logicalOrExpression(pos,backtrack);
1509 if (!end) return 0;
1510 switch (_tokens[end].type) {
1511 case LT_QUESTION:
1512 pos = end+1;
1513 end = expression(pos,backtrack);
1514 if (!end) return 0;
1515 consumeToken(LT_COLON,end,backtrack);
1516 pos = end+1;
1517 break;
1518 default:
1519 return end;
1520 }
1521 }
1522 } // CParser::conditionalExpression
1523
1524 /**
1525 * Parse an assignment expression using the following grammar.
1526 * assignment_expression
1527 * : conditional_expression
1528 * | unary_expression assignment_operator assignment_expression
1529 */
assignmentExpression(unsigned pos,bool backtrack)1530 unsigned CParser::assignmentExpression(unsigned pos,bool backtrack)
1531 {
1532 CALL("CParser::assignmentExpression");
1533 NOT_IMPLEMENTED;
1534 }
1535
1536 /*
1537 assignment_operator
1538 : '='
1539 | MUL_ASSIGN
1540 | DIV_ASSIGN
1541 | MOD_ASSIGN
1542 | ADD_ASSIGN
1543 | SUB_ASSIGN
1544 | LEFT_ASSIGN
1545 | RIGHT_ASSIGN
1546 | AND_ASSIGN
1547 | XOR_ASSIGN
1548 | OR_ASSIGN
1549 ;
1550
1551 expression
1552 : assignment_expression
1553 | expression ',' assignment_expression
1554 ;
1555
1556 */
1557
expression(unsigned pos,bool backtrack)1558 unsigned CParser::expression(unsigned pos,bool backtrack)
1559 {
1560 return 0;
1561 }
1562
argumentExpressionList(unsigned int,bool)1563 unsigned CParser::argumentExpressionList(unsigned int, bool)
1564 {
1565 return 0;
1566 }
1567
Identifier(unsigned start,unsigned end)1568 CParser::Identifier::Identifier(unsigned start, unsigned end)
1569 : Unit(PT_IDENTIFIER,start,end)
1570 {
1571 }
1572
ConstantExpression(unsigned start,unsigned end)1573 CParser::ConstantExpression::ConstantExpression(unsigned start, unsigned end)
1574 : Unit(PT_CONSTANT_EXPRESSION,start,end)
1575 {
1576 }
1577