1 
2 /*
3  * File CParser.hpp.
4  *
5  * This file is part of the source code of the software program
6  * Vampire. It is protected by applicable
7  * copyright laws.
8  *
9  * This source code is distributed under the licence found here
10  * https://vprover.github.io/license.html
11  * and in the source directory
12  *
13  * In summary, you are allowed to use Vampire for non-commercial
14  * purposes but not allowed to distribute, modify, copy, create derivatives,
15  * or use in competitions.
16  * For other uses of Vampire please contact developers for a different
17  * licence, which we will make an effort to provide.
18  */
19 /**
20  * @file CParser.hpp
21  * Defines class CParser for lexical analysis of C programs.
22  *
23  * @since 13/01/2011 Manchester
24  */
25 
26 #ifndef __CParser__
27 #define __CParser__
28 
29 #include <vector>
30 
31 #include "Lib/Exception.hpp"
32 #include "Lib/VString.hpp"
33 
34 using namespace std;
35 
36 namespace Shell {
37 
38 /**
39  * Class CParser, implements a C language parser.
40  * @since 13/01/2011 Manchester
41  */
42 class CParser
43 {
44 public:
45   /**
46    * Implements lexer exceptions.
47    * @since 14/01/2011 Manchester
48    */
49   class LexerException
50     : public Lib::Exception
51   {
52   public:
53     LexerException(const CParser&,unsigned pos,Lib::vstring message);
54     void cry(ostream&);
~LexerException()55     ~LexerException() {}
56   protected:
57     Lib::vstring _message;
58     unsigned _pos;
59   }; // CParser::LexerException
60 
61   /**
62    * Implements parser exceptions.
63    * @since 17/01/2011 Manchester
64    */
65   class ParserException
66     : public Lib::Exception
67   {
68   public:
69     ParserException(const CParser&,unsigned pos,Lib::vstring message);
70     void cry(ostream&);
~ParserException()71     ~ParserException() {}
72   protected:
73     Lib::vstring _message;
74     unsigned _pos;
75   }; // CParser::ParserException
76 
77   /** lexer token types */
78   enum LTType {
79     /** identifier */
80     LT_IDENTIFIER,
81 
82     /** keyword auto */
83     LT_AUTO,
84     /** keyword break */
85     LT_BREAK,
86     /** keyword case */
87     LT_CASE,
88     /** keyword char */
89     LT_CHAR,
90     /** keyword const */
91     LT_CONST,
92     /** keyword continue */
93     LT_CONTINUE,
94     /** keyword default */
95     LT_DEFAULT,
96     /** keyword do */
97     LT_DO,
98     /** keyword double */
99     LT_DOUBLE,
100     /** keyword else */
101     LT_ELSE,
102     /** keyword enum */
103     LT_ENUM,
104     /** keyword extern */
105     LT_EXTERN,
106     /** keyword float */
107     LT_FLOAT,
108     /** keyword for */
109     LT_FOR,
110     /** keyword goto */
111     LT_GOTO,
112     /** keyword if */
113     LT_IF,
114     /** keyword inline */
115     LT_INLINE,
116     /** keyword int */
117     LT_INT,
118     /** keyword long */
119     LT_LONG,
120     /** keyword register */
121     LT_REGISTER,
122     /** keyword restrict */
123     LT_RESTRICT,
124     /** keyword return */
125     LT_RETURN,
126     /** keyword short */
127     LT_SHORT,
128     /** keyword signed */
129     LT_SIGNED,
130     /** keyword sizeof */
131     LT_SIZEOF,
132     /** keyword struct */
133     LT_STRUCT,
134     /** keyword switch */
135     LT_SWITCH,
136     /** keyword typedef */
137     LT_TYPEDEF,
138     /** keyword union */
139     LT_UNION,
140     /** keyword unsigned */
141     LT_UNSIGNED,
142     /** keyword void */
143     LT_VOID,
144     /** keyword volatile */
145     LT_VOLATILE,
146     /** keyword while */
147     LT_WHILE,
148 
149     /** { */
150     LT_LBRACE,
151     /** } */
152     LT_RBRACE,
153     /** { */
154     LT_LPAR,
155     /** } */
156     LT_RPAR,
157     /** ; */
158     LT_SEMICOLON,
159     /** == */
160     LT_EQ_OP,
161     /** = */
162     LT_ASSIGN,
163     /** += */
164     LT_ADD_ASSIGN,
165     /** ++ */
166     LT_INC_OP,
167     /** + */
168     LT_ADD,
169     /** *= */
170     LT_MULT_ASSIGN,
171     /** * */
172     LT_MULT,
173     /** ... */
174     LT_ELLIPSIS,
175     /** dot */
176     LT_DOT,
177     /** >= */
178     LT_GE_OP,
179     /** > */
180     LT_GREATER,
181     /** >>= */
182     LT_RIGHT_ASSIGN,
183     /** >> */
184     LT_RIGHT_OP,
185     /** <= */
186     LT_LE_OP,
187     /** [ */
188     LT_LBRACKET,
189     /** < */
190     LT_LESS,
191     /** <<= */
192     LT_LEFT_ASSIGN,
193     /** << */
194     LT_LEFT_OP,
195     /** -= */
196     LT_SUB_ASSIGN,
197     /** -- */
198     LT_DEC_OP,
199     /** -> */
200     LT_PTR_OP,
201     /** - */
202     LT_MINUS,
203     /** /= */
204     LT_DIV_ASSIGN,
205     /** / */
206     LT_DIV,
207     /** %= */
208     LT_MOD_ASSIGN,
209     /** % */
210     LT_MOD,
211     /** &= */
212     LT_AND_ASSIGN,
213     /** && */
214     LT_AND_OP,
215     /** & */
216     LT_AMP,
217     /** |= */
218     LT_OR_ASSIGN,
219     /** || */
220     LT_OR_OP,
221     /** | */
222     LT_BAR,
223     /** ^= */
224     LT_XOR_ASSIGN,
225     /** ^ */
226     LT_XOR,
227     /** != */
228     LT_NE_OP,
229     /** ! */
230     LT_EXCLAMATION,
231     /** : */
232     LT_COLON,
233     /** ] */
234     LT_RBRACKET,
235     /** , */
236     LT_COMMA,
237     /** ~ */
238     LT_TILDE,
239     /** ? */
240     LT_QUESTION,
241 
242     /** an integer constant */
243     LT_INT_CONST,
244     /** a long constant */
245     LT_LONG_CONST,
246     /** an unsigned integer constant */
247     LT_UINT_CONST,
248     /** an unsigned long constant */
249     LT_ULONG_CONST,
250     /** a floating point constant */
251     LT_FLOAT_CONST,
252     /** a double floating point constant */
253     LT_DOUBLE_CONST,
254     /** a string constant */
255     LT_STRING_CONST,
256     /** a character constant */
257     LT_CHAR_CONST,
258     /** end-of-input */
259     LT_EOF,
260   };
261 
262   struct Token {
263     LTType type;
264     unsigned start;
265     unsigned end;
266   };
267 
268   /** parser token types */
269   enum PTType {
270     /** constant expression */
271     PT_CONSTANT_EXPRESSION,
272     /** array application */
273     PT_ARRAY_APPLICATION,
274     /** identifier */
275     PT_IDENTIFIER,
276   };
277 
278   /** the base class for all parsed expressions */
279   class Unit {
280   public:
281     /** the parser type of this unit */
type() const282     PTType type() const { return _type; }
283     /** create a new unit of a given type */
Unit(PTType tp,unsigned start,unsigned end)284     Unit(PTType tp,unsigned start,unsigned end)
285       : _type(tp), _start(start), _end(end)
286     {}
287     /** the start position of the unit in the array of tokens */
start() const288     unsigned start() const { return _start; }
289     /** the end position of the unit in the array of tokens */
end() const290     unsigned end() const { return _end; }
291   protected:
292     /** unit type */
293     PTType _type;
294     /** start position in the array of tokens */
295     unsigned _start;
296     /** end position in the array of tokens */
297     unsigned _end;
298   };
299 
300   /** constant expression */
301   class ConstantExpression
302     : public Unit
303   {
304   public:
305     /** create a new constant expression */
306     ConstantExpression(unsigned start,unsigned end);
307   };
308 
309   /** identifier */
310   class Identifier
311     : public Unit
312   {
313   public:
314     /** create a new identifier */
315     Identifier(unsigned start,unsigned end);
316   };
317 
318   /** identifier */
319   class ArrayApplication
320     : public Unit
321   {
322   public:
323     /** create a new array application expression */
ArrayApplication(unsigned start,unsigned end,Unit * lhs,Unit * rhs)324     ArrayApplication(unsigned start,unsigned end,Unit* lhs,Unit* rhs):
325       Unit(PT_ARRAY_APPLICATION,start,end)
326     {}
327   };
328 
329   CParser(const char* input);
330   ~CParser();
331   void tokenize();
332   /** return the input string */
input() const333   const char* input() const { return _input; }
334 
335   #if VDEBUG
336   void output(ostream& str);
337   static const char* toString(LTType);
338   #endif
339 
340 private:
341   // Lexer procedures
342   unsigned skipWhiteSpacesAndComments(unsigned pos);
343   unsigned skipToEndOfLine(unsigned pos);
344   unsigned skipToEndOfComment(unsigned pos);
345 
346   LTType keyword(unsigned pos,unsigned end);
347   static bool keyword(const char* txt,const char* word,int chars);
348 
349   unsigned integerTypeSuffix(unsigned start,LTType&);
350   unsigned decimalLiteral(unsigned start,LTType&);
351   unsigned octalLiteral(unsigned start,LTType&);
352   unsigned hexLiteral(unsigned start,LTType&);
353   unsigned floatingPointLiteral(unsigned start,LTType&);
354   unsigned exponent(unsigned start);
355   unsigned identifier(unsigned start);
356   unsigned numericConstant(unsigned start,LTType&);
357   unsigned stringConstant(unsigned start);
358   unsigned charConstant(unsigned start);
359 
360   /** true if c is a digit 0-9 */
digit(char c)361   static bool digit(char c) { return c >= '0' && c <= '9'; }
362   static bool letter(char c);
363   static bool floatTypeSuffix(char c,LTType&);
364   static bool hexDigit(char c);
365 
366   // parser procedures
367   unsigned primaryExpression(unsigned pos,bool backtrack);
368   unsigned postfixExpression(unsigned pos,bool backtrack);
369   unsigned unaryExpression(unsigned pos,bool backtrack);
370   unsigned multiplicativeExpression(unsigned pos,bool backtrack);
371   unsigned additiveExpression(unsigned pos,bool backtrack);
372   unsigned shiftExpression(unsigned pos,bool backtrack);
373   unsigned relationalExpression(unsigned pos,bool backtrack);
374   unsigned equalityExpression(unsigned pos,bool backtrack);
375   unsigned andExpression(unsigned pos,bool backtrack);
376   unsigned xorExpression(unsigned pos,bool backtrack);
377   unsigned orExpression(unsigned pos,bool backtrack);
378   unsigned logicalAndExpression(unsigned pos,bool backtrack);
379   unsigned logicalOrExpression(unsigned pos,bool backtrack);
380   unsigned conditionalExpression(unsigned pos,bool backtrack);
381   unsigned assignmentExpression(unsigned pos,bool backtrack);
382   unsigned expression(unsigned pos,bool backtrack);
383   bool consumeToken(LTType t,unsigned pos,bool backtrack);
384   unsigned argumentExpressionList(unsigned pos,bool backtrack);
385 
386   /** the input string, must be null-terminated */
387   const char* _input;
388   /** the collected list of tokens */
389   vector<Token> _tokens;
390   /** the collected list of parsed units */
391   vector<Unit*> _units;
392 }; // class CParser
393 
394 }
395 
396 #endif
397 
398