1 //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This class represents the Lexer for tablegen files.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
15 #define LLVM_LIB_TABLEGEN_TGLEXER_H
16 
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/SMLoc.h"
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
26 
27 namespace llvm {
// Forward declarations: this header only names these types in references,
// parameters and return types, so their full definitions are not required
// here.
class SourceMgr;
class SMLoc;
class Twine;
31 
namespace tgtok {
  /// TokKind - The kinds of tokens the TableGen lexer distinguishes.
  ///
  /// Enumerators take the default consecutive values starting at 0 (Eof).
  /// Keep the order stable: new kinds change the value of every enumerator
  /// declared after them.
  enum TokKind {
    // Markers
    Eof, Error,

    // Tokens with no info.
    minus, plus,        // - +
    l_square, r_square, // [ ]
    l_brace, r_brace,   // { }
    l_paren, r_paren,   // ( )
    less, greater,      // < >
    colon, semi,        // : ;
    comma, period,      // , .
    equal, question,    // = ?
    paste,              // #

    // Keywords.
    Bit, Bits, Class, Code, Dag, Def, Foreach, Defm, Field, In, Int, Let, List,
    MultiClass, String, Defset,

    // !keywords.
    XConcat, XADD, XAND, XOR, XSRA, XSRL, XSHL, XListConcat, XStrConcat, XCast,
    XSubst, XForEach, XFoldl, XHead, XTail, XSize, XEmpty, XIf, XEq, XIsA, XDag,
    XNe, XLe, XLt, XGe, XGt,

    // Integer value.
    IntVal,

    // Binary constant.  Note that these are sized according to the number of
    // bits given.
    BinaryIntVal,

    // String valued tokens.
    Id, StrVal, VarName, CodeFragment,

    // Preprocessing tokens for internal usage by the lexer.
    // They are never returned as a result of Lex().
    Ifdef, Else, Endif, Define
  };
}
72 
73 /// TGLexer - TableGen Lexer class.
74 class TGLexer {
75   SourceMgr &SrcMgr;
76 
77   const char *CurPtr;
78   StringRef CurBuf;
79 
80   // Information about the current token.
81   const char *TokStart;
82   tgtok::TokKind CurCode;
83   std::string CurStrVal;  // This is valid for ID, STRVAL, VARNAME, CODEFRAGMENT
84   int64_t CurIntVal;      // This is valid for INTVAL.
85 
86   /// CurBuffer - This is the current buffer index we're lexing from as managed
87   /// by the SourceMgr object.
88   unsigned CurBuffer;
89 
90 public:
91   typedef std::map<std::string, SMLoc> DependenciesMapTy;
92 private:
93   /// Dependencies - This is the list of all included files.
94   DependenciesMapTy Dependencies;
95 
96 public:
97   TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
98 
Lex()99   tgtok::TokKind Lex() {
100     return CurCode = LexToken(CurPtr == CurBuf.begin());
101   }
102 
getDependencies()103   const DependenciesMapTy &getDependencies() const {
104     return Dependencies;
105   }
106 
getCode()107   tgtok::TokKind getCode() const { return CurCode; }
108 
getCurStrVal()109   const std::string &getCurStrVal() const {
110     assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal ||
111             CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) &&
112            "This token doesn't have a string value");
113     return CurStrVal;
114   }
getCurIntVal()115   int64_t getCurIntVal() const {
116     assert(CurCode == tgtok::IntVal && "This token isn't an integer");
117     return CurIntVal;
118   }
getCurBinaryIntVal()119   std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
120     assert(CurCode == tgtok::BinaryIntVal &&
121            "This token isn't a binary integer");
122     return std::make_pair(CurIntVal, (CurPtr - TokStart)-2);
123   }
124 
125   SMLoc getLoc() const;
126 
127 private:
128   /// LexToken - Read the next token and return its code.
129   tgtok::TokKind LexToken(bool FileOrLineStart = false);
130 
131   tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
132   tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
133 
134   int getNextChar();
135   int peekNextChar(int Index) const;
136   void SkipBCPLComment();
137   bool SkipCComment();
138   tgtok::TokKind LexIdentifier();
139   bool LexInclude();
140   tgtok::TokKind LexString();
141   tgtok::TokKind LexVarName();
142   tgtok::TokKind LexNumber();
143   tgtok::TokKind LexBracket();
144   tgtok::TokKind LexExclaim();
145 
146   // Process EOF encountered in LexToken().
147   // If EOF is met in an include file, then the method will update
148   // CurPtr, CurBuf and preprocessing include stack, and return true.
149   // If EOF is met in the top-level file, then the method will
150   // update and check the preprocessing include stack, and return false.
151   bool processEOF();
152 
153   // *** Structures and methods for preprocessing support ***
154 
155   // A set of macro names that are defined either via command line or
156   // by using:
157   //     #define NAME
158   StringSet<> DefinedMacros;
159 
160   // Each of #ifdef and #else directives has a descriptor associated
161   // with it.
162   //
163   // An ordered list of preprocessing controls defined by #ifdef/#else
164   // directives that are in effect currently is called preprocessing
165   // control stack.  It is represented as a vector of PreprocessorControlDesc's.
166   //
167   // The control stack is updated according to the following rules:
168   //
169   // For each #ifdef we add an element to the control stack.
170   // For each #else we replace the top element with a descriptor
171   // with an inverted IsDefined value.
172   // For each #endif we pop the top element from the control stack.
173   //
174   // When CurPtr reaches the current buffer's end, the control stack
175   // must be empty, i.e. #ifdef and the corresponding #endif
176   // must be located in the same file.
177   struct PreprocessorControlDesc {
178     // Either tgtok::Ifdef or tgtok::Else.
179     tgtok::TokKind Kind;
180 
181     // True, if the condition for this directive is true, false - otherwise.
182     // Examples:
183     //     #ifdef NAME       : true, if NAME is defined, false - otherwise.
184     //     ...
185     //     #else             : false, if NAME is defined, true - otherwise.
186     bool IsDefined;
187 
188     // Pointer into CurBuf to the beginning of the preprocessing directive
189     // word, e.g.:
190     //     #ifdef NAME
191     //      ^ - SrcPos
192     SMLoc SrcPos;
193   };
194 
195   // We want to disallow code like this:
196   //     file1.td:
197   //         #define NAME
198   //         #ifdef NAME
199   //         include "file2.td"
200   //     EOF
201   //     file2.td:
202   //         #endif
203   //     EOF
204   //
205   // To do this, we clear the preprocessing control stack on entry
206   // to each of the included file.  PrepIncludeStack is used to store
207   // preprocessing control stacks for the current file and all its
208   // parent files.  The back() element is the preprocessing control
209   // stack for the current file.
210   std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>>
211       PrepIncludeStack;
212 
213   // Validate that the current preprocessing control stack is empty,
214   // since we are about to exit a file, and pop the include stack.
215   //
216   // If IncludeStackMustBeEmpty is true, the include stack must be empty
217   // after the popping, otherwise, the include stack must not be empty
218   // after the popping.  Basically, the include stack must be empty
219   // only if we exit the "top-level" file (i.e. finish lexing).
220   //
221   // The method returns false, if the current preprocessing control stack
222   // is not empty (e.g. there is an unterminated #ifdef/#else),
223   // true - otherwise.
224   bool prepExitInclude(bool IncludeStackMustBeEmpty);
225 
226   // Look ahead for a preprocessing directive starting from CurPtr.  The caller
227   // must only call this method, if *(CurPtr - 1) is '#'.  If the method matches
228   // a preprocessing directive word followed by a whitespace, then it returns
229   // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
230   //
231   // CurPtr is not adjusted by this method.
232   tgtok::TokKind prepIsDirective() const;
233 
234   // Given a preprocessing token kind, adjusts CurPtr to the end
235   // of the preprocessing directive word.  Returns true, unless
236   // an unsupported token kind is passed in.
237   //
238   // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
239   // to avoid adjusting CurPtr before we are sure that '#' is followed
240   // by a preprocessing directive.  If it is not, then we fall back to
241   // tgtok::paste interpretation of '#'.
242   bool prepEatPreprocessorDirective(tgtok::TokKind Kind);
243 
244   // The main "exit" point from the token parsing to preprocessor.
245   //
246   // The method is called for CurPtr, when prepIsDirective() returns
247   // true.  The first parameter matches the result of prepIsDirective(),
248   // denoting the actual preprocessor directive to be processed.
249   //
250   // If the preprocessing directive disables the tokens processing, e.g.:
251   //     #ifdef NAME // NAME is undefined
252   // then lexPreprocessor() enters the lines-skipping mode.
253   // In this mode, it does not parse any tokens, because the code under
254   // the #ifdef may not even be a correct tablegen code.  The preprocessor
255   // looks for lines containing other preprocessing directives, which
256   // may be prepended with whitespaces and C-style comments.  If the line
257   // does not contain a preprocessing directive, it is skipped completely.
258   // Otherwise, the preprocessing directive is processed by recursively
259   // calling lexPreprocessor().  The processing of the encountered
260   // preprocessing directives includes updating preprocessing control stack
261   // and adding new macros into DefinedMacros set.
262   //
263   // The second parameter controls whether lexPreprocessor() is called from
264   // LexToken() (true) or recursively from lexPreprocessor() (false).
265   //
266   // If ReturnNextLiveToken is true, the method returns the next
267   // LEX token following the current directive or following the end
268   // of the disabled preprocessing region corresponding to this directive.
269   // If ReturnNextLiveToken is false, the method returns the first parameter,
270   // unless there were errors encountered in the disabled preprocessing
271   // region - in this case, it returns tgtok::Error.
272   tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
273                                  bool ReturnNextLiveToken = true);
274 
275   // Worker method for lexPreprocessor() to skip lines after some
276   // preprocessing directive up to the buffer end or to the directive
277   // that re-enables token processing.  The method returns true
278   // upon processing the next directive that re-enables tokens
279   // processing.  False is returned if an error was encountered.
280   //
281   // Note that prepSkipRegion() calls lexPreprocessor() to process
282   // encountered preprocessing directives.  In this case, the second
283   // parameter to lexPreprocessor() is set to false.  Being passed
284   // false ReturnNextLiveToken, lexPreprocessor() must never call
285   // prepSkipRegion().  We assert this by passing ReturnNextLiveToken
286   // to prepSkipRegion() and checking that it is never set to false.
287   bool prepSkipRegion(bool MustNeverBeFalse);
288 
289   // Lex name of the macro after either #ifdef or #define.  We could have used
290   // LexIdentifier(), but it has special handling of "include" word, which
291   // could result in awkward diagnostic errors.  Consider:
292   // ----
293   // #ifdef include
294   // class ...
295   // ----
296   // LexIdentifier() will engage LexInclude(), which will complain about
297   // missing file with name "class".  Instead, prepLexMacroName() will treat
298   // "include" as a normal macro name.
299   //
300   // On entry, CurPtr points to the end of a preprocessing directive word.
301   // The method allows for whitespaces between the preprocessing directive
302   // and the macro name.  The allowed whitespaces are ' ' and '\t'.
303   //
304   // If the first non-whitespace symbol after the preprocessing directive
305   // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
306   // the method updates TokStart to the position of the first non-whitespace
307   // symbol, sets CurPtr to the position of the macro name's last symbol,
308   // and returns a string reference to the macro name.  Otherwise,
309   // TokStart is set to the first non-whitespace symbol after the preprocessing
310   // directive, and the method returns an empty string reference.
311   //
312   // In all cases, TokStart may be used to point to the word following
313   // the preprocessing directive.
314   StringRef prepLexMacroName();
315 
316   // Skip any whitespaces starting from CurPtr.  The method is used
317   // only in the lines-skipping mode to find the first non-whitespace
318   // symbol after or at CurPtr.  Allowed whitespaces are ' ', '\t', '\n'
319   // and '\r'.  The method skips C-style comments as well, because
320   // it is used to find the beginning of the preprocessing directive.
321   // If we do not handle C-style comments the following code would
322   // result in incorrect detection of a preprocessing directive:
323   //     /*
324   //     #ifdef NAME
325   //     */
326   // As long as we skip C-style comments, the following code is correctly
327   // recognized as a preprocessing directive:
328   //     /* first line comment
329   //        second line comment */ #ifdef NAME
330   //
331   // The method returns true upon reaching the first non-whitespace symbol
332   // or EOF, CurPtr is set to point to this symbol.  The method returns false,
333   // if an error occured during skipping of a C-style comment.
334   bool prepSkipLineBegin();
335 
336   // Skip any whitespaces or comments after a preprocessing directive.
337   // The method returns true upon reaching either end of the line
338   // or end of the file.  If there is a multiline C-style comment
339   // after the preprocessing directive, the method skips
340   // the comment, so the final CurPtr may point to one of the next lines.
341   // The method returns false, if an error occured during skipping
342   // C- or C++-style comment, or a non-whitespace symbol appears
343   // after the preprocessing directive.
344   //
345   // The method maybe called both during lines-skipping and tokens
346   // processing.  It actually verifies that only whitespaces or/and
347   // comments follow a preprocessing directive.
348   //
349   // After the execution of this mehod, CurPtr points either to new line
350   // symbol, buffer end or non-whitespace symbol following the preprocesing
351   // directive.
352   bool prepSkipDirectiveEnd();
353 
354   // Skip all symbols to the end of the line/file.
355   // The method adjusts CurPtr, so that it points to either new line
356   // symbol in the current line or the buffer end.
357   void prepSkipToLineEnd();
358 
359   // Return true, if the current preprocessor control stack is such that
360   // we should allow lexer to process the next token, false - otherwise.
361   //
362   // In particular, the method returns true, if all the #ifdef/#else
363   // controls on the stack have their IsDefined member set to true.
364   bool prepIsProcessingEnabled();
365 
366   // Report an error, if we reach EOF with non-empty preprocessing control
367   // stack.  This means there is no matching #endif for the previous
368   // #ifdef/#else.
369   void prepReportPreprocessorStackError();
370 };
371 
372 } // end namespace llvm
373 
374 #endif
375