1 //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class represents the Lexer for tablegen files.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
14 #define LLVM_LIB_TABLEGEN_TGLEXER_H
15 
16 #include "llvm/ADT/StringRef.h"
17 #include "llvm/ADT/StringSet.h"
18 #include "llvm/Support/DataTypes.h"
19 #include "llvm/Support/SMLoc.h"
20 #include <cassert>
21 #include <memory>
22 #include <set>
23 #include <string>
24 #include <vector>
25 
26 namespace llvm {
27 template <typename T> class ArrayRef;
28 class SourceMgr;
29 class Twine;
30 
namespace tgtok {
  /// Every kind of token known to the TableGen lexer.  The enumerators are
  /// unscoped and rely on their implicit, sequential values, so their order
  /// must not change.
  enum TokKind {
    // Markers
    Eof,
    Error,

    // Tokens with no info.
    minus,     // -
    plus,      // +
    l_square,  // [
    r_square,  // ]
    l_brace,   // {
    r_brace,   // }
    l_paren,   // (
    r_paren,   // )
    less,      // <
    greater,   // >
    colon,     // :
    semi,      // ;
    comma,     // ,
    dot,       // .
    equal,     // =
    question,  // ?
    paste,     // #
    dotdotdot, // ...

    // Reserved keywords.  The 'KW' suffix on 'ElseKW' keeps it distinct from
    // 'Else' below, which stands for the preprocessor's #else.
    Assert, Bit, Bits, Class, Code, Dag,
    Def, Defm, Defset, Defvar,
    ElseKW, FalseKW, Field, Foreach,
    If, In, Include, Int,
    Let, List, MultiClass, String, Then, TrueKW,

    // Bang operators.
    XConcat, XADD, XSUB, XMUL, XNOT,
    XAND, XOR, XXOR, XSRA, XSRL, XSHL,
    XListConcat, XListSplat, XStrConcat, XInterleave,
    XSubstr, XFind, XCast, XSubst,
    XForEach, XFilter, XFoldl,
    XHead, XTail, XSize, XEmpty,
    XIf, XCond,
    XEq, XIsA, XDag, XNe, XLe, XLt, XGe, XGt,
    XSetDagOp, XGetDagOp, XExists,

    // Boolean literals.
    TrueVal,
    FalseVal,

    // Integer value.
    IntVal,

    // Binary constant.  Unlike IntVal, such a constant is sized according to
    // the number of bits given.
    BinaryIntVal,

    // String valued tokens.
    Id,
    StrVal,
    VarName,
    CodeFragment,

    // Preprocessing tokens.  These are for the lexer's internal use only and
    // are never returned as a result of Lex().
    Ifdef,
    Ifndef,
    Else,
    Endif,
    Define
  };
} // end namespace tgtok
79 
80 /// TGLexer - TableGen Lexer class.
81 class TGLexer {
82   SourceMgr &SrcMgr;
83 
84   const char *CurPtr = nullptr;
85   StringRef CurBuf;
86 
87   // Information about the current token.
88   const char *TokStart = nullptr;
89   tgtok::TokKind CurCode = tgtok::TokKind::Eof;
90   std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment
91   int64_t CurIntVal = 0; // This is valid for IntVal.
92 
93   /// CurBuffer - This is the current buffer index we're lexing from as managed
94   /// by the SourceMgr object.
95   unsigned CurBuffer = 0;
96 
97 public:
98   typedef std::set<std::string> DependenciesSetTy;
99 
100 private:
101   /// Dependencies - This is the list of all included files.
102   DependenciesSetTy Dependencies;
103 
104 public:
105   TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
106 
107   tgtok::TokKind Lex() {
108     return CurCode = LexToken(CurPtr == CurBuf.begin());
109   }
110 
111   const DependenciesSetTy &getDependencies() const {
112     return Dependencies;
113   }
114 
115   tgtok::TokKind getCode() const { return CurCode; }
116 
117   const std::string &getCurStrVal() const {
118     assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal ||
119             CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) &&
120            "This token doesn't have a string value");
121     return CurStrVal;
122   }
123   int64_t getCurIntVal() const {
124     assert(CurCode == tgtok::IntVal && "This token isn't an integer");
125     return CurIntVal;
126   }
127   std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
128     assert(CurCode == tgtok::BinaryIntVal &&
129            "This token isn't a binary integer");
130     return std::make_pair(CurIntVal, (CurPtr - TokStart)-2);
131   }
132 
133   SMLoc getLoc() const;
134 
135 private:
136   /// LexToken - Read the next token and return its code.
137   tgtok::TokKind LexToken(bool FileOrLineStart = false);
138 
139   tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
140   tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
141 
142   int getNextChar();
143   int peekNextChar(int Index) const;
144   void SkipBCPLComment();
145   bool SkipCComment();
146   tgtok::TokKind LexIdentifier();
147   bool LexInclude();
148   tgtok::TokKind LexString();
149   tgtok::TokKind LexVarName();
150   tgtok::TokKind LexNumber();
151   tgtok::TokKind LexBracket();
152   tgtok::TokKind LexExclaim();
153 
154   // Process EOF encountered in LexToken().
155   // If EOF is met in an include file, then the method will update
156   // CurPtr, CurBuf and preprocessing include stack, and return true.
157   // If EOF is met in the top-level file, then the method will
158   // update and check the preprocessing include stack, and return false.
159   bool processEOF();
160 
161   // *** Structures and methods for preprocessing support ***
162 
163   // A set of macro names that are defined either via command line or
164   // by using:
165   //     #define NAME
166   StringSet<> DefinedMacros;
167 
168   // Each of #ifdef and #else directives has a descriptor associated
169   // with it.
170   //
171   // An ordered list of preprocessing controls defined by #ifdef/#else
172   // directives that are in effect currently is called preprocessing
173   // control stack.  It is represented as a vector of PreprocessorControlDesc's.
174   //
175   // The control stack is updated according to the following rules:
176   //
177   // For each #ifdef we add an element to the control stack.
178   // For each #else we replace the top element with a descriptor
179   // with an inverted IsDefined value.
180   // For each #endif we pop the top element from the control stack.
181   //
182   // When CurPtr reaches the current buffer's end, the control stack
183   // must be empty, i.e. #ifdef and the corresponding #endif
184   // must be located in the same file.
185   struct PreprocessorControlDesc {
186     // Either tgtok::Ifdef or tgtok::Else.
187     tgtok::TokKind Kind;
188 
189     // True, if the condition for this directive is true, false - otherwise.
190     // Examples:
191     //     #ifdef NAME       : true, if NAME is defined, false - otherwise.
192     //     ...
193     //     #else             : false, if NAME is defined, true - otherwise.
194     bool IsDefined;
195 
196     // Pointer into CurBuf to the beginning of the preprocessing directive
197     // word, e.g.:
198     //     #ifdef NAME
199     //      ^ - SrcPos
200     SMLoc SrcPos;
201   };
202 
203   // We want to disallow code like this:
204   //     file1.td:
205   //         #define NAME
206   //         #ifdef NAME
207   //         include "file2.td"
208   //     EOF
209   //     file2.td:
210   //         #endif
211   //     EOF
212   //
213   // To do this, we clear the preprocessing control stack on entry
214   // to each of the included file.  PrepIncludeStack is used to store
215   // preprocessing control stacks for the current file and all its
216   // parent files.  The back() element is the preprocessing control
217   // stack for the current file.
218   std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>>
219       PrepIncludeStack;
220 
221   // Validate that the current preprocessing control stack is empty,
222   // since we are about to exit a file, and pop the include stack.
223   //
224   // If IncludeStackMustBeEmpty is true, the include stack must be empty
225   // after the popping, otherwise, the include stack must not be empty
226   // after the popping.  Basically, the include stack must be empty
227   // only if we exit the "top-level" file (i.e. finish lexing).
228   //
229   // The method returns false, if the current preprocessing control stack
230   // is not empty (e.g. there is an unterminated #ifdef/#else),
231   // true - otherwise.
232   bool prepExitInclude(bool IncludeStackMustBeEmpty);
233 
234   // Look ahead for a preprocessing directive starting from CurPtr.  The caller
235   // must only call this method, if *(CurPtr - 1) is '#'.  If the method matches
236   // a preprocessing directive word followed by a whitespace, then it returns
237   // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
238   //
239   // CurPtr is not adjusted by this method.
240   tgtok::TokKind prepIsDirective() const;
241 
242   // Given a preprocessing token kind, adjusts CurPtr to the end
243   // of the preprocessing directive word.  Returns true, unless
244   // an unsupported token kind is passed in.
245   //
246   // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
247   // to avoid adjusting CurPtr before we are sure that '#' is followed
248   // by a preprocessing directive.  If it is not, then we fall back to
249   // tgtok::paste interpretation of '#'.
250   bool prepEatPreprocessorDirective(tgtok::TokKind Kind);
251 
252   // The main "exit" point from the token parsing to preprocessor.
253   //
254   // The method is called for CurPtr, when prepIsDirective() returns
255   // true.  The first parameter matches the result of prepIsDirective(),
256   // denoting the actual preprocessor directive to be processed.
257   //
258   // If the preprocessing directive disables the tokens processing, e.g.:
259   //     #ifdef NAME // NAME is undefined
260   // then lexPreprocessor() enters the lines-skipping mode.
261   // In this mode, it does not parse any tokens, because the code under
262   // the #ifdef may not even be a correct tablegen code.  The preprocessor
263   // looks for lines containing other preprocessing directives, which
264   // may be prepended with whitespaces and C-style comments.  If the line
265   // does not contain a preprocessing directive, it is skipped completely.
266   // Otherwise, the preprocessing directive is processed by recursively
267   // calling lexPreprocessor().  The processing of the encountered
268   // preprocessing directives includes updating preprocessing control stack
269   // and adding new macros into DefinedMacros set.
270   //
271   // The second parameter controls whether lexPreprocessor() is called from
272   // LexToken() (true) or recursively from lexPreprocessor() (false).
273   //
274   // If ReturnNextLiveToken is true, the method returns the next
275   // LEX token following the current directive or following the end
276   // of the disabled preprocessing region corresponding to this directive.
277   // If ReturnNextLiveToken is false, the method returns the first parameter,
278   // unless there were errors encountered in the disabled preprocessing
279   // region - in this case, it returns tgtok::Error.
280   tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
281                                  bool ReturnNextLiveToken = true);
282 
283   // Worker method for lexPreprocessor() to skip lines after some
284   // preprocessing directive up to the buffer end or to the directive
285   // that re-enables token processing.  The method returns true
286   // upon processing the next directive that re-enables tokens
287   // processing.  False is returned if an error was encountered.
288   //
289   // Note that prepSkipRegion() calls lexPreprocessor() to process
290   // encountered preprocessing directives.  In this case, the second
291   // parameter to lexPreprocessor() is set to false.  Being passed
292   // false ReturnNextLiveToken, lexPreprocessor() must never call
293   // prepSkipRegion().  We assert this by passing ReturnNextLiveToken
294   // to prepSkipRegion() and checking that it is never set to false.
295   bool prepSkipRegion(bool MustNeverBeFalse);
296 
297   // Lex name of the macro after either #ifdef or #define.  We could have used
298   // LexIdentifier(), but it has special handling of "include" word, which
299   // could result in awkward diagnostic errors.  Consider:
300   // ----
301   // #ifdef include
302   // class ...
303   // ----
304   // LexIdentifier() will engage LexInclude(), which will complain about
305   // missing file with name "class".  Instead, prepLexMacroName() will treat
306   // "include" as a normal macro name.
307   //
308   // On entry, CurPtr points to the end of a preprocessing directive word.
309   // The method allows for whitespaces between the preprocessing directive
310   // and the macro name.  The allowed whitespaces are ' ' and '\t'.
311   //
312   // If the first non-whitespace symbol after the preprocessing directive
313   // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
314   // the method updates TokStart to the position of the first non-whitespace
315   // symbol, sets CurPtr to the position of the macro name's last symbol,
316   // and returns a string reference to the macro name.  Otherwise,
317   // TokStart is set to the first non-whitespace symbol after the preprocessing
318   // directive, and the method returns an empty string reference.
319   //
320   // In all cases, TokStart may be used to point to the word following
321   // the preprocessing directive.
322   StringRef prepLexMacroName();
323 
324   // Skip any whitespaces starting from CurPtr.  The method is used
325   // only in the lines-skipping mode to find the first non-whitespace
326   // symbol after or at CurPtr.  Allowed whitespaces are ' ', '\t', '\n'
327   // and '\r'.  The method skips C-style comments as well, because
328   // it is used to find the beginning of the preprocessing directive.
329   // If we do not handle C-style comments the following code would
330   // result in incorrect detection of a preprocessing directive:
331   //     /*
332   //     #ifdef NAME
333   //     */
334   // As long as we skip C-style comments, the following code is correctly
335   // recognized as a preprocessing directive:
336   //     /* first line comment
337   //        second line comment */ #ifdef NAME
338   //
339   // The method returns true upon reaching the first non-whitespace symbol
340   // or EOF, CurPtr is set to point to this symbol.  The method returns false,
341   // if an error occurred during skipping of a C-style comment.
342   bool prepSkipLineBegin();
343 
344   // Skip any whitespaces or comments after a preprocessing directive.
345   // The method returns true upon reaching either end of the line
346   // or end of the file.  If there is a multiline C-style comment
347   // after the preprocessing directive, the method skips
348   // the comment, so the final CurPtr may point to one of the next lines.
349   // The method returns false, if an error occurred during skipping
350   // C- or C++-style comment, or a non-whitespace symbol appears
351   // after the preprocessing directive.
352   //
353   // The method maybe called both during lines-skipping and tokens
354   // processing.  It actually verifies that only whitespaces or/and
355   // comments follow a preprocessing directive.
356   //
357   // After the execution of this mehod, CurPtr points either to new line
358   // symbol, buffer end or non-whitespace symbol following the preprocesing
359   // directive.
360   bool prepSkipDirectiveEnd();
361 
362   // Skip all symbols to the end of the line/file.
363   // The method adjusts CurPtr, so that it points to either new line
364   // symbol in the current line or the buffer end.
365   void prepSkipToLineEnd();
366 
367   // Return true, if the current preprocessor control stack is such that
368   // we should allow lexer to process the next token, false - otherwise.
369   //
370   // In particular, the method returns true, if all the #ifdef/#else
371   // controls on the stack have their IsDefined member set to true.
372   bool prepIsProcessingEnabled();
373 
374   // Report an error, if we reach EOF with non-empty preprocessing control
375   // stack.  This means there is no matching #endif for the previous
376   // #ifdef/#else.
377   void prepReportPreprocessorStackError();
378 };
379 
380 } // end namespace llvm
381 
382 #endif
383