1 //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class represents the Lexer for tablegen files.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
14 #define LLVM_LIB_TABLEGEN_TGLEXER_H
15 
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/SMLoc.h"
#include <cassert>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
25 
26 namespace llvm {
27 template <typename T> class ArrayRef;
28 class SourceMgr;
29 class Twine;
30 
namespace tgtok {
  /// Token kinds used by the TableGen lexer.  The enumerators take implicit
  /// consecutive values, beginning with Eof == 0.
  enum TokKind {
    // ---- Markers ----
    Eof,
    Error,

    // ---- Tokens with no info (punctuation) ----
    minus,     // -
    plus,      // +
    l_square,  // [
    r_square,  // ]
    l_brace,   // {
    r_brace,   // }
    l_paren,   // (
    r_paren,   // )
    less,      // <
    greater,   // >
    colon,     // :
    semi,      // ;
    comma,     // ,
    dot,       // .
    equal,     // =
    question,  // ?
    paste,     // #
    dotdotdot, // ...

    // ---- Reserved keywords ----
    // The keyword 'else' is spelled ElseKW here so that it does not clash with
    // 'Else' further down, which stands for the preprocessor #else.
    Assert, Bit, Bits, Class, Code, Dag,
    Def, Defm, Defset, Defvar,
    ElseKW, FalseKW,
    Field, Foreach, If, In, Include, Int, Let, List,
    MultiClass, String, Then, TrueKW,

    // ---- Bang operators ----
    XConcat,
    XADD, XSUB, XMUL, XDIV, XNOT, XLOG2,
    XAND, XOR, XXOR, XSRA, XSRL, XSHL,
    XListConcat, XListSplat, XStrConcat, XInterleave,
    XSubstr, XFind, XCast, XSubst,
    XForEach, XFilter, XFoldl,
    XHead, XTail, XSize, XEmpty,
    XIf, XCond,
    XEq, XIsA, XDag, XNe, XLe, XLt, XGe, XGt,
    XSetDagOp, XGetDagOp,
    XExists, XListRemove,

    // ---- Boolean literals ----
    TrueVal,
    FalseVal,

    // ---- Integer value ----
    IntVal,

    // ---- Binary constant ----
    // Note that these are sized according to the number of bits given.
    BinaryIntVal,

    // ---- String valued tokens ----
    Id,
    StrVal,
    VarName,
    CodeFragment,

    // ---- Preprocessing tokens ----
    // These are for internal usage by the lexer only; they are never returned
    // as a result of Lex().
    Ifdef,
    Ifndef,
    Else,
    Endif,
    Define
  };
} // end namespace tgtok
79 
80 /// TGLexer - TableGen Lexer class.
81 class TGLexer {
82   SourceMgr &SrcMgr;
83 
84   const char *CurPtr = nullptr;
85   StringRef CurBuf;
86 
87   // Information about the current token.
88   const char *TokStart = nullptr;
89   tgtok::TokKind CurCode = tgtok::TokKind::Eof;
90   std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment
91   int64_t CurIntVal = 0; // This is valid for IntVal.
92 
93   /// CurBuffer - This is the current buffer index we're lexing from as managed
94   /// by the SourceMgr object.
95   unsigned CurBuffer = 0;
96 
97 public:
98   typedef std::set<std::string> DependenciesSetTy;
99 
100 private:
101   /// Dependencies - This is the list of all included files.
102   DependenciesSetTy Dependencies;
103 
104 public:
105   TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
106 
107   tgtok::TokKind Lex() {
108     return CurCode = LexToken(CurPtr == CurBuf.begin());
109   }
110 
111   const DependenciesSetTy &getDependencies() const {
112     return Dependencies;
113   }
114 
115   tgtok::TokKind getCode() const { return CurCode; }
116 
117   const std::string &getCurStrVal() const {
118     assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal ||
119             CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) &&
120            "This token doesn't have a string value");
121     return CurStrVal;
122   }
123   int64_t getCurIntVal() const {
124     assert(CurCode == tgtok::IntVal && "This token isn't an integer");
125     return CurIntVal;
126   }
127   std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
128     assert(CurCode == tgtok::BinaryIntVal &&
129            "This token isn't a binary integer");
130     return std::make_pair(CurIntVal, (CurPtr - TokStart)-2);
131   }
132 
133   SMLoc getLoc() const;
134   SMRange getLocRange() const;
135 
136 private:
137   /// LexToken - Read the next token and return its code.
138   tgtok::TokKind LexToken(bool FileOrLineStart = false);
139 
140   tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
141   tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
142 
143   int getNextChar();
144   int peekNextChar(int Index) const;
145   void SkipBCPLComment();
146   bool SkipCComment();
147   tgtok::TokKind LexIdentifier();
148   bool LexInclude();
149   tgtok::TokKind LexString();
150   tgtok::TokKind LexVarName();
151   tgtok::TokKind LexNumber();
152   tgtok::TokKind LexBracket();
153   tgtok::TokKind LexExclaim();
154 
155   // Process EOF encountered in LexToken().
156   // If EOF is met in an include file, then the method will update
157   // CurPtr, CurBuf and preprocessing include stack, and return true.
158   // If EOF is met in the top-level file, then the method will
159   // update and check the preprocessing include stack, and return false.
160   bool processEOF();
161 
162   // *** Structures and methods for preprocessing support ***
163 
164   // A set of macro names that are defined either via command line or
165   // by using:
166   //     #define NAME
167   StringSet<> DefinedMacros;
168 
169   // Each of #ifdef and #else directives has a descriptor associated
170   // with it.
171   //
172   // An ordered list of preprocessing controls defined by #ifdef/#else
173   // directives that are in effect currently is called preprocessing
174   // control stack.  It is represented as a vector of PreprocessorControlDesc's.
175   //
176   // The control stack is updated according to the following rules:
177   //
178   // For each #ifdef we add an element to the control stack.
179   // For each #else we replace the top element with a descriptor
180   // with an inverted IsDefined value.
181   // For each #endif we pop the top element from the control stack.
182   //
183   // When CurPtr reaches the current buffer's end, the control stack
184   // must be empty, i.e. #ifdef and the corresponding #endif
185   // must be located in the same file.
186   struct PreprocessorControlDesc {
187     // Either tgtok::Ifdef or tgtok::Else.
188     tgtok::TokKind Kind;
189 
190     // True, if the condition for this directive is true, false - otherwise.
191     // Examples:
192     //     #ifdef NAME       : true, if NAME is defined, false - otherwise.
193     //     ...
194     //     #else             : false, if NAME is defined, true - otherwise.
195     bool IsDefined;
196 
197     // Pointer into CurBuf to the beginning of the preprocessing directive
198     // word, e.g.:
199     //     #ifdef NAME
200     //      ^ - SrcPos
201     SMLoc SrcPos;
202   };
203 
204   // We want to disallow code like this:
205   //     file1.td:
206   //         #define NAME
207   //         #ifdef NAME
208   //         include "file2.td"
209   //     EOF
210   //     file2.td:
211   //         #endif
212   //     EOF
213   //
214   // To do this, we clear the preprocessing control stack on entry
215   // to each of the included file.  PrepIncludeStack is used to store
216   // preprocessing control stacks for the current file and all its
217   // parent files.  The back() element is the preprocessing control
218   // stack for the current file.
219   std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>>
220       PrepIncludeStack;
221 
222   // Validate that the current preprocessing control stack is empty,
223   // since we are about to exit a file, and pop the include stack.
224   //
225   // If IncludeStackMustBeEmpty is true, the include stack must be empty
226   // after the popping, otherwise, the include stack must not be empty
227   // after the popping.  Basically, the include stack must be empty
228   // only if we exit the "top-level" file (i.e. finish lexing).
229   //
230   // The method returns false, if the current preprocessing control stack
231   // is not empty (e.g. there is an unterminated #ifdef/#else),
232   // true - otherwise.
233   bool prepExitInclude(bool IncludeStackMustBeEmpty);
234 
235   // Look ahead for a preprocessing directive starting from CurPtr.  The caller
236   // must only call this method, if *(CurPtr - 1) is '#'.  If the method matches
237   // a preprocessing directive word followed by a whitespace, then it returns
238   // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
239   //
240   // CurPtr is not adjusted by this method.
241   tgtok::TokKind prepIsDirective() const;
242 
243   // Given a preprocessing token kind, adjusts CurPtr to the end
244   // of the preprocessing directive word.  Returns true, unless
245   // an unsupported token kind is passed in.
246   //
247   // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
248   // to avoid adjusting CurPtr before we are sure that '#' is followed
249   // by a preprocessing directive.  If it is not, then we fall back to
250   // tgtok::paste interpretation of '#'.
251   bool prepEatPreprocessorDirective(tgtok::TokKind Kind);
252 
253   // The main "exit" point from the token parsing to preprocessor.
254   //
255   // The method is called for CurPtr, when prepIsDirective() returns
256   // true.  The first parameter matches the result of prepIsDirective(),
257   // denoting the actual preprocessor directive to be processed.
258   //
259   // If the preprocessing directive disables the tokens processing, e.g.:
260   //     #ifdef NAME // NAME is undefined
261   // then lexPreprocessor() enters the lines-skipping mode.
262   // In this mode, it does not parse any tokens, because the code under
263   // the #ifdef may not even be a correct tablegen code.  The preprocessor
264   // looks for lines containing other preprocessing directives, which
265   // may be prepended with whitespaces and C-style comments.  If the line
266   // does not contain a preprocessing directive, it is skipped completely.
267   // Otherwise, the preprocessing directive is processed by recursively
268   // calling lexPreprocessor().  The processing of the encountered
269   // preprocessing directives includes updating preprocessing control stack
270   // and adding new macros into DefinedMacros set.
271   //
272   // The second parameter controls whether lexPreprocessor() is called from
273   // LexToken() (true) or recursively from lexPreprocessor() (false).
274   //
275   // If ReturnNextLiveToken is true, the method returns the next
276   // LEX token following the current directive or following the end
277   // of the disabled preprocessing region corresponding to this directive.
278   // If ReturnNextLiveToken is false, the method returns the first parameter,
279   // unless there were errors encountered in the disabled preprocessing
280   // region - in this case, it returns tgtok::Error.
281   tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
282                                  bool ReturnNextLiveToken = true);
283 
284   // Worker method for lexPreprocessor() to skip lines after some
285   // preprocessing directive up to the buffer end or to the directive
286   // that re-enables token processing.  The method returns true
287   // upon processing the next directive that re-enables tokens
288   // processing.  False is returned if an error was encountered.
289   //
290   // Note that prepSkipRegion() calls lexPreprocessor() to process
291   // encountered preprocessing directives.  In this case, the second
292   // parameter to lexPreprocessor() is set to false.  Being passed
293   // false ReturnNextLiveToken, lexPreprocessor() must never call
294   // prepSkipRegion().  We assert this by passing ReturnNextLiveToken
295   // to prepSkipRegion() and checking that it is never set to false.
296   bool prepSkipRegion(bool MustNeverBeFalse);
297 
298   // Lex name of the macro after either #ifdef or #define.  We could have used
299   // LexIdentifier(), but it has special handling of "include" word, which
300   // could result in awkward diagnostic errors.  Consider:
301   // ----
302   // #ifdef include
303   // class ...
304   // ----
305   // LexIdentifier() will engage LexInclude(), which will complain about
306   // missing file with name "class".  Instead, prepLexMacroName() will treat
307   // "include" as a normal macro name.
308   //
309   // On entry, CurPtr points to the end of a preprocessing directive word.
310   // The method allows for whitespaces between the preprocessing directive
311   // and the macro name.  The allowed whitespaces are ' ' and '\t'.
312   //
313   // If the first non-whitespace symbol after the preprocessing directive
314   // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
315   // the method updates TokStart to the position of the first non-whitespace
316   // symbol, sets CurPtr to the position of the macro name's last symbol,
317   // and returns a string reference to the macro name.  Otherwise,
318   // TokStart is set to the first non-whitespace symbol after the preprocessing
319   // directive, and the method returns an empty string reference.
320   //
321   // In all cases, TokStart may be used to point to the word following
322   // the preprocessing directive.
323   StringRef prepLexMacroName();
324 
325   // Skip any whitespaces starting from CurPtr.  The method is used
326   // only in the lines-skipping mode to find the first non-whitespace
327   // symbol after or at CurPtr.  Allowed whitespaces are ' ', '\t', '\n'
328   // and '\r'.  The method skips C-style comments as well, because
329   // it is used to find the beginning of the preprocessing directive.
330   // If we do not handle C-style comments the following code would
331   // result in incorrect detection of a preprocessing directive:
332   //     /*
333   //     #ifdef NAME
334   //     */
335   // As long as we skip C-style comments, the following code is correctly
336   // recognized as a preprocessing directive:
337   //     /* first line comment
338   //        second line comment */ #ifdef NAME
339   //
340   // The method returns true upon reaching the first non-whitespace symbol
341   // or EOF, CurPtr is set to point to this symbol.  The method returns false,
342   // if an error occurred during skipping of a C-style comment.
343   bool prepSkipLineBegin();
344 
345   // Skip any whitespaces or comments after a preprocessing directive.
346   // The method returns true upon reaching either end of the line
347   // or end of the file.  If there is a multiline C-style comment
348   // after the preprocessing directive, the method skips
349   // the comment, so the final CurPtr may point to one of the next lines.
350   // The method returns false, if an error occurred during skipping
351   // C- or C++-style comment, or a non-whitespace symbol appears
352   // after the preprocessing directive.
353   //
354   // The method maybe called both during lines-skipping and tokens
355   // processing.  It actually verifies that only whitespaces or/and
356   // comments follow a preprocessing directive.
357   //
358   // After the execution of this mehod, CurPtr points either to new line
359   // symbol, buffer end or non-whitespace symbol following the preprocesing
360   // directive.
361   bool prepSkipDirectiveEnd();
362 
363   // Skip all symbols to the end of the line/file.
364   // The method adjusts CurPtr, so that it points to either new line
365   // symbol in the current line or the buffer end.
366   void prepSkipToLineEnd();
367 
368   // Return true, if the current preprocessor control stack is such that
369   // we should allow lexer to process the next token, false - otherwise.
370   //
371   // In particular, the method returns true, if all the #ifdef/#else
372   // controls on the stack have their IsDefined member set to true.
373   bool prepIsProcessingEnabled();
374 
375   // Report an error, if we reach EOF with non-empty preprocessing control
376   // stack.  This means there is no matching #endif for the previous
377   // #ifdef/#else.
378   void prepReportPreprocessorStackError();
379 };
380 
381 } // end namespace llvm
382 
383 #endif
384