1 //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class represents the Lexer for tablegen files.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
14 #define LLVM_LIB_TABLEGEN_TGLEXER_H
15 
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/SMLoc.h"
#include <cassert>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
25 
26 namespace llvm {
27 template <typename T> class ArrayRef;
28 class SourceMgr;
29 class Twine;
30 
/// Token kinds returned by TGLexer.
namespace tgtok {
/// Kind of a lexed token.
///
/// Enumerators are grouped by category.  Only the integer and string groups
/// carry a payload; it is read back through TGLexer::getCurIntVal(),
/// TGLexer::getCurBinaryIntVal() and TGLexer::getCurStrVal(), each of which
/// asserts that the current token belongs to the matching group.
enum TokKind {
  // Markers
  Eof,
  Error,

  // Tokens with no info.
  minus,     // -
  plus,      // +
  l_square,  // [
  r_square,  // ]
  l_brace,   // {
  r_brace,   // }
  l_paren,   // (
  r_paren,   // )
  less,      // <
  greater,   // >
  colon,     // :
  semi,      // ;
  comma,     // ,
  dot,       // .
  equal,     // =
  question,  // ?
  paste,     // #
  dotdotdot, // ...

  // Reserved keywords. ('ElseKW' is named to distinguish it from the
  // existing 'Else' that means the preprocessor #else.)
  Assert,
  Bit,
  Bits,
  Class,
  Code,
  Dag,
  Def,
  Defm,
  Defset,
  Defvar,
  ElseKW,
  FalseKW,
  Field,
  Foreach,
  If,
  In,
  Include,
  Int,
  Let,
  List,
  MultiClass,
  String,
  Then,
  TrueKW,

  // Bang operators.
  // NOTE(review): the mapping from source spelling to these kinds is not
  // visible in this header; presumably it lives in TGLexer::LexExclaim().
  XConcat,
  XADD,
  XSUB,
  XMUL,
  XDIV,
  XNOT,
  XLOG2,
  XAND,
  XOR,
  XXOR,
  XSRA,
  XSRL,
  XSHL,
  XListConcat,
  XListSplat,
  XStrConcat,
  XInterleave,
  XSubstr,
  XFind,
  XCast,
  XSubst,
  XForEach,
  XFilter,
  XFoldl,
  XHead,
  XTail,
  XSize,
  XEmpty,
  XIf,
  XCond,
  XEq,
  XIsA,
  XDag,
  XNe,
  XLe,
  XLt,
  XGe,
  XGt,
  XSetDagOp,
  XGetDagOp,
  XExists,
  XListRemove,
  XToLower,
  XToUpper,
  XRange,
  XGetDagArg,
  XGetDagName,
  XSetDagArg,
  XSetDagName,

  // Boolean literals.
  // NOTE(review): these are separate kinds from the TrueKW/FalseKW keywords
  // above; which input produces which is decided in the lexer implementation
  // and cannot be confirmed from this header.
  TrueVal,
  FalseVal,

  // Integer value.  The value is read with TGLexer::getCurIntVal().
  IntVal,

  // Binary constant.  Note that these are sized according to the number of
  // bits given.  The value and its size are read together with
  // TGLexer::getCurBinaryIntVal().
  BinaryIntVal,

  // String valued tokens.  The text is read with TGLexer::getCurStrVal().
  Id,
  StrVal,
  VarName,
  CodeFragment,

  // Preprocessing tokens for internal usage by the lexer.
  // They are never returned as a result of Lex().
  Ifdef,
  Ifndef,
  Else,
  Endif,
  Define
};
} // end namespace tgtok
161 
162 /// TGLexer - TableGen Lexer class.
163 class TGLexer {
164   SourceMgr &SrcMgr;
165 
166   const char *CurPtr = nullptr;
167   StringRef CurBuf;
168 
169   // Information about the current token.
170   const char *TokStart = nullptr;
171   tgtok::TokKind CurCode = tgtok::TokKind::Eof;
172   std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment
173   int64_t CurIntVal = 0; // This is valid for IntVal.
174 
175   /// CurBuffer - This is the current buffer index we're lexing from as managed
176   /// by the SourceMgr object.
177   unsigned CurBuffer = 0;
178 
179 public:
180   typedef std::set<std::string> DependenciesSetTy;
181 
182 private:
183   /// Dependencies - This is the list of all included files.
184   DependenciesSetTy Dependencies;
185 
186 public:
187   TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
188 
189   tgtok::TokKind Lex() {
190     return CurCode = LexToken(CurPtr == CurBuf.begin());
191   }
192 
193   const DependenciesSetTy &getDependencies() const {
194     return Dependencies;
195   }
196 
197   tgtok::TokKind getCode() const { return CurCode; }
198 
199   const std::string &getCurStrVal() const {
200     assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal ||
201             CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) &&
202            "This token doesn't have a string value");
203     return CurStrVal;
204   }
205   int64_t getCurIntVal() const {
206     assert(CurCode == tgtok::IntVal && "This token isn't an integer");
207     return CurIntVal;
208   }
209   std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
210     assert(CurCode == tgtok::BinaryIntVal &&
211            "This token isn't a binary integer");
212     return std::make_pair(CurIntVal, (CurPtr - TokStart)-2);
213   }
214 
215   SMLoc getLoc() const;
216   SMRange getLocRange() const;
217 
218 private:
219   /// LexToken - Read the next token and return its code.
220   tgtok::TokKind LexToken(bool FileOrLineStart = false);
221 
222   tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
223   tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
224 
225   int getNextChar();
226   int peekNextChar(int Index) const;
227   void SkipBCPLComment();
228   bool SkipCComment();
229   tgtok::TokKind LexIdentifier();
230   bool LexInclude();
231   tgtok::TokKind LexString();
232   tgtok::TokKind LexVarName();
233   tgtok::TokKind LexNumber();
234   tgtok::TokKind LexBracket();
235   tgtok::TokKind LexExclaim();
236 
237   // Process EOF encountered in LexToken().
238   // If EOF is met in an include file, then the method will update
239   // CurPtr, CurBuf and preprocessing include stack, and return true.
240   // If EOF is met in the top-level file, then the method will
241   // update and check the preprocessing include stack, and return false.
242   bool processEOF();
243 
244   // *** Structures and methods for preprocessing support ***
245 
246   // A set of macro names that are defined either via command line or
247   // by using:
248   //     #define NAME
249   StringSet<> DefinedMacros;
250 
251   // Each of #ifdef and #else directives has a descriptor associated
252   // with it.
253   //
254   // An ordered list of preprocessing controls defined by #ifdef/#else
255   // directives that are in effect currently is called preprocessing
256   // control stack.  It is represented as a vector of PreprocessorControlDesc's.
257   //
258   // The control stack is updated according to the following rules:
259   //
260   // For each #ifdef we add an element to the control stack.
261   // For each #else we replace the top element with a descriptor
262   // with an inverted IsDefined value.
263   // For each #endif we pop the top element from the control stack.
264   //
265   // When CurPtr reaches the current buffer's end, the control stack
266   // must be empty, i.e. #ifdef and the corresponding #endif
267   // must be located in the same file.
268   struct PreprocessorControlDesc {
269     // Either tgtok::Ifdef or tgtok::Else.
270     tgtok::TokKind Kind;
271 
272     // True, if the condition for this directive is true, false - otherwise.
273     // Examples:
274     //     #ifdef NAME       : true, if NAME is defined, false - otherwise.
275     //     ...
276     //     #else             : false, if NAME is defined, true - otherwise.
277     bool IsDefined;
278 
279     // Pointer into CurBuf to the beginning of the preprocessing directive
280     // word, e.g.:
281     //     #ifdef NAME
282     //      ^ - SrcPos
283     SMLoc SrcPos;
284   };
285 
286   // We want to disallow code like this:
287   //     file1.td:
288   //         #define NAME
289   //         #ifdef NAME
290   //         include "file2.td"
291   //     EOF
292   //     file2.td:
293   //         #endif
294   //     EOF
295   //
296   // To do this, we clear the preprocessing control stack on entry
297   // to each of the included file.  PrepIncludeStack is used to store
298   // preprocessing control stacks for the current file and all its
299   // parent files.  The back() element is the preprocessing control
300   // stack for the current file.
301   std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>>
302       PrepIncludeStack;
303 
304   // Validate that the current preprocessing control stack is empty,
305   // since we are about to exit a file, and pop the include stack.
306   //
307   // If IncludeStackMustBeEmpty is true, the include stack must be empty
308   // after the popping, otherwise, the include stack must not be empty
309   // after the popping.  Basically, the include stack must be empty
310   // only if we exit the "top-level" file (i.e. finish lexing).
311   //
312   // The method returns false, if the current preprocessing control stack
313   // is not empty (e.g. there is an unterminated #ifdef/#else),
314   // true - otherwise.
315   bool prepExitInclude(bool IncludeStackMustBeEmpty);
316 
317   // Look ahead for a preprocessing directive starting from CurPtr.  The caller
318   // must only call this method, if *(CurPtr - 1) is '#'.  If the method matches
319   // a preprocessing directive word followed by a whitespace, then it returns
320   // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
321   //
322   // CurPtr is not adjusted by this method.
323   tgtok::TokKind prepIsDirective() const;
324 
325   // Given a preprocessing token kind, adjusts CurPtr to the end
326   // of the preprocessing directive word.  Returns true, unless
327   // an unsupported token kind is passed in.
328   //
329   // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
330   // to avoid adjusting CurPtr before we are sure that '#' is followed
331   // by a preprocessing directive.  If it is not, then we fall back to
332   // tgtok::paste interpretation of '#'.
333   bool prepEatPreprocessorDirective(tgtok::TokKind Kind);
334 
335   // The main "exit" point from the token parsing to preprocessor.
336   //
337   // The method is called for CurPtr, when prepIsDirective() returns
338   // true.  The first parameter matches the result of prepIsDirective(),
339   // denoting the actual preprocessor directive to be processed.
340   //
341   // If the preprocessing directive disables the tokens processing, e.g.:
342   //     #ifdef NAME // NAME is undefined
343   // then lexPreprocessor() enters the lines-skipping mode.
344   // In this mode, it does not parse any tokens, because the code under
345   // the #ifdef may not even be a correct tablegen code.  The preprocessor
346   // looks for lines containing other preprocessing directives, which
347   // may be prepended with whitespaces and C-style comments.  If the line
348   // does not contain a preprocessing directive, it is skipped completely.
349   // Otherwise, the preprocessing directive is processed by recursively
350   // calling lexPreprocessor().  The processing of the encountered
351   // preprocessing directives includes updating preprocessing control stack
352   // and adding new macros into DefinedMacros set.
353   //
354   // The second parameter controls whether lexPreprocessor() is called from
355   // LexToken() (true) or recursively from lexPreprocessor() (false).
356   //
357   // If ReturnNextLiveToken is true, the method returns the next
358   // LEX token following the current directive or following the end
359   // of the disabled preprocessing region corresponding to this directive.
360   // If ReturnNextLiveToken is false, the method returns the first parameter,
361   // unless there were errors encountered in the disabled preprocessing
362   // region - in this case, it returns tgtok::Error.
363   tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
364                                  bool ReturnNextLiveToken = true);
365 
366   // Worker method for lexPreprocessor() to skip lines after some
367   // preprocessing directive up to the buffer end or to the directive
368   // that re-enables token processing.  The method returns true
369   // upon processing the next directive that re-enables tokens
370   // processing.  False is returned if an error was encountered.
371   //
372   // Note that prepSkipRegion() calls lexPreprocessor() to process
373   // encountered preprocessing directives.  In this case, the second
374   // parameter to lexPreprocessor() is set to false.  Being passed
375   // false ReturnNextLiveToken, lexPreprocessor() must never call
376   // prepSkipRegion().  We assert this by passing ReturnNextLiveToken
377   // to prepSkipRegion() and checking that it is never set to false.
378   bool prepSkipRegion(bool MustNeverBeFalse);
379 
380   // Lex name of the macro after either #ifdef or #define.  We could have used
381   // LexIdentifier(), but it has special handling of "include" word, which
382   // could result in awkward diagnostic errors.  Consider:
383   // ----
384   // #ifdef include
385   // class ...
386   // ----
387   // LexIdentifier() will engage LexInclude(), which will complain about
388   // missing file with name "class".  Instead, prepLexMacroName() will treat
389   // "include" as a normal macro name.
390   //
391   // On entry, CurPtr points to the end of a preprocessing directive word.
392   // The method allows for whitespaces between the preprocessing directive
393   // and the macro name.  The allowed whitespaces are ' ' and '\t'.
394   //
395   // If the first non-whitespace symbol after the preprocessing directive
396   // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
397   // the method updates TokStart to the position of the first non-whitespace
398   // symbol, sets CurPtr to the position of the macro name's last symbol,
399   // and returns a string reference to the macro name.  Otherwise,
400   // TokStart is set to the first non-whitespace symbol after the preprocessing
401   // directive, and the method returns an empty string reference.
402   //
403   // In all cases, TokStart may be used to point to the word following
404   // the preprocessing directive.
405   StringRef prepLexMacroName();
406 
407   // Skip any whitespaces starting from CurPtr.  The method is used
408   // only in the lines-skipping mode to find the first non-whitespace
409   // symbol after or at CurPtr.  Allowed whitespaces are ' ', '\t', '\n'
410   // and '\r'.  The method skips C-style comments as well, because
411   // it is used to find the beginning of the preprocessing directive.
412   // If we do not handle C-style comments the following code would
413   // result in incorrect detection of a preprocessing directive:
414   //     /*
415   //     #ifdef NAME
416   //     */
417   // As long as we skip C-style comments, the following code is correctly
418   // recognized as a preprocessing directive:
419   //     /* first line comment
420   //        second line comment */ #ifdef NAME
421   //
422   // The method returns true upon reaching the first non-whitespace symbol
423   // or EOF, CurPtr is set to point to this symbol.  The method returns false,
424   // if an error occurred during skipping of a C-style comment.
425   bool prepSkipLineBegin();
426 
427   // Skip any whitespaces or comments after a preprocessing directive.
428   // The method returns true upon reaching either end of the line
429   // or end of the file.  If there is a multiline C-style comment
430   // after the preprocessing directive, the method skips
431   // the comment, so the final CurPtr may point to one of the next lines.
432   // The method returns false, if an error occurred during skipping
433   // C- or C++-style comment, or a non-whitespace symbol appears
434   // after the preprocessing directive.
435   //
436   // The method maybe called both during lines-skipping and tokens
437   // processing.  It actually verifies that only whitespaces or/and
438   // comments follow a preprocessing directive.
439   //
440   // After the execution of this mehod, CurPtr points either to new line
441   // symbol, buffer end or non-whitespace symbol following the preprocesing
442   // directive.
443   bool prepSkipDirectiveEnd();
444 
445   // Skip all symbols to the end of the line/file.
446   // The method adjusts CurPtr, so that it points to either new line
447   // symbol in the current line or the buffer end.
448   void prepSkipToLineEnd();
449 
450   // Return true, if the current preprocessor control stack is such that
451   // we should allow lexer to process the next token, false - otherwise.
452   //
453   // In particular, the method returns true, if all the #ifdef/#else
454   // controls on the stack have their IsDefined member set to true.
455   bool prepIsProcessingEnabled();
456 
457   // Report an error, if we reach EOF with non-empty preprocessing control
458   // stack.  This means there is no matching #endif for the previous
459   // #ifdef/#else.
460   void prepReportPreprocessorStackError();
461 };
462 
463 } // end namespace llvm
464 
465 #endif
466