1 //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This class represents the Lexer for tablegen files. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H 15 #define LLVM_LIB_TABLEGEN_TGLEXER_H 16 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/StringRef.h" 19 #include "llvm/ADT/StringSet.h" 20 #include "llvm/Support/DataTypes.h" 21 #include "llvm/Support/SMLoc.h" 22 #include <cassert> 23 #include <map> 24 #include <memory> 25 #include <string> 26 27 namespace llvm { 28 class SourceMgr; 29 class SMLoc; 30 class Twine; 31 32 namespace tgtok { 33 enum TokKind { 34 // Markers 35 Eof, Error, 36 37 // Tokens with no info. 38 minus, plus, // - + 39 l_square, r_square, // [ ] 40 l_brace, r_brace, // { } 41 l_paren, r_paren, // ( ) 42 less, greater, // < > 43 colon, semi, // : ; 44 comma, period, // , . 45 equal, question, // = ? 46 paste, // # 47 48 // Keywords. 49 Bit, Bits, Class, Code, Dag, Def, Foreach, Defm, Field, In, Int, Let, List, 50 MultiClass, String, Defset, 51 52 // !keywords. 53 XConcat, XADD, XAND, XOR, XSRA, XSRL, XSHL, XListConcat, XStrConcat, XCast, 54 XSubst, XForEach, XFoldl, XHead, XTail, XSize, XEmpty, XIf, XEq, XIsA, XDag, 55 XNe, XLe, XLt, XGe, XGt, 56 57 // Integer value. 58 IntVal, 59 60 // Binary constant. Note that these are sized according to the number of 61 // bits given. 62 BinaryIntVal, 63 64 // String valued tokens. 65 Id, StrVal, VarName, CodeFragment, 66 67 // Preprocessing tokens for internal usage by the lexer. 68 // They are never returned as a result of Lex(). 69 Ifdef, Else, Endif, Define 70 }; 71 } 72 73 /// TGLexer - TableGen Lexer class. 74 class TGLexer { 75 SourceMgr &SrcMgr; 76 77 const char *CurPtr; 78 StringRef CurBuf; 79 80 // Information about the current token. 81 const char *TokStart; 82 tgtok::TokKind CurCode; 83 std::string CurStrVal; // This is valid for ID, STRVAL, VARNAME, CODEFRAGMENT 84 int64_t CurIntVal; // This is valid for INTVAL. 85 86 /// CurBuffer - This is the current buffer index we're lexing from as managed 87 /// by the SourceMgr object. 88 unsigned CurBuffer; 89 90 public: 91 typedef std::map<std::string, SMLoc> DependenciesMapTy; 92 private: 93 /// Dependencies - This is the list of all included files. 94 DependenciesMapTy Dependencies; 95 96 public: 97 TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros); 98 Lex()99 tgtok::TokKind Lex() { 100 return CurCode = LexToken(CurPtr == CurBuf.begin()); 101 } 102 getDependencies()103 const DependenciesMapTy &getDependencies() const { 104 return Dependencies; 105 } 106 getCode()107 tgtok::TokKind getCode() const { return CurCode; } 108 getCurStrVal()109 const std::string &getCurStrVal() const { 110 assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal || 111 CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) && 112 "This token doesn't have a string value"); 113 return CurStrVal; 114 } getCurIntVal()115 int64_t getCurIntVal() const { 116 assert(CurCode == tgtok::IntVal && "This token isn't an integer"); 117 return CurIntVal; 118 } getCurBinaryIntVal()119 std::pair<int64_t, unsigned> getCurBinaryIntVal() const { 120 assert(CurCode == tgtok::BinaryIntVal && 121 "This token isn't a binary integer"); 122 return std::make_pair(CurIntVal, (CurPtr - TokStart)-2); 123 } 124 125 SMLoc getLoc() const; 126 127 private: 128 /// LexToken - Read the next token and return its code. 129 tgtok::TokKind LexToken(bool FileOrLineStart = false); 130 131 tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg); 132 tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg); 133 134 int getNextChar(); 135 int peekNextChar(int Index) const; 136 void SkipBCPLComment(); 137 bool SkipCComment(); 138 tgtok::TokKind LexIdentifier(); 139 bool LexInclude(); 140 tgtok::TokKind LexString(); 141 tgtok::TokKind LexVarName(); 142 tgtok::TokKind LexNumber(); 143 tgtok::TokKind LexBracket(); 144 tgtok::TokKind LexExclaim(); 145 146 // Process EOF encountered in LexToken(). 147 // If EOF is met in an include file, then the method will update 148 // CurPtr, CurBuf and preprocessing include stack, and return true. 149 // If EOF is met in the top-level file, then the method will 150 // update and check the preprocessing include stack, and return false. 151 bool processEOF(); 152 153 // *** Structures and methods for preprocessing support *** 154 155 // A set of macro names that are defined either via command line or 156 // by using: 157 // #define NAME 158 StringSet<> DefinedMacros; 159 160 // Each of #ifdef and #else directives has a descriptor associated 161 // with it. 162 // 163 // An ordered list of preprocessing controls defined by #ifdef/#else 164 // directives that are in effect currently is called preprocessing 165 // control stack. It is represented as a vector of PreprocessorControlDesc's. 166 // 167 // The control stack is updated according to the following rules: 168 // 169 // For each #ifdef we add an element to the control stack. 170 // For each #else we replace the top element with a descriptor 171 // with an inverted IsDefined value. 172 // For each #endif we pop the top element from the control stack. 173 // 174 // When CurPtr reaches the current buffer's end, the control stack 175 // must be empty, i.e. #ifdef and the corresponding #endif 176 // must be located in the same file. 177 struct PreprocessorControlDesc { 178 // Either tgtok::Ifdef or tgtok::Else. 179 tgtok::TokKind Kind; 180 181 // True, if the condition for this directive is true, false - otherwise. 182 // Examples: 183 // #ifdef NAME : true, if NAME is defined, false - otherwise. 184 // ... 185 // #else : false, if NAME is defined, true - otherwise. 186 bool IsDefined; 187 188 // Pointer into CurBuf to the beginning of the preprocessing directive 189 // word, e.g.: 190 // #ifdef NAME 191 // ^ - SrcPos 192 SMLoc SrcPos; 193 }; 194 195 // We want to disallow code like this: 196 // file1.td: 197 // #define NAME 198 // #ifdef NAME 199 // include "file2.td" 200 // EOF 201 // file2.td: 202 // #endif 203 // EOF 204 // 205 // To do this, we clear the preprocessing control stack on entry 206 // to each of the included file. PrepIncludeStack is used to store 207 // preprocessing control stacks for the current file and all its 208 // parent files. The back() element is the preprocessing control 209 // stack for the current file. 210 std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>> 211 PrepIncludeStack; 212 213 // Validate that the current preprocessing control stack is empty, 214 // since we are about to exit a file, and pop the include stack. 215 // 216 // If IncludeStackMustBeEmpty is true, the include stack must be empty 217 // after the popping, otherwise, the include stack must not be empty 218 // after the popping. Basically, the include stack must be empty 219 // only if we exit the "top-level" file (i.e. finish lexing). 220 // 221 // The method returns false, if the current preprocessing control stack 222 // is not empty (e.g. there is an unterminated #ifdef/#else), 223 // true - otherwise. 224 bool prepExitInclude(bool IncludeStackMustBeEmpty); 225 226 // Look ahead for a preprocessing directive starting from CurPtr. The caller 227 // must only call this method, if *(CurPtr - 1) is '#'. If the method matches 228 // a preprocessing directive word followed by a whitespace, then it returns 229 // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define. 230 // 231 // CurPtr is not adjusted by this method. 232 tgtok::TokKind prepIsDirective() const; 233 234 // Given a preprocessing token kind, adjusts CurPtr to the end 235 // of the preprocessing directive word. Returns true, unless 236 // an unsupported token kind is passed in. 237 // 238 // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective() 239 // to avoid adjusting CurPtr before we are sure that '#' is followed 240 // by a preprocessing directive. If it is not, then we fall back to 241 // tgtok::paste interpretation of '#'. 242 bool prepEatPreprocessorDirective(tgtok::TokKind Kind); 243 244 // The main "exit" point from the token parsing to preprocessor. 245 // 246 // The method is called for CurPtr, when prepIsDirective() returns 247 // true. The first parameter matches the result of prepIsDirective(), 248 // denoting the actual preprocessor directive to be processed. 249 // 250 // If the preprocessing directive disables the tokens processing, e.g.: 251 // #ifdef NAME // NAME is undefined 252 // then lexPreprocessor() enters the lines-skipping mode. 253 // In this mode, it does not parse any tokens, because the code under 254 // the #ifdef may not even be a correct tablegen code. The preprocessor 255 // looks for lines containing other preprocessing directives, which 256 // may be prepended with whitespaces and C-style comments. If the line 257 // does not contain a preprocessing directive, it is skipped completely. 258 // Otherwise, the preprocessing directive is processed by recursively 259 // calling lexPreprocessor(). The processing of the encountered 260 // preprocessing directives includes updating preprocessing control stack 261 // and adding new macros into DefinedMacros set. 262 // 263 // The second parameter controls whether lexPreprocessor() is called from 264 // LexToken() (true) or recursively from lexPreprocessor() (false). 265 // 266 // If ReturnNextLiveToken is true, the method returns the next 267 // LEX token following the current directive or following the end 268 // of the disabled preprocessing region corresponding to this directive. 269 // If ReturnNextLiveToken is false, the method returns the first parameter, 270 // unless there were errors encountered in the disabled preprocessing 271 // region - in this case, it returns tgtok::Error. 272 tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind, 273 bool ReturnNextLiveToken = true); 274 275 // Worker method for lexPreprocessor() to skip lines after some 276 // preprocessing directive up to the buffer end or to the directive 277 // that re-enables token processing. The method returns true 278 // upon processing the next directive that re-enables tokens 279 // processing. False is returned if an error was encountered. 280 // 281 // Note that prepSkipRegion() calls lexPreprocessor() to process 282 // encountered preprocessing directives. In this case, the second 283 // parameter to lexPreprocessor() is set to false. Being passed 284 // false ReturnNextLiveToken, lexPreprocessor() must never call 285 // prepSkipRegion(). We assert this by passing ReturnNextLiveToken 286 // to prepSkipRegion() and checking that it is never set to false. 287 bool prepSkipRegion(bool MustNeverBeFalse); 288 289 // Lex name of the macro after either #ifdef or #define. We could have used 290 // LexIdentifier(), but it has special handling of "include" word, which 291 // could result in awkward diagnostic errors. Consider: 292 // ---- 293 // #ifdef include 294 // class ... 295 // ---- 296 // LexIdentifier() will engage LexInclude(), which will complain about 297 // missing file with name "class". Instead, prepLexMacroName() will treat 298 // "include" as a normal macro name. 299 // 300 // On entry, CurPtr points to the end of a preprocessing directive word. 301 // The method allows for whitespaces between the preprocessing directive 302 // and the macro name. The allowed whitespaces are ' ' and '\t'. 303 // 304 // If the first non-whitespace symbol after the preprocessing directive 305 // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then 306 // the method updates TokStart to the position of the first non-whitespace 307 // symbol, sets CurPtr to the position of the macro name's last symbol, 308 // and returns a string reference to the macro name. Otherwise, 309 // TokStart is set to the first non-whitespace symbol after the preprocessing 310 // directive, and the method returns an empty string reference. 311 // 312 // In all cases, TokStart may be used to point to the word following 313 // the preprocessing directive. 314 StringRef prepLexMacroName(); 315 316 // Skip any whitespaces starting from CurPtr. The method is used 317 // only in the lines-skipping mode to find the first non-whitespace 318 // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n' 319 // and '\r'. The method skips C-style comments as well, because 320 // it is used to find the beginning of the preprocessing directive. 321 // If we do not handle C-style comments the following code would 322 // result in incorrect detection of a preprocessing directive: 323 // /* 324 // #ifdef NAME 325 // */ 326 // As long as we skip C-style comments, the following code is correctly 327 // recognized as a preprocessing directive: 328 // /* first line comment 329 // second line comment */ #ifdef NAME 330 // 331 // The method returns true upon reaching the first non-whitespace symbol 332 // or EOF, CurPtr is set to point to this symbol. The method returns false, 333 // if an error occured during skipping of a C-style comment. 334 bool prepSkipLineBegin(); 335 336 // Skip any whitespaces or comments after a preprocessing directive. 337 // The method returns true upon reaching either end of the line 338 // or end of the file. If there is a multiline C-style comment 339 // after the preprocessing directive, the method skips 340 // the comment, so the final CurPtr may point to one of the next lines. 341 // The method returns false, if an error occured during skipping 342 // C- or C++-style comment, or a non-whitespace symbol appears 343 // after the preprocessing directive. 344 // 345 // The method maybe called both during lines-skipping and tokens 346 // processing. It actually verifies that only whitespaces or/and 347 // comments follow a preprocessing directive. 348 // 349 // After the execution of this mehod, CurPtr points either to new line 350 // symbol, buffer end or non-whitespace symbol following the preprocesing 351 // directive. 352 bool prepSkipDirectiveEnd(); 353 354 // Skip all symbols to the end of the line/file. 355 // The method adjusts CurPtr, so that it points to either new line 356 // symbol in the current line or the buffer end. 357 void prepSkipToLineEnd(); 358 359 // Return true, if the current preprocessor control stack is such that 360 // we should allow lexer to process the next token, false - otherwise. 361 // 362 // In particular, the method returns true, if all the #ifdef/#else 363 // controls on the stack have their IsDefined member set to true. 364 bool prepIsProcessingEnabled(); 365 366 // Report an error, if we reach EOF with non-empty preprocessing control 367 // stack. This means there is no matching #endif for the previous 368 // #ifdef/#else. 369 void prepReportPreprocessorStackError(); 370 }; 371 372 } // end namespace llvm 373 374 #endif 375