1 //===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // Record tokens that a preprocessor emits and define operations to map between 9 // the tokens written in a file and tokens produced by the preprocessor. 10 // 11 // When running the compiler, there are two token streams we are interested in: 12 // - "spelled" tokens directly correspond to a substring written in some 13 // source file. 14 // - "expanded" tokens represent the result of preprocessing, parses consumes 15 // this token stream to produce the AST. 16 // 17 // Expanded tokens correspond directly to locations found in the AST, allowing 18 // to find subranges of the token stream covered by various AST nodes. Spelled 19 // tokens correspond directly to the source code written by the user. 20 // 21 // To allow composing these two use-cases, we also define operations that map 22 // between expanded and spelled tokens that produced them (macro calls, 23 // directives, etc). 24 // 25 //===----------------------------------------------------------------------===// 26 27 #ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H 28 #define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H 29 30 #include "clang/Basic/FileManager.h" 31 #include "clang/Basic/LangOptions.h" 32 #include "clang/Basic/SourceLocation.h" 33 #include "clang/Basic/SourceManager.h" 34 #include "clang/Basic/TokenKinds.h" 35 #include "clang/Lex/Token.h" 36 #include "llvm/ADT/ArrayRef.h" 37 #include "llvm/ADT/DenseMap.h" 38 #include "llvm/ADT/Optional.h" 39 #include "llvm/ADT/StringRef.h" 40 #include "llvm/Support/Compiler.h" 41 #include "llvm/Support/raw_ostream.h" 42 #include <cstdint> 43 #include <tuple> 44 45 namespace clang { 46 class Preprocessor; 47 48 namespace syntax { 49 50 /// A half-open character range inside a particular file, the start offset is 51 /// included and the end offset is excluded from the range. 52 struct FileRange { 53 /// EXPECTS: File.isValid() && Begin <= End. 54 FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset); 55 /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(). 56 FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length); 57 /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files 58 /// are the same. 59 FileRange(const SourceManager &SM, SourceLocation BeginLoc, 60 SourceLocation EndLoc); 61 62 FileID file() const { return File; } 63 /// Start is a start offset (inclusive) in the corresponding file. 64 unsigned beginOffset() const { return Begin; } 65 /// End offset (exclusive) in the corresponding file. 66 unsigned endOffset() const { return End; } 67 68 unsigned length() const { return End - Begin; } 69 70 /// Check if \p Offset is inside the range. 71 bool contains(unsigned Offset) const { 72 return Begin <= Offset && Offset < End; 73 } 74 /// Check \p Offset is inside the range or equal to its endpoint. 75 bool touches(unsigned Offset) const { 76 return Begin <= Offset && Offset <= End; 77 } 78 79 /// Gets the substring that this FileRange refers to. 80 llvm::StringRef text(const SourceManager &SM) const; 81 82 /// Convert to the clang range. The returned range is always a char range, 83 /// never a token range. 84 CharSourceRange toCharRange(const SourceManager &SM) const; 85 86 friend bool operator==(const FileRange &L, const FileRange &R) { 87 return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End); 88 } 89 friend bool operator!=(const FileRange &L, const FileRange &R) { 90 return !(L == R); 91 } 92 93 private: 94 FileID File; 95 unsigned Begin; 96 unsigned End; 97 }; 98 99 /// For debugging purposes. 100 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R); 101 102 /// A token coming directly from a file or from a macro invocation. Has just 103 /// enough information to locate the token in the source code. 104 /// Can represent both expanded and spelled tokens. 105 class Token { 106 public: 107 Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind); 108 /// EXPECTS: clang::Token is not an annotation token. 109 explicit Token(const clang::Token &T); 110 111 tok::TokenKind kind() const { return Kind; } 112 /// Location of the first character of a token. 113 SourceLocation location() const { return Location; } 114 /// Location right after the last character of a token. 115 SourceLocation endLocation() const { 116 return Location.getLocWithOffset(Length); 117 } 118 unsigned length() const { return Length; } 119 120 /// Get the substring covered by the token. Note that will include all 121 /// digraphs, newline continuations, etc. E.g. tokens for 'int' and 122 /// in\ 123 /// t 124 /// both have the same kind tok::kw_int, but results of text() are different. 125 llvm::StringRef text(const SourceManager &SM) const; 126 127 /// Gets a range of this token. 128 /// EXPECTS: token comes from a file, not from a macro expansion. 129 FileRange range(const SourceManager &SM) const; 130 131 /// Given two tokens inside the same file, returns a file range that starts at 132 /// \p First and ends at \p Last. 133 /// EXPECTS: First and Last are file tokens from the same file, Last starts 134 /// after First. 135 static FileRange range(const SourceManager &SM, const syntax::Token &First, 136 const syntax::Token &Last); 137 138 std::string dumpForTests(const SourceManager &SM) const; 139 /// For debugging purposes. 140 std::string str() const; 141 142 private: 143 SourceLocation Location; 144 unsigned Length; 145 tok::TokenKind Kind; 146 }; 147 /// For debugging purposes. Equivalent to a call to Token::str(). 148 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T); 149 150 /// A list of tokens obtained by preprocessing a text buffer and operations to 151 /// map between the expanded and spelled tokens, i.e. TokenBuffer has 152 /// information about two token streams: 153 /// 1. Expanded tokens: tokens produced by the preprocessor after all macro 154 /// replacements, 155 /// 2. Spelled tokens: corresponding directly to the source code of a file 156 /// before any macro replacements occurred. 157 /// Here's an example to illustrate a difference between those two: 158 /// #define FOO 10 159 /// int a = FOO; 160 /// 161 /// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}. 162 /// Expanded tokens are {'int','a','=','10',';','eof'}. 163 /// 164 /// Note that the expanded token stream has a tok::eof token at the end, the 165 /// spelled tokens never store a 'eof' token. 166 /// 167 /// The full list expanded tokens can be obtained with expandedTokens(). Spelled 168 /// tokens for each of the files can be obtained via spelledTokens(FileID). 169 /// 170 /// To map between the expanded and spelled tokens use findSpelledByExpanded(). 171 /// 172 /// To build a token buffer use the TokenCollector class. You can also compute 173 /// the spelled tokens of a file using the tokenize() helper. 174 /// 175 /// FIXME: allow mappings into macro arguments. 176 class TokenBuffer { 177 public: 178 TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {} 179 180 TokenBuffer(TokenBuffer &&) = default; 181 TokenBuffer(const TokenBuffer &) = delete; 182 TokenBuffer &operator=(TokenBuffer &&) = default; 183 TokenBuffer &operator=(const TokenBuffer &) = delete; 184 185 /// All tokens produced by the preprocessor after all macro replacements, 186 /// directives, etc. Source locations found in the clang AST will always 187 /// point to one of these tokens. 188 /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()). 189 /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split 190 /// into two '>' tokens by the parser. However, TokenBuffer currently 191 /// keeps it as a single '>>' token. 192 llvm::ArrayRef<syntax::Token> expandedTokens() const { 193 return ExpandedTokens; 194 } 195 196 /// Builds a cache to make future calls to expandedToken(SourceRange) faster. 197 /// Creates an index only once. Further calls to it will be no-op. 198 void indexExpandedTokens(); 199 200 /// Returns the subrange of expandedTokens() corresponding to the closed 201 /// token range R. 202 /// Consider calling indexExpandedTokens() before for faster lookups. 203 llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const; 204 205 /// Returns the subrange of spelled tokens corresponding to AST node spanning 206 /// \p Expanded. This is the text that should be replaced if a refactoring 207 /// were to rewrite the node. If \p Expanded is empty, the returned value is 208 /// llvm::None. 209 /// 210 /// Will fail if the expanded tokens do not correspond to a sequence of 211 /// spelled tokens. E.g. for the following example: 212 /// 213 /// #define FIRST f1 f2 f3 214 /// #define SECOND s1 s2 s3 215 /// #define ID2(X, Y) X Y 216 /// 217 /// a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c 218 /// d ID2(e f g, h) i // expanded tokens are: d e f g h i 219 /// 220 /// the results would be: 221 /// expanded => spelled 222 /// ------------------------ 223 /// a => a 224 /// s1 s2 s3 => SECOND 225 /// a f1 f2 f3 => a FIRST 226 /// a f1 => can't map 227 /// s1 s2 => can't map 228 /// e f => e f 229 /// g h => can't map 230 /// 231 /// EXPECTS: \p Expanded is a subrange of expandedTokens(). 232 /// Complexity is logarithmic. 233 llvm::Optional<llvm::ArrayRef<syntax::Token>> 234 spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const; 235 236 /// Find the subranges of expanded tokens, corresponding to \p Spelled. 237 /// 238 /// Some spelled tokens may not be present in the expanded token stream, so 239 /// this function can return an empty vector, e.g. for tokens of macro 240 /// directives or disabled preprocessor branches. 241 /// 242 /// Some spelled tokens can be duplicated in the expanded token stream 243 /// multiple times and this function will return multiple results in those 244 /// cases. This happens when \p Spelled is inside a macro argument. 245 /// 246 /// FIXME: return correct results on macro arguments. For now, we return an 247 /// empty list. 248 /// 249 /// (!) will return empty vector on tokens from #define body: 250 /// E.g. for the following example: 251 /// 252 /// #define FIRST(A) f1 A = A f2 253 /// #define SECOND s 254 /// 255 /// a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s 256 /// The results would be 257 /// spelled => expanded 258 /// ------------------------ 259 /// #define FIRST => {} 260 /// a FIRST(arg) => {a f1 arg = arg f2} 261 /// arg => {arg, arg} // arg #1 is before `=` and arg #2 is 262 /// // after `=` in the expanded tokens. 263 llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1> 264 expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const; 265 266 /// An expansion produced by the preprocessor, includes macro expansions and 267 /// preprocessor directives. Preprocessor always maps a non-empty range of 268 /// spelled tokens to a (possibly empty) range of expanded tokens. Here is a 269 /// few examples of expansions: 270 /// #pragma once // Expands to an empty range. 271 /// #define FOO 1 2 3 // Expands an empty range. 272 /// FOO // Expands to "1 2 3". 273 /// FIXME(ibiryukov): implement this, currently #include expansions are empty. 274 /// #include <vector> // Expands to tokens produced by the include. 275 struct Expansion { 276 llvm::ArrayRef<syntax::Token> Spelled; 277 llvm::ArrayRef<syntax::Token> Expanded; 278 }; 279 /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting 280 /// a preprocessor directive) return the subrange of expanded tokens that the 281 /// macro expands to. 282 llvm::Optional<Expansion> 283 expansionStartingAt(const syntax::Token *Spelled) const; 284 /// Returns all expansions (partially) expanded from the specified tokens. 285 /// This is the expansions whose Spelled range intersects \p Spelled. 286 std::vector<Expansion> 287 expansionsOverlapping(llvm::ArrayRef<syntax::Token> Spelled) const; 288 289 /// Lexed tokens of a file before preprocessing. E.g. for the following input 290 /// #define DECL(name) int name = 10 291 /// DECL(a); 292 /// spelledTokens() returns 293 /// {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10", 294 /// "DECL", "(", "a", ")", ";"} 295 llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const; 296 297 /// Returns the spelled Token starting at Loc, if there are no such tokens 298 /// returns nullptr. 299 const syntax::Token *spelledTokenAt(SourceLocation Loc) const; 300 301 /// Get all tokens that expand a macro in \p FID. For the following input 302 /// #define FOO B 303 /// #define FOO2(X) int X 304 /// FOO2(XY) 305 /// int B; 306 /// FOO; 307 /// macroExpansions() returns {"FOO2", "FOO"} (from line 3 and 5 308 /// respecitvely). 309 std::vector<const syntax::Token *> macroExpansions(FileID FID) const; 310 311 const SourceManager &sourceManager() const { return *SourceMgr; } 312 313 std::string dumpForTests() const; 314 315 private: 316 /// Describes a mapping between a continuous subrange of spelled tokens and 317 /// expanded tokens. Represents macro expansions, preprocessor directives, 318 /// conditionally disabled pp regions, etc. 319 /// #define FOO 1+2 320 /// #define BAR(a) a + 1 321 /// FOO // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}. 322 /// BAR(1) // invocation #2, tokens = {'a', '+', '1'}, 323 /// macroTokens = {'BAR', '(', '1', ')'}. 324 struct Mapping { 325 // Positions in the corresponding spelled token stream. The corresponding 326 // range is never empty. 327 unsigned BeginSpelled = 0; 328 unsigned EndSpelled = 0; 329 // Positions in the expanded token stream. The corresponding range can be 330 // empty. 331 unsigned BeginExpanded = 0; 332 unsigned EndExpanded = 0; 333 334 /// For debugging purposes. 335 std::string str() const; 336 }; 337 /// Spelled tokens of the file with information about the subranges. 338 struct MarkedFile { 339 /// Lexed, but not preprocessed, tokens of the file. These map directly to 340 /// text in the corresponding files and include tokens of all preprocessor 341 /// directives. 342 /// FIXME: spelled tokens don't change across FileID that map to the same 343 /// FileEntry. We could consider deduplicating them to save memory. 344 std::vector<syntax::Token> SpelledTokens; 345 /// A sorted list to convert between the spelled and expanded token streams. 346 std::vector<Mapping> Mappings; 347 /// The first expanded token produced for this FileID. 348 unsigned BeginExpanded = 0; 349 unsigned EndExpanded = 0; 350 }; 351 352 friend class TokenCollector; 353 354 /// Maps a single expanded token to its spelled counterpart or a mapping that 355 /// produced it. 356 std::pair<const syntax::Token *, const Mapping *> 357 spelledForExpandedToken(const syntax::Token *Expanded) const; 358 359 /// Returns a mapping starting before \p Spelled token, or nullptr if no 360 /// such mapping exists. 361 static const Mapping * 362 mappingStartingBeforeSpelled(const MarkedFile &F, 363 const syntax::Token *Spelled); 364 365 /// Convert a private Mapping to a public Expansion. 366 Expansion makeExpansion(const MarkedFile &, const Mapping &) const; 367 /// Returns the file that the Spelled tokens are taken from. 368 /// Asserts that they are non-empty, from a tracked file, and in-bounds. 369 const MarkedFile &fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const; 370 371 /// Token stream produced after preprocessing, conceputally this captures the 372 /// same stream as 'clang -E' (excluding the preprocessor directives like 373 /// #file, etc.). 374 std::vector<syntax::Token> ExpandedTokens; 375 // Index of ExpandedTokens for faster lookups by SourceLocation. 376 llvm::DenseMap<SourceLocation, unsigned> ExpandedTokIndex; 377 llvm::DenseMap<FileID, MarkedFile> Files; 378 // The value is never null, pointer instead of reference to avoid disabling 379 // implicit assignment operator. 380 const SourceManager *SourceMgr; 381 }; 382 383 /// The spelled tokens that overlap or touch a spelling location Loc. 384 /// This always returns 0-2 tokens. 385 llvm::ArrayRef<syntax::Token> 386 spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens); 387 llvm::ArrayRef<syntax::Token> 388 spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef<syntax::Token> Tokens); 389 390 /// The identifier token that overlaps or touches a spelling location Loc. 391 /// If there is none, returns nullptr. 392 const syntax::Token * 393 spelledIdentifierTouching(SourceLocation Loc, 394 llvm::ArrayRef<syntax::Token> Tokens); 395 const syntax::Token * 396 spelledIdentifierTouching(SourceLocation Loc, 397 const syntax::TokenBuffer &Tokens); 398 399 /// Lex the text buffer, corresponding to \p FID, in raw mode and record the 400 /// resulting spelled tokens. Does minimal post-processing on raw identifiers, 401 /// setting the appropriate token kind (instead of the raw_identifier reported 402 /// by lexer in raw mode). This is a very low-level function, most users should 403 /// prefer to use TokenCollector. Lexing in raw mode produces wildly different 404 /// results from what one might expect when running a C++ frontend, e.g. 405 /// preprocessor does not run at all. 406 /// The result will *not* have a 'eof' token at the end. 407 std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM, 408 const LangOptions &LO); 409 /// Similar to one above, instead of whole file tokenizes a part of it. Note 410 /// that, the first token might be incomplete if FR.startOffset is not at the 411 /// beginning of a token, and the last token returned will start before the 412 /// FR.endOffset but might end after it. 413 std::vector<syntax::Token> 414 tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO); 415 416 /// Collects tokens for the main file while running the frontend action. An 417 /// instance of this object should be created on 418 /// FrontendAction::BeginSourceFile() and the results should be consumed after 419 /// FrontendAction::Execute() finishes. 420 class TokenCollector { 421 public: 422 /// Adds the hooks to collect the tokens. Should be called before the 423 /// preprocessing starts, i.e. as a part of BeginSourceFile() or 424 /// CreateASTConsumer(). 425 TokenCollector(Preprocessor &P); 426 427 /// Finalizes token collection. Should be called after preprocessing is 428 /// finished, i.e. after running Execute(). 429 LLVM_NODISCARD TokenBuffer consume() &&; 430 431 private: 432 /// Maps from a start to an end spelling location of transformations 433 /// performed by the preprocessor. These include: 434 /// 1. range from '#' to the last token in the line for PP directives, 435 /// 2. macro name and arguments for macro expansions. 436 /// Note that we record only top-level macro expansions, intermediate 437 /// expansions (e.g. inside macro arguments) are ignored. 438 /// 439 /// Used to find correct boundaries of macro calls and directives when 440 /// building mappings from spelled to expanded tokens. 441 /// 442 /// Logically, at each point of the preprocessor execution there is a stack of 443 /// macro expansions being processed and we could use it to recover the 444 /// location information we need. However, the public preprocessor API only 445 /// exposes the points when macro expansions start (when we push a macro onto 446 /// the stack) and not when they end (when we pop a macro from the stack). 447 /// To workaround this limitation, we rely on source location information 448 /// stored in this map. 449 using PPExpansions = llvm::DenseMap<SourceLocation, SourceLocation>; 450 class Builder; 451 class CollectPPExpansions; 452 453 std::vector<syntax::Token> Expanded; 454 // FIXME: we only store macro expansions, also add directives(#pragma, etc.) 455 PPExpansions Expansions; 456 Preprocessor &PP; 457 CollectPPExpansions *Collector; 458 }; 459 460 } // namespace syntax 461 } // namespace clang 462 463 #endif 464