1 //===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Record tokens that a preprocessor emits and define operations to map between
9 // the tokens written in a file and tokens produced by the preprocessor.
10 //
11 // When running the compiler, there are two token streams we are interested in:
12 //   - "spelled" tokens directly correspond to a substring written in some
13 //     source file.
14 //   - "expanded" tokens represent the result of preprocessing, parses consumes
15 //     this token stream to produce the AST.
16 //
17 // Expanded tokens correspond directly to locations found in the AST, allowing
18 // to find subranges of the token stream covered by various AST nodes. Spelled
19 // tokens correspond directly to the source code written by the user.
20 //
21 // To allow composing these two use-cases, we also define operations that map
22 // between expanded and spelled tokens that produced them (macro calls,
23 // directives, etc).
24 //
25 //===----------------------------------------------------------------------===//
26 
27 #ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
28 #define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
29 
30 #include "clang/Basic/FileManager.h"
31 #include "clang/Basic/LangOptions.h"
32 #include "clang/Basic/SourceLocation.h"
33 #include "clang/Basic/SourceManager.h"
34 #include "clang/Basic/TokenKinds.h"
35 #include "clang/Lex/Token.h"
36 #include "llvm/ADT/ArrayRef.h"
37 #include "llvm/ADT/Optional.h"
38 #include "llvm/ADT/StringRef.h"
39 #include "llvm/Support/Compiler.h"
40 #include "llvm/Support/raw_ostream.h"
41 #include <cstdint>
42 #include <tuple>
43 
44 namespace clang {
45 class Preprocessor;
46 
47 namespace syntax {
48 
49 /// A half-open character range inside a particular file, the start offset is
50 /// included and the end offset is excluded from the range.
51 struct FileRange {
52   /// EXPECTS: File.isValid() && Begin <= End.
53   FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset);
54   /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
55   FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length);
56   /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files
57   ///          are the same.
58   FileRange(const SourceManager &SM, SourceLocation BeginLoc,
59             SourceLocation EndLoc);
60 
61   FileID file() const { return File; }
62   /// Start is a start offset (inclusive) in the corresponding file.
63   unsigned beginOffset() const { return Begin; }
64   /// End offset (exclusive) in the corresponding file.
65   unsigned endOffset() const { return End; }
66 
67   unsigned length() const { return End - Begin; }
68 
69   /// Check if \p Offset is inside the range.
70   bool contains(unsigned Offset) const {
71     return Begin <= Offset && Offset < End;
72   }
73   /// Check \p Offset is inside the range or equal to its endpoint.
74   bool touches(unsigned Offset) const {
75     return Begin <= Offset && Offset <= End;
76   }
77 
78   /// Gets the substring that this FileRange refers to.
79   llvm::StringRef text(const SourceManager &SM) const;
80 
81   /// Convert to the clang range. The returned range is always a char range,
82   /// never a token range.
83   CharSourceRange toCharRange(const SourceManager &SM) const;
84 
85   friend bool operator==(const FileRange &L, const FileRange &R) {
86     return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End);
87   }
88   friend bool operator!=(const FileRange &L, const FileRange &R) {
89     return !(L == R);
90   }
91 
92 private:
93   FileID File;
94   unsigned Begin;
95   unsigned End;
96 };
97 
98 /// For debugging purposes.
99 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);
100 
101 /// A token coming directly from a file or from a macro invocation. Has just
102 /// enough information to locate the token in the source code.
103 /// Can represent both expanded and spelled tokens.
104 class Token {
105 public:
106   Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind);
107   /// EXPECTS: clang::Token is not an annotation token.
108   explicit Token(const clang::Token &T);
109 
110   tok::TokenKind kind() const { return Kind; }
111   /// Location of the first character of a token.
112   SourceLocation location() const { return Location; }
113   /// Location right after the last character of a token.
114   SourceLocation endLocation() const {
115     return Location.getLocWithOffset(Length);
116   }
117   unsigned length() const { return Length; }
118 
119   /// Get the substring covered by the token. Note that will include all
120   /// digraphs, newline continuations, etc. E.g. tokens for 'int' and
121   ///    in\
122   ///    t
123   /// both have the same kind tok::kw_int, but results of text() are different.
124   llvm::StringRef text(const SourceManager &SM) const;
125 
126   /// Gets a range of this token.
127   /// EXPECTS: token comes from a file, not from a macro expansion.
128   FileRange range(const SourceManager &SM) const;
129 
130   /// Given two tokens inside the same file, returns a file range that starts at
131   /// \p First and ends at \p Last.
132   /// EXPECTS: First and Last are file tokens from the same file, Last starts
133   ///          after First.
134   static FileRange range(const SourceManager &SM, const syntax::Token &First,
135                          const syntax::Token &Last);
136 
137   std::string dumpForTests(const SourceManager &SM) const;
138   /// For debugging purposes.
139   std::string str() const;
140 
141 private:
142   SourceLocation Location;
143   unsigned Length;
144   tok::TokenKind Kind;
145 };
146 /// For debugging purposes. Equivalent to a call to Token::str().
147 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);
148 
149 /// A list of tokens obtained by preprocessing a text buffer and operations to
150 /// map between the expanded and spelled tokens, i.e. TokenBuffer has
151 /// information about two token streams:
152 ///    1. Expanded tokens: tokens produced by the preprocessor after all macro
153 ///       replacements,
154 ///    2. Spelled tokens: corresponding directly to the source code of a file
155 ///       before any macro replacements occurred.
156 /// Here's an example to illustrate a difference between those two:
157 ///     #define FOO 10
158 ///     int a = FOO;
159 ///
160 /// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
161 /// Expanded tokens are {'int','a','=','10',';','eof'}.
162 ///
163 /// Note that the expanded token stream has a tok::eof token at the end, the
164 /// spelled tokens never store a 'eof' token.
165 ///
166 /// The full list expanded tokens can be obtained with expandedTokens(). Spelled
167 /// tokens for each of the files can be obtained via spelledTokens(FileID).
168 ///
169 /// To map between the expanded and spelled tokens use findSpelledByExpanded().
170 ///
171 /// To build a token buffer use the TokenCollector class. You can also compute
172 /// the spelled tokens of a file using the tokenize() helper.
173 ///
174 /// FIXME: allow to map from spelled to expanded tokens when use-case shows up.
175 /// FIXME: allow mappings into macro arguments.
176 class TokenBuffer {
177 public:
178   TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}
179   /// All tokens produced by the preprocessor after all macro replacements,
180   /// directives, etc. Source locations found in the clang AST will always
181   /// point to one of these tokens.
182   /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()).
183   /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
184   ///        into two '>' tokens by the parser. However, TokenBuffer currently
185   ///        keeps it as a single '>>' token.
186   llvm::ArrayRef<syntax::Token> expandedTokens() const {
187     return ExpandedTokens;
188   }
189 
190   /// Returns the subrange of expandedTokens() corresponding to the closed
191   /// token range R.
192   llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const;
193 
194   /// Find the subrange of spelled tokens that produced the corresponding \p
195   /// Expanded tokens.
196   ///
197   /// EXPECTS: \p Expanded is a subrange of expandedTokens().
198   ///
199   /// Will fail if the expanded tokens do not correspond to a
200   /// sequence of spelled tokens. E.g. for the following example:
201   ///
202   ///   #define FIRST f1 f2 f3
203   ///   #define SECOND s1 s2 s3
204   ///
205   ///   a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
206   ///
207   /// the results would be:
208   ///   expanded   => spelled
209   ///   ------------------------
210   ///            a => a
211   ///     s1 s2 s3 => SECOND
212   ///   a f1 f2 f3 => a FIRST
213   ///         a f1 => can't map
214   ///        s1 s2 => can't map
215   ///
216   /// If \p Expanded is empty, the returned value is llvm::None.
217   /// Complexity is logarithmic.
218   llvm::Optional<llvm::ArrayRef<syntax::Token>>
219   spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;
220 
221   /// An expansion produced by the preprocessor, includes macro expansions and
222   /// preprocessor directives. Preprocessor always maps a non-empty range of
223   /// spelled tokens to a (possibly empty) range of expanded tokens. Here is a
224   /// few examples of expansions:
225   ///    #pragma once      // Expands to an empty range.
226   ///    #define FOO 1 2 3 // Expands an empty range.
227   ///    FOO               // Expands to "1 2 3".
228   /// FIXME(ibiryukov): implement this, currently #include expansions are empty.
229   ///    #include <vector> // Expands to tokens produced by the include.
230   struct Expansion {
231     llvm::ArrayRef<syntax::Token> Spelled;
232     llvm::ArrayRef<syntax::Token> Expanded;
233   };
234   /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting
235   /// a preprocessor directive) return the subrange of expanded tokens that the
236   /// macro expands to.
237   llvm::Optional<Expansion>
238   expansionStartingAt(const syntax::Token *Spelled) const;
239 
240   /// Lexed tokens of a file before preprocessing. E.g. for the following input
241   ///     #define DECL(name) int name = 10
242   ///     DECL(a);
243   /// spelledTokens() returns
244   ///    {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10",
245   ///     "DECL", "(", "a", ")", ";"}
246   llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const;
247 
248   /// Get all tokens that expand a macro in \p FID. For the following input
249   ///     #define FOO B
250   ///     #define FOO2(X) int X
251   ///     FOO2(XY)
252   ///     int B;
253   ///     FOO;
254   /// macroExpansions() returns {"FOO2", "FOO"} (from line 3 and 5
255   /// respecitvely).
256   std::vector<const syntax::Token *> macroExpansions(FileID FID) const;
257 
258   const SourceManager &sourceManager() const { return *SourceMgr; }
259 
260   std::string dumpForTests() const;
261 
262 private:
263   /// Describes a mapping between a continuous subrange of spelled tokens and
264   /// expanded tokens. Represents macro expansions, preprocessor directives,
265   /// conditionally disabled pp regions, etc.
266   ///   #define FOO 1+2
267   ///   #define BAR(a) a + 1
268   ///   FOO    // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
269   ///   BAR(1) // invocation #2, tokens = {'a', '+', '1'},
270   ///                            macroTokens = {'BAR', '(', '1', ')'}.
271   struct Mapping {
272     // Positions in the corresponding spelled token stream. The corresponding
273     // range is never empty.
274     unsigned BeginSpelled = 0;
275     unsigned EndSpelled = 0;
276     // Positions in the expanded token stream. The corresponding range can be
277     // empty.
278     unsigned BeginExpanded = 0;
279     unsigned EndExpanded = 0;
280 
281     /// For debugging purposes.
282     std::string str() const;
283   };
284   /// Spelled tokens of the file with information about the subranges.
285   struct MarkedFile {
286     /// Lexed, but not preprocessed, tokens of the file. These map directly to
287     /// text in the corresponding files and include tokens of all preprocessor
288     /// directives.
289     /// FIXME: spelled tokens don't change across FileID that map to the same
290     ///        FileEntry. We could consider deduplicating them to save memory.
291     std::vector<syntax::Token> SpelledTokens;
292     /// A sorted list to convert between the spelled and expanded token streams.
293     std::vector<Mapping> Mappings;
294     /// The first expanded token produced for this FileID.
295     unsigned BeginExpanded = 0;
296     unsigned EndExpanded = 0;
297   };
298 
299   friend class TokenCollector;
300 
301   /// Maps a single expanded token to its spelled counterpart or a mapping that
302   /// produced it.
303   std::pair<const syntax::Token *, const Mapping *>
304   spelledForExpandedToken(const syntax::Token *Expanded) const;
305 
306   /// Token stream produced after preprocessing, conceputally this captures the
307   /// same stream as 'clang -E' (excluding the preprocessor directives like
308   /// #file, etc.).
309   std::vector<syntax::Token> ExpandedTokens;
310   llvm::DenseMap<FileID, MarkedFile> Files;
311   // The value is never null, pointer instead of reference to avoid disabling
312   // implicit assignment operator.
313   const SourceManager *SourceMgr;
314 };
315 
316 /// The spelled tokens that overlap or touch a spelling location Loc.
317 /// This always returns 0-2 tokens.
318 llvm::ArrayRef<syntax::Token>
319 spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens);
320 
321 /// The identifier token that overlaps or touches a spelling location Loc.
322 /// If there is none, returns nullptr.
323 const syntax::Token *
324 spelledIdentifierTouching(SourceLocation Loc,
325                           const syntax::TokenBuffer &Tokens);
326 
327 /// Lex the text buffer, corresponding to \p FID, in raw mode and record the
328 /// resulting spelled tokens. Does minimal post-processing on raw identifiers,
329 /// setting the appropriate token kind (instead of the raw_identifier reported
330 /// by lexer in raw mode). This is a very low-level function, most users should
331 /// prefer to use TokenCollector. Lexing in raw mode produces wildly different
332 /// results from what one might expect when running a C++ frontend, e.g.
333 /// preprocessor does not run at all.
334 /// The result will *not* have a 'eof' token at the end.
335 std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
336                                     const LangOptions &LO);
337 
338 /// Collects tokens for the main file while running the frontend action. An
339 /// instance of this object should be created on
340 /// FrontendAction::BeginSourceFile() and the results should be consumed after
341 /// FrontendAction::Execute() finishes.
342 class TokenCollector {
343 public:
344   /// Adds the hooks to collect the tokens. Should be called before the
345   /// preprocessing starts, i.e. as a part of BeginSourceFile() or
346   /// CreateASTConsumer().
347   TokenCollector(Preprocessor &P);
348 
349   /// Finalizes token collection. Should be called after preprocessing is
350   /// finished, i.e. after running Execute().
351   LLVM_NODISCARD TokenBuffer consume() &&;
352 
353 private:
354   /// Maps from a start to an end spelling location of transformations
355   /// performed by the preprocessor. These include:
356   ///   1. range from '#' to the last token in the line for PP directives,
357   ///   2. macro name and arguments for macro expansions.
358   /// Note that we record only top-level macro expansions, intermediate
359   /// expansions (e.g. inside macro arguments) are ignored.
360   ///
361   /// Used to find correct boundaries of macro calls and directives when
362   /// building mappings from spelled to expanded tokens.
363   ///
364   /// Logically, at each point of the preprocessor execution there is a stack of
365   /// macro expansions being processed and we could use it to recover the
366   /// location information we need. However, the public preprocessor API only
367   /// exposes the points when macro expansions start (when we push a macro onto
368   /// the stack) and not when they end (when we pop a macro from the stack).
369   /// To workaround this limitation, we rely on source location information
370   /// stored in this map.
371   using PPExpansions = llvm::DenseMap</*SourceLocation*/ int, SourceLocation>;
372   class Builder;
373   class CollectPPExpansions;
374 
375   std::vector<syntax::Token> Expanded;
376   // FIXME: we only store macro expansions, also add directives(#pragma, etc.)
377   PPExpansions Expansions;
378   Preprocessor &PP;
379   CollectPPExpansions *Collector;
380 };
381 
382 } // namespace syntax
383 } // namespace clang
384 
385 #endif
386