1 //===--- Macros.h - Format C++ code -----------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains the main building blocks of macro support in
11 /// clang-format.
12 ///
13 /// In order to not violate the requirement that clang-format can format files
14 /// in isolation, clang-format's macro support uses expansions users provide
15 /// as part of clang-format's style configuration.
16 ///
17 /// Macro definitions are of the form "MACRO(p1, p2)=p1 + p2", but only support
18 /// one level of expansion (\see MacroExpander for a full description of what
19 /// is supported).
20 ///
21 /// As part of parsing, clang-format uses the MacroExpander to expand the
22 /// spelled token streams into expanded token streams when it encounters a
23 /// macro call. The UnwrappedLineParser continues to parse UnwrappedLines
24 /// from the expanded token stream.
25 /// After the expanded unwrapped lines are parsed, the MacroCallReconstructor
26 /// matches the spelled token stream into unwrapped lines that best resemble the
27 /// structure of the expanded unwrapped lines. These reconstructed unwrapped
28 /// lines are aliasing the tokens in the expanded token stream, so that token
29 /// annotations will be reused when formatting the spelled macro calls.
30 ///
31 /// When formatting, clang-format annotates and formats the expanded unwrapped
32 /// lines first, determining the token types. Next, it formats the spelled
33 /// unwrapped lines, keeping the token types fixed, while allowing other
34 /// formatting decisions to change.
35 ///
36 //===----------------------------------------------------------------------===//
37 
38 #ifndef CLANG_LIB_FORMAT_MACROS_H
39 #define CLANG_LIB_FORMAT_MACROS_H
40 
41 #include <list>
42 #include <map>
43 #include <string>
44 #include <vector>
45 
46 #include "FormatToken.h"
47 #include "llvm/ADT/ArrayRef.h"
48 #include "llvm/ADT/DenseMap.h"
49 #include "llvm/ADT/SmallVector.h"
50 #include "llvm/ADT/StringRef.h"
51 
52 namespace clang {
53 namespace format {
54 
55 struct UnwrappedLine;
56 struct UnwrappedLineNode;
57 
58 /// Takes a set of macro definitions as strings and allows expanding calls to
59 /// those macros.
60 ///
61 /// For example:
62 /// Definition: A(x, y)=x + y
63 /// Call      : A(int a = 1, 2)
64 /// Expansion : int a = 1 + 2
65 ///
66 /// Expansion does not check arity of the definition.
67 /// If fewer arguments than expected are provided, the remaining parameters
68 /// are considered empty:
69 /// Call     : A(a)
70 /// Expansion: a +
71 /// If more arguments than expected are provided, they will be discarded.
72 ///
73 /// The expander does not support:
74 /// - recursive expansion
75 /// - stringification
76 /// - concatenation
77 /// - variadic macros
78 ///
79 /// Furthermore, only a single expansion of each macro argument is supported,
80 /// so that we cannot get conflicting formatting decisions from different
81 /// expansions.
82 /// Definition: A(x)=x+x
83 /// Call      : A(id)
84 /// Expansion : id+x
85 ///
86 class MacroExpander {
87 public:
88   using ArgsList = llvm::ArrayRef<llvm::SmallVector<FormatToken *, 8>>;
89 
90   /// Construct a macro expander from a set of macro definitions.
91   /// Macro definitions must be encoded as UTF-8.
92   ///
93   /// Each entry in \p Macros must conform to the following simple
94   /// macro-definition language:
95   /// <definition> ::= <id> <expansion> | <id> "(" <params> ")" <expansion>
96   /// <params>     ::= <id-list> | ""
97   /// <id-list>    ::= <id> | <id> "," <params>
98   /// <expansion>  ::= "=" <tail> | <eof>
99   /// <tail>       ::= <tok> <tail> | <eof>
100   ///
101   /// Macros that cannot be parsed will be silently discarded.
102   ///
103   MacroExpander(const std::vector<std::string> &Macros,
104                 clang::SourceManager &SourceMgr, const FormatStyle &Style,
105                 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
106                 IdentifierTable &IdentTable);
107   ~MacroExpander();
108 
109   /// Returns whether any macro \p Name is defined, regardless of overloads.
110   bool defined(llvm::StringRef Name) const;
111 
112   /// Returns whetherh there is an object-like overload, i.e. where the macro
113   /// has no arguments and should not consume subsequent parentheses.
114   bool objectLike(llvm::StringRef Name) const;
115 
116   /// Returns whether macro \p Name provides an overload with the given arity.
117   bool hasArity(llvm::StringRef Name, unsigned Arity) const;
118 
119   /// Returns the expanded stream of format tokens for \p ID, where
120   /// each element in \p Args is a positional argument to the macro call.
121   /// If \p Args is not set, the object-like overload is used.
122   /// If \p Args is set, the overload with the arity equal to \c Args.size() is
123   /// used.
124   llvm::SmallVector<FormatToken *, 8>
125   expand(FormatToken *ID, std::optional<ArgsList> OptionalArgs) const;
126 
127 private:
128   struct Definition;
129   class DefinitionParser;
130 
131   void parseDefinition(const std::string &Macro);
132 
133   clang::SourceManager &SourceMgr;
134   const FormatStyle &Style;
135   llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator;
136   IdentifierTable &IdentTable;
137   SmallVector<std::unique_ptr<llvm::MemoryBuffer>> Buffers;
138   llvm::StringMap<llvm::DenseMap<int, Definition>> FunctionLike;
139   llvm::StringMap<Definition> ObjectLike;
140 };
141 
142 /// Converts a sequence of UnwrappedLines containing expanded macros into a
143 /// single UnwrappedLine containing the macro calls.  This UnwrappedLine may be
144 /// broken into child lines, in a way that best conveys the structure of the
145 /// expanded code.
146 ///
147 /// In the simplest case, a spelled UnwrappedLine contains one macro, and after
148 /// expanding it we have one expanded UnwrappedLine.  In general, macro
149 /// expansions can span UnwrappedLines, and multiple macros can contribute
150 /// tokens to the same line.  We keep consuming expanded lines until:
151 /// *   all expansions that started have finished (we're not chopping any macros
152 ///     in half)
153 /// *   *and* we've reached the end of a *spelled* unwrapped line.
154 ///
155 /// A single UnwrappedLine represents this chunk of code.
156 ///
157 /// After this point, the state of the spelled/expanded stream is "in sync"
158 /// (both at the start of an UnwrappedLine, with no macros open), so the
159 /// Reconstructor can be thrown away and parsing can continue.
160 ///
161 /// Given a mapping from the macro name identifier token in the macro call
162 /// to the tokens of the macro call, for example:
163 /// CLASSA -> CLASSA({public: void x();})
164 ///
165 /// When getting the formatted lines of the expansion via the \c addLine method
166 /// (each '->' specifies a call to \c addLine ):
167 /// -> class A {
168 /// -> public:
169 /// ->   void x();
170 /// -> };
171 ///
172 /// Creates the tree of unwrapped lines containing the macro call tokens so that
173 /// the macro call tokens fit the semantic structure of the expanded formatted
174 /// lines:
175 /// -> CLASSA({
176 /// -> public:
177 /// ->   void x();
178 /// -> })
179 class MacroCallReconstructor {
180 public:
181   /// Create an Reconstructor whose resulting \p UnwrappedLine will start at
182   /// \p Level, using the map from name identifier token to the corresponding
183   /// tokens of the spelled macro call.
184   MacroCallReconstructor(
185       unsigned Level,
186       const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>>
187           &ActiveExpansions);
188 
189   /// For the given \p Line, match all occurences of tokens expanded from a
190   /// macro to unwrapped lines in the spelled macro call so that the resulting
191   /// tree of unwrapped lines best resembles the structure of unwrapped lines
192   /// passed in via \c addLine.
193   void addLine(const UnwrappedLine &Line);
194 
195   /// Check whether at the current state there is no open macro expansion
196   /// that needs to be processed to finish an macro call.
197   /// Only when \c finished() is true, \c takeResult() can be called to retrieve
198   /// the resulting \c UnwrappedLine.
199   /// If there are multiple subsequent macro calls within an unwrapped line in
200   /// the spelled token stream, the calling code may also continue to call
201   /// \c addLine() when \c finished() is true.
202   bool finished() const { return ActiveExpansions.empty(); }
203 
204   /// Retrieve the formatted \c UnwrappedLine containing the orginal
205   /// macro calls, formatted according to the expanded token stream received
206   /// via \c addLine().
207   /// Generally, this line tries to have the same structure as the expanded,
208   /// formatted unwrapped lines handed in via \c addLine(), with the exception
209   /// that for multiple top-level lines, each subsequent line will be the
210   /// child of the last token in its predecessor. This representation is chosen
211   /// because it is a precondition to the formatter that we get what looks like
212   /// a single statement in a single \c UnwrappedLine (i.e. matching parens).
213   ///
214   /// If a token in a macro argument is a child of a token in the expansion,
215   /// the parent will be the corresponding token in the macro call.
216   /// For example:
217   ///   #define C(a, b) class C { a b
218   ///   C(int x;, int y;)
219   /// would expand to
220   ///   class C { int x; int y;
221   /// where in a formatted line "int x;" and "int y;" would both be new separate
222   /// lines.
223   ///
224   /// In the result, "int x;" will be a child of the opening parenthesis in "C("
225   /// and "int y;" will be a child of the "," token:
226   ///   C (
227   ///     \- int x;
228   ///     ,
229   ///     \- int y;
230   ///     )
231   UnwrappedLine takeResult() &&;
232 
233 private:
234   void add(FormatToken *Token, FormatToken *ExpandedParent, bool First);
235   void prepareParent(FormatToken *ExpandedParent, bool First);
236   FormatToken *getParentInResult(FormatToken *Parent);
237   void reconstruct(FormatToken *Token);
238   void startReconstruction(FormatToken *Token);
239   bool reconstructActiveCallUntil(FormatToken *Token);
240   void endReconstruction(FormatToken *Token);
241   bool processNextReconstructed();
242   void finalize();
243 
244   struct ReconstructedLine;
245 
246   void appendToken(FormatToken *Token, ReconstructedLine *L = nullptr);
247   UnwrappedLine createUnwrappedLine(const ReconstructedLine &Line, int Level);
248   void debug(const ReconstructedLine &Line, int Level);
249   ReconstructedLine &parentLine();
250   ReconstructedLine *currentLine();
251   void debugParentMap() const;
252 
253 #ifndef NDEBUG
254   enum ReconstructorState {
255     Start,      // No macro expansion was found in the input yet.
256     InProgress, // During a macro reconstruction.
257     Finalized,  // Past macro reconstruction, the result is finalized.
258   };
259   ReconstructorState State = Start;
260 #endif
261 
262   // Node in which we build up the resulting unwrapped line; this type is
263   // analogous to UnwrappedLineNode.
264   struct LineNode {
265     LineNode() = default;
266     LineNode(FormatToken *Tok) : Tok(Tok) {}
267     FormatToken *Tok = nullptr;
268     llvm::SmallVector<std::unique_ptr<ReconstructedLine>> Children;
269   };
270 
271   // Line in which we build up the resulting unwrapped line.
272   // FIXME: Investigate changing UnwrappedLine to a pointer type and using it
273   // instead of rolling our own type.
274   struct ReconstructedLine {
275     llvm::SmallVector<std::unique_ptr<LineNode>> Tokens;
276   };
277 
278   // The line in which we collect the resulting reconstructed output.
279   // To reduce special cases in the algorithm, the first level of the line
280   // contains a single null token that has the reconstructed incoming
281   // lines as children.
282   // In the end, we stich the lines together so that each subsequent line
283   // is a child of the last token of the previous line. This is necessary
284   // in order to format the overall expression as a single logical line -
285   // if we created separate lines, we'd format them with their own top-level
286   // indent depending on the semantic structure, which is not desired.
287   ReconstructedLine Result;
288 
289   // Stack of currently "open" lines, where each line's predecessor's last
290   // token is the parent token for that line.
291   llvm::SmallVector<ReconstructedLine *> ActiveReconstructedLines;
292 
293   // Maps from the expanded token to the token that takes its place in the
294   // reconstructed token stream in terms of parent-child relationships.
295   // Note that it might take multiple steps to arrive at the correct
296   // parent in the output.
297   // Given: #define C(a, b) []() { a; b; }
298   // And a call: C(f(), g())
299   // The structure in the incoming formatted unwrapped line will be:
300   // []() {
301   //      |- f();
302   //      \- g();
303   // }
304   // with f and g being children of the opening brace.
305   // In the reconstructed call:
306   // C(f(), g())
307   //  \- f()
308   //      \- g()
309   // We want f to be a child of the opening parenthesis and g to be a child
310   // of the comma token in the macro call.
311   // Thus, we map
312   // { -> (
313   // and add
314   // ( -> ,
315   // once we're past the comma in the reconstruction.
316   llvm::DenseMap<FormatToken *, FormatToken *>
317       SpelledParentToReconstructedParent;
318 
319   // Keeps track of a single expansion while we're reconstructing tokens it
320   // generated.
321   struct Expansion {
322     // The identifier token of the macro call.
323     FormatToken *ID;
324     // Our current position in the reconstruction.
325     std::list<UnwrappedLineNode>::iterator SpelledI;
326     // The end of the reconstructed token sequence.
327     std::list<UnwrappedLineNode>::iterator SpelledE;
328   };
329 
330   // Stack of macro calls for which we're in the middle of an expansion.
331   llvm::SmallVector<Expansion> ActiveExpansions;
332 
333   struct MacroCallState {
334     MacroCallState(ReconstructedLine *Line, FormatToken *ParentLastToken,
335                    FormatToken *MacroCallLParen);
336 
337     ReconstructedLine *Line;
338 
339     // The last token in the parent line or expansion, or nullptr if the macro
340     // expansion is on a top-level line.
341     //
342     // For example, in the macro call:
343     //   auto f = []() { ID(1); };
344     // The MacroCallState for ID will have '{' as ParentLastToken.
345     //
346     // In the macro call:
347     //   ID(ID(void f()));
348     // The MacroCallState of the outer ID will have nullptr as ParentLastToken,
349     // while the MacroCallState for the inner ID will have the '(' of the outer
350     // ID as ParentLastToken.
351     //
352     // In the macro call:
353     //   ID2(a, ID(b));
354     // The MacroCallState of ID will have ',' as ParentLastToken.
355     FormatToken *ParentLastToken;
356 
357     // The l_paren of this MacroCallState's macro call.
358     FormatToken *MacroCallLParen;
359   };
360 
361   // Keeps track of the lines into which the opening brace/parenthesis &
362   // argument separating commas for each level in the macro call go in order to
363   // put the corresponding closing brace/parenthesis into the same line in the
364   // output and keep track of which parents in the expanded token stream map to
365   // which tokens in the reconstructed stream.
366   // When an opening brace/parenthesis has children, we want the structure of
367   // the output line to be:
368   // |- MACRO
369   // |- (
370   // |  \- <argument>
371   // |- ,
372   // |  \- <argument>
373   // \- )
374   llvm::SmallVector<MacroCallState> MacroCallStructure;
375 
376   // Level the generated UnwrappedLine will be at.
377   const unsigned Level;
378 
379   // Maps from identifier of the macro call to an unwrapped line containing
380   // all tokens of the macro call.
381   const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>>
382       &IdToReconstructed;
383 };
384 
385 } // namespace format
386 } // namespace clang
387 
388 #endif
389