1 //===- Tree.h - structure of the syntax tree ------------------*- C++ -*-=====//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Defines the basic structure of the syntax tree. There are two kinds of nodes:
9 //   - leaf nodes correspond to tokens,
10 //   - tree nodes correspond to language grammar constructs.
11 //
12 // The tree is initially built from an AST. Each node of a newly built tree
13 // covers a continous subrange of expanded tokens (i.e. tokens after
14 // preprocessing), the specific tokens coverered are stored in the leaf nodes of
15 // a tree. A post-order traversal of a tree will visit leaf nodes in an order
16 // corresponding the original order of expanded tokens.
17 //
18 // This is still work in progress and highly experimental, we leave room for
19 // ourselves to completely change the design and/or implementation.
20 //===----------------------------------------------------------------------===//
21 #ifndef LLVM_CLANG_TOOLING_SYNTAX_TREE_H
22 #define LLVM_CLANG_TOOLING_SYNTAX_TREE_H
23 
24 #include "clang/Basic/TokenKinds.h"
25 #include "clang/Tooling/Syntax/TokenManager.h"
26 #include "llvm/ADT/iterator.h"
27 #include "llvm/Support/Allocator.h"
28 #include <cstdint>
29 #include <vector>
30 
31 namespace clang {
32 namespace syntax {
33 
34 /// A memory arena for syntax trees.
35 // FIXME: use BumpPtrAllocator directly.
36 class Arena {
37 public:
38   llvm::BumpPtrAllocator &getAllocator() { return Allocator; }
39 private:
40   /// Keeps all the allocated nodes and their intermediate data structures.
41   llvm::BumpPtrAllocator Allocator;
42 };
43 
44 class Tree;
45 class TreeBuilder;
46 class FactoryImpl;
47 class MutationsImpl;
48 
49 enum class NodeKind : uint16_t;
50 enum class NodeRole : uint8_t;
51 
52 /// A node in a syntax tree. Each node is either a Leaf (representing tokens) or
53 /// a Tree (representing language constructrs).
54 class Node {
55 protected:
56   /// Newly created nodes are detached from a tree, parent and sibling links are
57   /// set when the node is added as a child to another one.
58   Node(NodeKind Kind);
59   /// Nodes are allocated on Arenas; the destructor is never called.
60   ~Node() = default;
61 
62 public:
63   /// Nodes cannot simply be copied without violating tree invariants.
64   Node(const Node &) = delete;
65   Node &operator=(const Node &) = delete;
66   /// Idiomatically, nodes are allocated on an Arena and never moved.
67   Node(Node &&) = delete;
68   Node &operator=(Node &&) = delete;
69 
70   NodeKind getKind() const { return static_cast<NodeKind>(Kind); }
71   NodeRole getRole() const { return static_cast<NodeRole>(Role); }
72 
73   /// Whether the node is detached from a tree, i.e. does not have a parent.
74   bool isDetached() const;
75   /// Whether the node was created from the AST backed by the source code
76   /// rather than added later through mutation APIs or created with factory
77   /// functions.
78   /// When this flag is true, all subtrees are also original.
79   /// This flag is set to false on any modifications to the node or any of its
80   /// subtrees, even if this simply involves swapping existing subtrees.
81   bool isOriginal() const { return Original; }
82   /// If this function return false, the tree cannot be modified because there
83   /// is no reasonable way to produce the corresponding textual replacements.
84   /// This can happen when the node crosses macro expansion boundaries.
85   ///
86   /// Note that even if the node is not modifiable, its child nodes can be
87   /// modifiable.
88   bool canModify() const { return CanModify; }
89 
90   const Tree *getParent() const { return Parent; }
91   Tree *getParent() { return Parent; }
92 
93   const Node *getNextSibling() const { return NextSibling; }
94   Node *getNextSibling() { return NextSibling; }
95   const Node *getPreviousSibling() const { return PreviousSibling; }
96   Node *getPreviousSibling() { return PreviousSibling; }
97 
98   /// Dumps the structure of a subtree. For debugging and testing purposes.
99   std::string dump(const TokenManager &SM) const;
100   /// Dumps the tokens forming this subtree.
101   std::string dumpTokens(const TokenManager &SM) const;
102 
103   /// Asserts invariants on this node of the tree and its immediate children.
104   /// Will not recurse into the subtree. No-op if NDEBUG is set.
105   void assertInvariants() const;
106   /// Runs checkInvariants on all nodes in the subtree. No-op if NDEBUG is set.
107   void assertInvariantsRecursive() const;
108 
109 private:
110   // Tree is allowed to change the Parent link and Role.
111   friend class Tree;
112   // TreeBuilder is allowed to set the Original and CanModify flags.
113   friend class TreeBuilder;
114   // MutationsImpl sets roles and CanModify flag.
115   friend class MutationsImpl;
116   // FactoryImpl sets CanModify flag.
117   friend class FactoryImpl;
118 
119   void setRole(NodeRole NR);
120 
121   Tree *Parent;
122   Node *NextSibling;
123   Node *PreviousSibling;
124   unsigned Kind : 16;
125   unsigned Role : 8;
126   unsigned Original : 1;
127   unsigned CanModify : 1;
128 };
129 
130 /// A leaf node points to a single token.
131 // FIXME: add TokenKind field (borrow some bits from the Node::kind).
132 class Leaf final : public Node {
133 public:
134   Leaf(TokenManager::Key K);
135   static bool classof(const Node *N);
136 
137   TokenManager::Key getTokenKey() const { return K; }
138 
139 private:
140   TokenManager::Key K;
141 };
142 
143 /// A node that has children and represents a syntactic language construct.
144 class Tree : public Node {
145   /// Iterator over children (common base for const/non-const).
146   /// Not invalidated by tree mutations (holds a stable node pointer).
147   template <typename DerivedT, typename NodeT>
148   class ChildIteratorBase
149       : public llvm::iterator_facade_base<DerivedT, std::forward_iterator_tag,
150                                           NodeT> {
151   protected:
152     NodeT *N = nullptr;
153     using Base = ChildIteratorBase;
154 
155   public:
156     ChildIteratorBase() = default;
157     explicit ChildIteratorBase(NodeT *N) : N(N) {}
158 
159     friend bool operator==(const DerivedT &LHS, const DerivedT &RHS) {
160       return LHS.N == RHS.N;
161     }
162 
163     NodeT &operator*() const { return *N; }
164     DerivedT &operator++() {
165       N = N->getNextSibling();
166       return *static_cast<DerivedT *>(this);
167     }
168 
169     /// Truthy if valid (not past-the-end).
170     /// This allows: if (auto It = find_if(N.children(), ...) )
171     explicit operator bool() const { return N != nullptr; }
172     /// The element, or nullptr if past-the-end.
173     NodeT *asPointer() const { return N; }
174   };
175 
176 public:
177   static bool classof(const Node *N);
178 
179   Node *getFirstChild() { return FirstChild; }
180   const Node *getFirstChild() const { return FirstChild; }
181   Node *getLastChild() { return LastChild; }
182   const Node *getLastChild() const { return LastChild; }
183 
184   const Leaf *findFirstLeaf() const;
185   Leaf *findFirstLeaf() {
186     return const_cast<Leaf *>(const_cast<const Tree *>(this)->findFirstLeaf());
187   }
188 
189   const Leaf *findLastLeaf() const;
190   Leaf *findLastLeaf() {
191     return const_cast<Leaf *>(const_cast<const Tree *>(this)->findLastLeaf());
192   }
193 
194   /// child_iterator is not invalidated by mutations.
195   struct ChildIterator : ChildIteratorBase<ChildIterator, Node> {
196     using Base::ChildIteratorBase;
197   };
198   struct ConstChildIterator
199       : ChildIteratorBase<ConstChildIterator, const Node> {
200     using Base::ChildIteratorBase;
201     ConstChildIterator() = default;
202     ConstChildIterator(const ChildIterator &I) : Base(I.asPointer()) {}
203   };
204 
205   llvm::iterator_range<ChildIterator> getChildren() {
206     return {ChildIterator(getFirstChild()), ChildIterator()};
207   }
208   llvm::iterator_range<ConstChildIterator> getChildren() const {
209     return {ConstChildIterator(getFirstChild()), ConstChildIterator()};
210   }
211 
212   /// Find the first node with a corresponding role.
213   const Node *findChild(NodeRole R) const;
214   Node *findChild(NodeRole R) {
215     return const_cast<Node *>(const_cast<const Tree *>(this)->findChild(R));
216   }
217 
218 protected:
219   using Node::Node;
220 
221 private:
222   /// Append \p Child to the list of children and sets the parent pointer.
223   /// A very low-level operation that does not check any invariants, only used
224   /// by TreeBuilder and FactoryImpl.
225   /// EXPECTS: Role != Detached.
226   void appendChildLowLevel(Node *Child, NodeRole Role);
227   /// Similar but prepends.
228   void prependChildLowLevel(Node *Child, NodeRole Role);
229 
230   /// Like the previous overloads, but does not set role for \p Child.
231   /// EXPECTS: Child->Role != Detached
232   void appendChildLowLevel(Node *Child);
233   void prependChildLowLevel(Node *Child);
234   friend class TreeBuilder;
235   friend class FactoryImpl;
236 
237   /// Replace a range of children [Begin, End) with a list of
238   /// new nodes starting at \p New.
239   /// Only used by MutationsImpl to implement higher-level mutation operations.
240   /// (!) \p New can be null to model removal of the child range.
241   /// (!) \p End can be null to model one past the end.
242   /// (!) \p Begin can be null to model an append.
243   void replaceChildRangeLowLevel(Node *Begin, Node *End, Node *New);
244   friend class MutationsImpl;
245 
246   Node *FirstChild = nullptr;
247   Node *LastChild = nullptr;
248 };
249 
250 /// A list of Elements separated or terminated by a fixed token.
251 ///
252 /// This type models the following grammar construct:
253 /// delimited-list(element, delimiter, termination, canBeEmpty)
254 class List : public Tree {
255 public:
256   template <typename Element> struct ElementAndDelimiter {
257     Element *element;
258     Leaf *delimiter;
259   };
260 
261   enum class TerminationKind {
262     Terminated,
263     MaybeTerminated,
264     Separated,
265   };
266 
267   using Tree::Tree;
268   static bool classof(const Node *N);
269   /// Returns the elements and corresponding delimiters. Missing elements
270   /// and delimiters are represented as null pointers.
271   ///
272   /// For example, in a separated list:
273   /// "a, b, c"  <=> [("a" , ","), ("b" , "," ), ("c" , null)]
274   /// "a,  , c"  <=> [("a" , ","), (null, "," ), ("c" , null)]
275   /// "a, b  c"  <=> [("a" , ","), ("b" , null), ("c" , null)]
276   /// "a, b,"    <=> [("a" , ","), ("b" , "," ), (null, null)]
277   ///
278   /// In a terminated or maybe-terminated list:
279   /// "a; b; c;" <=> [("a" , ";"), ("b" , ";" ), ("c" , ";" )]
280   /// "a;  ; c;" <=> [("a" , ";"), (null, ";" ), ("c" , ";" )]
281   /// "a; b  c;" <=> [("a" , ";"), ("b" , null), ("c" , ";" )]
282   /// "a; b; c"  <=> [("a" , ";"), ("b" , ";" ), ("c" , null)]
283   std::vector<ElementAndDelimiter<Node>> getElementsAsNodesAndDelimiters();
284 
285   /// Returns the elements of the list. Missing elements are represented
286   /// as null pointers in the same way as in the return value of
287   /// `getElementsAsNodesAndDelimiters()`.
288   std::vector<Node *> getElementsAsNodes();
289 
290   // These can't be implemented with the information we have!
291 
292   /// Returns the appropriate delimiter for this list.
293   ///
294   /// Useful for discovering the correct delimiter to use when adding
295   /// elements to empty or one-element lists.
296   clang::tok::TokenKind getDelimiterTokenKind() const;
297 
298   TerminationKind getTerminationKind() const;
299 
300   /// Whether this list can be empty in syntactically and semantically correct
301   /// code.
302   ///
303   /// This list may be empty when the source code has errors even if
304   /// canBeEmpty() returns false.
305   bool canBeEmpty() const;
306 };
307 
308 } // namespace syntax
309 } // namespace clang
310 
311 #endif
312