1 //===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines the Suffix Tree class and Suffix Tree Node struct. 10 // 11 //===----------------------------------------------------------------------===// 12 #ifndef LLVM_SUPPORT_SUFFIXTREE_H 13 #define LLVM_SUPPORT_SUFFIXTREE_H 14 15 #include "llvm/ADT/ArrayRef.h" 16 #include "llvm/ADT/DenseMap.h" 17 #include "llvm/Support/Allocator.h" 18 #include <vector> 19 20 namespace llvm { 21 22 /// Represents an undefined index in the suffix tree. 23 const unsigned EmptyIdx = -1; 24 25 /// A node in a suffix tree which represents a substring or suffix. 26 /// 27 /// Each node has either no children or at least two children, with the root 28 /// being a exception in the empty tree. 29 /// 30 /// Children are represented as a map between unsigned integers and nodes. If 31 /// a node N has a child M on unsigned integer k, then the mapping represented 32 /// by N is a proper prefix of the mapping represented by M. Note that this, 33 /// although similar to a trie is somewhat different: each node stores a full 34 /// substring of the full mapping rather than a single character state. 35 /// 36 /// Each internal node contains a pointer to the internal node representing 37 /// the same string, but with the first character chopped off. This is stored 38 /// in \p Link. Each leaf node stores the start index of its respective 39 /// suffix in \p SuffixIdx. 40 struct SuffixTreeNode { 41 42 /// The children of this node. 43 /// 44 /// A child existing on an unsigned integer implies that from the mapping 45 /// represented by the current node, there is a way to reach another 46 /// mapping by tacking that character on the end of the current string. 47 llvm::DenseMap<unsigned, SuffixTreeNode *> Children; 48 49 /// The start index of this node's substring in the main string. 50 unsigned StartIdx = EmptyIdx; 51 52 /// The end index of this node's substring in the main string. 53 /// 54 /// Every leaf node must have its \p EndIdx incremented at the end of every 55 /// step in the construction algorithm. To avoid having to update O(N) 56 /// nodes individually at the end of every step, the end index is stored 57 /// as a pointer. 58 unsigned *EndIdx = nullptr; 59 60 /// For leaves, the start index of the suffix represented by this node. 61 /// 62 /// For all other nodes, this is ignored. 63 unsigned SuffixIdx = EmptyIdx; 64 65 /// For internal nodes, a pointer to the internal node representing 66 /// the same sequence with the first character chopped off. 67 /// 68 /// This acts as a shortcut in Ukkonen's algorithm. One of the things that 69 /// Ukkonen's algorithm does to achieve linear-time construction is 70 /// keep track of which node the next insert should be at. This makes each 71 /// insert O(1), and there are a total of O(N) inserts. The suffix link 72 /// helps with inserting children of internal nodes. 73 /// 74 /// Say we add a child to an internal node with associated mapping S. The 75 /// next insertion must be at the node representing S - its first character. 76 /// This is given by the way that we iteratively build the tree in Ukkonen's 77 /// algorithm. The main idea is to look at the suffixes of each prefix in the 78 /// string, starting with the longest suffix of the prefix, and ending with 79 /// the shortest. Therefore, if we keep pointers between such nodes, we can 80 /// move to the next insertion point in O(1) time. If we don't, then we'd 81 /// have to query from the root, which takes O(N) time. This would make the 82 /// construction algorithm O(N^2) rather than O(N). 83 SuffixTreeNode *Link = nullptr; 84 85 /// The length of the string formed by concatenating the edge labels from the 86 /// root to this node. 87 unsigned ConcatLen = 0; 88 89 /// Returns true if this node is a leaf. 90 bool isLeaf() const { return SuffixIdx != EmptyIdx; } 91 92 /// Returns true if this node is the root of its owning \p SuffixTree. 93 bool isRoot() const { return StartIdx == EmptyIdx; } 94 95 /// Return the number of elements in the substring associated with this node. 96 size_t size() const { 97 98 // Is it the root? If so, it's the empty string so return 0. 99 if (isRoot()) 100 return 0; 101 102 assert(*EndIdx != EmptyIdx && "EndIdx is undefined!"); 103 104 // Size = the number of elements in the string. 105 // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1. 106 return *EndIdx - StartIdx + 1; 107 } 108 109 SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link) 110 : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {} 111 112 SuffixTreeNode() = default; 113 }; 114 115 /// A data structure for fast substring queries. 116 /// 117 /// Suffix trees represent the suffixes of their input strings in their leaves. 118 /// A suffix tree is a type of compressed trie structure where each node 119 /// represents an entire substring rather than a single character. Each leaf 120 /// of the tree is a suffix. 121 /// 122 /// A suffix tree can be seen as a type of state machine where each state is a 123 /// substring of the full string. The tree is structured so that, for a string 124 /// of length N, there are exactly N leaves in the tree. This structure allows 125 /// us to quickly find repeated substrings of the input string. 126 /// 127 /// In this implementation, a "string" is a vector of unsigned integers. 128 /// These integers may result from hashing some data type. A suffix tree can 129 /// contain 1 or many strings, which can then be queried as one large string. 130 /// 131 /// The suffix tree is implemented using Ukkonen's algorithm for linear-time 132 /// suffix tree construction. Ukkonen's algorithm is explained in more detail 133 /// in the paper by Esko Ukkonen "On-line construction of suffix trees. The 134 /// paper is available at 135 /// 136 /// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf 137 class SuffixTree { 138 public: 139 /// Each element is an integer representing an instruction in the module. 140 llvm::ArrayRef<unsigned> Str; 141 142 /// A repeated substring in the tree. 143 struct RepeatedSubstring { 144 /// The length of the string. 145 unsigned Length; 146 147 /// The start indices of each occurrence. 148 std::vector<unsigned> StartIndices; 149 }; 150 151 private: 152 /// Maintains each node in the tree. 153 llvm::SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator; 154 155 /// The root of the suffix tree. 156 /// 157 /// The root represents the empty string. It is maintained by the 158 /// \p NodeAllocator like every other node in the tree. 159 SuffixTreeNode *Root = nullptr; 160 161 /// Maintains the end indices of the internal nodes in the tree. 162 /// 163 /// Each internal node is guaranteed to never have its end index change 164 /// during the construction algorithm; however, leaves must be updated at 165 /// every step. Therefore, we need to store leaf end indices by reference 166 /// to avoid updating O(N) leaves at every step of construction. Thus, 167 /// every internal node must be allocated its own end index. 168 llvm::BumpPtrAllocator InternalEndIdxAllocator; 169 170 /// The end index of each leaf in the tree. 171 unsigned LeafEndIdx = -1; 172 173 /// Helper struct which keeps track of the next insertion point in 174 /// Ukkonen's algorithm. 175 struct ActiveState { 176 /// The next node to insert at. 177 SuffixTreeNode *Node = nullptr; 178 179 /// The index of the first character in the substring currently being added. 180 unsigned Idx = EmptyIdx; 181 182 /// The length of the substring we have to add at the current step. 183 unsigned Len = 0; 184 }; 185 186 /// The point the next insertion will take place at in the 187 /// construction algorithm. 188 ActiveState Active; 189 190 /// Allocate a leaf node and add it to the tree. 191 /// 192 /// \param Parent The parent of this node. 193 /// \param StartIdx The start index of this node's associated string. 194 /// \param Edge The label on the edge leaving \p Parent to this node. 195 /// 196 /// \returns A pointer to the allocated leaf node. 197 SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx, 198 unsigned Edge); 199 200 /// Allocate an internal node and add it to the tree. 201 /// 202 /// \param Parent The parent of this node. Only null when allocating the root. 203 /// \param StartIdx The start index of this node's associated string. 204 /// \param EndIdx The end index of this node's associated string. 205 /// \param Edge The label on the edge leaving \p Parent to this node. 206 /// 207 /// \returns A pointer to the allocated internal node. 208 SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx, 209 unsigned EndIdx, unsigned Edge); 210 211 /// Set the suffix indices of the leaves to the start indices of their 212 /// respective suffixes. 213 void setSuffixIndices(); 214 215 /// Construct the suffix tree for the prefix of the input ending at 216 /// \p EndIdx. 217 /// 218 /// Used to construct the full suffix tree iteratively. At the end of each 219 /// step, the constructed suffix tree is either a valid suffix tree, or a 220 /// suffix tree with implicit suffixes. At the end of the final step, the 221 /// suffix tree is a valid tree. 222 /// 223 /// \param EndIdx The end index of the current prefix in the main string. 224 /// \param SuffixesToAdd The number of suffixes that must be added 225 /// to complete the suffix tree at the current phase. 226 /// 227 /// \returns The number of suffixes that have not been added at the end of 228 /// this step. 229 unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd); 230 231 public: 232 /// Construct a suffix tree from a sequence of unsigned integers. 233 /// 234 /// \param Str The string to construct the suffix tree for. 235 SuffixTree(const std::vector<unsigned> &Str); 236 237 /// Iterator for finding all repeated substrings in the suffix tree. 238 struct RepeatedSubstringIterator { 239 private: 240 /// The current node we're visiting. 241 SuffixTreeNode *N = nullptr; 242 243 /// The repeated substring associated with this node. 244 RepeatedSubstring RS; 245 246 /// The nodes left to visit. 247 std::vector<SuffixTreeNode *> ToVisit; 248 249 /// The minimum length of a repeated substring to find. 250 /// Since we're outlining, we want at least two instructions in the range. 251 /// FIXME: This may not be true for targets like X86 which support many 252 /// instruction lengths. 253 const unsigned MinLength = 2; 254 255 /// Move the iterator to the next repeated substring. 256 void advance() { 257 // Clear the current state. If we're at the end of the range, then this 258 // is the state we want to be in. 259 RS = RepeatedSubstring(); 260 N = nullptr; 261 262 // Each leaf node represents a repeat of a string. 263 std::vector<SuffixTreeNode *> LeafChildren; 264 265 // Continue visiting nodes until we find one which repeats more than once. 266 while (!ToVisit.empty()) { 267 SuffixTreeNode *Curr = ToVisit.back(); 268 ToVisit.pop_back(); 269 LeafChildren.clear(); 270 271 // Keep track of the length of the string associated with the node. If 272 // it's too short, we'll quit. 273 unsigned Length = Curr->ConcatLen; 274 275 // Iterate over each child, saving internal nodes for visiting, and 276 // leaf nodes in LeafChildren. Internal nodes represent individual 277 // strings, which may repeat. 278 for (auto &ChildPair : Curr->Children) { 279 // Save all of this node's children for processing. 280 if (!ChildPair.second->isLeaf()) 281 ToVisit.push_back(ChildPair.second); 282 283 // It's not an internal node, so it must be a leaf. If we have a 284 // long enough string, then save the leaf children. 285 else if (Length >= MinLength) 286 LeafChildren.push_back(ChildPair.second); 287 } 288 289 // The root never represents a repeated substring. If we're looking at 290 // that, then skip it. 291 if (Curr->isRoot()) 292 continue; 293 294 // Do we have any repeated substrings? 295 if (LeafChildren.size() >= 2) { 296 // Yes. Update the state to reflect this, and then bail out. 297 N = Curr; 298 RS.Length = Length; 299 for (SuffixTreeNode *Leaf : LeafChildren) 300 RS.StartIndices.push_back(Leaf->SuffixIdx); 301 break; 302 } 303 } 304 305 // At this point, either NewRS is an empty RepeatedSubstring, or it was 306 // set in the above loop. Similarly, N is either nullptr, or the node 307 // associated with NewRS. 308 } 309 310 public: 311 /// Return the current repeated substring. 312 RepeatedSubstring &operator*() { return RS; } 313 314 RepeatedSubstringIterator &operator++() { 315 advance(); 316 return *this; 317 } 318 319 RepeatedSubstringIterator operator++(int I) { 320 RepeatedSubstringIterator It(*this); 321 advance(); 322 return It; 323 } 324 325 bool operator==(const RepeatedSubstringIterator &Other) const { 326 return N == Other.N; 327 } 328 bool operator!=(const RepeatedSubstringIterator &Other) const { 329 return !(*this == Other); 330 } 331 332 RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) { 333 // Do we have a non-null node? 334 if (N) { 335 // Yes. At the first step, we need to visit all of N's children. 336 // Note: This means that we visit N last. 337 ToVisit.push_back(N); 338 advance(); 339 } 340 } 341 }; 342 343 typedef RepeatedSubstringIterator iterator; 344 iterator begin() { return iterator(Root); } 345 iterator end() { return iterator(nullptr); } 346 }; 347 348 } // namespace llvm 349 350 #endif // LLVM_SUPPORT_SUFFIXTREE_H 351