1 //===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // A data structure for fast substring queries. 9 // 10 // Suffix trees represent the suffixes of their input strings in their leaves. 11 // A suffix tree is a type of compressed trie structure where each node 12 // represents an entire substring rather than a single character. Each leaf 13 // of the tree is a suffix. 14 // 15 // A suffix tree can be seen as a type of state machine where each state is a 16 // substring of the full string. The tree is structured so that, for a string 17 // of length N, there are exactly N leaves in the tree. This structure allows 18 // us to quickly find repeated substrings of the input string. 19 // 20 // In this implementation, a "string" is a vector of unsigned integers. 21 // These integers may result from hashing some data type. A suffix tree can 22 // contain 1 or many strings, which can then be queried as one large string. 23 // 24 // The suffix tree is implemented using Ukkonen's algorithm for linear-time 25 // suffix tree construction. Ukkonen's algorithm is explained in more detail 26 // in the paper by Esko Ukkonen "On-line construction of suffix trees. The 27 // paper is available at 28 // 29 // https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf 30 //===----------------------------------------------------------------------===// 31 32 #ifndef LLVM_SUPPORT_SUFFIXTREE_H 33 #define LLVM_SUPPORT_SUFFIXTREE_H 34 35 #include "llvm/ADT/ArrayRef.h" 36 #include "llvm/Support/Allocator.h" 37 #include "llvm/Support/SuffixTreeNode.h" 38 39 namespace llvm { 40 class SuffixTree { 41 public: 42 /// Each element is an integer representing an instruction in the module. 43 ArrayRef<unsigned> Str; 44 45 /// A repeated substring in the tree. 46 struct RepeatedSubstring { 47 /// The length of the string. 48 unsigned Length; 49 50 /// The start indices of each occurrence. 51 SmallVector<unsigned> StartIndices; 52 }; 53 54 private: 55 /// Maintains internal nodes in the tree. 56 SpecificBumpPtrAllocator<SuffixTreeInternalNode> InternalNodeAllocator; 57 /// Maintains leaf nodes in the tree. 58 SpecificBumpPtrAllocator<SuffixTreeLeafNode> LeafNodeAllocator; 59 60 /// The root of the suffix tree. 61 /// 62 /// The root represents the empty string. It is maintained by the 63 /// \p NodeAllocator like every other node in the tree. 64 SuffixTreeInternalNode *Root = nullptr; 65 66 /// The end index of each leaf in the tree. 67 unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx; 68 69 /// Helper struct which keeps track of the next insertion point in 70 /// Ukkonen's algorithm. 71 struct ActiveState { 72 /// The next node to insert at. 73 SuffixTreeInternalNode *Node = nullptr; 74 75 /// The index of the first character in the substring currently being added. 76 unsigned Idx = SuffixTreeNode::EmptyIdx; 77 78 /// The length of the substring we have to add at the current step. 79 unsigned Len = 0; 80 }; 81 82 /// The point the next insertion will take place at in the 83 /// construction algorithm. 84 ActiveState Active; 85 86 /// Allocate a leaf node and add it to the tree. 87 /// 88 /// \param Parent The parent of this node. 89 /// \param StartIdx The start index of this node's associated string. 90 /// \param Edge The label on the edge leaving \p Parent to this node. 91 /// 92 /// \returns A pointer to the allocated leaf node. 93 SuffixTreeNode *insertLeaf(SuffixTreeInternalNode &Parent, unsigned StartIdx, 94 unsigned Edge); 95 96 /// Allocate an internal node and add it to the tree. 97 /// 98 /// \param Parent The parent of this node. Only null when allocating the root. 99 /// \param StartIdx The start index of this node's associated string. 100 /// \param EndIdx The end index of this node's associated string. 101 /// \param Edge The label on the edge leaving \p Parent to this node. 102 /// 103 /// \returns A pointer to the allocated internal node. 104 SuffixTreeInternalNode *insertInternalNode(SuffixTreeInternalNode *Parent, 105 unsigned StartIdx, unsigned EndIdx, 106 unsigned Edge); 107 108 /// Allocate the root node and add it to the tree. 109 /// 110 /// \returns A pointer to the root. 111 SuffixTreeInternalNode *insertRoot(); 112 113 /// Set the suffix indices of the leaves to the start indices of their 114 /// respective suffixes. 115 void setSuffixIndices(); 116 117 /// Construct the suffix tree for the prefix of the input ending at 118 /// \p EndIdx. 119 /// 120 /// Used to construct the full suffix tree iteratively. At the end of each 121 /// step, the constructed suffix tree is either a valid suffix tree, or a 122 /// suffix tree with implicit suffixes. At the end of the final step, the 123 /// suffix tree is a valid tree. 124 /// 125 /// \param EndIdx The end index of the current prefix in the main string. 126 /// \param SuffixesToAdd The number of suffixes that must be added 127 /// to complete the suffix tree at the current phase. 128 /// 129 /// \returns The number of suffixes that have not been added at the end of 130 /// this step. 131 unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd); 132 133 public: 134 /// Construct a suffix tree from a sequence of unsigned integers. 135 /// 136 /// \param Str The string to construct the suffix tree for. 137 SuffixTree(const ArrayRef<unsigned> &Str); 138 139 /// Iterator for finding all repeated substrings in the suffix tree. 140 struct RepeatedSubstringIterator { 141 private: 142 /// The current node we're visiting. 143 SuffixTreeNode *N = nullptr; 144 145 /// The repeated substring associated with this node. 146 RepeatedSubstring RS; 147 148 /// The nodes left to visit. 149 SmallVector<SuffixTreeInternalNode *> InternalNodesToVisit; 150 151 /// The minimum length of a repeated substring to find. 152 /// Since we're outlining, we want at least two instructions in the range. 153 /// FIXME: This may not be true for targets like X86 which support many 154 /// instruction lengths. 155 const unsigned MinLength = 2; 156 157 /// Move the iterator to the next repeated substring. 158 void advance(); 159 160 public: 161 /// Return the current repeated substring. 162 RepeatedSubstring &operator*() { return RS; } 163 164 RepeatedSubstringIterator &operator++() { 165 advance(); 166 return *this; 167 } 168 169 RepeatedSubstringIterator operator++(int I) { 170 RepeatedSubstringIterator It(*this); 171 advance(); 172 return It; 173 } 174 175 bool operator==(const RepeatedSubstringIterator &Other) const { 176 return N == Other.N; 177 } 178 bool operator!=(const RepeatedSubstringIterator &Other) const { 179 return !(*this == Other); 180 } 181 182 RepeatedSubstringIterator(SuffixTreeInternalNode *N) : N(N) { 183 // Do we have a non-null node? 184 if (!N) 185 return; 186 // Yes. At the first step, we need to visit all of N's children. 187 // Note: This means that we visit N last. 188 InternalNodesToVisit.push_back(N); 189 advance(); 190 } 191 }; 192 193 typedef RepeatedSubstringIterator iterator; 194 iterator begin() { return iterator(Root); } 195 iterator end() { return iterator(nullptr); } 196 }; 197 198 } // namespace llvm 199 200 #endif // LLVM_SUPPORT_SUFFIXTREE_H 201