1 //===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // A data structure for fast substring queries.
9 //
10 // Suffix trees represent the suffixes of their input strings in their leaves.
11 // A suffix tree is a type of compressed trie structure where each node
12 // represents an entire substring rather than a single character. Each leaf
13 // of the tree is a suffix.
14 //
15 // A suffix tree can be seen as a type of state machine where each state is a
16 // substring of the full string. The tree is structured so that, for a string
17 // of length N, there are exactly N leaves in the tree. This structure allows
18 // us to quickly find repeated substrings of the input string.
19 //
20 // In this implementation, a "string" is a vector of unsigned integers.
21 // These integers may result from hashing some data type. A suffix tree can
22 // contain 1 or many strings, which can then be queried as one large string.
23 //
24 // The suffix tree is implemented using Ukkonen's algorithm for linear-time
25 // suffix tree construction. Ukkonen's algorithm is explained in more detail
26 // in the paper by Esko Ukkonen "On-line construction of suffix trees. The
27 // paper is available at
28 //
29 // https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
30 //===----------------------------------------------------------------------===//
31 
32 #ifndef LLVM_SUPPORT_SUFFIXTREE_H
33 #define LLVM_SUPPORT_SUFFIXTREE_H
34 
35 #include "llvm/ADT/ArrayRef.h"
36 #include "llvm/Support/Allocator.h"
37 #include "llvm/Support/SuffixTreeNode.h"
38 
39 namespace llvm {
40 class SuffixTree {
41 public:
42   /// Each element is an integer representing an instruction in the module.
43   ArrayRef<unsigned> Str;
44 
45   /// A repeated substring in the tree.
46   struct RepeatedSubstring {
47     /// The length of the string.
48     unsigned Length;
49 
50     /// The start indices of each occurrence.
51     SmallVector<unsigned> StartIndices;
52   };
53 
54 private:
55   /// Maintains internal nodes in the tree.
56   SpecificBumpPtrAllocator<SuffixTreeInternalNode> InternalNodeAllocator;
57   /// Maintains leaf nodes in the tree.
58   SpecificBumpPtrAllocator<SuffixTreeLeafNode> LeafNodeAllocator;
59 
60   /// The root of the suffix tree.
61   ///
62   /// The root represents the empty string. It is maintained by the
63   /// \p NodeAllocator like every other node in the tree.
64   SuffixTreeInternalNode *Root = nullptr;
65 
66   /// The end index of each leaf in the tree.
67   unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx;
68 
69   /// Helper struct which keeps track of the next insertion point in
70   /// Ukkonen's algorithm.
71   struct ActiveState {
72     /// The next node to insert at.
73     SuffixTreeInternalNode *Node = nullptr;
74 
75     /// The index of the first character in the substring currently being added.
76     unsigned Idx = SuffixTreeNode::EmptyIdx;
77 
78     /// The length of the substring we have to add at the current step.
79     unsigned Len = 0;
80   };
81 
82   /// The point the next insertion will take place at in the
83   /// construction algorithm.
84   ActiveState Active;
85 
86   /// Allocate a leaf node and add it to the tree.
87   ///
88   /// \param Parent The parent of this node.
89   /// \param StartIdx The start index of this node's associated string.
90   /// \param Edge The label on the edge leaving \p Parent to this node.
91   ///
92   /// \returns A pointer to the allocated leaf node.
93   SuffixTreeNode *insertLeaf(SuffixTreeInternalNode &Parent, unsigned StartIdx,
94                              unsigned Edge);
95 
96   /// Allocate an internal node and add it to the tree.
97   ///
98   /// \param Parent The parent of this node. Only null when allocating the root.
99   /// \param StartIdx The start index of this node's associated string.
100   /// \param EndIdx The end index of this node's associated string.
101   /// \param Edge The label on the edge leaving \p Parent to this node.
102   ///
103   /// \returns A pointer to the allocated internal node.
104   SuffixTreeInternalNode *insertInternalNode(SuffixTreeInternalNode *Parent,
105                                              unsigned StartIdx, unsigned EndIdx,
106                                              unsigned Edge);
107 
108   /// Allocate the root node and add it to the tree.
109   ///
110   /// \returns A pointer to the root.
111   SuffixTreeInternalNode *insertRoot();
112 
113   /// Set the suffix indices of the leaves to the start indices of their
114   /// respective suffixes.
115   void setSuffixIndices();
116 
117   /// Construct the suffix tree for the prefix of the input ending at
118   /// \p EndIdx.
119   ///
120   /// Used to construct the full suffix tree iteratively. At the end of each
121   /// step, the constructed suffix tree is either a valid suffix tree, or a
122   /// suffix tree with implicit suffixes. At the end of the final step, the
123   /// suffix tree is a valid tree.
124   ///
125   /// \param EndIdx The end index of the current prefix in the main string.
126   /// \param SuffixesToAdd The number of suffixes that must be added
127   /// to complete the suffix tree at the current phase.
128   ///
129   /// \returns The number of suffixes that have not been added at the end of
130   /// this step.
131   unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
132 
133 public:
134   /// Construct a suffix tree from a sequence of unsigned integers.
135   ///
136   /// \param Str The string to construct the suffix tree for.
137   SuffixTree(const ArrayRef<unsigned> &Str);
138 
139   /// Iterator for finding all repeated substrings in the suffix tree.
140   struct RepeatedSubstringIterator {
141   private:
142     /// The current node we're visiting.
143     SuffixTreeNode *N = nullptr;
144 
145     /// The repeated substring associated with this node.
146     RepeatedSubstring RS;
147 
148     /// The nodes left to visit.
149     SmallVector<SuffixTreeInternalNode *> InternalNodesToVisit;
150 
151     /// The minimum length of a repeated substring to find.
152     /// Since we're outlining, we want at least two instructions in the range.
153     /// FIXME: This may not be true for targets like X86 which support many
154     /// instruction lengths.
155     const unsigned MinLength = 2;
156 
157     /// Move the iterator to the next repeated substring.
158     void advance();
159 
160   public:
161     /// Return the current repeated substring.
162     RepeatedSubstring &operator*() { return RS; }
163 
164     RepeatedSubstringIterator &operator++() {
165       advance();
166       return *this;
167     }
168 
169     RepeatedSubstringIterator operator++(int I) {
170       RepeatedSubstringIterator It(*this);
171       advance();
172       return It;
173     }
174 
175     bool operator==(const RepeatedSubstringIterator &Other) const {
176       return N == Other.N;
177     }
178     bool operator!=(const RepeatedSubstringIterator &Other) const {
179       return !(*this == Other);
180     }
181 
182     RepeatedSubstringIterator(SuffixTreeInternalNode *N) : N(N) {
183       // Do we have a non-null node?
184       if (!N)
185         return;
186       // Yes. At the first step, we need to visit all of N's children.
187       // Note: This means that we visit N last.
188       InternalNodesToVisit.push_back(N);
189       advance();
190     }
191   };
192 
193   typedef RepeatedSubstringIterator iterator;
194   iterator begin() { return iterator(Root); }
195   iterator end() { return iterator(nullptr); }
196 };
197 
198 } // namespace llvm
199 
200 #endif // LLVM_SUPPORT_SUFFIXTREE_H
201