1 //===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the Suffix Tree class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/Support/SuffixTree.h"
14 #include "llvm/Support/Allocator.h"
15 #include "llvm/Support/Casting.h"
16 #include "llvm/Support/SuffixTreeNode.h"
17 
18 using namespace llvm;
19 
20 /// \returns the number of elements in the substring associated with \p N.
numElementsInSubstring(const SuffixTreeNode * N)21 static size_t numElementsInSubstring(const SuffixTreeNode *N) {
22   assert(N && "Got a null node?");
23   if (auto *Internal = dyn_cast<SuffixTreeInternalNode>(N))
24     if (Internal->isRoot())
25       return 0;
26   return N->getEndIdx() - N->getStartIdx() + 1;
27 }
28 
SuffixTree(const ArrayRef<unsigned> & Str)29 SuffixTree::SuffixTree(const ArrayRef<unsigned> &Str) : Str(Str) {
30   Root = insertRoot();
31   Active.Node = Root;
32 
33   // Keep track of the number of suffixes we have to add of the current
34   // prefix.
35   unsigned SuffixesToAdd = 0;
36 
37   // Construct the suffix tree iteratively on each prefix of the string.
38   // PfxEndIdx is the end index of the current prefix.
39   // End is one past the last element in the string.
40   for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
41     SuffixesToAdd++;
42     LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
43     SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
44   }
45 
46   // Set the suffix indices of each leaf.
47   assert(Root && "Root node can't be nullptr!");
48   setSuffixIndices();
49 }
50 
insertLeaf(SuffixTreeInternalNode & Parent,unsigned StartIdx,unsigned Edge)51 SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeInternalNode &Parent,
52                                        unsigned StartIdx, unsigned Edge) {
53   assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
54   auto *N = new (LeafNodeAllocator.Allocate())
55       SuffixTreeLeafNode(StartIdx, &LeafEndIdx);
56   Parent.Children[Edge] = N;
57   return N;
58 }
59 
60 SuffixTreeInternalNode *
insertInternalNode(SuffixTreeInternalNode * Parent,unsigned StartIdx,unsigned EndIdx,unsigned Edge)61 SuffixTree::insertInternalNode(SuffixTreeInternalNode *Parent,
62                                unsigned StartIdx, unsigned EndIdx,
63                                unsigned Edge) {
64   assert(StartIdx <= EndIdx && "String can't start after it ends!");
65   assert(!(!Parent && StartIdx != SuffixTreeNode::EmptyIdx) &&
66          "Non-root internal nodes must have parents!");
67   auto *N = new (InternalNodeAllocator.Allocate())
68       SuffixTreeInternalNode(StartIdx, EndIdx, Root);
69   if (Parent)
70     Parent->Children[Edge] = N;
71   return N;
72 }
73 
insertRoot()74 SuffixTreeInternalNode *SuffixTree::insertRoot() {
75   return insertInternalNode(/*Parent = */ nullptr, SuffixTreeNode::EmptyIdx,
76                             SuffixTreeNode::EmptyIdx, /*Edge = */ 0);
77 }
78 
setSuffixIndices()79 void SuffixTree::setSuffixIndices() {
80   // List of nodes we need to visit along with the current length of the
81   // string.
82   SmallVector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
83 
84   // Current node being visited.
85   SuffixTreeNode *CurrNode = Root;
86 
87   // Sum of the lengths of the nodes down the path to the current one.
88   unsigned CurrNodeLen = 0;
89   ToVisit.push_back({CurrNode, CurrNodeLen});
90   while (!ToVisit.empty()) {
91     std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
92     ToVisit.pop_back();
93     // Length of the current node from the root down to here.
94     CurrNode->setConcatLen(CurrNodeLen);
95     if (auto *InternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode))
96       for (auto &ChildPair : InternalNode->Children) {
97         assert(ChildPair.second && "Node had a null child!");
98         ToVisit.push_back(
99             {ChildPair.second,
100              CurrNodeLen + numElementsInSubstring(ChildPair.second)});
101       }
102     // No children, so we are at the end of the string.
103     if (auto *LeafNode = dyn_cast<SuffixTreeLeafNode>(CurrNode))
104       LeafNode->setSuffixIdx(Str.size() - CurrNodeLen);
105   }
106 }
107 
extend(unsigned EndIdx,unsigned SuffixesToAdd)108 unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
109   SuffixTreeInternalNode *NeedsLink = nullptr;
110 
111   while (SuffixesToAdd > 0) {
112 
113     // Are we waiting to add anything other than just the last character?
114     if (Active.Len == 0) {
115       // If not, then say the active index is the end index.
116       Active.Idx = EndIdx;
117     }
118 
119     assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
120 
121     // The first character in the current substring we're looking at.
122     unsigned FirstChar = Str[Active.Idx];
123 
124     // Have we inserted anything starting with FirstChar at the current node?
125     if (Active.Node->Children.count(FirstChar) == 0) {
126       // If not, then we can just insert a leaf and move to the next step.
127       insertLeaf(*Active.Node, EndIdx, FirstChar);
128 
129       // The active node is an internal node, and we visited it, so it must
130       // need a link if it doesn't have one.
131       if (NeedsLink) {
132         NeedsLink->setLink(Active.Node);
133         NeedsLink = nullptr;
134       }
135     } else {
136       // There's a match with FirstChar, so look for the point in the tree to
137       // insert a new node.
138       SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
139 
140       unsigned SubstringLen = numElementsInSubstring(NextNode);
141 
142       // Is the current suffix we're trying to insert longer than the size of
143       // the child we want to move to?
144       if (Active.Len >= SubstringLen) {
145         // If yes, then consume the characters we've seen and move to the next
146         // node.
147         assert(isa<SuffixTreeInternalNode>(NextNode) &&
148                "Expected an internal node?");
149         Active.Idx += SubstringLen;
150         Active.Len -= SubstringLen;
151         Active.Node = cast<SuffixTreeInternalNode>(NextNode);
152         continue;
153       }
154 
155       // Otherwise, the suffix we're trying to insert must be contained in the
156       // next node we want to move to.
157       unsigned LastChar = Str[EndIdx];
158 
159       // Is the string we're trying to insert a substring of the next node?
160       if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) {
161         // If yes, then we're done for this step. Remember our insertion point
162         // and move to the next end index. At this point, we have an implicit
163         // suffix tree.
164         if (NeedsLink && !Active.Node->isRoot()) {
165           NeedsLink->setLink(Active.Node);
166           NeedsLink = nullptr;
167         }
168 
169         Active.Len++;
170         break;
171       }
172 
173       // The string we're trying to insert isn't a substring of the next node,
174       // but matches up to a point. Split the node.
175       //
176       // For example, say we ended our search at a node n and we're trying to
177       // insert ABD. Then we'll create a new node s for AB, reduce n to just
178       // representing C, and insert a new leaf node l to represent d. This
179       // allows us to ensure that if n was a leaf, it remains a leaf.
180       //
181       //   | ABC  ---split--->  | AB
182       //   n                    s
183       //                     C / \ D
184       //                      n   l
185 
186       // The node s from the diagram
187       SuffixTreeInternalNode *SplitNode = insertInternalNode(
188           Active.Node, NextNode->getStartIdx(),
189           NextNode->getStartIdx() + Active.Len - 1, FirstChar);
190 
191       // Insert the new node representing the new substring into the tree as
192       // a child of the split node. This is the node l from the diagram.
193       insertLeaf(*SplitNode, EndIdx, LastChar);
194 
195       // Make the old node a child of the split node and update its start
196       // index. This is the node n from the diagram.
197       NextNode->incrementStartIdx(Active.Len);
198       SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode;
199 
200       // SplitNode is an internal node, update the suffix link.
201       if (NeedsLink)
202         NeedsLink->setLink(SplitNode);
203 
204       NeedsLink = SplitNode;
205     }
206 
207     // We've added something new to the tree, so there's one less suffix to
208     // add.
209     SuffixesToAdd--;
210 
211     if (Active.Node->isRoot()) {
212       if (Active.Len > 0) {
213         Active.Len--;
214         Active.Idx = EndIdx - SuffixesToAdd + 1;
215       }
216     } else {
217       // Start the next phase at the next smallest suffix.
218       Active.Node = Active.Node->getLink();
219     }
220   }
221 
222   return SuffixesToAdd;
223 }
224 
advance()225 void SuffixTree::RepeatedSubstringIterator::advance() {
226   // Clear the current state. If we're at the end of the range, then this
227   // is the state we want to be in.
228   RS = RepeatedSubstring();
229   N = nullptr;
230 
231   // Each leaf node represents a repeat of a string.
232   SmallVector<unsigned> RepeatedSubstringStarts;
233 
234   // Continue visiting nodes until we find one which repeats more than once.
235   while (!InternalNodesToVisit.empty()) {
236     RepeatedSubstringStarts.clear();
237     auto *Curr = InternalNodesToVisit.back();
238     InternalNodesToVisit.pop_back();
239 
240     // Keep track of the length of the string associated with the node. If
241     // it's too short, we'll quit.
242     unsigned Length = Curr->getConcatLen();
243 
244     // Iterate over each child, saving internal nodes for visiting, and
245     // leaf nodes in LeafChildren. Internal nodes represent individual
246     // strings, which may repeat.
247     for (auto &ChildPair : Curr->Children) {
248       // Save all of this node's children for processing.
249       if (auto *InternalChild =
250               dyn_cast<SuffixTreeInternalNode>(ChildPair.second)) {
251         InternalNodesToVisit.push_back(InternalChild);
252         continue;
253       }
254 
255       if (Length < MinLength)
256         continue;
257 
258       // Have an occurrence of a potentially repeated string. Save it.
259       auto *Leaf = cast<SuffixTreeLeafNode>(ChildPair.second);
260       RepeatedSubstringStarts.push_back(Leaf->getSuffixIdx());
261     }
262 
263     // The root never represents a repeated substring. If we're looking at
264     // that, then skip it.
265     if (Curr->isRoot())
266       continue;
267 
268     // Do we have any repeated substrings?
269     if (RepeatedSubstringStarts.size() < 2)
270       continue;
271 
272     // Yes. Update the state to reflect this, and then bail out.
273     N = Curr;
274     RS.Length = Length;
275     for (unsigned StartIdx : RepeatedSubstringStarts)
276       RS.StartIndices.push_back(StartIdx);
277     break;
278   }
279   // At this point, either NewRS is an empty RepeatedSubstring, or it was
280   // set in the above loop. Similarly, N is either nullptr, or the node
281   // associated with NewRS.
282 }
283