1 //===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines nodes for use within a SuffixTree.
10 //
11 // Each node has either no children or at least two children, with the root
12 // being a exception in the empty tree.
13 //
14 // Children are represented as a map between unsigned integers and nodes. If
15 // a node N has a child M on unsigned integer k, then the mapping represented
16 // by N is a proper prefix of the mapping represented by M. Note that this,
17 // although similar to a trie is somewhat different: each node stores a full
18 // substring of the full mapping rather than a single character state.
19 //
20 // Each internal node contains a pointer to the internal node representing
21 // the same string, but with the first character chopped off. This is stored
22 // in \p Link. Each leaf node stores the start index of its respective
23 // suffix in \p SuffixIdx.
24 //===----------------------------------------------------------------------===//
25 
26 #ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H
27 #define LLVM_SUPPORT_SUFFIXTREE_NODE_H
28 #include "llvm/ADT/DenseMap.h"
29 
30 namespace llvm {
31 
32 /// A node in a suffix tree which represents a substring or suffix.
33 struct SuffixTreeNode {
34 public:
35   /// Represents an undefined index in the suffix tree.
36   static const unsigned EmptyIdx = -1;
37   enum class NodeKind { ST_Leaf, ST_Internal };
38 
39 private:
40   const NodeKind Kind;
41 
42   /// The start index of this node's substring in the main string.
43   unsigned StartIdx = EmptyIdx;
44 
45   /// The length of the string formed by concatenating the edge labels from
46   /// the root to this node.
47   unsigned ConcatLen = 0;
48 
49 public:
50   // LLVM RTTI boilerplate.
51   NodeKind getKind() const { return Kind; }
52 
53   /// \return the start index of this node's substring in the entire string.
54   unsigned getStartIdx() const;
55 
56   /// \returns the end index of this node.
57   virtual unsigned getEndIdx() const = 0;
58 
59   /// Advance this node's StartIdx by \p Inc.
60   void incrementStartIdx(unsigned Inc);
61 
62   /// Set the length of the string from the root to this node to \p Len.
63   void setConcatLen(unsigned Len);
64 
65   /// \returns the length of the string from the root to this node.
66   unsigned getConcatLen() const;
67 
68   SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
69       : Kind(Kind), StartIdx(StartIdx) {}
70   virtual ~SuffixTreeNode() = default;
71 };
72 
73 // A node with two or more children, or the root.
74 struct SuffixTreeInternalNode : SuffixTreeNode {
75 private:
76   /// The end index of this node's substring in the main string.
77   ///
78   /// Every leaf node must have its \p EndIdx incremented at the end of every
79   /// step in the construction algorithm. To avoid having to update O(N)
80   /// nodes individually at the end of every step, the end index is stored
81   /// as a pointer.
82   unsigned EndIdx = EmptyIdx;
83 
84   /// A pointer to the internal node representing the same sequence with the
85   /// first character chopped off.
86   ///
87   /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
88   /// Ukkonen's algorithm does to achieve linear-time construction is
89   /// keep track of which node the next insert should be at. This makes each
90   /// insert O(1), and there are a total of O(N) inserts. The suffix link
91   /// helps with inserting children of internal nodes.
92   ///
93   /// Say we add a child to an internal node with associated mapping S. The
94   /// next insertion must be at the node representing S - its first character.
95   /// This is given by the way that we iteratively build the tree in Ukkonen's
96   /// algorithm. The main idea is to look at the suffixes of each prefix in the
97   /// string, starting with the longest suffix of the prefix, and ending with
98   /// the shortest. Therefore, if we keep pointers between such nodes, we can
99   /// move to the next insertion point in O(1) time. If we don't, then we'd
100   /// have to query from the root, which takes O(N) time. This would make the
101   /// construction algorithm O(N^2) rather than O(N).
102   SuffixTreeInternalNode *Link = nullptr;
103 
104 public:
105   // LLVM RTTI boilerplate.
106   static bool classof(const SuffixTreeNode *N) {
107     return N->getKind() == NodeKind::ST_Internal;
108   }
109 
110   /// \returns true if this node is the root of its owning \p SuffixTree.
111   bool isRoot() const;
112 
113   /// \returns the end index of this node's substring in the entire string.
114   unsigned getEndIdx() const override;
115 
116   /// Sets \p Link to \p L. Assumes \p L is not null.
117   void setLink(SuffixTreeInternalNode *L);
118 
119   /// \returns the pointer to the Link node.
120   SuffixTreeInternalNode *getLink() const;
121 
122   /// The children of this node.
123   ///
124   /// A child existing on an unsigned integer implies that from the mapping
125   /// represented by the current node, there is a way to reach another
126   /// mapping by tacking that character on the end of the current string.
127   DenseMap<unsigned, SuffixTreeNode *> Children;
128 
129   SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
130                          SuffixTreeInternalNode *Link)
131       : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
132         Link(Link) {}
133 
134   virtual ~SuffixTreeInternalNode() = default;
135 };
136 
137 // A node representing a suffix.
138 struct SuffixTreeLeafNode : SuffixTreeNode {
139 private:
140   /// The start index of the suffix represented by this leaf.
141   unsigned SuffixIdx = EmptyIdx;
142 
143   /// The end index of this node's substring in the main string.
144   ///
145   /// Every leaf node must have its \p EndIdx incremented at the end of every
146   /// step in the construction algorithm. To avoid having to update O(N)
147   /// nodes individually at the end of every step, the end index is stored
148   /// as a pointer.
149   unsigned *EndIdx = nullptr;
150 
151 public:
152   // LLVM RTTI boilerplate.
153   static bool classof(const SuffixTreeNode *N) {
154     return N->getKind() == NodeKind::ST_Leaf;
155   }
156 
157   /// \returns the end index of this node's substring in the entire string.
158   unsigned getEndIdx() const override;
159 
160   /// \returns the start index of the suffix represented by this leaf.
161   unsigned getSuffixIdx() const;
162 
163   /// Sets the start index of the suffix represented by this leaf to \p Idx.
164   void setSuffixIdx(unsigned Idx);
165   SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
166       : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
167 
168   virtual ~SuffixTreeLeafNode() = default;
169 };
170 } // namespace llvm
171 #endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H