1 //===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines nodes for use within a SuffixTree. 10 // 11 // Each node has either no children or at least two children, with the root 12 // being a exception in the empty tree. 13 // 14 // Children are represented as a map between unsigned integers and nodes. If 15 // a node N has a child M on unsigned integer k, then the mapping represented 16 // by N is a proper prefix of the mapping represented by M. Note that this, 17 // although similar to a trie is somewhat different: each node stores a full 18 // substring of the full mapping rather than a single character state. 19 // 20 // Each internal node contains a pointer to the internal node representing 21 // the same string, but with the first character chopped off. This is stored 22 // in \p Link. Each leaf node stores the start index of its respective 23 // suffix in \p SuffixIdx. 24 //===----------------------------------------------------------------------===// 25 26 #ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H 27 #define LLVM_SUPPORT_SUFFIXTREE_NODE_H 28 #include "llvm/ADT/DenseMap.h" 29 30 namespace llvm { 31 32 /// A node in a suffix tree which represents a substring or suffix. 33 struct SuffixTreeNode { 34 public: 35 /// Represents an undefined index in the suffix tree. 36 static const unsigned EmptyIdx = -1; 37 enum class NodeKind { ST_Leaf, ST_Internal }; 38 39 private: 40 const NodeKind Kind; 41 42 /// The start index of this node's substring in the main string. 43 unsigned StartIdx = EmptyIdx; 44 45 /// The length of the string formed by concatenating the edge labels from 46 /// the root to this node. 47 unsigned ConcatLen = 0; 48 49 public: 50 // LLVM RTTI boilerplate. getKindSuffixTreeNode51 NodeKind getKind() const { return Kind; } 52 53 /// \return the start index of this node's substring in the entire string. 54 unsigned getStartIdx() const; 55 56 /// \returns the end index of this node. 57 virtual unsigned getEndIdx() const = 0; 58 59 /// Advance this node's StartIdx by \p Inc. 60 void incrementStartIdx(unsigned Inc); 61 62 /// Set the length of the string from the root to this node to \p Len. 63 void setConcatLen(unsigned Len); 64 65 /// \returns the length of the string from the root to this node. 66 unsigned getConcatLen() const; 67 SuffixTreeNodeSuffixTreeNode68 SuffixTreeNode(NodeKind Kind, unsigned StartIdx) 69 : Kind(Kind), StartIdx(StartIdx) {} 70 virtual ~SuffixTreeNode() = default; 71 }; 72 73 // A node with two or more children, or the root. 74 struct SuffixTreeInternalNode : SuffixTreeNode { 75 private: 76 /// The end index of this node's substring in the main string. 77 /// 78 /// Every leaf node must have its \p EndIdx incremented at the end of every 79 /// step in the construction algorithm. To avoid having to update O(N) 80 /// nodes individually at the end of every step, the end index is stored 81 /// as a pointer. 82 unsigned EndIdx = EmptyIdx; 83 84 /// A pointer to the internal node representing the same sequence with the 85 /// first character chopped off. 86 /// 87 /// This acts as a shortcut in Ukkonen's algorithm. One of the things that 88 /// Ukkonen's algorithm does to achieve linear-time construction is 89 /// keep track of which node the next insert should be at. This makes each 90 /// insert O(1), and there are a total of O(N) inserts. The suffix link 91 /// helps with inserting children of internal nodes. 92 /// 93 /// Say we add a child to an internal node with associated mapping S. The 94 /// next insertion must be at the node representing S - its first character. 95 /// This is given by the way that we iteratively build the tree in Ukkonen's 96 /// algorithm. The main idea is to look at the suffixes of each prefix in the 97 /// string, starting with the longest suffix of the prefix, and ending with 98 /// the shortest. Therefore, if we keep pointers between such nodes, we can 99 /// move to the next insertion point in O(1) time. If we don't, then we'd 100 /// have to query from the root, which takes O(N) time. This would make the 101 /// construction algorithm O(N^2) rather than O(N). 102 SuffixTreeInternalNode *Link = nullptr; 103 104 public: 105 // LLVM RTTI boilerplate. classofSuffixTreeInternalNode106 static bool classof(const SuffixTreeNode *N) { 107 return N->getKind() == NodeKind::ST_Internal; 108 } 109 110 /// \returns true if this node is the root of its owning \p SuffixTree. 111 bool isRoot() const; 112 113 /// \returns the end index of this node's substring in the entire string. 114 unsigned getEndIdx() const override; 115 116 /// Sets \p Link to \p L. Assumes \p L is not null. 117 void setLink(SuffixTreeInternalNode *L); 118 119 /// \returns the pointer to the Link node. 120 SuffixTreeInternalNode *getLink() const; 121 122 /// The children of this node. 123 /// 124 /// A child existing on an unsigned integer implies that from the mapping 125 /// represented by the current node, there is a way to reach another 126 /// mapping by tacking that character on the end of the current string. 127 DenseMap<unsigned, SuffixTreeNode *> Children; 128 SuffixTreeInternalNodeSuffixTreeInternalNode129 SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx, 130 SuffixTreeInternalNode *Link) 131 : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx), 132 Link(Link) {} 133 134 virtual ~SuffixTreeInternalNode() = default; 135 }; 136 137 // A node representing a suffix. 138 struct SuffixTreeLeafNode : SuffixTreeNode { 139 private: 140 /// The start index of the suffix represented by this leaf. 141 unsigned SuffixIdx = EmptyIdx; 142 143 /// The end index of this node's substring in the main string. 144 /// 145 /// Every leaf node must have its \p EndIdx incremented at the end of every 146 /// step in the construction algorithm. To avoid having to update O(N) 147 /// nodes individually at the end of every step, the end index is stored 148 /// as a pointer. 149 unsigned *EndIdx = nullptr; 150 151 public: 152 // LLVM RTTI boilerplate. classofSuffixTreeLeafNode153 static bool classof(const SuffixTreeNode *N) { 154 return N->getKind() == NodeKind::ST_Leaf; 155 } 156 157 /// \returns the end index of this node's substring in the entire string. 158 unsigned getEndIdx() const override; 159 160 /// \returns the start index of the suffix represented by this leaf. 161 unsigned getSuffixIdx() const; 162 163 /// Sets the start index of the suffix represented by this leaf to \p Idx. 164 void setSuffixIdx(unsigned Idx); SuffixTreeLeafNodeSuffixTreeLeafNode165 SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx) 166 : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {} 167 168 virtual ~SuffixTreeLeafNode() = default; 169 }; 170 } // namespace llvm 171 #endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H