1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include "arrow/util/trie.h"
19
20 #include <iostream>
21 #include <utility>
22
23 #include "arrow/util/logging.h"
24
25 namespace arrow {
26 namespace internal {
27
Validate() const28 Status Trie::Validate() const {
29 const auto n_nodes = static_cast<fast_index_type>(nodes_.size());
30 if (size_ > n_nodes) {
31 return Status::Invalid("Number of entries larger than number of nodes");
32 }
33 for (const auto& node : nodes_) {
34 if (node.found_index_ >= size_) {
35 return Status::Invalid("Found index >= size");
36 }
37 if (node.child_lookup_ != -1 &&
38 node.child_lookup_ * 256 >
39 static_cast<fast_index_type>(lookup_table_.size() - 256)) {
40 return Status::Invalid("Child lookup base doesn't point to 256 valid indices");
41 }
42 }
43 for (const auto index : lookup_table_) {
44 if (index >= n_nodes) {
45 return Status::Invalid("Child lookup index out of bounds");
46 }
47 }
48 return Status::OK();
49 }
50
Dump(const Node * node,const std::string & indent) const51 void Trie::Dump(const Node* node, const std::string& indent) const {
52 std::cerr << "[\"" << node->substring_ << "\"]";
53 if (node->found_index_ >= 0) {
54 std::cerr << " *";
55 }
56 std::cerr << "\n";
57 if (node->child_lookup_ >= 0) {
58 auto child_indent = indent + " ";
59 std::cerr << child_indent << "|\n";
60 for (fast_index_type i = 0; i < 256; ++i) {
61 auto child_index = lookup_table_[node->child_lookup_ * 256 + i];
62 if (child_index >= 0) {
63 const Node* child = &nodes_[child_index];
64 std::cerr << child_indent << "|-> '" << static_cast<char>(i) << "' (" << i
65 << ") -> ";
66 Dump(child, child_indent);
67 }
68 }
69 }
70 }
71
Dump() const72 void Trie::Dump() const { Dump(&nodes_[0], ""); }
73
TrieBuilder()74 TrieBuilder::TrieBuilder() { trie_.nodes_.push_back(Trie::Node{-1, -1, ""}); }
75
AppendChildNode(Trie::Node * parent,uint8_t ch,Trie::Node && node)76 Status TrieBuilder::AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node) {
77 if (parent->child_lookup_ == -1) {
78 RETURN_NOT_OK(ExtendLookupTable(&parent->child_lookup_));
79 }
80 auto parent_lookup = parent->child_lookup_ * 256 + ch;
81
82 DCHECK_EQ(trie_.lookup_table_[parent_lookup], -1);
83 if (trie_.nodes_.size() >= static_cast<size_t>(kMaxIndex)) {
84 auto max_capacity = kMaxIndex;
85 return Status::CapacityError("TrieBuilder cannot contain more than ", max_capacity,
86 " child nodes");
87 }
88 trie_.nodes_.push_back(std::move(node));
89 trie_.lookup_table_[parent_lookup] = static_cast<index_type>(trie_.nodes_.size() - 1);
90 return Status::OK();
91 }
92
CreateChildNode(Trie::Node * parent,uint8_t ch,util::string_view substring)93 Status TrieBuilder::CreateChildNode(Trie::Node* parent, uint8_t ch,
94 util::string_view substring) {
95 const auto kMaxSubstringLength = Trie::kMaxSubstringLength;
96
97 while (substring.length() > kMaxSubstringLength) {
98 // Substring doesn't fit in node => create intermediate node
99 auto mid_node = Trie::Node{-1, -1, substring.substr(0, kMaxSubstringLength)};
100 RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(mid_node)));
101 // Recurse
102 parent = &trie_.nodes_.back();
103 ch = static_cast<uint8_t>(substring[kMaxSubstringLength]);
104 substring = substring.substr(kMaxSubstringLength + 1);
105 }
106
107 // Create final matching node
108 auto child_node = Trie::Node{trie_.size_, -1, substring};
109 RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(child_node)));
110 ++trie_.size_;
111 return Status::OK();
112 }
113
CreateChildNode(Trie::Node * parent,char ch,util::string_view substring)114 Status TrieBuilder::CreateChildNode(Trie::Node* parent, char ch,
115 util::string_view substring) {
116 return CreateChildNode(parent, static_cast<uint8_t>(ch), substring);
117 }
118
ExtendLookupTable(index_type * out_index)119 Status TrieBuilder::ExtendLookupTable(index_type* out_index) {
120 auto cur_size = trie_.lookup_table_.size();
121 auto cur_index = cur_size / 256;
122 if (cur_index > static_cast<size_t>(kMaxIndex)) {
123 return Status::CapacityError("TrieBuilder cannot extend lookup table further");
124 }
125 trie_.lookup_table_.resize(cur_size + 256, -1);
126 *out_index = static_cast<index_type>(cur_index);
127 return Status::OK();
128 }
129
SplitNode(fast_index_type node_index,fast_index_type split_at)130 Status TrieBuilder::SplitNode(fast_index_type node_index, fast_index_type split_at) {
131 Trie::Node* node = &trie_.nodes_[node_index];
132
133 DCHECK_LT(split_at, node->substring_length());
134
135 // Before:
136 // {node} -> [...]
137 // After:
138 // {node} -> [c] -> {out_node} -> [...]
139 auto child_node = Trie::Node{node->found_index_, node->child_lookup_,
140 node->substring_.substr(split_at + 1)};
141 auto ch = node->substring_[split_at];
142 node->child_lookup_ = -1;
143 node->found_index_ = -1;
144 node->substring_ = node->substring_.substr(0, split_at);
145 RETURN_NOT_OK(AppendChildNode(node, ch, std::move(child_node)));
146
147 return Status::OK();
148 }
149
Append(util::string_view s,bool allow_duplicate)150 Status TrieBuilder::Append(util::string_view s, bool allow_duplicate) {
151 // Find or create node for string
152 fast_index_type node_index = 0;
153 fast_index_type pos = 0;
154 fast_index_type remaining = static_cast<fast_index_type>(s.length());
155
156 while (true) {
157 Trie::Node* node = &trie_.nodes_[node_index];
158 const auto substring_length = node->substring_length();
159 const auto substring_data = node->substring_data();
160
161 for (fast_index_type i = 0; i < substring_length; ++i) {
162 if (remaining == 0) {
163 // New string too short => need to split node
164 RETURN_NOT_OK(SplitNode(node_index, i));
165 // Current node matches exactly
166 node = &trie_.nodes_[node_index];
167 node->found_index_ = trie_.size_++;
168 return Status::OK();
169 }
170 if (s[pos] != substring_data[i]) {
171 // Mismatching substring => need to split node
172 RETURN_NOT_OK(SplitNode(node_index, i));
173 // Create new node for mismatching char
174 node = &trie_.nodes_[node_index];
175 return CreateChildNode(node, s[pos], s.substr(pos + 1));
176 }
177 ++pos;
178 --remaining;
179 }
180 if (remaining == 0) {
181 // Node matches exactly
182 if (node->found_index_ >= 0) {
183 if (allow_duplicate) {
184 return Status::OK();
185 } else {
186 return Status::Invalid("Duplicate entry in trie");
187 }
188 }
189 node->found_index_ = trie_.size_++;
190 return Status::OK();
191 }
192 // Lookup child using next input character
193 if (node->child_lookup_ == -1) {
194 // Need to extend lookup table for this node
195 RETURN_NOT_OK(ExtendLookupTable(&node->child_lookup_));
196 }
197 auto c = static_cast<uint8_t>(s[pos++]);
198 --remaining;
199 node_index = trie_.lookup_table_[node->child_lookup_ * 256 + c];
200 if (node_index == -1) {
201 // Child not found => need to create child node
202 return CreateChildNode(node, c, s.substr(pos));
203 }
204 node = &trie_.nodes_[node_index];
205 }
206 }
207
Finish()208 Trie TrieBuilder::Finish() { return std::move(trie_); }
209
210 } // namespace internal
211 } // namespace arrow
212