1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include "arrow/util/trie.h"
19 
20 #include <iostream>
21 #include <utility>
22 
23 #include "arrow/util/logging.h"
24 
25 namespace arrow {
26 namespace internal {
27 
Validate() const28 Status Trie::Validate() const {
29   const auto n_nodes = static_cast<fast_index_type>(nodes_.size());
30   if (size_ > n_nodes) {
31     return Status::Invalid("Number of entries larger than number of nodes");
32   }
33   for (const auto& node : nodes_) {
34     if (node.found_index_ >= size_) {
35       return Status::Invalid("Found index >= size");
36     }
37     if (node.child_lookup_ != -1 &&
38         node.child_lookup_ * 256 >
39             static_cast<fast_index_type>(lookup_table_.size() - 256)) {
40       return Status::Invalid("Child lookup base doesn't point to 256 valid indices");
41     }
42   }
43   for (const auto index : lookup_table_) {
44     if (index >= n_nodes) {
45       return Status::Invalid("Child lookup index out of bounds");
46     }
47   }
48   return Status::OK();
49 }
50 
Dump(const Node * node,const std::string & indent) const51 void Trie::Dump(const Node* node, const std::string& indent) const {
52   std::cerr << "[\"" << node->substring_ << "\"]";
53   if (node->found_index_ >= 0) {
54     std::cerr << " *";
55   }
56   std::cerr << "\n";
57   if (node->child_lookup_ >= 0) {
58     auto child_indent = indent + "   ";
59     std::cerr << child_indent << "|\n";
60     for (fast_index_type i = 0; i < 256; ++i) {
61       auto child_index = lookup_table_[node->child_lookup_ * 256 + i];
62       if (child_index >= 0) {
63         const Node* child = &nodes_[child_index];
64         std::cerr << child_indent << "|-> '" << static_cast<char>(i) << "' (" << i
65                   << ") -> ";
66         Dump(child, child_indent);
67       }
68     }
69   }
70 }
71 
Dump() const72 void Trie::Dump() const { Dump(&nodes_[0], ""); }
73 
TrieBuilder()74 TrieBuilder::TrieBuilder() { trie_.nodes_.push_back(Trie::Node{-1, -1, ""}); }
75 
AppendChildNode(Trie::Node * parent,uint8_t ch,Trie::Node && node)76 Status TrieBuilder::AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node) {
77   if (parent->child_lookup_ == -1) {
78     RETURN_NOT_OK(ExtendLookupTable(&parent->child_lookup_));
79   }
80   auto parent_lookup = parent->child_lookup_ * 256 + ch;
81 
82   DCHECK_EQ(trie_.lookup_table_[parent_lookup], -1);
83   if (trie_.nodes_.size() >= static_cast<size_t>(kMaxIndex)) {
84     auto max_capacity = kMaxIndex;
85     return Status::CapacityError("TrieBuilder cannot contain more than ", max_capacity,
86                                  " child nodes");
87   }
88   trie_.nodes_.push_back(std::move(node));
89   trie_.lookup_table_[parent_lookup] = static_cast<index_type>(trie_.nodes_.size() - 1);
90   return Status::OK();
91 }
92 
CreateChildNode(Trie::Node * parent,uint8_t ch,util::string_view substring)93 Status TrieBuilder::CreateChildNode(Trie::Node* parent, uint8_t ch,
94                                     util::string_view substring) {
95   const auto kMaxSubstringLength = Trie::kMaxSubstringLength;
96 
97   while (substring.length() > kMaxSubstringLength) {
98     // Substring doesn't fit in node => create intermediate node
99     auto mid_node = Trie::Node{-1, -1, substring.substr(0, kMaxSubstringLength)};
100     RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(mid_node)));
101     // Recurse
102     parent = &trie_.nodes_.back();
103     ch = static_cast<uint8_t>(substring[kMaxSubstringLength]);
104     substring = substring.substr(kMaxSubstringLength + 1);
105   }
106 
107   // Create final matching node
108   auto child_node = Trie::Node{trie_.size_, -1, substring};
109   RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(child_node)));
110   ++trie_.size_;
111   return Status::OK();
112 }
113 
CreateChildNode(Trie::Node * parent,char ch,util::string_view substring)114 Status TrieBuilder::CreateChildNode(Trie::Node* parent, char ch,
115                                     util::string_view substring) {
116   return CreateChildNode(parent, static_cast<uint8_t>(ch), substring);
117 }
118 
ExtendLookupTable(index_type * out_index)119 Status TrieBuilder::ExtendLookupTable(index_type* out_index) {
120   auto cur_size = trie_.lookup_table_.size();
121   auto cur_index = cur_size / 256;
122   if (cur_index > static_cast<size_t>(kMaxIndex)) {
123     return Status::CapacityError("TrieBuilder cannot extend lookup table further");
124   }
125   trie_.lookup_table_.resize(cur_size + 256, -1);
126   *out_index = static_cast<index_type>(cur_index);
127   return Status::OK();
128 }
129 
SplitNode(fast_index_type node_index,fast_index_type split_at)130 Status TrieBuilder::SplitNode(fast_index_type node_index, fast_index_type split_at) {
131   Trie::Node* node = &trie_.nodes_[node_index];
132 
133   DCHECK_LT(split_at, node->substring_length());
134 
135   // Before:
136   //   {node} -> [...]
137   // After:
138   //   {node} -> [c] -> {out_node} -> [...]
139   auto child_node = Trie::Node{node->found_index_, node->child_lookup_,
140                                node->substring_.substr(split_at + 1)};
141   auto ch = node->substring_[split_at];
142   node->child_lookup_ = -1;
143   node->found_index_ = -1;
144   node->substring_ = node->substring_.substr(0, split_at);
145   RETURN_NOT_OK(AppendChildNode(node, ch, std::move(child_node)));
146 
147   return Status::OK();
148 }
149 
Append(util::string_view s,bool allow_duplicate)150 Status TrieBuilder::Append(util::string_view s, bool allow_duplicate) {
151   // Find or create node for string
152   fast_index_type node_index = 0;
153   fast_index_type pos = 0;
154   fast_index_type remaining = static_cast<fast_index_type>(s.length());
155 
156   while (true) {
157     Trie::Node* node = &trie_.nodes_[node_index];
158     const auto substring_length = node->substring_length();
159     const auto substring_data = node->substring_data();
160 
161     for (fast_index_type i = 0; i < substring_length; ++i) {
162       if (remaining == 0) {
163         // New string too short => need to split node
164         RETURN_NOT_OK(SplitNode(node_index, i));
165         // Current node matches exactly
166         node = &trie_.nodes_[node_index];
167         node->found_index_ = trie_.size_++;
168         return Status::OK();
169       }
170       if (s[pos] != substring_data[i]) {
171         // Mismatching substring => need to split node
172         RETURN_NOT_OK(SplitNode(node_index, i));
173         // Create new node for mismatching char
174         node = &trie_.nodes_[node_index];
175         return CreateChildNode(node, s[pos], s.substr(pos + 1));
176       }
177       ++pos;
178       --remaining;
179     }
180     if (remaining == 0) {
181       // Node matches exactly
182       if (node->found_index_ >= 0) {
183         if (allow_duplicate) {
184           return Status::OK();
185         } else {
186           return Status::Invalid("Duplicate entry in trie");
187         }
188       }
189       node->found_index_ = trie_.size_++;
190       return Status::OK();
191     }
192     // Lookup child using next input character
193     if (node->child_lookup_ == -1) {
194       // Need to extend lookup table for this node
195       RETURN_NOT_OK(ExtendLookupTable(&node->child_lookup_));
196     }
197     auto c = static_cast<uint8_t>(s[pos++]);
198     --remaining;
199     node_index = trie_.lookup_table_[node->child_lookup_ * 256 + c];
200     if (node_index == -1) {
201       // Child not found => need to create child node
202       return CreateChildNode(node, c, s.substr(pos));
203     }
204     node = &trie_.nodes_[node_index];
205   }
206 }
207 
Finish()208 Trie TrieBuilder::Finish() { return std::move(trie_); }
209 
210 }  // namespace internal
211 }  // namespace arrow
212