1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #pragma once
19 
20 #include <cassert>
21 #include <cstdint>
22 #include <cstring>
23 #include <limits>
24 #include <string>
25 #include <utility>
26 #include <vector>
27 
28 #include "arrow/status.h"
29 #include "arrow/util/macros.h"
30 #include "arrow/util/string_view.h"
31 #include "arrow/util/visibility.h"
32 
33 namespace arrow {
34 namespace internal {
35 
36 // A non-zero-terminated small string class.
37 // std::string usually has a small string optimization
38 // (see review at https://shaharmike.com/cpp/std-string/)
39 // but this one allows tight control and optimization of memory layout.
40 template <uint8_t N>
41 class SmallString {
42  public:
SmallString()43   SmallString() : length_(0) {}
44 
45   template <typename T>
SmallString(const T & v)46   SmallString(const T& v) {  // NOLINT implicit constructor
47     *this = util::string_view(v);
48   }
49 
50   SmallString& operator=(const util::string_view s) {
51 #ifndef NDEBUG
52     CheckSize(s.size());
53 #endif
54     length_ = static_cast<uint8_t>(s.size());
55     std::memcpy(data_, s.data(), length_);
56     return *this;
57   }
58 
59   SmallString& operator=(const std::string& s) {
60     *this = util::string_view(s);
61     return *this;
62   }
63 
64   SmallString& operator=(const char* s) {
65     *this = util::string_view(s);
66     return *this;
67   }
68 
string_view()69   explicit operator util::string_view() const {
70     return util::string_view(data_, length_);
71   }
72 
data()73   const char* data() const { return data_; }
length()74   size_t length() const { return length_; }
empty()75   bool empty() const { return length_ == 0; }
76   char operator[](size_t pos) const {
77 #ifdef NDEBUG
78     assert(pos <= length_);
79 #endif
80     return data_[pos];
81   }
82 
substr(size_t pos)83   SmallString substr(size_t pos) const {
84     return SmallString(util::string_view(*this).substr(pos));
85   }
86 
substr(size_t pos,size_t count)87   SmallString substr(size_t pos, size_t count) const {
88     return SmallString(util::string_view(*this).substr(pos, count));
89   }
90 
91   template <typename T>
92   bool operator==(T&& other) const {
93     return util::string_view(*this) == util::string_view(std::forward<T>(other));
94   }
95 
96   template <typename T>
97   bool operator!=(T&& other) const {
98     return util::string_view(*this) != util::string_view(std::forward<T>(other));
99   }
100 
101  protected:
102   uint8_t length_;
103   char data_[N];
104 
105 #ifndef NDEBUG
CheckSize(size_t n)106   void CheckSize(size_t n) { assert(n <= N); }
107 #endif
108 };
109 
110 template <uint8_t N>
111 std::ostream& operator<<(std::ostream& os, const SmallString<N>& str) {
112   return os << util::string_view(str);
113 }
114 
115 // A trie class for byte strings, optimized for small sets of short strings.
116 // This class is immutable by design, use a TrieBuilder to construct it.
117 class ARROW_EXPORT Trie {
118   using index_type = int16_t;
119   using fast_index_type = int_fast16_t;
120 
121  public:
Trie()122   Trie() : size_(0) {}
123   Trie(Trie&&) = default;
124   Trie& operator=(Trie&&) = default;
125 
Find(util::string_view s)126   int32_t Find(util::string_view s) const {
127     const Node* node = &nodes_[0];
128     fast_index_type pos = 0;
129     fast_index_type remaining = static_cast<fast_index_type>(s.length());
130 
131     while (remaining > 0) {
132       auto substring_length = node->substring_length();
133       if (substring_length > 0) {
134         auto substring_data = node->substring_data();
135         if (remaining < substring_length) {
136           // Input too short
137           return -1;
138         }
139         for (fast_index_type i = 0; i < substring_length; ++i) {
140           if (s[pos++] != substring_data[i]) {
141             // Mismatching substring
142             return -1;
143           }
144           --remaining;
145         }
146         if (remaining == 0) {
147           // Matched node exactly
148           return node->found_index_;
149         }
150       }
151       // Lookup child using next input character
152       if (node->child_lookup_ == -1) {
153         // Input too long
154         return -1;
155       }
156       auto c = static_cast<uint8_t>(s[pos++]);
157       --remaining;
158       auto child_index = lookup_table_[node->child_lookup_ * 256 + c];
159       if (child_index == -1) {
160         // Child not found
161         return -1;
162       }
163       node = &nodes_[child_index];
164     }
165 
166     // Input exhausted
167     if (node->substring_.empty()) {
168       // Matched node exactly
169       return node->found_index_;
170     } else {
171       return -1;
172     }
173   }
174 
175   Status Validate() const;
176 
177   void Dump() const;
178 
179  protected:
180   static constexpr size_t kNodeSize = 16;
181   static constexpr auto kMaxSubstringLength =
182       kNodeSize - 2 * sizeof(index_type) - sizeof(int8_t);
183 
184   struct Node {
185     // If this node is a valid end of string, index of found string, otherwise -1
186     index_type found_index_;
187     // Base index for child lookup in lookup_table_ (-1 if no child nodes)
188     index_type child_lookup_;
189     // The substring for this node.
190     SmallString<kMaxSubstringLength> substring_;
191 
substring_lengthNode192     fast_index_type substring_length() const {
193       return static_cast<fast_index_type>(substring_.length());
194     }
substring_dataNode195     const char* substring_data() const { return substring_.data(); }
196   };
197 
198   static_assert(sizeof(Node) == kNodeSize, "Unexpected node size");
199 
200   ARROW_DISALLOW_COPY_AND_ASSIGN(Trie);
201 
202   void Dump(const Node* node, const std::string& indent) const;
203 
204   // Node table: entry 0 is the root node
205   std::vector<Node> nodes_;
206 
207   // Indexed lookup structure: gives index in node table, or -1 if not found
208   std::vector<index_type> lookup_table_;
209 
210   // Number of entries
211   index_type size_;
212 
213   friend class TrieBuilder;
214 };
215 
216 class ARROW_EXPORT TrieBuilder {
217   using index_type = Trie::index_type;
218   using fast_index_type = Trie::fast_index_type;
219 
220  public:
221   TrieBuilder();
222   Status Append(util::string_view s, bool allow_duplicate = false);
223   Trie Finish();
224 
225  protected:
226   // Extend the lookup table by 256 entries, return the index of the new span
227   Status ExtendLookupTable(index_type* out_lookup_index);
228   // Split the node given by the index at the substring index `split_at`
229   Status SplitNode(fast_index_type node_index, fast_index_type split_at);
230   // Append an already constructed child node to the parent
231   Status AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node);
232   // Create a matching child node from this parent
233   Status CreateChildNode(Trie::Node* parent, uint8_t ch, util::string_view substring);
234   Status CreateChildNode(Trie::Node* parent, char ch, util::string_view substring);
235 
236   Trie trie_;
237 
238   static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max();
239 };
240 
241 }  // namespace internal
242 }  // namespace arrow
243