1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, 12 // software distributed under the License is distributed on an 13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 // KIND, either express or implied. See the License for the 15 // specific language governing permissions and limitations 16 // under the License. 17 18 #pragma once 19 20 #include <cassert> 21 #include <cstdint> 22 #include <cstring> 23 #include <limits> 24 #include <string> 25 #include <utility> 26 #include <vector> 27 28 #include "arrow/status.h" 29 #include "arrow/util/macros.h" 30 #include "arrow/util/string_view.h" 31 #include "arrow/util/visibility.h" 32 33 namespace arrow { 34 namespace internal { 35 36 // A non-zero-terminated small string class. 37 // std::string usually has a small string optimization 38 // (see review at https://shaharmike.com/cpp/std-string/) 39 // but this one allows tight control and optimization of memory layout. 40 template <uint8_t N> 41 class SmallString { 42 public: SmallString()43 SmallString() : length_(0) {} 44 45 template <typename T> SmallString(const T & v)46 SmallString(const T& v) { // NOLINT implicit constructor 47 *this = util::string_view(v); 48 } 49 50 SmallString& operator=(const util::string_view s) { 51 #ifndef NDEBUG 52 CheckSize(s.size()); 53 #endif 54 length_ = static_cast<uint8_t>(s.size()); 55 std::memcpy(data_, s.data(), length_); 56 return *this; 57 } 58 59 SmallString& operator=(const std::string& s) { 60 *this = util::string_view(s); 61 return *this; 62 } 63 64 SmallString& operator=(const char* s) { 65 *this = util::string_view(s); 66 return *this; 67 } 68 string_view()69 explicit operator util::string_view() const { 70 return util::string_view(data_, length_); 71 } 72 data()73 const char* data() const { return data_; } length()74 size_t length() const { return length_; } empty()75 bool empty() const { return length_ == 0; } 76 char operator[](size_t pos) const { 77 #ifdef NDEBUG 78 assert(pos <= length_); 79 #endif 80 return data_[pos]; 81 } 82 substr(size_t pos)83 SmallString substr(size_t pos) const { 84 return SmallString(util::string_view(*this).substr(pos)); 85 } 86 substr(size_t pos,size_t count)87 SmallString substr(size_t pos, size_t count) const { 88 return SmallString(util::string_view(*this).substr(pos, count)); 89 } 90 91 template <typename T> 92 bool operator==(T&& other) const { 93 return util::string_view(*this) == util::string_view(std::forward<T>(other)); 94 } 95 96 template <typename T> 97 bool operator!=(T&& other) const { 98 return util::string_view(*this) != util::string_view(std::forward<T>(other)); 99 } 100 101 protected: 102 uint8_t length_; 103 char data_[N]; 104 105 #ifndef NDEBUG CheckSize(size_t n)106 void CheckSize(size_t n) { assert(n <= N); } 107 #endif 108 }; 109 110 template <uint8_t N> 111 std::ostream& operator<<(std::ostream& os, const SmallString<N>& str) { 112 return os << util::string_view(str); 113 } 114 115 // A trie class for byte strings, optimized for small sets of short strings. 116 // This class is immutable by design, use a TrieBuilder to construct it. 117 class ARROW_EXPORT Trie { 118 using index_type = int16_t; 119 using fast_index_type = int_fast16_t; 120 121 public: Trie()122 Trie() : size_(0) {} 123 Trie(Trie&&) = default; 124 Trie& operator=(Trie&&) = default; 125 Find(util::string_view s)126 int32_t Find(util::string_view s) const { 127 const Node* node = &nodes_[0]; 128 fast_index_type pos = 0; 129 fast_index_type remaining = static_cast<fast_index_type>(s.length()); 130 131 while (remaining > 0) { 132 auto substring_length = node->substring_length(); 133 if (substring_length > 0) { 134 auto substring_data = node->substring_data(); 135 if (remaining < substring_length) { 136 // Input too short 137 return -1; 138 } 139 for (fast_index_type i = 0; i < substring_length; ++i) { 140 if (s[pos++] != substring_data[i]) { 141 // Mismatching substring 142 return -1; 143 } 144 --remaining; 145 } 146 if (remaining == 0) { 147 // Matched node exactly 148 return node->found_index_; 149 } 150 } 151 // Lookup child using next input character 152 if (node->child_lookup_ == -1) { 153 // Input too long 154 return -1; 155 } 156 auto c = static_cast<uint8_t>(s[pos++]); 157 --remaining; 158 auto child_index = lookup_table_[node->child_lookup_ * 256 + c]; 159 if (child_index == -1) { 160 // Child not found 161 return -1; 162 } 163 node = &nodes_[child_index]; 164 } 165 166 // Input exhausted 167 if (node->substring_.empty()) { 168 // Matched node exactly 169 return node->found_index_; 170 } else { 171 return -1; 172 } 173 } 174 175 Status Validate() const; 176 177 void Dump() const; 178 179 protected: 180 static constexpr size_t kNodeSize = 16; 181 static constexpr auto kMaxSubstringLength = 182 kNodeSize - 2 * sizeof(index_type) - sizeof(int8_t); 183 184 struct Node { 185 // If this node is a valid end of string, index of found string, otherwise -1 186 index_type found_index_; 187 // Base index for child lookup in lookup_table_ (-1 if no child nodes) 188 index_type child_lookup_; 189 // The substring for this node. 190 SmallString<kMaxSubstringLength> substring_; 191 substring_lengthNode192 fast_index_type substring_length() const { 193 return static_cast<fast_index_type>(substring_.length()); 194 } substring_dataNode195 const char* substring_data() const { return substring_.data(); } 196 }; 197 198 static_assert(sizeof(Node) == kNodeSize, "Unexpected node size"); 199 200 ARROW_DISALLOW_COPY_AND_ASSIGN(Trie); 201 202 void Dump(const Node* node, const std::string& indent) const; 203 204 // Node table: entry 0 is the root node 205 std::vector<Node> nodes_; 206 207 // Indexed lookup structure: gives index in node table, or -1 if not found 208 std::vector<index_type> lookup_table_; 209 210 // Number of entries 211 index_type size_; 212 213 friend class TrieBuilder; 214 }; 215 216 class ARROW_EXPORT TrieBuilder { 217 using index_type = Trie::index_type; 218 using fast_index_type = Trie::fast_index_type; 219 220 public: 221 TrieBuilder(); 222 Status Append(util::string_view s, bool allow_duplicate = false); 223 Trie Finish(); 224 225 protected: 226 // Extend the lookup table by 256 entries, return the index of the new span 227 Status ExtendLookupTable(index_type* out_lookup_index); 228 // Split the node given by the index at the substring index `split_at` 229 Status SplitNode(fast_index_type node_index, fast_index_type split_at); 230 // Append an already constructed child node to the parent 231 Status AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node); 232 // Create a matching child node from this parent 233 Status CreateChildNode(Trie::Node* parent, uint8_t ch, util::string_view substring); 234 Status CreateChildNode(Trie::Node* parent, char ch, util::string_view substring); 235 236 Trie trie_; 237 238 static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max(); 239 }; 240 241 } // namespace internal 242 } // namespace arrow 243