1 // Copyright (c) Microsoft Corporation. All rights reserved. 2 // Licensed under the MIT license. 3 4 #pragma once 5 6 #include "kuku/common.h" 7 #include "kuku/locfunc.h" 8 #include <memory> 9 #include <random> 10 #include <set> 11 #include <stdexcept> 12 #include <vector> 13 14 namespace kuku 15 { 16 class QueryResult; 17 18 /** 19 The KukuTable class represents a cockoo hash table. It includes information about the location functions (hash 20 functions) and holds the items inserted into the table. 21 */ 22 class KukuTable 23 { 24 public: 25 /** 26 Creates a new empty hash table. 27 28 @param[in] table_size The size of the hash table 29 @param[in] stash_size The size of the stash (possibly zero) 30 @param[in] loc_func_count The number of location functions (hash functions) to use 31 @param[in] loc_func_seed The 128-bit seed for the location functions, represented as a hash table item 32 @param[in] max_probe The maximum number of random walk steps taken in attempting to insert an item 33 @param[in] empty_item A hash table item that represents an empty location in the table 34 @throws std::invalid_argument if loc_func_count is too large or too small 35 @throws std::invalid_argument if table_size is too large or too small 36 @throws std::invalid_argument if max_probe is zero 37 */ 38 KukuTable( 39 table_size_type table_size, table_size_type stash_size, std::uint32_t loc_func_count, 40 item_type loc_func_seed, std::uint64_t max_probe, item_type empty_item); 41 42 /** 43 Adds a single item to the hash table using random walk cuckoo hashing. The return value indicates whether 44 the item was successfully inserted (possibly into the stash) or not. 45 46 @param[in] item The hash table item to insert 47 @throws std::invalid_argument if the given item is the empty item for this hash table 48 */ 49 bool insert(item_type item); 50 51 /** 52 Queries for the presence of a given item in the hash table and stash. 53 54 @param[in] item The hash table item to query 55 @throws std::invalid_argument if the given item is the empty item for this hash table 56 */ 57 QueryResult query(item_type item) const; 58 59 /** 60 Returns a location that a given hash table item may be placed at. 61 62 @param[in] item The hash table item for which the location is to be obtained 63 @param[in] loc_func_index The index of the location function which to use to compute the location 64 @throws std::out_of_range if loc_func_index is out of range 65 @throws std::invalid_argument if the given item is the empty item for this hash table 66 */ location(item_type item,std::uint32_t loc_func_index)67 inline location_type location(item_type item, std::uint32_t loc_func_index) const 68 { 69 if (loc_func_index >= loc_funcs_.size()) 70 { 71 throw std::out_of_range("loc_func_index is out of range"); 72 } 73 if (is_empty_item(item)) 74 { 75 throw std::invalid_argument("item cannot be the empty item"); 76 } 77 return loc_funcs_[loc_func_index](item); 78 } 79 80 /** 81 Returns all hash table locations that this item may be placed at. 82 83 @throws std::invalid_argument if the given item is the empty item for this hash table 84 */ 85 std::set<location_type> all_locations(item_type item) const; 86 87 /** 88 Clears the hash table by filling every location with the empty item. 89 */ 90 void clear_table(); 91 92 /** 93 Returns the number of location functions used by the hash table. 94 */ loc_func_count()95 inline std::uint32_t loc_func_count() const noexcept 96 { 97 return static_cast<std::uint32_t>(loc_funcs_.size()); 98 } 99 100 /** 101 Returns a reference to the hash table. 102 */ table()103 inline const std::vector<item_type> &table() const noexcept 104 { 105 return table_; 106 } 107 108 /** 109 Returns a reference to a specific location in the hash table. 110 111 @param[in] index The index in the hash table 112 @throws std::out_of_range if index is out of range 113 */ table(location_type index)114 inline const item_type &table(location_type index) const 115 { 116 if (index >= table_size_) 117 { 118 throw std::out_of_range("index is out of range"); 119 } 120 return table_[index]; 121 } 122 123 /** 124 Returns a reference to the stash. 125 */ stash()126 inline const std::vector<item_type> &stash() const noexcept 127 { 128 return stash_; 129 } 130 131 /** 132 Returns a reference to a specific location in the stash. 133 134 @param[in] index The index in the stash 135 @throws std::out_of_range if index is out of range 136 */ stash(location_type index)137 inline const item_type &stash(location_type index) const 138 { 139 if (index >= stash_size_) 140 { 141 throw std::out_of_range("index is out of range"); 142 } 143 if (index >= stash_.size() && index < stash_size_) 144 { 145 return empty_item_; 146 } 147 return stash_[index]; 148 } 149 150 /** 151 Returns the size of the hash table. 152 */ table_size()153 inline table_size_type table_size() const noexcept 154 { 155 return table_size_; 156 } 157 158 /** 159 Returns the size of the stash. 160 */ stash_size()161 inline table_size_type stash_size() const noexcept 162 { 163 return stash_size_; 164 } 165 166 /** 167 Returns the 128-bit seed used for the location functions, represented as a hash table item. 168 */ loc_func_seed()169 inline item_type loc_func_seed() const noexcept 170 { 171 return loc_func_seed_; 172 } 173 174 /** 175 Returns the maximum number of random walk steps taken in attempting to insert an item. 176 */ max_probe()177 inline std::uint64_t max_probe() const noexcept 178 { 179 return max_probe_; 180 } 181 182 /** 183 Returns the hash table item that represents an empty location in the table. 184 */ empty_item()185 inline const item_type &empty_item() const noexcept 186 { 187 return empty_item_; 188 } 189 190 /** 191 Returns whether a given location in the table is empty. 192 193 @param[in] index The index in the hash table 194 @throws std::out_of_range if index is out of range 195 */ is_empty(location_type index)196 inline bool is_empty(location_type index) const noexcept 197 { 198 return is_empty_item(table(index)); 199 } 200 201 /** 202 Returns whether a given item is the empty item for this hash table. 203 204 @param[in] item The item to compare to the empty item 205 */ is_empty_item(const item_type & item)206 inline bool is_empty_item(const item_type &item) const noexcept 207 { 208 return are_equal_item(item, empty_item_); 209 } 210 211 /** 212 When the insert function fails to insert a hash table item, there is a leftover item that could not be inserted 213 into the table. This function will return the empty item if insertion never failed, and otherwise it will return 214 the latest leftover item. Note that due to how the random walk insertion process works, the leftover item is 215 usually not the same one that insert was called with. 216 */ leftover_item()217 inline item_type leftover_item() const noexcept 218 { 219 return leftover_item_; 220 } 221 222 /** 223 Returns the current fill rate of the hash table and stash. 224 */ fill_rate()225 inline double fill_rate() const noexcept 226 { 227 return static_cast<double>(inserted_items_) / 228 (static_cast<double>(table_size()) + static_cast<double>(stash_size_)); 229 } 230 231 private: 232 KukuTable(const KukuTable ©) = delete; 233 234 KukuTable &operator=(const KukuTable &assign) = delete; 235 236 void generate_loc_funcs(std::uint32_t loc_func_count, item_type seed); 237 238 /* 239 Swap an item in the table with a given item. 240 */ swap(item_type item,location_type location)241 inline item_type swap(item_type item, location_type location) noexcept 242 { 243 item_type old_item = table_[location]; 244 table_[location] = item; 245 return old_item; 246 } 247 248 /* 249 The hash table that holds all of the input data. 250 */ 251 std::vector<item_type> table_; 252 253 /* 254 The stash. 255 */ 256 std::vector<item_type> stash_; 257 258 /* 259 The hash functions. 260 */ 261 std::vector<LocFunc> loc_funcs_; 262 263 /* 264 The size of the table. 265 */ 266 const table_size_type table_size_; 267 268 /* 269 The size of the stash. 270 */ 271 const table_size_type stash_size_; 272 273 /* 274 Seed for the hash functions 275 */ 276 const item_type loc_func_seed_; 277 278 /* 279 The maximum number of attempts that are made to insert an item. 280 */ 281 const std::uint64_t max_probe_; 282 283 /* 284 An item value that denotes an empty item. 285 */ 286 const item_type empty_item_; 287 288 /* 289 Storage for an item that was evicted and could not be re-inserted. This 290 is populated when insert fails. 291 */ 292 item_type leftover_item_; 293 294 /* 295 The number of items that have been inserted to table or stash. 296 */ 297 table_size_type inserted_items_; 298 299 /* 300 Randomness source for location function sampling. 301 */ 302 std::mt19937_64 gen_; 303 304 std::uniform_int_distribution<std::uint32_t> u_; 305 }; 306 307 /** 308 The QueryResult class represents the result of a hash table query. It includes information about whether a queried 309 item was found in the hash table, its location in the hash table or stash (if found), and the index of the location 310 function (hash function) that was used to insert it. QueryResult objects are returned by the KukuTable::query 311 function. 312 */ 313 class QueryResult 314 { 315 friend class KukuTable; 316 317 public: 318 /** 319 Creates a QueryResult object. 320 */ 321 QueryResult() = default; 322 323 /** 324 Returns the hash table or stash location represented by this QueryResult. 325 */ location()326 inline location_type location() const noexcept 327 { 328 return location_; 329 } 330 331 /** 332 Returns the index of the location function that was used to insert the queried item. This value is meaningless 333 when in_stash() is true. A value equal to max_loc_func_count indicates the item was not found in the table or 334 stash. 335 */ loc_func_index()336 inline std::uint32_t loc_func_index() const noexcept 337 { 338 return loc_func_index_; 339 } 340 341 /** 342 Returns whether the queried item was found in the stash. 343 */ in_stash()344 inline bool in_stash() const noexcept 345 { 346 return !~loc_func_index_; 347 } 348 349 /** 350 Returns whether the queried item was found in the hash table or in the stash. 351 */ found()352 inline bool found() const noexcept 353 { 354 return !(loc_func_index_ & ~(max_loc_func_count - 1)) || in_stash(); 355 } 356 357 /** 358 Returns whether the queried item was found in the hash table or in the stash. 359 */ 360 inline operator bool() const noexcept 361 { 362 return found(); 363 } 364 365 private: QueryResult(location_type location,std::uint32_t loc_func_index)366 QueryResult(location_type location, std::uint32_t loc_func_index) 367 : location_(location), loc_func_index_(loc_func_index) 368 { 369 #ifdef KUKU_DEBUG 370 if (location >= max_table_size) 371 { 372 throw std::invalid_argument("invalid location"); 373 } 374 #endif 375 } 376 377 location_type location_ = 0; 378 379 std::uint32_t loc_func_index_ = 0; 380 }; 381 } // namespace kuku 382