1 // Copyright (c) Microsoft Corporation. All rights reserved.
2 // Licensed under the MIT license.
3 
4 #pragma once
5 
6 #include "kuku/common.h"
7 #include "kuku/locfunc.h"
8 #include <memory>
9 #include <random>
10 #include <set>
11 #include <stdexcept>
12 #include <vector>
13 
14 namespace kuku
15 {
16     class QueryResult;
17 
18     /**
19     The KukuTable class represents a cockoo hash table. It includes information about the location functions (hash
20     functions) and holds the items inserted into the table.
21     */
22     class KukuTable
23     {
24     public:
25         /**
26         Creates a new empty hash table.
27 
28         @param[in] table_size The size of the hash table
29         @param[in] stash_size The size of the stash (possibly zero)
30         @param[in] loc_func_count The number of location functions (hash functions) to use
31         @param[in] loc_func_seed The 128-bit seed for the location functions, represented as a hash table item
32         @param[in] max_probe The maximum number of random walk steps taken in attempting to insert an item
33         @param[in] empty_item A hash table item that represents an empty location in the table
34         @throws std::invalid_argument if loc_func_count is too large or too small
35         @throws std::invalid_argument if table_size is too large or too small
36         @throws std::invalid_argument if max_probe is zero
37         */
38         KukuTable(
39             table_size_type table_size, table_size_type stash_size, std::uint32_t loc_func_count,
40             item_type loc_func_seed, std::uint64_t max_probe, item_type empty_item);
41 
42         /**
43         Adds a single item to the hash table using random walk cuckoo hashing. The return value indicates whether
44         the item was successfully inserted (possibly into the stash) or not.
45 
46         @param[in] item The hash table item to insert
47         @throws std::invalid_argument if the given item is the empty item for this hash table
48         */
49         bool insert(item_type item);
50 
51         /**
52         Queries for the presence of a given item in the hash table and stash.
53 
54         @param[in] item The hash table item to query
55         @throws std::invalid_argument if the given item is the empty item for this hash table
56         */
57         QueryResult query(item_type item) const;
58 
59         /**
60         Returns a location that a given hash table item may be placed at.
61 
62         @param[in] item The hash table item for which the location is to be obtained
63         @param[in] loc_func_index The index of the location function which to use to compute the location
64         @throws std::out_of_range if loc_func_index is out of range
65         @throws std::invalid_argument if the given item is the empty item for this hash table
66         */
location(item_type item,std::uint32_t loc_func_index)67         inline location_type location(item_type item, std::uint32_t loc_func_index) const
68         {
69             if (loc_func_index >= loc_funcs_.size())
70             {
71                 throw std::out_of_range("loc_func_index is out of range");
72             }
73             if (is_empty_item(item))
74             {
75                 throw std::invalid_argument("item cannot be the empty item");
76             }
77             return loc_funcs_[loc_func_index](item);
78         }
79 
80         /**
81         Returns all hash table locations that this item may be placed at.
82 
83         @throws std::invalid_argument if the given item is the empty item for this hash table
84         */
85         std::set<location_type> all_locations(item_type item) const;
86 
87         /**
88         Clears the hash table by filling every location with the empty item.
89         */
90         void clear_table();
91 
92         /**
93         Returns the number of location functions used by the hash table.
94         */
loc_func_count()95         inline std::uint32_t loc_func_count() const noexcept
96         {
97             return static_cast<std::uint32_t>(loc_funcs_.size());
98         }
99 
100         /**
101         Returns a reference to the hash table.
102         */
table()103         inline const std::vector<item_type> &table() const noexcept
104         {
105             return table_;
106         }
107 
108         /**
109         Returns a reference to a specific location in the hash table.
110 
111         @param[in] index The index in the hash table
112         @throws std::out_of_range if index is out of range
113         */
table(location_type index)114         inline const item_type &table(location_type index) const
115         {
116             if (index >= table_size_)
117             {
118                 throw std::out_of_range("index is out of range");
119             }
120             return table_[index];
121         }
122 
123         /**
124         Returns a reference to the stash.
125         */
stash()126         inline const std::vector<item_type> &stash() const noexcept
127         {
128             return stash_;
129         }
130 
131         /**
132         Returns a reference to a specific location in the stash.
133 
134         @param[in] index The index in the stash
135         @throws std::out_of_range if index is out of range
136         */
stash(location_type index)137         inline const item_type &stash(location_type index) const
138         {
139             if (index >= stash_size_)
140             {
141                 throw std::out_of_range("index is out of range");
142             }
143             if (index >= stash_.size() && index < stash_size_)
144             {
145                 return empty_item_;
146             }
147             return stash_[index];
148         }
149 
150         /**
151         Returns the size of the hash table.
152         */
table_size()153         inline table_size_type table_size() const noexcept
154         {
155             return table_size_;
156         }
157 
158         /**
159         Returns the size of the stash.
160         */
stash_size()161         inline table_size_type stash_size() const noexcept
162         {
163             return stash_size_;
164         }
165 
166         /**
167         Returns the 128-bit seed used for the location functions, represented as a hash table item.
168         */
loc_func_seed()169         inline item_type loc_func_seed() const noexcept
170         {
171             return loc_func_seed_;
172         }
173 
174         /**
175         Returns the maximum number of random walk steps taken in attempting to insert an item.
176         */
max_probe()177         inline std::uint64_t max_probe() const noexcept
178         {
179             return max_probe_;
180         }
181 
182         /**
183         Returns the hash table item that represents an empty location in the table.
184         */
empty_item()185         inline const item_type &empty_item() const noexcept
186         {
187             return empty_item_;
188         }
189 
190         /**
191         Returns whether a given location in the table is empty.
192 
193         @param[in] index The index in the hash table
194         @throws std::out_of_range if index is out of range
195         */
is_empty(location_type index)196         inline bool is_empty(location_type index) const noexcept
197         {
198             return is_empty_item(table(index));
199         }
200 
201         /**
202         Returns whether a given item is the empty item for this hash table.
203 
204         @param[in] item The item to compare to the empty item
205         */
is_empty_item(const item_type & item)206         inline bool is_empty_item(const item_type &item) const noexcept
207         {
208             return are_equal_item(item, empty_item_);
209         }
210 
211         /**
212         When the insert function fails to insert a hash table item, there is a leftover item that could not be inserted
213         into the table. This function will return the empty item if insertion never failed, and otherwise it will return
214         the latest leftover item. Note that due to how the random walk insertion process works, the leftover item is
215         usually not the same one that insert was called with.
216         */
leftover_item()217         inline item_type leftover_item() const noexcept
218         {
219             return leftover_item_;
220         }
221 
222         /**
223         Returns the current fill rate of the hash table and stash.
224         */
fill_rate()225         inline double fill_rate() const noexcept
226         {
227             return static_cast<double>(inserted_items_) /
228                    (static_cast<double>(table_size()) + static_cast<double>(stash_size_));
229         }
230 
231     private:
232         KukuTable(const KukuTable &copy) = delete;
233 
234         KukuTable &operator=(const KukuTable &assign) = delete;
235 
236         void generate_loc_funcs(std::uint32_t loc_func_count, item_type seed);
237 
238         /*
239         Swap an item in the table with a given item.
240         */
swap(item_type item,location_type location)241         inline item_type swap(item_type item, location_type location) noexcept
242         {
243             item_type old_item = table_[location];
244             table_[location] = item;
245             return old_item;
246         }
247 
248         /*
249         The hash table that holds all of the input data.
250         */
251         std::vector<item_type> table_;
252 
253         /*
254         The stash.
255         */
256         std::vector<item_type> stash_;
257 
258         /*
259         The hash functions.
260         */
261         std::vector<LocFunc> loc_funcs_;
262 
263         /*
264         The size of the table.
265         */
266         const table_size_type table_size_;
267 
268         /*
269         The size of the stash.
270         */
271         const table_size_type stash_size_;
272 
273         /*
274         Seed for the hash functions
275         */
276         const item_type loc_func_seed_;
277 
278         /*
279         The maximum number of attempts that are made to insert an item.
280         */
281         const std::uint64_t max_probe_;
282 
283         /*
284         An item value that denotes an empty item.
285         */
286         const item_type empty_item_;
287 
288         /*
289         Storage for an item that was evicted and could not be re-inserted. This
290         is populated when insert fails.
291         */
292         item_type leftover_item_;
293 
294         /*
295         The number of items that have been inserted to table or stash.
296         */
297         table_size_type inserted_items_;
298 
299         /*
300         Randomness source for location function sampling.
301         */
302         std::mt19937_64 gen_;
303 
304         std::uniform_int_distribution<std::uint32_t> u_;
305     };
306 
307     /**
308     The QueryResult class represents the result of a hash table query. It includes information about whether a queried
309     item was found in the hash table, its location in the hash table or stash (if found), and the index of the location
310     function (hash function) that was used to insert it. QueryResult objects are returned by the KukuTable::query
311     function.
312     */
313     class QueryResult
314     {
315         friend class KukuTable;
316 
317     public:
318         /**
319         Creates a QueryResult object.
320         */
321         QueryResult() = default;
322 
323         /**
324         Returns the hash table or stash location represented by this QueryResult.
325         */
location()326         inline location_type location() const noexcept
327         {
328             return location_;
329         }
330 
331         /**
332         Returns the index of the location function that was used to insert the queried item. This value is meaningless
333         when in_stash() is true. A value equal to max_loc_func_count indicates the item was not found in the table or
334         stash.
335         */
loc_func_index()336         inline std::uint32_t loc_func_index() const noexcept
337         {
338             return loc_func_index_;
339         }
340 
341         /**
342         Returns whether the queried item was found in the stash.
343         */
in_stash()344         inline bool in_stash() const noexcept
345         {
346             return !~loc_func_index_;
347         }
348 
349         /**
350         Returns whether the queried item was found in the hash table or in the stash.
351         */
found()352         inline bool found() const noexcept
353         {
354             return !(loc_func_index_ & ~(max_loc_func_count - 1)) || in_stash();
355         }
356 
357         /**
358         Returns whether the queried item was found in the hash table or in the stash.
359         */
360         inline operator bool() const noexcept
361         {
362             return found();
363         }
364 
365     private:
QueryResult(location_type location,std::uint32_t loc_func_index)366         QueryResult(location_type location, std::uint32_t loc_func_index)
367             : location_(location), loc_func_index_(loc_func_index)
368         {
369 #ifdef KUKU_DEBUG
370             if (location >= max_table_size)
371             {
372                 throw std::invalid_argument("invalid location");
373             }
374 #endif
375         }
376 
377         location_type location_ = 0;
378 
379         std::uint32_t loc_func_index_ = 0;
380     };
381 } // namespace kuku
382