// Copyright (c) 2016 Jeremy Rubin
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.

#ifndef BITCOIN_CUCKOOCACHE_H
#define BITCOIN_CUCKOOCACHE_H

#include <algorithm> // std::find
#include <array>
#include <atomic>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <utility>
#include <vector>


/** High-performance cache primitives.
 *
 * Summary:
 *
 * 1. @ref bit_packed_atomic_flags is bit-packed atomic flags for garbage collection
 *
 * 2. @ref cache is a cache which is performant in memory usage and lookup speed. It
 * is lockfree for erase operations. Elements are lazily erased on the next insert.
 */
namespace CuckooCache
{
/** @ref bit_packed_atomic_flags implements a container for garbage collection flags
 * that is only thread unsafe on calls to setup. This class bit-packs collection
 * flags for memory efficiency.
 *
 * All operations are `std::memory_order_relaxed` so external mechanisms must
 * ensure that writes and reads are properly synchronized.
 *
 * On setup(n), all bits up to `n` are marked as collected.
 *
 * Under the hood, because it is an 8-bit type, it makes sense to use a multiple
 * of 8 for setup, but it will be safe if that is not the case as well.
 */
class bit_packed_atomic_flags
{
    std::unique_ptr<std::atomic<uint8_t>[]> mem;

public:
    /** No default constructor, as there must be some size. */
    bit_packed_atomic_flags() = delete;

    /**
     * bit_packed_atomic_flags constructor creates memory to sufficiently
     * keep track of garbage collection information for `size` entries.
     *
     * @param size the number of elements to allocate space for
     *
     * @post bit_set, bit_unset, and bit_is_set function properly forall x. x <
     * size
     * @post All calls to bit_is_set (without subsequent bit_unset) will return
     * true.
     */
    explicit bit_packed_atomic_flags(uint32_t size)
    {
        // pad out the size if needed
        size = (size + 7) / 8;
        mem.reset(new std::atomic<uint8_t>[size]);
        for (uint32_t i = 0; i < size; ++i)
            mem[i].store(0xFF);
    }

    /** setup marks all entries and ensures that bit_packed_atomic_flags can store
     * at least `b` entries.
     *
     * @param b the number of elements to allocate space for
     * @post bit_set, bit_unset, and bit_is_set function properly forall x. x <
     * b
     * @post All calls to bit_is_set (without subsequent bit_unset) will return
     * true.
     */
    inline void setup(uint32_t b)
    {
        bit_packed_atomic_flags d(b);
        std::swap(mem, d.mem);
    }

    /** bit_set sets an entry as discardable.
     *
     * @param s the index of the entry to bit_set
     * @post immediately subsequent call (assuming proper external memory
     * ordering) to bit_is_set(s) == true.
     */
    inline void bit_set(uint32_t s)
    {
        mem[s >> 3].fetch_or(1 << (s & 7), std::memory_order_relaxed);
    }

    /** bit_unset marks an entry as something that should not be overwritten.
     *
     * @param s the index of the entry to bit_unset
     * @post immediately subsequent call (assuming proper external memory
     * ordering) to bit_is_set(s) == false.
     */
    inline void bit_unset(uint32_t s)
    {
        mem[s >> 3].fetch_and(~(1 << (s & 7)), std::memory_order_relaxed);
    }

    /** bit_is_set queries the table for discardability at `s`.
     *
     * @param s the index of the entry to read
     * @returns true if the bit at index `s` was set, false otherwise
     */
    inline bool bit_is_set(uint32_t s) const
    {
        return (1 << (s & 7)) & mem[s >> 3].load(std::memory_order_relaxed);
    }
};
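/* For illustration: index `s` lives in byte `s >> 3` at bit position `s & 7`,
 * so eight flags share a single std::atomic<uint8_t>. A minimal usage sketch
 * of the class above (the size and index are arbitrary):
 *
 * ```
 * CuckooCache::bit_packed_atomic_flags flags(100); // all 100 flags start set
 * bool discardable = flags.bit_is_set(42);         // true: marked at construction
 * flags.bit_unset(42);                             // protect entry 42 from overwrite
 * discardable = flags.bit_is_set(42);              // now false
 * flags.bit_set(42);                               // allow entry 42 to be discarded again
 * ```
 */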
/** @ref cache implements a cache with properties similar to a cuckoo-set.
 *
 * The cache is able to hold up to `(~(uint32_t)0) - 1` elements.
 *
 * Read Operations:
 *     - contains() for `erase=false`
 *
 * Read+Erase Operations:
 *     - contains() for `erase=true`
 *
 * Erase Operations:
 *     - allow_erase()
 *
 * Write Operations:
 *     - setup()
 *     - setup_bytes()
 *     - insert()
 *     - please_keep()
 *
 * Synchronization Free Operations:
 *     - invalid()
 *     - compute_hashes()
 *
 * User Must Guarantee:
 *
 * 1. Write requires synchronized access (e.g. a lock)
 * 2. Read requires no concurrent Write, synchronized with last insert.
 * 3. Erase requires no concurrent Write, synchronized with last insert.
 * 4. An Erase caller must release all memory before allowing a new Writer.
 *
 *
 * Note on function names:
 *   - The name "allow_erase" is used because the real discard happens later.
 *   - The name "please_keep" is used because elements may be erased anyway on insert.
 *
 * @tparam Element should be a movable and copyable type
 * @tparam Hash should be a function/callable which takes a template parameter
 * hash_select and an Element and extracts a hash from it. Should return
 * high-entropy uint32_t hashes for `Hash h; h<0>(e) ... h<7>(e)`.
 */
template <typename Element, typename Hash>
class cache
{
private:
    /** table stores all the elements */
    std::vector<Element> table;

    /** size stores the total available slots in the hash table */
    uint32_t size;

    /** The bit_packed_atomic_flags array is marked mutable because we want
     * garbage collection to be allowed to occur from const methods */
    mutable bit_packed_atomic_flags collection_flags;

    /** epoch_flags tracks how recently an element was inserted into
     * the cache. true denotes recent, false denotes not-recent. See insert()
     * method for full semantics.
     */
    mutable std::vector<bool> epoch_flags;

    /** epoch_heuristic_counter is used to determine when an epoch might be aged
     * & an expensive scan should be done. epoch_heuristic_counter is
     * decremented on insert and reset to the new number of inserts which would
     * cause the epoch to reach epoch_size when it reaches zero.
     */
    uint32_t epoch_heuristic_counter;

    /** epoch_size is set to be the number of elements supposed to be in an
     * epoch. When the number of non-erased elements in an epoch
     * exceeds epoch_size, a new epoch should be started and all
     * current entries demoted. epoch_size is set to be 45% of size because
     * we want to keep load around 90%, and we support 3 epochs at once --
     * one "dead" which has been erased, one "dying" which has been marked to be
     * erased next, and one "living" which new inserts add to.
     */
    uint32_t epoch_size;

    /** depth_limit determines how many elements insert should try to replace.
     * Should be set to log2(n).
     */
    uint8_t depth_limit;

    /** hash_function is a const instance of the hash function. It cannot be
     * static or initialized at call time as it may have internal state (such as
     * a nonce).
     */
    const Hash hash_function;
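    /* To make the @tparam Hash contract concrete, here is a sketch of a
     * conforming hasher for a hypothetical `Element = uint64_t`. The name
     * `ExampleHasher` and the multiply-xorshift mixing are assumptions for
     * illustration only (not cryptographic, not part of this header):
     *
     * ```
     * struct ExampleHasher {
     *     uint64_t salt{0x2545F4914F6CDD1DULL}; // would normally be randomized
     *
     *     template <uint8_t hash_select>
     *     uint32_t operator()(const uint64_t& e) const
     *     {
     *         // Mix the selector into the state so the eight hashes differ.
     *         uint64_t h = (e + hash_select) * 0x9E3779B97F4A7C15ULL ^ salt;
     *         h ^= h >> 32;
     *         h *= 0xD6E8FEB86659FD93ULL;
     *         h ^= h >> 32;
     *         return static_cast<uint32_t>(h);
     *     }
     * };
     *
     * CuckooCache::cache<uint64_t, ExampleHasher> c;
     * c.setup_bytes(1 << 20);                   // ~1 MiB for the element table
     * c.insert(0xDEADBEEF);
     * bool hit = c.contains(0xDEADBEEF, false); // true unless since evicted
     * ```
     */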
    /** compute_hashes is convenience for not having to write out this
     * expression everywhere we use the hash values of an Element.
     *
     * We need to map the 32-bit input hash onto a hash bucket in a range [0, size) in a
     * manner which preserves as much of the hash's uniformity as possible. Ideally
     * this would be done by bitmasking but the size is usually not a power of two.
     *
     * The naive approach would be to use a mod -- which isn't perfectly uniform but so
     * long as the hash is much larger than size it is not that bad. Unfortunately,
     * mod/division is fairly slow on ordinary microprocessors (e.g. 90-ish cycles on
     * Haswell, ARM doesn't even have an instruction for it); when the divisor is a
     * constant the compiler will do clever tricks to turn it into a multiply+add+shift,
     * but size is a run-time value so the compiler can't do that here.
     *
     * One option would be to implement the same trick the compiler uses and compute the
     * constants for exact division based on the size, as described in "{N}-bit Unsigned
     * Division via {N}-bit Multiply-Add" by Arch D. Robison in 2005. But that code is
     * somewhat complicated and the result is still slower than other options:
     *
     * Instead we treat the 32-bit random number as a Q32 fixed-point number in the range
     * [0, 1) and simply multiply it by the size. Then we just shift the result down by
     * 32 bits to get our bucket number. The result has non-uniformity the same as a
     * mod, but it is much faster to compute. More about this technique can be found at
     * https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ .
     *
     * The resulting non-uniformity is also more equally distributed which would be
     * advantageous for something like linear probing, though it shouldn't matter
     * one way or the other for a cuckoo table.
     *
     * The primary disadvantage of this approach is that increased intermediate
     * precision is required, but for a 32-bit random number we only need the high
     * 32 bits of a 32*32->64 multiply, which means the operation is reasonably
     * fast even on a typical 32-bit processor.
     *
     * @param e The element whose hashes will be returned
     * @returns Deterministic hashes derived from `e` uniformly mapped onto the range [0, size)
     */
    inline std::array<uint32_t, 8> compute_hashes(const Element& e) const
    {
        return {{(uint32_t)(((uint64_t)hash_function.template operator()<0>(e) * (uint64_t)size) >> 32),
                 (uint32_t)(((uint64_t)hash_function.template operator()<1>(e) * (uint64_t)size) >> 32),
                 (uint32_t)(((uint64_t)hash_function.template operator()<2>(e) * (uint64_t)size) >> 32),
                 (uint32_t)(((uint64_t)hash_function.template operator()<3>(e) * (uint64_t)size) >> 32),
                 (uint32_t)(((uint64_t)hash_function.template operator()<4>(e) * (uint64_t)size) >> 32),
                 (uint32_t)(((uint64_t)hash_function.template operator()<5>(e) * (uint64_t)size) >> 32),
                 (uint32_t)(((uint64_t)hash_function.template operator()<6>(e) * (uint64_t)size) >> 32),
                 (uint32_t)(((uint64_t)hash_function.template operator()<7>(e) * (uint64_t)size) >> 32)}};
    }
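    /* The reduction above, shown in isolation (a minimal sketch; `fastrange32`
     * is a hypothetical free-standing name, not part of this class):
     *
     * ```
     * // Map a uniform 32-bit hash x onto [0, n) without division.
     * inline uint32_t fastrange32(uint32_t x, uint32_t n)
     * {
     *     return (uint32_t)(((uint64_t)x * (uint64_t)n) >> 32);
     * }
     * // e.g. n = 1000: x = 0x00000000 -> bucket 0
     * //                x = 0x80000000 -> bucket 500 (x / 2^32 = 0.5; 0.5 * 1000)
     * //                x = 0xFFFFFFFF -> bucket 999
     * ```
     */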
    /** invalid returns a special index that can never be inserted to
     * @returns the special constexpr index that can never be inserted to */
    constexpr uint32_t invalid() const
    {
        return ~(uint32_t)0;
    }

    /** allow_erase marks the element at index `n` as discardable. Threadsafe
     * without any concurrent insert.
     * @param n the index to allow erasure of
     */
    inline void allow_erase(uint32_t n) const
    {
        collection_flags.bit_set(n);
    }

    /** please_keep marks the element at index `n` as an entry that should be kept.
     * Threadsafe without any concurrent insert.
     * @param n the index to prioritize keeping
     */
    inline void please_keep(uint32_t n) const
    {
        collection_flags.bit_unset(n);
    }

    /** epoch_check handles the changing of epochs for elements stored in the
     * cache. epoch_check should be run before every insert.
     *
     * First, epoch_check decrements and checks the cheap heuristic, and then does
     * a more expensive scan if the cheap heuristic runs out. If the expensive
     * scan succeeds, the epochs are aged and old elements are allow_erased. The
     * cheap heuristic is reset to retrigger after the worst case growth of the
     * current epoch's elements would exceed the epoch_size.
     */
    void epoch_check()
    {
        if (epoch_heuristic_counter != 0) {
            --epoch_heuristic_counter;
            return;
        }
        // count the number of elements from the latest epoch which
        // have not been erased.
        uint32_t epoch_unused_count = 0;
        for (uint32_t i = 0; i < size; ++i)
            epoch_unused_count += epoch_flags[i] &&
                                  !collection_flags.bit_is_set(i);
        // If there are more non-deleted entries in the current epoch than the
        // epoch size, then allow_erase on all elements in the old epoch (marked
        // false) and move all elements in the current epoch to the old epoch
        // but do not call allow_erase on their indices.
        if (epoch_unused_count >= epoch_size) {
            for (uint32_t i = 0; i < size; ++i)
                if (epoch_flags[i])
                    epoch_flags[i] = false;
                else
                    allow_erase(i);
            epoch_heuristic_counter = epoch_size;
        } else
            // reset the epoch_heuristic_counter to next do a scan when worst
            // case behavior (no intermittent erases) would exceed epoch size,
            // with a reasonable minimum scan size.
            // Ordinarily, we would have to sanity check std::min(epoch_size,
            // epoch_unused_count), but we already know that `epoch_unused_count
            // < epoch_size` in this branch
            epoch_heuristic_counter = std::max(1u, std::max(epoch_size / 16,
                                                            epoch_size - epoch_unused_count));
    }
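    /* Worked example of the heuristic above, with assumed numbers: suppose
     * size == 1000, so epoch_size == 450 (45% of size). If a scan finds
     * epoch_unused_count == 400 live entries in the current epoch, no aging
     * happens and the counter is reset to
     * max(1, max(450 / 16, 450 - 400)) == max(1, max(28, 50)) == 50,
     * i.e. the next scan runs after 50 more inserts, the soonest the current
     * epoch could possibly overflow. If instead the scan finds 450 or more
     * live entries, the old epoch is erased, the current epoch is demoted,
     * and the counter is reset to a full epoch_size of 450.
     */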
public:
    /** You must always construct a cache with some elements via a subsequent
     * call to setup or setup_bytes, otherwise operations may segfault.
     */
    cache() : table(), size(), collection_flags(0), epoch_flags(),
              epoch_heuristic_counter(), epoch_size(), depth_limit(0), hash_function()
    {
    }

    /** setup initializes the container to store no more than new_size
     * elements.
     *
     * setup should only be called once.
     *
     * @param new_size the desired number of elements to store
     * @returns the maximum number of elements storable
     */
    uint32_t setup(uint32_t new_size)
    {
        // depth_limit must be at least one otherwise errors can occur.
        depth_limit = static_cast<uint8_t>(std::log2(static_cast<float>(std::max((uint32_t)2, new_size))));
        size = std::max<uint32_t>(2, new_size);
        table.resize(size);
        collection_flags.setup(size);
        epoch_flags.resize(size);
        // Set to 45% as described above
        epoch_size = std::max((uint32_t)1, (45 * size) / 100);
        // Initially set to wait for a whole epoch
        epoch_heuristic_counter = epoch_size;
        return size;
    }

    /** setup_bytes is a convenience function which accounts for internal memory
     * usage when deciding how many elements to store. It isn't perfect because
     * it doesn't account for any overhead (struct size, MallocUsage, collection
     * and epoch flags). This was done to simplify selecting a power of two
     * size. In the expected use case, an extra two bits per entry should be
     * negligible compared to the size of the elements.
     *
     * @param bytes the approximate number of bytes to use for this data
     * structure
     * @returns the maximum number of elements storable (see setup()
     * documentation for more detail)
     */
    uint32_t setup_bytes(size_t bytes)
    {
        return setup(bytes/sizeof(Element));
    }
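    /* Worked example with assumed numbers: for a 32 MiB budget and
     * sizeof(Element) == 32, setup_bytes(32 << 20) calls
     * setup(33554432 / 32) == setup(1048576), so the table holds 2^20
     * elements, depth_limit == log2(2^20) == 20, and
     * epoch_size == 45% of 1048576 == 471859.
     */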
    /** insert loops at most depth_limit times trying to insert a hash
     * at various locations in the table via a variant of the Cuckoo Algorithm
     * with eight hash locations.
     *
     * It drops the last tried element if it runs out of depth before
     * encountering an open slot.
     *
     * Thus:
     *
     * ```
     * insert(x);
     * return contains(x, false);
     * ```
     *
     * is not guaranteed to return true.
     *
     * @param e the element to insert
     * @post one of the following: All previously inserted elements and e are
     * now in the table, one previously inserted element is evicted from the
     * table, the entry attempted to be inserted is evicted.
     */
    inline void insert(Element e)
    {
        epoch_check();
        uint32_t last_loc = invalid();
        bool last_epoch = true;
        std::array<uint32_t, 8> locs = compute_hashes(e);
        // Make sure we have not already inserted this element
        // If we have, make sure that it does not get deleted
        for (const uint32_t loc : locs)
            if (table[loc] == e) {
                please_keep(loc);
                epoch_flags[loc] = last_epoch;
                return;
            }
        for (uint8_t depth = 0; depth < depth_limit; ++depth) {
            // First try to insert to an empty slot, if one exists
            for (const uint32_t loc : locs) {
                if (!collection_flags.bit_is_set(loc))
                    continue;
                table[loc] = std::move(e);
                please_keep(loc);
                epoch_flags[loc] = last_epoch;
                return;
            }
            /** Swap with the element at the location that was
             * not the last one looked at. Example:
             *
             * 1. On first iteration, last_loc == invalid(), find returns last
             *    (an offset of 8), so last_loc defaults to
             *    locs[(1 + 8) & 7] == locs[1].
             * 2. On further iterations, where last_loc == locs[k], last_loc will
             *    go to locs[(k+1) % 8], i.e., the next of the 8 indices wrapping
             *    around to 0 if needed.
             *
             * This prevents moving the element we just put in.
             *
             * The swap is not a move -- we must switch onto the evicted element
             * for the next iteration.
             */
            last_loc = locs[(1 + (std::find(locs.begin(), locs.end(), last_loc) - locs.begin())) & 7];
            std::swap(table[last_loc], e);
            // Can't std::swap a std::vector<bool>::reference and a bool&.
            bool epoch = last_epoch;
            last_epoch = epoch_flags[last_loc];
            epoch_flags[last_loc] = epoch;

            // Recompute the locs -- unfortunately happens one too many times!
            locs = compute_hashes(e);
        }
    }

    /** contains iterates through the hash locations for a given element
     * and checks to see if it is present.
     *
     * contains does not check garbage collected state (in other words,
     * garbage is only collected when the space is needed), so:
     *
     * ```
     * insert(x);
     * if (contains(x, true))
     *     return contains(x, false);
     * else
     *     return true;
     * ```
     *
     * executed on a single thread will always return true!
     *
     * This is a great property for re-org performance, for example.
     *
     * contains returns a bool set true if the element was found.
     *
     * @param e the element to check
     * @param erase whether to attempt setting the garbage collect flag
     *
     * @post if erase is true and the element is found, then the garbage collect
     * flag is set
     * @returns true if the element is found, false otherwise
     */
    inline bool contains(const Element& e, const bool erase) const
    {
        std::array<uint32_t, 8> locs = compute_hashes(e);
        for (const uint32_t loc : locs)
            if (table[loc] == e) {
                if (erase)
                    allow_erase(loc);
                return true;
            }
        return false;
    }
};
} // namespace CuckooCache

#endif // BITCOIN_CUCKOOCACHE_H