1 /* Copyright 2017-present Facebook, Inc. 2 * Licensed under the Apache License, Version 2.0 */ 3 #pragma once 4 #include <chrono> 5 #include <deque> 6 #include <unordered_map> 7 #include "Future.h" 8 #include "make_unique.h" 9 #include "watchman_synchronized.h" 10 11 namespace watchman { 12 13 /** LRUCache implements a cache map with an LRU eviction policy. 14 * 15 * Items are retained in the cache until evicted. 16 * Eviction occurs when attempting to insert a new item into 17 * a full cache. Eviction will pick the least-recently-used 18 * item to erase. 19 * 20 * set(), get(), erase() methods are provided for performing 21 * immediate operations on the cache contents. A clear() 22 * method allows purging the entire contents at once. 23 * 24 * An alternative get() API is provided that allows the cache 25 * to be satisfied by some function returning a Future<ValueType>. 26 * This API has some thundering herd protection built-in; 27 * multiple requests for the same key will aggregate on one 28 * pending node that will fulfill the request later. 29 * Since this mode may experience failure the cache allows 30 * setting an errorTTL as a negative cache TTL. The failure 31 * of the lookup will be remembered for the specified duration 32 * before allowing a subsequent async get to be tried again. 33 * 34 * Thread safety: the cache exposes stored values to its client 35 * via a Node type. The Node is a const shared_ptr, making it 36 * immutable to the client. The invariant is that if a client 37 * has a reference to a Node, the value() of the node can 38 * not be mutated by some other client. For pending lookups, 39 * clients will never be handed a Node; only a Future<Node>. 40 * 41 * There is a single lock protecting all mutation of the cache 42 * and its nodes. Because the cache is LRU it needs to touch 43 * a node as part of a lookup to ensure that it will not 44 * be evicted prematurely. 
 */

template <typename KeyType, typename ValueType>
class LRUCache;

// Some of these class names are a bit too generic to stash
// in the typical "detail" namespace, so we have a more specific
// container here.
namespace lrucache {
template <typename Node>
class TailQHead;

/** The node tracks the value in the cache.
 * We use a shared_ptr to manage its lifetime safely.
 * Clients of the cache will only ever be handed a
 * shared_ptr<const Node>. Once published to a client,
 * the underlying value is immutable.
 */
template <typename KeyType, typename ValueType>
class Node {
  friend class TailQHead<Node<KeyType, ValueType>>;
  friend class LRUCache<KeyType, ValueType>;

 public:
  using PromiseList = std::deque<Promise<std::shared_ptr<const Node>>>;

  // Construct a node via LRUCache::set().
  // The value is immediately available; no promise list is needed.
  Node(const KeyType& key, ValueType&& value)
      : key_(key), value_(std::move(value)) {}

  // Construct a node using a getter function.
  // The value is empty and the promise list is initialized;
  // a non-null promises_ is what marks the node as "pending".
  explicit Node(const KeyType& key)
      : key_(key), promises_(make_unique<PromiseList>()) {}

  // Returns the underlying value.
  // Precondition: the lookup completed successfully (value_ holds a
  // value, not an error); Result::value() enforces this.
  const ValueType& value() const {
    return value_.value();
  }

  // Returns the underlying value container.
  // This is useful if you want to test for an error in a Future<> get.
  const Result<ValueType>& result() const {
    return value_;
  }

 private:
  // Obtain a future that will be ready when the pending
  // fetches are complete. Must only be called while the node is
  // pending (promises_ non-null) and under the cache lock.
  Future<std::shared_ptr<const Node>> subscribe() {
    promises_->emplace_back();
    return promises_->back().getFuture();
  }

  // Test whether an error result can be evicted.
  // Only error results carry a deadline; successful values never expire.
  bool expired(std::chrono::steady_clock::time_point now) const {
    return value_.hasError() && now >= deadline_;
  }

  // Intrusive tailq linkage (managed exclusively by TailQHead):
  // Address of next element
  Node* next_{nullptr};
  // Address of previous element's next_ field (or of the list head).
  // nullptr means the node has never been linked.
  Node** addressOfPreviousNext_{nullptr};

  // The key
  KeyType key_;
  // The stored value
  Result<ValueType> value_;

  // The collection of clients waiting for this node to be available.
  // This is a pointer so that we can minimize the space usage once
  // it has been fulfilled.
  std::unique_ptr<PromiseList> promises_;

  // Time after which this node is to be considered invalid
  // (only meaningful for error results; see expired()).
  std::chrono::steady_clock::time_point deadline_;
};

// A doubly-linked intrusive list through the cache nodes.
// O(1) append and O(1) removal.
// This is a non-owning list that is used to maintain the
// ordered sets used for eviction.
template <typename Node>
class TailQHead {
 public:
  // First element
  Node* first_;
  // Address of the "next" field in the last element
  // (points at first_ while the list is empty).
  Node** addressOfLastNext_;

 public:
  TailQHead() : first_(nullptr), addressOfLastNext_(&first_) {}

  // Link node at the tail end of the list.
  // Precondition: node is not currently linked into any list.
  void insertTail(Node* node) {
    node->next_ = nullptr;
    node->addressOfPreviousNext_ = addressOfLastNext_;
    *addressOfLastNext_ = node;
    addressOfLastNext_ = &node->next_;
  }

  void remove(Node* node) {
    if (!node->addressOfPreviousNext_) {
      // Not linked, NOP.
      return;
    }

    if (node->next_) {
      // Unlink from the middle: successor inherits our back-pointer.
      node->next_->addressOfPreviousNext_ = node->addressOfPreviousNext_;
    } else {
      // We were the tail: the list tail pointer moves back to our
      // predecessor's next_ field.
      addressOfLastNext_ = node->addressOfPreviousNext_;
    }
    *node->addressOfPreviousNext_ = node->next_;
  }

  // Bubble the node to the tail end of the list.
  void touch(Node* node) {
    remove(node);
    insertTail(node);
  }

  // Clear the list contents.
  // NOTE(review): node linkage fields are not reset here; callers
  // (LRUCache::clear) discard the nodes at the same time.
  void clear() {
    first_ = nullptr;
    addressOfLastNext_ = &first_;
  }

  // Returns a pointer to the first element.
  // May be nullptr if the tailq is empty.
  Node* head() {
    return first_;
  }
};

// Cache counters. All mutation happens under the cache's single lock.
struct Stats {
  // Number of times that a get resulted in a usable result
  size_t cacheHit{0};
  // Number of times that a get resulted in sharing a pending get
  size_t cacheShare{0};
  // Number of times that a get resulted in no usable result
  size_t cacheMiss{0};
  // Number of times that an item was evicted for any reason
  size_t cacheEvict{0};
  // Number of times that an item was inserted or replaced
  size_t cacheStore{0};
  // Number of times that a get was attempted
  size_t cacheLoad{0};
  // Number of times that an item was erased via erase()
  size_t cacheErase{0};
  // Number of times that the cache has been clear()'d
  size_t clearCount{0};

  // Resets every counter except clearCount, which is incremented
  // so that consumers can tell a reset apart from a fresh cache.
  void clear() {
    cacheHit = 0;
    cacheShare = 0;
    cacheMiss = 0;
    cacheEvict = 0;
    cacheStore = 0;
    cacheLoad = 0;
    cacheErase = 0;
    ++clearCount;
  }
};

// Factoring out the internal state struct here, as MSVC
// has a hard time compiling it otherwise.
210 template <typename KeyType, typename ValueType> 211 struct InternalState { 212 using NodeType = Node<KeyType, ValueType>; 213 // This owns the nodes in the map 214 std::unordered_map<KeyType, std::shared_ptr<NodeType>> map; 215 216 // Maintain some stats for cache introspection 217 Stats stats; 218 219 // To manage eviction we categorize a node into one of 220 // three sets and link it into the appropriate tailq 221 // below. A node belongs in only one set at a time. 222 223 // Nodes in the process of being fetched. We cannot 224 // evict anything in this set. Note that we don't 225 // strictly need to materialize this set (it can be 226 // inferred from node properties) but it is a nice 227 // invariant that any node always has a non-nullptr 228 // result from whichQ(). 229 TailQHead<NodeType> lookupOrder; 230 231 // Nodes with a successful result. Eviction policy 232 // is pure LRU. Nodes in this set are touched on 233 // access, causing the head to be the least recently 234 // used node in the set, and the tail to be the 235 // most recently used. 236 TailQHead<NodeType> evictionOrder; 237 238 // Nodes with an error result; these have a TTL 239 // that we must respect as a way to manage load. 240 // Nodes in this set are never touched; this means 241 // that the head of this list is always the node 242 // with the closest deadline. 243 TailQHead<NodeType> erroredOrder; 244 }; 245 246 } // namespace lrucache 247 248 struct CacheStats : public lrucache::Stats { CacheStatsCacheStats249 CacheStats(const lrucache::Stats s, size_t size) : Stats(s), size(size) {} 250 size_t size; 251 }; 252 253 // The cache. More information on this can be found at the 254 // top of this header file! 
template <typename KeyType, typename ValueType>
class LRUCache {
 public:
  using NodeType = lrucache::Node<KeyType, ValueType>;

 private:
  using State = lrucache::InternalState<KeyType, ValueType>;
  using LockedState = typename Synchronized<State>::LockedPtr;

 public:
  // Construct a cache with a defined limit and the specified
  // negative caching TTL duration. The errorTTL is measured
  // from the start of the lookup, not its completion.
  LRUCache(size_t maxItems, std::chrono::milliseconds errorTTL)
      : maxItems_(maxItems), errorTTL_(errorTTL) {}

  // No moving or copying
  LRUCache(const LRUCache&) = delete;
  LRUCache& operator=(const LRUCache&) = delete;
  LRUCache(LRUCache&&) = delete;
  LRUCache& operator=(LRUCache&&) = delete;

  // Lookup key and return the result.
  // If the key was not present then the result will be nullptr.
  // If there is an outstanding fetch in progress, throws an error;
  // you must consistently use the Future<> version of get() for
  // such a key.
  // `now` is injectable for testing; it defaults to the current time.
  std::shared_ptr<const NodeType> get(
      const KeyType& key,
      std::chrono::steady_clock::time_point now =
          std::chrono::steady_clock::now()) {
    auto state = state_.wlock();
    ++state->stats.cacheLoad;

    auto it = state->map.find(key);
    if (it == state->map.end()) {
      ++state->stats.cacheMiss;
      return nullptr;
    }

    auto node = it->second;
    auto q = whichQ(node.get(), state);

    // Remove expired item (only error results carry a deadline).
    // The local shared_ptr keeps the node alive across the erase.
    if (node->expired(now)) {
      state->map.erase(it);
      q->remove(node.get());
      ++state->stats.cacheMiss;
      ++state->stats.cacheEvict;
      return nullptr;
    }

    if (q == &state->lookupOrder) {
      // There's no safe way to allow this mode of fetch to subscribe
      // to the pending lookup. The caller should use the getter
      // flavor exclusively for this key.
      throw std::runtime_error("mixing Future getter with direct getter");
    }

    if (q == &state->evictionOrder) {
      // Only successful results participate in LRU ordering;
      // errored nodes keep their deadline order.
      q->touch(node.get());
    }

    ++state->stats.cacheHit;
    return node;
  }

  // Lookup key using a getter function.
  // If the key is not present in the cache, initiates a (possible async)
  // load of that value using the supplied getter function.
  // The getter function is a functor which behaves like:
  //    std::function<Future<ValueType>(const KeyType&)>
  //
  // If the key is present but not yet satisfied, append a Promise
  // to the chain in the node and return a Future that will yield the
  // node once the lookup is complete.
  //
  // If the key is present and satisfied, return a Future that will
  // immediately yield the node.
  //
  // The purpose of this function is to reduce the exposure to
  // "thundering herd" style problems. In watchman we tend to have
  // limited contention for the same key but may have a great many
  // requests for different keys; it is desirable to collapse 2x or 3x
  // the requests for the same key into a single request.
  //
  // It is not safe to delete the LRUCache while there are outstanding
  // getter calls. It is the responsibility of the caller to ensure
  // that this does not happen.
  template <typename Func>
  Future<std::shared_ptr<const NodeType>> get(
      const KeyType& key,
      Func&& getter,
      std::chrono::steady_clock::time_point now =
          std::chrono::steady_clock::now()) {
    std::shared_ptr<NodeType> node;
    Future<std::shared_ptr<const NodeType>> future;

    // Only hold the lock on the state while we set up the map entry.
    {
      auto state = state_.wlock();
      ++state->stats.cacheLoad;

      auto it = state->map.find(key);
      if (it != state->map.end()) {
        node = it->second;

        auto q = whichQ(node.get(), state);

        if (!node->expired(now)) {
          // Only touch successful nodes
          if (q == &state->evictionOrder) {
            q->touch(node.get());
          }

          if (node->promises_) {
            // Not yet satisfied, so chain on a promise
            ++state->stats.cacheShare;
            return node->subscribe();
          }

          // Available now
          ++state->stats.cacheHit;
          return makeFuture<std::shared_ptr<const NodeType>>(node);
        }

        // Remove invalid node.
        // We can't re-use it without introducing locking in
        // the node itself.
        q->remove(node.get());
        state->map.erase(it);
        ++state->stats.cacheEvict;
      }

      // Try to make a new node; this can fail (throw) if we are too full.
      node = makeNode(state, now, key);

      // Insert into map and the appropriate tailq; the key-only Node
      // constructor marks this node as pending, so whichQ() would
      // place it in lookupOrder.
      state->map.emplace(std::make_pair(node->key_, node));
      state->lookupOrder.insertTail(node.get());

      ++state->stats.cacheMiss;
      // and ensure that we capture a subscription before we release
      // the lock!
      future = node->subscribe();
    }

    // Arrange for the value to be populated.
    // This is done outside the lock in case the getter is a
    // simple callback that executes immediately in our context;
    // if that were to then try to operate on the cache, it would
    // deadlock.
    // NOTE(review): if getter(key) itself throws synchronously, the
    // pending node would be left in lookupOrder with unfulfilled
    // promises -- presumably getters report failure via the returned
    // Future; confirm against callers.
    getter(key).then([node, this, now](Result<ValueType>&& result) {
      // We're going to steal the promises so that we can fulfil
      // them outside of the lock
      std::unique_ptr<typename NodeType::PromiseList> promises;
      {
        auto state = state_.wlock();
        node->value_ = std::move(result);

        // Invariant check: the node must still be pending
        // (promises_ non-null keeps it in lookupOrder).
        if (whichQ(node.get(), state) != &state->lookupOrder) {
          // Should never happen...
          abort();
        }

        ++state->stats.cacheStore;

        // We're no longer looking this up; we'll put it in the
        // correct bucket just before we release the lock below.
        state->lookupOrder.remove(node.get());

        // We only need a TTL for errors
        if (node->value_.hasError()) {
          // Note that we don't account for the time it takes to
          // arrive at the error condition in this TTL. This
          // is semi-deliberate; for the sake of testing we
          // are passing `now` through from outside.
          node->deadline_ = now + errorTTL_;
        }

        // Steal the promises; we will fulfill them below
        // after we've released the lock.
        std::swap(promises, node->promises_);

        // Invariant check: with promises_ now null and value_ set,
        // whichQ must classify this node as errored or successful.
        if (whichQ(node.get(), state) == &state->lookupOrder) {
          // Should never happen...
          abort();
        }

        // Now that the promises have been stolen, insert into
        // the appropriate queue.
        whichQ(node.get(), state)->insertTail(node.get());
      }

      // Wake up all waiters
      for (auto& p : *promises) {
        p.setValue(node);
      }
    });

    return future;
  }

  // Explicitly set the value for a key.
  // This will create or update a node as appropriate.
  // This may fail (throw) if there is no space and no items can be evicted.
  std::shared_ptr<const NodeType> set(
      const KeyType& key,
      ValueType&& value,
      std::chrono::steady_clock::time_point now =
          std::chrono::steady_clock::now()) {
    auto state = state_.wlock();
    auto it = state->map.find(key);

    if (it != state->map.end()) {
      // Remove this item. We can't update the value in the
      // item without introducing per-node locks, so we simply
      // allow any external references to see their immutable
      // value while we insert a new value in the map.

      // Note that we don't check node->expired() here like
      // we do in the get() path. The assumption is that the
      // caller is deliberately replacing an errored node
      // with some valid value.
      auto oldNode = it->second;
      whichQ(oldNode.get(), state)->remove(oldNode.get());
      state->map.erase(it);
      ++state->stats.cacheEvict;
    }

    auto node = makeNode(state, now, key, std::move(value));
    state->map.emplace(std::make_pair(node->key_, node));
    whichQ(node.get(), state)->insertTail(node.get());
    ++state->stats.cacheStore;

    return node;
  }

  // Erase the entry associated with key.
  // Returns the node if it was present, else nullptr.
  std::shared_ptr<const NodeType> erase(const KeyType& key) {
    auto state = state_.wlock();
    auto it = state->map.find(key);
    if (it == state->map.end()) {
      return nullptr;
    }

    auto node = it->second;
    // Note that we don't check node->expired() here like
    // we do in the get() path. The assumption is that the
    // caller is deliberately invalidating an errored node.
    whichQ(node.get(), state)->remove(node.get());
    state->map.erase(it);
    ++state->stats.cacheErase;

    return node;
  }

  // Returns the number of cached items
  size_t size() const {
    auto state = state_.rlock();
    return state->map.size();
  }

  // Returns cache statistics
  CacheStats stats() const {
    auto state = state_.rlock();
    return CacheStats(state->stats, state->map.size());
  }

  // Purge all of the entries from the cache.
  // Outstanding shared_ptr<const Node> references held by clients
  // remain valid; only the cache's own references are dropped.
  void clear() {
    auto state = state_.wlock();
    state->evictionOrder.clear();
    state->erroredOrder.clear();
    state->lookupOrder.clear();
    state->map.clear();
    state->stats.clear();
  }

 private:
  // Small helper for creating a new Node. This checks for capacity
  // and attempts to evict an item to make room if needed.
  // The eviction may fail in some cases, which results in this method
  // also throwing. Must be called with the state lock held.
  template <typename... Args>
  std::shared_ptr<NodeType> makeNode(
      LockedState& state,
      std::chrono::steady_clock::time_point now,
      Args&&... args) {
    // If we are too full, try to evict an item; this may throw if there are no
    // evictable items!
    if (state->map.size() + 1 > maxItems_) {
      evictOne(state, now);
    }

    return std::make_shared<NodeType>(std::forward<Args>(args)...);
  }

  // Returns the queue into which the node should be placed (for new nodes),
  // or should currently be linked into (for existing nodes).
  lrucache::TailQHead<NodeType>* whichQ(NodeType* node, LockedState& state) {
    // A live promise list means a fetch is still outstanding.
    if (node->promises_) {
      return &state->lookupOrder;
    }

    // No value here means an error result: nodes without promises
    // always have their Result assigned before reaching this point
    // (see the getter completion path and the Node constructors).
    if (!node->value_.hasValue()) {
      return &state->erroredOrder;
    }

    return &state->evictionOrder;
  }

  // Attempt to evict a single item to make space for a new Node.
  // May throw if there are too many errored or pending lookups.
  void evictOne(LockedState& state, std::chrono::steady_clock::time_point now) {
    // Since errors have a TTL (as opposed to infinite), try to
    // evict one of those first. That keeps the cache focused
    // on usable items rather than prematurely evicting something
    // that might be useful again in the future.
    auto node = state->erroredOrder.head();
    if (node && node->expired(now)) {
      state->erroredOrder.remove(node);
      // Erase from the map last, as this will invalidate node
      state->map.erase(node->key_);
      ++state->stats.cacheEvict;
      return;
    }

    // Second choice is to evict a successful item
    node = state->evictionOrder.head();
    if (node) {
      state->evictionOrder.remove(node);
      // Erase from the map last, as this will invalidate node
      state->map.erase(node->key_);
      ++state->stats.cacheEvict;
      return;
    }

    // There are no evictable items to purge, so we are too full.
    // This is a signal that there is a throughput problem. Rather than
    // queueing up some more work, we just fail this request; this acts a
    // brake when the system is swamped.
    throw std::runtime_error("too many pending cache jobs");
  }

  // The maximum allowed capacity
  const size_t maxItems_;
  // How long to cache items that have an error Result
  const std::chrono::milliseconds errorTTL_;
  // Single lock guarding the map, the tailqs, and the stats.
  Synchronized<State> state_;
};
} // namespace watchman