/* Copyright 2017-present Facebook, Inc.
 * Licensed under the Apache License, Version 2.0 */
#pragma once
#include <chrono>
#include <deque>
#include <unordered_map>
#include "Future.h"
#include "make_unique.h"
#include "watchman_synchronized.h"

namespace watchman {

/** LRUCache implements a cache map with an LRU eviction policy.
 *
 * Items are retained in the cache until evicted.
 * Eviction occurs when attempting to insert a new item into
 * a full cache.  Eviction picks the least-recently-used
 * item to erase.
 *
 * set(), get(), and erase() methods are provided for performing
 * immediate operations on the cache contents.  A clear()
 * method allows purging the entire contents at once.
 *
 * An alternative get() API allows the cache to be satisfied
 * by some function returning a Future<ValueType>.
 * This API has some thundering herd protection built in;
 * multiple requests for the same key will aggregate on one
 * pending node that fulfills the requests later.
 * Since such a lookup may fail, the cache allows setting an
 * errorTTL that acts as a negative-cache TTL: the failure
 * of a lookup is remembered for the specified duration
 * before a subsequent async get is allowed to try again.
 *
 * Thread safety: the cache exposes stored values to its clients
 * via a Node type.  The Node is a const shared_ptr, making it
 * immutable to the client.  The invariant is that if a client
 * holds a reference to a Node, the value() of that node cannot
 * be mutated by another client.  For pending lookups,
 * clients are never handed a Node, only a Future<Node>.
 *
 * A single lock protects all mutation of the cache
 * and its nodes.  Because the cache is LRU, it needs to touch
 * a node as part of a lookup to ensure that the node will not
 * be evicted prematurely.
 */
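
// Example usage, as an illustrative sketch only.  It assumes a cache
// mapping std::string to int; doSomethingWith() is a hypothetical
// consumer:
//
//   LRUCache<std::string, int> cache(
//       /*maxItems=*/100, /*errorTTL=*/std::chrono::seconds(5));
//   cache.set("answer", 42);
//   if (auto node = cache.get("answer")) {
//     // The node's value is immutable while we hold the reference.
//     doSomethingWith(node->value());
//   }
//   cache.erase("answer");
//   cache.clear();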

template <typename KeyType, typename ValueType>
class LRUCache;

// Some of these class names are a bit too generic to stash
// in the typical "detail" namespace, so we have a more specific
// container here.
namespace lrucache {
template <typename Node>
class TailQHead;

/** The node tracks the value in the cache.
 * We use a shared_ptr to manage its lifetime safely.
 * Clients of the cache will only ever be handed a
 * shared_ptr<const Node>.  Once published to a client,
 * the underlying value is immutable.
 */
template <typename KeyType, typename ValueType>
class Node {
  friend class TailQHead<Node<KeyType, ValueType>>;
  friend class LRUCache<KeyType, ValueType>;

 public:
  using PromiseList = std::deque<Promise<std::shared_ptr<const Node>>>;

  // Construct a node via LRUCache::set()
  Node(const KeyType& key, ValueType&& value)
      : key_(key), value_(std::move(value)) {}

  // Construct a node using a getter function.
  // The value is empty and the promise list is initialized.
  explicit Node(const KeyType& key)
      : key_(key), promises_(make_unique<PromiseList>()) {}

  // Returns the underlying value.
  const ValueType& value() const {
    return value_.value();
  }

  // Returns the underlying value container.
  // This is useful if you want to test for an error in a Future<> get.
  const Result<ValueType>& result() const {
    return value_;
  }

 private:
  // Obtain a future that will be ready when the pending
  // fetches are complete.
  Future<std::shared_ptr<const Node>> subscribe() {
    promises_->emplace_back();
    return promises_->back().getFuture();
  }

  // Test whether an error result has expired and can be evicted
  bool expired(std::chrono::steady_clock::time_point now) const {
    return value_.hasError() && now >= deadline_;
  }

  // Address of the next element
  Node* next_{nullptr};
  // Address of the previous element's next_ field
  Node** addressOfPreviousNext_{nullptr};

  // The key
  KeyType key_;
  // The stored value
  Result<ValueType> value_;

  // The collection of clients waiting for this node to be available.
  // This is a pointer so that we can minimize the space usage once
  // it has been fulfilled.
  std::unique_ptr<PromiseList> promises_;

  // Time after which this node is to be considered invalid
  std::chrono::steady_clock::time_point deadline_;
};

// A doubly-linked intrusive list through the cache nodes.
// O(1) append and O(1) removal.
// This is a non-owning list that is used to maintain the
// ordered sets used for eviction.
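//
// Rather than storing a `prev` pointer, each node stores the address
// of the previous element's next_ field (addressOfPreviousNext_), so
// removal never needs to walk the list or special-case the head.  For
// example, in the list A -> B -> C, B's addressOfPreviousNext_ points
// at A's next_ field, and remove(B) is just two pointer stores.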
template <typename Node>
class TailQHead {
 public:
  // First element
  Node* first_;
  // Address of the "next" field in the last element
  Node** addressOfLastNext_;

 public:
  TailQHead() : first_(nullptr), addressOfLastNext_(&first_) {}

  void insertTail(Node* node) {
    node->next_ = nullptr;
    node->addressOfPreviousNext_ = addressOfLastNext_;
    *addressOfLastNext_ = node;
    addressOfLastNext_ = &node->next_;
  }

  void remove(Node* node) {
    if (!node->addressOfPreviousNext_) {
      // Not linked, NOP.
      return;
    }

    if (node->next_) {
      node->next_->addressOfPreviousNext_ = node->addressOfPreviousNext_;
    } else {
      addressOfLastNext_ = node->addressOfPreviousNext_;
    }
    *node->addressOfPreviousNext_ = node->next_;
  }

  // Bubble the node to the tail end of the list.
  void touch(Node* node) {
    remove(node);
    insertTail(node);
  }

  // Clear the list contents
  void clear() {
    first_ = nullptr;
    addressOfLastNext_ = &first_;
  }

  // Returns a pointer to the first element.
  // May be nullptr if the tailq is empty.
  Node* head() {
    return first_;
  }
};

struct Stats {
  // Number of times that a get resulted in a usable result
  size_t cacheHit{0};
  // Number of times that a get resulted in sharing a pending get
  size_t cacheShare{0};
  // Number of times that a get resulted in no usable result
  size_t cacheMiss{0};
  // Number of times that an item was evicted for any reason
  size_t cacheEvict{0};
  // Number of times that an item was inserted or replaced
  size_t cacheStore{0};
  // Number of times that a get was attempted
  size_t cacheLoad{0};
  // Number of times that an item was erased via erase()
  size_t cacheErase{0};
  // Number of times that the cache has been clear()'d
  size_t clearCount{0};

  void clear() {
    cacheHit = 0;
    cacheShare = 0;
    cacheMiss = 0;
    cacheEvict = 0;
    cacheStore = 0;
    cacheLoad = 0;
    cacheErase = 0;
    ++clearCount;
  }
};
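
// A cache-wide snapshot of these counters (plus the current cache
// size) is available via LRUCache::stats().  For example (an
// illustrative sketch only; `cache` is assumed to be an LRUCache):
//
//   auto s = cache.stats();
//   double hitRate =
//       s.cacheLoad ? double(s.cacheHit) / double(s.cacheLoad) : 0.0;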

// Factoring out the internal state struct here, as MSVC
// has a hard time compiling it otherwise.
template <typename KeyType, typename ValueType>
struct InternalState {
  using NodeType = Node<KeyType, ValueType>;
  // This owns the nodes in the map
  std::unordered_map<KeyType, std::shared_ptr<NodeType>> map;

  // Maintain some stats for cache introspection
  Stats stats;

  // To manage eviction we categorize a node into one of
  // three sets and link it into the appropriate tailq
  // below.  A node belongs in only one set at a time.

  // Nodes in the process of being fetched.  We cannot
  // evict anything in this set.  Note that we don't
  // strictly need to materialize this set (it can be
  // inferred from node properties) but it is a nice
  // invariant that any node always has a non-nullptr
  // result from whichQ().
  TailQHead<NodeType> lookupOrder;

  // Nodes with a successful result.  Eviction policy
  // is pure LRU.  Nodes in this set are touched on
  // access, causing the head to be the least recently
  // used node in the set, and the tail to be the
  // most recently used.
  TailQHead<NodeType> evictionOrder;

  // Nodes with an error result; these have a TTL
  // that we must respect as a way to manage load.
  // Nodes in this set are never touched; this means
  // that the head of this list is always the node
  // with the closest deadline.
  TailQHead<NodeType> erroredOrder;
};

} // namespace lrucache

struct CacheStats : public lrucache::Stats {
  CacheStats(const lrucache::Stats& s, size_t size) : Stats(s), size(size) {}
  size_t size;
};

// The cache.  More information on this can be found at the
// top of this header file!
template <typename KeyType, typename ValueType>
class LRUCache {
 public:
  using NodeType = lrucache::Node<KeyType, ValueType>;

 private:
  using State = lrucache::InternalState<KeyType, ValueType>;
  using LockedState = typename Synchronized<State>::LockedPtr;

 public:
  // Construct a cache with a defined limit and the specified
  // negative caching TTL duration.  The errorTTL is measured
  // from the start of the lookup, not its completion.
  LRUCache(size_t maxItems, std::chrono::milliseconds errorTTL)
      : maxItems_(maxItems), errorTTL_(errorTTL) {}

  // No moving or copying
  LRUCache(const LRUCache&) = delete;
  LRUCache& operator=(const LRUCache&) = delete;
  LRUCache(LRUCache&&) = delete;
  LRUCache& operator=(LRUCache&&) = delete;

  // Lookup key and return the result.
  // If the key was not present then the result will be nullptr.
  // If there is an outstanding fetch in progress, throws an error;
  // you must consistently use the Future<> version of get() for
  // such a key.
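  //
  // An illustrative sketch only (assumes LRUCache<std::string, int>
  // and a hypothetical doSomethingWith()):
  //
  //   if (auto node = cache.get("key")) {
  //     doSomethingWith(node->value());
  //   } else {
  //     // Absent, or an expired error entry was just evicted.
  //   }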
  std::shared_ptr<const NodeType> get(
      const KeyType& key,
      std::chrono::steady_clock::time_point now =
          std::chrono::steady_clock::now()) {
    auto state = state_.wlock();
    ++state->stats.cacheLoad;

    auto it = state->map.find(key);
    if (it == state->map.end()) {
      ++state->stats.cacheMiss;
      return nullptr;
    }

    auto node = it->second;
    auto q = whichQ(node.get(), state);

    // Remove expired item
    if (node->expired(now)) {
      state->map.erase(it);
      q->remove(node.get());
      ++state->stats.cacheMiss;
      ++state->stats.cacheEvict;
      return nullptr;
    }

    if (q == &state->lookupOrder) {
      // There's no safe way to allow this mode of fetch to subscribe
      // to the pending lookup.  The caller should use the getter
      // flavor exclusively for this key.
      throw std::runtime_error("mixing Future getter with direct getter");
    }

    if (q == &state->evictionOrder) {
      q->touch(node.get());
    }

    ++state->stats.cacheHit;
    return node;
  }

  // Lookup key using a getter function.
  // If the key is not present in the cache, initiates a (possibly
  // async) load of that value using the supplied getter function.
  // The getter function is a functor which behaves like:
  // std::function<Future<ValueType>(const KeyType&)>
  //
  // If the key is present but not yet satisfied, appends a Promise
  // to the chain in the node and returns a Future that will yield the
  // node once the lookup is complete.
  //
  // If the key is present and satisfied, returns a Future that will
  // immediately yield the node.
  //
  // The purpose of this function is to reduce the exposure to
  // "thundering herd" style problems.  In watchman we tend to have
  // limited contention for the same key but may have a great many
  // requests for different keys; it is desirable to collapse 2x or 3x
  // the requests for the same key into a single request.
  //
  // It is not safe to delete the LRUCache while there are outstanding
  // getter calls.  It is the responsibility of the caller to ensure
  // that this does not happen.
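  //
  // An illustrative sketch only: assumes an LRUCache<std::string, int>
  // named `cache`, plus hypothetical helpers expensiveLookup()
  // returning Future<int> and doSomethingWith():
  //
  //   using NodeType = LRUCache<std::string, int>::NodeType;
  //   cache
  //       .get(
  //           "key",
  //           [](const std::string& k) { return expensiveLookup(k); })
  //       .then([](Result<std::shared_ptr<const NodeType>>&& r) {
  //         auto node = r.value();
  //         if (node->result().hasError()) {
  //           // The failure is negative-cached for errorTTL.
  //         } else {
  //           doSomethingWith(node->value());
  //         }
  //       });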
  template <typename Func>
  Future<std::shared_ptr<const NodeType>> get(
      const KeyType& key,
      Func&& getter,
      std::chrono::steady_clock::time_point now =
          std::chrono::steady_clock::now()) {
    std::shared_ptr<NodeType> node;
    Future<std::shared_ptr<const NodeType>> future;

    // Only hold the lock on the state while we set up the map entry.
    {
      auto state = state_.wlock();
      ++state->stats.cacheLoad;

      auto it = state->map.find(key);
      if (it != state->map.end()) {
        node = it->second;

        auto q = whichQ(node.get(), state);

        if (!node->expired(now)) {
          // Only touch successful nodes
          if (q == &state->evictionOrder) {
            q->touch(node.get());
          }

          if (node->promises_) {
            // Not yet satisfied, so chain on a promise
            ++state->stats.cacheShare;
            return node->subscribe();
          }

          // Available now
          ++state->stats.cacheHit;
          return makeFuture<std::shared_ptr<const NodeType>>(node);
        }

        // Remove the invalid node.
        // We can't re-use it without introducing locking in
        // the node itself.
        q->remove(node.get());
        state->map.erase(it);
        ++state->stats.cacheEvict;
      }

      // Try to make a new node; this can fail if we are too full
      node = makeNode(state, now, key);

      // Insert into the map and the appropriate tailq
      state->map.emplace(std::make_pair(node->key_, node));
      state->lookupOrder.insertTail(node.get());

      ++state->stats.cacheMiss;
      // and ensure that we capture a subscription before we release
      // the lock!
      future = node->subscribe();
    }

    // Arrange for the value to be populated.
    // This is done outside the lock in case the getter is a
    // simple callback that executes immediately in our context;
    // if that were to then try to operate on the cache, it would
    // deadlock.
    getter(key).then([node, this, now](Result<ValueType>&& result) {
      // We're going to steal the promises so that we can fulfill
      // them outside of the lock
      std::unique_ptr<typename NodeType::PromiseList> promises;
      {
        auto state = state_.wlock();
        node->value_ = std::move(result);

        if (whichQ(node.get(), state) != &state->lookupOrder) {
          // Should never happen...
          abort();
        }

        ++state->stats.cacheStore;

        // We're no longer looking this up; we'll put it in the
        // correct bucket just before we release the lock below.
        state->lookupOrder.remove(node.get());

        // We only need a TTL for errors
        if (node->value_.hasError()) {
          // Note that we don't account for the time it takes to
          // arrive at the error condition in this TTL.  This is
          // semi-deliberate: for the sake of testing, we pass
          // `now` through from outside.
          node->deadline_ = now + errorTTL_;
        }

        // Steal the promises; we will fulfill them below
        // after we've released the lock.
        std::swap(promises, node->promises_);

        if (whichQ(node.get(), state) == &state->lookupOrder) {
          // Should never happen...
          abort();
        }

        // Now that the promises have been stolen, insert into
        // the appropriate queue.
        whichQ(node.get(), state)->insertTail(node.get());
      }

      // Wake up all waiters
      for (auto& p : *promises) {
        p.setValue(node);
      }
    });

    return future;
  }

  // Explicitly set the value for a key.
  // This will create or update a node as appropriate.
  // This may fail if there is no space and no items can be evicted.
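  //
  // An illustrative sketch only (assumes LRUCache<std::string, int>):
  //
  //   auto node = cache.set("key", 1);
  //   auto replacement = cache.set("key", 2);
  //   // `node` still observes the immutable value 1, while the map
  //   // now yields `replacement`, whose value is 2.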
  std::shared_ptr<const NodeType> set(
      const KeyType& key,
      ValueType&& value,
      std::chrono::steady_clock::time_point now =
          std::chrono::steady_clock::now()) {
    auto state = state_.wlock();
    auto it = state->map.find(key);

    if (it != state->map.end()) {
      // Remove this item.  We can't update the value in the
      // item without introducing per-node locks, so we simply
      // allow any external references to see their immutable
      // value while we insert a new value in the map.

      // Note that we don't check node->expired() here like
      // we do in the get() path.  The assumption is that the
      // caller is deliberately replacing an errored node
      // with some valid value.
      auto oldNode = it->second;
      whichQ(oldNode.get(), state)->remove(oldNode.get());
      state->map.erase(it);
      ++state->stats.cacheEvict;
    }

    auto node = makeNode(state, now, key, std::move(value));
    state->map.emplace(std::make_pair(node->key_, node));
    whichQ(node.get(), state)->insertTail(node.get());
    ++state->stats.cacheStore;

    return node;
  }

  // Erase the entry associated with key.
  // Returns the node if it was present, else nullptr.
  std::shared_ptr<const NodeType> erase(const KeyType& key) {
    auto state = state_.wlock();
    auto it = state->map.find(key);
    if (it == state->map.end()) {
      return nullptr;
    }

    auto node = it->second;
    // Note that we don't check node->expired() here like
    // we do in the get() path.  The assumption is that the
    // caller is deliberately invalidating an errored node.
    whichQ(node.get(), state)->remove(node.get());
    state->map.erase(it);
    ++state->stats.cacheErase;

    return node;
  }

  // Returns the number of cached items
  size_t size() const {
    auto state = state_.rlock();
    return state->map.size();
  }

  // Returns cache statistics
  CacheStats stats() const {
    auto state = state_.rlock();
    return CacheStats(state->stats, state->map.size());
  }

  // Purge all of the entries from the cache
  void clear() {
    auto state = state_.wlock();
    state->evictionOrder.clear();
    state->erroredOrder.clear();
    state->lookupOrder.clear();
    state->map.clear();
    state->stats.clear();
  }

 private:
  // Small helper for creating a new Node.  This checks for capacity
  // and attempts to evict an item to make room if needed.
  // The eviction may fail in some cases, which results in this method
  // also throwing.
  template <typename... Args>
  std::shared_ptr<NodeType> makeNode(
      LockedState& state,
      std::chrono::steady_clock::time_point now,
      Args&&... args) {
    // If we are too full, try to evict an item; this may throw if there are no
    // evictable items!
    if (state->map.size() + 1 > maxItems_) {
      evictOne(state, now);
    }

    return std::make_shared<NodeType>(std::forward<Args>(args)...);
  }

  // Returns the queue into which the node should be placed (for new nodes),
  // or should currently be linked into (for existing nodes).
  lrucache::TailQHead<NodeType>* whichQ(NodeType* node, LockedState& state) {
    if (node->promises_) {
      return &state->lookupOrder;
    }

    if (!node->value_.hasValue()) {
      return &state->erroredOrder;
    }

    return &state->evictionOrder;
  }

  // Attempt to evict a single item to make space for a new Node.
  // May throw if there are too many errored or pending lookups.
  void evictOne(
      LockedState& state,
      std::chrono::steady_clock::time_point now) {
    // Since errored nodes have a finite TTL (as opposed to an
    // infinite lifetime), try to evict one of those first.  That
    // keeps the cache focused on usable items rather than prematurely
    // evicting something that might be useful again in the future.
    auto node = state->erroredOrder.head();
    if (node && node->expired(now)) {
      state->erroredOrder.remove(node);
      // Erase from the map last, as this will invalidate node
      state->map.erase(node->key_);
      ++state->stats.cacheEvict;
      return;
    }

    // Second choice is to evict a successful item
    node = state->evictionOrder.head();
    if (node) {
      state->evictionOrder.remove(node);
      // Erase from the map last, as this will invalidate node
      state->map.erase(node->key_);
      ++state->stats.cacheEvict;
      return;
    }

    // There are no evictable items to purge, so we are too full.
    // This is a signal that there is a throughput problem.  Rather than
    // queueing up more work, we just fail this request; this acts as a
    // brake when the system is swamped.
    throw std::runtime_error("too many pending cache jobs");
  }

  // The maximum allowed capacity
  const size_t maxItems_;
  // How long to cache items that have an error Result
  const std::chrono::milliseconds errorTTL_;
  Synchronized<State> state_;
};
} // namespace watchman