1 /*
2  * Copyright 2015-present Facebook, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 // @author Nathan Bronson (ngbronson@fb.com)
18 
19 #pragma once
20 
21 #include <stdint.h>
22 
23 #include <atomic>
#include <cassert>
#include <chrono>
#include <cstring>
#include <limits>
#include <mutex>
24 #include <thread>
25 #include <type_traits>
26 
27 #include <folly/CPortability.h>
28 #include <folly/Likely.h>
29 #include <folly/concurrency/CacheLocality.h>
30 #include <folly/detail/Futex.h>
31 #include <folly/portability/Asm.h>
32 #include <folly/portability/SysResource.h>
33 #include <folly/synchronization/SanitizeThread.h>
34 
35 // SharedMutex is a reader-writer lock.  It is small, very fast, scalable
36 // on multi-core, and suitable for use when readers or writers may block.
37 // Unlike most other reader-writer locks, its throughput with concurrent
38 // readers scales linearly; it is able to acquire and release the lock
39 // in shared mode without cache line ping-ponging.  It is suitable for
40 // a wide range of lock hold times because it starts with spinning,
41 // proceeds to using sched_yield with a preemption heuristic, and then
42 // waits using futex and precise wakeups.
43 //
44 // SharedMutex provides all of the methods of folly::RWSpinLock,
45 // boost::shared_mutex, boost::upgrade_mutex, and C++14's
46 // std::shared_timed_mutex.  All operations that can block are available
47 // in try, try-for, and try-until (system_clock or steady_clock) versions.
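//
// A minimal usage sketch (illustrative only; mutex_ and the protected data
// are assumptions of the example, not part of this header):
//
//   folly::SharedMutex mutex_;
//
//   void readSide() {
//     mutex_.lock_shared();
//     // ... read the protected data ...
//     mutex_.unlock_shared();
//   }
//
//   void writeSide() {
//     if (mutex_.try_lock_for(std::chrono::milliseconds(10))) {
//       // ... modify the protected data ...
//       mutex_.unlock();
//     }
//   }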
48 //
49 // SharedMutexReadPriority gives priority to readers,
50 // SharedMutexWritePriority gives priority to writers.  SharedMutex is an
51 // alias for SharedMutexWritePriority, because writer starvation is more
52 // likely than reader starvation for the read-heavy workloads targeted
53 // by SharedMutex.
54 //
55 // In my tests SharedMutex is as good or better than the other
56 // reader-writer locks in use at Facebook for almost all use cases,
57 // sometimes by a wide margin.  (If it is rare that there are actually
58 // concurrent readers then RWSpinLock can be a few nanoseconds faster.)
59 // I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64,
60 // boost::shared_mutex, pthread_rwlock_t, and a RWLock that internally uses
61 // spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block.
62 // (Thrift's ReadWriteMutex is based underneath on pthread_rwlock_t.)
63 // It is generally as good or better than the rest when evaluating size,
64 // speed, scalability, or latency outliers.  In the corner cases where
65 // it is not the fastest (such as single-threaded use or heavy write
66 // contention) it is never very much worse than the best.  See the bottom
67 // of folly/test/SharedMutexTest.cpp for lots of microbenchmark results.
68 //
69 // Comparison to folly::RWSpinLock:
70 //
71 //  * SharedMutex is faster than RWSpinLock when there are actually
72 //    concurrent read accesses (sometimes much faster), and ~5 nanoseconds
73 //    slower when there is not actually any contention.  SharedMutex is
74 //    faster in every (benchmarked) scenario where the shared mode of
75 //    the lock is actually useful.
76 //
77 //  * Concurrent shared access to SharedMutex scales linearly, while total
78 //    RWSpinLock throughput drops as more threads try to access the lock
79 //    in shared mode.  Under very heavy read contention SharedMutex can
80 //    be two orders of magnitude faster than RWSpinLock (or any reader
81 //    writer lock that doesn't use striping or deferral).
82 //
83 //  * SharedMutex can safely protect blocking calls, because after an
84 //    initial period of spinning it waits using futex().
85 //
86 //  * RWSpinLock prioritizes readers, SharedMutex has both reader- and
87 //    writer-priority variants, but defaults to write priority.
88 //
89 //  * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's
90 //    doesn't.  Both semantics are reasonable.  The boost documentation
91 //    doesn't explicitly talk about this behavior (except by omitting
92 //    any statement that those lock modes conflict), but the boost
93 //    implementations do allow new readers while the upgradeable mode
94 //    is held.  See https://github.com/boostorg/thread/blob/master/
95 //      include/boost/thread/pthread/shared_mutex.hpp
96 //
97 //  * RWSpinLock::UpgradedHolder maps to SharedMutex::UpgradeHolder
98 //    (UpgradeableHolder would be even more pedantically correct).
99 //    SharedMutex's holders have fewer methods (no reset) and are less
100 //    tolerant (promotion and downgrade crash if the donor doesn't own
101 //    the lock, and you must use the default constructor rather than
102 //    passing a nullptr to the pointer constructor).
103 //
104 // Both SharedMutex and RWSpinLock provide "exclusive", "upgrade",
105 // and "shared" modes.  At all times num_threads_holding_exclusive +
106 // num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive ==
107 // 0 || num_threads_holding_shared == 0.  RWSpinLock has the additional
108 // constraint that num_threads_holding_shared cannot increase while
109 // num_threads_holding_upgrade is non-zero.
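//
// A sketch of the upgrade workflow under those invariants (mutex_ and
// needsModification() are illustrative names):
//
//   mutex_.lock_upgrade();                // at most one upgrade/exclusive owner
//   if (needsModification()) {            // inspect shared state safely
//     mutex_.unlock_upgrade_and_lock();   // promote to exclusive
//     // ... modify ...
//     mutex_.unlock();
//   } else {
//     mutex_.unlock_upgrade();
//   }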
110 //
111 // Comparison to the internal RWLock:
112 //
113 //  * SharedMutex doesn't allow a maximum reader count to be configured,
114 //    so it can't be used as a semaphore in the same way as RWLock.
115 //
116 //  * SharedMutex is 4 bytes, RWLock is 256.
117 //
118 //  * SharedMutex is as fast or faster than RWLock in all of my
119 //    microbenchmarks, and has positive rather than negative scalability.
120 //
121 //  * RWLock and SharedMutex are both writer priority locks.
122 //
123 //  * SharedMutex avoids latency outliers as well as RWLock.
124 //
125 //  * SharedMutex uses different names (t != 0 below):
126 //
127 //    RWLock::lock(0)    => SharedMutex::lock()
128 //
129 //    RWLock::lock(t)    => SharedMutex::try_lock_for(milliseconds(t))
130 //
131 //    RWLock::tryLock()  => SharedMutex::try_lock()
132 //
133 //    RWLock::unlock()   => SharedMutex::unlock()
134 //
135 //    RWLock::enter(0)   => SharedMutex::lock_shared()
136 //
137 //    RWLock::enter(t)   =>
138 //        SharedMutex::try_lock_shared_for(milliseconds(t))
139 //
140 //    RWLock::tryEnter() => SharedMutex::try_lock_shared()
141 //
142 //    RWLock::leave()    => SharedMutex::unlock_shared()
143 //
144 //  * RWLock allows the reader count to be adjusted by a value other
145 //    than 1 during enter() or leave(). SharedMutex doesn't currently
146 //    implement this feature.
147 //
148 //  * RWLock's methods are marked const, SharedMutex's aren't.
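//
// As a concrete example of the mapping above, a timed shared acquisition
// that previously used RWLock::enter(t)/leave() becomes (sketch only; t is
// in milliseconds):
//
//   if (mutex_.try_lock_shared_for(std::chrono::milliseconds(t))) {
//     // ... read ...
//     mutex_.unlock_shared();
//   }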
149 //
150 // Reader-writer locks have the potential to allow concurrent access
151 // to shared read-mostly data, but in practice they often provide no
152 // improvement over a mutex.  The problem is the cache coherence protocol
153 // of modern CPUs.  Coherence is provided by making sure that when a cache
154 // line is written it is present in only one core's cache.  Since a memory
155 // write is required to acquire a reader-writer lock in shared mode, the
156 // cache line holding the lock is invalidated in all of the other caches.
157 // This leads to cache misses when another thread wants to acquire or
158 // release the lock concurrently.  When the RWLock is colocated with the
159 // data it protects (common), cache misses can also continue to occur when
160 // a thread that already holds the lock tries to read the protected data.
161 //
162 // Ideally, a reader-writer lock would allow multiple cores to acquire
163 // and release the lock in shared mode without incurring any cache misses.
164 // This requires that each core records its shared access in a cache line
165 // that isn't read or written by other read-locking cores.  (Writers will
166 // have to check all of the cache lines.)  Typical server hardware when
167 // this comment was written has 16 L1 caches and cache lines of 64 bytes,
168 // so a lock striped over all L1 caches would occupy a prohibitive 1024
169 // bytes.  Nothing says that we need a separate set of per-core memory
170 // locations for each lock, however.  Each SharedMutex instance is only
171 // 4 bytes, but all locks together share a 2K area in which they make a
172 // core-local record of lock acquisitions.
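//
// The 2K figure is just the product of the constants defined further down
// in this file (assuming 8-byte slots, as on x86_64):
//
//   kMaxDeferredReaders * kDeferredSeparationFactor * sizeof(DeferredReaderSlot)
//       == 64 * 4 * 8 == 2048 bytes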
173 //
174 // SharedMutex's strategy of using a shared set of core-local stripes has
175 // a potential downside, because it means that acquisition of any lock in
176 // write mode can conflict with acquisition of any lock in shared mode.
177 // If a lock instance doesn't actually experience concurrency then this
178 // downside will outweigh the upside of improved scalability for readers.
179 // To avoid this problem we dynamically detect concurrent accesses to
180 // SharedMutex, and don't start using the deferred mode unless we actually
181 // observe concurrency.  See kNumSharedToStartDeferring.
182 //
183 // It is explicitly allowed to call unlock_shared() from a different
184 // thread than lock_shared(), so long as they are properly paired.
185 // unlock_shared() needs to find the location at which lock_shared()
186 // recorded the lock, which might be in the lock itself or in any of
187 // the shared slots.  If you can conveniently pass state from lock
188 // acquisition to release then the fastest mechanism is to std::move
189 // the SharedMutex::ReadHolder instance or a SharedMutex::Token (using
190 // lock_shared(Token&) and unlock_shared(Token&)).  The guard or token
191 // will tell unlock_shared where in deferredReaders[] to look for the
192 // deferred lock.  The Token-less version of unlock_shared() works in all
193 // cases, but is optimized for the common (no inter-thread handoff) case.
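//
// A sketch of the Token-based handoff (how the token reaches the releasing
// thread is up to the caller; only the SharedMutex calls are real API):
//
//   folly::SharedMutex::Token token;
//   mutex_.lock_shared(token);
//   // ... pass the token (by value) to the releasing thread ...
//   mutex_.unlock_shared(token);    // may run on a different thread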
194 //
195 // In both read- and write-priority mode, a waiting lock() (exclusive mode)
196 // only blocks readers after it has waited for an active upgrade lock to be
197 // released; until the upgrade lock is released (or upgraded or downgraded)
198 // readers will still be able to enter.  Preferences about lock acquisition
199 // are not guaranteed to be enforced perfectly (even if they were, there
200 // is theoretically the chance that a thread could be arbitrarily suspended
201 // between calling lock() and SharedMutex code actually getting executed).
202 //
203 // try_*_for methods always try at least once, even if the duration
204 // is zero or negative.  The duration type must be compatible with
205 // std::chrono::steady_clock.  try_*_until methods also always try at
206 // least once.  std::chrono::system_clock and std::chrono::steady_clock
207 // are supported.
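//
// For example (sketch):
//
//   using namespace std::chrono;
//   if (mutex_.try_lock_until(steady_clock::now() + milliseconds(5))) {
//     // ... exclusive section ...
//     mutex_.unlock();
//   }
//   // a zero or negative duration still makes a single attempt:
//   if (mutex_.try_lock_shared_for(milliseconds(0))) {
//     mutex_.unlock_shared();
//   }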
208 //
209 // If you have observed by profiling that your SharedMutex-s are getting
210 // cache misses on deferredReaders[] due to another SharedMutex user, then
211 // you can use the tag type to create your own instantiation of the type.
212 // The contention threshold (see kNumSharedToStartDeferring) should make
213 // this unnecessary in all but the most extreme cases.  Make sure to check
214 // that the increased icache and dcache footprint of the tagged result is
215 // worth it.
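//
// A sketch of a privately tagged instantiation (MyLibTag is an arbitrary
// caller-chosen type; the first template argument selects reader priority,
// so false should match the write-priority default used by
// folly::SharedMutex):
//
//   struct MyLibTag {};
//   using MyLibSharedMutex = folly::SharedMutexImpl<false, MyLibTag>;
//
// Each tagged instantiation gets its own static deferredReaders[] storage,
// which is what isolates it from other SharedMutex users.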
216 
217 // SharedMutex's use of thread local storage is an optimization, so
218 // for the case where thread local storage is not supported, define it
219 // away.
220 
221 // Note about TSAN (ThreadSanitizer): the SharedMutexWritePriority version
222 // (the default) of this mutex is annotated appropriately so that TSAN can
223 // perform lock inversion analysis. However, the SharedMutexReadPriority version
224 // is not annotated.  This is because TSAN's lock order heuristic
225 // assumes that two calls to lock_shared must be ordered, which leads
226 // to too many false positives for the reader-priority case.
227 //
228 // Suppose thread A holds a SharedMutexWritePriority lock in shared mode and an
229 // independent thread B is waiting for exclusive access. Then a thread C's
230 // lock_shared can't proceed until A has released the lock. Discounting
231 // situations that never use exclusive mode (so no lock is necessary at all)
232 // this means that without higher-level reasoning it is not safe to ignore
233 // reader <-> reader interactions.
234 //
235 // This reasoning does not apply to SharedMutexReadPriority, because there are
236 // no actions by a thread B that can make C need to wait for A. Since the
237 // overwhelming majority of SharedMutex instances use write priority, we
238 // restrict the TSAN annotations to only SharedMutexWritePriority.
239 
240 #ifndef FOLLY_SHAREDMUTEX_TLS
241 #if !FOLLY_MOBILE
242 #define FOLLY_SHAREDMUTEX_TLS FOLLY_TLS
243 #else
244 #define FOLLY_SHAREDMUTEX_TLS
245 #endif
246 #endif
247 
248 namespace folly {
249 
250 struct SharedMutexToken {
251   enum class Type : uint16_t {
252     INVALID = 0,
253     INLINE_SHARED,
254     DEFERRED_SHARED,
255   };
256 
257   Type type_;
258   uint16_t slot_;
259 };
260 
261 namespace detail {
262 // Returns a guard that gives permission for the current thread to
263 // annotate, and adjust the annotation bits in, the SharedMutex at ptr.
264 std::unique_lock<std::mutex> sharedMutexAnnotationGuard(void* ptr);
265 } // namespace detail
266 
267 template <
268     bool ReaderPriority,
269     typename Tag_ = void,
270     template <typename> class Atom = std::atomic,
271     bool BlockImmediately = false,
272     bool AnnotateForThreadSanitizer = kIsSanitizeThread && !ReaderPriority>
273 class SharedMutexImpl {
274  public:
275   static constexpr bool kReaderPriority = ReaderPriority;
276 
277   typedef Tag_ Tag;
278 
279   typedef SharedMutexToken Token;
280 
281   class ReadHolder;
282   class UpgradeHolder;
283   class WriteHolder;
284 
285   constexpr SharedMutexImpl() noexcept : state_(0) {}
286 
287   SharedMutexImpl(const SharedMutexImpl&) = delete;
288   SharedMutexImpl(SharedMutexImpl&&) = delete;
289   SharedMutexImpl& operator=(const SharedMutexImpl&) = delete;
290   SharedMutexImpl& operator=(SharedMutexImpl&&) = delete;
291 
292   // It is an error to destroy a SharedMutex that still has
293   // any outstanding locks.  This is checked if NDEBUG isn't defined.
294   // SharedMutex's exclusive mode can be safely used to guard the lock's
295   // own destruction.  If, for example, you acquire the lock in exclusive
296   // mode and then observe that the object containing the lock is no longer
297   // needed, you can unlock() and then immediately destroy the lock.
298   // See https://sourceware.org/bugzilla/show_bug.cgi?id=13690 for a
299   // description about why this property needs to be explicitly mentioned.
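  //
  // A sketch of that pattern (obj, its mutex_ member, and lastReference()
  // are illustrative names, not part of this class):
  //
  //   obj->mutex_.lock();
  //   bool last = obj->lastReference();  // decide, while exclusively locked,
  //                                      // that no other thread can reach obj
  //   obj->mutex_.unlock();
  //   if (last) {
  //     delete obj;  // ok: unlock() has fully returned before obj is freed
  //   }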
300   ~SharedMutexImpl() {
301     auto state = state_.load(std::memory_order_relaxed);
302     if (UNLIKELY((state & kHasS) != 0)) {
303       cleanupTokenlessSharedDeferred(state);
304     }
305 
306 #ifndef NDEBUG
307     // These asserts check that everybody has released the lock before it
308     // is destroyed.  If you arrive here while debugging that is likely
309     // the problem.  (You could also have general heap corruption.)
310 
311     // if a futexWait fails to go to sleep because the value has been
312     // changed, we don't necessarily clean up the wait bits, so it is
313     // possible they will be set here in a correct system
314     assert((state & ~(kWaitingAny | kMayDefer | kAnnotationCreated)) == 0);
315     if ((state & kMayDefer) != 0) {
316       for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) {
317         auto slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
318         assert(!slotValueIsThis(slotValue));
319       }
320     }
321 #endif
322     annotateDestroy();
323   }
324 
325   void lock() {
326     WaitForever ctx;
327     (void)lockExclusiveImpl(kHasSolo, ctx);
328     annotateAcquired(annotate_rwlock_level::wrlock);
329   }
330 
331   bool try_lock() {
332     WaitNever ctx;
333     auto result = lockExclusiveImpl(kHasSolo, ctx);
334     annotateTryAcquired(result, annotate_rwlock_level::wrlock);
335     return result;
336   }
337 
338   template <class Rep, class Period>
339   bool try_lock_for(const std::chrono::duration<Rep, Period>& duration) {
340     WaitForDuration<Rep, Period> ctx(duration);
341     auto result = lockExclusiveImpl(kHasSolo, ctx);
342     annotateTryAcquired(result, annotate_rwlock_level::wrlock);
343     return result;
344   }
345 
346   template <class Clock, class Duration>
347   bool try_lock_until(
348       const std::chrono::time_point<Clock, Duration>& absDeadline) {
349     WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
350     auto result = lockExclusiveImpl(kHasSolo, ctx);
351     annotateTryAcquired(result, annotate_rwlock_level::wrlock);
352     return result;
353   }
354 
355   void unlock() {
356     annotateReleased(annotate_rwlock_level::wrlock);
357     // It is possible that we have a left-over kWaitingNotS if the last
358     // unlock_shared() that let our matching lock() complete finished
359     // releasing before lock()'s futexWait went to sleep.  Clean it up now
360     auto state = (state_ &= ~(kWaitingNotS | kPrevDefer | kHasE));
361     assert((state & ~(kWaitingAny | kAnnotationCreated)) == 0);
362     wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
363   }
364 
365   // Managing the token yourself makes unlock_shared a bit faster
366 
367   void lock_shared() {
368     WaitForever ctx;
369     (void)lockSharedImpl(nullptr, ctx);
370     annotateAcquired(annotate_rwlock_level::rdlock);
371   }
372 
373   void lock_shared(Token& token) {
374     WaitForever ctx;
375     (void)lockSharedImpl(&token, ctx);
376     annotateAcquired(annotate_rwlock_level::rdlock);
377   }
378 
379   bool try_lock_shared() {
380     WaitNever ctx;
381     auto result = lockSharedImpl(nullptr, ctx);
382     annotateTryAcquired(result, annotate_rwlock_level::rdlock);
383     return result;
384   }
385 
386   bool try_lock_shared(Token& token) {
387     WaitNever ctx;
388     auto result = lockSharedImpl(&token, ctx);
389     annotateTryAcquired(result, annotate_rwlock_level::rdlock);
390     return result;
391   }
392 
393   template <class Rep, class Period>
394   bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration) {
395     WaitForDuration<Rep, Period> ctx(duration);
396     auto result = lockSharedImpl(nullptr, ctx);
397     annotateTryAcquired(result, annotate_rwlock_level::rdlock);
398     return result;
399   }
400 
401   template <class Rep, class Period>
402   bool try_lock_shared_for(
403       const std::chrono::duration<Rep, Period>& duration,
404       Token& token) {
405     WaitForDuration<Rep, Period> ctx(duration);
406     auto result = lockSharedImpl(&token, ctx);
407     annotateTryAcquired(result, annotate_rwlock_level::rdlock);
408     return result;
409   }
410 
411   template <class Clock, class Duration>
412   bool try_lock_shared_until(
413       const std::chrono::time_point<Clock, Duration>& absDeadline) {
414     WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
415     auto result = lockSharedImpl(nullptr, ctx);
416     annotateTryAcquired(result, annotate_rwlock_level::rdlock);
417     return result;
418   }
419 
420   template <class Clock, class Duration>
421   bool try_lock_shared_until(
422       const std::chrono::time_point<Clock, Duration>& absDeadline,
423       Token& token) {
424     WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
425     auto result = lockSharedImpl(&token, ctx);
426     annotateTryAcquired(result, annotate_rwlock_level::rdlock);
427     return result;
428   }
429 
430   void unlock_shared() {
431     annotateReleased(annotate_rwlock_level::rdlock);
432 
433     auto state = state_.load(std::memory_order_acquire);
434 
435     // kPrevDefer can only be set if HasE or BegunE is set
436     assert((state & (kPrevDefer | kHasE | kBegunE)) != kPrevDefer);
437 
438     // lock() strips kMayDefer immediately, but then copies it to
439     // kPrevDefer so we can tell if the pre-lock() lock_shared() might
440     // have deferred
441     if ((state & (kMayDefer | kPrevDefer)) == 0 ||
442         !tryUnlockTokenlessSharedDeferred()) {
443       // Matching lock_shared() couldn't have deferred, or the deferred
444       // lock has already been inlined by applyDeferredReaders()
445       unlockSharedInline();
446     }
447   }
448 
449   void unlock_shared(Token& token) {
450     annotateReleased(annotate_rwlock_level::rdlock);
451 
452     assert(
453         token.type_ == Token::Type::INLINE_SHARED ||
454         token.type_ == Token::Type::DEFERRED_SHARED);
455 
456     if (token.type_ != Token::Type::DEFERRED_SHARED ||
457         !tryUnlockSharedDeferred(token.slot_)) {
458       unlockSharedInline();
459     }
460 #ifndef NDEBUG
461     token.type_ = Token::Type::INVALID;
462 #endif
463   }
464 
465   void unlock_and_lock_shared() {
466     annotateReleased(annotate_rwlock_level::wrlock);
467     annotateAcquired(annotate_rwlock_level::rdlock);
468     // We can't use state_ -=, because we need to clear 2 bits (1 of which
469     // has an uncertain initial state) and set 1 other.  We might as well
470     // clear the relevant wake bits at the same time.  Note that since S
471     // doesn't block the beginning of a transition to E (writer priority
472     // can cut off new S, reader priority grabs BegunE and blocks deferred
473     // S) we need to wake E as well.
474     auto state = state_.load(std::memory_order_acquire);
475     do {
476       assert(
477           (state & ~(kWaitingAny | kPrevDefer | kAnnotationCreated)) == kHasE);
478     } while (!state_.compare_exchange_strong(
479         state, (state & ~(kWaitingAny | kPrevDefer | kHasE)) + kIncrHasS));
480     if ((state & (kWaitingE | kWaitingU | kWaitingS)) != 0) {
481       futexWakeAll(kWaitingE | kWaitingU | kWaitingS);
482     }
483   }
484 
485   void unlock_and_lock_shared(Token& token) {
486     unlock_and_lock_shared();
487     token.type_ = Token::Type::INLINE_SHARED;
488   }
489 
490   void lock_upgrade() {
491     WaitForever ctx;
492     (void)lockUpgradeImpl(ctx);
493     // For TSAN: treat upgrade locks as equivalent to read locks
494     annotateAcquired(annotate_rwlock_level::rdlock);
495   }
496 
497   bool try_lock_upgrade() {
498     WaitNever ctx;
499     auto result = lockUpgradeImpl(ctx);
500     annotateTryAcquired(result, annotate_rwlock_level::rdlock);
501     return result;
502   }
503 
504   template <class Rep, class Period>
505   bool try_lock_upgrade_for(
506       const std::chrono::duration<Rep, Period>& duration) {
507     WaitForDuration<Rep, Period> ctx(duration);
508     auto result = lockUpgradeImpl(ctx);
509     annotateTryAcquired(result, annotate_rwlock_level::rdlock);
510     return result;
511   }
512 
513   template <class Clock, class Duration>
514   bool try_lock_upgrade_until(
515       const std::chrono::time_point<Clock, Duration>& absDeadline) {
516     WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
517     auto result = lockUpgradeImpl(ctx);
518     annotateTryAcquired(result, annotate_rwlock_level::rdlock);
519     return result;
520   }
521 
522   void unlock_upgrade() {
523     annotateReleased(annotate_rwlock_level::rdlock);
524     auto state = (state_ -= kHasU);
525     assert((state & (kWaitingNotS | kHasSolo)) == 0);
526     wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
527   }
528 
529   void unlock_upgrade_and_lock() {
530     // no waiting necessary, so waitMask is empty
531     WaitForever ctx;
532     (void)lockExclusiveImpl(0, ctx);
533     annotateReleased(annotate_rwlock_level::rdlock);
534     annotateAcquired(annotate_rwlock_level::wrlock);
535   }
536 
537   void unlock_upgrade_and_lock_shared() {
538     // No need to annotate for TSAN here because we model upgrade and shared
539     // locks as the same.
540     auto state = (state_ -= kHasU - kIncrHasS);
541     assert((state & (kWaitingNotS | kHasSolo)) == 0);
542     wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
543   }
544 
545   void unlock_upgrade_and_lock_shared(Token& token) {
546     unlock_upgrade_and_lock_shared();
547     token.type_ = Token::Type::INLINE_SHARED;
548   }
549 
550   void unlock_and_lock_upgrade() {
551     annotateReleased(annotate_rwlock_level::wrlock);
552     annotateAcquired(annotate_rwlock_level::rdlock);
553     // We can't use state_ -=, because we need to clear 2 bits (1 of
554     // which has an uncertain initial state) and set 1 other.  We might
555     // as well clear the relevant wake bits at the same time.
556     auto state = state_.load(std::memory_order_acquire);
557     while (true) {
558       assert(
559           (state & ~(kWaitingAny | kPrevDefer | kAnnotationCreated)) == kHasE);
560       auto after =
561           (state & ~(kWaitingNotS | kWaitingS | kPrevDefer | kHasE)) + kHasU;
562       if (state_.compare_exchange_strong(state, after)) {
563         if ((state & kWaitingS) != 0) {
564           futexWakeAll(kWaitingS);
565         }
566         return;
567       }
568     }
569   }
570 
571  private:
572   typedef typename folly::detail::Futex<Atom> Futex;
573 
574   // Internally we use four kinds of wait contexts.  These are structs
575   // that provide a doWait method that returns true if a futex wake
576   // was issued that intersects with the waitMask, false if there was a
577   // timeout and no more waiting should be performed.  Spinning occurs
578   // before the wait context is invoked.
579 
580   struct WaitForever {
581     bool canBlock() {
582       return true;
583     }
584     bool canTimeOut() {
585       return false;
586     }
587     bool shouldTimeOut() {
588       return false;
589     }
590 
591     bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
592       detail::futexWait(&futex, expected, waitMask);
593       return true;
594     }
595   };
596 
597   struct WaitNever {
598     bool canBlock() {
599       return false;
600     }
601     bool canTimeOut() {
602       return true;
603     }
604     bool shouldTimeOut() {
605       return true;
606     }
607 
608     bool doWait(
609         Futex& /* futex */,
610         uint32_t /* expected */,
611         uint32_t /* waitMask */) {
612       return false;
613     }
614   };
615 
616   template <class Rep, class Period>
617   struct WaitForDuration {
618     std::chrono::duration<Rep, Period> duration_;
619     bool deadlineComputed_;
620     std::chrono::steady_clock::time_point deadline_;
621 
622     explicit WaitForDuration(const std::chrono::duration<Rep, Period>& duration)
623         : duration_(duration), deadlineComputed_(false) {}
624 
625     std::chrono::steady_clock::time_point deadline() {
626       if (!deadlineComputed_) {
627         deadline_ = std::chrono::steady_clock::now() + duration_;
628         deadlineComputed_ = true;
629       }
630       return deadline_;
631     }
632 
633     bool canBlock() {
634       return duration_.count() > 0;
635     }
636     bool canTimeOut() {
637       return true;
638     }
639 
640     bool shouldTimeOut() {
641       return std::chrono::steady_clock::now() > deadline();
642     }
643 
644     bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
645       auto result =
646           detail::futexWaitUntil(&futex, expected, deadline(), waitMask);
647       return result != folly::detail::FutexResult::TIMEDOUT;
648     }
649   };
650 
651   template <class Clock, class Duration>
652   struct WaitUntilDeadline {
653     std::chrono::time_point<Clock, Duration> absDeadline_;
654 
655     bool canBlock() {
656       return true;
657     }
658     bool canTimeOut() {
659       return true;
660     }
661     bool shouldTimeOut() {
662       return Clock::now() > absDeadline_;
663     }
664 
665     bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
666       auto result =
667           detail::futexWaitUntil(&futex, expected, absDeadline_, waitMask);
668       return result != folly::detail::FutexResult::TIMEDOUT;
669     }
670   };
671 
672   void annotateLazyCreate() {
673     if (AnnotateForThreadSanitizer &&
674         (state_.load() & kAnnotationCreated) == 0) {
675       auto guard = detail::sharedMutexAnnotationGuard(this);
676       // check again
677       if ((state_.load() & kAnnotationCreated) == 0) {
678         state_.fetch_or(kAnnotationCreated);
679         annotate_benign_race_sized(
680             &state_, sizeof(state_), "init TSAN", __FILE__, __LINE__);
681         annotate_rwlock_create(this, __FILE__, __LINE__);
682       }
683     }
684   }
685 
686   void annotateDestroy() {
687     if (AnnotateForThreadSanitizer) {
688       annotateLazyCreate();
689       annotate_rwlock_destroy(this, __FILE__, __LINE__);
690     }
691   }
692 
693   void annotateAcquired(annotate_rwlock_level w) {
694     if (AnnotateForThreadSanitizer) {
695       annotateLazyCreate();
696       annotate_rwlock_acquired(this, w, __FILE__, __LINE__);
697     }
698   }
699 
700   void annotateTryAcquired(bool result, annotate_rwlock_level w) {
701     if (AnnotateForThreadSanitizer) {
702       annotateLazyCreate();
703       annotate_rwlock_try_acquired(this, w, result, __FILE__, __LINE__);
704     }
705   }
706 
707   void annotateReleased(annotate_rwlock_level w) {
708     if (AnnotateForThreadSanitizer) {
709       assert((state_.load() & kAnnotationCreated) != 0);
710       annotate_rwlock_released(this, w, __FILE__, __LINE__);
711     }
712   }
713 
714   // 32 bits of state
715   Futex state_{};
716 
717   // S count needs to be on the end, because we explicitly allow it to
718   // underflow.  This can occur while we are in the middle of applying
719   // deferred locks (we remove them from deferredReaders[] before
720   // inlining them), or during token-less unlock_shared() if a racing
721   // lock_shared();unlock_shared() moves the deferredReaders slot while
722   // the first unlock_shared() is scanning.  The former case is cleaned
723   // up before we finish applying the locks.  The latter case can persist
724   // until destruction, when it is cleaned up.
725   static constexpr uint32_t kIncrHasS = 1 << 11;
726   static constexpr uint32_t kHasS = ~(kIncrHasS - 1);
727 
728   // Set if annotation has been completed for this instance.  That annotation
729   // (and setting this bit afterward) must be guarded by one of the mutexes in
730   // annotationCreationGuards.
731   static constexpr uint32_t kAnnotationCreated = 1 << 10;
732 
733   // If false, then there are definitely no deferred read locks for this
734   // instance.  Cleared after initialization and when exclusively locked.
735   static constexpr uint32_t kMayDefer = 1 << 9;
736 
737   // lock() clears kMayDefer as soon as it starts draining readers (so
738   // that it doesn't have to do a second CAS once drain completes), but
739   // unlock_shared() still needs to know whether to scan deferredReaders[]
740   // or not.  We copy kMayDefer to kPrevDefer when setting kHasE or
741   // kBegunE, and clear it when clearing those bits.
742   static constexpr uint32_t kPrevDefer = 1 << 8;
743 
744   // Exclusive-locked blocks all read locks and write locks.  This bit
745   // may be set before all readers have finished, but in that case the
746   // thread that sets it won't return to the caller until all read locks
747   // have been released.
748   static constexpr uint32_t kHasE = 1 << 7;
749 
750   // Exclusive-draining means that lock() is waiting for existing readers
751   // to leave, but that new readers may still acquire shared access.
752   // This is only used in reader priority mode.  New readers during
753   // drain must be inline.  The difference between this and kHasU is that
754   // kBegunE prevents kMayDefer from being set.
755   static constexpr uint32_t kBegunE = 1 << 6;
756 
757   // At most one thread may have either exclusive or upgrade lock
758   // ownership.  Unlike exclusive mode, ownership of the lock in upgrade
759   // mode doesn't preclude other threads holding the lock in shared mode.
760   // boost's concept for this doesn't explicitly say whether new shared
761   // locks can be acquired once lock_upgrade has succeeded, but doesn't
762   // list that as disallowed.  RWSpinLock disallows new read locks after
763   // lock_upgrade has been acquired, but the boost implementation doesn't.
764   // We choose the latter.
765   static constexpr uint32_t kHasU = 1 << 5;
766 
767   // There are three states that we consider to be "solo", in that they
768   // cannot coexist with other solo states.  These are kHasE, kBegunE,
769   // and kHasU.  Note that S doesn't conflict with any of these, because
770   // setting kHasE is only one of the two steps needed to actually
771   // acquire the lock in exclusive mode (the other is draining the existing
772   // S holders).
773   static constexpr uint32_t kHasSolo = kHasE | kBegunE | kHasU;
774 
775   // Once a thread sets kHasE it needs to wait for the current readers
776   // to exit the lock.  We give this a separate wait identity from the
777   // waiting to set kHasE so that we can perform partial wakeups (wake
778   // one instead of wake all).
779   static constexpr uint32_t kWaitingNotS = 1 << 4;
780 
781   // When waking writers we can either wake them all, in which case we
782   // can clear kWaitingE, or we can call futexWake(1).  futexWake tells
783   // us if anybody woke up, but even if we detect that nobody woke up we
784   // can't clear the bit after the fact without issuing another wakeup.
785   // To avoid thundering herds when there are lots of pending lock()
786   // without needing to call futexWake twice when there is only one
787   // waiter, kWaitingE actually encodes if we have observed multiple
788   // concurrent waiters.  Tricky: ABA issues on futexWait mean that when
789   // we see kWaitingESingle we can't assume that there is only one.
790   static constexpr uint32_t kWaitingESingle = 1 << 2;
791   static constexpr uint32_t kWaitingEMultiple = 1 << 3;
792   static constexpr uint32_t kWaitingE = kWaitingESingle | kWaitingEMultiple;
793 
794   // kWaitingU is essentially a 1 bit saturating counter.  It always
795   // requires a wakeAll.
796   static constexpr uint32_t kWaitingU = 1 << 1;
797 
798   // All blocked lock_shared() should be awoken, so it is correct (not
799   // suboptimal) to wakeAll if there are any shared readers.
800   static constexpr uint32_t kWaitingS = 1 << 0;
801 
802   // kWaitingAny is a mask of all of the bits that record the state of
803   // threads, rather than the state of the lock.  It is convenient to be
804   // able to mask them off during asserts.
805   static constexpr uint32_t kWaitingAny =
806       kWaitingNotS | kWaitingE | kWaitingU | kWaitingS;
807 
808   // The reader count at which a reader will attempt to use the lock
809   // in deferred mode.  If this value is 2, then the second concurrent
810   // reader will set kMayDefer and use deferredReaders[].  kMayDefer is
811   // cleared during exclusive access, so this threshold must be reached
812   // each time a lock is held in exclusive mode.
813   static constexpr uint32_t kNumSharedToStartDeferring = 2;
814 
815   // The typical number of spins that a thread will wait for a state
816   // transition.  There is no bound on the number of threads that can wait
817   // for a writer, so we are pretty conservative here to limit the chance
818   // that we are starving the writer of CPU.  Each spin is 6 or 7 nanos,
819   // almost all of which is in the pause instruction.
820   static constexpr uint32_t kMaxSpinCount = !BlockImmediately ? 1000 : 2;
821 
822   // The maximum number of soft yields before falling back to futex.
823   // If the preemption heuristic is activated we will fall back before
824   // this.  A soft yield takes ~900 nanos (two sched_yield plus a call
825   // to getrusage, with checks of the goal at each step).  Soft yields
826   // aren't compatible with deterministic execution under test (unlike
827   // futexWaitUntil, which has a capricious but deterministic back end).
828   static constexpr uint32_t kMaxSoftYieldCount = !BlockImmediately ? 1000 : 0;
829 
830   // If AccessSpreader assigns indexes from 0..k*n-1 on a system where some
831   // level of the memory hierarchy is symmetrically divided into k pieces
832   // (NUMA nodes, last-level caches, L1 caches, ...), then slot indexes
833   // that are the same after integer division by k share that resource.
834   // Our strategy for deferred readers is to probe up to numSlots/4 slots,
835   // using the full granularity of AccessSpreader for the start slot
836   // and then search outward.  We can use AccessSpreader::current(n)
837   // without managing our own spreader if kMaxDeferredReaders <=
838   // AccessSpreader::kMaxCpus, which is currently 128.
839   //
840   // Our 2-socket E5-2660 machines have 8 L1 caches on each chip,
841   // with 64 byte cache lines.  That means we need 64*16 bytes of
842   // deferredReaders[] to give each L1 its own playground.  On x86_64
843   // each DeferredReaderSlot is 8 bytes, so we need kMaxDeferredReaders
844   // * kDeferredSeparationFactor >= 64 * 16 / 8 == 128.  If
845   // kDeferredSearchDistance * kDeferredSeparationFactor <=
846   // 64 / 8 then we will search only within a single cache line, which
847   // guarantees we won't have inter-L1 contention.  We give ourselves
848   // a factor of 2 on the core count, which should hold us for a couple
849   // processor generations.  deferredReaders[] is 2048 bytes currently.
850  public:
851   static constexpr uint32_t kMaxDeferredReaders = 64;
852   static constexpr uint32_t kDeferredSearchDistance = 2;
853   static constexpr uint32_t kDeferredSeparationFactor = 4;
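  // A sanity check of the sizing math described above (sketch only;
  // DeferredReaderSlot is 8 bytes on 64-bit platforms):
  //
  //   static_assert(
  //       kMaxDeferredReaders * kDeferredSeparationFactor * sizeof(uintptr_t) ==
  //           2048,
  //       "deferredReaders[] is expected to span 2048 bytes");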
854 
855  private:
856   static_assert(
857       !(kMaxDeferredReaders & (kMaxDeferredReaders - 1)),
858       "kMaxDeferredReaders must be a power of 2");
859   static_assert(
860       !(kDeferredSearchDistance & (kDeferredSearchDistance - 1)),
861       "kDeferredSearchDistance must be a power of 2");
862 
863   // The number of deferred locks that can be simultaneously acquired
864   // by a thread via the token-less methods without performing any heap
865   // allocations.  Each of these costs 3 pointers (24 bytes, probably)
866   // per thread.  There's not much point in making this larger than
867   // kDeferredSearchDistance.
868   static constexpr uint32_t kTokenStackTLSCapacity = 2;
869 
870   // We need to make sure that if there is a lock_shared()
871   // and lock_shared(token) followed by unlock_shared() and
872   // unlock_shared(token), the token-less unlock doesn't null
873   // out deferredReaders[token.slot_].  If we allowed that, then
874   // unlock_shared(token) wouldn't be able to assume that its lock
875   // had been inlined by applyDeferredReaders when it finds that
876   // deferredReaders[token.slot_] no longer points to this.  We accomplish
877   // this by stealing bit 0 from the pointer to record that the slot's
878   // element has no token, hence our use of uintptr_t in deferredReaders[].
879   static constexpr uintptr_t kTokenless = 0x1;
880 
881   // This is the starting location for Token-less unlock_shared().
882   static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastTokenlessSlot;
883 
884   // Last deferred reader slot used.
885   static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastDeferredReaderSlot;
886 
887   // Only indexes divisible by kDeferredSeparationFactor are used.
888   // If any of those elements points to a SharedMutexImpl, then it
889   // should be considered that there is a shared lock on that instance.
890   // See kTokenless.
891  public:
892   typedef Atom<uintptr_t> DeferredReaderSlot;
893 
894  private:
895   alignas(hardware_destructive_interference_size) static DeferredReaderSlot
896       deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor];
897 
898   // Performs an exclusive lock, waiting for state_ & waitMask to be
899   // zero first
900   template <class WaitContext>
901   bool lockExclusiveImpl(uint32_t preconditionGoalMask, WaitContext& ctx) {
902     uint32_t state = state_.load(std::memory_order_acquire);
903     if (LIKELY(
904             (state & (preconditionGoalMask | kMayDefer | kHasS)) == 0 &&
905             state_.compare_exchange_strong(state, (state | kHasE) & ~kHasU))) {
906       return true;
907     } else {
908       return lockExclusiveImpl(state, preconditionGoalMask, ctx);
909     }
910   }
911 
912   template <class WaitContext>
913   bool lockExclusiveImpl(
914       uint32_t& state,
915       uint32_t preconditionGoalMask,
916       WaitContext& ctx) {
917     while (true) {
918       if (UNLIKELY((state & preconditionGoalMask) != 0) &&
919           !waitForZeroBits(state, preconditionGoalMask, kWaitingE, ctx) &&
920           ctx.canTimeOut()) {
921         return false;
922       }
923 
924       uint32_t after = (state & kMayDefer) == 0 ? 0 : kPrevDefer;
925       if (!kReaderPriority || (state & (kMayDefer | kHasS)) == 0) {
926         // Block readers immediately, either because we are in write
927         // priority mode or because we can acquire the lock in one
928         // step.  Note that if state has kHasU, then we are doing an
929         // unlock_upgrade_and_lock() and we should clear it (reader
930         // priority branch also does this).
931         after |= (state | kHasE) & ~(kHasU | kMayDefer);
932       } else {
933         after |= (state | kBegunE) & ~(kHasU | kMayDefer);
934       }
935       if (state_.compare_exchange_strong(state, after)) {
936         auto before = state;
937         state = after;
938 
939         // If we set kHasE (writer priority) then no new readers can
940         // arrive.  If we set kBegunE then they can still enter, but
941         // they must be inline.  Either way we need to either spin on
942         // deferredReaders[] slots, or inline them so that we can wait on
943         // kHasS to zero itself.  deferredReaders[] is pointers, which on
944         // x86_64 are bigger than futex() can handle, so we inline the
945         // deferred locks instead of trying to futexWait on each slot.
946         // Readers are responsible for rechecking state_ after recording
947         // a deferred read to avoid atomicity problems between the state_
948         // CAS and applyDeferredReader's reads of deferredReaders[].
949         if (UNLIKELY((before & kMayDefer) != 0)) {
950           applyDeferredReaders(state, ctx);
951         }
952         while (true) {
953           assert((state & (kHasE | kBegunE)) != 0 && (state & kHasU) == 0);
954           if (UNLIKELY((state & kHasS) != 0) &&
955               !waitForZeroBits(state, kHasS, kWaitingNotS, ctx) &&
956               ctx.canTimeOut()) {
957             // Ugh.  We blocked new readers and other writers for a while,
958             // but were unable to complete.  Move on.  On the plus side
959             // we can clear kWaitingNotS because nobody else can piggyback
960             // on it.
961             state = (state_ &= ~(kPrevDefer | kHasE | kBegunE | kWaitingNotS));
962             wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
963             return false;
964           }
965 
966           if (kReaderPriority && (state & kHasE) == 0) {
967             assert((state & kBegunE) != 0);
968             if (!state_.compare_exchange_strong(
969                     state, (state & ~kBegunE) | kHasE)) {
970               continue;
971             }
972           }
973 
974           return true;
975         }
976       }
977     }
978   }
979 
980   template <class WaitContext>
981   bool waitForZeroBits(
982       uint32_t& state,
983       uint32_t goal,
984       uint32_t waitMask,
985       WaitContext& ctx) {
986     uint32_t spinCount = 0;
987     while (true) {
988       state = state_.load(std::memory_order_acquire);
989       if ((state & goal) == 0) {
990         return true;
991       }
992       asm_volatile_pause();
993       ++spinCount;
994       if (UNLIKELY(spinCount >= kMaxSpinCount)) {
995         return ctx.canBlock() &&
996             yieldWaitForZeroBits(state, goal, waitMask, ctx);
997       }
998     }
999   }
1000 
1001   template <class WaitContext>
1002   bool yieldWaitForZeroBits(
1003       uint32_t& state,
1004       uint32_t goal,
1005       uint32_t waitMask,
1006       WaitContext& ctx) {
1007 #ifdef RUSAGE_THREAD
1008     struct rusage usage;
1009     std::memset(&usage, 0, sizeof(usage));
1010     long before = -1;
1011 #endif
1012     for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
1013          ++yieldCount) {
1014       for (int softState = 0; softState < 3; ++softState) {
1015         if (softState < 2) {
1016           std::this_thread::yield();
1017         } else {
1018 #ifdef RUSAGE_THREAD
1019           getrusage(RUSAGE_THREAD, &usage);
1020 #endif
1021         }
1022         if (((state = state_.load(std::memory_order_acquire)) & goal) == 0) {
1023           return true;
1024         }
1025         if (ctx.shouldTimeOut()) {
1026           return false;
1027         }
1028       }
1029 #ifdef RUSAGE_THREAD
1030       if (before >= 0 && usage.ru_nivcsw >= before + 2) {
1031         // One involuntary csw might just be occasional background work,
1032         // but if we get two in a row then we guess that there is someone
1033         // else who can profitably use this CPU.  Fall back to futex
1034         break;
1035       }
1036       before = usage.ru_nivcsw;
1037 #endif
1038     }
1039     return futexWaitForZeroBits(state, goal, waitMask, ctx);
1040   }
1041 
1042   template <class WaitContext>
1043   bool futexWaitForZeroBits(
1044       uint32_t& state,
1045       uint32_t goal,
1046       uint32_t waitMask,
1047       WaitContext& ctx) {
1048     assert(
1049         waitMask == kWaitingNotS || waitMask == kWaitingE ||
1050         waitMask == kWaitingU || waitMask == kWaitingS);
1051 
1052     while (true) {
1053       state = state_.load(std::memory_order_acquire);
1054       if ((state & goal) == 0) {
1055         return true;
1056       }
1057 
1058       auto after = state;
1059       if (waitMask == kWaitingE) {
1060         if ((state & kWaitingESingle) != 0) {
1061           after |= kWaitingEMultiple;
1062         } else {
1063           after |= kWaitingESingle;
1064         }
1065       } else {
1066         after |= waitMask;
1067       }
1068 
1069       // CAS is better than atomic |= here, because it lets us avoid
1070       // setting the wait flag when the goal is concurrently achieved
1071       if (after != state && !state_.compare_exchange_strong(state, after)) {
1072         continue;
1073       }
1074 
1075       if (!ctx.doWait(state_, after, waitMask)) {
1076         // timed out
1077         return false;
1078       }
1079     }
1080   }
1081 
1082   // Wakes up waiters registered in state_ as appropriate, clearing the
1083   // awaiting bits for anybody that was awoken.  Tries to perform direct
1084   // single wakeup of an exclusive waiter if appropriate
1085   void wakeRegisteredWaiters(uint32_t& state, uint32_t wakeMask) {
1086     if (UNLIKELY((state & wakeMask) != 0)) {
1087       wakeRegisteredWaitersImpl(state, wakeMask);
1088     }
1089   }
1090 
1091   void wakeRegisteredWaitersImpl(uint32_t& state, uint32_t wakeMask) {
1092     // If there are multiple lock() pending only one of them will actually
1093     // get to wake up, so issuing futexWakeAll will make a thundering herd.
1094     // There's nothing stopping us from issuing futexWake(1) instead,
1095     // so long as the wait bits are still an accurate reflection of
1096     // the waiters.  If we notice (via futexWake's return value) that
1097     // nobody woke up then we can try again with the normal wake-all path.
1098     // Note that we can't just clear the bits at that point; we need to
1099     // clear the bits and then issue another wakeup.
1100     //
1101     // It is possible that we wake an E waiter but an outside S grabs the
1102     // lock instead, at which point we should wake pending U and S waiters.
1103     // Rather than tracking state to make the failing E regenerate the
1104     // wakeup, we just disable the optimization in the case that there
1105     // are waiting U or S that we are eligible to wake.
1106     if ((wakeMask & kWaitingE) == kWaitingE &&
1107         (state & wakeMask) == kWaitingE &&
1108         detail::futexWake(&state_, 1, kWaitingE) > 0) {
1109       // somebody woke up, so leave state_ as is and clear it later
1110       return;
1111     }
1112 
1113     if ((state & wakeMask) != 0) {
1114       auto prev = state_.fetch_and(~wakeMask);
1115       if ((prev & wakeMask) != 0) {
1116         futexWakeAll(wakeMask);
1117       }
1118       state = prev & ~wakeMask;
1119     }
1120   }
1121 
1122   void futexWakeAll(uint32_t wakeMask) {
1123     detail::futexWake(&state_, std::numeric_limits<int>::max(), wakeMask);
1124   }
1125 
1126   DeferredReaderSlot* deferredReader(uint32_t slot) {
1127     return &deferredReaders[slot * kDeferredSeparationFactor];
1128   }
1129 
1130   uintptr_t tokenfulSlotValue() {
1131     return reinterpret_cast<uintptr_t>(this);
1132   }
1133 
1134   uintptr_t tokenlessSlotValue() {
1135     return tokenfulSlotValue() | kTokenless;
1136   }
1137 
1138   bool slotValueIsThis(uintptr_t slotValue) {
1139     return (slotValue & ~kTokenless) == tokenfulSlotValue();
1140   }
1141 
1142   // Clears any deferredReaders[] that point to this, adjusting the inline
1143   // shared lock count to compensate.  Does some spinning and yielding
1144   // to avoid the work.  Always finishes the application, even if ctx
1145   // times out.
1146   template <class WaitContext>
1147   void applyDeferredReaders(uint32_t& state, WaitContext& ctx) {
1148     uint32_t slot = 0;
1149 
1150     uint32_t spinCount = 0;
1151     while (true) {
1152       while (!slotValueIsThis(
1153           deferredReader(slot)->load(std::memory_order_acquire))) {
1154         if (++slot == kMaxDeferredReaders) {
1155           return;
1156         }
1157       }
1158       asm_volatile_pause();
1159       if (UNLIKELY(++spinCount >= kMaxSpinCount)) {
1160         applyDeferredReaders(state, ctx, slot);
1161         return;
1162       }
1163     }
1164   }
1165 
1166   template <class WaitContext>
1167   void applyDeferredReaders(uint32_t& state, WaitContext& ctx, uint32_t slot) {
1168 #ifdef RUSAGE_THREAD
1169     struct rusage usage;
1170     std::memset(&usage, 0, sizeof(usage));
1171     long before = -1;
1172 #endif
1173     for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
1174          ++yieldCount) {
1175       for (int softState = 0; softState < 3; ++softState) {
1176         if (softState < 2) {
1177           std::this_thread::yield();
1178         } else {
1179 #ifdef RUSAGE_THREAD
1180           getrusage(RUSAGE_THREAD, &usage);
1181 #endif
1182         }
1183         while (!slotValueIsThis(
1184             deferredReader(slot)->load(std::memory_order_acquire))) {
1185           if (++slot == kMaxDeferredReaders) {
1186             return;
1187           }
1188         }
1189         if (ctx.shouldTimeOut()) {
1190           // finish applying immediately on timeout
1191           break;
1192         }
1193       }
1194 #ifdef RUSAGE_THREAD
1195       if (before >= 0 && usage.ru_nivcsw >= before + 2) {
1196         // heuristic says run queue is not empty
1197         break;
1198       }
1199       before = usage.ru_nivcsw;
1200 #endif
1201     }
1202 
1203     uint32_t movedSlotCount = 0;
1204     for (; slot < kMaxDeferredReaders; ++slot) {
1205       auto slotPtr = deferredReader(slot);
1206       auto slotValue = slotPtr->load(std::memory_order_acquire);
1207       if (slotValueIsThis(slotValue) &&
1208           slotPtr->compare_exchange_strong(slotValue, 0)) {
1209         ++movedSlotCount;
1210       }
1211     }
1212 
1213     if (movedSlotCount > 0) {
1214       state = (state_ += movedSlotCount * kIncrHasS);
1215     }
1216     assert((state & (kHasE | kBegunE)) != 0);
1217 
1218     // if state + kIncrHasS overflows (off the end of state) then either
1219     // we have 2^(32-9) readers (almost certainly an application bug)
1220     // or we had an underflow (also a bug)
1221     assert(state < state + kIncrHasS);
1222   }
1223 
1224   // It is straightforward to make a token-less lock_shared() and
1225   // unlock_shared() either by making the token-less version always use
1226   // INLINE_SHARED mode or by removing the token version.  Supporting
1227   // deferred operation for both types is trickier than it appears, because
1228   // the purpose of the token is so that unlock_shared doesn't have to
1229   // look in other slots for its deferred lock.  Token-less unlock_shared
1230   // might place a deferred lock in one place and then release a different
1231   // slot that was originally used by the token-ful version.  If this was
1232   // important we could solve the problem by differentiating the deferred
1233   // locks so that cross-variety release wouldn't occur.  The best way
1234   // is probably to steal a bit from the pointer, making deferredLocks[]
1235   // an array of Atom<uintptr_t>.
1236 
1237   template <class WaitContext>
1238   bool lockSharedImpl(Token* token, WaitContext& ctx) {
1239     uint32_t state = state_.load(std::memory_order_relaxed);
1240     if ((state & (kHasS | kMayDefer | kHasE)) == 0 &&
1241         state_.compare_exchange_strong(state, state + kIncrHasS)) {
1242       if (token != nullptr) {
1243         token->type_ = Token::Type::INLINE_SHARED;
1244       }
1245       return true;
1246     }
1247     return lockSharedImpl(state, token, ctx);
1248   }
1249 
1250   template <class WaitContext>
1251   bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx);
1252 
1253   // Updates the state in/out argument as if the locks were made inline,
1254   // but does not update state_
1255   void cleanupTokenlessSharedDeferred(uint32_t& state) {
1256     for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
1257       auto slotPtr = deferredReader(i);
1258       auto slotValue = slotPtr->load(std::memory_order_relaxed);
1259       if (slotValue == tokenlessSlotValue()) {
1260         slotPtr->store(0, std::memory_order_relaxed);
1261         state += kIncrHasS;
1262         if ((state & kHasS) == 0) {
1263           break;
1264         }
1265       }
1266     }
1267   }
1268 
1269   bool tryUnlockTokenlessSharedDeferred();
1270 
1271   bool tryUnlockSharedDeferred(uint32_t slot) {
1272     assert(slot < kMaxDeferredReaders);
1273     auto slotValue = tokenfulSlotValue();
1274     return deferredReader(slot)->compare_exchange_strong(slotValue, 0);
1275   }
1276 
1277   uint32_t unlockSharedInline() {
1278     uint32_t state = (state_ -= kIncrHasS);
1279     assert(
1280         (state & (kHasE | kBegunE | kMayDefer)) != 0 ||
1281         state < state + kIncrHasS);
1282     if ((state & kHasS) == 0) {
1283       // Only the second half of lock() can be blocked by a non-zero
1284       // reader count, so that's the only thing we need to wake
1285       wakeRegisteredWaiters(state, kWaitingNotS);
1286     }
1287     return state;
1288   }
1289 
1290   template <class WaitContext>
1291   bool lockUpgradeImpl(WaitContext& ctx) {
1292     uint32_t state;
1293     do {
1294       if (!waitForZeroBits(state, kHasSolo, kWaitingU, ctx)) {
1295         return false;
1296       }
1297     } while (!state_.compare_exchange_strong(state, state | kHasU));
1298     return true;
1299   }
1300 
1301  public:
1302   class ReadHolder {
1303     ReadHolder() : lock_(nullptr) {}
1304 
1305    public:
1306     explicit ReadHolder(const SharedMutexImpl* lock)
1307         : lock_(const_cast<SharedMutexImpl*>(lock)) {
1308       if (lock_) {
1309         lock_->lock_shared(token_);
1310       }
1311     }
1312 
1313     explicit ReadHolder(const SharedMutexImpl& lock)
1314         : lock_(const_cast<SharedMutexImpl*>(&lock)) {
1315       lock_->lock_shared(token_);
1316     }
1317 
1318     ReadHolder(ReadHolder&& rhs) noexcept
1319         : lock_(rhs.lock_), token_(rhs.token_) {
1320       rhs.lock_ = nullptr;
1321     }
1322 
1323     // Downgrade from upgrade mode
1324     explicit ReadHolder(UpgradeHolder&& upgraded) : lock_(upgraded.lock_) {
1325       assert(upgraded.lock_ != nullptr);
1326       upgraded.lock_ = nullptr;
1327       lock_->unlock_upgrade_and_lock_shared(token_);
1328     }
1329 
1330     // Downgrade from exclusive mode
1331     explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
1332       assert(writer.lock_ != nullptr);
1333       writer.lock_ = nullptr;
1334       lock_->unlock_and_lock_shared(token_);
1335     }
1336 
1337     ReadHolder& operator=(ReadHolder&& rhs) noexcept {
1338       std::swap(lock_, rhs.lock_);
1339       std::swap(token_, rhs.token_);
1340       return *this;
1341     }
1342 
1343     ReadHolder(const ReadHolder& rhs) = delete;
1344     ReadHolder& operator=(const ReadHolder& rhs) = delete;
1345 
1346     ~ReadHolder() {
1347       unlock();
1348     }
1349 
1350     void unlock() {
1351       if (lock_) {
1352         lock_->unlock_shared(token_);
1353         lock_ = nullptr;
1354       }
1355     }
1356 
1357    private:
1358     friend class UpgradeHolder;
1359     friend class WriteHolder;
1360     SharedMutexImpl* lock_;
1361     SharedMutexToken token_;
1362   };
1363 
1364   class UpgradeHolder {
1365     UpgradeHolder() : lock_(nullptr) {}
1366 
1367    public:
1368     explicit UpgradeHolder(SharedMutexImpl* lock) : lock_(lock) {
1369       if (lock_) {
1370         lock_->lock_upgrade();
1371       }
1372     }
1373 
1374     explicit UpgradeHolder(SharedMutexImpl& lock) : lock_(&lock) {
1375       lock_->lock_upgrade();
1376     }
1377 
1378     // Downgrade from exclusive mode
1379     explicit UpgradeHolder(WriteHolder&& writer) : lock_(writer.lock_) {
1380       assert(writer.lock_ != nullptr);
1381       writer.lock_ = nullptr;
1382       lock_->unlock_and_lock_upgrade();
1383     }
1384 
1385     UpgradeHolder(UpgradeHolder&& rhs) noexcept : lock_(rhs.lock_) {
1386       rhs.lock_ = nullptr;
1387     }
1388 
1389     UpgradeHolder& operator=(UpgradeHolder&& rhs) noexcept {
1390       std::swap(lock_, rhs.lock_);
1391       return *this;
1392     }
1393 
1394     UpgradeHolder(const UpgradeHolder& rhs) = delete;
1395     UpgradeHolder& operator=(const UpgradeHolder& rhs) = delete;
1396 
1397     ~UpgradeHolder() {
1398       unlock();
1399     }
1400 
1401     void unlock() {
1402       if (lock_) {
1403         lock_->unlock_upgrade();
1404         lock_ = nullptr;
1405       }
1406     }
1407 
1408    private:
1409     friend class WriteHolder;
1410     friend class ReadHolder;
1411     SharedMutexImpl* lock_;
1412   };
1413 
1414   class WriteHolder {
1415     WriteHolder() : lock_(nullptr) {}
1416 
1417    public:
1418     explicit WriteHolder(SharedMutexImpl* lock) : lock_(lock) {
1419       if (lock_) {
1420         lock_->lock();
1421       }
1422     }
1423 
1424     explicit WriteHolder(SharedMutexImpl& lock) : lock_(&lock) {
1425       lock_->lock();
1426     }
1427 
1428     // Promotion from upgrade mode
1429     explicit WriteHolder(UpgradeHolder&& upgrade) : lock_(upgrade.lock_) {
1430       assert(upgrade.lock_ != nullptr);
1431       upgrade.lock_ = nullptr;
1432       lock_->unlock_upgrade_and_lock();
1433     }
1434 
1435     // README:
1436     //
1437     // It is intended that WriteHolder(ReadHolder&& rhs) does not exist.
1438     //
1439     // Shared locks (read) cannot safely upgrade to unique locks (write).
1440     // That upgrade path is a well-known recipe for deadlock, so we explicitly
1441     // disallow it.
1442     //
1443     // If you need to do a conditional mutation, you have a few options:
1444     // 1. Check the condition under a shared lock and release it.
1445     //    Then maybe check the condition again under a unique lock and maybe do
1446     //    the mutation.
1447     // 2. Check the condition once under an upgradeable lock.
1448     //    Then maybe upgrade the lock to a unique lock and do the mutation.
    //    A commented sketch of this option follows these notes.
1449     // 3. Check the condition and maybe perform the mutation under a unique
1450     //    lock.
1451     //
1452     // Relevant upgradeable lock notes:
1453     // * At most one upgradeable lock can be held at a time for a given shared
1454     //   mutex, just like a unique lock.
1455     // * An upgradeable lock may be held concurrently with any number of shared
1456     //   locks.
1457     // * An upgradeable lock may be upgraded atomically to a unique lock.
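    //
    // A minimal commented sketch of option 2 (illustrative only; `Data`,
    // `needsChange`, and `mutate` are hypothetical names):
    //
    //   void maybeMutate(folly::SharedMutex& mu, Data& data) {
    //     folly::SharedMutex::UpgradeHolder uh(mu); // coexists with readers
    //     if (needsChange(data)) {
    //       // atomic promotion via unlock_upgrade_and_lock()
    //       folly::SharedMutex::WriteHolder wh(std::move(uh));
    //       mutate(data);
    //     }
    //   } // whichever holder still owns the lock releases it here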
1458 
1459     WriteHolder(WriteHolder&& rhs) noexcept : lock_(rhs.lock_) {
1460       rhs.lock_ = nullptr;
1461     }
1462 
1463     WriteHolder& operator=(WriteHolder&& rhs) noexcept {
1464       std::swap(lock_, rhs.lock_);
1465       return *this;
1466     }
1467 
1468     WriteHolder(const WriteHolder& rhs) = delete;
1469     WriteHolder& operator=(const WriteHolder& rhs) = delete;
1470 
1471     ~WriteHolder() {
1472       unlock();
1473     }
1474 
1475     void unlock() {
1476       if (lock_) {
1477         lock_->unlock();
1478         lock_ = nullptr;
1479       }
1480     }
1481 
1482    private:
1483     friend class ReadHolder;
1484     friend class UpgradeHolder;
1485     SharedMutexImpl* lock_;
1486   };
1487 
1488   // Adapters for Synchronized<>
1489   friend void acquireRead(SharedMutexImpl& lock) {
1490     lock.lock_shared();
1491   }
1492   friend void acquireReadWrite(SharedMutexImpl& lock) {
1493     lock.lock();
1494   }
1495   friend void releaseRead(SharedMutexImpl& lock) {
1496     lock.unlock_shared();
1497   }
1498   friend void releaseReadWrite(SharedMutexImpl& lock) {
1499     lock.unlock();
1500   }
1501   friend bool acquireRead(SharedMutexImpl& lock, unsigned int ms) {
1502     return lock.try_lock_shared_for(std::chrono::milliseconds(ms));
1503   }
1504   friend bool acquireReadWrite(SharedMutexImpl& lock, unsigned int ms) {
1505     return lock.try_lock_for(std::chrono::milliseconds(ms));
1506   }
1507 };
1508 
1509 typedef SharedMutexImpl<true> SharedMutexReadPriority;
1510 typedef SharedMutexImpl<false> SharedMutexWritePriority;
1511 typedef SharedMutexWritePriority SharedMutex;
1512 typedef SharedMutexImpl<false, void, std::atomic, false, false>
1513     SharedMutexSuppressTSAN;
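
// A minimal usage sketch of the default alias and its RAII holders
// (`sharedData`, `reader`, and `writer` are illustrative names):
//
//   folly::SharedMutex mutex;
//   int sharedData = 0;
//
//   void reader() {
//     folly::SharedMutex::ReadHolder rh(mutex); // lock_shared() with a token
//     (void)sharedData;                         // read-only access
//   }                                           // unlock_shared() in ~ReadHolder
//
//   void writer() {
//     folly::SharedMutex::WriteHolder wh(mutex); // exclusive lock()
//     ++sharedData;
//   }                                            // unlock() in ~WriteHolder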
1514 
1515 // Prevent the compiler from instantiating these in other translation units.
1516 // They are instantiated once in SharedMutex.cpp
1517 extern template class SharedMutexImpl<true>;
1518 extern template class SharedMutexImpl<false>;
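// (The matching explicit instantiations in SharedMutex.cpp are expected to
// look like:
//   template class SharedMutexImpl<true>;
//   template class SharedMutexImpl<false>;
// though the contents of that file are not shown here.)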
1519 
1520 template <
1521     bool ReaderPriority,
1522     typename Tag_,
1523     template <typename> class Atom,
1524     bool BlockImmediately,
1525     bool AnnotateForThreadSanitizer>
1526 alignas(hardware_destructive_interference_size) typename SharedMutexImpl<
1527     ReaderPriority,
1528     Tag_,
1529     Atom,
1530     BlockImmediately,
1531     AnnotateForThreadSanitizer>::DeferredReaderSlot
1532     SharedMutexImpl<
1533         ReaderPriority,
1534         Tag_,
1535         Atom,
1536         BlockImmediately,
1537         AnnotateForThreadSanitizer>::deferredReaders
1538         [kMaxDeferredReaders * kDeferredSeparationFactor] = {};
1539 
1540 template <
1541     bool ReaderPriority,
1542     typename Tag_,
1543     template <typename> class Atom,
1544     bool BlockImmediately,
1545     bool AnnotateForThreadSanitizer>
1546 FOLLY_SHAREDMUTEX_TLS uint32_t SharedMutexImpl<
1547     ReaderPriority,
1548     Tag_,
1549     Atom,
1550     BlockImmediately,
1551     AnnotateForThreadSanitizer>::tls_lastTokenlessSlot = 0;
1552 
1553 template <
1554     bool ReaderPriority,
1555     typename Tag_,
1556     template <typename> class Atom,
1557     bool BlockImmediately,
1558     bool AnnotateForThreadSanitizer>
1559 FOLLY_SHAREDMUTEX_TLS uint32_t SharedMutexImpl<
1560     ReaderPriority,
1561     Tag_,
1562     Atom,
1563     BlockImmediately,
1564     AnnotateForThreadSanitizer>::tls_lastDeferredReaderSlot = 0;
1565 
1566 template <
1567     bool ReaderPriority,
1568     typename Tag_,
1569     template <typename> class Atom,
1570     bool BlockImmediately,
1571     bool AnnotateForThreadSanitizer>
1572 bool SharedMutexImpl<
1573     ReaderPriority,
1574     Tag_,
1575     Atom,
1576     BlockImmediately,
1577     AnnotateForThreadSanitizer>::tryUnlockTokenlessSharedDeferred() {
1578   auto bestSlot = tls_lastTokenlessSlot;
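  // Probe order: xor-ing the loop index into bestSlot checks the slot this
  // thread most recently used for a token-less deferred lock first, then
  // nearby slots, so the common case succeeds on the first probe.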
1579   for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
1580     auto slotPtr = deferredReader(bestSlot ^ i);
1581     auto slotValue = slotPtr->load(std::memory_order_relaxed);
1582     if (slotValue == tokenlessSlotValue() &&
1583         slotPtr->compare_exchange_strong(slotValue, 0)) {
1584       tls_lastTokenlessSlot = bestSlot ^ i;
1585       return true;
1586     }
1587   }
1588   return false;
1589 }
1590 
1591 template <
1592     bool ReaderPriority,
1593     typename Tag_,
1594     template <typename> class Atom,
1595     bool BlockImmediately,
1596     bool AnnotateForThreadSanitizer>
1597 template <class WaitContext>
1598 bool SharedMutexImpl<
1599     ReaderPriority,
1600     Tag_,
1601     Atom,
1602     BlockImmediately,
1603     AnnotateForThreadSanitizer>::
1604     lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
1605   while (true) {
1606     if (UNLIKELY((state & kHasE) != 0) &&
1607         !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
1608       return false;
1609     }
1610 
1611     uint32_t slot = tls_lastDeferredReaderSlot;
1612     uintptr_t slotValue = 1; // any non-zero value will do
1613 
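    // Defer (park this reader in a slot instead of bumping state_) when
    // deferral is already enabled, or start deferring once enough inline
    // readers have accumulated; don't start deferring while a reader-priority
    // instance is draining readers for a pending writer.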
1614     bool canAlreadyDefer = (state & kMayDefer) != 0;
1615     bool aboveDeferThreshold =
1616         (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS;
1617     bool drainInProgress = ReaderPriority && (state & kBegunE) != 0;
1618     if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) {
1619       /* Try using the most recent slot first. */
1620       slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
1621       if (slotValue != 0) {
1622         // starting point for our empty-slot search, can change after
1623         // calling waitForZeroBits
1624         uint32_t bestSlot =
1625             (uint32_t)folly::AccessSpreader<Atom>::current(kMaxDeferredReaders);
1626 
1627         // deferred readers are already enabled, or it is time to
1628         // enable them if we can find a slot
1629         for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
1630           slot = bestSlot ^ i;
1631           assert(slot < kMaxDeferredReaders);
1632           slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
1633           if (slotValue == 0) {
1634             // found empty slot
1635             tls_lastDeferredReaderSlot = slot;
1636             break;
1637           }
1638         }
1639       }
1640     }
1641 
1642     if (slotValue != 0) {
1643       // not yet deferred, or no empty slots
1644       if (state_.compare_exchange_strong(state, state + kIncrHasS)) {
1645         // successfully recorded the read lock inline
1646         if (token != nullptr) {
1647           token->type_ = Token::Type::INLINE_SHARED;
1648         }
1649         return true;
1650       }
1651       // state is updated, try again
1652       continue;
1653     }
1654 
1655     // record that deferred readers might be in use if necessary
1656     if ((state & kMayDefer) == 0) {
1657       if (!state_.compare_exchange_strong(state, state | kMayDefer)) {
1658         // keep going if CAS failed because somebody else set the bit
1659         // for us
1660         if ((state & (kHasE | kMayDefer)) != kMayDefer) {
1661           continue;
1662         }
1663       }
1664       // state = state | kMayDefer;
1665     }
1666 
1667     // try to use the slot
1668     bool gotSlot = deferredReader(slot)->compare_exchange_strong(
1669         slotValue,
1670         token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue());
1671 
1672     // If we got the slot, we need to verify that an exclusive lock
1673     // didn't happen since we last checked.  If we didn't get the slot we
1674     // need to recheck state_ anyway to make sure we don't waste too much
1675     // work.  It is also possible that since we checked state_ someone
1676     // has acquired and released the write lock, clearing kMayDefer.
1677     // Both cases are covered by looking for the readers-possible bit,
1678     // because it is off when the exclusive lock bit is set.
1679     state = state_.load(std::memory_order_acquire);
1680 
1681     if (!gotSlot) {
1682       continue;
1683     }
1684 
1685     if (token == nullptr) {
1686       tls_lastTokenlessSlot = slot;
1687     }
1688 
1689     if ((state & kMayDefer) != 0) {
1690       assert((state & kHasE) == 0);
1691       // success
1692       if (token != nullptr) {
1693         token->type_ = Token::Type::DEFERRED_SHARED;
1694         token->slot_ = (uint16_t)slot;
1695       }
1696       return true;
1697     }
1698 
1699     // release the slot before retrying
1700     if (token == nullptr) {
1701       // We can't rely on slot.  Token-less slot values can be freed by
1702       // any unlock_shared(), so we need to do the full deferredReader
1703       // search during unlock.  Unlike unlock_shared(), we can't trust
1704       // kPrevDefer here.  This deferred lock isn't visible to lock()
1705       // (that's the whole reason we're undoing it) so there might have
1706       // subsequently been an unlock() and lock() with no intervening
1707       // transition to deferred mode.
1708       if (!tryUnlockTokenlessSharedDeferred()) {
1709         unlockSharedInline();
1710       }
1711     } else {
1712       if (!tryUnlockSharedDeferred(slot)) {
1713         unlockSharedInline();
1714       }
1715     }
1716 
1717     // We got here not because the lock was unavailable, but because
1718     // we lost a compare-and-swap.  Try-lock is typically allowed to
1719     // have spurious failures, but there is no lock efficiency gain
1720     // from exploiting that freedom here.
1721   }
1722 }
1723 
1724 } // namespace folly
1725