/*
 * Copyright 2015-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// @author Nathan Bronson (ngbronson@fb.com)

#pragma once

#include <stdint.h>

#include <atomic>
#include <chrono>
#include <cstring>
#include <limits>
#include <mutex>
#include <thread>
#include <type_traits>

#include <folly/CPortability.h>
#include <folly/Likely.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/detail/Futex.h>
#include <folly/portability/Asm.h>
#include <folly/portability/SysResource.h>
#include <folly/synchronization/SanitizeThread.h>

// SharedMutex is a reader-writer lock. It is small, very fast, scalable
// on multi-core, and suitable for use when readers or writers may block.
// Unlike most other reader-writer locks, its throughput with concurrent
// readers scales linearly; it is able to acquire and release the lock
// in shared mode without cache line ping-ponging. It is suitable for
// a wide range of lock hold times because it starts with spinning,
// proceeds to using sched_yield with a preemption heuristic, and then
// waits using futex and precise wakeups.
//
// SharedMutex provides all of the methods of folly::RWSpinLock,
// boost::shared_mutex, boost::upgrade_mutex, and C++14's
// std::shared_timed_mutex. All operations that can block are available
// in try, try-for, and try-until (system_clock or steady_clock) versions.
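//
// For example (an illustrative sketch, not part of this header's API
// surface), the blocking, try, timed, and deadline flavors of exclusive
// acquisition look like this:
//
//   folly::SharedMutex mu;
//   mu.lock();                                          // blocks
//   mu.unlock();
//   if (mu.try_lock()) {                                // never blocks
//     mu.unlock();
//   }
//   if (mu.try_lock_for(std::chrono::milliseconds(5))) {
//     mu.unlock();
//   }
//   if (mu.try_lock_until(std::chrono::steady_clock::now() +
//                         std::chrono::milliseconds(5))) {
//     mu.unlock();
//   }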
//
// SharedMutexReadPriority gives priority to readers,
// SharedMutexWritePriority gives priority to writers. SharedMutex is an
// alias for SharedMutexWritePriority, because writer starvation is more
// likely than reader starvation for the read-heavy workloads targeted
// by SharedMutex.
//
// In my tests SharedMutex is as good or better than the other
// reader-writer locks in use at Facebook for almost all use cases,
// sometimes by a wide margin. (If it is rare that there are actually
// concurrent readers then RWSpinLock can be a few nanoseconds faster.)
// I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64,
// boost::shared_mutex, pthread_rwlock_t, and a RWLock that internally uses
// spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block.
// (Thrift's ReadWriteMutex is based underneath on pthread_rwlock_t.)
// It is generally as good or better than the rest when evaluating size,
// speed, scalability, or latency outliers. In the corner cases where
// it is not the fastest (such as single-threaded use or heavy write
// contention) it is never very much worse than the best. See the bottom
// of folly/test/SharedMutexTest.cpp for lots of microbenchmark results.
//
// Comparison to folly::RWSpinLock:
//
// * SharedMutex is faster than RWSpinLock when there are actually
//   concurrent read accesses (sometimes much faster), and ~5 nanoseconds
//   slower when there is not actually any contention. SharedMutex is
//   faster in every (benchmarked) scenario where the shared mode of
//   the lock is actually useful.
//
// * Concurrent shared access to SharedMutex scales linearly, while total
//   RWSpinLock throughput drops as more threads try to access the lock
//   in shared mode. Under very heavy read contention SharedMutex can
//   be two orders of magnitude faster than RWSpinLock (or any reader
//   writer lock that doesn't use striping or deferral).
//
// * SharedMutex can safely protect blocking calls, because after an
//   initial period of spinning it waits using futex().
//
// * RWSpinLock prioritizes readers, SharedMutex has both reader- and
//   writer-priority variants, but defaults to write priority.
//
// * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's
//   doesn't. Both semantics are reasonable. The boost documentation
//   doesn't explicitly talk about this behavior (except by omitting
//   any statement that those lock modes conflict), but the boost
//   implementations do allow new readers while the upgradeable mode
//   is held. See https://github.com/boostorg/thread/blob/master/
//   include/boost/thread/pthread/shared_mutex.hpp
//
// * RWSpinLock::UpgradedHolder maps to SharedMutex::UpgradeHolder
//   (UpgradeableHolder would be even more pedantically correct).
//   SharedMutex's holders have fewer methods (no reset) and are less
//   tolerant (promotion and downgrade crash if the donor doesn't own
//   the lock, and you must use the default constructor rather than
//   passing a nullptr to the pointer constructor).
//
// Both SharedMutex and RWSpinLock provide "exclusive", "upgrade",
// and "shared" modes. At all times num_threads_holding_exclusive +
// num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive ==
// 0 || num_threads_holding_shared == 0. RWSpinLock has the additional
// constraint that num_threads_holding_shared cannot increase while
// num_threads_holding_upgrade is non-zero.
//
// Comparison to the internal RWLock:
//
// * SharedMutex doesn't allow a maximum reader count to be configured,
//   so it can't be used as a semaphore in the same way as RWLock.
//
// * SharedMutex is 4 bytes, RWLock is 256.
//
// * SharedMutex is as fast or faster than RWLock in all of my
//   microbenchmarks, and has positive rather than negative scalability.
//
// * RWLock and SharedMutex are both writer priority locks.
//
// * SharedMutex avoids latency outliers as well as RWLock.
//
// * SharedMutex uses different names (t != 0 below):
//
//   RWLock::lock(0)    => SharedMutex::lock()
//
//   RWLock::lock(t)    => SharedMutex::try_lock_for(milliseconds(t))
//
//   RWLock::tryLock()  => SharedMutex::try_lock()
//
//   RWLock::unlock()   => SharedMutex::unlock()
//
//   RWLock::enter(0)   => SharedMutex::lock_shared()
//
//   RWLock::enter(t)   =>
//       SharedMutex::try_lock_shared_for(milliseconds(t))
//
//   RWLock::tryEnter() => SharedMutex::try_lock_shared()
//
//   RWLock::leave()    => SharedMutex::unlock_shared()
//
// * RWLock allows the reader count to be adjusted by a value other
//   than 1 during enter() or leave(). SharedMutex doesn't currently
//   implement this feature.
//
// * RWLock's methods are marked const, SharedMutex's aren't.
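//
// For instance (a sketch), the RWLock::enter(t) pattern translates to a
// timed shared acquisition:
//
//   folly::SharedMutex mu;
//   if (mu.try_lock_shared_for(std::chrono::milliseconds(10))) {
//     // ... read the protected data ...
//     mu.unlock_shared();
//   }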
//
// Reader-writer locks have the potential to allow concurrent access
// to shared read-mostly data, but in practice they often provide no
// improvement over a mutex. The problem is the cache coherence protocol
// of modern CPUs. Coherence is provided by making sure that when a cache
// line is written it is present in only one core's cache. Since a memory
// write is required to acquire a reader-writer lock in shared mode, the
// cache line holding the lock is invalidated in all of the other caches.
// This leads to cache misses when another thread wants to acquire or
// release the lock concurrently. When the RWLock is colocated with the
// data it protects (common), cache misses can also continue to occur when
// a thread that already holds the lock tries to read the protected data.
//
// Ideally, a reader-writer lock would allow multiple cores to acquire
// and release the lock in shared mode without incurring any cache misses.
// This requires that each core records its shared access in a cache line
// that isn't read or written by other read-locking cores. (Writers will
// have to check all of the cache lines.) Typical server hardware when
// this comment was written has 16 L1 caches and cache lines of 64 bytes,
// so a lock striped over all L1 caches would occupy a prohibitive 1024
// bytes. Nothing says that we need a separate set of per-core memory
// locations for each lock, however. Each SharedMutex instance is only
// 4 bytes, but all locks together share a 2K area in which they make a
// core-local record of lock acquisitions.
//
// SharedMutex's strategy of using a shared set of core-local stripes has
// a potential downside, because it means that acquisition of any lock in
// write mode can conflict with acquisition of any lock in shared mode.
// If a lock instance doesn't actually experience concurrency then this
// downside will outweigh the upside of improved scalability for readers.
// To avoid this problem we dynamically detect concurrent accesses to
// SharedMutex, and don't start using the deferred mode unless we actually
// observe concurrency. See kNumSharedToStartDeferring.
//
// It is explicitly allowed to call unlock_shared() from a different
// thread than lock_shared(), so long as they are properly paired.
// unlock_shared() needs to find the location at which lock_shared()
// recorded the lock, which might be in the lock itself or in any of
// the shared slots. If you can conveniently pass state from lock
// acquisition to release then the fastest mechanism is to std::move
// the SharedMutex::ReadHolder instance or a SharedMutex::Token (using
// lock_shared(Token&) and unlock_shared(Token&)). The guard or token
// will tell unlock_shared where in deferredReaders[] to look for the
// deferred lock. The Token-less version of unlock_shared() works in all
// cases, but is optimized for the common (no inter-thread handoff) case.
//
// In both read- and write-priority mode, a waiting lock() (exclusive mode)
// only blocks readers after it has waited for an active upgrade lock to be
// released; until the upgrade lock is released (or upgraded or downgraded)
// readers will still be able to enter. Preferences about lock acquisition
// are not guaranteed to be enforced perfectly (even if they were, there
// is theoretically the chance that a thread could be arbitrarily suspended
// between calling lock() and SharedMutex code actually getting executed).
//
// try_*_for methods always try at least once, even if the duration
// is zero or negative. The duration type must be compatible with
// std::chrono::steady_clock. try_*_until methods also always try at
// least once. std::chrono::system_clock and std::chrono::steady_clock
// are supported.
//
// If you have observed by profiling that your SharedMutex instances are
// getting cache misses on deferredReaders[] due to another SharedMutex
// user, then you can use the tag type to create your own instantiation
// of the type. The contention threshold (see kNumSharedToStartDeferring)
// should make this unnecessary in all but the most extreme cases. Make
// sure to check that the increased icache and dcache footprint of the
// tagged result is worth it.
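//
// Example (an illustrative sketch only): the token and guard forms of
// shared locking, including the state handoff via std::move that the
// comment above describes.
//
//   folly::SharedMutex mu;
//
//   void tokenful() {
//     folly::SharedMutex::Token token;
//     mu.lock_shared(token);
//     // ... read ...
//     mu.unlock_shared(token);  // token says where the lock was recorded
//   }
//
//   void guarded() {
//     folly::SharedMutex::ReadHolder guard(&mu);
//     // ... read ...
//   }  // guard releases through its stored token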

// SharedMutex's use of thread local storage is an optimization, so
// for the case where thread local storage is not supported, define it
// away.

// Note about TSAN (ThreadSanitizer): the SharedMutexWritePriority version
// (the default) of this mutex is annotated appropriately so that TSAN can
// perform lock inversion analysis. However, the SharedMutexReadPriority
// version is not annotated. This is because TSAN's lock order heuristic
// assumes that two calls to lock_shared must be ordered, which leads
// to too many false positives for the reader-priority case.
//
// Suppose thread A holds a SharedMutexWritePriority lock in shared mode
// and an independent thread B is waiting for exclusive access. Then a
// thread C's lock_shared can't proceed until A has released the lock.
// Discounting situations that never use exclusive mode (so no lock is
// necessary at all) this means that without higher-level reasoning it is
// not safe to ignore reader <-> reader interactions.
//
// This reasoning does not apply to SharedMutexReadPriority, because there
// are no actions by a thread B that can make C need to wait for A. Since
// the overwhelming majority of SharedMutex instances use write priority,
// we restrict the TSAN annotations to only SharedMutexWritePriority.

#ifndef FOLLY_SHAREDMUTEX_TLS
#if !FOLLY_MOBILE
#define FOLLY_SHAREDMUTEX_TLS FOLLY_TLS
#else
#define FOLLY_SHAREDMUTEX_TLS
#endif
#endif

namespace folly {

struct SharedMutexToken {
  enum class Type : uint16_t {
    INVALID = 0,
    INLINE_SHARED,
    DEFERRED_SHARED,
  };

  Type type_;
  uint16_t slot_;
};

namespace detail {
// Returns a guard that gives permission for the current thread to
// annotate, and adjust the annotation bits in, the SharedMutex at ptr.
std::unique_lock<std::mutex> sharedMutexAnnotationGuard(void* ptr);
} // namespace detail

template <
    bool ReaderPriority,
    typename Tag_ = void,
    template <typename> class Atom = std::atomic,
    bool BlockImmediately = false,
    bool AnnotateForThreadSanitizer = kIsSanitizeThread && !ReaderPriority>
class SharedMutexImpl {
 public:
  static constexpr bool kReaderPriority = ReaderPriority;

  typedef Tag_ Tag;

  typedef SharedMutexToken Token;

  class ReadHolder;
  class UpgradeHolder;
  class WriteHolder;

  constexpr SharedMutexImpl() noexcept : state_(0) {}

  SharedMutexImpl(const SharedMutexImpl&) = delete;
  SharedMutexImpl(SharedMutexImpl&&) = delete;
  SharedMutexImpl& operator=(const SharedMutexImpl&) = delete;
  SharedMutexImpl& operator=(SharedMutexImpl&&) = delete;

  // It is an error to destroy a SharedMutex that still has
  // any outstanding locks. This is checked if NDEBUG isn't defined.
  // SharedMutex's exclusive mode can be safely used to guard the lock's
  // own destruction. If, for example, you acquire the lock in exclusive
  // mode and then observe that the object containing the lock is no longer
  // needed, you can unlock() and then immediately destroy the lock.
  // See https://sourceware.org/bugzilla/show_bug.cgi?id=13690 for a
  // description about why this property needs to be explicitly mentioned.
  ~SharedMutexImpl() {
    auto state = state_.load(std::memory_order_relaxed);
    if (UNLIKELY((state & kHasS) != 0)) {
      cleanupTokenlessSharedDeferred(state);
    }

#ifndef NDEBUG
    // These asserts check that everybody has released the lock before it
    // is destroyed. If you arrive here while debugging that is likely
    // the problem. (You could also have general heap corruption.)

    // if a futexWait fails to go to sleep because the value has been
    // changed, we don't necessarily clean up the wait bits, so it is
    // possible they will be set here in a correct system
    assert((state & ~(kWaitingAny | kMayDefer | kAnnotationCreated)) == 0);
    if ((state & kMayDefer) != 0) {
      for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) {
        auto slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
        assert(!slotValueIsThis(slotValue));
      }
    }
#endif
    annotateDestroy();
  }

  void lock() {
    WaitForever ctx;
    (void)lockExclusiveImpl(kHasSolo, ctx);
    annotateAcquired(annotate_rwlock_level::wrlock);
  }

  bool try_lock() {
    WaitNever ctx;
    auto result = lockExclusiveImpl(kHasSolo, ctx);
    annotateTryAcquired(result, annotate_rwlock_level::wrlock);
    return result;
  }

  template <class Rep, class Period>
  bool try_lock_for(const std::chrono::duration<Rep, Period>& duration) {
    WaitForDuration<Rep, Period> ctx(duration);
    auto result = lockExclusiveImpl(kHasSolo, ctx);
    annotateTryAcquired(result, annotate_rwlock_level::wrlock);
    return result;
  }

  template <class Clock, class Duration>
  bool try_lock_until(
      const std::chrono::time_point<Clock, Duration>& absDeadline) {
    WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
    auto result = lockExclusiveImpl(kHasSolo, ctx);
    annotateTryAcquired(result, annotate_rwlock_level::wrlock);
    return result;
  }

  void unlock() {
    annotateReleased(annotate_rwlock_level::wrlock);
    // It is possible that we have a left-over kWaitingNotS if the last
    // unlock_shared() that let our matching lock() complete finished
    // releasing before lock()'s futexWait went to sleep. Clean it up now.
    auto state = (state_ &= ~(kWaitingNotS | kPrevDefer | kHasE));
    assert((state & ~(kWaitingAny | kAnnotationCreated)) == 0);
    wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
  }

  // Managing the token yourself makes unlock_shared a bit faster

  void lock_shared() {
    WaitForever ctx;
    (void)lockSharedImpl(nullptr, ctx);
    annotateAcquired(annotate_rwlock_level::rdlock);
  }

  void lock_shared(Token& token) {
    WaitForever ctx;
    (void)lockSharedImpl(&token, ctx);
    annotateAcquired(annotate_rwlock_level::rdlock);
  }

  bool try_lock_shared() {
    WaitNever ctx;
    auto result = lockSharedImpl(nullptr, ctx);
    annotateTryAcquired(result, annotate_rwlock_level::rdlock);
    return result;
  }

  bool try_lock_shared(Token& token) {
    WaitNever ctx;
    auto result = lockSharedImpl(&token, ctx);
    annotateTryAcquired(result, annotate_rwlock_level::rdlock);
    return result;
  }

  template <class Rep, class Period>
  bool try_lock_shared_for(
      const std::chrono::duration<Rep, Period>& duration) {
    WaitForDuration<Rep, Period> ctx(duration);
    auto result = lockSharedImpl(nullptr, ctx);
    annotateTryAcquired(result, annotate_rwlock_level::rdlock);
    return result;
  }

  template <class Rep, class Period>
  bool try_lock_shared_for(
      const std::chrono::duration<Rep, Period>& duration,
      Token& token) {
    WaitForDuration<Rep, Period> ctx(duration);
    auto result = lockSharedImpl(&token, ctx);
    annotateTryAcquired(result, annotate_rwlock_level::rdlock);
    return result;
  }

  template <class Clock, class Duration>
  bool try_lock_shared_until(
      const std::chrono::time_point<Clock, Duration>& absDeadline) {
    WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
    auto result = lockSharedImpl(nullptr, ctx);
    annotateTryAcquired(result, annotate_rwlock_level::rdlock);
    return result;
  }

  template <class Clock, class Duration>
  bool try_lock_shared_until(
      const std::chrono::time_point<Clock, Duration>& absDeadline,
      Token& token) {
    WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
    auto result = lockSharedImpl(&token, ctx);
    annotateTryAcquired(result, annotate_rwlock_level::rdlock);
    return result;
  }

  void unlock_shared() {
    annotateReleased(annotate_rwlock_level::rdlock);

    auto state = state_.load(std::memory_order_acquire);

    // kPrevDefer can only be set if HasE or BegunE is set
    assert((state & (kPrevDefer | kHasE | kBegunE)) != kPrevDefer);

    // lock() strips kMayDefer immediately, but then copies it to
    // kPrevDefer so we can tell if the pre-lock() lock_shared() might
    // have deferred
    if ((state & (kMayDefer | kPrevDefer)) == 0 ||
        !tryUnlockTokenlessSharedDeferred()) {
      // Matching lock_shared() couldn't have deferred, or the deferred
      // lock has already been inlined by applyDeferredReaders()
      unlockSharedInline();
    }
  }

  void unlock_shared(Token& token) {
    annotateReleased(annotate_rwlock_level::rdlock);

    assert(
        token.type_ == Token::Type::INLINE_SHARED ||
        token.type_ == Token::Type::DEFERRED_SHARED);

    if (token.type_ != Token::Type::DEFERRED_SHARED ||
        !tryUnlockSharedDeferred(token.slot_)) {
      unlockSharedInline();
    }
#ifndef NDEBUG
    token.type_ = Token::Type::INVALID;
#endif
  }

  void unlock_and_lock_shared() {
    annotateReleased(annotate_rwlock_level::wrlock);
    annotateAcquired(annotate_rwlock_level::rdlock);
    // We can't use state_ -=, because we need to clear 2 bits (1 of which
    // has an uncertain initial state) and set 1 other. We might as well
    // clear the relevant wake bits at the same time. Note that since S
    // doesn't block the beginning of a transition to E (writer priority
    // can cut off new S, reader priority grabs BegunE and blocks deferred
    // S) we need to wake E as well.
    auto state = state_.load(std::memory_order_acquire);
    do {
      assert(
          (state & ~(kWaitingAny | kPrevDefer | kAnnotationCreated)) == kHasE);
    } while (!state_.compare_exchange_strong(
        state, (state & ~(kWaitingAny | kPrevDefer | kHasE)) + kIncrHasS));
    if ((state & (kWaitingE | kWaitingU | kWaitingS)) != 0) {
      futexWakeAll(kWaitingE | kWaitingU | kWaitingS);
    }
  }

  void unlock_and_lock_shared(Token& token) {
    unlock_and_lock_shared();
    token.type_ = Token::Type::INLINE_SHARED;
  }

  void lock_upgrade() {
    WaitForever ctx;
    (void)lockUpgradeImpl(ctx);
    // For TSAN: treat upgrade locks as equivalent to read locks
    annotateAcquired(annotate_rwlock_level::rdlock);
  }

  bool try_lock_upgrade() {
    WaitNever ctx;
    auto result = lockUpgradeImpl(ctx);
    annotateTryAcquired(result, annotate_rwlock_level::rdlock);
    return result;
  }

  template <class Rep, class Period>
  bool try_lock_upgrade_for(
      const std::chrono::duration<Rep, Period>& duration) {
    WaitForDuration<Rep, Period> ctx(duration);
    auto result = lockUpgradeImpl(ctx);
    annotateTryAcquired(result, annotate_rwlock_level::rdlock);
    return result;
  }

  template <class Clock, class Duration>
  bool try_lock_upgrade_until(
      const std::chrono::time_point<Clock, Duration>& absDeadline) {
    WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
    auto result = lockUpgradeImpl(ctx);
    annotateTryAcquired(result, annotate_rwlock_level::rdlock);
    return result;
  }

  void unlock_upgrade() {
    annotateReleased(annotate_rwlock_level::rdlock);
    auto state = (state_ -= kHasU);
    assert((state & (kWaitingNotS | kHasSolo)) == 0);
    wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
  }

  void unlock_upgrade_and_lock() {
    // no waiting necessary, so waitMask is empty
    WaitForever ctx;
    (void)lockExclusiveImpl(0, ctx);
    annotateReleased(annotate_rwlock_level::rdlock);
    annotateAcquired(annotate_rwlock_level::wrlock);
  }
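
  // A typical upgrade-mode round trip looks like this (an illustrative
  // sketch; `mustWrite` stands in for whatever condition check the
  // caller performs):
  //
  //   mu.lock_upgrade();               // compatible with shared holders
  //   if (mustWrite) {
  //     mu.unlock_upgrade_and_lock();  // atomic promotion to exclusive
  //     // ... mutate ...
  //     mu.unlock();
  //   } else {
  //     mu.unlock_upgrade();
  //   }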

  void unlock_upgrade_and_lock_shared() {
    // No need to annotate for TSAN here because we model upgrade and shared
    // locks as the same.
    auto state = (state_ -= kHasU - kIncrHasS);
    assert((state & (kWaitingNotS | kHasSolo)) == 0);
    wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
  }

  void unlock_upgrade_and_lock_shared(Token& token) {
    unlock_upgrade_and_lock_shared();
    token.type_ = Token::Type::INLINE_SHARED;
  }

  void unlock_and_lock_upgrade() {
    annotateReleased(annotate_rwlock_level::wrlock);
    annotateAcquired(annotate_rwlock_level::rdlock);
    // We can't use state_ -=, because we need to clear 2 bits (1 of
    // which has an uncertain initial state) and set 1 other. We might
    // as well clear the relevant wake bits at the same time.
    auto state = state_.load(std::memory_order_acquire);
    while (true) {
      assert(
          (state & ~(kWaitingAny | kPrevDefer | kAnnotationCreated)) == kHasE);
      auto after =
          (state & ~(kWaitingNotS | kWaitingS | kPrevDefer | kHasE)) + kHasU;
      if (state_.compare_exchange_strong(state, after)) {
        if ((state & kWaitingS) != 0) {
          futexWakeAll(kWaitingS);
        }
        return;
      }
    }
  }

 private:
  typedef typename folly::detail::Futex<Atom> Futex;

  // Internally we use four kinds of wait contexts. These are structs
  // that provide a doWait method that returns true if a futex wake
  // was issued that intersects with the waitMask, false if there was a
  // timeout and no more waiting should be performed. Spinning occurs
  // before the wait context is invoked.

  struct WaitForever {
    bool canBlock() {
      return true;
    }
    bool canTimeOut() {
      return false;
    }
    bool shouldTimeOut() {
      return false;
    }

    bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
      detail::futexWait(&futex, expected, waitMask);
      return true;
    }
  };

  struct WaitNever {
    bool canBlock() {
      return false;
    }
    bool canTimeOut() {
      return true;
    }
    bool shouldTimeOut() {
      return true;
    }

    bool doWait(
        Futex& /* futex */,
        uint32_t /* expected */,
        uint32_t /* waitMask */) {
      return false;
    }
  };

  template <class Rep, class Period>
  struct WaitForDuration {
    std::chrono::duration<Rep, Period> duration_;
    bool deadlineComputed_;
    std::chrono::steady_clock::time_point deadline_;

    explicit WaitForDuration(const std::chrono::duration<Rep, Period>& duration)
        : duration_(duration), deadlineComputed_(false) {}

    std::chrono::steady_clock::time_point deadline() {
      if (!deadlineComputed_) {
        deadline_ = std::chrono::steady_clock::now() + duration_;
        deadlineComputed_ = true;
      }
      return deadline_;
    }

    bool canBlock() {
      return duration_.count() > 0;
    }
    bool canTimeOut() {
      return true;
    }

    bool shouldTimeOut() {
      return std::chrono::steady_clock::now() > deadline();
    }

    bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
      auto result =
          detail::futexWaitUntil(&futex, expected, deadline(), waitMask);
      return result != folly::detail::FutexResult::TIMEDOUT;
    }
  };

  template <class Clock, class Duration>
  struct WaitUntilDeadline {
    std::chrono::time_point<Clock, Duration> absDeadline_;

    bool canBlock() {
      return true;
    }
    bool canTimeOut() {
      return true;
    }
    bool shouldTimeOut() {
      return Clock::now() > absDeadline_;
    }

    bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
      auto result =
          detail::futexWaitUntil(&futex, expected, absDeadline_, waitMask);
      return result != folly::detail::FutexResult::TIMEDOUT;
    }
  };

  void annotateLazyCreate() {
    if (AnnotateForThreadSanitizer &&
        (state_.load() & kAnnotationCreated) == 0) {
      auto guard = detail::sharedMutexAnnotationGuard(this);
      // check again
      if ((state_.load() & kAnnotationCreated) == 0) {
        state_.fetch_or(kAnnotationCreated);
        annotate_benign_race_sized(
            &state_, sizeof(state_), "init TSAN", __FILE__, __LINE__);
        annotate_rwlock_create(this, __FILE__, __LINE__);
      }
    }
  }

  void annotateDestroy() {
    if (AnnotateForThreadSanitizer) {
      annotateLazyCreate();
      annotate_rwlock_destroy(this, __FILE__, __LINE__);
    }
  }

  void annotateAcquired(annotate_rwlock_level w) {
    if (AnnotateForThreadSanitizer) {
      annotateLazyCreate();
      annotate_rwlock_acquired(this, w, __FILE__, __LINE__);
    }
  }

  void annotateTryAcquired(bool result, annotate_rwlock_level w) {
    if (AnnotateForThreadSanitizer) {
      annotateLazyCreate();
      annotate_rwlock_try_acquired(this, w, result, __FILE__, __LINE__);
    }
  }

  void annotateReleased(annotate_rwlock_level w) {
    if (AnnotateForThreadSanitizer) {
      assert((state_.load() & kAnnotationCreated) != 0);
      annotate_rwlock_released(this, w, __FILE__, __LINE__);
    }
  }

  // 32 bits of state
  Futex state_{};

  // S count needs to be on the end, because we explicitly allow it to
  // underflow. This can occur while we are in the middle of applying
  // deferred locks (we remove them from deferredReaders[] before
  // inlining them), or during token-less unlock_shared() if a racing
  // lock_shared();unlock_shared() moves the deferredReaders slot while
  // the first unlock_shared() is scanning. The former case is cleaned
  // up before we finish applying the locks. The latter case can persist
  // until destruction, when it is cleaned up.
  static constexpr uint32_t kIncrHasS = 1 << 11;
  static constexpr uint32_t kHasS = ~(kIncrHasS - 1);

  // Set if annotation has been completed for this instance. That annotation
  // (and setting this bit afterward) must be guarded by one of the mutexes in
  // annotationCreationGuards.
  static constexpr uint32_t kAnnotationCreated = 1 << 10;

  // If unset, then there are definitely no deferred read locks for this
  // instance. Cleared after initialization and when exclusively locked.
  static constexpr uint32_t kMayDefer = 1 << 9;

  // lock() clears kMayDefer as soon as it starts draining readers (so
  // that it doesn't have to do a second CAS once drain completes), but
  // unlock_shared() still needs to know whether to scan deferredReaders[]
  // or not. We copy kMayDefer to kPrevDefer when setting kHasE or
  // kBegunE, and clear it when clearing those bits.
  static constexpr uint32_t kPrevDefer = 1 << 8;

  // Exclusive-locked blocks all read locks and write locks. This bit
  // may be set before all readers have finished, but in that case the
  // thread that sets it won't return to the caller until all read locks
  // have been released.
  static constexpr uint32_t kHasE = 1 << 7;

  // Exclusive-draining means that lock() is waiting for existing readers
  // to leave, but that new readers may still acquire shared access.
  // This is only used in reader priority mode. New readers during
  // drain must be inline. The difference between this and kHasU is that
  // kBegunE prevents kMayDefer from being set.
  static constexpr uint32_t kBegunE = 1 << 6;

  // At most one thread may have either exclusive or upgrade lock
  // ownership. Unlike exclusive mode, ownership of the lock in upgrade
  // mode doesn't preclude other threads holding the lock in shared mode.
  // boost's concept for this doesn't explicitly say whether new shared
  // locks can be acquired once lock_upgrade has succeeded, but doesn't
  // list that as disallowed. RWSpinLock disallows new read locks after
  // lock_upgrade has been acquired, but the boost implementation doesn't.
  // We choose the latter.
  static constexpr uint32_t kHasU = 1 << 5;

  // There are three states that we consider to be "solo", in that they
  // cannot coexist with other solo states. These are kHasE, kBegunE,
  // and kHasU. Note that S doesn't conflict with any of these, because
  // setting kHasE is only one of the two steps needed to actually
  // acquire the lock in exclusive mode (the other is draining the existing
  // S holders).
  static constexpr uint32_t kHasSolo = kHasE | kBegunE | kHasU;

  // Once a thread sets kHasE it needs to wait for the current readers
  // to exit the lock. We give this a separate wait identity from the
  // waiting to set kHasE so that we can perform partial wakeups (wake
  // one instead of wake all).
  static constexpr uint32_t kWaitingNotS = 1 << 4;

  // When waking writers we can either wake them all, in which case we
  // can clear kWaitingE, or we can call futexWake(1). futexWake tells
  // us if anybody woke up, but even if we detect that nobody woke up we
  // can't clear the bit after the fact without issuing another wakeup.
  // To avoid thundering herds when there are lots of pending lock()
  // without needing to call futexWake twice when there is only one
  // waiter, kWaitingE actually encodes if we have observed multiple
  // concurrent waiters. Tricky: ABA issues on futexWait mean that when
  // we see kWaitingESingle we can't assume that there is only one.
  static constexpr uint32_t kWaitingESingle = 1 << 2;
  static constexpr uint32_t kWaitingEMultiple = 1 << 3;
  static constexpr uint32_t kWaitingE = kWaitingESingle | kWaitingEMultiple;

  // kWaitingU is essentially a 1 bit saturating counter. It always
  // requires a wakeAll.
  static constexpr uint32_t kWaitingU = 1 << 1;

  // All blocked lock_shared() should be awoken, so it is correct (not
  // suboptimal) to wakeAll if there are any shared readers.
  static constexpr uint32_t kWaitingS = 1 << 0;

  // kWaitingAny is a mask of all of the bits that record the state of
  // threads, rather than the state of the lock. It is convenient to be
  // able to mask them off during asserts.
  static constexpr uint32_t kWaitingAny =
      kWaitingNotS | kWaitingE | kWaitingU | kWaitingS;
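
  // Putting the constants above together, the 32-bit state_ word is laid
  // out like this (a summary derived from the definitions above):
  //
  //   bits 31..11  shared (S) count, in units of kIncrHasS
  //   bit  10      kAnnotationCreated
  //   bit  9       kMayDefer
  //   bit  8       kPrevDefer
  //   bit  7       kHasE
  //   bit  6       kBegunE
  //   bit  5       kHasU
  //   bit  4       kWaitingNotS
  //   bit  3       kWaitingEMultiple
  //   bit  2       kWaitingESingle
  //   bit  1       kWaitingU
  //   bit  0       kWaitingS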

  // The reader count at which a reader will attempt to use the lock
  // in deferred mode. If this value is 2, then the second concurrent
  // reader will set kMayDefer and use deferredReaders[]. kMayDefer is
  // cleared during exclusive access, so this threshold must be reached
  // each time a lock is held in exclusive mode.
  static constexpr uint32_t kNumSharedToStartDeferring = 2;

  // The typical number of spins that a thread will wait for a state
  // transition. There is no bound on the number of threads that can wait
  // for a writer, so we are pretty conservative here to limit the chance
  // that we are starving the writer of CPU. Each spin is 6 or 7 nanos,
  // almost all of which is in the pause instruction.
  static constexpr uint32_t kMaxSpinCount = !BlockImmediately ? 1000 : 2;

  // The maximum number of soft yields before falling back to futex.
  // If the preemption heuristic is activated we will fall back before
  // this. A soft yield takes ~900 nanos (two sched_yield plus a call
  // to getrusage, with checks of the goal at each step). Soft yields
  // aren't compatible with deterministic execution under test (unlike
  // futexWaitUntil, which has a capricious but deterministic back end).
  static constexpr uint32_t kMaxSoftYieldCount = !BlockImmediately ? 1000 : 0;

  // If AccessSpreader assigns indexes from 0..k*n-1 on a system where some
  // level of the memory hierarchy is symmetrically divided into k pieces
  // (NUMA nodes, last-level caches, L1 caches, ...), then slot indexes
  // that are the same after integer division by k share that resource.
  // Our strategy for deferred readers is to probe up to numSlots/4 slots,
  // using the full granularity of AccessSpreader for the start slot
  // and then search outward. We can use AccessSpreader::current(n)
  // without managing our own spreader if kMaxDeferredReaders <=
  // AccessSpreader::kMaxCpus, which is currently 128.
  //
  // Our 2-socket E5-2660 machines have 8 L1 caches on each chip,
  // with 64 byte cache lines. That means we need 64*16 bytes of
  // deferredReaders[] to give each L1 its own playground. On x86_64
  // each DeferredReaderSlot is 8 bytes, so we need kMaxDeferredReaders
  // * kDeferredSeparationFactor >= 64 * 16 / 8 == 128. If
  // kDeferredSearchDistance * kDeferredSeparationFactor <=
  // 64 / 8 then we will search only within a single cache line, which
  // guarantees we won't have inter-L1 contention. We give ourselves
  // a factor of 2 on the core count, which should hold us for a couple
  // processor generations. deferredReaders[] is 2048 bytes currently.
 public:
  static constexpr uint32_t kMaxDeferredReaders = 64;
  static constexpr uint32_t kDeferredSearchDistance = 2;
  static constexpr uint32_t kDeferredSeparationFactor = 4;

 private:
  static_assert(
      !(kMaxDeferredReaders & (kMaxDeferredReaders - 1)),
      "kMaxDeferredReaders must be a power of 2");
  static_assert(
      !(kDeferredSearchDistance & (kDeferredSearchDistance - 1)),
      "kDeferredSearchDistance must be a power of 2");
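
  // Checking the arithmetic from the comment above against these
  // constants: deferredReaders[] has kMaxDeferredReaders *
  // kDeferredSeparationFactor == 64 * 4 == 256 slots of 8 bytes each on
  // x86_64, i.e. 2048 bytes; 256 >= 128 satisfies the one-cache-line-per-L1
  // requirement, and kDeferredSearchDistance * kDeferredSeparationFactor
  // == 2 * 4 == 8 == 64 / 8 keeps each probe within a single cache line.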

  // The number of deferred locks that can be simultaneously acquired
  // by a thread via the token-less methods without performing any heap
  // allocations. Each of these costs 3 pointers (24 bytes, probably)
  // per thread. There's not much point in making this larger than
  // kDeferredSearchDistance.
  static constexpr uint32_t kTokenStackTLSCapacity = 2;

  // We need to make sure that if there is a lock_shared()
  // and lock_shared(token) followed by unlock_shared() and
  // unlock_shared(token), the token-less unlock doesn't null
  // out deferredReaders[token.slot_]. If we allowed that, then
  // unlock_shared(token) wouldn't be able to assume that its lock
  // had been inlined by applyDeferredReaders when it finds that
  // deferredReaders[token.slot_] no longer points to this. We accomplish
  // this by stealing bit 0 from the pointer to record that the slot's
  // element has no token, hence our use of uintptr_t in deferredReaders[].
  static constexpr uintptr_t kTokenless = 0x1;
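
  // So for a SharedMutexImpl at address p, a slot holding a tokenful
  // deferred lock stores p itself, a slot holding a token-less deferred
  // lock stores p | kTokenless, and both satisfy slotValueIsThis()
  // (defined below), which masks bit 0 back off before comparing.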

  // This is the starting location for Token-less unlock_shared().
  static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastTokenlessSlot;

  // Last deferred reader slot used.
  static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastDeferredReaderSlot;

  // Only indexes divisible by kDeferredSeparationFactor are used.
  // If any of those elements points to a SharedMutexImpl, then it
  // should be considered that there is a shared lock on that instance.
  // See kTokenless.
 public:
  typedef Atom<uintptr_t> DeferredReaderSlot;

 private:
  alignas(hardware_destructive_interference_size) static DeferredReaderSlot
      deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor];

  // Performs an exclusive lock, waiting for state_ & waitMask to be
  // zero first
  template <class WaitContext>
  bool lockExclusiveImpl(uint32_t preconditionGoalMask, WaitContext& ctx) {
    uint32_t state = state_.load(std::memory_order_acquire);
    if (LIKELY(
            (state & (preconditionGoalMask | kMayDefer | kHasS)) == 0 &&
            state_.compare_exchange_strong(state, (state | kHasE) & ~kHasU))) {
      return true;
    } else {
      return lockExclusiveImpl(state, preconditionGoalMask, ctx);
    }
  }

  template <class WaitContext>
  bool lockExclusiveImpl(
      uint32_t& state,
      uint32_t preconditionGoalMask,
      WaitContext& ctx) {
    while (true) {
      if (UNLIKELY((state & preconditionGoalMask) != 0) &&
          !waitForZeroBits(state, preconditionGoalMask, kWaitingE, ctx) &&
          ctx.canTimeOut()) {
        return false;
      }

      uint32_t after = (state & kMayDefer) == 0 ? 0 : kPrevDefer;
      if (!kReaderPriority || (state & (kMayDefer | kHasS)) == 0) {
        // Block readers immediately, either because we are in write
        // priority mode or because we can acquire the lock in one
        // step. Note that if state has kHasU, then we are doing an
        // unlock_upgrade_and_lock() and we should clear it (reader
        // priority branch also does this).
        after |= (state | kHasE) & ~(kHasU | kMayDefer);
      } else {
        after |= (state | kBegunE) & ~(kHasU | kMayDefer);
      }
      if (state_.compare_exchange_strong(state, after)) {
        auto before = state;
        state = after;

        // If we set kHasE (writer priority) then no new readers can
        // arrive. If we set kBegunE then they can still enter, but
        // they must be inline. Either way we need to either spin on
        // deferredReaders[] slots, or inline them so that we can wait on
        // kHasS to zero itself. deferredReaders[] is pointers, which on
        // x86_64 are bigger than futex() can handle, so we inline the
        // deferred locks instead of trying to futexWait on each slot.
        // Readers are responsible for rechecking state_ after recording
        // a deferred read to avoid atomicity problems between the state_
        // CAS and applyDeferredReader's reads of deferredReaders[].
        if (UNLIKELY((before & kMayDefer) != 0)) {
          applyDeferredReaders(state, ctx);
        }
        while (true) {
          assert((state & (kHasE | kBegunE)) != 0 && (state & kHasU) == 0);
          if (UNLIKELY((state & kHasS) != 0) &&
              !waitForZeroBits(state, kHasS, kWaitingNotS, ctx) &&
              ctx.canTimeOut()) {
            // Ugh. We blocked new readers and other writers for a while,
            // but were unable to complete. Move on. On the plus side
            // we can clear kWaitingNotS because nobody else can piggyback
            // on it.
            state = (state_ &= ~(kPrevDefer | kHasE | kBegunE | kWaitingNotS));
            wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
            return false;
          }

          if (kReaderPriority && (state & kHasE) == 0) {
            assert((state & kBegunE) != 0);
            if (!state_.compare_exchange_strong(
                    state, (state & ~kBegunE) | kHasE)) {
              continue;
            }
          }

          return true;
        }
      }
    }
  }

  template <class WaitContext>
  bool waitForZeroBits(
      uint32_t& state,
      uint32_t goal,
      uint32_t waitMask,
      WaitContext& ctx) {
    uint32_t spinCount = 0;
    while (true) {
      state = state_.load(std::memory_order_acquire);
      if ((state & goal) == 0) {
        return true;
      }
      asm_volatile_pause();
      ++spinCount;
      if (UNLIKELY(spinCount >= kMaxSpinCount)) {
        return ctx.canBlock() &&
            yieldWaitForZeroBits(state, goal, waitMask, ctx);
      }
    }
  }

  template <class WaitContext>
  bool yieldWaitForZeroBits(
      uint32_t& state,
      uint32_t goal,
      uint32_t waitMask,
      WaitContext& ctx) {
#ifdef RUSAGE_THREAD
    struct rusage usage;
    std::memset(&usage, 0, sizeof(usage));
    long before = -1;
#endif
    for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
         ++yieldCount) {
      for (int softState = 0; softState < 3; ++softState) {
        if (softState < 2) {
          std::this_thread::yield();
        } else {
#ifdef RUSAGE_THREAD
          getrusage(RUSAGE_THREAD, &usage);
#endif
        }
        if (((state = state_.load(std::memory_order_acquire)) & goal) == 0) {
          return true;
        }
        if (ctx.shouldTimeOut()) {
          return false;
        }
      }
#ifdef RUSAGE_THREAD
      if (before >= 0 && usage.ru_nivcsw >= before + 2) {
        // One involuntary csw might just be occasional background work,
        // but if we get two in a row then we guess that there is someone
        // else who can profitably use this CPU. Fall back to futex.
        break;
      }
      before = usage.ru_nivcsw;
#endif
    }
    return futexWaitForZeroBits(state, goal, waitMask, ctx);
  }

  template <class WaitContext>
  bool futexWaitForZeroBits(
      uint32_t& state,
      uint32_t goal,
      uint32_t waitMask,
      WaitContext& ctx) {
    assert(
        waitMask == kWaitingNotS || waitMask == kWaitingE ||
        waitMask == kWaitingU || waitMask == kWaitingS);

    while (true) {
      state = state_.load(std::memory_order_acquire);
      if ((state & goal) == 0) {
        return true;
      }

      auto after = state;
      if (waitMask == kWaitingE) {
        if ((state & kWaitingESingle) != 0) {
          after |= kWaitingEMultiple;
        } else {
          after |= kWaitingESingle;
        }
      } else {
        after |= waitMask;
      }

      // CAS is better than atomic |= here, because it lets us avoid
      // setting the wait flag when the goal is concurrently achieved
      if (after != state && !state_.compare_exchange_strong(state, after)) {
        continue;
      }

      if (!ctx.doWait(state_, after, waitMask)) {
        // timed out
        return false;
      }
    }
  }

  // Wakes up waiters registered in state_ as appropriate, clearing the
  // awaiting bits for anybody that was awoken. Tries to perform direct
  // single wakeup of an exclusive waiter if appropriate
  void wakeRegisteredWaiters(uint32_t& state, uint32_t wakeMask) {
    if (UNLIKELY((state & wakeMask) != 0)) {
      wakeRegisteredWaitersImpl(state, wakeMask);
    }
  }

  void wakeRegisteredWaitersImpl(uint32_t& state, uint32_t wakeMask) {
    // If there are multiple lock() pending only one of them will actually
    // get to wake up, so issuing futexWakeAll will make a thundering herd.
    // There's nothing stopping us from issuing futexWake(1) instead,
    // so long as the wait bits are still an accurate reflection of
    // the waiters. If we notice (via futexWake's return value) that
    // nobody woke up then we can try again with the normal wake-all path.
    // Note that we can't just clear the bits at that point; we need to
    // clear the bits and then issue another wakeup.
    //
    // It is possible that we wake an E waiter but an outside S grabs the
    // lock instead, at which point we should wake pending U and S waiters.
    // Rather than tracking state to make the failing E regenerate the
    // wakeup, we just disable the optimization in the case that there
    // are waiting U or S that we are eligible to wake.
    if ((wakeMask & kWaitingE) == kWaitingE &&
        (state & wakeMask) == kWaitingE &&
        detail::futexWake(&state_, 1, kWaitingE) > 0) {
      // somebody woke up, so leave state_ as is and clear it later
      return;
    }

    if ((state & wakeMask) != 0) {
      auto prev = state_.fetch_and(~wakeMask);
      if ((prev & wakeMask) != 0) {
        futexWakeAll(wakeMask);
      }
      state = prev & ~wakeMask;
    }
  }

  void futexWakeAll(uint32_t wakeMask) {
    detail::futexWake(&state_, std::numeric_limits<int>::max(), wakeMask);
  }

  DeferredReaderSlot* deferredReader(uint32_t slot) {
    return &deferredReaders[slot * kDeferredSeparationFactor];
  }

  uintptr_t tokenfulSlotValue() {
    return reinterpret_cast<uintptr_t>(this);
  }

  uintptr_t tokenlessSlotValue() {
    return tokenfulSlotValue() | kTokenless;
  }

  bool slotValueIsThis(uintptr_t slotValue) {
    return (slotValue & ~kTokenless) == tokenfulSlotValue();
  }

  // Clears any deferredReaders[] that point to this, adjusting the inline
  // shared lock count to compensate. Does some spinning and yielding
  // to avoid the work. Always finishes the application, even if ctx
  // times out.
  template <class WaitContext>
  void applyDeferredReaders(uint32_t& state, WaitContext& ctx) {
    uint32_t slot = 0;

    uint32_t spinCount = 0;
    while (true) {
      while (!slotValueIsThis(
          deferredReader(slot)->load(std::memory_order_acquire))) {
        if (++slot == kMaxDeferredReaders) {
          return;
        }
      }
      asm_volatile_pause();
      if (UNLIKELY(++spinCount >= kMaxSpinCount)) {
        applyDeferredReaders(state, ctx, slot);
        return;
      }
    }
  }

  template <class WaitContext>
  void applyDeferredReaders(uint32_t& state, WaitContext& ctx, uint32_t slot) {
#ifdef RUSAGE_THREAD
    struct rusage usage;
    std::memset(&usage, 0, sizeof(usage));
    long before = -1;
#endif
    for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
         ++yieldCount) {
      for (int softState = 0; softState < 3; ++softState) {
        if (softState < 2) {
          std::this_thread::yield();
        } else {
#ifdef RUSAGE_THREAD
          getrusage(RUSAGE_THREAD, &usage);
#endif
        }
        while (!slotValueIsThis(
            deferredReader(slot)->load(std::memory_order_acquire))) {
          if (++slot == kMaxDeferredReaders) {
            return;
          }
        }
        if (ctx.shouldTimeOut()) {
          // finish applying immediately on timeout
          break;
        }
      }
#ifdef RUSAGE_THREAD
      if (before >= 0 && usage.ru_nivcsw >= before + 2) {
        // heuristic says run queue is not empty
        break;
      }
      before = usage.ru_nivcsw;
#endif
    }

    uint32_t movedSlotCount = 0;
    for (; slot < kMaxDeferredReaders; ++slot) {
      auto slotPtr = deferredReader(slot);
      auto slotValue = slotPtr->load(std::memory_order_acquire);
      if (slotValueIsThis(slotValue) &&
          slotPtr->compare_exchange_strong(slotValue, 0)) {
        ++movedSlotCount;
      }
    }

    if (movedSlotCount > 0) {
      state = (state_ += movedSlotCount * kIncrHasS);
    }
    assert((state & (kHasE | kBegunE)) != 0);

    // if state + kIncrHasS overflows (off the end of state) then either
    // we have 2^(32-11) readers (almost certainly an application bug)
    // or we had an underflow (also a bug)
    assert(state < state + kIncrHasS);
  }

  // It is straightforward to make a token-less lock_shared() and
  // unlock_shared() either by making the token-less version always use
  // INLINE_SHARED mode or by removing the token version. Supporting
  // deferred operation for both types is trickier than it appears, because
  // the purpose of the token is so that unlock_shared doesn't have to
  // look in other slots for its deferred lock. Token-less unlock_shared
  // might place a deferred lock in one place and then release a different
  // slot that was originally used by the token-ful version. If this were
  // important we could solve the problem by differentiating the deferred
  // locks so that cross-variety release wouldn't occur. The best way
  // is probably to steal a bit from the pointer, making deferredReaders[]
  // an array of Atom<uintptr_t>.

  template <class WaitContext>
  bool lockSharedImpl(Token* token, WaitContext& ctx) {
    uint32_t state = state_.load(std::memory_order_relaxed);
    if ((state & (kHasS | kMayDefer | kHasE)) == 0 &&
        state_.compare_exchange_strong(state, state + kIncrHasS)) {
      if (token != nullptr) {
        token->type_ = Token::Type::INLINE_SHARED;
      }
      return true;
    }
    return lockSharedImpl(state, token, ctx);
  }

  template <class WaitContext>
  bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx);

  // Updates the state in/out argument as if the locks were made inline,
  // but does not update state_
  void cleanupTokenlessSharedDeferred(uint32_t& state) {
    for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
      auto slotPtr = deferredReader(i);
      auto slotValue = slotPtr->load(std::memory_order_relaxed);
      if (slotValue == tokenlessSlotValue()) {
        slotPtr->store(0, std::memory_order_relaxed);
        state += kIncrHasS;
        if ((state & kHasS) == 0) {
          break;
        }
      }
    }
  }

  bool tryUnlockTokenlessSharedDeferred();

  bool tryUnlockSharedDeferred(uint32_t slot) {
    assert(slot < kMaxDeferredReaders);
    auto slotValue = tokenfulSlotValue();
    return deferredReader(slot)->compare_exchange_strong(slotValue, 0);
  }

  uint32_t unlockSharedInline() {
    uint32_t state = (state_ -= kIncrHasS);
    assert(
        (state & (kHasE | kBegunE | kMayDefer)) != 0 ||
        state < state + kIncrHasS);
    if ((state & kHasS) == 0) {
      // Only the second half of lock() can be blocked by a non-zero
      // reader count, so that's the only thing we need to wake
      wakeRegisteredWaiters(state, kWaitingNotS);
    }
    return state;
  }

  template <class WaitContext>
  bool lockUpgradeImpl(WaitContext& ctx) {
    uint32_t state;
    do {
      if (!waitForZeroBits(state, kHasSolo, kWaitingU, ctx)) {
        return false;
      }
    } while (!state_.compare_exchange_strong(state, state | kHasU));
    return true;
  }

 public:
  class ReadHolder {
    ReadHolder() : lock_(nullptr) {}

   public:
    explicit ReadHolder(const SharedMutexImpl* lock)
        : lock_(const_cast<SharedMutexImpl*>(lock)) {
      if (lock_) {
        lock_->lock_shared(token_);
      }
    }

    explicit ReadHolder(const SharedMutexImpl& lock)
        : lock_(const_cast<SharedMutexImpl*>(&lock)) {
      lock_->lock_shared(token_);
    }

    ReadHolder(ReadHolder&& rhs) noexcept
        : lock_(rhs.lock_), token_(rhs.token_) {
      rhs.lock_ = nullptr;
    }

    // Downgrade from upgrade mode
    explicit ReadHolder(UpgradeHolder&& upgraded) : lock_(upgraded.lock_) {
      assert(upgraded.lock_ != nullptr);
      upgraded.lock_ = nullptr;
      lock_->unlock_upgrade_and_lock_shared(token_);
    }

    // Downgrade from exclusive mode
    explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
      assert(writer.lock_ != nullptr);
      writer.lock_ = nullptr;
      lock_->unlock_and_lock_shared(token_);
    }

    ReadHolder& operator=(ReadHolder&& rhs) noexcept {
      std::swap(lock_, rhs.lock_);
      std::swap(token_, rhs.token_);
      return *this;
    }

    ReadHolder(const ReadHolder& rhs) = delete;
    ReadHolder& operator=(const ReadHolder& rhs) = delete;

    ~ReadHolder() {
      unlock();
    }

    void unlock() {
      if (lock_) {
        lock_->unlock_shared(token_);
        lock_ = nullptr;
      }
    }

   private:
    friend class UpgradeHolder;
    friend class WriteHolder;
    SharedMutexImpl* lock_;
    SharedMutexToken token_;
  };

  class UpgradeHolder {
    UpgradeHolder() : lock_(nullptr) {}

   public:
    explicit UpgradeHolder(SharedMutexImpl* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock_upgrade();
      }
    }

    explicit UpgradeHolder(SharedMutexImpl& lock) : lock_(&lock) {
      lock_->lock_upgrade();
    }

    // Downgrade from exclusive mode
    explicit UpgradeHolder(WriteHolder&& writer) : lock_(writer.lock_) {
      assert(writer.lock_ != nullptr);
      writer.lock_ = nullptr;
      lock_->unlock_and_lock_upgrade();
    }

    UpgradeHolder(UpgradeHolder&& rhs) noexcept : lock_(rhs.lock_) {
      rhs.lock_ = nullptr;
    }

    UpgradeHolder& operator=(UpgradeHolder&& rhs) noexcept {
      std::swap(lock_, rhs.lock_);
      return *this;
    }

    UpgradeHolder(const UpgradeHolder& rhs) = delete;
    UpgradeHolder& operator=(const UpgradeHolder& rhs) = delete;

    ~UpgradeHolder() {
      unlock();
    }

    void unlock() {
      if (lock_) {
        lock_->unlock_upgrade();
        lock_ = nullptr;
      }
    }

   private:
    friend class WriteHolder;
    friend class ReadHolder;
    SharedMutexImpl* lock_;
  };

  class WriteHolder {
    WriteHolder() : lock_(nullptr) {}

   public:
    explicit WriteHolder(SharedMutexImpl* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock();
      }
    }

    explicit WriteHolder(SharedMutexImpl& lock) : lock_(&lock) {
      lock_->lock();
    }

    // Promotion from upgrade mode
    explicit WriteHolder(UpgradeHolder&& upgrade) : lock_(upgrade.lock_) {
      assert(upgrade.lock_ != nullptr);
      upgrade.lock_ = nullptr;
      lock_->unlock_upgrade_and_lock();
    }

    // README:
    //
    // It is intended that WriteHolder(ReadHolder&& rhs) does not exist.
    //
    // Shared locks (read) cannot safely upgrade to unique locks (write).
    // That upgrade path is a well-known recipe for deadlock, so we
    // explicitly disallow it.
    //
    // If you need to do a conditional mutation, you have a few options
    // (see the sketch below this list):
    // 1. Check the condition under a shared lock and release it.
    //    Then maybe check the condition again under a unique lock and
    //    maybe do the mutation.
    // 2. Check the condition once under an upgradeable lock.
    //    Then maybe upgrade the lock to a unique lock and do the mutation.
    // 3. Check the condition and maybe perform the mutation under a unique
    //    lock.
    //
    // Relevant upgradeable lock notes:
    // * At most one upgradeable lock can be held at a time for a given
    //   shared mutex, just like a unique lock.
    // * An upgradeable lock may be held concurrently with any number of
    //   shared locks.
    // * An upgradeable lock may be upgraded atomically to a unique lock.
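    //
    // Option 2 might look like this (an illustrative sketch; needsMutation
    // and mutate are hypothetical helpers, and mu is a folly::SharedMutex):
    //
    //   folly::SharedMutex::UpgradeHolder uh(&mu);
    //   if (needsMutation(data)) {
    //     folly::SharedMutex::WriteHolder wh(std::move(uh)); // promotion
    //     mutate(data);
    //   } // each holder releases in its destructor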
    WriteHolder(WriteHolder&& rhs) noexcept : lock_(rhs.lock_) {
      rhs.lock_ = nullptr;
    }

    WriteHolder& operator=(WriteHolder&& rhs) noexcept {
      std::swap(lock_, rhs.lock_);
      return *this;
    }

    WriteHolder(const WriteHolder& rhs) = delete;
    WriteHolder& operator=(const WriteHolder& rhs) = delete;

    ~WriteHolder() {
      unlock();
    }

    void unlock() {
      if (lock_) {
        lock_->unlock();
        lock_ = nullptr;
      }
    }

   private:
    friend class ReadHolder;
    friend class UpgradeHolder;
    SharedMutexImpl* lock_;
  };

  // Adapters for Synchronized<>
  friend void acquireRead(SharedMutexImpl& lock) {
    lock.lock_shared();
  }
  friend void acquireReadWrite(SharedMutexImpl& lock) {
    lock.lock();
  }
  friend void releaseRead(SharedMutexImpl& lock) {
    lock.unlock_shared();
  }
  friend void releaseReadWrite(SharedMutexImpl& lock) {
    lock.unlock();
  }
  friend bool acquireRead(SharedMutexImpl& lock, unsigned int ms) {
    return lock.try_lock_shared_for(std::chrono::milliseconds(ms));
  }
  friend bool acquireReadWrite(SharedMutexImpl& lock, unsigned int ms) {
    return lock.try_lock_for(std::chrono::milliseconds(ms));
  }
};

typedef SharedMutexImpl<true> SharedMutexReadPriority;
typedef SharedMutexImpl<false> SharedMutexWritePriority;
typedef SharedMutexWritePriority SharedMutex;
typedef SharedMutexImpl<false, void, std::atomic, false, false>
    SharedMutexSuppressTSAN;
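// Example usage of the aliases and holders (a sketch only; `gConfigLock`,
// `gConfig`, `ConfigMap`, `readOne`, and `replaceAll` are hypothetical
// names, not part of folly):
//
//   using ConfigMap = std::map<std::string, std::string>;
//   folly::SharedMutex gConfigLock;
//   ConfigMap gConfig;
//
//   std::string readOne(const std::string& key) {
//     folly::SharedMutex::ReadHolder guard(&gConfigLock); // shared
//     return gConfig.at(key);
//   }
//
//   void replaceAll(ConfigMap next) {
//     folly::SharedMutex::WriteHolder guard(&gConfigLock); // exclusive
//     gConfig = std::move(next);
//   }
//
// The acquireRead/acquireReadWrite friends above exist so that
// folly::Synchronized<T, SharedMutex> can drive this lock as well.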
// Prevent the compiler from instantiating these in other translation units.
// They are instantiated once in SharedMutex.cpp.
extern template class SharedMutexImpl<true>;
extern template class SharedMutexImpl<false>;

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
alignas(hardware_destructive_interference_size) typename SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::DeferredReaderSlot
    SharedMutexImpl<
        ReaderPriority,
        Tag_,
        Atom,
        BlockImmediately,
        AnnotateForThreadSanitizer>::deferredReaders
        [kMaxDeferredReaders * kDeferredSeparationFactor] = {};

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
FOLLY_SHAREDMUTEX_TLS uint32_t SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::tls_lastTokenlessSlot = 0;

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
FOLLY_SHAREDMUTEX_TLS uint32_t SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::tls_lastDeferredReaderSlot = 0;

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
bool SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::tryUnlockTokenlessSharedDeferred() {
  auto bestSlot = tls_lastTokenlessSlot;
  for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
    auto slotPtr = deferredReader(bestSlot ^ i);
    auto slotValue = slotPtr->load(std::memory_order_relaxed);
    if (slotValue == tokenlessSlotValue() &&
        slotPtr->compare_exchange_strong(slotValue, 0)) {
      tls_lastTokenlessSlot = bestSlot ^ i;
      return true;
    }
  }
  return false;
}
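// The deferred-reader path implemented below spreads shared acquisitions
// across cache-line-separated slots so that readers do not contend on
// state_. A minimal sketch of the underlying technique (hypothetical type,
// not folly's API; assumes a single writer and omits tokens, timeouts, and
// the inline fallback):
//
//   struct StripedReaderSketch {
//     struct alignas(128) Slot {
//       std::atomic<uintptr_t> v{0};
//     };
//     Slot slots[64];                  // cf. deferredReaders
//     std::atomic<bool> writer{false}; // cf. kHasE
//
//     bool tryLockShared(uint32_t slot) {
//       uintptr_t expected = 0;
//       if (!slots[slot].v.compare_exchange_strong(expected, 1)) {
//         return false; // slot occupied; caller probes another slot
//       }
//       if (writer.load()) {      // publish, then re-verify
//         slots[slot].v.store(0); // a writer raced in; undo the claim
//         return false;
//       }
//       return true;
//     }
//     void unlockShared(uint32_t slot) {
//       slots[slot].v.store(0);
//     }
//   };
//
// lockSharedImpl() follows the same publish-then-reverify shape: claim a
// slot with a CAS, re-load state_ with acquire ordering, and undo the
// claim if kMayDefer was cleared in the meantime. Slot probing uses
// (bestSlot ^ i), which enumerates every slot exactly once (XOR with a
// constant is a bijection over the power-of-two slot count), starting
// with the thread's preferred slot and spreading outward -- e.g. for
// bestSlot = 5 the first probes are 5, 4, 7, 6, 1, 0, ...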
template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
template <class WaitContext>
bool SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::
    lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
  while (true) {
    if (UNLIKELY((state & kHasE) != 0) &&
        !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
      return false;
    }

    uint32_t slot = tls_lastDeferredReaderSlot;
    uintptr_t slotValue = 1; // any non-zero value will do

    bool canAlreadyDefer = (state & kMayDefer) != 0;
    bool aboveDeferThreshold =
        (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS;
    bool drainInProgress = ReaderPriority && (state & kBegunE) != 0;
    if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) {
      /* Try using the most recent slot first. */
      slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
      if (slotValue != 0) {
        // starting point for our empty-slot search, can change after
        // calling waitForZeroBits
        uint32_t bestSlot =
            (uint32_t)folly::AccessSpreader<Atom>::current(kMaxDeferredReaders);

        // deferred readers are already enabled, or it is time to
        // enable them if we can find a slot
        for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
          slot = bestSlot ^ i;
          assert(slot < kMaxDeferredReaders);
          slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
          if (slotValue == 0) {
            // found empty slot
            tls_lastDeferredReaderSlot = slot;
            break;
          }
        }
      }
    }

    if (slotValue != 0) {
      // not yet deferred, or no empty slots
      if (state_.compare_exchange_strong(state, state + kIncrHasS)) {
        // successfully recorded the read lock inline
        if (token != nullptr) {
          token->type_ = Token::Type::INLINE_SHARED;
        }
        return true;
      }
      // state is updated, try again
      continue;
    }

    // record that deferred readers might be in use if necessary
    if ((state & kMayDefer) == 0) {
      if (!state_.compare_exchange_strong(state, state | kMayDefer)) {
        // keep going if CAS failed because somebody else set the bit
        // for us
        if ((state & (kHasE | kMayDefer)) != kMayDefer) {
          continue;
        }
      }
      // state = state | kMayDefer;
    }

    // try to use the slot
    bool gotSlot = deferredReader(slot)->compare_exchange_strong(
        slotValue,
        token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue());

    // If we got the slot, we need to verify that an exclusive lock
    // didn't happen since we last checked. If we didn't get the slot we
    // need to recheck state_ anyway to make sure we don't waste too much
    // work. It is also possible that since we checked state_ someone
    // has acquired and released the write lock, clearing kMayDefer.
    // Both cases are covered by looking for the readers-possible bit,
    // because it is off when the exclusive lock bit is set.
    state = state_.load(std::memory_order_acquire);

    if (!gotSlot) {
      continue;
    }

    if (token == nullptr) {
      tls_lastTokenlessSlot = slot;
    }

    if ((state & kMayDefer) != 0) {
      assert((state & kHasE) == 0);
      // success
      if (token != nullptr) {
        token->type_ = Token::Type::DEFERRED_SHARED;
        token->slot_ = (uint16_t)slot;
      }
      return true;
    }

    // release the slot before retrying
    if (token == nullptr) {
      // We can't rely on slot. Token-less slot values can be freed by
      // any unlock_shared(), so we need to do the full deferredReader
      // search during unlock. Unlike unlock_shared(), we can't trust
      // kPrevDefer here. This deferred lock isn't visible to lock()
      // (that's the whole reason we're undoing it), so there might have
      // subsequently been an unlock() and lock() with no intervening
      // transition to deferred mode.
      if (!tryUnlockTokenlessSharedDeferred()) {
        unlockSharedInline();
      }
    } else {
      if (!tryUnlockSharedDeferred(slot)) {
        unlockSharedInline();
      }
    }
    // We got here not because the lock was unavailable, but because we
    // lost a compare-and-swap. Try-lock is typically allowed to have
    // spurious failures, but there is no lock efficiency gain from
    // exploiting that freedom here.
  }
}

} // namespace folly