// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_
#define BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_

#include <algorithm>
#include <atomic>

#include "base/allocator/partition_allocator/yield_processor.h"
#include "base/base_export.h"
#include "base/compiler_specific.h"
#include "base/thread_annotations.h"
#include "build/build_config.h"

#if defined(OS_WIN)
#include <windows.h>
#endif

#if defined(OS_LINUX) || defined(OS_CHROMEOS) || defined(OS_ANDROID)
#define PA_HAS_LINUX_KERNEL
#endif

#if defined(PA_HAS_LINUX_KERNEL) || defined(OS_WIN)
#define PA_HAS_SPINNING_MUTEX
#endif

#if defined(PA_HAS_SPINNING_MUTEX)
namespace base {
namespace internal {

// Simple spinning lock. It will spin in user space a set number of times
// before going into the kernel to sleep.
//
// This is intended to give "the best of both worlds" between a SpinLock and
// base::Lock:
// - SpinLock: Inlined fast path, no external function calls, just
//   compare-and-swap. Short waits do not go into the kernel. Good behavior in
//   low-contention cases.
// - base::Lock: Good behavior in case of contention.
//
// We don't rely on base::Lock, even though it could be made to spin (by
// calling Try() in a loop), as high-level benchmarks show its performance to
// be below that of a custom spinlock. Instead, this implements a simple
// non-recursive mutex on top of the futex() syscall on Linux, and SRWLock on
// Windows. The main difference between this and a libc implementation is that
// it only supports the simplest path: private (to a process), non-recursive
// mutexes with no priority inheritance and no timed waits.
//
// As a useful side-effect for the allocator, this code does not allocate:
// locks are small, with a constexpr constructor and no destructor.
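//
// Example usage, as a minimal sketch (in practice the only intended caller is
// the partition lock, not arbitrary client code):
//
//   SpinningMutex mutex;  // constexpr-constructible, no destructor.
//   mutex.Acquire();
//   // ... critical section; the mutex is non-recursive, so it must not be
//   // re-acquired here ...
//   mutex.Release();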
class LOCKABLE BASE_EXPORT SpinningMutex {
 public:
  inline constexpr SpinningMutex();
  ALWAYS_INLINE void Acquire() EXCLUSIVE_LOCK_FUNCTION();
  ALWAYS_INLINE void Release() UNLOCK_FUNCTION();
  ALWAYS_INLINE bool Try() EXCLUSIVE_TRYLOCK_FUNCTION(true);
  void AssertAcquired() const {}  // Not supported.

 private:
  void LockSlow();

  // Same as SpinLock, not scientifically calibrated. Consider lowering later,
  // as the slow path has better characteristics than SpinLock's.
  static constexpr int kSpinCount = 1000;

#if defined(PA_HAS_LINUX_KERNEL)
  void FutexWait();
  void FutexWake();

  static constexpr int kUnlocked = 0;
  static constexpr int kLockedUncontended = 1;
  static constexpr int kLockedContended = 2;

  std::atomic<int32_t> state_{kUnlocked};
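  // |FutexWait()| and |FutexWake()| are defined out-of-line. A minimal
  // sketch, assuming they are thin wrappers around the futex(2) syscall (the
  // real definitions live in the .cc file):
  //
  //   void SpinningMutex::FutexWait() {
  //     // Sleeps only if |state_| still equals |kLockedContended|; a
  //     // concurrent Release() may have changed it already, in which case
  //     // the kernel returns EAGAIN and LockSlow() simply retries.
  //     syscall(SYS_futex, &state_, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
  //             kLockedContended, nullptr, nullptr, 0);
  //   }
  //
  //   void SpinningMutex::FutexWake() {
  //     // Wakes up at most one waiter.
  //     syscall(SYS_futex, &state_, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1,
  //             nullptr, nullptr, 0);
  //   }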
#else
  SRWLOCK lock_ = SRWLOCK_INIT;
#endif
};

ALWAYS_INLINE void SpinningMutex::Acquire() {
  int tries = 0;
  int backoff = 1;
  // Busy-waiting is inlined, which is fine as long as we have few callers.
  // This is only used for the partition lock, so this is indeed the case.
  do {
    if (LIKELY(Try()))
      return;
    // Note: Per the Intel optimization manual
    // (https://software.intel.com/content/dam/develop/public/us/en/documents/64-ia-32-architectures-optimization-manual.pdf),
    // the "pause" instruction is more costly on Skylake Client than on
    // previous (and subsequent?) architectures, with a measured latency of
    // 141 cycles. This is not a big issue here, as we don't spin long enough
    // for it to become a problem: we spend at most ~141k cycles, that is
    // ~47us at 3GHz, in "pause".
    //
    // Also, loop several times here, following the guidelines in section
    // 2.3.4 of the manual, "Pause latency in Skylake Client
    // Microarchitecture".
    for (int yields = 0; yields < backoff; yields++) {
      YIELD_PROCESSOR;
      tries++;
    }
    // Exponential backoff, capped so that a single inner loop does not burn
    // more than |kMaxBackoff| "pause" instructions.
    constexpr int kMaxBackoff = 64;
    backoff = std::min(kMaxBackoff, backoff << 1);
  } while (tries < kSpinCount);

  LockSlow();
}

inline constexpr SpinningMutex::SpinningMutex() = default;

#if defined(PA_HAS_LINUX_KERNEL)

ALWAYS_INLINE bool SpinningMutex::Try() {
  int expected = kUnlocked;
  // Cheap check first: only attempt the compare-and-swap when the lock looks
  // free, as a relaxed load is much cheaper than a failed CAS, which would
  // needlessly pull the cache line into exclusive mode.
  return (state_.load(std::memory_order_relaxed) == expected) &&
         state_.compare_exchange_strong(expected, kLockedUncontended,
                                        std::memory_order_acquire,
                                        std::memory_order_relaxed);
}

ALWAYS_INLINE void SpinningMutex::Release() {
  if (UNLIKELY(state_.exchange(kUnlocked, std::memory_order_release) ==
               kLockedContended)) {
    // |kLockedContended|: there is a waiter to wake up.
    //
    // Here there is a window where the lock is unlocked, since we just set it
    // to |kUnlocked| above, meaning that another thread can grab the lock
    // in-between now and |FutexWake()| waking up a waiter. Aside from
    // potential fairness issues, this is not a problem, as the newly-awakened
    // thread will check that the lock is still free before taking it.
    //
    // There is a small pessimization here though: if we have a single waiter,
    // it will set the lock to |kLockedContended| when it wakes up, so when it
    // in turn releases the lock, it will needlessly call |FutexWake()| even
    // though there are no waiters left. This is supported by the kernel, and
    // is also what bionic (Android's libc) does.
    FutexWake();
  }
}

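// For reference, the out-of-line slow path is essentially the classic
// futex-based lock described in Ulrich Drepper's "Futexes Are Tricky". A
// minimal sketch, assuming the definition in the .cc file looks like this:
//
//   void SpinningMutex::LockSlow() {
//     // Mark the lock as contended before sleeping. If this thread is woken
//     // up but another one grabbed the lock in the meantime, go back to
//     // sleep, still advertising contention.
//     while (state_.exchange(kLockedContended, std::memory_order_acquire) !=
//            kUnlocked) {
//       FutexWait();
//     }
//   }
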
#else

ALWAYS_INLINE bool SpinningMutex::Try() {
  // |TryAcquireSRWLockExclusive()| returns a BOOLEAN, not a bool, hence the
  // |!!|.
  return !!::TryAcquireSRWLockExclusive(reinterpret_cast<PSRWLOCK>(&lock_));
}

ALWAYS_INLINE void SpinningMutex::Release() {
  ::ReleaseSRWLockExclusive(reinterpret_cast<PSRWLOCK>(&lock_));
}
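
// A minimal sketch of the out-of-line slow path on Windows, assuming it
// simply defers to the kernel-assisted wait of SRWLock:
//
//   void SpinningMutex::LockSlow() {
//     ::AcquireSRWLockExclusive(reinterpret_cast<PSRWLOCK>(&lock_));
//   }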

#endif

}  // namespace internal
}  // namespace base
#endif  // defined(PA_HAS_SPINNING_MUTEX)

#endif  // BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_