// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_
#define BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_

#include <algorithm>
#include <atomic>

#include "base/allocator/partition_allocator/yield_processor.h"
#include "base/base_export.h"
#include "base/compiler_specific.h"
#include "base/thread_annotations.h"
#include "build/build_config.h"

#if defined(OS_WIN)
#include <windows.h>
#endif

#if defined(OS_LINUX) || defined(OS_CHROMEOS) || defined(OS_ANDROID)
#define PA_HAS_LINUX_KERNEL
#endif

#if defined(PA_HAS_LINUX_KERNEL) || defined(OS_WIN)
#define PA_HAS_SPINNING_MUTEX
#endif

#if defined(PA_HAS_SPINNING_MUTEX)
namespace base {
namespace internal {

// Simple spinning lock. It will spin in user space a set number of times before
// going into the kernel to sleep.
//
// This is intended to give "the best of both worlds" between a SpinLock and
// base::Lock:
// - SpinLock: Inlined fast path, no external function calls, just
//   compare-and-swap. Short waits do not go into the kernel. Good behavior in
//   low contention cases.
// - base::Lock: Good behavior in case of contention.
//
// We don't rely on base::Lock which we could make spin (by calling Try() in a
// loop), as performance is below a custom spinlock as seen on high-level
// benchmarks. Instead this implements a simple non-recursive mutex on top of
// the futex() syscall on Linux, and SRWLock on Windows. The main difference
// between this and a libc implementation is that it only supports the simplest
// path: private (to a process), non-recursive mutexes with no priority
// inheritance, no timed waits.
//
// As an interesting side-effect to be used in the allocator, this code does not
// make any allocations, locks are small with a constexpr constructor and no
// destructor.
54 class LOCKABLE BASE_EXPORT SpinningMutex {
55 public:
56 inline constexpr SpinningMutex();
57 ALWAYS_INLINE void Acquire() EXCLUSIVE_LOCK_FUNCTION();
58 ALWAYS_INLINE void Release() UNLOCK_FUNCTION();
59 ALWAYS_INLINE bool Try() EXCLUSIVE_TRYLOCK_FUNCTION(true);
AssertAcquired()60 void AssertAcquired() const {} // Not supported.
61
62 private:
63 void LockSlow();
64
65 // Same as SpinLock, not scientifically calibrated. Consider lowering later,
66 // as the slow path has better characteristics than SpinLocks's.
67 static constexpr int kSpinCount = 1000;
68
69 #if defined(PA_HAS_LINUX_KERNEL)
70 void FutexWait();
71 void FutexWake();
72
73 static constexpr int kUnlocked = 0;
74 static constexpr int kLockedUncontended = 1;
75 static constexpr int kLockedContended = 2;
76
77 std::atomic<int32_t> state_{kUnlocked};
78 #else
79 SRWLOCK lock_ = SRWLOCK_INIT;
80 #endif
81 };
Acquire()83 ALWAYS_INLINE void SpinningMutex::Acquire() {
84 int tries = 0;
85 int backoff = 1;
86 // Busy-waiting is inlined, which is fine as long as we have few callers. This
87 // is only used for the partition lock, so this is the case.
88 do {
89 if (LIKELY(Try()))
90 return;
91 // Note: Per the intel optimization manual
92 // (https://software.intel.com/content/dam/develop/public/us/en/documents/64-ia-32-architectures-optimization-manual.pdf),
93 // the "pause" instruction is more costly on Skylake Client than on previous
94 // (and subsequent?) architectures. The latency is found to be 141 cycles
95 // there. This is not a big issue here as we don't spin long enough for this
96 // to become a problem, as we spend a maximum of ~141k cycles ~= 47us at
97 // 3GHz in "pause".
98 //
99 // Also, loop several times here, following the guidelines in section 2.3.4
100 // of the manual, "Pause latency in Skylake Client Microarchitecture".
101 for (int yields = 0; yields < backoff; yields++) {
102 YIELD_PROCESSOR;
103 tries++;
104 }
105 constexpr int kMaxBackoff = 64;
106 backoff = std::min(kMaxBackoff, backoff << 1);
107 } while (tries < kSpinCount);
108
109 LockSlow();
110 }
112 inline constexpr SpinningMutex::SpinningMutex() = default;

#if defined(PA_HAS_LINUX_KERNEL)

Try()116 ALWAYS_INLINE bool SpinningMutex::Try() {
117 int expected = kUnlocked;
118 return (state_.load(std::memory_order_relaxed) == expected) &&
119 state_.compare_exchange_strong(expected, kLockedUncontended,
120 std::memory_order_acquire,
121 std::memory_order_relaxed);
122 }
Release()124 ALWAYS_INLINE void SpinningMutex::Release() {
125 if (UNLIKELY(state_.exchange(kUnlocked, std::memory_order_release) ==
126 kLockedContended)) {
127 // |kLockedContended|: there is a waiter to wake up.
128 //
129 // Here there is a window where the lock is unlocked, since we just set it
130 // to |kUnlocked| above. Meaning that another thread can grab the lock
131 // in-between now and |FutexWake()| waking up a waiter. Aside from
132 // potentially fairness, this is not an issue, as the newly-awaken thread
133 // will check that the lock is still free.
134 //
135 // There is a small pessimization here though: if we have a single waiter,
136 // then when it wakes up, the lock will be set to |kLockedContended|, so
137 // when this waiter releases the lock, it will needlessly call
138 // |FutexWake()|, even though there are no waiters. This is supported by the
139 // kernel, and is what bionic (Android's libc) also does.
140 FutexWake();
141 }
142 }

#else

Try()146 ALWAYS_INLINE bool SpinningMutex::Try() {
147 return !!::TryAcquireSRWLockExclusive(reinterpret_cast<PSRWLOCK>(&lock_));
148 }
Release()150 ALWAYS_INLINE void SpinningMutex::Release() {
151 ::ReleaseSRWLockExclusive(reinterpret_cast<PSRWLOCK>(&lock_));
152 }

#endif

}  // namespace internal
}  // namespace base
#endif  // defined(PA_HAS_SPINNING_MUTEX)

#endif  // BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_