/*
 * Copyright (c) 2005 Jeffrey M. Hsu.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Jeffrey M. Hsu. and Matthew Dillon
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * The implementation is designed to avoid looping when compatible operations
 * are executed.
 *
 * To acquire a spinlock we first increment counta.  Then we check if counta
 * meets our requirements.  For an exclusive spinlock it must be 1; for a
 * shared spinlock it must either be 1 or the SPINLOCK_SHARED bit must be set.
 *
 * Shared spinlock failure case: Decrement the count, loop until we can
 * transition from 0 to SPINLOCK_SHARED|1, or until we find SPINLOCK_SHARED
 * is set and increment the count.
 *
 * Exclusive spinlock failure case: While maintaining the count, clear the
 * SPINLOCK_SHARED flag unconditionally.  Then use an atomic add to transfer
 * the count from the low bits to the high bits of counta.  Then loop until
 * all low bits are 0.  Once the low bits drop to 0 we can atomically
 * transfer the count back with atomic_cmpset_int() and return.
 */
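
/*
 * Illustration only (the authoritative bit definitions live in
 * sys/spinlock.h): with SPINLOCK_SHARED as a high flag bit and
 * SPINLOCK_EXCLWAIT counting exclusive waiters in the upper bits,
 * counta encodes the lock state roughly as follows:
 *
 *	0				lock is free
 *	1				held exclusively
 *	SPINLOCK_SHARED | n		held shared by n holders
 *	m * SPINLOCK_EXCLWAIT + n	n low-count holders, m exclusive
 *					waiters parked in the high bits
 *
 * An uncontested exclusive acquisition is thus a 0 -> 1 transition, and
 * an uncontested shared acquisition is either 0 -> SPINLOCK_SHARED|1 or
 * a simple increment of an already-shared count.
 */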
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#ifdef INVARIANTS
#include <sys/proc.h>
#endif
#include <sys/priv.h>
#include <machine/atomic.h>
#include <machine/cpu.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
#include <machine/clock.h>
#include <sys/spinlock.h>
#include <sys/spinlock2.h>
#include <sys/ktr.h>

struct spinlock pmap_spin = SPINLOCK_INITIALIZER(pmap_spin);

#ifdef SMP

struct indefinite_info {
	sysclock_t	base;
	int		secs;
};

/*
 * Kernel Trace
 */
#if !defined(KTR_SPIN_CONTENTION)
#define KTR_SPIN_CONTENTION	KTR_ALL
#endif
#define SPIN_STRING	"spin=%p type=%c"
#define SPIN_ARG_SIZE	(sizeof(void *) + sizeof(int))

KTR_INFO_MASTER(spin);
#if 0
KTR_INFO(KTR_SPIN_CONTENTION, spin, beg, 0, SPIN_STRING, SPIN_ARG_SIZE);
KTR_INFO(KTR_SPIN_CONTENTION, spin, end, 1, SPIN_STRING, SPIN_ARG_SIZE);
#endif

#define logspin(name, spin, type)			\
	KTR_LOG(spin_ ## name, spin, type)

#ifdef INVARIANTS
static int spin_lock_test_mode;
#endif

static int64_t spinlocks_contested1;
SYSCTL_QUAD(_debug, OID_AUTO, spinlocks_contested1, CTLFLAG_RD,
    &spinlocks_contested1, 0,
    "Spinlock contention count due to collisions with exclusive lock holders");

static int64_t spinlocks_contested2;
SYSCTL_QUAD(_debug, OID_AUTO, spinlocks_contested2, CTLFLAG_RD,
    &spinlocks_contested2, 0,
    "Serious spinlock contention count");

#ifdef DEBUG_LOCKS_LATENCY

static long spinlocks_add_latency;
SYSCTL_LONG(_debug, OID_AUTO, spinlocks_add_latency, CTLFLAG_RW,
    &spinlocks_add_latency, 0,
    "Add spinlock latency");

#endif

/*
 * We need a fairly large pool to avoid contention on large SMP systems,
 * particularly multi-chip systems.
 */
/*#define SPINLOCK_NUM_POOL	8101*/
#define SPINLOCK_NUM_POOL	8192
#define SPINLOCK_NUM_POOL_MASK	(SPINLOCK_NUM_POOL - 1)

static __cachealign struct {
	struct spinlock	spin;
	char		filler[32 - sizeof(struct spinlock)];
} pool_spinlocks[SPINLOCK_NUM_POOL];

static int spin_indefinite_check(struct spinlock *spin,
				 struct indefinite_info *info);

/*
 * We contested due to another exclusive lock holder.  We lose.
 *
 * We have to unwind the attempt: the critical-section and gd_spinlocks
 * counts were incremented on our behalf by the spin_trylock() inline
 * and must be backed out here.
 */
int
spin_trylock_contested(struct spinlock *spin)
{
	globaldata_t gd = mycpu;

	/*++spinlocks_contested1;*/
	/*atomic_add_int(&spin->counta, -1);*/
	--gd->gd_spinlocks;
	--gd->gd_curthread->td_critcount;
	return (FALSE);
}

/*
 * The spin_lock() inline was unable to acquire the lock.
 *
 * atomic_swap_int() is the absolute fastest spinlock instruction, at
 * least on multi-socket systems.  All instructions seem to be about
 * the same on single-socket multi-core systems.  However, atomic_swap_int()
 * does not result in an even distribution of successful acquisitions.
 *
 * UNFORTUNATELY we cannot really use atomic_swap_int() when also implementing
 * shared spin locks, so as we do a better job removing contention we've
 * moved to atomic_cmpset_int() to be able to handle multiple states.
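 *
 * Rough illustration only (not the actual inline): the cmpset style
 * used by the contested paths below pre-reads counta non-atomically
 * and only issues the atomic op when the pre-read looks acquirable:
 *
 *	uint32_t ovalue = spin->counta;		(cheap, non-atomic read)
 *	if (looks_acquirable(ovalue) &&
 *	    atomic_cmpset_int(&spin->counta, ovalue, new_state(ovalue)))
 *		(acquired)
 *
 * where looks_acquirable() and new_state() are stand-ins for the
 * exclusive and shared tests coded out in spin_lock_contested() and
 * spin_lock_shared_contested() below.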
 *
 * Another problem we have is that (at least on the 48-core Opteron we test
 * with) having all 48 cores contesting the same spin lock reduces
 * performance to around 600,000 ops/sec, versus millions when fewer cores
 * are going after the same lock.
 *
 * Backoff algorithms can create even worse starvation problems, and don't
 * really improve performance when a lot of cores are contending.
 *
 * Our solution is to allow the data cache to lazy-update by reading it
 * non-atomically and only attempting to acquire the lock if the lazy read
 * looks good.  This effectively limits cache bus bandwidth.  A cpu_pause()
 * (for intel/amd anyhow) is not strictly needed as cache bus resource use
 * is governed by the lazy update.
 *
 * WARNING!!!!  Performance matters here, by a huge margin.
 *
 * 48-core test with pre-read / -j 48 no-modules kernel compile
 * with fanned-out inactive and active queues came in at 55 seconds.
 *
 * 48-core test with pre-read / -j 48 no-modules kernel compile
 * came in at 75 seconds.  Without pre-read it came in at 170 seconds.
 *
 * 4-core test with pre-read / -j 48 no-modules kernel compile
 * came in at 83 seconds.  Without pre-read it came in at 83 seconds
 * as well (no difference).
 */
void
spin_lock_contested(struct spinlock *spin)
{
	struct indefinite_info info = { 0, 0 };
	int i;

	/*
	 * Force any existing shared locks to exclusive so no new shared
	 * locks can occur.  Transfer our count to the high bits, then
	 * loop until we can acquire the low counter (== 1).
	 */
	atomic_clear_int(&spin->counta, SPINLOCK_SHARED);
	atomic_add_int(&spin->counta, SPINLOCK_EXCLWAIT - 1);

#ifdef DEBUG_LOCKS_LATENCY
	long j;
	for (j = spinlocks_add_latency; j > 0; --j)
		cpu_ccfence();
#endif
#ifdef INVARIANTS
	if (spin_lock_test_mode > 10 &&
	    spin->countb > spin_lock_test_mode &&
	    (spin_lock_test_mode & 0xFF) == mycpu->gd_cpuid) {
		spin->countb = 0;
		print_backtrace(-1);
	}
#endif

	i = 0;
	++spin->countb;

	/*logspin(beg, spin, 'w');*/
	for (;;) {
		/*
		 * If the low bits are zero, try to acquire the exclusive lock
		 * by transferring our high-bit counter to the low bits.
		 *
		 * NOTE: Reading spin->counta prior to the swap is extremely
		 *	 important on multi-chip/many-core boxes.  On 48-core
		 *	 this one change improves fully concurrent all-cores
		 *	 compiles by 100% or better.
		 *
		 *	 I can't emphasize enough how important the pre-read
		 *	 is in preventing hw cache bus armageddon on
		 *	 multi-chip systems.  And on single-chip/multi-core
		 *	 systems it just doesn't hurt.
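		 *
		 *	 (The cpu_ccfence() below is a compiler-only
		 *	 fence: it keeps the compiler from caching
		 *	 spin->counta across loop iterations without
		 *	 generating a locked bus cycle.)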
		 */
		uint32_t ovalue = spin->counta;
		cpu_ccfence();
		if ((ovalue & (SPINLOCK_EXCLWAIT - 1)) == 0 &&
		    atomic_cmpset_int(&spin->counta, ovalue,
				      (ovalue - SPINLOCK_EXCLWAIT) | 1)) {
			break;
		}
		if ((++i & 0x7F) == 0x7F) {
			++spin->countb;
			if (spin_indefinite_check(spin, &info))
				break;
		}
	}
	/*logspin(end, spin, 'w');*/
}

/*
 * Shared spinlocks
 */
void
spin_lock_shared_contested(struct spinlock *spin)
{
	struct indefinite_info info = { 0, 0 };
	int i;

	atomic_add_int(&spin->counta, -1);
#ifdef DEBUG_LOCKS_LATENCY
	long j;
	for (j = spinlocks_add_latency; j > 0; --j)
		cpu_ccfence();
#endif
#ifdef INVARIANTS
	if (spin_lock_test_mode > 10 &&
	    spin->countb > spin_lock_test_mode &&
	    (spin_lock_test_mode & 0xFF) == mycpu->gd_cpuid) {
		spin->countb = 0;
		print_backtrace(-1);
	}
#endif

	i = 0;
	++spin->countb;

	/*logspin(beg, spin, 'w');*/
	for (;;) {
		/*
		 * NOTE: Reading spin->counta prior to the swap is extremely
		 *	 important on multi-chip/many-core boxes.  On 48-core
		 *	 this one change improves fully concurrent all-cores
		 *	 compiles by 100% or better.
		 *
		 *	 I can't emphasize enough how important the pre-read
		 *	 is in preventing hw cache bus armageddon on
		 *	 multi-chip systems.  And on single-chip/multi-core
		 *	 systems it just doesn't hurt.
		 */
		uint32_t ovalue = spin->counta;

		cpu_ccfence();
		if (ovalue == 0) {
			if (atomic_cmpset_int(&spin->counta, 0,
					      SPINLOCK_SHARED | 1))
				break;
		} else if (ovalue & SPINLOCK_SHARED) {
			if (atomic_cmpset_int(&spin->counta, ovalue,
					      ovalue + 1))
				break;
		}
		if ((++i & 0x7F) == 0x7F) {
			++spin->countb;
			if (spin_indefinite_check(spin, &info))
				break;
		}
	}
	/*logspin(end, spin, 'w');*/
}

/*
 * Pool functions (SHARED SPINLOCKS NOT SUPPORTED)
 */
static __inline int
_spin_pool_hash(void *ptr)
{
	int i;

	i = ((int)(uintptr_t)ptr >> 5) ^ ((int)(uintptr_t)ptr >> 12);
	i &= SPINLOCK_NUM_POOL_MASK;
	return (i);
}

void
_spin_pool_lock(void *chan)
{
	struct spinlock *sp;

	sp = &pool_spinlocks[_spin_pool_hash(chan)].spin;
	spin_lock(sp);
}

void
_spin_pool_unlock(void *chan)
{
	struct spinlock *sp;

	sp = &pool_spinlocks[_spin_pool_hash(chan)].spin;
	spin_unlock(sp);
}

static
int
spin_indefinite_check(struct spinlock *spin, struct indefinite_info *info)
{
	sysclock_t count;

	cpu_spinlock_contested();

	count = sys_cputimer->count();
	if (info->secs == 0) {
		info->base = count;
		++info->secs;
	} else if (count - info->base > sys_cputimer->freq) {
		kprintf("spin_lock: %p, indefinite wait (%d secs)!\n",
			spin, info->secs);
		info->base = count;
		++info->secs;
		if (panicstr)
			return (TRUE);
#if defined(INVARIANTS)
		if (spin_lock_test_mode) {
			print_backtrace(-1);
			return (TRUE);
		}
		if (info->secs == 11)
			print_backtrace(-1);
#endif
		if (info->secs == 60)
			panic("spin_lock: %p, indefinite wait!", spin);
	}
	return (FALSE);
}

/*
 * If INVARIANTS is enabled various spinlock timing tests can be run
 * by setting debug.spin_lock_test:
 *
 *	1	Test the indefinite wait code
 *	2	Time the best-case exclusive lock overhead (spin_test_count)
 *	3	Time the best-case shared lock overhead (spin_test_count)
 */
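
/*
 * NOTE: sysctl_spin_lock_test() below only implements tests 1 and 2.
 * The following is a rough, unverified sketch of what the missing
 * shared-lock timing test (3) might look like inside that handler.
 * spin_lock_shared_quick()/spin_unlock_shared_quick() are assumed to
 * be the shared counterparts of the exclusive inlines used by test 2
 * and should be checked against sys/spinlock2.h before enabling.
 */
#if 0
	/*
	 * Time best-case shared spinlocks (would go after the value == 2
	 * block in sysctl_spin_lock_test()).
	 */
	if (value == 3) {
		globaldata_t gd = mycpu;

		spin_init(&spin);
		for (i = spin_test_count; i > 0; --i) {
			spin_lock_shared_quick(gd, &spin);
			spin_unlock_shared_quick(gd, &spin);
		}
	}
#endif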

#ifdef INVARIANTS

static int spin_test_count = 10000000;
SYSCTL_INT(_debug, OID_AUTO, spin_test_count, CTLFLAG_RW, &spin_test_count, 0,
    "Number of iterations to use for spinlock wait code test");

static int
sysctl_spin_lock_test(SYSCTL_HANDLER_ARGS)
{
	struct spinlock spin;
	int error;
	int value = 0;
	int i;

	if ((error = priv_check(curthread, PRIV_ROOT)) != 0)
		return (error);
	if ((error = SYSCTL_IN(req, &value, sizeof(value))) != 0)
		return (error);

	/*
	 * Indefinite wait test
	 */
	if (value == 1) {
		spin_init(&spin);
		spin_lock(&spin);	/* force an indefinite wait */
		spin_lock_test_mode = 1;
		spin_lock(&spin);
		spin_unlock(&spin);	/* Clean up the spinlock count */
		spin_unlock(&spin);
		spin_lock_test_mode = 0;
	}

	/*
	 * Time best-case exclusive spinlocks
	 */
	if (value == 2) {
		globaldata_t gd = mycpu;

		spin_init(&spin);
		for (i = spin_test_count; i > 0; --i) {
			spin_lock_quick(gd, &spin);
			spin_unlock_quick(gd, &spin);
		}
	}

	return (0);
}

SYSCTL_PROC(_debug, OID_AUTO, spin_lock_test, CTLFLAG_RW|CTLTYPE_INT,
    0, 0, sysctl_spin_lock_test, "I", "Test spinlock wait code");

#endif	/* INVARIANTS */
#endif	/* SMP */