xref: /dragonfly/sys/kern/kern_mutex.c (revision ff0a8843)
1 /*
2  * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * Implement fast persistent locks based on atomic_cmpset_int() with
36  * semantics similar to lockmgr locks but faster and taking up much less
37  * space.  Taken from HAMMER's lock implementation.
38  *
39  * These are meant to complement our LWKT tokens.  Tokens are only held
40  * while the thread is running.  Mutexes can be held across blocking
41  * conditions.
42  *
43  * - Exclusive priority over shared to prevent SMP starvation.
44  * - locks can be aborted (async callback, if any, will be made w/ENOLCK).
45  * - locks can be asynchronous.
46  * - synchronous fast path if no blocking occurs (async callback is not
47  *   made in this case).
48  *
49  * Generally speaking any caller-supplied link state must be properly
50  * initialized before use.
51  *
52  * Most of the support is in sys/mutex[2].h.  We mostly provide backoff
53  * functions here.
54  */
55 
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/sysctl.h>
60 #include <sys/indefinite.h>
61 #include <sys/thread.h>
62 
63 #include <machine/cpufunc.h>
64 
65 #include <sys/thread2.h>
66 #include <sys/mutex2.h>
67 #include <sys/indefinite2.h>
68 
69 static int mtx_chain_link_ex(mtx_t *mtx, u_int olock);
70 static int mtx_chain_link_sh(mtx_t *mtx, u_int olock);
71 static void mtx_delete_link(mtx_t *mtx, mtx_link_t *link);
72 
73 /*
74  * Exclusive-lock a mutex, block until acquired unless link is async.
75  * Recursion is allowed.
76  *
77  * Returns 0 on success, the tsleep() return code on failure, EINPROGRESS
78  * if async.  If immediately successful an async exclusive lock will return 0
79  * and not issue the async callback or link the link structure.  The caller
80  * must handle this case (typically this is an optimal code path).
81  *
82  * A tsleep() error can only be returned if PCATCH is specified in the flags.
83  */
84 static __inline int
85 __mtx_lock_ex(mtx_t *mtx, mtx_link_t *link, int flags, int to)
86 {
87 	thread_t td;
88 	u_int	lock;
89 	u_int	nlock;
90 	int	error;
91 	int	isasync;
92 
93 	for (;;) {
94 		lock = mtx->mtx_lock;
95 		cpu_ccfence();
96 
97 		if (lock == 0) {
98 			nlock = MTX_EXCLUSIVE | 1;
99 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
100 				mtx->mtx_owner = curthread;
101 				cpu_sfence();
102 				link->state = MTX_LINK_ACQUIRED;
103 				error = 0;
104 				break;
105 			}
106 			continue;
107 		}
108 		if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) {
109 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
110 			nlock = lock + 1;
111 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
112 				cpu_sfence();
113 				link->state = MTX_LINK_ACQUIRED;
114 				error = 0;
115 				break;
116 			}
117 			continue;
118 		}
119 
120 		/*
121 		 * We need MTX_LINKSPIN to manipulate exlink or
122 		 * shlink.
123 		 *
124 		 * We must set MTX_EXWANTED with MTX_LINKSPIN to indicate
125 		 * pending exclusive requests.  It cannot be set as a separate
126 		 * operation prior to acquiring MTX_LINKSPIN.
127 		 *
128 		 * To avoid unnecessary cpu cache traffic we poll
129 		 * for collisions.  It is also possible that EXWANTED
130 		 * state failing the above test was spurious, so all the
131 		 * tests must be repeated if we cannot obtain LINKSPIN
132 		 * with the prior state tests intact (i.e. don't reload
133 		 * the (lock) variable here, for heaven's sake!).
134 		 */
135 		if (lock & MTX_LINKSPIN) {
136 			cpu_pause();
137 			continue;
138 		}
139 		td = curthread;
140 		nlock = lock | MTX_EXWANTED | MTX_LINKSPIN;
141 		crit_enter_raw(td);
142 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
143 			crit_exit_raw(td);
144 			continue;
145 		}
146 
147 		/*
148 		 * Check for early abort.
149 		 */
150 		if (link->state == MTX_LINK_ABORTED) {
151 			if (mtx->mtx_exlink == NULL) {
152 				atomic_clear_int(&mtx->mtx_lock,
153 						 MTX_LINKSPIN |
154 						 MTX_EXWANTED);
155 			} else {
156 				atomic_clear_int(&mtx->mtx_lock,
157 						 MTX_LINKSPIN);
158 			}
159 			crit_exit_raw(td);
160 			link->state = MTX_LINK_IDLE;
161 			error = ENOLCK;
162 			break;
163 		}
164 
165 		/*
166 		 * Add our link to the exlink list and release LINKSPIN.
167 		 */
168 		link->owner = td;
169 		link->state = MTX_LINK_LINKED_EX;
170 		if (mtx->mtx_exlink) {
171 			link->next = mtx->mtx_exlink;
172 			link->prev = link->next->prev;
173 			link->next->prev = link;
174 			link->prev->next = link;
175 		} else {
176 			link->next = link;
177 			link->prev = link;
178 			mtx->mtx_exlink = link;
179 		}
180 		isasync = (link->callback != NULL);
181 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
182 		crit_exit_raw(td);
183 
184 		/*
185 		 * If asynchronous lock request return without
186 		 * blocking, leave link structure linked.
187 		 */
188 		if (isasync) {
189 			error = EINPROGRESS;
190 			break;
191 		}
192 
193 		/*
194 		 * Wait for lock
195 		 */
196 		error = mtx_wait_link(mtx, link, flags, to);
197 		break;
198 	}
199 	return (error);
200 }
201 
202 int
203 _mtx_lock_ex_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
204 {
205 	return(__mtx_lock_ex(mtx, link, flags, to));
206 }
207 
208 int
209 _mtx_lock_ex(mtx_t *mtx, int flags, int to)
210 {
211 	mtx_link_t link;
212 
213 	mtx_link_init(&link);
214 	return(__mtx_lock_ex(mtx, &link, flags, to));
215 }
216 
217 int
218 _mtx_lock_ex_quick(mtx_t *mtx)
219 {
220 	mtx_link_t link;
221 
222 	mtx_link_init(&link);
223 	return(__mtx_lock_ex(mtx, &link, 0, 0));
224 }
225 
226 /*
227  * Share-lock a mutex, block until acquired.  Recursion is allowed.
228  *
229  * Returns 0 on success, or the tsleep() return code on failure.
230  * An error can only be returned if PCATCH is specified in the flags.
231  *
232  * NOTE: Shared locks get a mass-wakeup so if the tsleep fails we
233  *	 do not have to chain the wakeup().
234  */
235 static __inline int
236 __mtx_lock_sh(mtx_t *mtx, mtx_link_t *link, int flags, int to)
237 {
238 	thread_t td;
239 	u_int	lock;
240 	u_int	nlock;
241 	int	error;
242 	int	isasync;
243 
244 	for (;;) {
245 		lock = mtx->mtx_lock;
246 		cpu_ccfence();
247 
248 		if (lock == 0) {
249 			nlock = 1;
250 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
251 				error = 0;
252 				cpu_sfence();
253 				link->state = MTX_LINK_ACQUIRED;
254 				break;
255 			}
256 			continue;
257 		}
258 		if ((lock & (MTX_EXCLUSIVE | MTX_EXWANTED)) == 0) {
259 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
260 			nlock = lock + 1;
261 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
262 				error = 0;
263 				cpu_sfence();
264 				link->state = MTX_LINK_ACQUIRED;
265 				break;
266 			}
267 			continue;
268 		}
269 
270 		/*
271 		 * We need MTX_LINKSPIN to manipulate exlink or
272 		 * shlink.
273 		 *
274 		 * We must set MTX_SHWANTED with MTX_LINKSPIN to indicate
275 		 * pending shared requests.  It cannot be set as a separate
276 		 * operation prior to acquiring MTX_LINKSPIN.
277 		 *
278 		 * To avoid unnecessary cpu cache traffic we poll
279 		 * for collisions.  It is also possible that EXWANTED
280 		 * state failing the above test was spurious, so all the
281 		 * tests must be repeated if we cannot obtain LINKSPIN
282 		 * with the prior state tests intact (i.e. don't reload
283 		 * the (lock) variable here, for heaven's sake!).
284 		 */
285 		if (lock & MTX_LINKSPIN) {
286 			cpu_pause();
287 			continue;
288 		}
289 		td = curthread;
290 		nlock = lock | MTX_SHWANTED | MTX_LINKSPIN;
291 		crit_enter_raw(td);
292 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
293 			crit_exit_raw(td);
294 			continue;
295 		}
296 
297 		/*
298 		 * Check for early abort.  Other shared lock requestors
299 		 * could have sneaked in before we set LINKSPIN so make
300 		 * sure we undo the state properly.
301 		 */
302 		if (link->state == MTX_LINK_ABORTED) {
303 			if (mtx->mtx_shlink) {
304 				atomic_clear_int(&mtx->mtx_lock,
305 						 MTX_LINKSPIN);
306 			} else {
307 				atomic_clear_int(&mtx->mtx_lock,
308 						 MTX_LINKSPIN |
309 						 MTX_SHWANTED);
310 			}
311 			crit_exit_raw(td);
312 			link->state = MTX_LINK_IDLE;
313 			error = ENOLCK;
314 			break;
315 		}
316 
317 		/*
318 		 * Add our link to the shlink list and release LINKSPIN.
319 		 */
320 		link->owner = td;
321 		link->state = MTX_LINK_LINKED_SH;
322 		if (mtx->mtx_shlink) {
323 			link->next = mtx->mtx_shlink;
324 			link->prev = link->next->prev;
325 			link->next->prev = link;
326 			link->prev->next = link;
327 		} else {
328 			link->next = link;
329 			link->prev = link;
330 			mtx->mtx_shlink = link;
331 		}
332 		isasync = (link->callback != NULL);
333 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
334 		crit_exit_raw(td);
335 
336 		/*
337 		 * If asynchronous lock request return without
338 		 * blocking, leave link structure linked.
339 		 */
340 		if (isasync) {
341 			error = EINPROGRESS;
342 			break;
343 		}
344 
345 		/*
346 		 * Wait for lock
347 		 */
348 		error = mtx_wait_link(mtx, link, flags, to);
349 		break;
350 	}
351 	return (error);
352 }
353 
354 int
355 _mtx_lock_sh_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
356 {
357 	return(__mtx_lock_sh(mtx, link, flags, to));
358 }
359 
360 int
361 _mtx_lock_sh(mtx_t *mtx, int flags, int to)
362 {
363 	mtx_link_t link;
364 
365 	mtx_link_init(&link);
366 	return(__mtx_lock_sh(mtx, &link, flags, to));
367 }
368 
369 int
370 _mtx_lock_sh_quick(mtx_t *mtx)
371 {
372 	mtx_link_t link;
373 
374 	mtx_link_init(&link);
375 	return(__mtx_lock_sh(mtx, &link, 0, 0));
376 }
377 
378 /*
379  * Get an exclusive spinlock the hard way.
380  */
381 void
382 _mtx_spinlock(mtx_t *mtx)
383 {
384 	u_int	lock;
385 	u_int	nlock;
386 	int	bb = 1;
387 	int	bo;
388 
389 	for (;;) {
390 		lock = mtx->mtx_lock;
391 		if (lock == 0) {
392 			nlock = MTX_EXCLUSIVE | 1;
393 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
394 				mtx->mtx_owner = curthread;
395 				break;
396 			}
397 		} else if ((lock & MTX_EXCLUSIVE) &&
398 			   mtx->mtx_owner == curthread) {
399 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
400 			nlock = lock + 1;
401 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
402 				break;
403 		} else {
404 			/* MWAIT here */
405 			if (bb < 1000)
406 				++bb;
407 			cpu_pause();
408 			for (bo = 0; bo < bb; ++bo)
409 				;
410 		}
411 		cpu_pause();
412 	}
413 }
414 
415 /*
416  * Attempt to acquire a spinlock, if we fail we must undo the
417  * gd->gd_spinlocks/gd->gd_curthead->td_critcount predisposition.
418  *
419  * Returns 0 on success, EAGAIN on failure.
420  */
421 int
422 _mtx_spinlock_try(mtx_t *mtx)
423 {
424 	globaldata_t gd = mycpu;
425 	u_int	lock;
426 	u_int	nlock;
427 	int	res = 0;
428 
429 	for (;;) {
430 		lock = mtx->mtx_lock;
431 		if (lock == 0) {
432 			nlock = MTX_EXCLUSIVE | 1;
433 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
434 				mtx->mtx_owner = gd->gd_curthread;
435 				break;
436 			}
437 		} else if ((lock & MTX_EXCLUSIVE) &&
438 			   mtx->mtx_owner == gd->gd_curthread) {
439 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
440 			nlock = lock + 1;
441 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
442 				break;
443 		} else {
444 			--gd->gd_spinlocks;
445 			cpu_ccfence();
446 			crit_exit_raw(gd->gd_curthread);
447 			res = EAGAIN;
448 			break;
449 		}
450 		cpu_pause();
451 	}
452 	return res;
453 }
454 
455 #if 0
456 
457 void
458 _mtx_spinlock_sh(mtx_t *mtx)
459 {
460 	u_int	lock;
461 	u_int	nlock;
462 	int	bb = 1;
463 	int	bo;
464 
465 	for (;;) {
466 		lock = mtx->mtx_lock;
467 		if ((lock & MTX_EXCLUSIVE) == 0) {
468 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
469 			nlock = lock + 1;
470 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
471 				break;
472 		} else {
473 			/* MWAIT here */
474 			if (bb < 1000)
475 				++bb;
476 			cpu_pause();
477 			for (bo = 0; bo < bb; ++bo)
478 				;
479 		}
480 		cpu_pause();
481 	}
482 }
483 
484 #endif
485 
486 int
487 _mtx_lock_ex_try(mtx_t *mtx)
488 {
489 	u_int	lock;
490 	u_int	nlock;
491 	int	error;
492 
493 	for (;;) {
494 		lock = mtx->mtx_lock;
495 		if (lock == 0) {
496 			nlock = MTX_EXCLUSIVE | 1;
497 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
498 				mtx->mtx_owner = curthread;
499 				error = 0;
500 				break;
501 			}
502 		} else if ((lock & MTX_EXCLUSIVE) &&
503 			   mtx->mtx_owner == curthread) {
504 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
505 			nlock = lock + 1;
506 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
507 				error = 0;
508 				break;
509 			}
510 		} else {
511 			error = EAGAIN;
512 			break;
513 		}
514 		cpu_pause();
515 	}
516 	return (error);
517 }
518 
519 int
520 _mtx_lock_sh_try(mtx_t *mtx)
521 {
522 	u_int	lock;
523 	u_int	nlock;
524 	int	error = 0;
525 
526 	for (;;) {
527 		lock = mtx->mtx_lock;
528 		if ((lock & MTX_EXCLUSIVE) == 0) {
529 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
530 			nlock = lock + 1;
531 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
532 				break;
533 		} else {
534 			error = EAGAIN;
535 			break;
536 		}
537 		cpu_pause();
538 	}
539 	return (error);
540 }
541 
542 /*
543  * If the lock is held exclusively it must be owned by the caller.  If the
544  * lock is already a shared lock this operation is a NOP.  A panic will
545  * occur if the lock is not held either shared or exclusive.
546  *
547  * The exclusive count is converted to a shared count.
548  */
549 void
550 _mtx_downgrade(mtx_t *mtx)
551 {
552 	u_int	lock;
553 	u_int	nlock;
554 
555 	for (;;) {
556 		lock = mtx->mtx_lock;
557 		cpu_ccfence();
558 
559 		/*
560 		 * NOP if already shared.
561 		 */
562 		if ((lock & MTX_EXCLUSIVE) == 0) {
563 			KKASSERT((lock & MTX_MASK) > 0);
564 			break;
565 		}
566 
567 		/*
568 		 * Transfer count to shared.  Any additional pending shared
569 		 * waiters must be woken up.
570 		 */
571 		if (lock & MTX_SHWANTED) {
572 			if (mtx_chain_link_sh(mtx, lock))
573 				break;
574 			/* retry */
575 		} else {
576 			nlock = lock & ~MTX_EXCLUSIVE;
577 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
578 				break;
579 			/* retry */
580 		}
581 		cpu_pause();
582 	}
583 }
584 
585 /*
586  * Upgrade a shared lock to an exclusive lock.  The upgrade will fail if
587  * the shared lock has a count other then 1.  Optimize the most likely case
588  * but note that a single cmpset can fail due to WANTED races.
589  *
590  * If the lock is held exclusively it must be owned by the caller and
591  * this function will simply return without doing anything.   A panic will
592  * occur if the lock is held exclusively by someone other then the caller.
593  *
594  * Returns 0 on success, EDEADLK on failure.
595  */
596 int
597 _mtx_upgrade_try(mtx_t *mtx)
598 {
599 	u_int	lock;
600 	u_int	nlock;
601 	int	error = 0;
602 
603 	for (;;) {
604 		lock = mtx->mtx_lock;
605 		cpu_ccfence();
606 
607 		if ((lock & ~MTX_EXWANTED) == 1) {
608 			nlock = lock | MTX_EXCLUSIVE;
609 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
610 				mtx->mtx_owner = curthread;
611 				break;
612 			}
613 		} else if (lock & MTX_EXCLUSIVE) {
614 			KKASSERT(mtx->mtx_owner == curthread);
615 			break;
616 		} else {
617 			error = EDEADLK;
618 			break;
619 		}
620 		cpu_pause();
621 	}
622 	return (error);
623 }
624 
625 /*
626  * Unlock a lock.  The caller must hold the lock either shared or exclusive.
627  *
628  * On the last release we handle any pending chains.
629  */
630 void
631 _mtx_unlock(mtx_t *mtx)
632 {
633 	u_int	lock;
634 	u_int	nlock;
635 
636 	for (;;) {
637 		lock = mtx->mtx_lock;
638 		cpu_ccfence();
639 
640 		switch(lock) {
641 		case MTX_EXCLUSIVE | 1:
642 			/*
643 			 * Last release, exclusive lock.
644 			 * No exclusive or shared requests pending.
645 			 */
646 			mtx->mtx_owner = NULL;
647 			nlock = 0;
648 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
649 				goto done;
650 			break;
651 		case MTX_EXCLUSIVE | MTX_EXWANTED | 1:
652 		case MTX_EXCLUSIVE | MTX_EXWANTED | MTX_SHWANTED | 1:
653 			/*
654 			 * Last release, exclusive lock.
655 			 * Exclusive requests pending.
656 			 * Exclusive requests have priority over shared reqs.
657 			 */
658 			if (mtx_chain_link_ex(mtx, lock))
659 				goto done;
660 			break;
661 		case MTX_EXCLUSIVE | MTX_SHWANTED | 1:
662 			/*
663 			 * Last release, exclusive lock.
664 			 *
665 			 * Shared requests are pending.  Transfer our count (1)
666 			 * to the first shared request, wakeup all shared reqs.
667 			 */
668 			if (mtx_chain_link_sh(mtx, lock))
669 				goto done;
670 			break;
671 		case 1:
672 			/*
673 			 * Last release, shared lock.
674 			 * No exclusive or shared requests pending.
675 			 */
676 			nlock = 0;
677 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
678 				goto done;
679 			break;
680 		case MTX_EXWANTED | 1:
681 		case MTX_EXWANTED | MTX_SHWANTED | 1:
682 			/*
683 			 * Last release, shared lock.
684 			 *
685 			 * Exclusive requests are pending.  Upgrade this
686 			 * final shared lock to exclusive and transfer our
687 			 * count (1) to the next exclusive request.
688 			 *
689 			 * Exclusive requests have priority over shared reqs.
690 			 */
691 			if (mtx_chain_link_ex(mtx, lock))
692 				goto done;
693 			break;
694 		case MTX_SHWANTED | 1:
695 			/*
696 			 * Last release, shared lock.
697 			 * Shared requests pending.
698 			 */
699 			if (mtx_chain_link_sh(mtx, lock))
700 				goto done;
701 			break;
702 		default:
703 			/*
704 			 * We have to loop if this is the last release but
705 			 * someone is fiddling with LINKSPIN.
706 			 */
707 			if ((lock & MTX_MASK) == 1) {
708 				KKASSERT(lock & MTX_LINKSPIN);
709 				break;
710 			}
711 
712 			/*
713 			 * Not the last release (shared or exclusive)
714 			 */
715 			nlock = lock - 1;
716 			KKASSERT((nlock & MTX_MASK) != MTX_MASK);
717 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
718 				goto done;
719 			break;
720 		}
721 		/* loop try again */
722 		cpu_pause();
723 	}
724 done:
725 	;
726 }
727 
728 /*
729  * Chain pending links.  Called on the last release of an exclusive or
730  * shared lock when the appropriate WANTED bit is set.  mtx_lock old state
731  * is passed in with the count left at 1, which we can inherit, and other
732  * bits which we must adjust in a single atomic operation.
733  *
734  * Return non-zero on success, 0 if caller needs to retry.
735  *
736  * NOTE: It's ok if MTX_EXWANTED is in an indeterminant state while we are
737  *	 acquiring LINKSPIN as all other cases will also need to acquire
738  *	 LINKSPIN when handling the EXWANTED case.
739  */
740 static int
741 mtx_chain_link_ex(mtx_t *mtx, u_int olock)
742 {
743 	thread_t td = curthread;
744 	mtx_link_t *link;
745 	u_int	nlock;
746 
747 	olock &= ~MTX_LINKSPIN;
748 	nlock = olock | MTX_LINKSPIN | MTX_EXCLUSIVE;	/* upgrade if necc */
749 	crit_enter_raw(td);
750 	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
751 		link = mtx->mtx_exlink;
752 		KKASSERT(link != NULL);
753 		if (link->next == link) {
754 			mtx->mtx_exlink = NULL;
755 			nlock = MTX_LINKSPIN | MTX_EXWANTED;	/* to clear */
756 		} else {
757 			mtx->mtx_exlink = link->next;
758 			link->next->prev = link->prev;
759 			link->prev->next = link->next;
760 			nlock = MTX_LINKSPIN;			/* to clear */
761 		}
762 		KKASSERT(link->state == MTX_LINK_LINKED_EX);
763 		mtx->mtx_owner = link->owner;
764 		cpu_sfence();
765 
766 		/*
767 		 * WARNING! The callback can only be safely
768 		 *	    made with LINKSPIN still held
769 		 *	    and in a critical section.
770 		 *
771 		 * WARNING! The link can go away after the
772 		 *	    state is set, or after the
773 		 *	    callback.
774 		 */
775 		if (link->callback) {
776 			link->state = MTX_LINK_CALLEDBACK;
777 			link->callback(link, link->arg, 0);
778 		} else {
779 			link->state = MTX_LINK_ACQUIRED;
780 			wakeup(link);
781 		}
782 		atomic_clear_int(&mtx->mtx_lock, nlock);
783 		crit_exit_raw(td);
784 		return 1;
785 	}
786 	/* retry */
787 	crit_exit_raw(td);
788 
789 	return 0;
790 }
791 
792 /*
793  * Flush waiting shared locks.  The lock's prior state is passed in and must
794  * be adjusted atomically only if it matches and LINKSPIN is not set.
795  *
796  * IMPORTANT! The caller has left one active count on the lock for us to
797  *	      consume.  We will apply this to the first link, but must add
798  *	      additional counts for any other links.
799  */
800 static int
801 mtx_chain_link_sh(mtx_t *mtx, u_int olock)
802 {
803 	thread_t td = curthread;
804 	mtx_link_t *link;
805 	u_int	addcount;
806 	u_int	nlock;
807 
808 	olock &= ~MTX_LINKSPIN;
809 	nlock = olock | MTX_LINKSPIN;
810 	nlock &= ~MTX_EXCLUSIVE;
811 	crit_enter_raw(td);
812 	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
813 		/*
814 		 * It should not be possible for SHWANTED to be set without
815 		 * any links pending.
816 		 */
817 		KKASSERT(mtx->mtx_shlink != NULL);
818 
819 		/*
820 		 * We have to process the count for all shared locks before
821 		 * we process any of the links.  Count the additional shared
822 		 * locks beyond the first link (which is already accounted
823 		 * for) and associate the full count with the lock
824 		 * immediately.
825 		 */
826 		addcount = 0;
827 		for (link = mtx->mtx_shlink->next; link != mtx->mtx_shlink;
828 		     link = link->next) {
829 			++addcount;
830 		}
831 		if (addcount > 0)
832 			atomic_add_int(&mtx->mtx_lock, addcount);
833 
834 		/*
835 		 * We can wakeup all waiting shared locks.
836 		 */
837 		while ((link = mtx->mtx_shlink) != NULL) {
838 			KKASSERT(link->state == MTX_LINK_LINKED_SH);
839 			if (link->next == link) {
840 				mtx->mtx_shlink = NULL;
841 			} else {
842 				mtx->mtx_shlink = link->next;
843 				link->next->prev = link->prev;
844 				link->prev->next = link->next;
845 			}
846 			link->next = NULL;
847 			link->prev = NULL;
848 			cpu_sfence();
849 			if (link->callback) {
850 				link->state = MTX_LINK_CALLEDBACK;
851 				link->callback(link, link->arg, 0);
852 			} else {
853 				cpu_sfence();
854 				link->state = MTX_LINK_ACQUIRED;
855 				wakeup(link);
856 			}
857 		}
858 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN |
859 						 MTX_SHWANTED);
860 		crit_exit_raw(td);
861 		return 1;
862 	}
863 	/* retry */
864 	crit_exit_raw(td);
865 
866 	return 0;
867 }
868 
869 /*
870  * Delete a link structure after tsleep has failed.  This code is not
871  * in the critical path as most exclusive waits are chained.
872  */
873 static
874 void
875 mtx_delete_link(mtx_t *mtx, mtx_link_t *link)
876 {
877 	thread_t td = curthread;
878 	u_int	lock;
879 	u_int	nlock;
880 
881 	/*
882 	 * Acquire MTX_LINKSPIN.
883 	 *
884 	 * Do not use cmpxchg to wait for LINKSPIN to clear as this might
885 	 * result in too much cpu cache traffic.
886 	 */
887 	crit_enter_raw(td);
888 	for (;;) {
889 		lock = mtx->mtx_lock;
890 		if (lock & MTX_LINKSPIN) {
891 			cpu_pause();
892 			continue;
893 		}
894 		nlock = lock | MTX_LINKSPIN;
895 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
896 			break;
897 		cpu_pause();
898 	}
899 
900 	/*
901 	 * Delete the link and release LINKSPIN.
902 	 */
903 	nlock = MTX_LINKSPIN;	/* to clear */
904 
905 	switch(link->state) {
906 	case MTX_LINK_LINKED_EX:
907 		if (link->next == link) {
908 			mtx->mtx_exlink = NULL;
909 			nlock |= MTX_EXWANTED;	/* to clear */
910 		} else {
911 			mtx->mtx_exlink = link->next;
912 			link->next->prev = link->prev;
913 			link->prev->next = link->next;
914 		}
915 		break;
916 	case MTX_LINK_LINKED_SH:
917 		if (link->next == link) {
918 			mtx->mtx_shlink = NULL;
919 			nlock |= MTX_SHWANTED;	/* to clear */
920 		} else {
921 			mtx->mtx_shlink = link->next;
922 			link->next->prev = link->prev;
923 			link->prev->next = link->next;
924 		}
925 		break;
926 	default:
927 		/* no change */
928 		break;
929 	}
930 	atomic_clear_int(&mtx->mtx_lock, nlock);
931 	crit_exit_raw(td);
932 }
933 
934 /*
935  * Wait for async lock completion or abort.  Returns ENOLCK if an abort
936  * occurred.
937  */
938 int
939 mtx_wait_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
940 {
941 	indefinite_info_t info;
942 	int error;
943 
944 	indefinite_init(&info, mtx->mtx_ident, 1,
945 			((link->state & MTX_LINK_LINKED_SH) ? 'm' : 'M'));
946 
947 	/*
948 	 * Sleep.  Handle false wakeups, interruptions, etc.
949 	 * The link may also have been aborted.  The LINKED
950 	 * bit was set by this cpu so we can test it without
951 	 * fences.
952 	 */
953 	error = 0;
954 	while (link->state & MTX_LINK_LINKED) {
955 		tsleep_interlock(link, 0);
956 		cpu_lfence();
957 		if (link->state & MTX_LINK_LINKED) {
958 			error = tsleep(link, flags | PINTERLOCKED,
959 				       mtx->mtx_ident, to);
960 			if (error)
961 				break;
962 		}
963 		if ((mtx->mtx_flags & MTXF_NOCOLLSTATS) == 0)
964 			indefinite_check(&info);
965 	}
966 
967 	/*
968 	 * We need at least a lfence (load fence) to ensure our cpu does not
969 	 * reorder loads (of data outside the lock structure) prior to the
970 	 * remote cpu's release, since the above test may have run without
971 	 * any atomic interactions.
972 	 *
973 	 * If we do not do this then state updated by the other cpu before
974 	 * releasing its lock may not be read cleanly by our cpu when this
975 	 * function returns.  Even though the other cpu ordered its stores,
976 	 * our loads can still be out of order.
977 	 */
978 	cpu_mfence();
979 
980 	/*
981 	 * We are done, make sure the link structure is unlinked.
982 	 * It may still be on the list due to e.g. EINTR or
983 	 * EWOULDBLOCK.
984 	 *
985 	 * It is possible for the tsleep to race an ABORT and cause
986 	 * error to be 0.
987 	 *
988 	 * The tsleep() can be woken up for numerous reasons and error
989 	 * might be zero in situations where we intend to return an error.
990 	 *
991 	 * (This is the synchronous case so state cannot be CALLEDBACK)
992 	 */
993 	switch(link->state) {
994 	case MTX_LINK_ACQUIRED:
995 	case MTX_LINK_CALLEDBACK:
996 		error = 0;
997 		break;
998 	case MTX_LINK_ABORTED:
999 		error = ENOLCK;
1000 		break;
1001 	case MTX_LINK_LINKED_EX:
1002 	case MTX_LINK_LINKED_SH:
1003 		mtx_delete_link(mtx, link);
1004 		/* fall through */
1005 	default:
1006 		if (error == 0)
1007 			error = EWOULDBLOCK;
1008 		break;
1009 	}
1010 
1011 	/*
1012 	 * Clear state on status returned.
1013 	 */
1014 	link->state = MTX_LINK_IDLE;
1015 
1016 	if ((mtx->mtx_flags & MTXF_NOCOLLSTATS) == 0)
1017 		indefinite_done(&info);
1018 
1019 	return error;
1020 }
1021 
1022 /*
1023  * Abort a mutex locking operation, causing mtx_lock_ex_link() to
1024  * return ENOLCK.  This may be called at any time after the mtx_link
1025  * is initialized or the status from a previous lock has been
1026  * returned.  If called prior to the next (non-try) lock attempt, the
1027  * next lock attempt using this link structure will abort instantly.
1028  *
1029  * Caller must still wait for the operation to complete, either from a
1030  * blocking call that is still in progress or by calling mtx_wait_link().
1031  *
1032  * If an asynchronous lock request is possibly in-progress, the caller
1033  * should call mtx_wait_link() synchronously.  Note that the asynchronous
1034  * lock callback will NOT be called if a successful abort occurred. XXX
1035  */
1036 void
1037 mtx_abort_link(mtx_t *mtx, mtx_link_t *link)
1038 {
1039 	thread_t td = curthread;
1040 	u_int	lock;
1041 	u_int	nlock;
1042 
1043 	/*
1044 	 * Acquire MTX_LINKSPIN
1045 	 */
1046 	crit_enter_raw(td);
1047 	for (;;) {
1048 		lock = mtx->mtx_lock;
1049 		if (lock & MTX_LINKSPIN) {
1050 			cpu_pause();
1051 			continue;
1052 		}
1053 		nlock = lock | MTX_LINKSPIN;
1054 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
1055 			break;
1056 		cpu_pause();
1057 	}
1058 
1059 	/*
1060 	 * Do the abort.
1061 	 *
1062 	 * WARNING! Link structure can disappear once link->state is set.
1063 	 */
1064 	nlock = MTX_LINKSPIN;	/* to clear */
1065 
1066 	switch(link->state) {
1067 	case MTX_LINK_IDLE:
1068 		/*
1069 		 * Link not started yet
1070 		 */
1071 		link->state = MTX_LINK_ABORTED;
1072 		break;
1073 	case MTX_LINK_LINKED_EX:
1074 		/*
1075 		 * de-link, mark aborted, and potentially wakeup the thread
1076 		 * or issue the callback.
1077 		 */
1078 		if (link->next == link) {
1079 			if (mtx->mtx_exlink == link) {
1080 				mtx->mtx_exlink = NULL;
1081 				nlock |= MTX_EXWANTED;	/* to clear */
1082 			}
1083 		} else {
1084 			if (mtx->mtx_exlink == link)
1085 				mtx->mtx_exlink = link->next;
1086 			link->next->prev = link->prev;
1087 			link->prev->next = link->next;
1088 		}
1089 
1090 		/*
1091 		 * When aborting the async callback is still made.  We must
1092 		 * not set the link status to ABORTED in the callback case
1093 		 * since there is nothing else to clear its status if the
1094 		 * link is reused.
1095 		 */
1096 		if (link->callback) {
1097 			link->state = MTX_LINK_CALLEDBACK;
1098 			link->callback(link, link->arg, ENOLCK);
1099 		} else {
1100 			link->state = MTX_LINK_ABORTED;
1101 			wakeup(link);
1102 		}
1103 		break;
1104 	case MTX_LINK_LINKED_SH:
1105 		/*
1106 		 * de-link, mark aborted, and potentially wakeup the thread
1107 		 * or issue the callback.
1108 		 */
1109 		if (link->next == link) {
1110 			if (mtx->mtx_shlink == link) {
1111 				mtx->mtx_shlink = NULL;
1112 				nlock |= MTX_SHWANTED;	/* to clear */
1113 			}
1114 		} else {
1115 			if (mtx->mtx_shlink == link)
1116 				mtx->mtx_shlink = link->next;
1117 			link->next->prev = link->prev;
1118 			link->prev->next = link->next;
1119 		}
1120 
1121 		/*
1122 		 * When aborting the async callback is still made.  We must
1123 		 * not set the link status to ABORTED in the callback case
1124 		 * since there is nothing else to clear its status if the
1125 		 * link is reused.
1126 		 */
1127 		if (link->callback) {
1128 			link->state = MTX_LINK_CALLEDBACK;
1129 			link->callback(link, link->arg, ENOLCK);
1130 		} else {
1131 			link->state = MTX_LINK_ABORTED;
1132 			wakeup(link);
1133 		}
1134 		break;
1135 	case MTX_LINK_ACQUIRED:
1136 	case MTX_LINK_CALLEDBACK:
1137 		/*
1138 		 * Too late, the lock was acquired.  Let it complete.
1139 		 */
1140 		break;
1141 	default:
1142 		/*
1143 		 * link already aborted, do nothing.
1144 		 */
1145 		break;
1146 	}
1147 	atomic_clear_int(&mtx->mtx_lock, nlock);
1148 	crit_exit_raw(td);
1149 }
1150