xref: /dragonfly/sys/kern/kern_mutex.c (revision 7d84b73d)
1 /*
2  * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * Implement fast persistent locks based on atomic_cmpset_int() with
36  * semantics similar to lockmgr locks but faster and taking up much less
37  * space.  Taken from HAMMER's lock implementation.
38  *
39  * These are meant to complement our LWKT tokens.  Tokens are only held
40  * while the thread is running.  Mutexes can be held across blocking
41  * conditions.
42  *
43  * - Exclusive priority over shared to prevent SMP starvation.
44  * - locks can be aborted (async callback, if any, will be made w/ENOLCK).
45  * - locks can be asynchronous.
46  * - synchronous fast path if no blocking occurs (async callback is not
47  *   made in this case).
48  *
49  * Generally speaking any caller-supplied link state must be properly
50  * initialized before use.
51  *
52  * Most of the support is in sys/mutex[2].h.  We mostly provide backoff
53  * functions here.
54  */
55 
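/*
 * Illustrative usage sketch (not compiled).  The convenience wrappers
 * shown here (mtx_init(), mtx_lock_ex_quick(), mtx_lock_sh_quick(),
 * mtx_unlock()) are assumed to be the inlines provided by sys/mutex2.h;
 * see that header for the exact names and signatures.
 */
#if 0
struct mydata {
	mtx_t	md_lock;
	int	md_value;
};

static void
mydata_example(struct mydata *md)
{
	mtx_init(&md->md_lock, "mydata");	/* exact signature assumed */

	mtx_lock_ex_quick(&md->md_lock);	/* exclusive, may block/sleep */
	md->md_value++;
	mtx_unlock(&md->md_lock);

	mtx_lock_sh_quick(&md->md_lock);	/* shared, read-only access */
	kprintf("md_value %d\n", md->md_value);
	mtx_unlock(&md->md_lock);
}
#endif
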
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/sysctl.h>
60 #include <sys/indefinite.h>
61 #include <sys/thread.h>
62 
63 #include <machine/cpufunc.h>
64 
65 #include <sys/thread2.h>
66 #include <sys/mutex2.h>
67 #include <sys/indefinite2.h>
68 
69 static int mtx_chain_link_ex(mtx_t *mtx, u_int olock);
70 static int mtx_chain_link_sh(mtx_t *mtx, u_int olock);
71 static void mtx_delete_link(mtx_t *mtx, mtx_link_t *link);
72 
73 /*
74  * Exclusive-lock a mutex, block until acquired unless link is async.
75  * Recursion is allowed.
76  *
77  * Returns 0 on success, the tsleep() return code on failure, EINPROGRESS
78  * if async.  If immediately successful, an async exclusive lock will return 0
79  * and not issue the async callback or link the link structure.  The caller
80  * must handle this case (typically this is an optimal code path).
81  *
82  * A tsleep() error can only be returned if PCATCH is specified in the flags.
83  */
84 static __inline int
85 __mtx_lock_ex(mtx_t *mtx, mtx_link_t *link, int flags, int to)
86 {
87 	thread_t td;
88 	u_int	lock;
89 	u_int	nlock;
90 	int	error;
91 	int	isasync;
92 
93 	for (;;) {
94 		lock = mtx->mtx_lock;
95 		cpu_ccfence();
96 
97 		if (lock == 0) {
98 			nlock = MTX_EXCLUSIVE | 1;
99 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
100 				mtx->mtx_owner = curthread;
101 				cpu_sfence();
102 				link->state = MTX_LINK_ACQUIRED;
103 				error = 0;
104 				break;
105 			}
106 			continue;
107 		}
108 		if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) {
109 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
110 			nlock = lock + 1;
111 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
112 				cpu_sfence();
113 				link->state = MTX_LINK_ACQUIRED;
114 				error = 0;
115 				break;
116 			}
117 			continue;
118 		}
119 
120 		/*
121 		 * We need MTX_LINKSPIN to manipulate exlink or
122 		 * shlink.
123 		 *
124 		 * We must set MTX_EXWANTED with MTX_LINKSPIN to indicate
125 		 * pending exclusive requests.  It cannot be set as a separate
126 		 * operation prior to acquiring MTX_LINKSPIN.
127 		 *
128 		 * To avoid unnecessary cpu cache traffic we poll
129 		 * for collisions.  It is also possible that EXWANTED
130 		 * state failing the above test was spurious, so all the
131 		 * tests must be repeated if we cannot obtain LINKSPIN
132 		 * with the prior state tests intact (i.e. don't reload
133 		 * the (lock) variable here, for heaven's sake!).
134 		 */
135 		if (lock & MTX_LINKSPIN) {
136 			cpu_pause();
137 			continue;
138 		}
139 		td = curthread;
140 		nlock = lock | MTX_EXWANTED | MTX_LINKSPIN;
141 		crit_enter_quick(td);
142 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
143 			crit_exit_quick(td);
144 			continue;
145 		}
146 
147 		/*
148 		 * Check for early abort.
149 		 */
150 		if (link->state == MTX_LINK_ABORTED) {
151 			if (mtx->mtx_exlink == NULL) {
152 				atomic_clear_int(&mtx->mtx_lock,
153 						 MTX_LINKSPIN |
154 						 MTX_EXWANTED);
155 			} else {
156 				atomic_clear_int(&mtx->mtx_lock,
157 						 MTX_LINKSPIN);
158 			}
159 			crit_exit_quick(td);
160 			link->state = MTX_LINK_IDLE;
161 			error = ENOLCK;
162 			break;
163 		}
164 
165 		/*
166 		 * Add our link to the exlink list and release LINKSPIN.
167 		 */
168 		link->owner = td;
169 		link->state = MTX_LINK_LINKED_EX;
170 		if (mtx->mtx_exlink) {
171 			link->next = mtx->mtx_exlink;
172 			link->prev = link->next->prev;
173 			link->next->prev = link;
174 			link->prev->next = link;
175 		} else {
176 			link->next = link;
177 			link->prev = link;
178 			mtx->mtx_exlink = link;
179 		}
180 		isasync = (link->callback != NULL);
181 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
182 		crit_exit_quick(td);
183 
184 		/*
185 		 * If this is an asynchronous lock request, return without
186 		 * blocking and leave the link structure linked.
187 		 */
188 		if (isasync) {
189 			error = EINPROGRESS;
190 			break;
191 		}
192 
193 		/*
194 		 * Wait for lock
195 		 */
196 		error = mtx_wait_link(mtx, link, flags, to);
197 		break;
198 	}
199 	return (error);
200 }
201 
202 int
203 _mtx_lock_ex_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
204 {
205 	return(__mtx_lock_ex(mtx, link, flags, to));
206 }
207 
208 int
209 _mtx_lock_ex(mtx_t *mtx, int flags, int to)
210 {
211 	mtx_link_t link;
212 
213 	mtx_link_init(&link);
214 	return(__mtx_lock_ex(mtx, &link, flags, to));
215 }
216 
217 int
218 _mtx_lock_ex_quick(mtx_t *mtx)
219 {
220 	mtx_link_t link;
221 
222 	mtx_link_init(&link);
223 	return(__mtx_lock_ex(mtx, &link, 0, 0));
224 }
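
/*
 * Illustrative sketch (not compiled) of the asynchronous exclusive path
 * described above: the caller supplies an initialized mtx_link_t with a
 * callback.  A return of 0 means the lock was acquired immediately and
 * the callback is NOT issued; EINPROGRESS means the request was queued
 * and the callback runs later with error 0 (acquired) or ENOLCK
 * (aborted).  Assigning link->callback/link->arg directly is shown only
 * for illustration; sys/mutex2.h may provide an init helper for this.
 */
#if 0
static void
my_lock_callback(mtx_link_t *link, void *arg, int error)
{
	if (error == 0) {
		/* Exclusive lock acquired; use *arg, then mtx_unlock(). */
	} else {
		/* ENOLCK: the request was aborted via mtx_abort_link(). */
	}
}

static void
async_lock_example(mtx_t *mtx, mtx_link_t *link, void *arg)
{
	int error;

	mtx_link_init(link);
	link->callback = my_lock_callback;
	link->arg = arg;

	error = _mtx_lock_ex_link(mtx, link, 0, 0);
	if (error == 0) {
		/* Fast path: already exclusive, no callback was issued. */
	} else if (error == EINPROGRESS) {
		/*
		 * Queued.  The callback fires when the lock is granted,
		 * or with ENOLCK if we later call mtx_abort_link().
		 */
	}
}
#endif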
225 
226 /*
227  * Share-lock a mutex, block until acquired.  Recursion is allowed.
228  *
229  * Returns 0 on success, or the tsleep() return code on failure.
230  * An error can only be returned if PCATCH is specified in the flags.
231  *
232  * NOTE: Shared locks get a mass-wakeup so if the tsleep fails we
233  *	 do not have to chain the wakeup().
234  */
235 static __inline int
236 __mtx_lock_sh(mtx_t *mtx, mtx_link_t *link, int flags, int to)
237 {
238 	thread_t td;
239 	u_int	lock;
240 	u_int	nlock;
241 	int	error;
242 	int	isasync;
243 
244 	for (;;) {
245 		lock = mtx->mtx_lock;
246 		cpu_ccfence();
247 
248 		if (lock == 0) {
249 			nlock = 1;
250 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
251 				error = 0;
252 				cpu_sfence();
253 				link->state = MTX_LINK_ACQUIRED;
254 				break;
255 			}
256 			continue;
257 		}
258 		if ((lock & (MTX_EXCLUSIVE | MTX_EXWANTED)) == 0) {
259 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
260 			nlock = lock + 1;
261 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
262 				error = 0;
263 				cpu_sfence();
264 				link->state = MTX_LINK_ACQUIRED;
265 				break;
266 			}
267 			continue;
268 		}
269 
270 		/*
271 		 * We need MTX_LINKSPIN to manipulate exlink or
272 		 * shlink.
273 		 *
274 		 * We must set MTX_SHWANTED with MTX_LINKSPIN to indicate
275 		 * pending shared requests.  It cannot be set as a separate
276 		 * operation prior to acquiring MTX_LINKSPIN.
277 		 *
278 		 * To avoid unnecessary cpu cache traffic we poll
279 		 * for collisions.  It is also possible that EXWANTED
280 		 * state failing the above test was spurious, so all the
281 		 * tests must be repeated if we cannot obtain LINKSPIN
282 		 * with the prior state tests intact (i.e. don't reload
283 		 * the (lock) variable here, for heaven's sake!).
284 		 */
285 		if (lock & MTX_LINKSPIN) {
286 			cpu_pause();
287 			continue;
288 		}
289 		td = curthread;
290 		nlock = lock | MTX_SHWANTED | MTX_LINKSPIN;
291 		crit_enter_quick(td);
292 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
293 			crit_exit_quick(td);
294 			continue;
295 		}
296 
297 		/*
298 		 * Check for early abort.  Other shared lock requestors
299 		 * could have sneaked in before we set LINKSPIN so make
300 		 * sure we undo the state properly.
301 		 */
302 		if (link->state == MTX_LINK_ABORTED) {
303 			if (mtx->mtx_shlink) {
304 				atomic_clear_int(&mtx->mtx_lock,
305 						 MTX_LINKSPIN);
306 			} else {
307 				atomic_clear_int(&mtx->mtx_lock,
308 						 MTX_LINKSPIN |
309 						 MTX_SHWANTED);
310 			}
311 			crit_exit_quick(td);
312 			link->state = MTX_LINK_IDLE;
313 			error = ENOLCK;
314 			break;
315 		}
316 
317 		/*
318 		 * Add our link to the shlink list and release LINKSPIN.
319 		 */
320 		link->owner = td;
321 		link->state = MTX_LINK_LINKED_SH;
322 		if (mtx->mtx_shlink) {
323 			link->next = mtx->mtx_shlink;
324 			link->prev = link->next->prev;
325 			link->next->prev = link;
326 			link->prev->next = link;
327 		} else {
328 			link->next = link;
329 			link->prev = link;
330 			mtx->mtx_shlink = link;
331 		}
332 		isasync = (link->callback != NULL);
333 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
334 		crit_exit_quick(td);
335 
336 		/*
337 		 * If this is an asynchronous lock request, return without
338 		 * blocking and leave the link structure linked.
339 		 */
340 		if (isasync) {
341 			error = EINPROGRESS;
342 			break;
343 		}
344 
345 		/*
346 		 * Wait for lock
347 		 */
348 		error = mtx_wait_link(mtx, link, flags, to);
349 		break;
350 	}
351 	return (error);
352 }
353 
354 int
355 _mtx_lock_sh_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
356 {
357 	return(__mtx_lock_sh(mtx, link, flags, to));
358 }
359 
360 int
361 _mtx_lock_sh(mtx_t *mtx, int flags, int to)
362 {
363 	mtx_link_t link;
364 
365 	mtx_link_init(&link);
366 	return(__mtx_lock_sh(mtx, &link, flags, to));
367 }
368 
369 int
370 _mtx_lock_sh_quick(mtx_t *mtx)
371 {
372 	mtx_link_t link;
373 
374 	mtx_link_init(&link);
375 	return(__mtx_lock_sh(mtx, &link, 0, 0));
376 }
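
/*
 * Illustrative sketch (not compiled): the flags/to arguments are handed
 * to tsleep(), so PCATCH makes the wait signal-interruptible and a
 * non-zero timeout (in ticks) bounds it.  This calls _mtx_lock_ex()
 * directly; the mutex2.h wrappers are assumed to forward to it when
 * their inline fast path fails.
 */
#if 0
static int
timed_lock_example(mtx_t *mtx)
{
	int error;

	error = _mtx_lock_ex(mtx, PCATCH, 5 * hz);
	if (error)		/* EINTR, ERESTART, or EWOULDBLOCK */
		return (error);
	/* ...exclusive holder... */
	_mtx_unlock(mtx);
	return (0);
}
#endif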
377 
378 /*
379  * Get an exclusive spinlock the hard way.
380  */
381 void
382 _mtx_spinlock(mtx_t *mtx)
383 {
384 	u_int	lock;
385 	u_int	nlock;
386 	int	bb = 1;
387 	int	bo;
388 
389 	for (;;) {
390 		lock = mtx->mtx_lock;
391 		if (lock == 0) {
392 			nlock = MTX_EXCLUSIVE | 1;
393 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
394 				mtx->mtx_owner = curthread;
395 				break;
396 			}
397 		} else if ((lock & MTX_EXCLUSIVE) &&
398 			   mtx->mtx_owner == curthread) {
399 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
400 			nlock = lock + 1;
401 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
402 				break;
403 		} else {
404 			/* MWAIT here */
405 			if (bb < 1000)
406 				++bb;
407 			cpu_pause();
408 			for (bo = 0; bo < bb; ++bo)
409 				;
410 		}
411 		cpu_pause();
412 	}
413 }
414 
415 /*
416  * Attempt to acquire a spinlock; if we fail we must undo the
417  * gd->gd_spinlocks/gd->gd_curthread->td_critcount predisposition.
418  *
419  * Returns 0 on success, EAGAIN on failure.
420  */
421 int
422 _mtx_spinlock_try(mtx_t *mtx)
423 {
424 	globaldata_t gd = mycpu;
425 	u_int	lock;
426 	u_int	nlock;
427 	int	res = 0;
428 
429 	for (;;) {
430 		lock = mtx->mtx_lock;
431 		if (lock == 0) {
432 			nlock = MTX_EXCLUSIVE | 1;
433 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
434 				mtx->mtx_owner = gd->gd_curthread;
435 				break;
436 			}
437 		} else if ((lock & MTX_EXCLUSIVE) &&
438 			   mtx->mtx_owner == gd->gd_curthread) {
439 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
440 			nlock = lock + 1;
441 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
442 				break;
443 		} else {
444 			--gd->gd_spinlocks;
445 			cpu_ccfence();
446 			crit_exit_quick(gd->gd_curthread);
447 			res = EAGAIN;
448 			break;
449 		}
450 		cpu_pause();
451 	}
452 	return res;
453 }
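
/*
 * Illustrative sketch (not compiled).  The spin variants are meant for
 * very short critical sections that never block.  mtx_spinlock_try() and
 * mtx_spinunlock() are assumed to be sys/mutex2.h inlines; the try
 * wrapper is assumed to predispose gd_spinlocks and the critical section
 * before calling _mtx_spinlock_try(), which undoes them on failure.
 */
#if 0
static void
spin_example(mtx_t *mtx, int *counter)
{
	if (mtx_spinlock_try(mtx) == 0) {
		++*counter;			/* very short, non-blocking */
		mtx_spinunlock(mtx);
	} else {
		/* EAGAIN: contended, retry later or use a blocking lock */
	}
}
#endif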
454 
455 #if 0
456 
457 void
458 _mtx_spinlock_sh(mtx_t *mtx)
459 {
460 	u_int	lock;
461 	u_int	nlock;
462 	int	bb = 1;
463 	int	bo;
464 
465 	for (;;) {
466 		lock = mtx->mtx_lock;
467 		if ((lock & MTX_EXCLUSIVE) == 0) {
468 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
469 			nlock = lock + 1;
470 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
471 				break;
472 		} else {
473 			/* MWAIT here */
474 			if (bb < 1000)
475 				++bb;
476 			cpu_pause();
477 			for (bo = 0; bo < bb; ++bo)
478 				;
479 		}
480 		cpu_pause();
481 	}
482 }
483 
484 #endif
485 
486 int
487 _mtx_lock_ex_try(mtx_t *mtx)
488 {
489 	u_int	lock;
490 	u_int	nlock;
491 	int	error;
492 
493 	for (;;) {
494 		lock = mtx->mtx_lock;
495 		if (lock == 0) {
496 			nlock = MTX_EXCLUSIVE | 1;
497 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
498 				mtx->mtx_owner = curthread;
499 				error = 0;
500 				break;
501 			}
502 		} else if ((lock & MTX_EXCLUSIVE) &&
503 			   mtx->mtx_owner == curthread) {
504 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
505 			nlock = lock + 1;
506 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
507 				error = 0;
508 				break;
509 			}
510 		} else {
511 			error = EAGAIN;
512 			break;
513 		}
514 		cpu_pause();
515 	}
516 	return (error);
517 }
518 
519 int
520 _mtx_lock_sh_try(mtx_t *mtx)
521 {
522 	u_int	lock;
523 	u_int	nlock;
524 	int	error = 0;
525 
526 	for (;;) {
527 		lock = mtx->mtx_lock;
528 		if ((lock & MTX_EXCLUSIVE) == 0) {
529 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
530 			nlock = lock + 1;
531 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
532 				break;
533 		} else {
534 			error = EAGAIN;
535 			break;
536 		}
537 		cpu_pause();
538 	}
539 	return (error);
540 }
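
/*
 * Illustrative sketch (not compiled): the try variants above return 0 or
 * EAGAIN and never block, so a caller can opportunistically grab the
 * lock and fall back to a blocking acquisition (or defer the work) when
 * the lock is contended.
 */
#if 0
static void
try_lock_example(mtx_t *mtx)
{
	if (_mtx_lock_ex_try(mtx) == 0) {
		/* acquired without blocking */
		_mtx_unlock(mtx);
		return;
	}

	/* EAGAIN: contended; fall back to a blocking exclusive lock */
	_mtx_lock_ex_quick(mtx);
	/* ... */
	_mtx_unlock(mtx);
}
#endif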
541 
542 /*
543  * If the lock is held exclusively it must be owned by the caller.  If the
544  * lock is already a shared lock this operation is a NOP.  A panic will
545  * occur if the lock is not held either shared or exclusive.
546  *
547  * The exclusive count is converted to a shared count.
548  */
549 void
550 _mtx_downgrade(mtx_t *mtx)
551 {
552 	u_int	lock;
553 	u_int	nlock;
554 
555 	for (;;) {
556 		lock = mtx->mtx_lock;
557 		cpu_ccfence();
558 
559 		/*
560 		 * NOP if already shared.
561 		 */
562 		if ((lock & MTX_EXCLUSIVE) == 0) {
563 			KKASSERT((lock & MTX_MASK) > 0);
564 			break;
565 		}
566 
567 		/*
568 		 * Transfer count to shared.  Any additional pending shared
569 		 * waiters must be woken up.
570 		 */
571 		if (lock & MTX_SHWANTED) {
572 			if (mtx_chain_link_sh(mtx, lock))
573 				break;
574 			/* retry */
575 		} else {
576 			nlock = lock & ~MTX_EXCLUSIVE;
577 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
578 				break;
579 			/* retry */
580 		}
581 		cpu_pause();
582 	}
583 }
584 
585 /*
586  * Upgrade a shared lock to an exclusive lock.  The upgrade will fail if
587  * the shared lock has a count other than 1.  Optimize the most likely case
588  * but note that a single cmpset can fail due to WANTED races.
589  *
590  * If the lock is held exclusively it must be owned by the caller and
591  * this function will simply return without doing anything.   A panic will
592  * occur if the lock is held exclusively by someone other than the caller.
593  *
594  * Returns 0 on success, EDEADLK on failure.
595  */
596 int
597 _mtx_upgrade_try(mtx_t *mtx)
598 {
599 	u_int	lock;
600 	u_int	nlock;
601 	int	error = 0;
602 
603 	for (;;) {
604 		lock = mtx->mtx_lock;
605 		cpu_ccfence();
606 
607 		if ((lock & ~MTX_EXWANTED) == 1) {
608 			nlock = lock | MTX_EXCLUSIVE;
609 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
610 				mtx->mtx_owner = curthread;
611 				break;
612 			}
613 		} else if (lock & MTX_EXCLUSIVE) {
614 			KKASSERT(mtx->mtx_owner == curthread);
615 			break;
616 		} else {
617 			error = EDEADLK;
618 			break;
619 		}
620 		cpu_pause();
621 	}
622 	return (error);
623 }
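
/*
 * Illustrative sketch (not compiled) of a read-mostly pattern built on
 * the upgrade/downgrade primitives above: start shared, try to upgrade
 * in place when a modification turns out to be needed, and on EDEADLK
 * (shared count not exactly 1) drop the lock, reacquire it exclusively,
 * and re-validate before modifying.
 */
#if 0
static void
upgrade_example(mtx_t *mtx, int *value)
{
	_mtx_lock_sh_quick(mtx);
	if (*value == 0) {			/* decide we must modify */
		if (_mtx_upgrade_try(mtx) != 0) {
			/* EDEADLK: cannot upgrade in place */
			_mtx_unlock(mtx);
			_mtx_lock_ex_quick(mtx);
		}
		if (*value == 0)		/* re-check after any relock */
			*value = 1;
		_mtx_downgrade(mtx);		/* back to shared */
	}
	_mtx_unlock(mtx);
}
#endif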
624 
625 /*
626  * Unlock a lock.  The caller must hold the lock either shared or exclusive.
627  *
628  * On the last release we handle any pending chains.
629  */
630 void
631 _mtx_unlock(mtx_t *mtx)
632 {
633 	thread_t td __debugvar = curthread;
634 	u_int	lock;
635 	u_int	nlock;
636 
637 	for (;;) {
638 		lock = mtx->mtx_lock;
639 		cpu_ccfence();
640 
641 		switch(lock) {
642 		case MTX_EXCLUSIVE | 1:
643 			/*
644 			 * Last release, exclusive lock.
645 			 * No exclusive or shared requests pending.
646 			 */
647 			KKASSERT(mtx->mtx_owner == td ||
648 				 mtx->mtx_owner == NULL);
649 			mtx->mtx_owner = NULL;
650 			nlock = 0;
651 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
652 				goto done;
653 			break;
654 		case MTX_EXCLUSIVE | MTX_EXWANTED | 1:
655 		case MTX_EXCLUSIVE | MTX_EXWANTED | MTX_SHWANTED | 1:
656 			/*
657 			 * Last release, exclusive lock.
658 			 * Exclusive requests pending.
659 			 * Exclusive requests have priority over shared reqs.
660 			 */
661 			KKASSERT(mtx->mtx_owner == td ||
662 				 mtx->mtx_owner == NULL);
663 			mtx->mtx_owner = NULL;
664 			if (mtx_chain_link_ex(mtx, lock))
665 				goto done;
666 			break;
667 		case MTX_EXCLUSIVE | MTX_SHWANTED | 1:
668 			/*
669 			 * Last release, exclusive lock.
670 			 *
671 			 * Shared requests are pending.  Transfer our count (1)
672 			 * to the first shared request, wakeup all shared reqs.
673 			 */
674 			KKASSERT(mtx->mtx_owner == td ||
675 				 mtx->mtx_owner == NULL);
676 			mtx->mtx_owner = NULL;
677 			if (mtx_chain_link_sh(mtx, lock))
678 				goto done;
679 			break;
680 		case 1:
681 			/*
682 			 * Last release, shared lock.
683 			 * No exclusive or shared requests pending.
684 			 */
685 			nlock = 0;
686 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
687 				goto done;
688 			break;
689 		case MTX_EXWANTED | 1:
690 		case MTX_EXWANTED | MTX_SHWANTED | 1:
691 			/*
692 			 * Last release, shared lock.
693 			 *
694 			 * Exclusive requests are pending.  Upgrade this
695 			 * final shared lock to exclusive and transfer our
696 			 * count (1) to the next exclusive request.
697 			 *
698 			 * Exclusive requests have priority over shared reqs.
699 			 */
700 			if (mtx_chain_link_ex(mtx, lock))
701 				goto done;
702 			break;
703 		case MTX_SHWANTED | 1:
704 			/*
705 			 * Last release, shared lock.
706 			 * Shared requests pending.
707 			 */
708 			if (mtx_chain_link_sh(mtx, lock))
709 				goto done;
710 			break;
711 		default:
712 			/*
713 			 * We have to loop if this is the last release but
714 			 * someone is fiddling with LINKSPIN.
715 			 */
716 			if ((lock & MTX_MASK) == 1) {
717 				KKASSERT(lock & MTX_LINKSPIN);
718 				break;
719 			}
720 
721 			/*
722 			 * Not the last release (shared or exclusive)
723 			 */
724 			nlock = lock - 1;
725 			KKASSERT((nlock & MTX_MASK) != MTX_MASK);
726 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
727 				goto done;
728 			break;
729 		}
730 		/* loop try again */
731 		cpu_pause();
732 	}
733 done:
734 	;
735 }
736 
737 /*
738  * Chain pending links.  Called on the last release of an exclusive or
739  * shared lock when the appropriate WANTED bit is set.  mtx_lock old state
740  * is passed in with the count left at 1, which we can inherit, and other
741  * bits which we must adjust in a single atomic operation.
742  *
743  * Return non-zero on success, 0 if caller needs to retry.
744  *
745  * NOTE: It's ok if MTX_EXWANTED is in an indeterminate state while we are
746  *	 acquiring LINKSPIN as all other cases will also need to acquire
747  *	 LINKSPIN when handling the EXWANTED case.
748  */
749 static int
750 mtx_chain_link_ex(mtx_t *mtx, u_int olock)
751 {
752 	thread_t td = curthread;
753 	mtx_link_t *link;
754 	u_int	nlock;
755 
756 	olock &= ~MTX_LINKSPIN;
757 	nlock = olock | MTX_LINKSPIN | MTX_EXCLUSIVE;	/* upgrade if necessary */
758 	crit_enter_quick(td);
759 	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
760 		link = mtx->mtx_exlink;
761 		KKASSERT(link != NULL);
762 		if (link->next == link) {
763 			mtx->mtx_exlink = NULL;
764 			nlock = MTX_LINKSPIN | MTX_EXWANTED;	/* to clear */
765 		} else {
766 			mtx->mtx_exlink = link->next;
767 			link->next->prev = link->prev;
768 			link->prev->next = link->next;
769 			nlock = MTX_LINKSPIN;			/* to clear */
770 		}
771 		KKASSERT(link->state == MTX_LINK_LINKED_EX);
772 		mtx->mtx_owner = link->owner;
773 		cpu_sfence();
774 
775 		/*
776 		 * WARNING! The callback can only be safely
777 		 *	    made with LINKSPIN still held
778 		 *	    and in a critical section.
779 		 *
780 		 * WARNING! The link can go away after the
781 		 *	    state is set, or after the
782 		 *	    callback.
783 		 */
784 		if (link->callback) {
785 			link->state = MTX_LINK_CALLEDBACK;
786 			link->callback(link, link->arg, 0);
787 		} else {
788 			link->state = MTX_LINK_ACQUIRED;
789 			wakeup(link);
790 		}
791 		atomic_clear_int(&mtx->mtx_lock, nlock);
792 		crit_exit_quick(td);
793 		return 1;
794 	}
795 	/* retry */
796 	crit_exit_quick(td);
797 
798 	return 0;
799 }
800 
801 /*
802  * Flush waiting shared locks.  The lock's prior state is passed in and must
803  * be adjusted atomically only if it matches and LINKSPIN is not set.
804  *
805  * IMPORTANT! The caller has left one active count on the lock for us to
806  *	      consume.  We will apply this to the first link, but must add
807  *	      additional counts for any other links.
808  */
809 static int
810 mtx_chain_link_sh(mtx_t *mtx, u_int olock)
811 {
812 	thread_t td = curthread;
813 	mtx_link_t *link;
814 	u_int	addcount;
815 	u_int	nlock;
816 
817 	olock &= ~MTX_LINKSPIN;
818 	nlock = olock | MTX_LINKSPIN;
819 	nlock &= ~MTX_EXCLUSIVE;
820 	crit_enter_quick(td);
821 	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
822 		/*
823 		 * It should not be possible for SHWANTED to be set without
824 		 * any links pending.
825 		 */
826 		KKASSERT(mtx->mtx_shlink != NULL);
827 
828 		/*
829 		 * We have to process the count for all shared locks before
830 		 * we process any of the links.  Count the additional shared
831 		 * locks beyond the first link (which is already accounted
832 		 * for) and associate the full count with the lock
833 		 * immediately.
834 		 */
835 		addcount = 0;
836 		for (link = mtx->mtx_shlink->next; link != mtx->mtx_shlink;
837 		     link = link->next) {
838 			++addcount;
839 		}
840 		if (addcount > 0)
841 			atomic_add_int(&mtx->mtx_lock, addcount);
842 
843 		/*
844 		 * We can wakeup all waiting shared locks.
845 		 */
846 		while ((link = mtx->mtx_shlink) != NULL) {
847 			KKASSERT(link->state == MTX_LINK_LINKED_SH);
848 			if (link->next == link) {
849 				mtx->mtx_shlink = NULL;
850 			} else {
851 				mtx->mtx_shlink = link->next;
852 				link->next->prev = link->prev;
853 				link->prev->next = link->next;
854 			}
855 			link->next = NULL;
856 			link->prev = NULL;
857 			cpu_sfence();
858 			if (link->callback) {
859 				link->state = MTX_LINK_CALLEDBACK;
860 				link->callback(link, link->arg, 0);
861 			} else {
862 				cpu_sfence();
863 				link->state = MTX_LINK_ACQUIRED;
864 				wakeup(link);
865 			}
866 		}
867 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN |
868 						 MTX_SHWANTED);
869 		crit_exit_quick(td);
870 		return 1;
871 	}
872 	/* retry */
873 	crit_exit_quick(td);
874 
875 	return 0;
876 }
877 
878 /*
879  * Delete a link structure after tsleep has failed.  This code is not
880  * in the critical path as most exclusive waits are chained.
881  */
882 static
883 void
884 mtx_delete_link(mtx_t *mtx, mtx_link_t *link)
885 {
886 	thread_t td = curthread;
887 	u_int	lock;
888 	u_int	nlock;
889 
890 	/*
891 	 * Acquire MTX_LINKSPIN.
892 	 *
893 	 * Do not use cmpxchg to wait for LINKSPIN to clear as this might
894 	 * result in too much cpu cache traffic.
895 	 */
896 	crit_enter_quick(td);
897 	for (;;) {
898 		lock = mtx->mtx_lock;
899 		if (lock & MTX_LINKSPIN) {
900 			cpu_pause();
901 			continue;
902 		}
903 		nlock = lock | MTX_LINKSPIN;
904 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
905 			break;
906 		cpu_pause();
907 	}
908 
909 	/*
910 	 * Delete the link and release LINKSPIN.
911 	 */
912 	nlock = MTX_LINKSPIN;	/* to clear */
913 
914 	switch(link->state) {
915 	case MTX_LINK_LINKED_EX:
916 		if (link->next == link) {
917 			mtx->mtx_exlink = NULL;
918 			nlock |= MTX_EXWANTED;	/* to clear */
919 		} else {
920 			mtx->mtx_exlink = link->next;
921 			link->next->prev = link->prev;
922 			link->prev->next = link->next;
923 		}
924 		break;
925 	case MTX_LINK_LINKED_SH:
926 		if (link->next == link) {
927 			mtx->mtx_shlink = NULL;
928 			nlock |= MTX_SHWANTED;	/* to clear */
929 		} else {
930 			mtx->mtx_shlink = link->next;
931 			link->next->prev = link->prev;
932 			link->prev->next = link->next;
933 		}
934 		break;
935 	default:
936 		/* no change */
937 		break;
938 	}
939 	atomic_clear_int(&mtx->mtx_lock, nlock);
940 	crit_exit_quick(td);
941 }
942 
943 /*
944  * Wait for async lock completion or abort.  Returns ENOLCK if an abort
945  * occurred.
946  */
947 int
948 mtx_wait_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
949 {
950 	indefinite_info_t info;
951 	int error;
952 
953 	indefinite_init(&info, mtx, mtx->mtx_ident, 1,
954 			((link->state & MTX_LINK_LINKED_SH) ? 'm' : 'M'));
955 
956 	/*
957 	 * Sleep.  Handle false wakeups, interruptions, etc.
958 	 * The link may also have been aborted.  The LINKED
959 	 * bit was set by this cpu so we can test it without
960 	 * fences.
961 	 */
962 	error = 0;
963 	while (link->state & MTX_LINK_LINKED) {
964 		tsleep_interlock(link, 0);
965 		cpu_lfence();
966 		if (link->state & MTX_LINK_LINKED) {
967 			error = tsleep(link, flags | PINTERLOCKED,
968 				       mtx->mtx_ident, to);
969 			if (error)
970 				break;
971 		}
972 		if ((mtx->mtx_flags & MTXF_NOCOLLSTATS) == 0)
973 			indefinite_check(&info);
974 	}
975 
976 	/*
977 	 * We need at least a lfence (load fence) to ensure our cpu does not
978 	 * reorder loads (of data outside the lock structure) prior to the
979 	 * remote cpu's release, since the above test may have run without
980 	 * any atomic interactions.
981 	 *
982 	 * If we do not do this then state updated by the other cpu before
983 	 * releasing its lock may not be read cleanly by our cpu when this
984 	 * function returns.  Even though the other cpu ordered its stores,
985 	 * our loads can still be out of order.
986 	 */
987 	cpu_mfence();
988 
989 	/*
990 	 * We are done, make sure the link structure is unlinked.
991 	 * It may still be on the list due to e.g. EINTR or
992 	 * EWOULDBLOCK.
993 	 *
994 	 * It is possible for the tsleep to race an ABORT and cause
995 	 * error to be 0.
996 	 *
997 	 * The tsleep() can be woken up for numerous reasons and error
998 	 * might be zero in situations where we intend to return an error.
999 	 *
1000 	 * (This is the synchronous case so state cannot be CALLEDBACK)
1001 	 */
1002 	switch(link->state) {
1003 	case MTX_LINK_ACQUIRED:
1004 	case MTX_LINK_CALLEDBACK:
1005 		error = 0;
1006 		break;
1007 	case MTX_LINK_ABORTED:
1008 		error = ENOLCK;
1009 		break;
1010 	case MTX_LINK_LINKED_EX:
1011 	case MTX_LINK_LINKED_SH:
1012 		mtx_delete_link(mtx, link);
1013 		/* fall through */
1014 	default:
1015 		if (error == 0)
1016 			error = EWOULDBLOCK;
1017 		break;
1018 	}
1019 
1020 	/*
1021 	 * Clear state on status returned.
1022 	 */
1023 	link->state = MTX_LINK_IDLE;
1024 
1025 	if ((mtx->mtx_flags & MTXF_NOCOLLSTATS) == 0)
1026 		indefinite_done(&info);
1027 
1028 	return error;
1029 }
1030 
1031 /*
1032  * Abort a mutex locking operation, causing mtx_lock_ex_link() to
1033  * return ENOLCK.  This may be called at any time after the mtx_link
1034  * is initialized or the status from a previous lock has been
1035  * returned.  If called prior to the next (non-try) lock attempt, the
1036  * next lock attempt using this link structure will abort instantly.
1037  *
1038  * Caller must still wait for the operation to complete, either from a
1039  * blocking call that is still in progress or by calling mtx_wait_link().
1040  *
1041  * If an asynchronous lock request is possibly in-progress, the caller
1042  * should call mtx_wait_link() synchronously.  Note that the asynchronous
1043  * lock callback will NOT be called if a successful abort occurred. XXX
1044  */
1045 void
1046 mtx_abort_link(mtx_t *mtx, mtx_link_t *link)
1047 {
1048 	thread_t td = curthread;
1049 	u_int	lock;
1050 	u_int	nlock;
1051 
1052 	/*
1053 	 * Acquire MTX_LINKSPIN
1054 	 */
1055 	crit_enter_quick(td);
1056 	for (;;) {
1057 		lock = mtx->mtx_lock;
1058 		if (lock & MTX_LINKSPIN) {
1059 			cpu_pause();
1060 			continue;
1061 		}
1062 		nlock = lock | MTX_LINKSPIN;
1063 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
1064 			break;
1065 		cpu_pause();
1066 	}
1067 
1068 	/*
1069 	 * Do the abort.
1070 	 *
1071 	 * WARNING! Link structure can disappear once link->state is set.
1072 	 */
1073 	nlock = MTX_LINKSPIN;	/* to clear */
1074 
1075 	switch(link->state) {
1076 	case MTX_LINK_IDLE:
1077 		/*
1078 		 * Link not started yet
1079 		 */
1080 		link->state = MTX_LINK_ABORTED;
1081 		break;
1082 	case MTX_LINK_LINKED_EX:
1083 		/*
1084 		 * de-link, mark aborted, and potentially wakeup the thread
1085 		 * or issue the callback.
1086 		 */
1087 		if (link->next == link) {
1088 			if (mtx->mtx_exlink == link) {
1089 				mtx->mtx_exlink = NULL;
1090 				nlock |= MTX_EXWANTED;	/* to clear */
1091 			}
1092 		} else {
1093 			if (mtx->mtx_exlink == link)
1094 				mtx->mtx_exlink = link->next;
1095 			link->next->prev = link->prev;
1096 			link->prev->next = link->next;
1097 		}
1098 
1099 		/*
1100 		 * When aborting the async callback is still made.  We must
1101 		 * not set the link status to ABORTED in the callback case
1102 		 * since there is nothing else to clear its status if the
1103 		 * link is reused.
1104 		 */
1105 		if (link->callback) {
1106 			link->state = MTX_LINK_CALLEDBACK;
1107 			link->callback(link, link->arg, ENOLCK);
1108 		} else {
1109 			link->state = MTX_LINK_ABORTED;
1110 			wakeup(link);
1111 		}
1112 		break;
1113 	case MTX_LINK_LINKED_SH:
1114 		/*
1115 		 * de-link, mark aborted, and potentially wakeup the thread
1116 		 * or issue the callback.
1117 		 */
1118 		if (link->next == link) {
1119 			if (mtx->mtx_shlink == link) {
1120 				mtx->mtx_shlink = NULL;
1121 				nlock |= MTX_SHWANTED;	/* to clear */
1122 			}
1123 		} else {
1124 			if (mtx->mtx_shlink == link)
1125 				mtx->mtx_shlink = link->next;
1126 			link->next->prev = link->prev;
1127 			link->prev->next = link->next;
1128 		}
1129 
1130 		/*
1131 		 * When aborting the async callback is still made.  We must
1132 		 * not set the link status to ABORTED in the callback case
1133 		 * since there is nothing else to clear its status if the
1134 		 * link is reused.
1135 		 */
1136 		if (link->callback) {
1137 			link->state = MTX_LINK_CALLEDBACK;
1138 			link->callback(link, link->arg, ENOLCK);
1139 		} else {
1140 			link->state = MTX_LINK_ABORTED;
1141 			wakeup(link);
1142 		}
1143 		break;
1144 	case MTX_LINK_ACQUIRED:
1145 	case MTX_LINK_CALLEDBACK:
1146 		/*
1147 		 * Too late, the lock was acquired.  Let it complete.
1148 		 */
1149 		break;
1150 	default:
1151 		/*
1152 		 * link already aborted, do nothing.
1153 		 */
1154 		break;
1155 	}
1156 	atomic_clear_int(&mtx->mtx_lock, nlock);
1157 	crit_exit_quick(td);
1158 }
1159