/* xref: /dragonfly/sys/kern/kern_mutex.c (revision 1fe7e945) */
/*
 * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Implement fast persistent locks based on atomic_cmpset_int() with
 * semantics similar to lockmgr locks but faster and taking up much less
 * space.  Taken from HAMMER's lock implementation.
 *
 * These are meant to complement our LWKT tokens.  Tokens are only held
 * while the thread is running.  Mutexes can be held across blocking
 * conditions.
 *
 * - Exclusive priority over shared to prevent SMP starvation.
 * - locks can be aborted (async callback, if any, will be made w/ENOLCK).
 * - locks can be asynchronous.
 * - synchronous fast path if no blocking occurs (async callback is not
 *   made in this case).
 *
 * Generally speaking any caller-supplied link state must be properly
 * initialized before use.
 *
 * Most of the support is in sys/mutex[2].h.  We mostly provide backoff
 * functions here.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/thread.h>

#include <machine/cpufunc.h>

#include <sys/thread2.h>
#include <sys/mutex2.h>

static int mtx_chain_link_ex(mtx_t *mtx, u_int olock);
static int mtx_chain_link_sh(mtx_t *mtx, u_int olock);
static void mtx_delete_link(mtx_t *mtx, mtx_link_t *link);

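/*
 * Illustrative usage sketch only (not part of the build).  The wrapper
 * names and signatures used here (mtx_init(), mtx_lock_ex_quick(),
 * mtx_lock_sh_quick(), mtx_unlock()) are assumed to match the inlines in
 * sys/mutex2.h that front the functions in this file; consult that header
 * for the authoritative API.
 */
#if 0
static mtx_t example_mtx;

static void
example_basic_usage(void)
{
	mtx_init(&example_mtx, "exmtx");

	/* Exclusive lock, blocking, no timeout or signal catching. */
	mtx_lock_ex_quick(&example_mtx);
	/* ... modify protected state; may block/tsleep while held ... */
	mtx_unlock(&example_mtx);

	/* Shared lock for readers; recursion is allowed. */
	mtx_lock_sh_quick(&example_mtx);
	/* ... read protected state ... */
	mtx_unlock(&example_mtx);
}
#endif
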
/*
 * Exclusive-lock a mutex, blocking until acquired unless the link is async.
 * Recursion is allowed.
 *
 * Returns 0 on success, the tsleep() return code on failure, or EINPROGRESS
 * if async.  If immediately successful, an async exclusive lock returns 0
 * and does not issue the async callback or link the link structure.  The
 * caller must handle this case (typically it is the optimal code path).
 *
 * A tsleep() error can only be returned if PCATCH is specified in the flags.
 */
static __inline int
__mtx_lock_ex(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
	thread_t td;
	u_int	lock;
	u_int	nlock;
	int	error;
	int	isasync;

	for (;;) {
		lock = mtx->mtx_lock;
		cpu_ccfence();

		if (lock == 0) {
			nlock = MTX_EXCLUSIVE | 1;
			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
				mtx->mtx_owner = curthread;
				cpu_sfence();
				link->state = MTX_LINK_ACQUIRED;
				error = 0;
				break;
			}
			continue;
		}
		if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) {
			KKASSERT((lock & MTX_MASK) != MTX_MASK);
			nlock = lock + 1;
			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
				cpu_sfence();
				link->state = MTX_LINK_ACQUIRED;
				error = 0;
				break;
			}
			continue;
		}

		/*
		 * We need MTX_LINKSPIN to manipulate exlink or
		 * shlink.
		 *
		 * We must set MTX_EXWANTED with MTX_LINKSPIN to indicate
		 * pending exclusive requests.  It cannot be set as a separate
		 * operation prior to acquiring MTX_LINKSPIN.
		 *
		 * To avoid unnecessary cpu cache traffic we poll
		 * for collisions.  It is also possible that EXWANTED
		 * state failing the above test was spurious, so all the
		 * tests must be repeated if we cannot obtain LINKSPIN
		 * with the prior state tests intact (i.e. don't reload
		 * the (lock) variable here, for heaven's sake!).
		 */
		if (lock & MTX_LINKSPIN) {
			cpu_pause();
			continue;
		}
		td = curthread;
		nlock = lock | MTX_EXWANTED | MTX_LINKSPIN;
		crit_enter_raw(td);
		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
			crit_exit_raw(td);
			continue;
		}

		/*
		 * Check for early abort.
		 */
		if (link->state == MTX_LINK_ABORTED) {
			if (mtx->mtx_exlink == NULL) {
				atomic_clear_int(&mtx->mtx_lock,
						 MTX_LINKSPIN |
						 MTX_EXWANTED);
			} else {
				atomic_clear_int(&mtx->mtx_lock,
						 MTX_LINKSPIN);
			}
			crit_exit_raw(td);
			link->state = MTX_LINK_IDLE;
			error = ENOLCK;
			break;
		}

		/*
		 * Add our link to the exlink list and release LINKSPIN.
		 */
		link->owner = td;
		link->state = MTX_LINK_LINKED_EX;
		if (mtx->mtx_exlink) {
			link->next = mtx->mtx_exlink;
			link->prev = link->next->prev;
			link->next->prev = link;
			link->prev->next = link;
		} else {
			link->next = link;
			link->prev = link;
			mtx->mtx_exlink = link;
		}
		isasync = (link->callback != NULL);
		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
		crit_exit_raw(td);

		/*
		 * If this is an asynchronous lock request, return
		 * without blocking and leave the link structure linked.
		 */
		if (isasync) {
			error = EINPROGRESS;
			break;
		}

		/*
		 * Wait for lock
		 */
		error = mtx_wait_link(mtx, link, flags, to);
		break;
	}
	return (error);
}

int
_mtx_lock_ex_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
	return(__mtx_lock_ex(mtx, link, flags, to));
}

int
_mtx_lock_ex(mtx_t *mtx, int flags, int to)
{
	mtx_link_t link;

	mtx_link_init(&link);
	return(__mtx_lock_ex(mtx, &link, flags, to));
}

int
_mtx_lock_ex_quick(mtx_t *mtx)
{
	mtx_link_t link;

	mtx_link_init(&link);
	return(__mtx_lock_ex(mtx, &link, 0, 0));
}

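/*
 * Illustrative sketch of an asynchronous exclusive lock request (not part
 * of the build).  mtx_lock_ex_link() is the entry point referenced by the
 * abort logic later in this file; the direct assignment of link->callback
 * and link->arg below is for illustration only and assumes no dedicated
 * async init helper -- check sys/mutex2.h for the preferred setup path.
 */
#if 0
static void
example_acquired_cb(mtx_link_t *link, void *arg, int error)
{
	/* error is 0 on acquisition, ENOLCK if the request was aborted */
	if (error == 0) {
		/* ... the lock is now held exclusively by the requestor ... */
	}
}

static void
example_async_request(mtx_t *mtx, mtx_link_t *link)
{
	int error;

	mtx_link_init(link);
	link->callback = example_acquired_cb;
	link->arg = NULL;

	error = mtx_lock_ex_link(mtx, link, 0, 0);
	if (error == 0) {
		/* Fast path: acquired immediately, callback NOT made. */
	} else if (error == EINPROGRESS) {
		/* Queued; the callback runs when the lock chains to us. */
	}
}
#endif
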
/*
 * Share-lock a mutex, blocking until acquired.  Recursion is allowed.
 *
 * Returns 0 on success, or the tsleep() return code on failure.
 * An error can only be returned if PCATCH is specified in the flags.
 *
 * NOTE: Shared locks get a mass-wakeup, so if the tsleep fails we
 *	 do not have to chain the wakeup().
 */
static __inline int
__mtx_lock_sh(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
	thread_t td;
	u_int	lock;
	u_int	nlock;
	int	error;
	int	isasync;

	for (;;) {
		lock = mtx->mtx_lock;
		cpu_ccfence();

		if (lock == 0) {
			nlock = 1;
			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
				error = 0;
				cpu_sfence();
				link->state = MTX_LINK_ACQUIRED;
				break;
			}
			continue;
		}
		if ((lock & (MTX_EXCLUSIVE | MTX_EXWANTED)) == 0) {
			KKASSERT((lock & MTX_MASK) != MTX_MASK);
			nlock = lock + 1;
			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
				error = 0;
				cpu_sfence();
				link->state = MTX_LINK_ACQUIRED;
				break;
			}
			continue;
		}

		/*
		 * We need MTX_LINKSPIN to manipulate exlink or
		 * shlink.
		 *
		 * We must set MTX_SHWANTED with MTX_LINKSPIN to indicate
		 * pending shared requests.  It cannot be set as a separate
		 * operation prior to acquiring MTX_LINKSPIN.
		 *
		 * To avoid unnecessary cpu cache traffic we poll
		 * for collisions.  It is also possible that EXWANTED
		 * state failing the above test was spurious, so all the
		 * tests must be repeated if we cannot obtain LINKSPIN
		 * with the prior state tests intact (i.e. don't reload
		 * the (lock) variable here, for heaven's sake!).
		 */
		if (lock & MTX_LINKSPIN) {
			cpu_pause();
			continue;
		}
		td = curthread;
		nlock = lock | MTX_SHWANTED | MTX_LINKSPIN;
		crit_enter_raw(td);
		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
			crit_exit_raw(td);
			continue;
		}

		/*
		 * Check for early abort.  Other shared lock requestors
		 * could have sneaked in before we set LINKSPIN so make
		 * sure we undo the state properly.
		 */
		if (link->state == MTX_LINK_ABORTED) {
			if (mtx->mtx_shlink) {
				atomic_clear_int(&mtx->mtx_lock,
						 MTX_LINKSPIN);
			} else {
				atomic_clear_int(&mtx->mtx_lock,
						 MTX_LINKSPIN |
						 MTX_SHWANTED);
			}
			crit_exit_raw(td);
			link->state = MTX_LINK_IDLE;
			error = ENOLCK;
			break;
		}

		/*
		 * Add our link to the shlink list and release LINKSPIN.
		 */
		link->owner = td;
		link->state = MTX_LINK_LINKED_SH;
		if (mtx->mtx_shlink) {
			link->next = mtx->mtx_shlink;
			link->prev = link->next->prev;
			link->next->prev = link;
			link->prev->next = link;
		} else {
			link->next = link;
			link->prev = link;
			mtx->mtx_shlink = link;
		}
		isasync = (link->callback != NULL);
		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
		crit_exit_raw(td);

		/*
		 * If this is an asynchronous lock request, return
		 * without blocking and leave the link structure linked.
		 */
		if (isasync) {
			error = EINPROGRESS;
			break;
		}

		/*
		 * Wait for lock
		 */
		error = mtx_wait_link(mtx, link, flags, to);
		break;
	}
	return (error);
}

int
_mtx_lock_sh_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
	return(__mtx_lock_sh(mtx, link, flags, to));
}

int
_mtx_lock_sh(mtx_t *mtx, int flags, int to)
{
	mtx_link_t link;

	mtx_link_init(&link);
	return(__mtx_lock_sh(mtx, &link, flags, to));
}

int
_mtx_lock_sh_quick(mtx_t *mtx)
{
	mtx_link_t link;

	mtx_link_init(&link);
	return(__mtx_lock_sh(mtx, &link, 0, 0));
}

/*
 * Get an exclusive spinlock the hard way.
 */
void
_mtx_spinlock(mtx_t *mtx)
{
	u_int	lock;
	u_int	nlock;
	int	bb = 1;
	int	bo;

	for (;;) {
		lock = mtx->mtx_lock;
		if (lock == 0) {
			nlock = MTX_EXCLUSIVE | 1;
			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
				mtx->mtx_owner = curthread;
				break;
			}
		} else if ((lock & MTX_EXCLUSIVE) &&
			   mtx->mtx_owner == curthread) {
			KKASSERT((lock & MTX_MASK) != MTX_MASK);
			nlock = lock + 1;
			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
				break;
		} else {
			/* MWAIT here */
			if (bb < 1000)
				++bb;
			cpu_pause();
			for (bo = 0; bo < bb; ++bo)
				;
		}
		cpu_pause();
	}
}

/*
 * Attempt to acquire a spinlock; if we fail we must undo the
 * gd->gd_spinlocks / gd->gd_curthread->td_critcount predisposition.
 *
 * Returns 0 on success, EAGAIN on failure.
 */
int
_mtx_spinlock_try(mtx_t *mtx)
{
	globaldata_t gd = mycpu;
	u_int	lock;
	u_int	nlock;
	int	res = 0;

	for (;;) {
		lock = mtx->mtx_lock;
		if (lock == 0) {
			nlock = MTX_EXCLUSIVE | 1;
			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
				mtx->mtx_owner = gd->gd_curthread;
				break;
			}
		} else if ((lock & MTX_EXCLUSIVE) &&
			   mtx->mtx_owner == gd->gd_curthread) {
			KKASSERT((lock & MTX_MASK) != MTX_MASK);
			nlock = lock + 1;
			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
				break;
		} else {
			--gd->gd_spinlocks;
			cpu_ccfence();
			crit_exit_raw(gd->gd_curthread);
			res = EAGAIN;
			break;
		}
		cpu_pause();
	}
	return res;
}

#if 0

void
_mtx_spinlock_sh(mtx_t *mtx)
{
	u_int	lock;
	u_int	nlock;
	int	bb = 1;
	int	bo;

	for (;;) {
		lock = mtx->mtx_lock;
		if ((lock & MTX_EXCLUSIVE) == 0) {
			KKASSERT((lock & MTX_MASK) != MTX_MASK);
			nlock = lock + 1;
			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
				break;
		} else {
			/* MWAIT here */
			if (bb < 1000)
				++bb;
			cpu_pause();
			for (bo = 0; bo < bb; ++bo)
				;
		}
		cpu_pause();
	}
}

#endif

int
_mtx_lock_ex_try(mtx_t *mtx)
{
	u_int	lock;
	u_int	nlock;
	int	error;

	for (;;) {
		lock = mtx->mtx_lock;
		if (lock == 0) {
			nlock = MTX_EXCLUSIVE | 1;
			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
				mtx->mtx_owner = curthread;
				error = 0;
				break;
			}
		} else if ((lock & MTX_EXCLUSIVE) &&
			   mtx->mtx_owner == curthread) {
			KKASSERT((lock & MTX_MASK) != MTX_MASK);
			nlock = lock + 1;
			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
				error = 0;
				break;
			}
		} else {
			error = EAGAIN;
			break;
		}
		cpu_pause();
	}
	return (error);
}

int
_mtx_lock_sh_try(mtx_t *mtx)
{
	u_int	lock;
	u_int	nlock;
	int	error = 0;

	for (;;) {
		lock = mtx->mtx_lock;
		if ((lock & MTX_EXCLUSIVE) == 0) {
			KKASSERT((lock & MTX_MASK) != MTX_MASK);
			nlock = lock + 1;
			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
				break;
		} else {
			error = EAGAIN;
			break;
		}
		cpu_pause();
	}
	return (error);
}

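/*
 * Illustrative sketch of the try/fallback pattern (not part of the build).
 * The wrapper names mtx_lock_ex_try(), mtx_lock_ex_quick() and mtx_unlock()
 * are assumed to match the inlines in sys/mutex2.h wrapping the functions
 * above.
 */
#if 0
static void
example_try_then_block(mtx_t *mtx)
{
	if (mtx_lock_ex_try(mtx) == 0) {
		/* Acquired without blocking. */
	} else {
		/* EAGAIN: contended, fall back to a blocking acquisition. */
		mtx_lock_ex_quick(mtx);
	}
	/* ... critical section ... */
	mtx_unlock(mtx);
}
#endif
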
/*
 * If the lock is held exclusively it must be owned by the caller.  If the
 * lock is already a shared lock this operation is a NOP.  A panic will
 * occur if the lock is not held either shared or exclusive.
 *
 * The exclusive count is converted to a shared count.
 */
void
_mtx_downgrade(mtx_t *mtx)
{
	u_int	lock;
	u_int	nlock;

	for (;;) {
		lock = mtx->mtx_lock;
		cpu_ccfence();

		/*
		 * NOP if already shared.
		 */
		if ((lock & MTX_EXCLUSIVE) == 0) {
			KKASSERT((lock & MTX_MASK) > 0);
			break;
		}

		/*
		 * Transfer count to shared.  Any additional pending shared
		 * waiters must be woken up.
		 */
		if (lock & MTX_SHWANTED) {
			if (mtx_chain_link_sh(mtx, lock))
				break;
			/* retry */
		} else {
			nlock = lock & ~MTX_EXCLUSIVE;
			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
				break;
			/* retry */
		}
		cpu_pause();
	}
}

/*
 * Upgrade a shared lock to an exclusive lock.  The upgrade will fail if
 * the shared lock has a count other than 1.  Optimize the most likely case
 * but note that a single cmpset can fail due to WANTED races.
 *
 * If the lock is held exclusively it must be owned by the caller and
 * this function will simply return without doing anything.  A panic will
 * occur if the lock is held exclusively by someone other than the caller.
 *
 * Returns 0 on success, EDEADLK on failure.
 */
int
_mtx_upgrade_try(mtx_t *mtx)
{
	u_int	lock;
	u_int	nlock;
	int	error = 0;

	for (;;) {
		lock = mtx->mtx_lock;
		cpu_ccfence();

		if ((lock & ~MTX_EXWANTED) == 1) {
			nlock = lock | MTX_EXCLUSIVE;
			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
				mtx->mtx_owner = curthread;
				break;
			}
		} else if (lock & MTX_EXCLUSIVE) {
			KKASSERT(mtx->mtx_owner == curthread);
			break;
		} else {
			error = EDEADLK;
			break;
		}
		cpu_pause();
	}
	return (error);
}

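/*
 * Illustrative sketch of the upgrade/downgrade pattern (not part of the
 * build).  Wrapper names (mtx_lock_sh_quick(), mtx_upgrade_try(),
 * mtx_downgrade(), mtx_unlock(), mtx_lock_ex_quick()) are assumed to match
 * the inlines in sys/mutex2.h.
 */
#if 0
static void
example_upgrade_pattern(mtx_t *mtx)
{
	mtx_lock_sh_quick(mtx);
	/* ... read-only work ... */

	if (mtx_upgrade_try(mtx) == 0) {
		/* Now exclusive; the shared count of 1 became exclusive. */
		/* ... modify ... */
		mtx_downgrade(mtx);	/* back to shared if desired */
	} else {
		/*
		 * EDEADLK: other shared holders exist.  Drop and reacquire
		 * exclusively, revalidating any state observed while the
		 * shared lock was held.
		 */
		mtx_unlock(mtx);
		mtx_lock_ex_quick(mtx);
		/* ... revalidate and modify ... */
	}
	mtx_unlock(mtx);
}
#endif
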
/*
 * Unlock a lock.  The caller must hold the lock either shared or exclusive.
 *
 * On the last release we handle any pending chains.
 */
void
_mtx_unlock(mtx_t *mtx)
{
	u_int	lock;
	u_int	nlock;

	for (;;) {
		lock = mtx->mtx_lock;
		cpu_ccfence();

		switch(lock) {
		case MTX_EXCLUSIVE | 1:
			/*
			 * Last release, exclusive lock.
			 * No exclusive or shared requests pending.
			 */
			mtx->mtx_owner = NULL;
			nlock = 0;
			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
				goto done;
			break;
		case MTX_EXCLUSIVE | MTX_EXWANTED | 1:
		case MTX_EXCLUSIVE | MTX_EXWANTED | MTX_SHWANTED | 1:
			/*
			 * Last release, exclusive lock.
			 * Exclusive requests pending.
			 * Exclusive requests have priority over shared reqs.
			 */
			if (mtx_chain_link_ex(mtx, lock))
				goto done;
			break;
		case MTX_EXCLUSIVE | MTX_SHWANTED | 1:
			/*
			 * Last release, exclusive lock.
			 *
			 * Shared requests are pending.  Transfer our count (1)
			 * to the first shared request, wakeup all shared reqs.
			 */
			if (mtx_chain_link_sh(mtx, lock))
				goto done;
			break;
		case 1:
			/*
			 * Last release, shared lock.
			 * No exclusive or shared requests pending.
			 */
			nlock = 0;
			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
				goto done;
			break;
		case MTX_EXWANTED | 1:
		case MTX_EXWANTED | MTX_SHWANTED | 1:
			/*
			 * Last release, shared lock.
			 *
			 * Exclusive requests are pending.  Upgrade this
			 * final shared lock to exclusive and transfer our
			 * count (1) to the next exclusive request.
			 *
			 * Exclusive requests have priority over shared reqs.
			 */
			if (mtx_chain_link_ex(mtx, lock))
				goto done;
			break;
		case MTX_SHWANTED | 1:
			/*
			 * Last release, shared lock.
			 * Shared requests pending.
			 */
			if (mtx_chain_link_sh(mtx, lock))
				goto done;
			break;
		default:
			/*
			 * We have to loop if this is the last release but
			 * someone is fiddling with LINKSPIN.
			 */
			if ((lock & MTX_MASK) == 1) {
				KKASSERT(lock & MTX_LINKSPIN);
				break;
			}

			/*
			 * Not the last release (shared or exclusive)
			 */
			nlock = lock - 1;
			KKASSERT((nlock & MTX_MASK) != MTX_MASK);
			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
				goto done;
			break;
		}
		/* loop try again */
		cpu_pause();
	}
done:
	;
}

/*
 * Chain pending links.  Called on the last release of an exclusive or
 * shared lock when the appropriate WANTED bit is set.  The old mtx_lock
 * state is passed in with the count left at 1, which we can inherit, and
 * other bits which we must adjust in a single atomic operation.
 *
 * Returns non-zero on success, 0 if the caller needs to retry.
 *
 * NOTE: It's ok if MTX_EXWANTED is in an indeterminate state while we are
 *	 acquiring LINKSPIN as all other cases will also need to acquire
 *	 LINKSPIN when handling the EXWANTED case.
 */
static int
mtx_chain_link_ex(mtx_t *mtx, u_int olock)
{
	thread_t td = curthread;
	mtx_link_t *link;
	u_int	nlock;

	olock &= ~MTX_LINKSPIN;
	nlock = olock | MTX_LINKSPIN | MTX_EXCLUSIVE;	/* upgrade if necc */
	crit_enter_raw(td);
	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
		link = mtx->mtx_exlink;
		KKASSERT(link != NULL);
		if (link->next == link) {
			mtx->mtx_exlink = NULL;
			nlock = MTX_LINKSPIN | MTX_EXWANTED;	/* to clear */
		} else {
			mtx->mtx_exlink = link->next;
			link->next->prev = link->prev;
			link->prev->next = link->next;
			nlock = MTX_LINKSPIN;			/* to clear */
		}
		KKASSERT(link->state == MTX_LINK_LINKED_EX);
		mtx->mtx_owner = link->owner;
		cpu_sfence();

		/*
		 * WARNING! The callback can only be safely
		 *	    made with LINKSPIN still held
		 *	    and in a critical section.
		 *
		 * WARNING! The link can go away after the
		 *	    state is set, or after the
		 *	    callback.
		 */
		if (link->callback) {
			link->state = MTX_LINK_CALLEDBACK;
			link->callback(link, link->arg, 0);
		} else {
			link->state = MTX_LINK_ACQUIRED;
			wakeup(link);
		}
		atomic_clear_int(&mtx->mtx_lock, nlock);
		crit_exit_raw(td);
		return 1;
	}
	/* retry */
	crit_exit_raw(td);

	return 0;
}

/*
 * Flush waiting shared locks.  The lock's prior state is passed in and must
 * be adjusted atomically only if it matches and LINKSPIN is not set.
 *
 * IMPORTANT! The caller has left one active count on the lock for us to
 *	      consume.  We will apply this to the first link, but must add
 *	      additional counts for any other links.
 */
static int
mtx_chain_link_sh(mtx_t *mtx, u_int olock)
{
	thread_t td = curthread;
	mtx_link_t *link;
	u_int	addcount;
	u_int	nlock;

	olock &= ~MTX_LINKSPIN;
	nlock = olock | MTX_LINKSPIN;
	nlock &= ~MTX_EXCLUSIVE;
	crit_enter_raw(td);
	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
		/*
		 * It should not be possible for SHWANTED to be set without
		 * any links pending.
		 */
		KKASSERT(mtx->mtx_shlink != NULL);

		/*
		 * We have to process the count for all shared locks before
		 * we process any of the links.  Count the additional shared
		 * locks beyond the first link (which is already accounted
		 * for) and associate the full count with the lock
		 * immediately.
		 */
		addcount = 0;
		for (link = mtx->mtx_shlink->next; link != mtx->mtx_shlink;
		     link = link->next) {
			++addcount;
		}
		if (addcount > 0)
			atomic_add_int(&mtx->mtx_lock, addcount);

		/*
		 * We can wakeup all waiting shared locks.
		 */
		while ((link = mtx->mtx_shlink) != NULL) {
			KKASSERT(link->state == MTX_LINK_LINKED_SH);
			if (link->next == link) {
				mtx->mtx_shlink = NULL;
			} else {
				mtx->mtx_shlink = link->next;
				link->next->prev = link->prev;
				link->prev->next = link->next;
			}
			link->next = NULL;
			link->prev = NULL;
			cpu_sfence();
			if (link->callback) {
				link->state = MTX_LINK_CALLEDBACK;
				link->callback(link, link->arg, 0);
			} else {
				cpu_sfence();
				link->state = MTX_LINK_ACQUIRED;
				wakeup(link);
			}
		}
		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN |
						 MTX_SHWANTED);
		crit_exit_raw(td);
		return 1;
	}
	/* retry */
	crit_exit_raw(td);

	return 0;
}

/*
 * Delete a link structure after tsleep has failed.  This code is not
 * in the critical path as most exclusive waits are chained.
 */
static
void
mtx_delete_link(mtx_t *mtx, mtx_link_t *link)
{
	thread_t td = curthread;
	u_int	lock;
	u_int	nlock;

	/*
	 * Acquire MTX_LINKSPIN.
	 *
	 * Do not use cmpxchg to wait for LINKSPIN to clear as this might
	 * result in too much cpu cache traffic.
	 */
	crit_enter_raw(td);
	for (;;) {
		lock = mtx->mtx_lock;
		if (lock & MTX_LINKSPIN) {
			cpu_pause();
			continue;
		}
		nlock = lock | MTX_LINKSPIN;
		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
			break;
		cpu_pause();
	}

	/*
	 * Delete the link and release LINKSPIN.
	 */
	nlock = MTX_LINKSPIN;	/* to clear */

	switch(link->state) {
	case MTX_LINK_LINKED_EX:
		if (link->next == link) {
			mtx->mtx_exlink = NULL;
			nlock |= MTX_EXWANTED;	/* to clear */
		} else {
			mtx->mtx_exlink = link->next;
			link->next->prev = link->prev;
			link->prev->next = link->next;
		}
		break;
	case MTX_LINK_LINKED_SH:
		if (link->next == link) {
			mtx->mtx_shlink = NULL;
			nlock |= MTX_SHWANTED;	/* to clear */
		} else {
			mtx->mtx_shlink = link->next;
			link->next->prev = link->prev;
			link->prev->next = link->next;
		}
		break;
	default:
		/* no change */
		break;
	}
	atomic_clear_int(&mtx->mtx_lock, nlock);
	crit_exit_raw(td);
}

/*
 * Wait for async lock completion or abort.  Returns ENOLCK if an abort
 * occurred.
 */
int
mtx_wait_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
	int error;

	/*
	 * Sleep.  Handle false wakeups, interruptions, etc.
	 * The link may also have been aborted.  The LINKED
	 * bit was set by this cpu so we can test it without
	 * fences.
	 */
	error = 0;
	while (link->state & MTX_LINK_LINKED) {
		tsleep_interlock(link, 0);
		cpu_lfence();
		if (link->state & MTX_LINK_LINKED) {
			if (link->state & MTX_LINK_LINKED_SH)
				mycpu->gd_cnt.v_lock_name[0] = 'S';
			else
				mycpu->gd_cnt.v_lock_name[0] = 'X';
			strncpy(mycpu->gd_cnt.v_lock_name + 1,
				mtx->mtx_ident,
				sizeof(mycpu->gd_cnt.v_lock_name) - 2);
			++mycpu->gd_cnt.v_lock_colls;

			error = tsleep(link, flags | PINTERLOCKED,
				       mtx->mtx_ident, to);
			if (error)
				break;
		}
	}

	/*
	 * We need at least an lfence (load fence) to ensure our cpu does not
	 * reorder loads (of data outside the lock structure) prior to the
	 * remote cpu's release, since the above test may have run without
	 * any atomic interactions.
	 *
	 * If we do not do this, state updated by the other cpu before
	 * releasing its lock may not be read cleanly by our cpu when this
	 * function returns.  Even though the other cpu ordered its stores,
	 * our loads can still be out of order.
	 */
	cpu_mfence();

	/*
	 * We are done, make sure the link structure is unlinked.
	 * It may still be on the list due to e.g. EINTR or
	 * EWOULDBLOCK.
	 *
	 * It is possible for the tsleep to race an ABORT and cause
	 * error to be 0.
	 *
	 * The tsleep() can be woken up for numerous reasons and error
	 * might be zero in situations where we intend to return an error.
	 *
	 * (This is the synchronous case so state cannot be CALLEDBACK)
	 */
	switch(link->state) {
	case MTX_LINK_ACQUIRED:
	case MTX_LINK_CALLEDBACK:
		error = 0;
		break;
	case MTX_LINK_ABORTED:
		error = ENOLCK;
		break;
	case MTX_LINK_LINKED_EX:
	case MTX_LINK_LINKED_SH:
		mtx_delete_link(mtx, link);
		/* fall through */
	default:
		if (error == 0)
			error = EWOULDBLOCK;
		break;
	}

	/*
	 * Clear state on status returned.
	 */
	link->state = MTX_LINK_IDLE;

	return error;
}

/*
 * Abort a mutex locking operation, causing mtx_lock_ex_link() to
 * return ENOLCK.  This may be called at any time after the mtx_link
 * is initialized or the status from a previous lock has been
 * returned.  If called prior to the next (non-try) lock attempt, the
 * next lock attempt using this link structure will abort instantly.
 *
 * The caller must still wait for the operation to complete, either from a
 * blocking call that is still in progress or by calling mtx_wait_link().
 *
 * If an asynchronous lock request is possibly in progress, the caller
 * should call mtx_wait_link() synchronously.  Note that the asynchronous
 * lock callback will NOT be called if a successful abort occurred. XXX
 */
void
mtx_abort_link(mtx_t *mtx, mtx_link_t *link)
{
	thread_t td = curthread;
	u_int	lock;
	u_int	nlock;

	/*
	 * Acquire MTX_LINKSPIN
	 */
	crit_enter_raw(td);
	for (;;) {
		lock = mtx->mtx_lock;
		if (lock & MTX_LINKSPIN) {
			cpu_pause();
			continue;
		}
		nlock = lock | MTX_LINKSPIN;
		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
			break;
		cpu_pause();
	}

	/*
	 * Do the abort.
	 *
	 * WARNING! Link structure can disappear once link->state is set.
	 */
	nlock = MTX_LINKSPIN;	/* to clear */

	switch(link->state) {
	case MTX_LINK_IDLE:
		/*
		 * Link not started yet
		 */
		link->state = MTX_LINK_ABORTED;
		break;
	case MTX_LINK_LINKED_EX:
		/*
		 * de-link, mark aborted, and potentially wakeup the thread
		 * or issue the callback.
		 */
		if (link->next == link) {
			if (mtx->mtx_exlink == link) {
				mtx->mtx_exlink = NULL;
				nlock |= MTX_EXWANTED;	/* to clear */
			}
		} else {
			if (mtx->mtx_exlink == link)
				mtx->mtx_exlink = link->next;
			link->next->prev = link->prev;
			link->prev->next = link->next;
		}

		/*
		 * When aborting the async callback is still made.  We must
		 * not set the link status to ABORTED in the callback case
		 * since there is nothing else to clear its status if the
		 * link is reused.
		 */
		if (link->callback) {
			link->state = MTX_LINK_CALLEDBACK;
			link->callback(link, link->arg, ENOLCK);
		} else {
			link->state = MTX_LINK_ABORTED;
			wakeup(link);
		}
		break;
	case MTX_LINK_LINKED_SH:
		/*
		 * de-link, mark aborted, and potentially wakeup the thread
		 * or issue the callback.
		 */
		if (link->next == link) {
			if (mtx->mtx_shlink == link) {
				mtx->mtx_shlink = NULL;
				nlock |= MTX_SHWANTED;	/* to clear */
			}
		} else {
			if (mtx->mtx_shlink == link)
				mtx->mtx_shlink = link->next;
			link->next->prev = link->prev;
			link->prev->next = link->next;
		}

		/*
		 * When aborting the async callback is still made.  We must
		 * not set the link status to ABORTED in the callback case
		 * since there is nothing else to clear its status if the
		 * link is reused.
		 */
		if (link->callback) {
			link->state = MTX_LINK_CALLEDBACK;
			link->callback(link, link->arg, ENOLCK);
		} else {
			link->state = MTX_LINK_ABORTED;
			wakeup(link);
		}
		break;
	case MTX_LINK_ACQUIRED:
	case MTX_LINK_CALLEDBACK:
		/*
		 * Too late, the lock was acquired.  Let it complete.
		 */
		break;
	default:
		/*
		 * link already aborted, do nothing.
		 */
		break;
	}
	atomic_clear_int(&mtx->mtx_lock, nlock);
	crit_exit_raw(td);
}

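/*
 * Illustrative sketch of aborting a blocked lock request from another
 * thread (not part of the build).  The shared mtx/link pair and the two
 * example functions are hypothetical; mtx_lock_ex_link() and
 * mtx_abort_link() are the entry points documented above, and mtx_unlock()
 * is assumed to be the sys/mutex2.h release wrapper.
 */
#if 0
static mtx_t example_abort_mtx;
static mtx_link_t example_blocked_link;

/* Thread A: may block waiting for the lock. */
static void
example_waiter(void)
{
	int error;

	mtx_link_init(&example_blocked_link);
	error = mtx_lock_ex_link(&example_abort_mtx,
				 &example_blocked_link, 0, 0);
	if (error == ENOLCK) {
		/* Aborted by example_canceler(); lock was never acquired. */
		return;
	}
	/* ... lock held exclusively ... */
	mtx_unlock(&example_abort_mtx);
}

/* Thread B: gives up on thread A's pending request. */
static void
example_canceler(void)
{
	mtx_abort_link(&example_abort_mtx, &example_blocked_link);
	/* Thread A's blocking call returns ENOLCK once the abort lands,
	 * unless the lock was already acquired, in which case it simply
	 * completes normally. */
}
#endif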