xref: /dragonfly/sys/kern/kern_mutex.c (revision e90c1ebb)
1 /*
2  * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * Implement fast persistent locks based on atomic_cmpset_int() with
36  * semantics similar to lockmgr locks but faster and taking up much less
37  * space.  Taken from HAMMER's lock implementation.
38  *
39  * These are meant to complement our LWKT tokens.  Tokens are only held
40  * while the thread is running.  Mutexes can be held across blocking
41  * conditions.
42  *
43  * - Exclusive priority over shared to prevent SMP starvation.
44  * - locks can be aborted (async callback, if any, will be made w/ENOLCK).
45  * - locks can be asynchronous.
46  * - synchronous fast path if no blocking occurs (async callback is not
47  *   made in this case).
48  *
49  * Generally speaking any caller-supplied link state must be properly
50  * initialized before use.
51  *
52  * Most of the support is in sys/mutex[2].h.  We mostly provide backoff
53  * functions here.
54  */
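/*
 * Illustrative usage sketch (not compiled): the common synchronous paths.
 * The mtx_init() initializer shown is an assumption about sys/mutex2.h;
 * callers normally use the non-underscore inline wrappers from that header
 * rather than the _mtx_*() functions defined in this file.
 */
#if 0
static mtx_t example_mtx;

static void
example_sync_usage(void)
{
	mtx_init(&example_mtx, "exmtx");	/* assumed mutex2.h initializer */

	/* exclusive lock, may block across tsleep, recursion allowed */
	_mtx_lock_ex_quick(&example_mtx);
	/* ... modify the protected structure, blocking is allowed ... */
	_mtx_unlock(&example_mtx);

	/* shared lock for readers */
	_mtx_lock_sh_quick(&example_mtx);
	/* ... read-only access ... */
	_mtx_unlock(&example_mtx);
}
#endif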
55 
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/sysctl.h>
60 #include <sys/thread.h>
61 
62 #include <machine/cpufunc.h>
63 
64 #include <sys/thread2.h>
65 #include <sys/mutex2.h>
66 
67 static int mtx_chain_link_ex(mtx_t *mtx, u_int olock);
68 static int mtx_chain_link_sh(mtx_t *mtx, u_int olock, int addcount);
69 static void mtx_delete_link(mtx_t *mtx, mtx_link_t *link);
70 
71 /*
72  * Exclusive-lock a mutex, block until acquired unless link is async.
73  * Recursion is allowed.
74  *
75  * Returns 0 on success, the tsleep() return code on failure, EINPROGRESS
76  * if async.  If immediately successful an async exclusive lock will return 0
77  * and not issue the async callback or link the link structure.  The caller
78  * must handle this case (typically this is an optimal code path).
79  *
80  * A tsleep() error can only be returned if PCATCH is specified in the flags.
81  */
82 static __inline int
83 __mtx_lock_ex(mtx_t *mtx, mtx_link_t *link, int flags, int to)
84 {
85 	thread_t td;
86 	u_int	lock;
87 	u_int	nlock;
88 	int	error;
89 	int	isasync;
90 
91 	for (;;) {
92 		lock = mtx->mtx_lock;
93 		cpu_ccfence();
94 
95 		if (lock == 0) {
96 			nlock = MTX_EXCLUSIVE | 1;
97 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
98 				mtx->mtx_owner = curthread;
99 				link->state = MTX_LINK_ACQUIRED;
100 				error = 0;
101 				break;
102 			}
103 			continue;
104 		}
105 		if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) {
106 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
107 			nlock = lock + 1;
108 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
109 				link->state = MTX_LINK_ACQUIRED;
110 				error = 0;
111 				break;
112 			}
113 			continue;
114 		}
115 
116 		/*
117 		 * We need MTX_LINKSPIN to manipulate exlink or
118 		 * shlink.
119 		 *
120 		 * We must set MTX_EXWANTED with MTX_LINKSPIN to indicate
121 		 * pending exclusive requests.  It cannot be set as a separate
122 		 * operation prior to acquiring MTX_LINKSPIN.
123 		 *
124 		 * To avoid unnecessary cpu cache traffic we poll
125 		 * for collisions.  It is also possible that EXWANTED
126 		 * state failing the above test was spurious, so all the
127 		 * tests must be repeated if we cannot obtain LINKSPIN
128 		 * with the prior state tests intact (i.e. don't reload
129 		 * the (lock) variable here, for heaven's sake!).
130 		 */
131 		if (lock & MTX_LINKSPIN) {
132 			cpu_pause();
133 			continue;
134 		}
135 		td = curthread;
136 		nlock = lock | MTX_EXWANTED | MTX_LINKSPIN;
137 		++td->td_critcount;
138 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
139 			--td->td_critcount;
140 			continue;
141 		}
142 
143 		/*
144 		 * Check for early abort.
145 		 */
146 		if (link->state == MTX_LINK_ABORTED) {
147 			if (mtx->mtx_exlink == NULL) {
148 				atomic_clear_int(&mtx->mtx_lock,
149 						 MTX_LINKSPIN |
150 						 MTX_EXWANTED);
151 			} else {
152 				atomic_clear_int(&mtx->mtx_lock,
153 						 MTX_LINKSPIN);
154 			}
155 			--td->td_critcount;
156 			link->state = MTX_LINK_IDLE;
157 			error = ENOLCK;
158 			break;
159 		}
160 
161 		/*
162 		 * Add our link to the exlink list and release LINKSPIN.
163 		 */
164 		link->owner = td;
165 		link->state = MTX_LINK_LINKED_EX;
166 		if (mtx->mtx_exlink) {
167 			link->next = mtx->mtx_exlink;
168 			link->prev = link->next->prev;
169 			link->next->prev = link;
170 			link->prev->next = link;
171 		} else {
172 			link->next = link;
173 			link->prev = link;
174 			mtx->mtx_exlink = link;
175 		}
176 		isasync = (link->callback != NULL);
177 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
178 		--td->td_critcount;
179 
180 		/*
181 		 * If this is an asynchronous lock request, return without
182 		 * blocking and leave the link structure linked.
183 		 */
184 		if (isasync) {
185 			error = EINPROGRESS;
186 			break;
187 		}
188 
189 		/*
190 		 * Wait for lock
191 		 */
192 		error = mtx_wait_link(mtx, link, flags, to);
193 		break;
194 	}
195 	return (error);
196 }
197 
198 int
199 _mtx_lock_ex_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
200 {
201 	return(__mtx_lock_ex(mtx, link, flags, to));
202 }
203 
204 int
205 _mtx_lock_ex(mtx_t *mtx, int flags, int to)
206 {
207 	mtx_link_t link;
208 
209 	mtx_link_init(&link);
210 	return(__mtx_lock_ex(mtx, &link, flags, to));
211 }
212 
213 int
214 _mtx_lock_ex_quick(mtx_t *mtx)
215 {
216 	mtx_link_t link;
217 
218 	mtx_link_init(&link);
219 	return(__mtx_lock_ex(mtx, &link, 0, 0));
220 }
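/*
 * Illustrative sketch (not compiled): an asynchronous exclusive lock
 * request as described above.  A zero return means the lock was acquired
 * immediately and no callback will be made; EINPROGRESS means the link was
 * queued and the callback runs later with 0 on grant or ENOLCK on abort.
 * The foo structure and foo_locked() continuation are hypothetical names
 * used only for this example.
 */
#if 0
struct foo {
	mtx_link_t	fo_link;	/* hypothetical embedded link */
};

static void foo_locked(void *arg);	/* hypothetical continuation */

static void
foo_lock_callback(mtx_link_t *link, void *arg, int error)
{
	if (error == 0)
		foo_locked(arg);
	/* error == ENOLCK: the request was aborted */
}

static void
foo_lock_async(mtx_t *mtx, struct foo *fp)
{
	int error;

	mtx_link_init(&fp->fo_link);
	fp->fo_link.callback = foo_lock_callback;
	fp->fo_link.arg = fp;

	error = _mtx_lock_ex_link(mtx, &fp->fo_link, 0, 0);
	if (error == 0)
		foo_locked(fp);		/* fast path, callback not made */
	/* EINPROGRESS: callback fires later, LINKSPIN held, critical section */
}
#endif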
221 
222 /*
223  * Share-lock a mutex, block until acquired.  Recursion is allowed.
224  *
225  * Returns 0 on success, or the tsleep() return code on failure.
226  * An error can only be returned if PCATCH is specified in the flags.
227  *
228  * NOTE: Shared locks get a mass-wakeup so if the tsleep fails we
229  *	 do not have to chain the wakeup().
230  */
231 static __inline int
232 __mtx_lock_sh(mtx_t *mtx, mtx_link_t *link, int flags, int to)
233 {
234 	thread_t td;
235 	u_int	lock;
236 	u_int	nlock;
237 	int	error;
238 	int	isasync;
239 
240 	for (;;) {
241 		lock = mtx->mtx_lock;
242 		cpu_ccfence();
243 
244 		if (lock == 0) {
245 			nlock = 1;
246 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
247 				error = 0;
248 				link->state = MTX_LINK_ACQUIRED;
249 				break;
250 			}
251 			continue;
252 		}
253 		if ((lock & (MTX_EXCLUSIVE | MTX_EXWANTED)) == 0) {
254 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
255 			nlock = lock + 1;
256 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
257 				error = 0;
258 				link->state = MTX_LINK_ACQUIRED;
259 				break;
260 			}
261 			continue;
262 		}
263 
264 		/*
265 		 * We need MTX_LINKSPIN to manipulate exlink or
266 		 * shlink.
267 		 *
268 		 * We must set MTX_SHWANTED with MTX_LINKSPIN to indicate
269 		 * pending shared requests.  It cannot be set as a separate
270 		 * operation prior to acquiring MTX_LINKSPIN.
271 		 *
272 		 * To avoid unnecessary cpu cache traffic we poll
273 		 * for collisions.  It is also possible that EXWANTED
274 		 * state failing the above test was spurious, so all the
275 		 * tests must be repeated if we cannot obtain LINKSPIN
276 		 * with the prior state tests intact (i.e. don't reload
277 		 * the (lock) variable here, for heaven's sake!).
278 		 */
279 		if (lock & MTX_LINKSPIN) {
280 			cpu_pause();
281 			continue;
282 		}
283 		td = curthread;
284 		nlock = lock | MTX_SHWANTED | MTX_LINKSPIN;
285 		++td->td_critcount;
286 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
287 			--td->td_critcount;
288 			continue;
289 		}
290 
291 		/*
292 		 * Check for early abort.
293 		 */
294 		if (link->state == MTX_LINK_ABORTED) {
295 			if (mtx->mtx_shlink == NULL) {
296 				atomic_clear_int(&mtx->mtx_lock,
297 						 MTX_LINKSPIN |
298 						 MTX_SHWANTED);
299 			} else {
300 				atomic_clear_int(&mtx->mtx_lock,
301 						 MTX_LINKSPIN);
302 			}
303 			--td->td_critcount;
304 			link->state = MTX_LINK_IDLE;
305 			error = ENOLCK;
306 			break;
307 		}
308 
309 		/*
310 		 * Add our link to the shlink list and release LINKSPIN.
311 		 */
312 		link->owner = td;
313 		link->state = MTX_LINK_LINKED_SH;
314 		if (mtx->mtx_shlink) {
315 			link->next = mtx->mtx_shlink;
316 			link->prev = link->next->prev;
317 			link->next->prev = link;
318 			link->prev->next = link;
319 		} else {
320 			link->next = link;
321 			link->prev = link;
322 			mtx->mtx_shlink = link;
323 		}
324 		isasync = (link->callback != NULL);
325 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
326 		--td->td_critcount;
327 
328 		/*
329 		 * If this is an asynchronous lock request, return without
330 		 * blocking and leave the link structure linked.
331 		 */
332 		if (isasync) {
333 			error = EINPROGRESS;
334 			break;
335 		}
336 
337 		/*
338 		 * Wait for lock
339 		 */
340 		error = mtx_wait_link(mtx, link, flags, to);
341 		break;
342 	}
343 	return (error);
344 }
345 
346 int
347 _mtx_lock_sh_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
348 {
349 	return(__mtx_lock_sh(mtx, link, flags, to));
350 }
351 
352 int
353 _mtx_lock_sh(mtx_t *mtx, int flags, int to)
354 {
355 	mtx_link_t link;
356 
357 	mtx_link_init(&link);
358 	return(__mtx_lock_sh(mtx, &link, flags, to));
359 }
360 
361 int
362 _mtx_lock_sh_quick(mtx_t *mtx)
363 {
364 	mtx_link_t link;
365 
366 	mtx_link_init(&link);
367 	return(__mtx_lock_sh(mtx, &link, 0, 0));
368 }
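/*
 * Illustrative sketch (not compiled): a blocking shared lock using the
 * flags/timeout arguments, which are passed through to tsleep() by
 * mtx_wait_link().  PCATCH and the hz-based tick count are standard
 * tsleep() parameters; how a caller reacts to the errors is an assumption.
 */
#if 0
static int
example_lock_sh_interruptible(mtx_t *mtx)
{
	int error;

	/* allow signals to interrupt, give up after roughly 5 seconds */
	error = _mtx_lock_sh(mtx, PCATCH, 5 * hz);
	if (error) {
		/* EINTR/ERESTART from a signal, EWOULDBLOCK on timeout */
		return (error);
	}
	/* ... shared access ... */
	_mtx_unlock(mtx);
	return (0);
}
#endif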
369 
370 /*
371  * Get an exclusive spinlock the hard way.
372  */
373 void
374 _mtx_spinlock(mtx_t *mtx)
375 {
376 	u_int	lock;
377 	u_int	nlock;
378 	int	bb = 1;
379 	int	bo;
380 
381 	for (;;) {
382 		lock = mtx->mtx_lock;
383 		if (lock == 0) {
384 			nlock = MTX_EXCLUSIVE | 1;
385 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
386 				mtx->mtx_owner = curthread;
387 				break;
388 			}
389 		} else if ((lock & MTX_EXCLUSIVE) &&
390 			   mtx->mtx_owner == curthread) {
391 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
392 			nlock = lock + 1;
393 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
394 				break;
395 		} else {
396 			/* MWAIT here */
397 			if (bb < 1000)
398 				++bb;
399 			cpu_pause();
400 			for (bo = 0; bo < bb; ++bo)
401 				;
402 		}
403 		cpu_pause();
404 	}
405 }
406 
407 /*
408  * Attempt to acquire a spinlock, if we fail we must undo the
409  * gd->gd_spinlocks/gd->gd_curthread->td_critcount predisposition.
410  *
411  * Returns 0 on success, EAGAIN on failure.
412  */
413 int
414 _mtx_spinlock_try(mtx_t *mtx)
415 {
416 	globaldata_t gd = mycpu;
417 	u_int	lock;
418 	u_int	nlock;
419 	int	res = 0;
420 
421 	for (;;) {
422 		lock = mtx->mtx_lock;
423 		if (lock == 0) {
424 			nlock = MTX_EXCLUSIVE | 1;
425 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
426 				mtx->mtx_owner = gd->gd_curthread;
427 				break;
428 			}
429 		} else if ((lock & MTX_EXCLUSIVE) &&
430 			   mtx->mtx_owner == gd->gd_curthread) {
431 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
432 			nlock = lock + 1;
433 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
434 				break;
435 		} else {
436 			--gd->gd_spinlocks;
437 			cpu_ccfence();
438 			--gd->gd_curthread->td_critcount;
439 			res = EAGAIN;
440 			break;
441 		}
442 		cpu_pause();
443 	}
444 	return res;
445 }
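/*
 * Illustrative sketch (not compiled) of the contract described above: the
 * caller, normally an inline wrapper in sys/mutex2.h and shown here only
 * as a hypothetical stand-in, predisposes gd_spinlocks and td_critcount
 * before calling _mtx_spinlock_try(), which backs both out on failure.
 */
#if 0
static __inline int
example_spinlock_try(mtx_t *mtx)
{
	globaldata_t gd = mycpu;

	++gd->gd_curthread->td_critcount;	/* predispose */
	cpu_ccfence();
	++gd->gd_spinlocks;

	/* on EAGAIN the counts above have already been undone */
	return (_mtx_spinlock_try(mtx));
}
#endif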
446 
447 #if 0
448 
449 void
450 _mtx_spinlock_sh(mtx_t *mtx)
451 {
452 	u_int	lock;
453 	u_int	nlock;
454 	int	bb = 1;
455 	int	bo;
456 
457 	for (;;) {
458 		lock = mtx->mtx_lock;
459 		if ((lock & MTX_EXCLUSIVE) == 0) {
460 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
461 			nlock = lock + 1;
462 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
463 				break;
464 		} else {
465 			/* MWAIT here */
466 			if (bb < 1000)
467 				++bb;
468 			cpu_pause();
469 			for (bo = 0; bo < bb; ++bo)
470 				;
471 		}
472 		cpu_pause();
473 	}
474 }
475 
476 #endif
477 
478 int
479 _mtx_lock_ex_try(mtx_t *mtx)
480 {
481 	u_int	lock;
482 	u_int	nlock;
483 	int	error;
484 
485 	for (;;) {
486 		lock = mtx->mtx_lock;
487 		if (lock == 0) {
488 			nlock = MTX_EXCLUSIVE | 1;
489 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
490 				mtx->mtx_owner = curthread;
491 				error = 0;
492 				break;
493 			}
494 		} else if ((lock & MTX_EXCLUSIVE) &&
495 			   mtx->mtx_owner == curthread) {
496 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
497 			nlock = lock + 1;
498 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
499 				error = 0;
500 				break;
501 			}
502 		} else {
503 			error = EAGAIN;
504 			break;
505 		}
506 		cpu_pause();
507 	}
508 	return (error);
509 }
510 
511 int
512 _mtx_lock_sh_try(mtx_t *mtx)
513 {
514 	u_int	lock;
515 	u_int	nlock;
516 	int	error = 0;
517 
518 	for (;;) {
519 		lock = mtx->mtx_lock;
520 		if ((lock & MTX_EXCLUSIVE) == 0) {
521 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
522 			nlock = lock + 1;
523 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
524 				break;
525 		} else {
526 			error = EAGAIN;
527 			break;
528 		}
529 		cpu_pause();
530 	}
531 	return (error);
532 }
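/*
 * Illustrative sketch (not compiled): non-blocking acquisition with the
 * try variants above, which return EAGAIN instead of sleeping.  The
 * fallback shown is an assumption about a typical caller.
 */
#if 0
static int
example_try_then_block(mtx_t *mtx)
{
	if (_mtx_lock_ex_try(mtx) == 0) {
		/* acquired without blocking */
		return (0);
	}
	/* contended; fall back to a blocking exclusive lock */
	return (_mtx_lock_ex_quick(mtx));
}
#endif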
533 
534 /*
535  * If the lock is held exclusively it must be owned by the caller.  If the
536  * lock is already a shared lock this operation is a NOP.  A panic will
537  * occur if the lock is not held either shared or exclusive.
538  *
539  * The exclusive count is converted to a shared count.
540  */
541 void
542 _mtx_downgrade(mtx_t *mtx)
543 {
544 	u_int	lock;
545 	u_int	nlock;
546 
547 	for (;;) {
548 		lock = mtx->mtx_lock;
549 		cpu_ccfence();
550 
551 		/*
552 		 * NOP if already shared.
553 		 */
554 		if ((lock & MTX_EXCLUSIVE) == 0) {
555 			KKASSERT((lock & MTX_MASK) > 0);
556 			break;
557 		}
558 
559 		/*
560 		 * Transfer count to shared.  Any additional pending shared
561 		 * waiters must be woken up.
562 		 */
563 		if (lock & MTX_SHWANTED) {
564 			if (mtx_chain_link_sh(mtx, lock, 1))
565 				break;
566 			/* retry */
567 		} else {
568 			nlock = lock & ~MTX_EXCLUSIVE;
569 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
570 				break;
571 			/* retry */
572 		}
573 		cpu_pause();
574 	}
575 }
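/*
 * Illustrative sketch (not compiled): start out exclusive, then downgrade
 * to shared once modifications are finished so other readers can proceed.
 */
#if 0
static void
example_downgrade(mtx_t *mtx)
{
	_mtx_lock_ex_quick(mtx);
	/* ... insert/adjust the protected structure ... */

	_mtx_downgrade(mtx);	/* exclusive count becomes a shared count */
	/* ... continue read-only, other readers may now enter ... */

	_mtx_unlock(mtx);
}
#endif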
576 
577 /*
578  * Upgrade a shared lock to an exclusive lock.  The upgrade will fail if
579  * the shared lock has a count other than 1.  Optimize the most likely case
580  * but note that a single cmpset can fail due to WANTED races.
581  *
582  * If the lock is held exclusively it must be owned by the caller and
583  * this function will simply return without doing anything.   A panic will
584  * occur if the lock is held exclusively by someone other than the caller.
585  *
586  * Returns 0 on success, EDEADLK on failure.
587  */
588 int
589 _mtx_upgrade_try(mtx_t *mtx)
590 {
591 	u_int	lock;
592 	u_int	nlock;
593 	int	error = 0;
594 
595 	for (;;) {
596 		lock = mtx->mtx_lock;
597 
598 		if ((lock & ~MTX_EXWANTED) == 1) {
599 			nlock = lock | MTX_EXCLUSIVE;
600 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
601 				mtx->mtx_owner = curthread;
602 				break;
603 			}
604 		} else if (lock & MTX_EXCLUSIVE) {
605 			KKASSERT(mtx->mtx_owner == curthread);
606 			break;
607 		} else {
608 			error = EDEADLK;
609 			break;
610 		}
611 		cpu_pause();
612 	}
613 	return (error);
614 }
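/*
 * Illustrative sketch (not compiled): attempt a non-blocking upgrade and,
 * on EDEADLK, fall back to releasing the shared lock and reacquiring it
 * exclusively.  State observed under the shared lock must be revalidated
 * after the fallback because the lock was dropped.
 */
#if 0
static void
example_upgrade(mtx_t *mtx)
{
	_mtx_lock_sh_quick(mtx);
	/* ... decide that exclusive access is needed ... */

	if (_mtx_upgrade_try(mtx) != 0) {
		/* other shared holders or waiters; cannot upgrade in place */
		_mtx_unlock(mtx);
		_mtx_lock_ex_quick(mtx);
		/* ... revalidate anything cached under the shared lock ... */
	}
	/* ... exclusive access ... */
	_mtx_unlock(mtx);
}
#endif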
615 
616 /*
617  * Unlock a lock.  The caller must hold the lock either shared or exclusive.
618  *
619  * On the last release we handle any pending chains.
620  */
621 void
622 _mtx_unlock(mtx_t *mtx)
623 {
624 	u_int	lock;
625 	u_int	nlock;
626 
627 	for (;;) {
628 		lock = mtx->mtx_lock;
629 		cpu_ccfence();
630 
631 		switch(lock) {
632 		case MTX_EXCLUSIVE | 1:
633 			/*
634 			 * Last release, exclusive lock.
635 			 * No exclusive or shared requests pending.
636 			 */
637 			mtx->mtx_owner = NULL;
638 			nlock = 0;
639 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
640 				goto done;
641 			break;
642 		case MTX_EXCLUSIVE | MTX_EXWANTED | 1:
643 		case MTX_EXCLUSIVE | MTX_EXWANTED | MTX_SHWANTED | 1:
644 			/*
645 			 * Last release, exclusive lock.
646 			 * Exclusive requests pending.
647 			 * Exclusive requests have priority over shared reqs.
648 			 */
649 			if (mtx_chain_link_ex(mtx, lock))
650 				goto done;
651 			break;
652 		case MTX_EXCLUSIVE | MTX_SHWANTED | 1:
653 			/*
654 			 * Last release, exclusive lock.
655 			 *
656 			 * Shared requests are pending.  Transfer our count (1)
657 			 * to the first shared request, wakeup all shared reqs.
658 			 */
659 			if (mtx_chain_link_sh(mtx, lock, 0))
660 				goto done;
661 			break;
662 		case 1:
663 			/*
664 			 * Last release, shared lock.
665 			 * No exclusive or shared requests pending.
666 			 */
667 			nlock = 0;
668 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
669 				goto done;
670 			break;
671 		case MTX_EXWANTED | 1:
672 		case MTX_EXWANTED | MTX_SHWANTED | 1:
673 			/*
674 			 * Last release, shared lock.
675 			 *
676 			 * Exclusive requests are pending.  Transfer our
677 			 * count (1) to the next exclusive request.
678 			 *
679 			 * Exclusive requests have priority over shared reqs.
680 			 */
681 			if (mtx_chain_link_ex(mtx, lock))
682 				goto done;
683 			break;
684 		case MTX_SHWANTED | 1:
685 			/*
686 			 * Last release, shared lock.
687 			 * Shared requests pending.
688 			 */
689 			if (mtx_chain_link_sh(mtx, lock, 0))
690 				goto done;
691 			break;
692 		default:
693 			/*
694 			 * We have to loop if this is the last release but
695 			 * someone is fiddling with LINKSPIN.
696 			 */
697 			if ((lock & MTX_MASK) == 1) {
698 				KKASSERT(lock & MTX_LINKSPIN);
699 				break;
700 			}
701 
702 			/*
703 			 * Not the last release (shared or exclusive)
704 			 */
705 			nlock = lock - 1;
706 			KKASSERT((nlock & MTX_MASK) != MTX_MASK);
707 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
708 				goto done;
709 			break;
710 		}
711 		/* loop try again */
712 		cpu_pause();
713 	}
714 done:
715 	;
716 }
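/*
 * Illustrative sketch (not compiled): how mtx_lock reflects exclusive
 * recursion and how the final release selects a case above.  The
 * intermediate states shown assume no waiters register in between.
 */
#if 0
static void
example_recursion(mtx_t *mtx)
{
	_mtx_lock_ex_quick(mtx);	/* mtx_lock == MTX_EXCLUSIVE | 1 */
	_mtx_lock_ex_quick(mtx);	/* recursion: MTX_EXCLUSIVE | 2 */

	_mtx_unlock(mtx);		/* default case, count drops to 1 */
	_mtx_unlock(mtx);		/* MTX_EXCLUSIVE | 1: last release,
					 * clears the lock or chains to a
					 * pending EXWANTED/SHWANTED waiter */
}
#endif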
717 
718 /*
719  * Chain pending links.  Called on the last release of an exclusive or
720  * shared lock when the appropriate WANTED bit is set.  mtx_lock old state
721  * is passed in with the count left at 1, which we can inherit, and other
722  * bits which we must adjust in a single atomic operation.
723  *
724  * Return non-zero on success, 0 if caller needs to retry.
725  *
726  * NOTE: It's ok if MTX_EXWANTED is in an indeterminate state while we are
727  *	 acquiring LINKSPIN as all other cases will also need to acquire
728  *	 LINKSPIN when handling the EXWANTED case.
729  */
730 static int
731 mtx_chain_link_ex(mtx_t *mtx, u_int olock)
732 {
733 	thread_t td = curthread;
734 	mtx_link_t *link;
735 	u_int	nlock;
736 
737 	olock &= ~MTX_LINKSPIN;
738 	nlock = olock | MTX_LINKSPIN | MTX_EXCLUSIVE;
739 	++td->td_critcount;
740 	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
741 		link = mtx->mtx_exlink;
742 		KKASSERT(link != NULL);
743 		if (link->next == link) {
744 			mtx->mtx_exlink = NULL;
745 			nlock = MTX_LINKSPIN | MTX_EXWANTED;	/* to clear */
746 		} else {
747 			mtx->mtx_exlink = link->next;
748 			link->next->prev = link->prev;
749 			link->prev->next = link->next;
750 			nlock = MTX_LINKSPIN;			/* to clear */
751 		}
752 		KKASSERT(link->state == MTX_LINK_LINKED_EX);
753 		mtx->mtx_owner = link->owner;
754 		cpu_sfence();
755 
756 		/*
757 		 * WARNING! The callback can only be safely
758 		 *	    made with LINKSPIN still held
759 		 *	    and in a critical section.
760 		 *
761 		 * WARNING! The link can go away after the
762 		 *	    state is set, or after the
763 		 *	    callback.
764 		 */
765 		if (link->callback) {
766 			link->state = MTX_LINK_CALLEDBACK;
767 			link->callback(link, link->arg, 0);
768 		} else {
769 			link->state = MTX_LINK_ACQUIRED;
770 			wakeup(link);
771 		}
772 		atomic_clear_int(&mtx->mtx_lock, nlock);
773 		--td->td_critcount;
774 		return 1;
775 	}
776 	/* retry */
777 	--td->td_critcount;
778 	return 0;
779 }
780 
781 /*
782  * Flush waiting shared locks.  The lock's prior state is passed in and must
783  * be adjusted atomically only if it matches.
784  *
785  * If addcount is 0, the count for the first shared lock in the chain is
786  * assumed to have already been accounted for.
787  *
788  * If addcount is 1, the count for the first shared lock in the chain has
789  * not yet been accounted for.
790  */
791 static int
792 mtx_chain_link_sh(mtx_t *mtx, u_int olock, int addcount)
793 {
794 	thread_t td = curthread;
795 	mtx_link_t *link;
796 	u_int	nlock;
797 
798 	olock &= ~MTX_LINKSPIN;
799 	nlock = olock | MTX_LINKSPIN;
800 	nlock &= ~MTX_EXCLUSIVE;
801 	++td->td_critcount;
802 	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
803 		KKASSERT(mtx->mtx_shlink != NULL);
804 		for (;;) {
805 			link = mtx->mtx_shlink;
806 			atomic_add_int(&mtx->mtx_lock, addcount);
807 			KKASSERT(link->state == MTX_LINK_LINKED_SH);
808 			if (link->next == link) {
809 				mtx->mtx_shlink = NULL;
810 				cpu_sfence();
811 
812 				/*
813 				 * WARNING! The callback can only be safely
814 				 *	    made with LINKSPIN still held
815 				 *	    and in a critical section.
816 				 *
817 				 * WARNING! The link can go away after the
818 				 *	    state is set, or after the
819 				 *	    callback.
820 				 */
821 				if (link->callback) {
822 					link->state = MTX_LINK_CALLEDBACK;
823 					link->callback(link, link->arg, 0);
824 				} else {
825 					link->state = MTX_LINK_ACQUIRED;
826 					wakeup(link);
827 				}
828 				break;
829 			}
830 			mtx->mtx_shlink = link->next;
831 			link->next->prev = link->prev;
832 			link->prev->next = link->next;
833 			cpu_sfence();
834 			link->state = MTX_LINK_ACQUIRED;
835 			/* link can go away */
836 			wakeup(link);
837 			addcount = 1;
838 		}
839 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN |
840 						 MTX_SHWANTED);
841 		--td->td_critcount;
842 		return 1;
843 	}
844 	/* retry */
845 	--td->td_critcount;
846 	return 0;
847 }
848 
849 /*
850  * Delete a link structure after tsleep has failed.  This code is not
851  * in the critical path as most exclusive waits are chained.
852  */
853 static
854 void
855 mtx_delete_link(mtx_t *mtx, mtx_link_t *link)
856 {
857 	thread_t td = curthread;
858 	u_int	lock;
859 	u_int	nlock;
860 
861 	/*
862 	 * Acquire MTX_LINKSPIN.
863 	 *
864 	 * Do not use cmpxchg to wait for LINKSPIN to clear as this might
865 	 * result in too much cpu cache traffic.
866 	 */
867 	++td->td_critcount;
868 	for (;;) {
869 		lock = mtx->mtx_lock;
870 		if (lock & MTX_LINKSPIN) {
871 			cpu_pause();
872 			continue;
873 		}
874 		nlock = lock | MTX_LINKSPIN;
875 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
876 			break;
877 		cpu_pause();
878 	}
879 
880 	/*
881 	 * Delete the link and release LINKSPIN.
882 	 */
883 	nlock = MTX_LINKSPIN;	/* to clear */
884 
885 	switch(link->state) {
886 	case MTX_LINK_LINKED_EX:
887 		if (link->next == link) {
888 			mtx->mtx_exlink = NULL;
889 			nlock |= MTX_EXWANTED;	/* to clear */
890 		} else {
891 			mtx->mtx_exlink = link->next;
892 			link->next->prev = link->prev;
893 			link->prev->next = link->next;
894 		}
895 		break;
896 	case MTX_LINK_LINKED_SH:
897 		if (link->next == link) {
898 			mtx->mtx_shlink = NULL;
899 			nlock |= MTX_SHWANTED;	/* to clear */
900 		} else {
901 			mtx->mtx_shlink = link->next;
902 			link->next->prev = link->prev;
903 			link->prev->next = link->next;
904 		}
905 		break;
906 	default:
907 		/* no change */
908 		break;
909 	}
910 	atomic_clear_int(&mtx->mtx_lock, nlock);
911 	--td->td_critcount;
912 }
913 
914 /*
915  * Wait for async lock completion or abort.  Returns ENOLCK if an abort
916  * occurred.
917  */
918 int
919 mtx_wait_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
920 {
921 	int error;
922 
923 	/*
924 	 * Sleep.  Handle false wakeups, interruptions, etc.
925 	 * The link may also have been aborted.
926 	 */
927 	error = 0;
928 	while (link->state & MTX_LINK_LINKED) {
929 		tsleep_interlock(link, 0);
930 		cpu_lfence();
931 		if (link->state & MTX_LINK_LINKED) {
932 			if (link->state & MTX_LINK_LINKED_SH)
933 				mycpu->gd_cnt.v_lock_name[0] = 'S';
934 			else
935 				mycpu->gd_cnt.v_lock_name[0] = 'X';
936 			strncpy(mycpu->gd_cnt.v_lock_name + 1,
937 				mtx->mtx_ident,
938 				sizeof(mycpu->gd_cnt.v_lock_name) - 2);
939 			++mycpu->gd_cnt.v_lock_colls;
940 
941 			error = tsleep(link, flags | PINTERLOCKED,
942 				       mtx->mtx_ident, to);
943 			if (error)
944 				break;
945 		}
946 	}
947 
948 	/*
949 	 * We are done, make sure the link structure is unlinked.
950 	 * It may still be on the list due to e.g. EINTR or
951 	 * EWOULDBLOCK.
952 	 *
953 	 * It is possible for the tsleep to race an ABORT and cause
954 	 * error to be 0.
955 	 *
956 	 * The tsleep() can be woken up for numerous reasons and error
957 	 * might be zero in situations where we intend to return an error.
958 	 *
959 	 * (This is the synchronous case so state cannot be CALLEDBACK)
960 	 */
961 	switch(link->state) {
962 	case MTX_LINK_ACQUIRED:
963 	case MTX_LINK_CALLEDBACK:
964 		error = 0;
965 		break;
966 	case MTX_LINK_ABORTED:
967 		error = ENOLCK;
968 		break;
969 	case MTX_LINK_LINKED_EX:
970 	case MTX_LINK_LINKED_SH:
971 		mtx_delete_link(mtx, link);
972 		/* fall through */
973 	default:
974 		if (error == 0)
975 			error = EWOULDBLOCK;
976 		break;
977 	}
978 
979 	/*
980 	 * Clear state on status returned.
981 	 */
982 	link->state = MTX_LINK_IDLE;
983 
984 	return error;
985 }
986 
987 /*
988  * Abort a mutex locking operation, causing mtx_lock_ex_link() to
989  * return ENOLCK.  This may be called at any time after the mtx_link
990  * is initialized or the status from a previous lock has been
991  * returned.  If called prior to the next (non-try) lock attempt, the
992  * next lock attempt using this link structure will abort instantly.
993  *
994  * Caller must still wait for the operation to complete, either from a
995  * blocking call that is still in progress or by calling mtx_wait_link().
996  *
997  * If an asynchronous lock request is possibly in-progress, the caller
998  * should call mtx_wait_link() synchronously.  Note that the asynchronous
999  * lock callback will NOT be called if a successful abort occurred. XXX
1000  */
1001 void
1002 mtx_abort_link(mtx_t *mtx, mtx_link_t *link)
1003 {
1004 	thread_t td = curthread;
1005 	u_int	lock;
1006 	u_int	nlock;
1007 
1008 	/*
1009 	 * Acquire MTX_LINKSPIN
1010 	 */
1011 	++td->td_critcount;
1012 	for (;;) {
1013 		lock = mtx->mtx_lock;
1014 		if (lock & MTX_LINKSPIN) {
1015 			cpu_pause();
1016 			continue;
1017 		}
1018 		nlock = lock | MTX_LINKSPIN;
1019 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
1020 			break;
1021 		cpu_pause();
1022 	}
1023 
1024 	/*
1025 	 * Do the abort.
1026 	 *
1027 	 * WARNING! Link structure can disappear once link->state is set.
1028 	 */
1029 	nlock = MTX_LINKSPIN;	/* to clear */
1030 
1031 	switch(link->state) {
1032 	case MTX_LINK_IDLE:
1033 		/*
1034 		 * Link not started yet
1035 		 */
1036 		link->state = MTX_LINK_ABORTED;
1037 		break;
1038 	case MTX_LINK_LINKED_EX:
1039 		/*
1040 		 * de-link, mark aborted, and potentially wakeup the thread
1041 		 * or issue the callback.
1042 		 */
1043 		if (link->next == link) {
1044 			if (mtx->mtx_exlink == link) {
1045 				mtx->mtx_exlink = NULL;
1046 				nlock |= MTX_EXWANTED;	/* to clear */
1047 			}
1048 		} else {
1049 			if (mtx->mtx_exlink == link)
1050 				mtx->mtx_exlink = link->next;
1051 			link->next->prev = link->prev;
1052 			link->prev->next = link->next;
1053 		}
1054 
1055 		/*
1056 		 * When aborting the async callback is still made.  We must
1057 		 * not set the link status to ABORTED in the callback case
1058 		 * since there is nothing else to clear its status if the
1059 		 * link is reused.
1060 		 */
1061 		if (link->callback) {
1062 			link->state = MTX_LINK_CALLEDBACK;
1063 			link->callback(link, link->arg, ENOLCK);
1064 		} else {
1065 			link->state = MTX_LINK_ABORTED;
1066 			wakeup(link);
1067 		}
1068 		break;
1069 	case MTX_LINK_LINKED_SH:
1070 		/*
1071 		 * de-link, mark aborted, and potentially wakeup the thread
1072 		 * or issue the callback.
1073 		 */
1074 		if (link->next == link) {
1075 			if (mtx->mtx_shlink == link) {
1076 				mtx->mtx_shlink = NULL;
1077 				nlock |= MTX_SHWANTED;	/* to clear */
1078 			}
1079 		} else {
1080 			if (mtx->mtx_shlink == link)
1081 				mtx->mtx_shlink = link->next;
1082 			link->next->prev = link->prev;
1083 			link->prev->next = link->next;
1084 		}
1085 
1086 		/*
1087 		 * When aborting the async callback is still made.  We must
1088 		 * not set the link status to ABORTED in the callback case
1089 		 * since there is nothing else to clear its status if the
1090 		 * link is reused.
1091 		 */
1092 		if (link->callback) {
1093 			link->state = MTX_LINK_CALLEDBACK;
1094 			link->callback(link, link->arg, ENOLCK);
1095 		} else {
1096 			link->state = MTX_LINK_ABORTED;
1097 			wakeup(link);
1098 		}
1099 		break;
1100 	case MTX_LINK_ACQUIRED:
1101 	case MTX_LINK_CALLEDBACK:
1102 		/*
1103 		 * Too late, the lock was acquired.  Let it complete.
1104 		 */
1105 		break;
1106 	default:
1107 		/*
1108 		 * link already aborted, do nothing.
1109 		 */
1110 		break;
1111 	}
1112 	atomic_clear_int(&mtx->mtx_lock, nlock);
1113 	--td->td_critcount;
1114 }
1115