xref: /dragonfly/sys/kern/kern_mutex.c (revision 0ca59c34)
1 /*
2  * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * Implement fast persistent locks based on atomic_cmpset_int() with
36  * semantics similar to lockmgr locks but faster and taking up much less
37  * space.  Taken from HAMMER's lock implementation.
38  *
39  * These are meant to complement our LWKT tokens.  Tokens are only held
40  * while the thread is running.  Mutexes can be held across blocking
41  * conditions.
42  *
43  * - Exclusive priority over shared to prevent SMP starvation.
44  * - locks can be aborted (async callback, if any, will be made w/ENOLCK).
45  * - locks can be asynchronous.
46  * - synchronous fast path if no blocking occurs (async callback is not
47  *   made in this case).
48  *
49  * Generally speaking any caller-supplied link state must be properly
50  * initialized before use.
51  *
52  * Most of the support is in sys/mutex[2].h.  We mostly provide backoff
53  * functions here.
54  */
55 
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/sysctl.h>
60 #include <sys/thread.h>
61 
62 #include <machine/cpufunc.h>
63 
64 #include <sys/thread2.h>
65 #include <sys/mutex2.h>
66 
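#if 0
/*
 * Illustrative usage sketch, kept under #if 0 (not compiled).  Callers
 * normally go through the inline wrappers in sys/mutex2.h; the
 * underscore entry points used here are the blocking back-ends
 * implemented in this file.  The example_* function is hypothetical
 * and assumes the mutex was already initialized via mutex2.h.
 */
static void
example_basic_usage(mtx_t *mtx)
{
	/*
	 * Exclusive lock: blocks until acquired and may be held across
	 * blocking conditions (unlike an LWKT token).
	 */
	_mtx_lock_ex_quick(mtx);
	/* ... modify the protected data ... */
	_mtx_unlock(mtx);

	/*
	 * Shared lock with PCATCH so a signal can interrupt the sleep;
	 * a non-zero return is the tsleep() error.
	 */
	if (_mtx_lock_sh(mtx, PCATCH, 0) == 0) {
		/* ... read the protected data ... */
		_mtx_unlock(mtx);
	}
}
#endif
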
67 static int64_t mtx_contention_count;
68 static int64_t mtx_collision_count;
69 static int64_t mtx_wakeup_count;
70 
71 SYSCTL_QUAD(_kern, OID_AUTO, mtx_contention_count, CTLFLAG_RW,
72 	    &mtx_contention_count, 0, "");
73 SYSCTL_QUAD(_kern, OID_AUTO, mtx_collision_count, CTLFLAG_RW,
74 	    &mtx_collision_count, 0, "");
75 SYSCTL_QUAD(_kern, OID_AUTO, mtx_wakeup_count, CTLFLAG_RW,
76 	    &mtx_wakeup_count, 0, "");
77 
78 static int mtx_chain_link_ex(mtx_t *mtx, u_int olock);
79 static int mtx_chain_link_sh(mtx_t *mtx, u_int olock, int addcount);
80 static void mtx_delete_link(mtx_t *mtx, mtx_link_t *link);
81 
82 /*
83  * Exclusive-lock a mutex, block until acquired unless link is async.
84  * Recursion is allowed.
85  *
86  * Returns 0 on success, the tsleep() return code on failure, EINPROGRESS
87  * if async.  If immediately successful an async exclusive lock will return 0
88  * and not issue the async callback or link the link structure.  The caller
89  * must handle this case (typically this is an optimal code path).
90  *
91  * A tsleep() error can only be returned if PCATCH is specified in the flags.
92  */
93 static __inline int
94 __mtx_lock_ex(mtx_t *mtx, mtx_link_t *link, int flags, int to)
95 {
96 	thread_t td;
97 	u_int	lock;
98 	u_int	nlock;
99 	int	error;
100 	int	isasync;
101 
102 	for (;;) {
103 		lock = mtx->mtx_lock;
104 		cpu_ccfence();
105 
106 		if (lock == 0) {
107 			nlock = MTX_EXCLUSIVE | 1;
108 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
109 				mtx->mtx_owner = curthread;
110 				link->state = MTX_LINK_ACQUIRED;
111 				error = 0;
112 				break;
113 			}
114 			continue;
115 		}
116 		if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) {
117 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
118 			nlock = lock + 1;
119 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
120 				link->state = MTX_LINK_ACQUIRED;
121 				error = 0;
122 				break;
123 			}
124 			continue;
125 		}
126 
127 		/*
128 		 * We need MTX_LINKSPIN to manipulate exlink or
129 		 * shlink.
130 		 *
131 		 * We must set MTX_EXWANTED with MTX_LINKSPIN to indicate
132 		 * pending exclusive requests.  It cannot be set as a separate
133 		 * operation prior to acquiring MTX_LINKSPIN.
134 		 *
135 		 * To avoid unnecessary cpu cache traffic we poll
136 		 * for collisions.  It is also possible that the state
137 		 * which failed the above tests was spurious, so all the
138 		 * tests must be repeated if we cannot obtain LINKSPIN
139 		 * with the prior state tests intact (i.e. don't reload
140 		 * the (lock) variable here, for heaven's sake!).
141 		 */
142 		if (lock & MTX_LINKSPIN) {
143 			cpu_pause();
144 			++mtx_collision_count;
145 			continue;
146 		}
147 		td = curthread;
148 		nlock = lock | MTX_EXWANTED | MTX_LINKSPIN;
149 		++td->td_critcount;
150 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
151 			--td->td_critcount;
152 			continue;
153 		}
154 
155 		/*
156 		 * Check for early abort.
157 		 */
158 		if (link->state == MTX_LINK_ABORTED) {
159 			if (mtx->mtx_exlink == NULL) {
160 				atomic_clear_int(&mtx->mtx_lock,
161 						 MTX_LINKSPIN |
162 						 MTX_EXWANTED);
163 			} else {
164 				atomic_clear_int(&mtx->mtx_lock,
165 						 MTX_LINKSPIN);
166 			}
167 			--td->td_critcount;
168 			link->state = MTX_LINK_IDLE;
169 			error = ENOLCK;
170 			break;
171 		}
172 
173 		/*
174 		 * Add our link to the exlink list and release LINKSPIN.
175 		 */
176 		link->owner = td;
177 		link->state = MTX_LINK_LINKED_EX;
178 		if (mtx->mtx_exlink) {
179 			link->next = mtx->mtx_exlink;
180 			link->prev = link->next->prev;
181 			link->next->prev = link;
182 			link->prev->next = link;
183 		} else {
184 			link->next = link;
185 			link->prev = link;
186 			mtx->mtx_exlink = link;
187 		}
188 		isasync = (link->callback != NULL);
189 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
190 		--td->td_critcount;
191 
192 		/*
193 		 * If this is an asynchronous lock request, return without
194 		 * blocking and leave the link structure linked.
195 		 */
196 		if (isasync) {
197 			error = EINPROGRESS;
198 			break;
199 		}
200 
201 		/*
202 		 * Wait for lock
203 		 */
204 		error = mtx_wait_link(mtx, link, flags, to);
205 		break;
206 	}
207 	return (error);
208 }
209 
210 int
211 _mtx_lock_ex_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
212 {
213 	return(__mtx_lock_ex(mtx, link, flags, to));
214 }
215 
216 int
217 _mtx_lock_ex(mtx_t *mtx, int flags, int to)
218 {
219 	mtx_link_t link;
220 
221 	mtx_link_init(&link);
222 	return(__mtx_lock_ex(mtx, &link, flags, to));
223 }
224 
225 int
226 _mtx_lock_ex_quick(mtx_t *mtx)
227 {
228 	mtx_link_t link;
229 
230 	mtx_link_init(&link);
231 	return(__mtx_lock_ex(mtx, &link, 0, 0));
232 }
233 
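#if 0
/*
 * Hypothetical asynchronous exclusive-lock pattern, kept under #if 0
 * (not compiled).  The callback signature and the link->callback /
 * link->arg fields follow their use in this file; the mutex2.h
 * wrappers may provide a dedicated initializer for async links.  The
 * example_* names are illustrative only.
 */
static void
example_lock_callback(mtx_link_t *link, void *arg, int error)
{
	/*
	 * Called with LINKSPIN held and in a critical section, so only
	 * minimal work may be done here (e.g. schedule further
	 * processing).  error is 0 on acquisition or ENOLCK if the
	 * request was aborted.
	 */
}

static void
example_async_lock(mtx_t *mtx, mtx_link_t *link, void *arg)
{
	int error;

	mtx_link_init(link);
	link->callback = example_lock_callback;
	link->arg = arg;

	error = _mtx_lock_ex_link(mtx, link, 0, 0);
	if (error == 0) {
		/*
		 * Fast path: the lock was acquired immediately, the link
		 * was never linked and the callback will NOT be issued.
		 */
	} else if (error == EINPROGRESS) {
		/*
		 * The request is queued; the callback will be issued when
		 * the lock is granted (or with ENOLCK if aborted).
		 */
	} else {
		/* Hard failure, e.g. ENOLCK after an early abort. */
	}
}
#endif
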
234 /*
235  * Share-lock a mutex, block until acquired.  Recursion is allowed.
236  *
237  * Returns 0 on success, or the tsleep() return code on failure.
238  * An error can only be returned if PCATCH is specified in the flags.
239  *
240  * NOTE: Shared locks get a mass-wakeup so if the tsleep fails we
241  *	 do not have to chain the wakeup().
242  */
243 static __inline int
244 __mtx_lock_sh(mtx_t *mtx, mtx_link_t *link, int flags, int to)
245 {
246 	thread_t td;
247 	u_int	lock;
248 	u_int	nlock;
249 	int	error;
250 	int	isasync;
251 
252 	for (;;) {
253 		lock = mtx->mtx_lock;
254 		cpu_ccfence();
255 
256 		if (lock == 0) {
257 			nlock = 1;
258 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
259 				error = 0;
260 				link->state = MTX_LINK_ACQUIRED;
261 				break;
262 			}
263 			continue;
264 		}
265 		if ((lock & (MTX_EXCLUSIVE | MTX_EXWANTED)) == 0) {
266 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
267 			nlock = lock + 1;
268 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
269 				error = 0;
270 				link->state = MTX_LINK_ACQUIRED;
271 				break;
272 			}
273 			continue;
274 		}
275 
276 		/*
277 		 * We need MTX_LINKSPIN to manipulate exlink or
278 		 * shlink.
279 		 *
280 		 * We must set MTX_SHWANTED with MTX_LINKSPIN to indicate
281 		 * pending shared requests.  It cannot be set as a separate
282 		 * operation prior to acquiring MTX_LINKSPIN.
283 		 *
284 		 * To avoid unnecessary cpu cache traffic we poll
285 		 * for collisions.  It is also possible that EXWANTED
286 		 * state failing the above test was spurious, so all the
287 		 * tests must be repeated if we cannot obtain LINKSPIN
288 		 * with the prior state tests intact (i.e. don't reload
289 		 * the (lock) variable here, for heaven's sake!).
290 		 */
291 		if (lock & MTX_LINKSPIN) {
292 			cpu_pause();
293 			++mtx_collision_count;
294 			continue;
295 		}
296 		td = curthread;
297 		nlock = lock | MTX_SHWANTED | MTX_LINKSPIN;
298 		++td->td_critcount;
299 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
300 			--td->td_critcount;
301 			continue;
302 		}
303 
304 		/*
305 		 * Check for early abort.
306 		 */
307 		if (link->state == MTX_LINK_ABORTED) {
308 			if (mtx->mtx_exlink == NULL) {
309 				atomic_clear_int(&mtx->mtx_lock,
310 						 MTX_LINKSPIN |
311 						 MTX_SHWANTED);
312 			} else {
313 				atomic_clear_int(&mtx->mtx_lock,
314 						 MTX_LINKSPIN);
315 			}
316 			--td->td_critcount;
317 			link->state = MTX_LINK_IDLE;
318 			error = ENOLCK;
319 			break;
320 		}
321 
322 		/*
323 		 * Add our link to the shlink list and release LINKSPIN.
324 		 */
325 		link->owner = td;
326 		link->state = MTX_LINK_LINKED_SH;
327 		if (mtx->mtx_shlink) {
328 			link->next = mtx->mtx_shlink;
329 			link->prev = link->next->prev;
330 			link->next->prev = link;
331 			link->prev->next = link;
332 		} else {
333 			link->next = link;
334 			link->prev = link;
335 			mtx->mtx_shlink = link;
336 		}
337 		isasync = (link->callback != NULL);
338 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
339 		--td->td_critcount;
340 
341 		/*
342 		 * If this is an asynchronous lock request, return without
343 		 * blocking and leave the link structure linked.
344 		 */
345 		if (isasync) {
346 			error = EINPROGRESS;
347 			break;
348 		}
349 
350 		/*
351 		 * Wait for lock
352 		 */
353 		error = mtx_wait_link(mtx, link, flags, to);
354 		break;
355 	}
356 	return (error);
357 }
358 
359 int
360 _mtx_lock_sh_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
361 {
362 	return(__mtx_lock_sh(mtx, link, flags, to));
363 }
364 
365 int
366 _mtx_lock_sh(mtx_t *mtx, int flags, int to)
367 {
368 	mtx_link_t link;
369 
370 	mtx_link_init(&link);
371 	return(__mtx_lock_sh(mtx, &link, flags, to));
372 }
373 
374 int
375 _mtx_lock_sh_quick(mtx_t *mtx)
376 {
377 	mtx_link_t link;
378 
379 	mtx_link_init(&link);
380 	return(__mtx_lock_sh(mtx, &link, 0, 0));
381 }
382 
383 /*
384  * Get an exclusive spinlock the hard way.
385  */
386 void
387 _mtx_spinlock(mtx_t *mtx)
388 {
389 	u_int	lock;
390 	u_int	nlock;
391 	int	bb = 1;
392 	int	bo;
393 
394 	for (;;) {
395 		lock = mtx->mtx_lock;
396 		if (lock == 0) {
397 			nlock = MTX_EXCLUSIVE | 1;
398 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
399 				mtx->mtx_owner = curthread;
400 				break;
401 			}
402 		} else if ((lock & MTX_EXCLUSIVE) &&
403 			   mtx->mtx_owner == curthread) {
404 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
405 			nlock = lock + 1;
406 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
407 				break;
408 		} else {
409 			/* MWAIT here */
410 			if (bb < 1000)
411 				++bb;
412 			cpu_pause();
413 			for (bo = 0; bo < bb; ++bo)
414 				;
415 			++mtx_contention_count;
416 		}
417 		cpu_pause();
418 		++mtx_collision_count;
419 	}
420 }
421 
422 /*
423  * Attempt to acquire a spinlock.  If we fail we must undo the
424  * gd->gd_spinlocks/gd->gd_curthread->td_critcount predisposition.
425  *
426  * Returns 0 on success, EAGAIN on failure.
427  */
428 int
429 _mtx_spinlock_try(mtx_t *mtx)
430 {
431 	globaldata_t gd = mycpu;
432 	u_int	lock;
433 	u_int	nlock;
434 	int	res = 0;
435 
436 	for (;;) {
437 		lock = mtx->mtx_lock;
438 		if (lock == 0) {
439 			nlock = MTX_EXCLUSIVE | 1;
440 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
441 				mtx->mtx_owner = gd->gd_curthread;
442 				break;
443 			}
444 		} else if ((lock & MTX_EXCLUSIVE) &&
445 			   mtx->mtx_owner == gd->gd_curthread) {
446 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
447 			nlock = lock + 1;
448 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
449 				break;
450 		} else {
451 			--gd->gd_spinlocks;
452 			cpu_ccfence();
453 			--gd->gd_curthread->td_critcount;
454 			res = EAGAIN;
455 			break;
456 		}
457 		cpu_pause();
458 		++mtx_collision_count;
459 	}
460 	return res;
461 }
462 
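#if 0
/*
 * Hypothetical sketch of the caller-side predisposition referred to in
 * the comment above, kept under #if 0 (not compiled).  The real wrapper
 * lives in sys/mutex2.h and may differ in detail; the idea is that the
 * counts are bumped optimistically before the attempt and that
 * _mtx_spinlock_try() undoes both of them itself when it fails.
 */
static __inline int
example_spinlock_try(mtx_t *mtx)
{
	globaldata_t gd = mycpu;

	++gd->gd_curthread->td_critcount;
	cpu_ccfence();
	++gd->gd_spinlocks;

	/* Returns 0 on success, EAGAIN (with the counts undone) on failure */
	return (_mtx_spinlock_try(mtx));
}
#endif
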
463 #if 0
464 
465 void
466 _mtx_spinlock_sh(mtx_t *mtx)
467 {
468 	u_int	lock;
469 	u_int	nlock;
470 	int	bb = 1;
471 	int	bo;
472 
473 	for (;;) {
474 		lock = mtx->mtx_lock;
475 		if ((lock & MTX_EXCLUSIVE) == 0) {
476 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
477 			nlock = lock + 1;
478 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
479 				break;
480 		} else {
481 			/* MWAIT here */
482 			if (bb < 1000)
483 				++bb;
484 			cpu_pause();
485 			for (bo = 0; bo < bb; ++bo)
486 				;
487 			++mtx_contention_count;
488 		}
489 		cpu_pause();
490 		++mtx_collision_count;
491 	}
492 }
493 
494 #endif
495 
496 int
497 _mtx_lock_ex_try(mtx_t *mtx)
498 {
499 	u_int	lock;
500 	u_int	nlock;
501 	int	error;
502 
503 	for (;;) {
504 		lock = mtx->mtx_lock;
505 		if (lock == 0) {
506 			nlock = MTX_EXCLUSIVE | 1;
507 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
508 				mtx->mtx_owner = curthread;
509 				error = 0;
510 				break;
511 			}
512 		} else if ((lock & MTX_EXCLUSIVE) &&
513 			   mtx->mtx_owner == curthread) {
514 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
515 			nlock = lock + 1;
516 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
517 				error = 0;
518 				break;
519 			}
520 		} else {
521 			error = EAGAIN;
522 			break;
523 		}
524 		cpu_pause();
525 		++mtx_collision_count;
526 	}
527 	return (error);
528 }
529 
530 int
531 _mtx_lock_sh_try(mtx_t *mtx)
532 {
533 	u_int	lock;
534 	u_int	nlock;
535 	int	error = 0;
536 
537 	for (;;) {
538 		lock = mtx->mtx_lock;
539 		if ((lock & MTX_EXCLUSIVE) == 0) {
540 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
541 			nlock = lock + 1;
542 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
543 				break;
544 		} else {
545 			error = EAGAIN;
546 			break;
547 		}
548 		cpu_pause();
549 		++mtx_collision_count;
550 	}
551 	return (error);
552 }
553 
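#if 0
/*
 * Hypothetical non-blocking usage, kept under #if 0 (not compiled).
 * The *_try() functions above return 0 on success and EAGAIN when the
 * lock cannot be acquired without blocking.
 */
static void
example_try_lock(mtx_t *mtx)
{
	if (_mtx_lock_sh_try(mtx) == 0) {
		/* ... read-only access to the protected data ... */
		_mtx_unlock(mtx);
	} else {
		/* EAGAIN: fall back to a blocking or deferred path */
	}
}
#endif
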
554 /*
555  * If the lock is held exclusively it must be owned by the caller.  If the
556  * lock is already a shared lock this operation is a NOP.  A panic will
557  * occur if the lock is not held either shared or exclusive.
558  *
559  * The exclusive count is converted to a shared count.
560  */
561 void
562 _mtx_downgrade(mtx_t *mtx)
563 {
564 	u_int	lock;
565 	u_int	nlock;
566 
567 	for (;;) {
568 		lock = mtx->mtx_lock;
569 		cpu_ccfence();
570 
571 		/*
572 		 * NOP if already shared.
573 		 */
574 		if ((lock & MTX_EXCLUSIVE) == 0) {
575 			KKASSERT((lock & MTX_MASK) > 0);
576 			break;
577 		}
578 
579 		/*
580 		 * Transfer count to shared.  Any additional pending shared
581 		 * waiters must be woken up.
582 		 */
583 		if (lock & MTX_SHWANTED) {
584 			if (mtx_chain_link_sh(mtx, lock, 1))
585 				break;
586 			/* retry */
587 		} else {
588 			nlock = lock & ~MTX_EXCLUSIVE;
589 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
590 				break;
591 			/* retry */
592 		}
593 		cpu_pause();
594 		++mtx_collision_count;
595 	}
596 }
597 
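#if 0
/*
 * Hypothetical downgrade pattern, kept under #if 0 (not compiled):
 * build or modify the protected data under the exclusive lock, then
 * downgrade to shared so other readers can proceed while this thread
 * continues to read.
 */
static void
example_downgrade(mtx_t *mtx)
{
	_mtx_lock_ex_quick(mtx);
	/* ... modify the protected data ... */
	_mtx_downgrade(mtx);
	/* ... read-only access, concurrent readers now allowed ... */
	_mtx_unlock(mtx);
}
#endif
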
598 /*
599  * Upgrade a shared lock to an exclusive lock.  The upgrade will fail if
600  * the shared lock has a count other than 1.  Optimize the most likely case
601  * but note that a single cmpset can fail due to WANTED races.
602  *
603  * If the lock is held exclusively it must be owned by the caller and
604  * this function will simply return without doing anything.  A panic will
605  * occur if the lock is held exclusively by someone other than the caller.
606  *
607  * Returns 0 on success, EDEADLK on failure.
608  */
609 int
610 _mtx_upgrade_try(mtx_t *mtx)
611 {
612 	u_int	lock;
613 	u_int	nlock;
614 	int	error = 0;
615 
616 	for (;;) {
617 		lock = mtx->mtx_lock;
618 
619 		if ((lock & ~MTX_EXWANTED) == 1) {
620 			nlock = lock | MTX_EXCLUSIVE;
621 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
622 				mtx->mtx_owner = curthread;
623 				break;
624 			}
625 		} else if (lock & MTX_EXCLUSIVE) {
626 			KKASSERT(mtx->mtx_owner == curthread);
627 			break;
628 		} else {
629 			error = EDEADLK;
630 			break;
631 		}
632 		cpu_pause();
633 		++mtx_collision_count;
634 	}
635 	return (error);
636 }
637 
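#if 0
/*
 * Hypothetical upgrade-with-fallback pattern, kept under #if 0 (not
 * compiled).  _mtx_upgrade_try() fails with EDEADLK when other shared
 * holders exist; the classic fallback is to drop the shared lock,
 * reacquire exclusively, and revalidate any state derived under the
 * shared hold.
 */
static void
example_upgrade(mtx_t *mtx)
{
	if (_mtx_upgrade_try(mtx) == EDEADLK) {
		_mtx_unlock(mtx);
		_mtx_lock_ex_quick(mtx);
		/* ... revalidate, the lock was temporarily lost ... */
	}
}
#endif
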
638 /*
639  * Unlock a lock.  The caller must hold the lock either shared or exclusive.
640  *
641  * On the last release we handle any pending chains.
642  */
643 void
644 _mtx_unlock(mtx_t *mtx)
645 {
646 	u_int	lock;
647 	u_int	nlock;
648 
649 	for (;;) {
650 		lock = mtx->mtx_lock;
651 		cpu_ccfence();
652 
653 		switch(lock) {
654 		case MTX_EXCLUSIVE | 1:
655 			/*
656 			 * Last release, exclusive lock.
657 			 * No exclusive or shared requests pending.
658 			 */
659 			mtx->mtx_owner = NULL;
660 			nlock = 0;
661 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
662 				goto done;
663 			break;
664 		case MTX_EXCLUSIVE | MTX_EXWANTED | 1:
665 		case MTX_EXCLUSIVE | MTX_EXWANTED | MTX_SHWANTED | 1:
666 			/*
667 			 * Last release, exclusive lock.
668 			 * Exclusive requests pending.
669 			 * Exclusive requests have priority over shared reqs.
670 			 */
671 			if (mtx_chain_link_ex(mtx, lock))
672 				goto done;
673 			break;
674 		case MTX_EXCLUSIVE | MTX_SHWANTED | 1:
675 			/*
676 			 * Last release, exclusive lock.
677 			 *
678 			 * Shared requests are pending.  Transfer our count (1)
679 			 * to the first shared request, wakeup all shared reqs.
680 			 */
681 			if (mtx_chain_link_sh(mtx, lock, 0))
682 				goto done;
683 			break;
684 		case 1:
685 			/*
686 			 * Last release, shared lock.
687 			 * No exclusive or shared requests pending.
688 			 */
689 			nlock = 0;
690 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
691 				goto done;
692 			break;
693 		case MTX_EXWANTED | 1:
694 		case MTX_EXWANTED | MTX_SHWANTED | 1:
695 			/*
696 			 * Last release, shared lock.
697 			 *
698 			 * Exclusive requests are pending.  Transfer our
699 			 * count (1) to the next exclusive request.
700 			 *
701 			 * Exclusive requests have priority over shared reqs.
702 			 */
703 			if (mtx_chain_link_ex(mtx, lock))
704 				goto done;
705 			break;
706 		case MTX_SHWANTED | 1:
707 			/*
708 			 * Last release, shared lock.
709 			 * Shared requests pending.
710 			 */
711 			if (mtx_chain_link_sh(mtx, lock, 0))
712 				goto done;
713 			break;
714 		default:
715 			/*
716 			 * We have to loop if this is the last release but
717 			 * someone is fiddling with LINKSPIN.
718 			 */
719 			if ((lock & MTX_MASK) == 1) {
720 				KKASSERT(lock & MTX_LINKSPIN);
721 				break;
722 			}
723 
724 			/*
725 			 * Not the last release (shared or exclusive)
726 			 */
727 			nlock = lock - 1;
728 			KKASSERT((nlock & MTX_MASK) != MTX_MASK);
729 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
730 				goto done;
731 			break;
732 		}
733 		/* loop try again */
734 		cpu_pause();
735 		++mtx_collision_count;
736 	}
737 done:
738 	;
739 }
740 
741 /*
742  * Chain pending links.  Called on the last release of an exclusive or
743  * shared lock when the appropriate WANTED bit is set.  mtx_lock old state
744  * is passed in with the count left at 1, which we can inherit, and other
745  * bits which we must adjust in a single atomic operation.
746  *
747  * Return non-zero on success, 0 if caller needs to retry.
748  *
749  * NOTE: It's ok if MTX_EXWANTED is in an indeterminate state while we are
750  *	 acquiring LINKSPIN as all other cases will also need to acquire
751  *	 LINKSPIN when handling the EXWANTED case.
752  */
753 static int
754 mtx_chain_link_ex(mtx_t *mtx, u_int olock)
755 {
756 	thread_t td = curthread;
757 	mtx_link_t *link;
758 	u_int	nlock;
759 
760 	olock &= ~MTX_LINKSPIN;
761 	nlock = olock | MTX_LINKSPIN | MTX_EXCLUSIVE;
762 	++td->td_critcount;
763 	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
764 		link = mtx->mtx_exlink;
765 		KKASSERT(link != NULL);
766 		if (link->next == link) {
767 			mtx->mtx_exlink = NULL;
768 			nlock = MTX_LINKSPIN | MTX_EXWANTED;	/* to clear */
769 		} else {
770 			mtx->mtx_exlink = link->next;
771 			link->next->prev = link->prev;
772 			link->prev->next = link->next;
773 			nlock = MTX_LINKSPIN;			/* to clear */
774 		}
775 		KKASSERT(link->state == MTX_LINK_LINKED_EX);
776 		mtx->mtx_owner = link->owner;
777 		cpu_sfence();
778 
779 		/*
780 		 * WARNING! The callback can only be safely
781 		 *	    made with LINKSPIN still held
782 		 *	    and in a critical section.
783 		 *
784 		 * WARNING! The link can go away after the
785 		 *	    state is set, or after the
786 		 *	    callback.
787 		 */
788 		if (link->callback) {
789 			link->state = MTX_LINK_CALLEDBACK;
790 			link->callback(link, link->arg, 0);
791 		} else {
792 			link->state = MTX_LINK_ACQUIRED;
793 			wakeup(link);
794 		}
795 		atomic_clear_int(&mtx->mtx_lock, nlock);
796 		--td->td_critcount;
797 		++mtx_wakeup_count;
798 		return 1;
799 	}
800 	/* retry */
801 	--td->td_critcount;
802 	return 0;
803 }
804 
805 /*
806  * Flush waiting shared locks.  The lock's prior state is passed in and must
807  * be adjusted atomically only if it matches.
808  *
809  * If addcount is 0, the count for the first shared lock in the chain is
810  * assumed to have already been accounted for.
811  *
812  * If addcount is 1, the count for the first shared lock in the chain has
813  * not yet been accounted for.
814  */
815 static int
816 mtx_chain_link_sh(mtx_t *mtx, u_int olock, int addcount)
817 {
818 	thread_t td = curthread;
819 	mtx_link_t *link;
820 	u_int	nlock;
821 
822 	olock &= ~MTX_LINKSPIN;
823 	nlock = olock | MTX_LINKSPIN;
824 	nlock &= ~MTX_EXCLUSIVE;
825 	++td->td_critcount;
826 	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
827 		KKASSERT(mtx->mtx_shlink != NULL);
828 		for (;;) {
829 			link = mtx->mtx_shlink;
830 			atomic_add_int(&mtx->mtx_lock, addcount);
831 			KKASSERT(link->state == MTX_LINK_LINKED_SH);
832 			if (link->next == link) {
833 				mtx->mtx_shlink = NULL;
834 				cpu_sfence();
835 
836 				/*
837 				 * WARNING! The callback can only be safely
838 				 *	    made with LINKSPIN still held
839 				 *	    and in a critical section.
840 				 *
841 				 * WARNING! The link can go away after the
842 				 *	    state is set, or after the
843 				 *	    callback.
844 				 */
845 				if (link->callback) {
846 					link->state = MTX_LINK_CALLEDBACK;
847 					link->callback(link, link->arg, 0);
848 				} else {
849 					link->state = MTX_LINK_ACQUIRED;
850 					wakeup(link);
851 				}
852 				++mtx_wakeup_count;
853 				break;
854 			}
855 			mtx->mtx_shlink = link->next;
856 			link->next->prev = link->prev;
857 			link->prev->next = link->next;
858 			cpu_sfence();
859 			link->state = MTX_LINK_ACQUIRED;
860 			/* link can go away */
861 			wakeup(link);
862 			++mtx_wakeup_count;
863 			addcount = 1;
864 		}
865 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN |
866 						 MTX_SHWANTED);
867 		--td->td_critcount;
868 		return 1;
869 	}
870 	/* retry */
871 	--td->td_critcount;
872 	return 0;
873 }
874 
875 /*
876  * Delete a link structure after tsleep has failed.  This code is not
877  * in the critical path as most exclusive waits are chained.
878  */
879 static
880 void
881 mtx_delete_link(mtx_t *mtx, mtx_link_t *link)
882 {
883 	thread_t td = curthread;
884 	u_int	lock;
885 	u_int	nlock;
886 
887 	/*
888 	 * Acquire MTX_LINKSPIN.
889 	 *
890 	 * Do not use cmpxchg to wait for LINKSPIN to clear as this might
891 	 * result in too much cpu cache traffic.
892 	 */
893 	++td->td_critcount;
894 	for (;;) {
895 		lock = mtx->mtx_lock;
896 		if (lock & MTX_LINKSPIN) {
897 			cpu_pause();
898 			++mtx_collision_count;
899 			continue;
900 		}
901 		nlock = lock | MTX_LINKSPIN;
902 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
903 			break;
904 		cpu_pause();
905 		++mtx_collision_count;
906 	}
907 
908 	/*
909 	 * Delete the link and release LINKSPIN.
910 	 */
911 	nlock = MTX_LINKSPIN;	/* to clear */
912 
913 	switch(link->state) {
914 	case MTX_LINK_LINKED_EX:
915 		if (link->next == link) {
916 			mtx->mtx_exlink = NULL;
917 			nlock |= MTX_EXWANTED;	/* to clear */
918 		} else {
919 			mtx->mtx_exlink = link->next;
920 			link->next->prev = link->prev;
921 			link->prev->next = link->next;
922 		}
923 		break;
924 	case MTX_LINK_LINKED_SH:
925 		if (link->next == link) {
926 			mtx->mtx_shlink = NULL;
927 			nlock |= MTX_SHWANTED;	/* to clear */
928 		} else {
929 			mtx->mtx_shlink = link->next;
930 			link->next->prev = link->prev;
931 			link->prev->next = link->next;
932 		}
933 		break;
934 	default:
935 		/* no change */
936 		break;
937 	}
938 	atomic_clear_int(&mtx->mtx_lock, nlock);
939 	--td->td_critcount;
940 }
941 
942 /*
943  * Wait for async lock completion or abort.  Returns ENOLCK if an abort
944  * occurred.
945  */
946 int
947 mtx_wait_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
948 {
949 	int error;
950 
951 	/*
952 	 * Sleep.  Handle false wakeups, interruptions, etc.
953 	 * The link may also have been aborted.
954 	 */
955 	error = 0;
956 	while (link->state & MTX_LINK_LINKED) {
957 		tsleep_interlock(link, 0);
958 		cpu_lfence();
959 		if (link->state & MTX_LINK_LINKED) {
960 			++mtx_contention_count;
961 			if (link->state & MTX_LINK_LINKED_SH)
962 				mycpu->gd_cnt.v_lock_name[0] = 'S';
963 			else
964 				mycpu->gd_cnt.v_lock_name[0] = 'X';
965 			strncpy(mycpu->gd_cnt.v_lock_name + 1,
966 				mtx->mtx_ident,
967 				sizeof(mycpu->gd_cnt.v_lock_name) - 2);
968 			++mycpu->gd_cnt.v_lock_colls;
969 
970 			error = tsleep(link, flags | PINTERLOCKED,
971 				       mtx->mtx_ident, to);
972 			if (error)
973 				break;
974 		}
975 	}
976 
977 	/*
978 	 * We are done, make sure the link structure is unlinked.
979 	 * It may still be on the list due to e.g. EINTR or
980 	 * EWOULDBLOCK.
981 	 *
982 	 * It is possible for the tsleep to race an ABORT and cause
983 	 * error to be 0.
984 	 *
985 	 * The tsleep() can be woken up for numerous reasons and error
986 	 * might be zero in situations where we intend to return an error.
987 	 *
988 	 * (This is the synchronous case so state cannot be CALLEDBACK)
989 	 */
990 	switch(link->state) {
991 	case MTX_LINK_ACQUIRED:
992 	case MTX_LINK_CALLEDBACK:
993 		error = 0;
994 		break;
995 	case MTX_LINK_ABORTED:
996 		error = ENOLCK;
997 		break;
998 	case MTX_LINK_LINKED_EX:
999 	case MTX_LINK_LINKED_SH:
1000 		mtx_delete_link(mtx, link);
1001 		/* fall through */
1002 	default:
1003 		if (error == 0)
1004 			error = EWOULDBLOCK;
1005 		break;
1006 	}
1007 
1008 	/*
1009 	 * Clear state on status returned.
1010 	 */
1011 	link->state = MTX_LINK_IDLE;
1012 
1013 	return error;
1014 }
1015 
1016 /*
1017  * Abort a mutex locking operation, causing mtx_lock_ex_link() to
1018  * return ENOLCK.  This may be called at any time after the mtx_link
1019  * is initialized or the status from a previous lock has been
1020  * returned.  If called prior to the next (non-try) lock attempt, the
1021  * next lock attempt using this link structure will abort instantly.
1022  *
1023  * Caller must still wait for the operation to complete, either from a
1024  * blocking call that is still in progress or by calling mtx_wait_link().
1025  *
1026  * If an asynchronous lock request is possibly in-progress, the caller
1027  * should call mtx_wait_link() synchronously.  If a queued asynchronous
1028  * request is successfully aborted, its callback is still made with ENOLCK.
1029  */
1030 void
1031 mtx_abort_link(mtx_t *mtx, mtx_link_t *link)
1032 {
1033 	thread_t td = curthread;
1034 	u_int	lock;
1035 	u_int	nlock;
1036 
1037 	/*
1038 	 * Acquire MTX_LINKSPIN
1039 	 */
1040 	++td->td_critcount;
1041 	for (;;) {
1042 		lock = mtx->mtx_lock;
1043 		if (lock & MTX_LINKSPIN) {
1044 			cpu_pause();
1045 			++mtx_collision_count;
1046 			continue;
1047 		}
1048 		nlock = lock | MTX_LINKSPIN;
1049 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
1050 			break;
1051 		cpu_pause();
1052 		++mtx_collision_count;
1053 	}
1054 
1055 	/*
1056 	 * Do the abort.
1057 	 *
1058 	 * WARNING! Link structure can disappear once link->state is set.
1059 	 */
1060 	nlock = MTX_LINKSPIN;	/* to clear */
1061 
1062 	switch(link->state) {
1063 	case MTX_LINK_IDLE:
1064 		/*
1065 		 * Link not started yet
1066 		 */
1067 		link->state = MTX_LINK_ABORTED;
1068 		break;
1069 	case MTX_LINK_LINKED_EX:
1070 		/*
1071 		 * de-link, mark aborted, and potentially wakeup the thread
1072 		 * or issue the callback.
1073 		 */
1074 		if (link->next == link) {
1075 			if (mtx->mtx_exlink == link) {
1076 				mtx->mtx_exlink = NULL;
1077 				nlock |= MTX_EXWANTED;	/* to clear */
1078 			}
1079 		} else {
1080 			if (mtx->mtx_exlink == link)
1081 				mtx->mtx_exlink = link->next;
1082 			link->next->prev = link->prev;
1083 			link->prev->next = link->next;
1084 		}
1085 
1086 		/*
1087 		 * When aborting the async callback is still made.  We must
1088 		 * not set the link status to ABORTED in the callback case
1089 		 * since there is nothing else to clear its status if the
1090 		 * link is reused.
1091 		 */
1092 		if (link->callback) {
1093 			link->state = MTX_LINK_CALLEDBACK;
1094 			link->callback(link, link->arg, ENOLCK);
1095 		} else {
1096 			link->state = MTX_LINK_ABORTED;
1097 			wakeup(link);
1098 		}
1099 		++mtx_wakeup_count;
1100 		break;
1101 	case MTX_LINK_LINKED_SH:
1102 		/*
1103 		 * de-link, mark aborted, and potentially wakeup the thread
1104 		 * or issue the callback.
1105 		 */
1106 		if (link->next == link) {
1107 			if (mtx->mtx_shlink == link) {
1108 				mtx->mtx_shlink = NULL;
1109 				nlock |= MTX_SHWANTED;	/* to clear */
1110 			}
1111 		} else {
1112 			if (mtx->mtx_shlink == link)
1113 				mtx->mtx_shlink = link->next;
1114 			link->next->prev = link->prev;
1115 			link->prev->next = link->next;
1116 		}
1117 
1118 		/*
1119 		 * When aborting the async callback is still made.  We must
1120 		 * not set the link status to ABORTED in the callback case
1121 		 * since there is nothing else to clear its status if the
1122 		 * link is reused.
1123 		 */
1124 		if (link->callback) {
1125 			link->state = MTX_LINK_CALLEDBACK;
1126 			link->callback(link, link->arg, ENOLCK);
1127 		} else {
1128 			link->state = MTX_LINK_ABORTED;
1129 			wakeup(link);
1130 		}
1131 		++mtx_wakeup_count;
1132 		break;
1133 	case MTX_LINK_ACQUIRED:
1134 	case MTX_LINK_CALLEDBACK:
1135 		/*
1136 		 * Too late, the lock was acquired.  Let it complete.
1137 		 */
1138 		break;
1139 	default:
1140 		/*
1141 		 * link already aborted, do nothing.
1142 		 */
1143 		break;
1144 	}
1145 	atomic_clear_int(&mtx->mtx_lock, nlock);
1146 	--td->td_critcount;
1147 }
1148
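#if 0
/*
 * Hypothetical teardown of a pending asynchronous lock request, kept
 * under #if 0 (not compiled).  The abort does not synchronize by
 * itself; mtx_wait_link() waits for the request to resolve.  Whether
 * the lock was acquired before the abort took effect is reported to
 * the async callback (error 0 vs ENOLCK), so the callback remains
 * responsible for releasing the lock in the former case.
 */
static void
example_abort_async_request(mtx_t *mtx, mtx_link_t *link)
{
	mtx_abort_link(mtx, link);
	(void)mtx_wait_link(mtx, link, 0, 0);
	/* the link is now idle and may be reused */
}
#endif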