xref: /illumos-gate/usr/src/lib/libc/port/threads/rwlock.c (revision d362b749)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include "lint.h"
30 #include "thr_uberdata.h"
31 #include <sys/sdt.h>
32 
/*
 * Request codes passed as the rd_wr argument of the common locking
 * routines.  TRY_FLAG is OR-ed into the basic request code to select
 * the non-blocking (try) form of the operation.
 */
#define	READ_LOCK		0
#define	WRITE_LOCK		1
#define	TRY_FLAG		0x10
#define	READ_LOCK_TRY		(TRY_FLAG | READ_LOCK)
#define	WRITE_LOCK_TRY		(TRY_FLAG | WRITE_LOCK)

#define	NLOCKS	4	/* initial number of readlock_t structs allocated */

/*
 * Sanity check on a snapshot of the rwlock state word:  whenever the
 * write-locked bit is set, the only other bit that may be set is the
 * has-waiters bit.
 */
#define	ASSERT_CONSISTENT_STATE(readers)			\
	ASSERT(((readers) & URW_WRITE_LOCKED) == 0 ||		\
	    ((readers) & ~URW_HAS_WAITERS) == URW_WRITE_LOCKED)
44 
45 /*
46  * Find/allocate an entry for rwlp in our array of rwlocks held for reading.
47  * We must be deferring signals for this to be safe.
48  * Else if we are returning an entry with ul_rdlockcnt == 0,
49  * it could be reassigned behind our back in a signal handler.
50  */
51 static readlock_t *
52 rwl_entry(rwlock_t *rwlp)
53 {
54 	ulwp_t *self = curthread;
55 	readlock_t *remembered = NULL;
56 	readlock_t *readlockp;
57 	uint_t nlocks;
58 
59 	/* we must be deferring signals */
60 	ASSERT((self->ul_critical + self->ul_sigdefer) != 0);
61 
62 	if ((nlocks = self->ul_rdlockcnt) != 0)
63 		readlockp = self->ul_readlock.array;
64 	else {
65 		nlocks = 1;
66 		readlockp = &self->ul_readlock.single;
67 	}
68 
69 	for (; nlocks; nlocks--, readlockp++) {
70 		if (readlockp->rd_rwlock == rwlp)
71 			return (readlockp);
72 		if (readlockp->rd_count == 0 && remembered == NULL)
73 			remembered = readlockp;
74 	}
75 	if (remembered != NULL) {
76 		remembered->rd_rwlock = rwlp;
77 		return (remembered);
78 	}
79 
80 	/*
81 	 * No entry available.  Allocate more space, converting the single
82 	 * readlock_t entry into an array of readlock_t entries if necessary.
83 	 */
84 	if ((nlocks = self->ul_rdlockcnt) == 0) {
85 		/*
86 		 * Initial allocation of the readlock_t array.
87 		 * Convert the single entry into an array.
88 		 */
89 		self->ul_rdlockcnt = nlocks = NLOCKS;
90 		readlockp = lmalloc(nlocks * sizeof (readlock_t));
91 		/*
92 		 * The single readlock_t becomes the first entry in the array.
93 		 */
94 		*readlockp = self->ul_readlock.single;
95 		self->ul_readlock.single.rd_count = 0;
96 		self->ul_readlock.array = readlockp;
97 		/*
98 		 * Return the next available entry in the array.
99 		 */
100 		(++readlockp)->rd_rwlock = rwlp;
101 		return (readlockp);
102 	}
103 	/*
104 	 * Reallocate the array, double the size each time.
105 	 */
106 	readlockp = lmalloc(nlocks * 2 * sizeof (readlock_t));
107 	(void) _memcpy(readlockp, self->ul_readlock.array,
108 		nlocks * sizeof (readlock_t));
109 	lfree(self->ul_readlock.array, nlocks * sizeof (readlock_t));
110 	self->ul_readlock.array = readlockp;
111 	self->ul_rdlockcnt *= 2;
112 	/*
113 	 * Return the next available entry in the newly allocated array.
114 	 */
115 	(readlockp += nlocks)->rd_rwlock = rwlp;
116 	return (readlockp);
117 }
118 
119 /*
120  * Free the array of rwlocks held for reading.
121  */
122 void
123 rwl_free(ulwp_t *ulwp)
124 {
125 	uint_t nlocks;
126 
127 	if ((nlocks = ulwp->ul_rdlockcnt) != 0)
128 		lfree(ulwp->ul_readlock.array, nlocks * sizeof (readlock_t));
129 	ulwp->ul_rdlockcnt = 0;
130 	ulwp->ul_readlock.single.rd_rwlock = NULL;
131 	ulwp->ul_readlock.single.rd_count = 0;
132 }
133 
134 /*
135  * Check if a reader version of the lock is held by the current thread.
136  * rw_read_is_held() is private to libc.
137  */
138 #pragma weak rw_read_is_held = _rw_read_held
139 #pragma weak rw_read_held = _rw_read_held
140 int
141 _rw_read_held(rwlock_t *rwlp)
142 {
143 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
144 	uint32_t readers;
145 	ulwp_t *self = curthread;
146 	readlock_t *readlockp;
147 	uint_t nlocks;
148 	int rval = 0;
149 
150 	no_preempt(self);
151 
152 	readers = *rwstate;
153 	ASSERT_CONSISTENT_STATE(readers);
154 	if (!(readers & URW_WRITE_LOCKED) &&
155 	    (readers & URW_READERS_MASK) != 0) {
156 		/*
157 		 * The lock is held for reading by some thread.
158 		 * Search our array of rwlocks held for reading for a match.
159 		 */
160 		if ((nlocks = self->ul_rdlockcnt) != 0)
161 			readlockp = self->ul_readlock.array;
162 		else {
163 			nlocks = 1;
164 			readlockp = &self->ul_readlock.single;
165 		}
166 		for (; nlocks; nlocks--, readlockp++) {
167 			if (readlockp->rd_rwlock == rwlp) {
168 				if (readlockp->rd_count)
169 					rval = 1;
170 				break;
171 			}
172 		}
173 	}
174 
175 	preempt(self);
176 	return (rval);
177 }
178 
179 /*
180  * Check if a writer version of the lock is held by the current thread.
181  * rw_write_is_held() is private to libc.
182  */
183 #pragma weak rw_write_is_held = _rw_write_held
184 #pragma weak rw_write_held = _rw_write_held
185 int
186 _rw_write_held(rwlock_t *rwlp)
187 {
188 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
189 	uint32_t readers;
190 	ulwp_t *self = curthread;
191 	int rval;
192 
193 	no_preempt(self);
194 
195 	readers = *rwstate;
196 	ASSERT_CONSISTENT_STATE(readers);
197 	rval = ((readers & URW_WRITE_LOCKED) &&
198 	    rwlp->rwlock_owner == (uintptr_t)self &&
199 	    (rwlp->rwlock_type == USYNC_THREAD ||
200 	    rwlp->rwlock_ownerpid == self->ul_uberdata->pid));
201 
202 	preempt(self);
203 	return (rval);
204 }
205 
206 #pragma weak rwlock_init = __rwlock_init
207 #pragma weak _rwlock_init = __rwlock_init
208 /* ARGSUSED2 */
209 int
210 __rwlock_init(rwlock_t *rwlp, int type, void *arg)
211 {
212 	if (type != USYNC_THREAD && type != USYNC_PROCESS)
213 		return (EINVAL);
214 	/*
215 	 * Once reinitialized, we can no longer be holding a read or write lock.
216 	 * We can do nothing about other threads that are holding read locks.
217 	 */
218 	sigoff(curthread);
219 	rwl_entry(rwlp)->rd_count = 0;
220 	sigon(curthread);
221 	(void) _memset(rwlp, 0, sizeof (*rwlp));
222 	rwlp->rwlock_type = (uint16_t)type;
223 	rwlp->rwlock_magic = RWL_MAGIC;
224 	rwlp->mutex.mutex_type = (uint8_t)type;
225 	rwlp->mutex.mutex_flag = LOCK_INITED;
226 	rwlp->mutex.mutex_magic = MUTEX_MAGIC;
227 	return (0);
228 }
229 
230 #pragma weak rwlock_destroy = __rwlock_destroy
231 #pragma weak _rwlock_destroy = __rwlock_destroy
232 #pragma weak pthread_rwlock_destroy = __rwlock_destroy
233 #pragma weak _pthread_rwlock_destroy = __rwlock_destroy
234 int
235 __rwlock_destroy(rwlock_t *rwlp)
236 {
237 	/*
238 	 * Once destroyed, we can no longer be holding a read or write lock.
239 	 * We can do nothing about other threads that are holding read locks.
240 	 */
241 	sigoff(curthread);
242 	rwl_entry(rwlp)->rd_count = 0;
243 	sigon(curthread);
244 	rwlp->rwlock_magic = 0;
245 	tdb_sync_obj_deregister(rwlp);
246 	return (0);
247 }
248 
249 /*
250  * Attempt to acquire a readers lock.  Return true on success.
251  */
252 static int
253 read_lock_try(rwlock_t *rwlp, int ignore_waiters_flag)
254 {
255 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
256 	uint32_t mask = ignore_waiters_flag?
257 		URW_WRITE_LOCKED : (URW_HAS_WAITERS | URW_WRITE_LOCKED);
258 	uint32_t readers;
259 	ulwp_t *self = curthread;
260 
261 	no_preempt(self);
262 	while (((readers = *rwstate) & mask) == 0) {
263 		if (atomic_cas_32(rwstate, readers, readers + 1) == readers) {
264 			preempt(self);
265 			return (1);
266 		}
267 	}
268 	preempt(self);
269 	return (0);
270 }
271 
272 /*
273  * Attempt to release a reader lock.  Return true on success.
274  */
275 static int
276 read_unlock_try(rwlock_t *rwlp)
277 {
278 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
279 	uint32_t readers;
280 	ulwp_t *self = curthread;
281 
282 	no_preempt(self);
283 	while (((readers = *rwstate) & URW_HAS_WAITERS) == 0) {
284 		if (atomic_cas_32(rwstate, readers, readers - 1) == readers) {
285 			preempt(self);
286 			return (1);
287 		}
288 	}
289 	preempt(self);
290 	return (0);
291 }
292 
293 /*
294  * Attempt to acquire a writer lock.  Return true on success.
295  */
296 static int
297 write_lock_try(rwlock_t *rwlp, int ignore_waiters_flag)
298 {
299 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
300 	uint32_t mask = ignore_waiters_flag?
301 		(URW_WRITE_LOCKED | URW_READERS_MASK) :
302 		(URW_HAS_WAITERS | URW_WRITE_LOCKED | URW_READERS_MASK);
303 	ulwp_t *self = curthread;
304 	uint32_t readers;
305 
306 	no_preempt(self);
307 	while (((readers = *rwstate) & mask) == 0) {
308 		if (atomic_cas_32(rwstate, readers, readers | URW_WRITE_LOCKED)
309 		    == readers) {
310 			preempt(self);
311 			return (1);
312 		}
313 	}
314 	preempt(self);
315 	return (0);
316 }
317 
318 /*
319  * Attempt to release a writer lock.  Return true on success.
320  */
321 static int
322 write_unlock_try(rwlock_t *rwlp)
323 {
324 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
325 	uint32_t readers;
326 	ulwp_t *self = curthread;
327 
328 	no_preempt(self);
329 	while (((readers = *rwstate) & URW_HAS_WAITERS) == 0) {
330 		if (atomic_cas_32(rwstate, readers, 0) == readers) {
331 			preempt(self);
332 			return (1);
333 		}
334 	}
335 	preempt(self);
336 	return (0);
337 }
338 
339 /*
340  * Wake up thread(s) sleeping on the rwlock queue and then
341  * drop the queue lock.  Return non-zero if we wake up someone.
342  * This is called when a thread releases a lock that appears to have waiters.
343  */
344 static int
345 rw_queue_release(queue_head_t *qp, rwlock_t *rwlp)
346 {
347 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
348 	uint32_t readers;
349 	uint32_t writers;
350 	int nlwpid = 0;
351 	int maxlwps = MAXLWPS;
352 	ulwp_t *self;
353 	ulwp_t **ulwpp;
354 	ulwp_t *ulwp;
355 	ulwp_t *prev = NULL;
356 	lwpid_t buffer[MAXLWPS];
357 	lwpid_t *lwpid = buffer;
358 
359 	readers = *rwstate;
360 	ASSERT_CONSISTENT_STATE(readers);
361 	if (!(readers & URW_HAS_WAITERS)) {
362 		queue_unlock(qp);
363 		return (0);
364 	}
365 	readers &= URW_READERS_MASK;
366 	writers = 0;
367 
368 	/*
369 	 * Walk the list of waiters and prepare to wake up as
370 	 * many readers as we encounter before encountering
371 	 * a writer.  If the first thread on the list is a
372 	 * writer, stop there and wake it up.
373 	 *
374 	 * We keep track of lwpids that are to be unparked in lwpid[].
375 	 * __lwp_unpark_all() is called to unpark all of them after
376 	 * they have been removed from the sleep queue and the sleep
377 	 * queue lock has been dropped.  If we run out of space in our
378 	 * on-stack buffer, we need to allocate more but we can't call
379 	 * lmalloc() because we are holding a queue lock when the overflow
380 	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
381 	 * either because the application may have allocated a small
382 	 * stack and we don't want to overrun the stack.  So we call
383 	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
384 	 * system call directly since that path acquires no locks.
385 	 */
386 	ulwpp = &qp->qh_head;
387 	while ((ulwp = *ulwpp) != NULL) {
388 		if (ulwp->ul_wchan != rwlp) {
389 			prev = ulwp;
390 			ulwpp = &ulwp->ul_link;
391 			continue;
392 		}
393 		if (ulwp->ul_writer) {
394 			if (writers != 0 || readers != 0)
395 				break;
396 			/* one writer to wake */
397 			writers++;
398 		} else {
399 			if (writers != 0)
400 				break;
401 			/* at least one reader to wake */
402 			readers++;
403 			if (nlwpid == maxlwps)
404 				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
405 		}
406 		(void) queue_unlink(qp, ulwpp, prev);
407 		lwpid[nlwpid++] = ulwp->ul_lwpid;
408 	}
409 	if (ulwp == NULL)
410 		atomic_and_32(rwstate, ~URW_HAS_WAITERS);
411 	if (nlwpid == 0) {
412 		queue_unlock(qp);
413 	} else {
414 		self = curthread;
415 		no_preempt(self);
416 		queue_unlock(qp);
417 		if (nlwpid == 1)
418 			(void) __lwp_unpark(lwpid[0]);
419 		else
420 			(void) __lwp_unpark_all(lwpid, nlwpid);
421 		preempt(self);
422 	}
423 	if (lwpid != buffer)
424 		(void) _private_munmap(lwpid, maxlwps * sizeof (lwpid_t));
425 	return (nlwpid != 0);
426 }
427 
428 /*
429  * Common code for rdlock, timedrdlock, wrlock, timedwrlock, tryrdlock,
430  * and trywrlock for process-shared (USYNC_PROCESS) rwlocks.
431  *
432  * Note: if the lock appears to be contended we call __lwp_rwlock_rdlock()
433  * or __lwp_rwlock_wrlock() holding the mutex. These return with the mutex
434  * released, and if they need to sleep will release the mutex first. In the
435  * event of a spurious wakeup, these will return EAGAIN (because it is much
436  * easier for us to re-acquire the mutex here).
437  */
438 int
439 shared_rwlock_lock(rwlock_t *rwlp, timespec_t *tsp, int rd_wr)
440 {
441 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
442 	mutex_t *mp = &rwlp->mutex;
443 	/* LINTED set but not used */
444 	uint32_t readers;
445 	int try_flag;
446 	int error;
447 
448 	try_flag = (rd_wr & TRY_FLAG);
449 	rd_wr &= ~TRY_FLAG;
450 	ASSERT(rd_wr == READ_LOCK || rd_wr == WRITE_LOCK);
451 
452 	if (!try_flag) {
453 		DTRACE_PROBE2(plockstat, rw__block, rwlp, rd_wr);
454 	}
455 
456 	do {
457 		if (try_flag && (*rwstate & URW_WRITE_LOCKED)) {
458 			error = EBUSY;
459 			break;
460 		}
461 		if ((error = _private_mutex_lock(mp)) != 0)
462 			break;
463 		if (rd_wr == READ_LOCK) {
464 			if (read_lock_try(rwlp, 0)) {
465 				(void) _private_mutex_unlock(mp);
466 				break;
467 			}
468 		} else {
469 			if (write_lock_try(rwlp, 0)) {
470 				(void) _private_mutex_unlock(mp);
471 				break;
472 			}
473 		}
474 		atomic_or_32(rwstate, URW_HAS_WAITERS);
475 		readers = *rwstate;
476 		ASSERT_CONSISTENT_STATE(readers);
477 		/*
478 		 * The calls to __lwp_rwlock_*() below will release the mutex,
479 		 * so we need a dtrace probe here.
480 		 */
481 		mp->mutex_owner = 0;
482 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
483 		/*
484 		 * The waiters bit may be inaccurate.
485 		 * Only the kernel knows for sure.
486 		 */
487 		if (rd_wr == READ_LOCK) {
488 			if (try_flag)
489 				error = __lwp_rwlock_tryrdlock(rwlp);
490 			else
491 				error = __lwp_rwlock_rdlock(rwlp, tsp);
492 		} else {
493 			if (try_flag)
494 				error = __lwp_rwlock_trywrlock(rwlp);
495 			else
496 				error = __lwp_rwlock_wrlock(rwlp, tsp);
497 		}
498 	} while (error == EAGAIN || error == EINTR);
499 
500 	if (!try_flag) {
501 		DTRACE_PROBE3(plockstat, rw__blocked, rwlp, rd_wr, error == 0);
502 	}
503 
504 	return (error);
505 }
506 
507 /*
508  * Common code for rdlock, timedrdlock, wrlock, timedwrlock, tryrdlock,
509  * and trywrlock for process-private (USYNC_THREAD) rwlocks.
510  */
511 int
512 rwlock_lock(rwlock_t *rwlp, timespec_t *tsp, int rd_wr)
513 {
514 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
515 	uint32_t readers;
516 	ulwp_t *self = curthread;
517 	queue_head_t *qp;
518 	ulwp_t *ulwp;
519 	int try_flag;
520 	int error = 0;
521 
522 	try_flag = (rd_wr & TRY_FLAG);
523 	rd_wr &= ~TRY_FLAG;
524 	ASSERT(rd_wr == READ_LOCK || rd_wr == WRITE_LOCK);
525 
526 	if (!try_flag) {
527 		DTRACE_PROBE2(plockstat, rw__block, rwlp, rd_wr);
528 	}
529 
530 	qp = queue_lock(rwlp, MX);
531 retry:
532 	while (error == 0) {
533 		if (rd_wr == READ_LOCK) {
534 			if (read_lock_try(rwlp, 0))
535 				goto out;
536 		} else {
537 			if (write_lock_try(rwlp, 0))
538 				goto out;
539 		}
540 		atomic_or_32(rwstate, URW_HAS_WAITERS);
541 		readers = *rwstate;
542 		ASSERT_CONSISTENT_STATE(readers);
543 		if ((readers & URW_WRITE_LOCKED) ||
544 		    (rd_wr == WRITE_LOCK &&
545 		    (readers & URW_READERS_MASK) != 0))
546 			/* EMPTY */;	/* somebody holds the lock */
547 		else if ((ulwp = queue_waiter(qp, rwlp)) == NULL) {
548 			atomic_and_32(rwstate, ~URW_HAS_WAITERS);
549 			break;		/* no queued waiters */
550 		} else {
551 			int our_pri = real_priority(self);
552 			int his_pri = real_priority(ulwp);
553 
554 			if (rd_wr == WRITE_LOCK) {
555 				/*
556 				 * We defer to a queued thread that has
557 				 * a higher priority than ours.
558 				 */
559 				if (his_pri <= our_pri)
560 					break;
561 			} else {
562 				/*
563 				 * We defer to a queued thread that has
564 				 * a higher priority than ours or that
565 				 * is a writer whose priority equals ours.
566 				 */
567 				if (his_pri < our_pri ||
568 				    (his_pri == our_pri && !ulwp->ul_writer))
569 					break;
570 			}
571 		}
572 		/*
573 		 * We are about to block.
574 		 * If we're doing a trylock, return EBUSY instead.
575 		 */
576 		if (try_flag) {
577 			error = EBUSY;
578 			break;
579 		}
580 		/*
581 		 * Enqueue writers ahead of readers of the
582 		 * same priority.
583 		 */
584 		self->ul_writer = rd_wr;	/* *must* be 0 or 1 */
585 		enqueue(qp, self, rwlp, MX);
586 		set_parking_flag(self, 1);
587 		queue_unlock(qp);
588 		if ((error = __lwp_park(tsp, 0)) == EINTR)
589 			error = 0;
590 		self->ul_writer = 0;
591 		set_parking_flag(self, 0);
592 		qp = queue_lock(rwlp, MX);
593 		if (self->ul_sleepq && dequeue_self(qp, rwlp) == 0)
594 			atomic_and_32(rwstate, ~URW_HAS_WAITERS);
595 	}
596 
597 	if (error == 0) {
598 		if (rd_wr == READ_LOCK) {
599 			if (!read_lock_try(rwlp, 1))
600 				goto retry;
601 		} else {
602 			if (!write_lock_try(rwlp, 1))
603 				goto retry;
604 		}
605 	}
606 
607 out:
608 	queue_unlock(qp);
609 
610 	if (!try_flag) {
611 		DTRACE_PROBE3(plockstat, rw__blocked, rwlp, rd_wr, error == 0);
612 	}
613 
614 	return (error);
615 }
616 
617 int
618 rw_rdlock_impl(rwlock_t *rwlp, timespec_t *tsp)
619 {
620 	ulwp_t *self = curthread;
621 	uberdata_t *udp = self->ul_uberdata;
622 	readlock_t *readlockp;
623 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
624 	int error;
625 
626 	/*
627 	 * If we already hold a readers lock on this rwlock,
628 	 * just increment our reference count and return.
629 	 */
630 	sigoff(self);
631 	readlockp = rwl_entry(rwlp);
632 	if (readlockp->rd_count != 0) {
633 		if (readlockp->rd_count == READ_LOCK_MAX) {
634 			sigon(self);
635 			error = EAGAIN;
636 			goto out;
637 		}
638 		sigon(self);
639 		error = 0;
640 		goto out;
641 	}
642 	sigon(self);
643 
644 	/*
645 	 * If we hold the writer lock, bail out.
646 	 */
647 	if (rw_write_is_held(rwlp)) {
648 		if (self->ul_error_detection)
649 			rwlock_error(rwlp, "rwlock_rdlock",
650 			    "calling thread owns the writer lock");
651 		error = EDEADLK;
652 		goto out;
653 	}
654 
655 	if (read_lock_try(rwlp, 0))
656 		error = 0;
657 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
658 		error = shared_rwlock_lock(rwlp, tsp, READ_LOCK);
659 	else						/* user-level */
660 		error = rwlock_lock(rwlp, tsp, READ_LOCK);
661 
662 out:
663 	if (error == 0) {
664 		sigoff(self);
665 		rwl_entry(rwlp)->rd_count++;
666 		sigon(self);
667 		if (rwsp)
668 			tdb_incr(rwsp->rw_rdlock);
669 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, READ_LOCK);
670 	} else {
671 		DTRACE_PROBE3(plockstat, rw__error, rwlp, READ_LOCK, error);
672 	}
673 
674 	return (error);
675 }
676 
677 #pragma weak rw_rdlock = __rw_rdlock
678 #pragma weak _rw_rdlock = __rw_rdlock
679 #pragma weak pthread_rwlock_rdlock = __rw_rdlock
680 #pragma weak _pthread_rwlock_rdlock = __rw_rdlock
681 int
682 __rw_rdlock(rwlock_t *rwlp)
683 {
684 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
685 	return (rw_rdlock_impl(rwlp, NULL));
686 }
687 
688 void
689 lrw_rdlock(rwlock_t *rwlp)
690 {
691 	enter_critical(curthread);
692 	(void) rw_rdlock_impl(rwlp, NULL);
693 }
694 
695 #pragma weak pthread_rwlock_reltimedrdlock_np = \
696 	_pthread_rwlock_reltimedrdlock_np
697 int
698 _pthread_rwlock_reltimedrdlock_np(rwlock_t *rwlp, const timespec_t *reltime)
699 {
700 	timespec_t tslocal = *reltime;
701 	int error;
702 
703 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
704 	error = rw_rdlock_impl(rwlp, &tslocal);
705 	if (error == ETIME)
706 		error = ETIMEDOUT;
707 	return (error);
708 }
709 
710 #pragma weak pthread_rwlock_timedrdlock = _pthread_rwlock_timedrdlock
711 int
712 _pthread_rwlock_timedrdlock(rwlock_t *rwlp, const timespec_t *abstime)
713 {
714 	timespec_t tslocal;
715 	int error;
716 
717 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
718 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
719 	error = rw_rdlock_impl(rwlp, &tslocal);
720 	if (error == ETIME)
721 		error = ETIMEDOUT;
722 	return (error);
723 }
724 
725 int
726 rw_wrlock_impl(rwlock_t *rwlp, timespec_t *tsp)
727 {
728 	ulwp_t *self = curthread;
729 	uberdata_t *udp = self->ul_uberdata;
730 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
731 	int error;
732 
733 	/*
734 	 * If we hold a readers lock on this rwlock, bail out.
735 	 */
736 	if (rw_read_is_held(rwlp)) {
737 		if (self->ul_error_detection)
738 			rwlock_error(rwlp, "rwlock_wrlock",
739 			    "calling thread owns the readers lock");
740 		error = EDEADLK;
741 		goto out;
742 	}
743 
744 	/*
745 	 * If we hold the writer lock, bail out.
746 	 */
747 	if (rw_write_is_held(rwlp)) {
748 		if (self->ul_error_detection)
749 			rwlock_error(rwlp, "rwlock_wrlock",
750 			    "calling thread owns the writer lock");
751 		error = EDEADLK;
752 		goto out;
753 	}
754 
755 	if (write_lock_try(rwlp, 0))
756 		error = 0;
757 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
758 		error = shared_rwlock_lock(rwlp, tsp, WRITE_LOCK);
759 	else						/* user-level */
760 		error = rwlock_lock(rwlp, tsp, WRITE_LOCK);
761 
762 out:
763 	if (error == 0) {
764 		rwlp->rwlock_owner = (uintptr_t)self;
765 		if (rwlp->rwlock_type == USYNC_PROCESS)
766 			rwlp->rwlock_ownerpid = udp->pid;
767 		if (rwsp) {
768 			tdb_incr(rwsp->rw_wrlock);
769 			rwsp->rw_wrlock_begin_hold = gethrtime();
770 		}
771 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, WRITE_LOCK);
772 	} else {
773 		DTRACE_PROBE3(plockstat, rw__error, rwlp, WRITE_LOCK, error);
774 	}
775 	return (error);
776 }
777 
778 #pragma weak rw_wrlock = __rw_wrlock
779 #pragma weak _rw_wrlock = __rw_wrlock
780 #pragma weak pthread_rwlock_wrlock = __rw_wrlock
781 #pragma weak _pthread_rwlock_wrlock = __rw_wrlock
782 int
783 __rw_wrlock(rwlock_t *rwlp)
784 {
785 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
786 	return (rw_wrlock_impl(rwlp, NULL));
787 }
788 
789 void
790 lrw_wrlock(rwlock_t *rwlp)
791 {
792 	enter_critical(curthread);
793 	(void) rw_wrlock_impl(rwlp, NULL);
794 }
795 
796 #pragma weak pthread_rwlock_reltimedwrlock_np = \
797 	_pthread_rwlock_reltimedwrlock_np
798 int
799 _pthread_rwlock_reltimedwrlock_np(rwlock_t *rwlp, const timespec_t *reltime)
800 {
801 	timespec_t tslocal = *reltime;
802 	int error;
803 
804 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
805 	error = rw_wrlock_impl(rwlp, &tslocal);
806 	if (error == ETIME)
807 		error = ETIMEDOUT;
808 	return (error);
809 }
810 
811 #pragma weak pthread_rwlock_timedwrlock = _pthread_rwlock_timedwrlock
812 int
813 _pthread_rwlock_timedwrlock(rwlock_t *rwlp, const timespec_t *abstime)
814 {
815 	timespec_t tslocal;
816 	int error;
817 
818 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
819 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
820 	error = rw_wrlock_impl(rwlp, &tslocal);
821 	if (error == ETIME)
822 		error = ETIMEDOUT;
823 	return (error);
824 }
825 
826 #pragma weak rw_tryrdlock = __rw_tryrdlock
827 #pragma weak _rw_tryrdlock = __rw_tryrdlock
828 #pragma weak pthread_rwlock_tryrdlock = __rw_tryrdlock
829 #pragma weak _pthread_rwlock_tryrdlock = __rw_tryrdlock
830 int
831 __rw_tryrdlock(rwlock_t *rwlp)
832 {
833 	ulwp_t *self = curthread;
834 	uberdata_t *udp = self->ul_uberdata;
835 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
836 	readlock_t *readlockp;
837 	int error;
838 
839 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
840 
841 	if (rwsp)
842 		tdb_incr(rwsp->rw_rdlock_try);
843 
844 	/*
845 	 * If we already hold a readers lock on this rwlock,
846 	 * just increment our reference count and return.
847 	 */
848 	sigoff(self);
849 	readlockp = rwl_entry(rwlp);
850 	if (readlockp->rd_count != 0) {
851 		if (readlockp->rd_count == READ_LOCK_MAX) {
852 			sigon(self);
853 			error = EAGAIN;
854 			goto out;
855 		}
856 		sigon(self);
857 		error = 0;
858 		goto out;
859 	}
860 	sigon(self);
861 
862 	if (read_lock_try(rwlp, 0))
863 		error = 0;
864 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
865 		error = shared_rwlock_lock(rwlp, NULL, READ_LOCK_TRY);
866 	else						/* user-level */
867 		error = rwlock_lock(rwlp, NULL, READ_LOCK_TRY);
868 
869 out:
870 	if (error == 0) {
871 		sigoff(self);
872 		rwl_entry(rwlp)->rd_count++;
873 		sigon(self);
874 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, READ_LOCK);
875 	} else {
876 		if (rwsp)
877 			tdb_incr(rwsp->rw_rdlock_try_fail);
878 		if (error != EBUSY) {
879 			DTRACE_PROBE3(plockstat, rw__error, rwlp, READ_LOCK,
880 			    error);
881 		}
882 	}
883 
884 	return (error);
885 }
886 
887 #pragma weak rw_trywrlock = __rw_trywrlock
888 #pragma weak _rw_trywrlock = __rw_trywrlock
889 #pragma weak pthread_rwlock_trywrlock = __rw_trywrlock
890 #pragma weak _pthread_rwlock_trywrlock = __rw_trywrlock
891 int
892 __rw_trywrlock(rwlock_t *rwlp)
893 {
894 	ulwp_t *self = curthread;
895 	uberdata_t *udp = self->ul_uberdata;
896 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
897 	int error;
898 
899 	ASSERT(!self->ul_critical || self->ul_bindflags);
900 
901 	if (rwsp)
902 		tdb_incr(rwsp->rw_wrlock_try);
903 
904 	if (write_lock_try(rwlp, 0))
905 		error = 0;
906 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
907 		error = shared_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY);
908 	else						/* user-level */
909 		error = rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY);
910 
911 	if (error == 0) {
912 		rwlp->rwlock_owner = (uintptr_t)self;
913 		if (rwlp->rwlock_type == USYNC_PROCESS)
914 			rwlp->rwlock_ownerpid = udp->pid;
915 		if (rwsp)
916 			rwsp->rw_wrlock_begin_hold = gethrtime();
917 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, WRITE_LOCK);
918 	} else {
919 		if (rwsp)
920 			tdb_incr(rwsp->rw_wrlock_try_fail);
921 		if (error != EBUSY) {
922 			DTRACE_PROBE3(plockstat, rw__error, rwlp, WRITE_LOCK,
923 			    error);
924 		}
925 	}
926 	return (error);
927 }
928 
929 #pragma weak rw_unlock = __rw_unlock
930 #pragma weak _rw_unlock = __rw_unlock
931 #pragma weak pthread_rwlock_unlock = __rw_unlock
932 #pragma weak _pthread_rwlock_unlock = __rw_unlock
933 int
934 __rw_unlock(rwlock_t *rwlp)
935 {
936 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
937 	uint32_t readers;
938 	ulwp_t *self = curthread;
939 	uberdata_t *udp = self->ul_uberdata;
940 	tdb_rwlock_stats_t *rwsp;
941 	queue_head_t *qp;
942 	int rd_wr;
943 	int waked = 0;
944 
945 	readers = *rwstate;
946 	ASSERT_CONSISTENT_STATE(readers);
947 	if (readers & URW_WRITE_LOCKED) {
948 		rd_wr = WRITE_LOCK;
949 		readers = 0;
950 	} else {
951 		rd_wr = READ_LOCK;
952 		readers &= URW_READERS_MASK;
953 	}
954 
955 	if (rd_wr == WRITE_LOCK) {
956 		/*
957 		 * Since the writer lock is held, we'd better be
958 		 * holding it, else we cannot legitimately be here.
959 		 */
960 		if (!rw_write_is_held(rwlp)) {
961 			if (self->ul_error_detection)
962 				rwlock_error(rwlp, "rwlock_unlock",
963 				    "writer lock held, "
964 				    "but not by the calling thread");
965 			return (EPERM);
966 		}
967 		if ((rwsp = RWLOCK_STATS(rwlp, udp)) != NULL) {
968 			if (rwsp->rw_wrlock_begin_hold)
969 				rwsp->rw_wrlock_hold_time +=
970 				    gethrtime() - rwsp->rw_wrlock_begin_hold;
971 			rwsp->rw_wrlock_begin_hold = 0;
972 		}
973 		rwlp->rwlock_owner = 0;
974 		rwlp->rwlock_ownerpid = 0;
975 	} else if (readers > 0) {
976 		/*
977 		 * A readers lock is held; if we don't hold one, bail out.
978 		 */
979 		readlock_t *readlockp;
980 
981 		sigoff(self);
982 		readlockp = rwl_entry(rwlp);
983 		if (readlockp->rd_count == 0) {
984 			sigon(self);
985 			if (self->ul_error_detection)
986 				rwlock_error(rwlp, "rwlock_unlock",
987 				    "readers lock held, "
988 				    "but not by the calling thread");
989 			return (EPERM);
990 		}
991 		/*
992 		 * If we hold more than one readers lock on this rwlock,
993 		 * just decrement our reference count and return.
994 		 */
995 		if (--readlockp->rd_count != 0) {
996 			sigon(self);
997 			goto out;
998 		}
999 		sigon(self);
1000 	} else {
1001 		/*
1002 		 * This is a usage error.
1003 		 * No thread should release an unowned lock.
1004 		 */
1005 		if (self->ul_error_detection)
1006 			rwlock_error(rwlp, "rwlock_unlock", "lock not owned");
1007 		return (EPERM);
1008 	}
1009 
1010 	if (rd_wr == WRITE_LOCK && write_unlock_try(rwlp)) {
1011 		/* EMPTY */;
1012 	} else if (rd_wr == READ_LOCK && read_unlock_try(rwlp)) {
1013 		/* EMPTY */;
1014 	} else if (rwlp->rwlock_type == USYNC_PROCESS) {
1015 		(void) _private_mutex_lock(&rwlp->mutex);
1016 		(void) __lwp_rwlock_unlock(rwlp);
1017 		(void) _private_mutex_unlock(&rwlp->mutex);
1018 		waked = 1;
1019 	} else {
1020 		qp = queue_lock(rwlp, MX);
1021 		if (rd_wr == READ_LOCK)
1022 			atomic_dec_32(rwstate);
1023 		else
1024 			atomic_and_32(rwstate, ~URW_WRITE_LOCKED);
1025 		waked = rw_queue_release(qp, rwlp);
1026 	}
1027 
1028 out:
1029 	DTRACE_PROBE2(plockstat, rw__release, rwlp, rd_wr);
1030 
1031 	/*
1032 	 * Yield to the thread we just waked up, just in case we might
1033 	 * be about to grab the rwlock again immediately upon return.
1034 	 * This is pretty weak but it helps on a uniprocessor and also
1035 	 * when cpu affinity has assigned both ourself and the other
1036 	 * thread to the same CPU.  Note that lwp_yield() will yield
1037 	 * the processor only if the writer is at the same or higher
1038 	 * priority than ourself.  This provides more balanced program
1039 	 * behavior; it doesn't guarantee acquisition of the lock by
1040 	 * the pending writer.
1041 	 */
1042 	if (waked)
1043 		lwp_yield();
1044 	return (0);
1045 }
1046 
1047 void
1048 lrw_unlock(rwlock_t *rwlp)
1049 {
1050 	(void) __rw_unlock(rwlp);
1051 	exit_critical(curthread);
1052 }
1053