1 /*	$NetBSD: linux_futex.c,v 1.34 2016/05/20 13:54:34 chs Exp $ */
2 
3 /*-
4  * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. All advertising materials mentioning features or use of this software
15  *    must display the following acknowledgement:
16  *	This product includes software developed by Emmanuel Dreyfus
17  * 4. The name of the author may not be used to endorse or promote
18  *    products derived from this software without specific prior written
19  *    permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS''
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
25  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <sys/cdefs.h>
35 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.34 2016/05/20 13:54:34 chs Exp $");
36 
37 #include <sys/param.h>
38 #include <sys/time.h>
39 #include <sys/systm.h>
40 #include <sys/proc.h>
41 #include <sys/lwp.h>
42 #include <sys/queue.h>
43 #include <sys/condvar.h>
44 #include <sys/mutex.h>
45 #include <sys/kmem.h>
46 #include <sys/kernel.h>
47 #include <sys/atomic.h>
48 
49 #include <compat/linux/common/linux_types.h>
50 #include <compat/linux/common/linux_emuldata.h>
51 #include <compat/linux/common/linux_exec.h>
52 #include <compat/linux/common/linux_signal.h>
53 #include <compat/linux/common/linux_futex.h>
54 #include <compat/linux/common/linux_sched.h>
55 #include <compat/linux/common/linux_machdep.h>
56 #include <compat/linux/linux_syscallargs.h>
57 
/*
 * Bookkeeping structures for the futex emulation.
 *
 * A "struct futex" exists for every user address that currently has a
 * reference taken on it (see futex_get()/futex_put()); each LWP sleeping
 * in FUTEX_WAIT is described by one "struct waiting_proc" queued on the
 * futex it is waiting for.
 */
58 struct futex;
59 
/* One sleeping waiter; allocated by futex_wp_alloc() for each wait. */
60 struct waiting_proc {
61 	struct futex *wp_futex;		/* futex we are queued on; futex_wake() rewrites it on requeue */
62 	kcondvar_t wp_futex_cv;		/* slept on in futex_sleep(), signalled by futex_wake() */
63 	TAILQ_ENTRY(waiting_proc) wp_list;	/* linkage on f_waiting_proc */
64 	bool wp_onlist;			/* set when queued by futex_sleep(); cleared by futex_wake() when it dequeues us to wake us */
65 };
/* One entry per user address, kept on the global futex_list. */
66 struct futex {
67 	void *f_uaddr;			/* user address; lookup key in futex_get() */
68 	int f_refcount;			/* entry is freed by futex_put() when this drops to 0 */
69 	uint32_t f_bitset;		/* bitset given when the entry was created; only stored, never consulted in this file */
70 	LIST_ENTRY(futex) f_list;	/* linkage on futex_list */
71 	TAILQ_HEAD(, waiting_proc) f_waiting_proc;	/* FIFO of waiters */
72 };
73 
/* All live futex entries, and the mutex that protects them and their waiters. */
74 static LIST_HEAD(futex_list, futex) futex_list;
75 static kmutex_t futex_lock;
76 
/*
 * Statement-like helpers around futex_lock.  FUTEX_LOCKASSERT is used at
 * the top of every helper that expects the caller to hold the lock.
 */
77 #define FUTEX_LOCK	mutex_enter(&futex_lock)
78 #define FUTEX_UNLOCK	mutex_exit(&futex_lock)
79 #define FUTEX_LOCKASSERT	KASSERT(mutex_owned(&futex_lock))
80 
/* Take/release one level of the kernel lock around a futex operation. */
81 #define FUTEX_SYSTEM_LOCK	KERNEL_LOCK(1, NULL)
82 #define FUTEX_SYSTEM_UNLOCK	KERNEL_UNLOCK_ONE(0)
83 
/*
 * Debug tracing.  FUTEXPRINTF takes a complete, parenthesized printf
 * argument list, e.g. FUTEXPRINTF(("x = %d\n", x)); it expands to nothing
 * unless DEBUG_LINUX_FUTEX is defined, and can then be muted at run time
 * by clearing debug_futex.
 */
84 #ifdef DEBUG_LINUX_FUTEX
85 int debug_futex = 1;
86 #define FUTEXPRINTF(a) do { if (debug_futex) printf a; } while (0)
87 #else
88 #define FUTEXPRINTF(a)
89 #endif
90 
91 void
linux_futex_init(void)92 linux_futex_init(void)
93 {
94 	FUTEXPRINTF(("%s: initializing futex\n", __func__));
95 	mutex_init(&futex_lock, MUTEX_DEFAULT, IPL_NONE);
96 }
97 
98 void
linux_futex_fini(void)99 linux_futex_fini(void)
100 {
101 	FUTEXPRINTF(("%s: destroying futex\n", __func__));
102 	mutex_destroy(&futex_lock);
103 }
104 
/*
 * Internal helpers, defined further down in this file.  futex_get(),
 * futex_ref(), futex_put(), futex_sleep() and futex_wake() all assert
 * that futex_lock is held by the caller.
 */
105 static struct waiting_proc *futex_wp_alloc(void);
106 static void futex_wp_free(struct waiting_proc *);
107 static struct futex *futex_get(void *, uint32_t);
108 static void futex_ref(struct futex *);
109 static void futex_put(struct futex *);
110 static int futex_sleep(struct futex **, lwp_t *, int, struct waiting_proc *);
111 static int futex_wake(struct futex *, int, struct futex *, int);
112 static int futex_atomic_op(lwp_t *, int, void *);
113 
114 int
linux_sys_futex(struct lwp * l,const struct linux_sys_futex_args * uap,register_t * retval)115 linux_sys_futex(struct lwp *l, const struct linux_sys_futex_args *uap, register_t *retval)
116 {
117 	/* {
118 		syscallarg(int *) uaddr;
119 		syscallarg(int) op;
120 		syscallarg(int) val;
121 		syscallarg(const struct linux_timespec *) timeout;
122 		syscallarg(int *) uaddr2;
123 		syscallarg(int) val3;
124 	} */
125 	struct linux_timespec lts;
126 	struct timespec ts = { 0, 0 };
127 	int error;
128 
129 	if ((SCARG(uap, op) & LINUX_FUTEX_CMD_MASK) == LINUX_FUTEX_WAIT &&
130 	    SCARG(uap, timeout) != NULL) {
131 		if ((error = copyin(SCARG(uap, timeout),
132 		    &lts, sizeof(lts))) != 0) {
133 			return error;
134 		}
135 		linux_to_native_timespec(&ts, &lts);
136 	}
137 	return linux_do_futex(l, uap, retval, &ts);
138 }
139 
140 int
linux_do_futex(struct lwp * l,const struct linux_sys_futex_args * uap,register_t * retval,struct timespec * ts)141 linux_do_futex(struct lwp *l, const struct linux_sys_futex_args *uap, register_t *retval, struct timespec *ts)
142 {
143 	/* {
144 		syscallarg(int *) uaddr;
145 		syscallarg(int) op;
146 		syscallarg(int) val;
147 		syscallarg(const struct linux_timespec *) timeout;
148 		syscallarg(int *) uaddr2;
149 		syscallarg(int) val3;
150 	} */
151 	int val, val3;
152 	int ret;
153 	int error = 0;
154 	struct futex *f;
155 	struct futex *newf;
156 	int tout;
157 	struct futex *f2;
158 	struct waiting_proc *wp;
159 	int op_ret, cmd;
160 	clockid_t clk;
161 
162 	cmd = SCARG(uap, op) & LINUX_FUTEX_CMD_MASK;
163 	val3 = SCARG(uap, val3);
164 
165 	if (SCARG(uap, op) & LINUX_FUTEX_CLOCK_REALTIME) {
166 		switch (cmd) {
167 		case LINUX_FUTEX_WAIT_BITSET:
168 		case LINUX_FUTEX_WAIT:
169 			clk = CLOCK_REALTIME;
170 			break;
171 		default:
172 			return ENOSYS;
173 		}
174 	} else
175 		clk = CLOCK_MONOTONIC;
176 
177 	/*
178 	 * Our implementation provides only private futexes. Most of the apps
179 	 * should use private futexes but don't claim so. Therefore we treat
180 	 * all futexes as private by clearing the FUTEX_PRIVATE_FLAG. It works
181 	 * in most cases (ie. when futexes are not shared on file descriptor
182 	 * or between different processes).
183 	 *
184 	 * Note that we don't handle bitsets at all at the moment. We need
185 	 * to move from refcounting uaddr's to handling multiple futex entries
186 	 * pointing to the same uaddr, but having possibly different bitmask.
187 	 * Perhaps move to an implementation where each uaddr has a list of
188 	 * futexes.
189 	 */
190 	switch (cmd) {
191 	case LINUX_FUTEX_WAIT:
192 		val3 = FUTEX_BITSET_MATCH_ANY;
193 		/*FALLTHROUGH*/
194 	case LINUX_FUTEX_WAIT_BITSET:
195 		if ((error = ts2timo(clk, 0, ts, &tout, NULL)) != 0) {
196 			if (error != ETIMEDOUT)
197 				return error;
198 			/*
199 			 * If the user process requests a non null timeout,
200 			 * make sure we do not turn it into an infinite
201 			 * timeout because tout is 0.
202 			 *
203 			 * We use a minimal timeout of 1/hz. Maybe it would make
204 			 * sense to just return ETIMEDOUT without sleeping.
205 			 */
206 			if (SCARG(uap, timeout) != NULL)
207 				tout = 1;
208 			else
209 				tout = 0;
210 		}
211 		FUTEX_SYSTEM_LOCK;
212 		if ((error = copyin(SCARG(uap, uaddr),
213 		    &val, sizeof(val))) != 0) {
214 			FUTEX_SYSTEM_UNLOCK;
215 			return error;
216 		}
217 
218 		if (val != SCARG(uap, val)) {
219 			FUTEX_SYSTEM_UNLOCK;
220 			return EWOULDBLOCK;
221 		}
222 
223 		FUTEXPRINTF(("FUTEX_WAIT %d.%d: val = %d, uaddr = %p, "
224 		    "*uaddr = %d, timeout = %lld.%09ld\n",
225 		    l->l_proc->p_pid, l->l_lid, SCARG(uap, val),
226 		    SCARG(uap, uaddr), val, (long long)ts->tv_sec,
227 		    ts->tv_nsec));
228 
229 
230 		wp = futex_wp_alloc();
231 		FUTEX_LOCK;
232 		f = futex_get(SCARG(uap, uaddr), val3);
233 		ret = futex_sleep(&f, l, tout, wp);
234 		futex_put(f);
235 		FUTEX_UNLOCK;
236 		futex_wp_free(wp);
237 
238 		FUTEXPRINTF(("FUTEX_WAIT %d.%d: uaddr = %p, "
239 		    "ret = %d\n", l->l_proc->p_pid, l->l_lid,
240 		    SCARG(uap, uaddr), ret));
241 
242 		FUTEX_SYSTEM_UNLOCK;
243 		switch (ret) {
244 		case EWOULDBLOCK:	/* timeout */
245 			return ETIMEDOUT;
246 			break;
247 		case EINTR:		/* signal */
248 			return EINTR;
249 			break;
250 		case 0:			/* FUTEX_WAKE received */
251 			FUTEXPRINTF(("FUTEX_WAIT %d.%d: uaddr = %p, got it\n",
252 			    l->l_proc->p_pid, l->l_lid, SCARG(uap, uaddr)));
253 			return 0;
254 			break;
255 		default:
256 			FUTEXPRINTF(("FUTEX_WAIT: unexpected ret = %d\n", ret));
257 			break;
258 		}
259 
260 		/* NOTREACHED */
261 		break;
262 
263 	case LINUX_FUTEX_WAKE:
264 		val = FUTEX_BITSET_MATCH_ANY;
265 		/*FALLTHROUGH*/
266 	case LINUX_FUTEX_WAKE_BITSET:
267 		/*
268 		 * XXX: Linux is able cope with different addresses
269 		 * corresponding to the same mapped memory in the sleeping
270 		 * and the waker process(es).
271 		 */
272 		FUTEXPRINTF(("FUTEX_WAKE %d.%d: uaddr = %p, val = %d\n",
273 		    l->l_proc->p_pid, l->l_lid,
274 		    SCARG(uap, uaddr), SCARG(uap, val)));
275 
276 		FUTEX_SYSTEM_LOCK;
277 		FUTEX_LOCK;
278 		f = futex_get(SCARG(uap, uaddr), val3);
279 		*retval = futex_wake(f, SCARG(uap, val), NULL, 0);
280 		futex_put(f);
281 		FUTEX_UNLOCK;
282 		FUTEX_SYSTEM_UNLOCK;
283 
284 		break;
285 
286 	case LINUX_FUTEX_CMP_REQUEUE:
287 		FUTEX_SYSTEM_LOCK;
288 
289 		if ((error = copyin(SCARG(uap, uaddr),
290 		    &val, sizeof(val))) != 0) {
291 			FUTEX_SYSTEM_UNLOCK;
292 			return error;
293 		}
294 
295 		if (val != val3) {
296 			FUTEX_SYSTEM_UNLOCK;
297 			return EAGAIN;
298 		}
299 
300 		FUTEXPRINTF(("FUTEX_CMP_REQUEUE %d.%d: uaddr = %p, val = %d, "
301 		    "uaddr2 = %p, val2 = %d\n",
302 		    l->l_proc->p_pid, l->l_lid,
303 		    SCARG(uap, uaddr), SCARG(uap, val), SCARG(uap, uaddr2),
304 		    (int)(unsigned long)SCARG(uap, timeout)));
305 
306 		FUTEX_LOCK;
307 		f = futex_get(SCARG(uap, uaddr), val3);
308 		newf = futex_get(SCARG(uap, uaddr2), val3);
309 		*retval = futex_wake(f, SCARG(uap, val), newf,
310 		    (int)(unsigned long)SCARG(uap, timeout));
311 		futex_put(f);
312 		futex_put(newf);
313 		FUTEX_UNLOCK;
314 
315 		FUTEX_SYSTEM_UNLOCK;
316 		break;
317 
318 	case LINUX_FUTEX_REQUEUE:
319 		FUTEX_SYSTEM_LOCK;
320 
321 		FUTEXPRINTF(("FUTEX_REQUEUE %d.%d: uaddr = %p, val = %d, "
322 		    "uaddr2 = %p, val2 = %d\n",
323 		    l->l_proc->p_pid, l->l_lid,
324 		    SCARG(uap, uaddr), SCARG(uap, val), SCARG(uap, uaddr2),
325 		    (int)(unsigned long)SCARG(uap, timeout)));
326 
327 		FUTEX_LOCK;
328 		f = futex_get(SCARG(uap, uaddr), val3);
329 		newf = futex_get(SCARG(uap, uaddr2), val3);
330 		*retval = futex_wake(f, SCARG(uap, val), newf,
331 		    (int)(unsigned long)SCARG(uap, timeout));
332 		futex_put(f);
333 		futex_put(newf);
334 		FUTEX_UNLOCK;
335 
336 		FUTEX_SYSTEM_UNLOCK;
337 		break;
338 
339 	case LINUX_FUTEX_FD:
340 		FUTEXPRINTF(("%s: unimplemented op %d\n", __func__, cmd));
341 		return ENOSYS;
342 	case LINUX_FUTEX_WAKE_OP:
343 		FUTEX_SYSTEM_LOCK;
344 
345 		FUTEXPRINTF(("FUTEX_WAKE_OP %d.%d: uaddr = %p, op = %d, "
346 		    "val = %d, uaddr2 = %p, val2 = %d\n",
347 		    l->l_proc->p_pid, l->l_lid,
348 		    SCARG(uap, uaddr), cmd, SCARG(uap, val),
349 		    SCARG(uap, uaddr2),
350 		    (int)(unsigned long)SCARG(uap, timeout)));
351 
352 		FUTEX_LOCK;
353 		f = futex_get(SCARG(uap, uaddr), val3);
354 		f2 = futex_get(SCARG(uap, uaddr2), val3);
355 		FUTEX_UNLOCK;
356 
357 		/*
358 		 * This function returns positive number as results and
359 		 * negative as errors
360 		 */
361 		op_ret = futex_atomic_op(l, val3, SCARG(uap, uaddr2));
362 		FUTEX_LOCK;
363 		if (op_ret < 0) {
364 			futex_put(f);
365 			futex_put(f2);
366 			FUTEX_UNLOCK;
367 			FUTEX_SYSTEM_UNLOCK;
368 			return -op_ret;
369 		}
370 
371 		ret = futex_wake(f, SCARG(uap, val), NULL, 0);
372 		futex_put(f);
373 		if (op_ret > 0) {
374 			op_ret = 0;
375 			/*
376 			 * Linux abuses the address of the timespec parameter
377 			 * as the number of retries
378 			 */
379 			op_ret += futex_wake(f2,
380 			    (int)(unsigned long)SCARG(uap, timeout), NULL, 0);
381 			ret += op_ret;
382 		}
383 		futex_put(f2);
384 		FUTEX_UNLOCK;
385 		FUTEX_SYSTEM_UNLOCK;
386 		*retval = ret;
387 		break;
388 	default:
389 		FUTEXPRINTF(("%s: unknown op %d\n", __func__, cmd));
390 		return ENOSYS;
391 	}
392 	return 0;
393 }
394 
395 static struct waiting_proc *
futex_wp_alloc(void)396 futex_wp_alloc(void)
397 {
398 	struct waiting_proc *wp;
399 
400 	wp = kmem_zalloc(sizeof(*wp), KM_SLEEP);
401 	cv_init(&wp->wp_futex_cv, "futex");
402 	return wp;
403 }
404 
405 static void
futex_wp_free(struct waiting_proc * wp)406 futex_wp_free(struct waiting_proc *wp)
407 {
408 
409 	cv_destroy(&wp->wp_futex_cv);
410 	kmem_free(wp, sizeof(*wp));
411 }
412 
413 static struct futex *
futex_get(void * uaddr,uint32_t bitset)414 futex_get(void *uaddr, uint32_t bitset)
415 {
416 	struct futex *f;
417 
418 	FUTEX_LOCKASSERT;
419 
420 	LIST_FOREACH(f, &futex_list, f_list) {
421 		if (f->f_uaddr == uaddr) {
422 			f->f_refcount++;
423 			return f;
424 		}
425 	}
426 
427 	/* Not found, create it */
428 	f = kmem_zalloc(sizeof(*f), KM_SLEEP);
429 	f->f_uaddr = uaddr;
430 	f->f_bitset = bitset;
431 	f->f_refcount = 1;
432 	TAILQ_INIT(&f->f_waiting_proc);
433 	LIST_INSERT_HEAD(&futex_list, f, f_list);
434 
435 	return f;
436 }
437 
438 static void
futex_ref(struct futex * f)439 futex_ref(struct futex *f)
440 {
441 
442 	FUTEX_LOCKASSERT;
443 
444 	f->f_refcount++;
445 }
446 
447 static void
futex_put(struct futex * f)448 futex_put(struct futex *f)
449 {
450 
451 	FUTEX_LOCKASSERT;
452 
453 	f->f_refcount--;
454 	if (f->f_refcount == 0) {
455 		KASSERT(TAILQ_EMPTY(&f->f_waiting_proc));
456 		LIST_REMOVE(f, f_list);
457 		kmem_free(f, sizeof(*f));
458 	}
459 }
460 
461 static int
futex_sleep(struct futex ** fp,lwp_t * l,int timeout,struct waiting_proc * wp)462 futex_sleep(struct futex **fp, lwp_t *l, int timeout, struct waiting_proc *wp)
463 {
464 	struct futex *f;
465 	int ret;
466 
467 	FUTEX_LOCKASSERT;
468 
469 	f = *fp;
470 	wp->wp_futex = f;
471 	TAILQ_INSERT_TAIL(&f->f_waiting_proc, wp, wp_list);
472 	wp->wp_onlist = true;
473 	ret = cv_timedwait_sig(&wp->wp_futex_cv, &futex_lock, timeout);
474 
475 	/*
476 	 * we may have been requeued to a different futex before we were
477 	 * woken up, so let the caller know which futex to put.   if we were
478 	 * woken by futex_wake() then it took us off the waiting list,
479 	 * but if our sleep was interrupted or timed out then we might
480 	 * need to take ourselves off the waiting list.
481 	 */
482 
483 	f = wp->wp_futex;
484 	if (wp->wp_onlist) {
485 		TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
486 	}
487 	*fp = f;
488 	return ret;
489 }
490 
491 static int
futex_wake(struct futex * f,int n,struct futex * newf,int n2)492 futex_wake(struct futex *f, int n, struct futex *newf, int n2)
493 {
494 	struct waiting_proc *wp;
495 	int count = 0;
496 
497 	FUTEX_LOCKASSERT;
498 
499 	/*
500 	 * wake up up to n threads waiting on this futex.
501 	 */
502 
503 	while (n--) {
504 		wp = TAILQ_FIRST(&f->f_waiting_proc);
505 		if (wp == NULL)
506 			return count;
507 
508 		KASSERT(f == wp->wp_futex);
509 		TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
510 		wp->wp_onlist = false;
511 		cv_signal(&wp->wp_futex_cv);
512 		count++;
513 	}
514 	if (newf == NULL)
515 		return count;
516 
517 	/*
518 	 * then requeue up to n2 additional threads to newf
519 	 * (without waking them up).
520 	 */
521 
522 	while (n2--) {
523 		wp = TAILQ_FIRST(&f->f_waiting_proc);
524 		if (wp == NULL)
525 			return count;
526 
527 		KASSERT(f == wp->wp_futex);
528 		TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
529 		futex_put(f);
530 
531 		wp->wp_futex = newf;
532 		futex_ref(newf);
533 		TAILQ_INSERT_TAIL(&newf->f_waiting_proc, wp, wp_list);
534 		count++;
535 	}
536 	return count;
537 }
538 
539 static int
futex_atomic_op(lwp_t * l,int encoded_op,void * uaddr)540 futex_atomic_op(lwp_t *l, int encoded_op, void *uaddr)
541 {
542 	const int op = (encoded_op >> 28) & 7;
543 	const int cmp = (encoded_op >> 24) & 15;
544 	const int cmparg = (encoded_op << 20) >> 20;
545 	int oparg = (encoded_op << 8) >> 20;
546 	int error, oldval, cval;
547 
548 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
549 		oparg = 1 << oparg;
550 
551 	/* XXX: linux verifies access here and returns EFAULT */
552 
553 	if (copyin(uaddr, &cval, sizeof(int)) != 0)
554 		return -EFAULT;
555 
556 	for (;;) {
557 		int nval;
558 
559 		switch (op) {
560 		case FUTEX_OP_SET:
561 			nval = oparg;
562 			break;
563 		case FUTEX_OP_ADD:
564 			nval = cval + oparg;
565 			break;
566 		case FUTEX_OP_OR:
567 			nval = cval | oparg;
568 			break;
569 		case FUTEX_OP_ANDN:
570 			nval = cval & ~oparg;
571 			break;
572 		case FUTEX_OP_XOR:
573 			nval = cval ^ oparg;
574 			break;
575 		default:
576 			return -ENOSYS;
577 		}
578 
579 		error = ucas_int(uaddr, cval, nval, &oldval);
580 		if (error || oldval == cval) {
581 			break;
582 		}
583 		cval = oldval;
584 	}
585 
586 	if (error)
587 		return -EFAULT;
588 
589 	switch (cmp) {
590 	case FUTEX_OP_CMP_EQ:
591 		return (oldval == cmparg);
592 	case FUTEX_OP_CMP_NE:
593 		return (oldval != cmparg);
594 	case FUTEX_OP_CMP_LT:
595 		return (oldval < cmparg);
596 	case FUTEX_OP_CMP_GE:
597 		return (oldval >= cmparg);
598 	case FUTEX_OP_CMP_LE:
599 		return (oldval <= cmparg);
600 	case FUTEX_OP_CMP_GT:
601 		return (oldval > cmparg);
602 	default:
603 		return -ENOSYS;
604 	}
605 }
606 
607 int
linux_sys_set_robust_list(struct lwp * l,const struct linux_sys_set_robust_list_args * uap,register_t * retval)608 linux_sys_set_robust_list(struct lwp *l,
609     const struct linux_sys_set_robust_list_args *uap, register_t *retval)
610 {
611 	/* {
612 		syscallarg(struct linux_robust_list_head *) head;
613 		syscallarg(size_t) len;
614 	} */
615 	struct linux_emuldata *led;
616 
617 	if (SCARG(uap, len) != sizeof(struct linux_robust_list_head))
618 		return EINVAL;
619 	led = l->l_emuldata;
620 	led->led_robust_head = SCARG(uap, head);
621 	*retval = 0;
622 	return 0;
623 }
624 
625 int
linux_sys_get_robust_list(struct lwp * l,const struct linux_sys_get_robust_list_args * uap,register_t * retval)626 linux_sys_get_robust_list(struct lwp *l,
627     const struct linux_sys_get_robust_list_args *uap, register_t *retval)
628 {
629 	/* {
630 		syscallarg(int) pid;
631 		syscallarg(struct linux_robust_list_head **) head;
632 		syscallarg(size_t *) len;
633 	} */
634 	struct proc *p;
635 	struct linux_emuldata *led;
636 	struct linux_robust_list_head *head;
637 	size_t len;
638 	int error = 0;
639 
640 	p = l->l_proc;
641 	if (!SCARG(uap, pid)) {
642 		led = l->l_emuldata;
643 		head = led->led_robust_head;
644 	} else {
645 		mutex_enter(p->p_lock);
646 		l = lwp_find(p, SCARG(uap, pid));
647 		if (l != NULL) {
648 			led = l->l_emuldata;
649 			head = led->led_robust_head;
650 		}
651 		mutex_exit(p->p_lock);
652 		if (l == NULL) {
653 			return ESRCH;
654 		}
655 	}
656 #ifdef __arch64__
657 	if (p->p_flag & PK_32) {
658 		uint32_t u32;
659 
660 		u32 = 12;
661 		error = copyout(&u32, SCARG(uap, len), sizeof(u32));
662 		if (error)
663 			return error;
664 		u32 = (uint32_t)(uintptr_t)head;
665 		return copyout(&u32, SCARG(uap, head), sizeof(u32));
666 	}
667 #endif
668 
669 	len = sizeof(*head);
670 	error = copyout(&len, SCARG(uap, len), sizeof(len));
671 	if (error)
672 		return error;
673 	return copyout(&head, SCARG(uap, head), sizeof(head));
674 }
675 
676 static int
handle_futex_death(void * uaddr,pid_t pid,int pi)677 handle_futex_death(void *uaddr, pid_t pid, int pi)
678 {
679 	int uval, nval, mval;
680 	struct futex *f;
681 
682 retry:
683 	if (copyin(uaddr, &uval, sizeof(uval)))
684 		return EFAULT;
685 
686 	if ((uval & FUTEX_TID_MASK) == pid) {
687 		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
688 		nval = atomic_cas_32(uaddr, uval, mval);
689 
690 		if (nval == -1)
691 			return EFAULT;
692 
693 		if (nval != uval)
694 			goto retry;
695 
696 		if (!pi && (uval & FUTEX_WAITERS)) {
697 			FUTEX_LOCK;
698 			f = futex_get(uaddr, FUTEX_BITSET_MATCH_ANY);
699 			futex_wake(f, 1, NULL, 0);
700 			FUTEX_UNLOCK;
701 		}
702 	}
703 
704 	return 0;
705 }
706 
707 static int
fetch_robust_entry(struct lwp * l,struct linux_robust_list ** entry,struct linux_robust_list ** head,int * pi)708 fetch_robust_entry(struct lwp *l, struct linux_robust_list **entry,
709     struct linux_robust_list **head, int *pi)
710 {
711 	unsigned long uentry;
712 
713 #ifdef __arch64__
714 	if (l->l_proc->p_flag & PK_32) {
715 		uint32_t u32;
716 
717 		if (copyin(head, &u32, sizeof(u32)))
718 			return EFAULT;
719 		uentry = (unsigned long)u32;
720 	} else
721 #endif
722 	if (copyin(head, &uentry, sizeof(uentry)))
723 		return EFAULT;
724 
725 	*entry = (void *)(uentry & ~1UL);
726 	*pi = uentry & 1;
727 
728 	return 0;
729 }
730 
731 /* This walks the list of robust futexes, releasing them. */
732 void
release_futexes(struct lwp * l)733 release_futexes(struct lwp *l)
734 {
735 	struct linux_robust_list_head head;
736 	struct linux_robust_list *entry, *next_entry = NULL, *pending;
737 	unsigned int limit = 2048, pi, next_pi, pip;
738 	struct linux_emuldata *led;
739 	unsigned long futex_offset;
740 	int rc;
741 
742 	led = l->l_emuldata;
743 	if (led->led_robust_head == NULL)
744 		return;
745 
746 #ifdef __arch64__
747 	if (l->l_proc->p_flag & PK_32) {
748 		uint32_t u32s[3];
749 
750 		if (copyin(led->led_robust_head, u32s, sizeof(u32s)))
751 			return;
752 
753 		head.list.next = (void *)(uintptr_t)u32s[0];
754 		head.futex_offset = (unsigned long)u32s[1];
755 		head.pending_list = (void *)(uintptr_t)u32s[2];
756 	} else
757 #endif
758 	if (copyin(led->led_robust_head, &head, sizeof(head)))
759 		return;
760 
761 	if (fetch_robust_entry(l, &entry, &head.list.next, &pi))
762 		return;
763 
764 #ifdef __arch64__
765 	if (l->l_proc->p_flag & PK_32) {
766 		uint32_t u32;
767 
768 		if (copyin(led->led_robust_head, &u32, sizeof(u32)))
769 			return;
770 
771 		head.futex_offset = (unsigned long)u32;
772 	} else
773 #endif
774 	if (copyin(&head.futex_offset, &futex_offset, sizeof(unsigned long)))
775 		return;
776 
777 	if (fetch_robust_entry(l, &pending, &head.pending_list, &pip))
778 		return;
779 
780 	while (entry != &head.list) {
781 		rc = fetch_robust_entry(l, &next_entry, &entry->next, &next_pi);
782 
783 		if (entry != pending)
784 			if (handle_futex_death((char *)entry + futex_offset,
785 			    l->l_lid, pi))
786 				return;
787 
788 		if (rc)
789 			return;
790 
791 		entry = next_entry;
792 		pi = next_pi;
793 
794 		if (!--limit)
795 			break;
796 
797 		yield();	/* XXX why? */
798 	}
799 
800 	if (pending)
801 		handle_futex_death((char *)pending + futex_offset,
802 		    l->l_lid, pip);
803 }
804