xref: /freebsd/sys/amd64/linux32/linux32_machdep.c (revision 7bd6fde3)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2002 Doug Rabson
4  * Copyright (c) 2000 Marcel Moolenaar
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer
12  *    in this position and unchanged.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. The name of the author may not be used to endorse or promote products
17  *    derived from this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/systm.h>
37 #include <sys/file.h>
38 #include <sys/fcntl.h>
39 #include <sys/clock.h>
40 #include <sys/imgact.h>
41 #include <sys/limits.h>
42 #include <sys/lock.h>
43 #include <sys/malloc.h>
44 #include <sys/mman.h>
45 #include <sys/mutex.h>
46 #include <sys/priv.h>
47 #include <sys/proc.h>
48 #include <sys/resource.h>
49 #include <sys/resourcevar.h>
50 #include <sys/sched.h>
51 #include <sys/syscallsubr.h>
52 #include <sys/sysproto.h>
53 #include <sys/unistd.h>
54 
55 #include <machine/frame.h>
56 #include <machine/psl.h>
57 
58 #include <vm/vm.h>
59 #include <vm/pmap.h>
60 #include <vm/vm_extern.h>
61 #include <vm/vm_kern.h>
62 #include <vm/vm_map.h>
63 
64 #include <amd64/linux32/linux.h>
65 #include <amd64/linux32/linux32_proto.h>
66 #include <compat/linux/linux_ipc.h>
67 #include <compat/linux/linux_signal.h>
68 #include <compat/linux/linux_util.h>
69 #include <compat/linux/linux_emul.h>
70 
/*
 * Argument block for the old-style select call.  linux_old_select()
 * copies one of these in from user space; the pointer-valued members are
 * kept as l_uintptr_t user addresses and converted with PTRIN() there.
 */
struct l_old_select_argv {
	l_int		nfds;
	l_uintptr_t	readfds;
	l_uintptr_t	writefds;
	l_uintptr_t	exceptfds;
	l_uintptr_t	timeout;
} __packed;
78 
79 int
80 linux_to_bsd_sigaltstack(int lsa)
81 {
82 	int bsa = 0;
83 
84 	if (lsa & LINUX_SS_DISABLE)
85 		bsa |= SS_DISABLE;
86 	if (lsa & LINUX_SS_ONSTACK)
87 		bsa |= SS_ONSTACK;
88 	return (bsa);
89 }
90 
91 int
92 bsd_to_linux_sigaltstack(int bsa)
93 {
94 	int lsa = 0;
95 
96 	if (bsa & SS_DISABLE)
97 		lsa |= LINUX_SS_DISABLE;
98 	if (bsa & SS_ONSTACK)
99 		lsa |= LINUX_SS_ONSTACK;
100 	return (lsa);
101 }
102 
103 /*
104  * Custom version of exec_copyin_args() so that we can translate
105  * the pointers.
106  */
107 static int
108 linux_exec_copyin_args(struct image_args *args, char *fname,
109     enum uio_seg segflg, char **argv, char **envv)
110 {
111 	char *argp, *envp;
112 	u_int32_t *p32, arg;
113 	size_t length;
114 	int error;
115 
116 	bzero(args, sizeof(*args));
117 	if (argv == NULL)
118 		return (EFAULT);
119 
120 	/*
121 	 * Allocate temporary demand zeroed space for argument and
122 	 *	environment strings
123 	 */
124 	args->buf = (char *) kmem_alloc_wait(exec_map,
125 	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
126 	if (args->buf == NULL)
127 		return (ENOMEM);
128 	args->begin_argv = args->buf;
129 	args->endp = args->begin_argv;
130 	args->stringspace = ARG_MAX;
131 
132 	args->fname = args->buf + ARG_MAX;
133 
134 	/*
135 	 * Copy the file name.
136 	 */
137 	error = (segflg == UIO_SYSSPACE) ?
138 	    copystr(fname, args->fname, PATH_MAX, &length) :
139 	    copyinstr(fname, args->fname, PATH_MAX, &length);
140 	if (error != 0)
141 		goto err_exit;
142 
143 	/*
144 	 * extract arguments first
145 	 */
146 	p32 = (u_int32_t *)argv;
147 	for (;;) {
148 		error = copyin(p32++, &arg, sizeof(arg));
149 		if (error)
150 			goto err_exit;
151 		if (arg == 0)
152 			break;
153 		argp = PTRIN(arg);
154 		error = copyinstr(argp, args->endp, args->stringspace, &length);
155 		if (error) {
156 			if (error == ENAMETOOLONG)
157 				error = E2BIG;
158 
159 			goto err_exit;
160 		}
161 		args->stringspace -= length;
162 		args->endp += length;
163 		args->argc++;
164 	}
165 
166 	args->begin_envv = args->endp;
167 
168 	/*
169 	 * extract environment strings
170 	 */
171 	if (envv) {
172 		p32 = (u_int32_t *)envv;
173 		for (;;) {
174 			error = copyin(p32++, &arg, sizeof(arg));
175 			if (error)
176 				goto err_exit;
177 			if (arg == 0)
178 				break;
179 			envp = PTRIN(arg);
180 			error = copyinstr(envp, args->endp, args->stringspace,
181 			    &length);
182 			if (error) {
183 				if (error == ENAMETOOLONG)
184 					error = E2BIG;
185 				goto err_exit;
186 			}
187 			args->stringspace -= length;
188 			args->endp += length;
189 			args->envc++;
190 		}
191 	}
192 
193 	return (0);
194 
195 err_exit:
196 	kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
197 	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
198 	args->buf = NULL;
199 	return (error);
200 }
201 
202 int
203 linux_execve(struct thread *td, struct linux_execve_args *args)
204 {
205 	struct image_args eargs;
206 	char *path;
207 	int error;
208 
209 	LCONVPATHEXIST(td, args->path, &path);
210 
211 #ifdef DEBUG
212 	if (ldebug(execve))
213 		printf(ARGS(execve, "%s"), path);
214 #endif
215 
216 	error = linux_exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp,
217 	    args->envp);
218 	free(path, M_TEMP);
219 	if (error == 0)
220 		error = kern_execve(td, &eargs, NULL);
221 	if (error == 0)
222 	   	/* linux process can exec fbsd one, dont attempt
223 		 * to create emuldata for such process using
224 		 * linux_proc_init, this leads to a panic on KASSERT
225 		 * because such process has p->p_emuldata == NULL
226 		 */
227 	   	if (td->td_proc->p_sysent == &elf_linux_sysvec)
228    		   	error = linux_proc_init(td, 0, 0);
229 	return (error);
230 }
231 
/*
 * I/O vector element as read from user space by linux32_copyinuio():
 * the base address is kept as a 32-bit integer and widened with PTRIN()
 * when the native struct iovec is filled in.
 */
struct iovec32 {
	u_int32_t iov_base;
	int	iov_len;
};

/* Fail the build if the structure is not exactly 8 bytes. */
CTASSERT(sizeof(struct iovec32) == 8);
238 
239 static int
240 linux32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop)
241 {
242 	struct iovec32 iov32;
243 	struct iovec *iov;
244 	struct uio *uio;
245 	u_int iovlen;
246 	int error, i;
247 
248 	*uiop = NULL;
249 	if (iovcnt > UIO_MAXIOV)
250 		return (EINVAL);
251 	iovlen = iovcnt * sizeof(struct iovec);
252 	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
253 	iov = (struct iovec *)(uio + 1);
254 	for (i = 0; i < iovcnt; i++) {
255 		error = copyin(&iovp[i], &iov32, sizeof(struct iovec32));
256 		if (error) {
257 			free(uio, M_IOV);
258 			return (error);
259 		}
260 		iov[i].iov_base = PTRIN(iov32.iov_base);
261 		iov[i].iov_len = iov32.iov_len;
262 	}
263 	uio->uio_iov = iov;
264 	uio->uio_iovcnt = iovcnt;
265 	uio->uio_segflg = UIO_USERSPACE;
266 	uio->uio_offset = -1;
267 	uio->uio_resid = 0;
268 	for (i = 0; i < iovcnt; i++) {
269 		if (iov->iov_len > INT_MAX - uio->uio_resid) {
270 			free(uio, M_IOV);
271 			return (EINVAL);
272 		}
273 		uio->uio_resid += iov->iov_len;
274 		iov++;
275 	}
276 	*uiop = uio;
277 	return (0);
278 }
279 
280 int
281 linux_readv(struct thread *td, struct linux_readv_args *uap)
282 {
283 	struct uio *auio;
284 	int error;
285 
286 	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
287 	if (error)
288 		return (error);
289 	error = kern_readv(td, uap->fd, auio);
290 	free(auio, M_IOV);
291 	return (error);
292 }
293 
294 int
295 linux_writev(struct thread *td, struct linux_writev_args *uap)
296 {
297 	struct uio *auio;
298 	int error;
299 
300 	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
301 	if (error)
302 		return (error);
303 	error = kern_writev(td, uap->fd, auio);
304 	free(auio, M_IOV);
305 	return (error);
306 }
307 
/*
 * Indirect argument block read by linux_ipc() for LINUX_MSGRCV when the
 * upper 16 bits of args->what are zero: it carries the message buffer
 * address and the message type.
 */
struct l_ipc_kludge {
	l_uintptr_t msgp;
	l_long msgtyp;
} __packed;
312 
313 int
314 linux_ipc(struct thread *td, struct linux_ipc_args *args)
315 {
316 
317 	switch (args->what & 0xFFFF) {
318 	case LINUX_SEMOP: {
319 		struct linux_semop_args a;
320 
321 		a.semid = args->arg1;
322 		a.tsops = args->ptr;
323 		a.nsops = args->arg2;
324 		return (linux_semop(td, &a));
325 	}
326 	case LINUX_SEMGET: {
327 		struct linux_semget_args a;
328 
329 		a.key = args->arg1;
330 		a.nsems = args->arg2;
331 		a.semflg = args->arg3;
332 		return (linux_semget(td, &a));
333 	}
334 	case LINUX_SEMCTL: {
335 		struct linux_semctl_args a;
336 		int error;
337 
338 		a.semid = args->arg1;
339 		a.semnum = args->arg2;
340 		a.cmd = args->arg3;
341 		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
342 		if (error)
343 			return (error);
344 		return (linux_semctl(td, &a));
345 	}
346 	case LINUX_MSGSND: {
347 		struct linux_msgsnd_args a;
348 
349 		a.msqid = args->arg1;
350 		a.msgp = args->ptr;
351 		a.msgsz = args->arg2;
352 		a.msgflg = args->arg3;
353 		return (linux_msgsnd(td, &a));
354 	}
355 	case LINUX_MSGRCV: {
356 		struct linux_msgrcv_args a;
357 
358 		a.msqid = args->arg1;
359 		a.msgsz = args->arg2;
360 		a.msgflg = args->arg3;
361 		if ((args->what >> 16) == 0) {
362 			struct l_ipc_kludge tmp;
363 			int error;
364 
365 			if (args->ptr == 0)
366 				return (EINVAL);
367 			error = copyin(args->ptr, &tmp, sizeof(tmp));
368 			if (error)
369 				return (error);
370 			a.msgp = PTRIN(tmp.msgp);
371 			a.msgtyp = tmp.msgtyp;
372 		} else {
373 			a.msgp = args->ptr;
374 			a.msgtyp = args->arg5;
375 		}
376 		return (linux_msgrcv(td, &a));
377 	}
378 	case LINUX_MSGGET: {
379 		struct linux_msgget_args a;
380 
381 		a.key = args->arg1;
382 		a.msgflg = args->arg2;
383 		return (linux_msgget(td, &a));
384 	}
385 	case LINUX_MSGCTL: {
386 		struct linux_msgctl_args a;
387 
388 		a.msqid = args->arg1;
389 		a.cmd = args->arg2;
390 		a.buf = args->ptr;
391 		return (linux_msgctl(td, &a));
392 	}
393 	case LINUX_SHMAT: {
394 		struct linux_shmat_args a;
395 
396 		a.shmid = args->arg1;
397 		a.shmaddr = args->ptr;
398 		a.shmflg = args->arg2;
399 		a.raddr = PTRIN((l_uint)args->arg3);
400 		return (linux_shmat(td, &a));
401 	}
402 	case LINUX_SHMDT: {
403 		struct linux_shmdt_args a;
404 
405 		a.shmaddr = args->ptr;
406 		return (linux_shmdt(td, &a));
407 	}
408 	case LINUX_SHMGET: {
409 		struct linux_shmget_args a;
410 
411 		a.key = args->arg1;
412 		a.size = args->arg2;
413 		a.shmflg = args->arg3;
414 		return (linux_shmget(td, &a));
415 	}
416 	case LINUX_SHMCTL: {
417 		struct linux_shmctl_args a;
418 
419 		a.shmid = args->arg1;
420 		a.cmd = args->arg2;
421 		a.buf = args->ptr;
422 		return (linux_shmctl(td, &a));
423 	}
424 	default:
425 		break;
426 	}
427 
428 	return (EINVAL);
429 }
430 
431 int
432 linux_old_select(struct thread *td, struct linux_old_select_args *args)
433 {
434 	struct l_old_select_argv linux_args;
435 	struct linux_select_args newsel;
436 	int error;
437 
438 #ifdef DEBUG
439 	if (ldebug(old_select))
440 		printf(ARGS(old_select, "%p"), args->ptr);
441 #endif
442 
443 	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
444 	if (error)
445 		return (error);
446 
447 	newsel.nfds = linux_args.nfds;
448 	newsel.readfds = PTRIN(linux_args.readfds);
449 	newsel.writefds = PTRIN(linux_args.writefds);
450 	newsel.exceptfds = PTRIN(linux_args.exceptfds);
451 	newsel.timeout = PTRIN(linux_args.timeout);
452 	return (linux_select(td, &newsel));
453 }
454 
455 int
456 linux_fork(struct thread *td, struct linux_fork_args *args)
457 {
458 	int error;
459 	struct proc *p2;
460 	struct thread *td2;
461 
462 #ifdef DEBUG
463 	if (ldebug(fork))
464 		printf(ARGS(fork, ""));
465 #endif
466 
467 	if ((error = fork1(td, RFFDG | RFPROC | RFSTOPPED, 0, &p2)) != 0)
468 		return (error);
469 
470 	if (error == 0) {
471 		td->td_retval[0] = p2->p_pid;
472 		td->td_retval[1] = 0;
473 	}
474 
475 	if (td->td_retval[1] == 1)
476 		td->td_retval[0] = 0;
477 	error = linux_proc_init(td, td->td_retval[0], 0);
478 	if (error)
479 		return (error);
480 
481 	td2 = FIRST_THREAD_IN_PROC(p2);
482 
483 	/* make it run */
484 	mtx_lock_spin(&sched_lock);
485 	TD_SET_CAN_RUN(td2);
486 	sched_add(td2, SRQ_BORING);
487 	mtx_unlock_spin(&sched_lock);
488 
489 	return (0);
490 }
491 
492 int
493 linux_vfork(struct thread *td, struct linux_vfork_args *args)
494 {
495 	int error;
496 	struct proc *p2;
497 	struct thread *td2;
498 
499 #ifdef DEBUG
500 	if (ldebug(vfork))
501 		printf(ARGS(vfork, ""));
502 #endif
503 
504 	/* exclude RFPPWAIT */
505 	if ((error = fork1(td, RFFDG | RFPROC | RFMEM | RFSTOPPED, 0, &p2)) != 0)
506 		return (error);
507 	if (error == 0) {
508 	   	td->td_retval[0] = p2->p_pid;
509 		td->td_retval[1] = 0;
510 	}
511 	/* Are we the child? */
512 	if (td->td_retval[1] == 1)
513 		td->td_retval[0] = 0;
514 	error = linux_proc_init(td, td->td_retval[0], 0);
515 	if (error)
516 		return (error);
517 
518 	PROC_LOCK(p2);
519 	p2->p_flag |= P_PPWAIT;
520 	PROC_UNLOCK(p2);
521 
522 	td2 = FIRST_THREAD_IN_PROC(p2);
523 
524 	/* make it run */
525 	mtx_lock_spin(&sched_lock);
526 	TD_SET_CAN_RUN(td2);
527 	sched_add(td2, SRQ_BORING);
528 	mtx_unlock_spin(&sched_lock);
529 
530 	/* wait for the children to exit, ie. emulate vfork */
531 	PROC_LOCK(p2);
532 	while (p2->p_flag & P_PPWAIT)
533 	   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
534 	PROC_UNLOCK(p2);
535 
536 	return (0);
537 }
538 
539 int
540 linux_clone(struct thread *td, struct linux_clone_args *args)
541 {
542 	int error, ff = RFPROC | RFSTOPPED;
543 	struct proc *p2;
544 	struct thread *td2;
545 	int exit_signal;
546 	struct linux_emuldata *em;
547 
548 #ifdef DEBUG
549 	if (ldebug(clone)) {
550    	   	printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"),
551 		    (unsigned int)args->flags, (unsigned int)(uintptr_t)args->stack,
552 		    (unsigned int)(uintptr_t)args->parent_tidptr,
553 		    (unsigned int)(uintptr_t)args->child_tidptr);
554 	}
555 #endif
556 
557 	exit_signal = args->flags & 0x000000ff;
558 	if (!LINUX_SIG_VALID(exit_signal) && exit_signal != 0)
559 		return (EINVAL);
560 
561 	if (exit_signal <= LINUX_SIGTBLSZ)
562 		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
563 
564 	if (args->flags & LINUX_CLONE_VM)
565 		ff |= RFMEM;
566 	if (args->flags & LINUX_CLONE_SIGHAND)
567 		ff |= RFSIGSHARE;
568 	/*
569 	 * XXX: in linux sharing of fs info (chroot/cwd/umask)
570 	 * and open files is independant. in fbsd its in one
571 	 * structure but in reality it doesn't cause any problems
572 	 * because both of these flags are usually set together.
573 	 */
574 	if (!(args->flags & (LINUX_CLONE_FILES | LINUX_CLONE_FS)))
575 		ff |= RFFDG;
576 
577 	/*
578 	 * Attempt to detect when linux_clone(2) is used for creating
579 	 * kernel threads. Unfortunately despite the existence of the
580 	 * CLONE_THREAD flag, version of linuxthreads package used in
581 	 * most popular distros as of beginning of 2005 doesn't make
582 	 * any use of it. Therefore, this detection relies on
583 	 * empirical observation that linuxthreads sets certain
584 	 * combination of flags, so that we can make more or less
585 	 * precise detection and notify the FreeBSD kernel that several
586 	 * processes are in fact part of the same threading group, so
587 	 * that special treatment is necessary for signal delivery
588 	 * between those processes and fd locking.
589 	 */
590 	if ((args->flags & 0xffffff00) == LINUX_THREADING_FLAGS)
591 		ff |= RFTHREAD;
592 
593 	error = fork1(td, ff, 0, &p2);
594 	if (error)
595 		return (error);
596 
597 	if (args->flags & (LINUX_CLONE_PARENT | LINUX_CLONE_THREAD)) {
598 	   	sx_xlock(&proctree_lock);
599 		PROC_LOCK(p2);
600 		proc_reparent(p2, td->td_proc->p_pptr);
601 		PROC_UNLOCK(p2);
602 		sx_xunlock(&proctree_lock);
603 	}
604 
605 	/* create the emuldata */
606 	error = linux_proc_init(td, p2->p_pid, args->flags);
607 	/* reference it - no need to check this */
608 	em = em_find(p2, EMUL_DOLOCK);
609 	KASSERT(em != NULL, ("clone: emuldata not found.\n"));
610 	/* and adjust it */
611 	if (args->flags & LINUX_CLONE_PARENT_SETTID) {
612 	   	if (args->parent_tidptr == NULL) {
613 		   	EMUL_UNLOCK(&emul_lock);
614 			return (EINVAL);
615 		}
616 		error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid));
617 		if (error) {
618 		   	EMUL_UNLOCK(&emul_lock);
619 			return (error);
620 		}
621 	}
622 
623 	if (args->flags & LINUX_CLONE_THREAD) {
624 	   	/* XXX: linux mangles pgrp and pptr somehow
625 		 * I think it might be this but I am not sure.
626 		 */
627 #ifdef notyet
628 	   	PROC_LOCK(p2);
629 	   	p2->p_pgrp = td->td_proc->p_pgrp;
630 	   	PROC_UNLOCK(p2);
631 #endif
632 	 	exit_signal = 0;
633 	}
634 
635 	if (args->flags & LINUX_CLONE_CHILD_SETTID)
636 		em->child_set_tid = args->child_tidptr;
637 	else
638 	   	em->child_set_tid = NULL;
639 
640 	if (args->flags & LINUX_CLONE_CHILD_CLEARTID)
641 		em->child_clear_tid = args->child_tidptr;
642 	else
643 	   	em->child_clear_tid = NULL;
644 
645 	EMUL_UNLOCK(&emul_lock);
646 
647 	PROC_LOCK(p2);
648 	p2->p_sigparent = exit_signal;
649 	PROC_UNLOCK(p2);
650 	td2 = FIRST_THREAD_IN_PROC(p2);
651 	/*
652 	 * in a case of stack = NULL we are supposed to COW calling process stack
653 	 * this is what normal fork() does so we just keep the tf_rsp arg intact
654 	 */
655 	if (args->stack)
656    	   	td2->td_frame->tf_rsp = PTROUT(args->stack);
657 
658 	if (args->flags & LINUX_CLONE_SETTLS) {
659 	   	/* XXX: todo */
660 	}
661 
662 #ifdef DEBUG
663 	if (ldebug(clone))
664 		printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"),
665 		    (long)p2->p_pid, args->stack, exit_signal);
666 #endif
667 	if (args->flags & LINUX_CLONE_VFORK) {
668 	   	PROC_LOCK(p2);
669 	   	p2->p_flag |= P_PPWAIT;
670 	   	PROC_UNLOCK(p2);
671 	}
672 
673 	/*
674 	 * Make this runnable after we are finished with it.
675 	 */
676 	mtx_lock_spin(&sched_lock);
677 	TD_SET_CAN_RUN(td2);
678 	sched_add(td2, SRQ_BORING);
679 	mtx_unlock_spin(&sched_lock);
680 
681 	td->td_retval[0] = p2->p_pid;
682 	td->td_retval[1] = 0;
683 
684 	if (args->flags & LINUX_CLONE_VFORK) {
685    	   	/* wait for the children to exit, ie. emulate vfork */
686    	   	PROC_LOCK(p2);
687 		while (p2->p_flag & P_PPWAIT)
688    		   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
689 		PROC_UNLOCK(p2);
690 	}
691 
692 	return (0);
693 }
694 
695 #define STACK_SIZE  (2 * 1024 * 1024)
696 #define GUARD_SIZE  (4 * PAGE_SIZE)
697 
698 static int linux_mmap_common(struct thread *, struct l_mmap_argv *);
699 
700 int
701 linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
702 {
703 	struct l_mmap_argv linux_args;
704 
705 #ifdef DEBUG
706 	if (ldebug(mmap2))
707 		printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
708 		    (void *)(intptr_t)args->addr, args->len, args->prot,
709 		    args->flags, args->fd, args->pgoff);
710 #endif
711 
712 	linux_args.addr = PTROUT(args->addr);
713 	linux_args.len = args->len;
714 	linux_args.prot = args->prot;
715 	linux_args.flags = args->flags;
716 	linux_args.fd = args->fd;
717 	linux_args.pgoff = args->pgoff;
718 
719 	return (linux_mmap_common(td, &linux_args));
720 }
721 
722 int
723 linux_mmap(struct thread *td, struct linux_mmap_args *args)
724 {
725 	int error;
726 	struct l_mmap_argv linux_args;
727 
728 	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
729 	if (error)
730 		return (error);
731 
732 #ifdef DEBUG
733 	if (ldebug(mmap))
734 		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
735 		    (void *)(intptr_t)linux_args.addr, linux_args.len,
736 		    linux_args.prot, linux_args.flags, linux_args.fd,
737 		    linux_args.pgoff);
738 #endif
739 	if ((linux_args.pgoff % PAGE_SIZE) != 0)
740 		return (EINVAL);
741 	linux_args.pgoff /= PAGE_SIZE;
742 
743 	return (linux_mmap_common(td, &linux_args));
744 }
745 
746 static int
747 linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args)
748 {
749 	struct proc *p = td->td_proc;
750 	struct mmap_args /* {
751 		caddr_t addr;
752 		size_t len;
753 		int prot;
754 		int flags;
755 		int fd;
756 		long pad;
757 		off_t pos;
758 	} */ bsd_args;
759 	int error;
760 	struct file *fp;
761 
762 	error = 0;
763 	bsd_args.flags = 0;
764 	fp = NULL;
765 
766 	/*
767 	 * Linux mmap(2):
768 	 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE
769 	 */
770 	if (! ((linux_args->flags & LINUX_MAP_SHARED) ^
771 	    (linux_args->flags & LINUX_MAP_PRIVATE)))
772 		return (EINVAL);
773 
774 	if (linux_args->flags & LINUX_MAP_SHARED)
775 		bsd_args.flags |= MAP_SHARED;
776 	if (linux_args->flags & LINUX_MAP_PRIVATE)
777 		bsd_args.flags |= MAP_PRIVATE;
778 	if (linux_args->flags & LINUX_MAP_FIXED)
779 		bsd_args.flags |= MAP_FIXED;
780 	if (linux_args->flags & LINUX_MAP_ANON)
781 		bsd_args.flags |= MAP_ANON;
782 	else
783 		bsd_args.flags |= MAP_NOSYNC;
784 	if (linux_args->flags & LINUX_MAP_GROWSDOWN)
785 		bsd_args.flags |= MAP_STACK;
786 
787 	/*
788 	 * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC
789 	 * on Linux/i386. We do this to ensure maximum compatibility.
790 	 * Linux/ia64 does the same in i386 emulation mode.
791 	 */
792 	bsd_args.prot = linux_args->prot;
793 	if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
794 		bsd_args.prot |= PROT_READ | PROT_EXEC;
795 
796 	/* Linux does not check file descriptor when MAP_ANONYMOUS is set. */
797 	bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : linux_args->fd;
798 	if (bsd_args.fd != -1) {
799 		/*
800 		 * Linux follows Solaris mmap(2) description:
801 		 * The file descriptor fildes is opened with
802 		 * read permission, regardless of the
803 		 * protection options specified.
804 		 */
805 
806 		if ((error = fget(td, bsd_args.fd, &fp)) != 0)
807 			return (error);
808 		if (fp->f_type != DTYPE_VNODE) {
809 			fdrop(fp, td);
810 			return (EINVAL);
811 		}
812 
813 		/* Linux mmap() just fails for O_WRONLY files */
814 		if (!(fp->f_flag & FREAD)) {
815 			fdrop(fp, td);
816 			return (EACCES);
817 		}
818 
819 		fdrop(fp, td);
820 	}
821 
822 	if (linux_args->flags & LINUX_MAP_GROWSDOWN) {
823 		/*
824 		 * The linux MAP_GROWSDOWN option does not limit auto
825 		 * growth of the region.  Linux mmap with this option
826 		 * takes as addr the inital BOS, and as len, the initial
827 		 * region size.  It can then grow down from addr without
828 		 * limit.  However, linux threads has an implicit internal
829 		 * limit to stack size of STACK_SIZE.  Its just not
830 		 * enforced explicitly in linux.  But, here we impose
831 		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
832 		 * region, since we can do this with our mmap.
833 		 *
834 		 * Our mmap with MAP_STACK takes addr as the maximum
835 		 * downsize limit on BOS, and as len the max size of
836 		 * the region.  It them maps the top SGROWSIZ bytes,
837 		 * and auto grows the region down, up to the limit
838 		 * in addr.
839 		 *
840 		 * If we don't use the MAP_STACK option, the effect
841 		 * of this code is to allocate a stack region of a
842 		 * fixed size of (STACK_SIZE - GUARD_SIZE).
843 		 */
844 
845 		if ((caddr_t)PTRIN(linux_args->addr) + linux_args->len >
846 		    p->p_vmspace->vm_maxsaddr) {
847 			/*
848 			 * Some linux apps will attempt to mmap
849 			 * thread stacks near the top of their
850 			 * address space.  If their TOS is greater
851 			 * than vm_maxsaddr, vm_map_growstack()
852 			 * will confuse the thread stack with the
853 			 * process stack and deliver a SEGV if they
854 			 * attempt to grow the thread stack past their
855 			 * current stacksize rlimit.  To avoid this,
856 			 * adjust vm_maxsaddr upwards to reflect
857 			 * the current stacksize rlimit rather
858 			 * than the maximum possible stacksize.
859 			 * It would be better to adjust the
860 			 * mmap'ed region, but some apps do not check
861 			 * mmap's return value.
862 			 */
863 			PROC_LOCK(p);
864 			p->p_vmspace->vm_maxsaddr = (char *)LINUX32_USRSTACK -
865 			    lim_cur(p, RLIMIT_STACK);
866 			PROC_UNLOCK(p);
867 		}
868 
869 		/* This gives us our maximum stack size */
870 		if (linux_args->len > STACK_SIZE - GUARD_SIZE)
871 			bsd_args.len = linux_args->len;
872 		else
873 			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
874 
875 		/*
876 		 * This gives us a new BOS.  If we're using VM_STACK, then
877 		 * mmap will just map the top SGROWSIZ bytes, and let
878 		 * the stack grow down to the limit at BOS.  If we're
879 		 * not using VM_STACK we map the full stack, since we
880 		 * don't have a way to autogrow it.
881 		 */
882 		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr) -
883 		    bsd_args.len;
884 	} else {
885 		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr);
886 		bsd_args.len  = linux_args->len;
887 	}
888 	bsd_args.pos = (off_t)linux_args->pgoff * PAGE_SIZE;
889 	bsd_args.pad = 0;
890 
891 #ifdef DEBUG
892 	if (ldebug(mmap))
893 		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
894 		    __func__,
895 		    (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
896 		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
897 #endif
898 	error = mmap(td, &bsd_args);
899 #ifdef DEBUG
900 	if (ldebug(mmap))
901 		printf("-> %s() return: 0x%x (0x%08x)\n",
902 			__func__, error, (u_int)td->td_retval[0]);
903 #endif
904 	return (error);
905 }
906 
907 int
908 linux_iopl(struct thread *td, struct linux_iopl_args *args)
909 {
910 	int error;
911 
912 	if (args->level < 0 || args->level > 3)
913 		return (EINVAL);
914 	if ((error = priv_check(td, PRIV_IO)) != 0)
915 		return (error);
916 	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
917 		return (error);
918 	td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) |
919 	    (args->level * (PSL_IOPL / 3));
920 
921 	return (0);
922 }
923 
924 int
925 linux_pipe(struct thread *td, struct linux_pipe_args *args)
926 {
927 	int pip[2];
928 	int error;
929 	register_t reg_rdx;
930 
931 #ifdef DEBUG
932 	if (ldebug(pipe))
933 		printf(ARGS(pipe, "*"));
934 #endif
935 
936 	reg_rdx = td->td_retval[1];
937 	error = pipe(td, 0);
938 	if (error) {
939 		td->td_retval[1] = reg_rdx;
940 		return (error);
941 	}
942 
943 	pip[0] = td->td_retval[0];
944 	pip[1] = td->td_retval[1];
945 	error = copyout(pip, args->pipefds, 2 * sizeof(int));
946 	if (error) {
947 		td->td_retval[1] = reg_rdx;
948 		return (error);
949 	}
950 
951 	td->td_retval[1] = reg_rdx;
952 	td->td_retval[0] = 0;
953 	return (0);
954 }
955 
956 int
957 linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
958 {
959 	l_osigaction_t osa;
960 	l_sigaction_t act, oact;
961 	int error;
962 
963 #ifdef DEBUG
964 	if (ldebug(sigaction))
965 		printf(ARGS(sigaction, "%d, %p, %p"),
966 		    args->sig, (void *)args->nsa, (void *)args->osa);
967 #endif
968 
969 	if (args->nsa != NULL) {
970 		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
971 		if (error)
972 			return (error);
973 		act.lsa_handler = osa.lsa_handler;
974 		act.lsa_flags = osa.lsa_flags;
975 		act.lsa_restorer = osa.lsa_restorer;
976 		LINUX_SIGEMPTYSET(act.lsa_mask);
977 		act.lsa_mask.__bits[0] = osa.lsa_mask;
978 	}
979 
980 	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
981 	    args->osa ? &oact : NULL);
982 
983 	if (args->osa != NULL && !error) {
984 		osa.lsa_handler = oact.lsa_handler;
985 		osa.lsa_flags = oact.lsa_flags;
986 		osa.lsa_restorer = oact.lsa_restorer;
987 		osa.lsa_mask = oact.lsa_mask.__bits[0];
988 		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
989 	}
990 
991 	return (error);
992 }
993 
994 /*
995  * Linux has two extra args, restart and oldmask.  We dont use these,
996  * but it seems that "restart" is actually a context pointer that
997  * enables the signal to happen with a different register set.
998  */
999 int
1000 linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
1001 {
1002 	sigset_t sigmask;
1003 	l_sigset_t mask;
1004 
1005 #ifdef DEBUG
1006 	if (ldebug(sigsuspend))
1007 		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
1008 #endif
1009 
1010 	LINUX_SIGEMPTYSET(mask);
1011 	mask.__bits[0] = args->mask;
1012 	linux_to_bsd_sigset(&mask, &sigmask);
1013 	return (kern_sigsuspend(td, sigmask));
1014 }
1015 
1016 int
1017 linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
1018 {
1019 	l_sigset_t lmask;
1020 	sigset_t sigmask;
1021 	int error;
1022 
1023 #ifdef DEBUG
1024 	if (ldebug(rt_sigsuspend))
1025 		printf(ARGS(rt_sigsuspend, "%p, %d"),
1026 		    (void *)uap->newset, uap->sigsetsize);
1027 #endif
1028 
1029 	if (uap->sigsetsize != sizeof(l_sigset_t))
1030 		return (EINVAL);
1031 
1032 	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
1033 	if (error)
1034 		return (error);
1035 
1036 	linux_to_bsd_sigset(&lmask, &sigmask);
1037 	return (kern_sigsuspend(td, sigmask));
1038 }
1039 
1040 int
1041 linux_pause(struct thread *td, struct linux_pause_args *args)
1042 {
1043 	struct proc *p = td->td_proc;
1044 	sigset_t sigmask;
1045 
1046 #ifdef DEBUG
1047 	if (ldebug(pause))
1048 		printf(ARGS(pause, ""));
1049 #endif
1050 
1051 	PROC_LOCK(p);
1052 	sigmask = td->td_sigmask;
1053 	PROC_UNLOCK(p);
1054 	return (kern_sigsuspend(td, sigmask));
1055 }
1056 
1057 int
1058 linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
1059 {
1060 	stack_t ss, oss;
1061 	l_stack_t lss;
1062 	int error;
1063 
1064 #ifdef DEBUG
1065 	if (ldebug(sigaltstack))
1066 		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
1067 #endif
1068 
1069 	if (uap->uss != NULL) {
1070 		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
1071 		if (error)
1072 			return (error);
1073 
1074 		ss.ss_sp = PTRIN(lss.ss_sp);
1075 		ss.ss_size = lss.ss_size;
1076 		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
1077 	}
1078 	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
1079 	    (uap->uoss != NULL) ? &oss : NULL);
1080 	if (!error && uap->uoss != NULL) {
1081 		lss.ss_sp = PTROUT(oss.ss_sp);
1082 		lss.ss_size = oss.ss_size;
1083 		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
1084 		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
1085 	}
1086 
1087 	return (error);
1088 }
1089 
1090 int
1091 linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
1092 {
1093 	struct ftruncate_args sa;
1094 
1095 #ifdef DEBUG
1096 	if (ldebug(ftruncate64))
1097 		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
1098 		    (intmax_t)args->length);
1099 #endif
1100 
1101 	sa.fd = args->fd;
1102 	sa.pad = 0;
1103 	sa.length = args->length;
1104 	return ftruncate(td, &sa);
1105 }
1106 
1107 int
1108 linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
1109 {
1110 	struct timeval atv;
1111 	l_timeval atv32;
1112 	struct timezone rtz;
1113 	int error = 0;
1114 
1115 	if (uap->tp) {
1116 		microtime(&atv);
1117 		atv32.tv_sec = atv.tv_sec;
1118 		atv32.tv_usec = atv.tv_usec;
1119 		error = copyout(&atv32, uap->tp, sizeof (atv32));
1120 	}
1121 	if (error == 0 && uap->tzp != NULL) {
1122 		rtz.tz_minuteswest = tz_minuteswest;
1123 		rtz.tz_dsttime = tz_dsttime;
1124 		error = copyout(&rtz, uap->tzp, sizeof (rtz));
1125 	}
1126 	return (error);
1127 }
1128 
1129 int
1130 linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
1131 {
1132 	struct l_rusage s32;
1133 	struct rusage s;
1134 	int error;
1135 
1136 	error = kern_getrusage(td, uap->who, &s);
1137 	if (error != 0)
1138 		return (error);
1139 	if (uap->rusage != NULL) {
1140 		s32.ru_utime.tv_sec = s.ru_utime.tv_sec;
1141 		s32.ru_utime.tv_usec = s.ru_utime.tv_usec;
1142 		s32.ru_stime.tv_sec = s.ru_stime.tv_sec;
1143 		s32.ru_stime.tv_usec = s.ru_stime.tv_usec;
1144 		s32.ru_maxrss = s.ru_maxrss;
1145 		s32.ru_ixrss = s.ru_ixrss;
1146 		s32.ru_idrss = s.ru_idrss;
1147 		s32.ru_isrss = s.ru_isrss;
1148 		s32.ru_minflt = s.ru_minflt;
1149 		s32.ru_majflt = s.ru_majflt;
1150 		s32.ru_nswap = s.ru_nswap;
1151 		s32.ru_inblock = s.ru_inblock;
1152 		s32.ru_oublock = s.ru_oublock;
1153 		s32.ru_msgsnd = s.ru_msgsnd;
1154 		s32.ru_msgrcv = s.ru_msgrcv;
1155 		s32.ru_nsignals = s.ru_nsignals;
1156 		s32.ru_nvcsw = s.ru_nvcsw;
1157 		s32.ru_nivcsw = s.ru_nivcsw;
1158 		error = copyout(&s32, uap->rusage, sizeof(s32));
1159 	}
1160 	return (error);
1161 }
1162 
1163 int
1164 linux_sched_rr_get_interval(struct thread *td,
1165     struct linux_sched_rr_get_interval_args *uap)
1166 {
1167 	struct timespec ts;
1168 	struct l_timespec ts32;
1169 	int error;
1170 
1171 	error = kern_sched_rr_get_interval(td, uap->pid, &ts);
1172 	if (error != 0)
1173 		return (error);
1174 	ts32.tv_sec = ts.tv_sec;
1175 	ts32.tv_nsec = ts.tv_nsec;
1176 	return (copyout(&ts32, uap->interval, sizeof(ts32)));
1177 }
1178 
1179 int
1180 linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
1181 {
1182 	struct mprotect_args bsd_args;
1183 
1184 	bsd_args.addr = uap->addr;
1185 	bsd_args.len = uap->len;
1186 	bsd_args.prot = uap->prot;
1187 	if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
1188 		bsd_args.prot |= PROT_READ | PROT_EXEC;
1189 	return (mprotect(td, &bsd_args));
1190 }
1191