xref: /freebsd/sys/i386/linux/linux_sysvec.c (revision 39beb93c)
1 /*-
2  * Copyright (c) 1994-1996 Søren Schmidt
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer
10  *    in this position and unchanged.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name of the author may not be used to endorse or promote products
15  *    derived from this software without specific prior written permission
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/exec.h>
35 #include <sys/fcntl.h>
36 #include <sys/imgact.h>
37 #include <sys/imgact_aout.h>
38 #include <sys/imgact_elf.h>
39 #include <sys/kernel.h>
40 #include <sys/lock.h>
41 #include <sys/malloc.h>
42 #include <sys/module.h>
43 #include <sys/mutex.h>
44 #include <sys/proc.h>
45 #include <sys/signalvar.h>
46 #include <sys/syscallsubr.h>
47 #include <sys/sysent.h>
48 #include <sys/sysproto.h>
49 #include <sys/vnode.h>
50 #include <sys/eventhandler.h>
51 
52 #include <vm/vm.h>
53 #include <vm/pmap.h>
54 #include <vm/vm_extern.h>
55 #include <vm/vm_map.h>
56 #include <vm/vm_object.h>
57 #include <vm/vm_page.h>
58 #include <vm/vm_param.h>
59 
60 #include <machine/cpu.h>
61 #include <machine/md_var.h>
62 #include <machine/pcb.h>
63 
64 #include <i386/linux/linux.h>
65 #include <i386/linux/linux_proto.h>
66 #include <compat/linux/linux_emul.h>
67 #include <compat/linux/linux_mib.h>
68 #include <compat/linux/linux_signal.h>
69 #include <compat/linux/linux_util.h>
70 
/* Declare version 1 of the "linux" module. */
MODULE_VERSION(linux, 1);

/* malloc type M_LINUX ("linux"), described as "Linux mode structures". */
MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");

/*
 * The two characters "#!" read as one 16-bit integer in host byte
 * order; exec_linux_imgact_try() compares the start of the image
 * header against this value.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SHELLMAGIC      0x2123 /* #! */
#else
#define SHELLMAGIC      0x2321
#endif

/*
 * Allow the sendsig functions to use the ldebug() facility
 * even though they are not syscalls themselves. Map them
 * to syscall 0. This is slightly less bogus than using
 * ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_rt_sendsig	0
#define	LINUX_SYS_linux_sendsig		0

/* Load the x87 FPU control word from the object addr points to. */
#define	fldcw(addr)		__asm("fldcw %0" : : "m" (*(addr)))
/* Control word value loaded for new Linux images by exec_linux_setregs(). */
#define	__LINUX_NPXCW__		0x37f

/* Signal trampoline and its size; referenced by both sysentvecs below. */
extern char linux_sigcode[];
extern int linux_szsigcode;

/* Linux system call table, LINUX_SYS_MAXSYSCALL entries. */
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];

/*
 * Linker sets of ioctl and device handlers; linux_elf_modevent()
 * registers every member on load and unregisters them on unload.
 */
SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
SET_DECLARE(linux_device_handler_set, struct linux_device_handler);

/* Forward declarations of the sysentvec callbacks defined in this file. */
static int	linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static int	elf_linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
		    caddr_t *params);
static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
static void	exec_linux_setregs(struct thread *td, u_long entry,
				   u_long stack, u_long ps_strings);

/* Futex list and its lock, defined elsewhere; set up in linux_elf_modevent(). */
extern LIST_HEAD(futex_list, futex) futex_list;
extern struct sx futex_sx;

/*
 * Tags returned by EVENTHANDLER_REGISTER() at module load, kept so the
 * handlers can be deregistered at module unload.
 */
static eventhandler_tag linux_exit_tag;
static eventhandler_tag linux_schedtail_tag;
static eventhandler_tag linux_exec_tag;
117 
/*
 * Linux syscalls return negative errno's, we do positive and map them
 * Reference:
 *   FreeBSD: src/sys/sys/errno.h
 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
 *            linux-2.6.17.8/include/asm-generic/errno.h
 *
 * The table has ELAST + 1 entries and is installed as sv_errtbl in
 * both sysentvecs below.  Each entry is a negated number; the row
 * comments give the range of table indices covered by that row.
 */
static int bsd_to_linux_errno[ELAST + 1] = {
	/*  0 -  9 */
	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
	/* 10 - 19 */
	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
	/* 20 - 29 */
	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
	/* 30 - 39 */
	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
	/* 40 - 49 */
	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
	/* 50 - 59 */
	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
	/* 60 - 69 */
	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
	/* 70 - 79 */
	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
	/* 80 - 89 */
	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
	/* 90 - 92 */
	 -72, -67, -71
};
137 
/*
 * FreeBSD -> Linux signal number translation, installed as sv_sigtbl in
 * both sysentvecs below.  The sendsig routines index it with
 * _SIG_IDX(sig).  Two slots hold 0 rather than a LINUX_SIG* constant.
 */
int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
	0, LINUX_SIGUSR1, LINUX_SIGUSR2
};
148 
/*
 * Linux -> FreeBSD signal number translation; the reverse direction of
 * bsd_to_linux_signal[].  It is not referenced in this file.
 * NOTE(review): presumably indexed the same way (signal number - 1) by
 * the signal conversion code elsewhere -- confirm against the users.
 * SIGBUS and SIGURG each appear twice, so two different table slots
 * map to each of them.
 */
int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
	SIGHUP, SIGINT, SIGQUIT, SIGILL,
	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
	SIGIO, SIGURG, SIGSYS
};
159 
/* Value reported when a FreeBSD trap type has no Linux counterpart. */
#define LINUX_T_UNKNOWN  255

/*
 * FreeBSD trap type -> Linux trap number.  The array index is the
 * FreeBSD trap type (named in the comments); the stored value is what
 * ends up in the Linux sigcontext's sc_trapno field.
 */
static int _bsd_to_linux_trapcode[] = {
	[0]  = LINUX_T_UNKNOWN,
	[1]  = 6,			/* T_PRIVINFLT */
	[2]  = LINUX_T_UNKNOWN,
	[3]  = 3,			/* T_BPTFLT */
	[4]  = LINUX_T_UNKNOWN,
	[5]  = LINUX_T_UNKNOWN,
	[6]  = 16,			/* T_ARITHTRAP */
	[7]  = 254,			/* T_ASTFLT */
	[8]  = LINUX_T_UNKNOWN,
	[9]  = 13,			/* T_PROTFLT */
	[10] = 1,			/* T_TRCTRAP */
	[11] = LINUX_T_UNKNOWN,
	[12] = 14,			/* T_PAGEFLT */
	[13] = LINUX_T_UNKNOWN,
	[14] = 17,			/* T_ALIGNFLT */
	[15] = LINUX_T_UNKNOWN,
	[16] = LINUX_T_UNKNOWN,
	[17] = LINUX_T_UNKNOWN,
	[18] = 0,			/* T_DIVIDE */
	[19] = 2,			/* T_NMI */
	[20] = 4,			/* T_OFLOW */
	[21] = 5,			/* T_BOUND */
	[22] = 7,			/* T_DNA */
	[23] = 8,			/* T_DOUBLEFLT */
	[24] = 9,			/* T_FPOPFLT */
	[25] = 10,			/* T_TSSFLT */
	[26] = 11,			/* T_SEGNPFLT */
	[27] = 12,			/* T_STKFLT */
	[28] = 18,			/* T_MCHK */
	[29] = 19,			/* T_XMMFLT */
	[30] = 15			/* T_RESERVED */
};

/*
 * Bounds-checked table lookup.  The comparison is made against an
 * unsigned element count, so a negative code also falls outside the
 * table and yields LINUX_T_UNKNOWN.
 */
#define bsd_to_linux_trapcode(code) \
    ((code) < sizeof(_bsd_to_linux_trapcode) / \
	sizeof(_bsd_to_linux_trapcode[0]) ? \
     _bsd_to_linux_trapcode[(code)] : LINUX_T_UNKNOWN)
198 
199 /*
200  * If FreeBSD & Linux have a difference of opinion about what a trap
201  * means, deal with it here.
202  *
203  * MPSAFE
204  */
205 static int
206 translate_traps(int signal, int trap_code)
207 {
208 	if (signal != SIGBUS)
209 		return signal;
210 	switch (trap_code) {
211 	case T_PROTFLT:
212 	case T_TSSFLT:
213 	case T_DOUBLEFLT:
214 	case T_PAGEFLT:
215 		return SIGSEGV;
216 	default:
217 		return signal;
218 	}
219 }
220 
221 static int
222 linux_fixup(register_t **stack_base, struct image_params *imgp)
223 {
224 	register_t *argv, *envp;
225 
226 	argv = *stack_base;
227 	envp = *stack_base + (imgp->args->argc + 1);
228 	(*stack_base)--;
229 	**stack_base = (intptr_t)(void *)envp;
230 	(*stack_base)--;
231 	**stack_base = (intptr_t)(void *)argv;
232 	(*stack_base)--;
233 	**stack_base = imgp->args->argc;
234 	return 0;
235 }
236 
237 static int
238 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
239 {
240 	Elf32_Auxargs *args;
241 	register_t *pos;
242 
243 	KASSERT(curthread->td_proc == imgp->proc,
244 	    ("unsafe elf_linux_fixup(), should be curproc"));
245 	args = (Elf32_Auxargs *)imgp->auxargs;
246 	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
247 
248 	if (args->execfd != -1)
249 		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
250 	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
251 	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
252 	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
253 	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
254 	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
255 	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
256 	AUXARGS_ENTRY(pos, AT_BASE, args->base);
257 	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
258 	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
259 	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
260 	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
261 	AUXARGS_ENTRY(pos, AT_NULL, 0);
262 
263 	free(imgp->auxargs, M_TEMP);
264 	imgp->auxargs = NULL;
265 
266 	(*stack_base)--;
267 	**stack_base = (register_t)imgp->args->argc;
268 	return 0;
269 }
270 
/* Segment selectors loaded into the trapframe before a handler is entered. */
extern int _ucodesel, _udatasel;
/* Offset added to the trampoline start address by linux_rt_sendsig(). */
extern unsigned long linux_sznonrtsigcode;
273 
274 static void
275 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
276 {
277 	struct thread *td = curthread;
278 	struct proc *p = td->td_proc;
279 	struct sigacts *psp;
280 	struct trapframe *regs;
281 	struct l_rt_sigframe *fp, frame;
282 	int sig, code;
283 	int oonstack;
284 
285 	sig = ksi->ksi_signo;
286 	code = ksi->ksi_code;
287 	PROC_LOCK_ASSERT(p, MA_OWNED);
288 	psp = p->p_sigacts;
289 	mtx_assert(&psp->ps_mtx, MA_OWNED);
290 	regs = td->td_frame;
291 	oonstack = sigonstack(regs->tf_esp);
292 
293 #ifdef DEBUG
294 	if (ldebug(rt_sendsig))
295 		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
296 		    catcher, sig, (void*)mask, code);
297 #endif
298 	/*
299 	 * Allocate space for the signal handler context.
300 	 */
301 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
302 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
303 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
304 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
305 	} else
306 		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
307 	mtx_unlock(&psp->ps_mtx);
308 
309 	/*
310 	 * Build the argument list for the signal handler.
311 	 */
312 	if (p->p_sysent->sv_sigtbl)
313 		if (sig <= p->p_sysent->sv_sigsize)
314 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
315 
316 	bzero(&frame, sizeof(frame));
317 
318 	frame.sf_handler = catcher;
319 	frame.sf_sig = sig;
320 	frame.sf_siginfo = &fp->sf_si;
321 	frame.sf_ucontext = &fp->sf_sc;
322 
323 	/* Fill in POSIX parts */
324 	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
325 
326 	/*
327 	 * Build the signal context to be used by sigreturn.
328 	 */
329 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
330 	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
331 
332 	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
333 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
334 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
335 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
336 	PROC_UNLOCK(p);
337 
338 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
339 
340 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
341 	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
342 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
343 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
344 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
345 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
346 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
347 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
348 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
349 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
350 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
351 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
352 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
353 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
354 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
355 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
356 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
357 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
358 	frame.sf_sc.uc_mcontext.sc_cr2    = (register_t)ksi->ksi_addr;
359 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
360 
361 #ifdef DEBUG
362 	if (ldebug(rt_sendsig))
363 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
364 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
365 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
366 #endif
367 
368 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
369 		/*
370 		 * Process has trashed its stack; give it an illegal
371 		 * instruction to halt it in its tracks.
372 		 */
373 #ifdef DEBUG
374 		if (ldebug(rt_sendsig))
375 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
376 			    fp, oonstack);
377 #endif
378 		PROC_LOCK(p);
379 		sigexit(td, SIGILL);
380 	}
381 
382 	/*
383 	 * Build context to run handler in.
384 	 */
385 	regs->tf_esp = (int)fp;
386 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
387 	    linux_sznonrtsigcode;
388 	regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D);
389 	regs->tf_cs = _ucodesel;
390 	regs->tf_ds = _udatasel;
391 	regs->tf_es = _udatasel;
392 	regs->tf_fs = _udatasel;
393 	regs->tf_ss = _udatasel;
394 	PROC_LOCK(p);
395 	mtx_lock(&psp->ps_mtx);
396 }
397 
398 
399 /*
400  * Send an interrupt to process.
401  *
402  * Stack is set up to allow sigcode stored
403  * in u. to call routine, followed by kcall
404  * to sigreturn routine below.  After sigreturn
405  * resets the signal mask, the stack, and the
406  * frame pointer, it returns to the user
407  * specified pc, psl.
408  */
409 static void
410 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
411 {
412 	struct thread *td = curthread;
413 	struct proc *p = td->td_proc;
414 	struct sigacts *psp;
415 	struct trapframe *regs;
416 	struct l_sigframe *fp, frame;
417 	l_sigset_t lmask;
418 	int sig, code;
419 	int oonstack, i;
420 
421 	PROC_LOCK_ASSERT(p, MA_OWNED);
422 	psp = p->p_sigacts;
423 	sig = ksi->ksi_signo;
424 	code = ksi->ksi_code;
425 	mtx_assert(&psp->ps_mtx, MA_OWNED);
426 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
427 		/* Signal handler installed with SA_SIGINFO. */
428 		linux_rt_sendsig(catcher, ksi, mask);
429 		return;
430 	}
431 	regs = td->td_frame;
432 	oonstack = sigonstack(regs->tf_esp);
433 
434 #ifdef DEBUG
435 	if (ldebug(sendsig))
436 		printf(ARGS(sendsig, "%p, %d, %p, %u"),
437 		    catcher, sig, (void*)mask, code);
438 #endif
439 
440 	/*
441 	 * Allocate space for the signal handler context.
442 	 */
443 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
444 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
445 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
446 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
447 	} else
448 		fp = (struct l_sigframe *)regs->tf_esp - 1;
449 	mtx_unlock(&psp->ps_mtx);
450 	PROC_UNLOCK(p);
451 
452 	/*
453 	 * Build the argument list for the signal handler.
454 	 */
455 	if (p->p_sysent->sv_sigtbl)
456 		if (sig <= p->p_sysent->sv_sigsize)
457 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
458 
459 	bzero(&frame, sizeof(frame));
460 
461 	frame.sf_handler = catcher;
462 	frame.sf_sig = sig;
463 
464 	bsd_to_linux_sigset(mask, &lmask);
465 
466 	/*
467 	 * Build the signal context to be used by sigreturn.
468 	 */
469 	frame.sf_sc.sc_mask   = lmask.__bits[0];
470 	frame.sf_sc.sc_gs     = rgs();
471 	frame.sf_sc.sc_fs     = regs->tf_fs;
472 	frame.sf_sc.sc_es     = regs->tf_es;
473 	frame.sf_sc.sc_ds     = regs->tf_ds;
474 	frame.sf_sc.sc_edi    = regs->tf_edi;
475 	frame.sf_sc.sc_esi    = regs->tf_esi;
476 	frame.sf_sc.sc_ebp    = regs->tf_ebp;
477 	frame.sf_sc.sc_ebx    = regs->tf_ebx;
478 	frame.sf_sc.sc_edx    = regs->tf_edx;
479 	frame.sf_sc.sc_ecx    = regs->tf_ecx;
480 	frame.sf_sc.sc_eax    = regs->tf_eax;
481 	frame.sf_sc.sc_eip    = regs->tf_eip;
482 	frame.sf_sc.sc_cs     = regs->tf_cs;
483 	frame.sf_sc.sc_eflags = regs->tf_eflags;
484 	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
485 	frame.sf_sc.sc_ss     = regs->tf_ss;
486 	frame.sf_sc.sc_err    = regs->tf_err;
487 	frame.sf_sc.sc_cr2    = (register_t)ksi->ksi_addr;
488 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno);
489 
490 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
491 		frame.sf_extramask[i] = lmask.__bits[i+1];
492 
493 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
494 		/*
495 		 * Process has trashed its stack; give it an illegal
496 		 * instruction to halt it in its tracks.
497 		 */
498 		PROC_LOCK(p);
499 		sigexit(td, SIGILL);
500 	}
501 
502 	/*
503 	 * Build context to run handler in.
504 	 */
505 	regs->tf_esp = (int)fp;
506 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
507 	regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D);
508 	regs->tf_cs = _ucodesel;
509 	regs->tf_ds = _udatasel;
510 	regs->tf_es = _udatasel;
511 	regs->tf_fs = _udatasel;
512 	regs->tf_ss = _udatasel;
513 	PROC_LOCK(p);
514 	mtx_lock(&psp->ps_mtx);
515 }
516 
517 /*
518  * System call to cleanup state after a signal
519  * has been taken.  Reset signal mask and
520  * stack state from context left by sendsig (above).
521  * Return to previous pc and psl as specified by
522  * context left by sendsig. Check carefully to
523  * make sure that the user has not modified the
524  * psl to gain improper privileges or to cause
525  * a machine fault.
526  */
527 int
528 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
529 {
530 	struct proc *p = td->td_proc;
531 	struct l_sigframe frame;
532 	struct trapframe *regs;
533 	l_sigset_t lmask;
534 	int eflags, i;
535 	ksiginfo_t ksi;
536 
537 	regs = td->td_frame;
538 
539 #ifdef DEBUG
540 	if (ldebug(sigreturn))
541 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
542 #endif
543 	/*
544 	 * The trampoline code hands us the sigframe.
545 	 * It is unsafe to keep track of it ourselves, in the event that a
546 	 * program jumps out of a signal handler.
547 	 */
548 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
549 		return (EFAULT);
550 
551 	/*
552 	 * Check for security violations.
553 	 */
554 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
555 	eflags = frame.sf_sc.sc_eflags;
556 	/*
557 	 * XXX do allow users to change the privileged flag PSL_RF.  The
558 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
559 	 * sometimes set it there too.  tf_eflags is kept in the signal
560 	 * context during signal handling and there is no other place
561 	 * to remember it, so the PSL_RF bit may be corrupted by the
562 	 * signal handler without us knowing.  Corruption of the PSL_RF
563 	 * bit at worst causes one more or one less debugger trap, so
564 	 * allowing it is fairly harmless.
565 	 */
566 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
567 		return(EINVAL);
568 
569 	/*
570 	 * Don't allow users to load a valid privileged %cs.  Let the
571 	 * hardware check for invalid selectors, excess privilege in
572 	 * other selectors, invalid %eip's and invalid %esp's.
573 	 */
574 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
575 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
576 		ksiginfo_init_trap(&ksi);
577 		ksi.ksi_signo = SIGBUS;
578 		ksi.ksi_code = BUS_OBJERR;
579 		ksi.ksi_trapno = T_PROTFLT;
580 		ksi.ksi_addr = (void *)regs->tf_eip;
581 		trapsignal(td, &ksi);
582 		return(EINVAL);
583 	}
584 
585 	lmask.__bits[0] = frame.sf_sc.sc_mask;
586 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
587 		lmask.__bits[i+1] = frame.sf_extramask[i];
588 	PROC_LOCK(p);
589 	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
590 	SIG_CANTMASK(td->td_sigmask);
591 	signotify(td);
592 	PROC_UNLOCK(p);
593 
594 	/*
595 	 * Restore signal context.
596 	 */
597 	/* %gs was restored by the trampoline. */
598 	regs->tf_fs     = frame.sf_sc.sc_fs;
599 	regs->tf_es     = frame.sf_sc.sc_es;
600 	regs->tf_ds     = frame.sf_sc.sc_ds;
601 	regs->tf_edi    = frame.sf_sc.sc_edi;
602 	regs->tf_esi    = frame.sf_sc.sc_esi;
603 	regs->tf_ebp    = frame.sf_sc.sc_ebp;
604 	regs->tf_ebx    = frame.sf_sc.sc_ebx;
605 	regs->tf_edx    = frame.sf_sc.sc_edx;
606 	regs->tf_ecx    = frame.sf_sc.sc_ecx;
607 	regs->tf_eax    = frame.sf_sc.sc_eax;
608 	regs->tf_eip    = frame.sf_sc.sc_eip;
609 	regs->tf_cs     = frame.sf_sc.sc_cs;
610 	regs->tf_eflags = eflags;
611 	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
612 	regs->tf_ss     = frame.sf_sc.sc_ss;
613 
614 	return (EJUSTRETURN);
615 }
616 
617 /*
618  * System call to cleanup state after a signal
619  * has been taken.  Reset signal mask and
620  * stack state from context left by rt_sendsig (above).
621  * Return to previous pc and psl as specified by
622  * context left by sendsig. Check carefully to
623  * make sure that the user has not modified the
624  * psl to gain improper privileges or to cause
625  * a machine fault.
626  */
627 int
628 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
629 {
630 	struct proc *p = td->td_proc;
631 	struct l_ucontext uc;
632 	struct l_sigcontext *context;
633 	l_stack_t *lss;
634 	stack_t ss;
635 	struct trapframe *regs;
636 	int eflags;
637 	ksiginfo_t ksi;
638 
639 	regs = td->td_frame;
640 
641 #ifdef DEBUG
642 	if (ldebug(rt_sigreturn))
643 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
644 #endif
645 	/*
646 	 * The trampoline code hands us the ucontext.
647 	 * It is unsafe to keep track of it ourselves, in the event that a
648 	 * program jumps out of a signal handler.
649 	 */
650 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
651 		return (EFAULT);
652 
653 	context = &uc.uc_mcontext;
654 
655 	/*
656 	 * Check for security violations.
657 	 */
658 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
659 	eflags = context->sc_eflags;
660 	/*
661 	 * XXX do allow users to change the privileged flag PSL_RF.  The
662 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
663 	 * sometimes set it there too.  tf_eflags is kept in the signal
664 	 * context during signal handling and there is no other place
665 	 * to remember it, so the PSL_RF bit may be corrupted by the
666 	 * signal handler without us knowing.  Corruption of the PSL_RF
667 	 * bit at worst causes one more or one less debugger trap, so
668 	 * allowing it is fairly harmless.
669 	 */
670 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
671 		return(EINVAL);
672 
673 	/*
674 	 * Don't allow users to load a valid privileged %cs.  Let the
675 	 * hardware check for invalid selectors, excess privilege in
676 	 * other selectors, invalid %eip's and invalid %esp's.
677 	 */
678 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
679 	if (!CS_SECURE(context->sc_cs)) {
680 		ksiginfo_init_trap(&ksi);
681 		ksi.ksi_signo = SIGBUS;
682 		ksi.ksi_code = BUS_OBJERR;
683 		ksi.ksi_trapno = T_PROTFLT;
684 		ksi.ksi_addr = (void *)regs->tf_eip;
685 		trapsignal(td, &ksi);
686 		return(EINVAL);
687 	}
688 
689 	PROC_LOCK(p);
690 	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
691 	SIG_CANTMASK(td->td_sigmask);
692 	signotify(td);
693 	PROC_UNLOCK(p);
694 
695 	/*
696 	 * Restore signal context
697 	 */
698 	/* %gs was restored by the trampoline. */
699 	regs->tf_fs     = context->sc_fs;
700 	regs->tf_es     = context->sc_es;
701 	regs->tf_ds     = context->sc_ds;
702 	regs->tf_edi    = context->sc_edi;
703 	regs->tf_esi    = context->sc_esi;
704 	regs->tf_ebp    = context->sc_ebp;
705 	regs->tf_ebx    = context->sc_ebx;
706 	regs->tf_edx    = context->sc_edx;
707 	regs->tf_ecx    = context->sc_ecx;
708 	regs->tf_eax    = context->sc_eax;
709 	regs->tf_eip    = context->sc_eip;
710 	regs->tf_cs     = context->sc_cs;
711 	regs->tf_eflags = eflags;
712 	regs->tf_esp    = context->sc_esp_at_signal;
713 	regs->tf_ss     = context->sc_ss;
714 
715 	/*
716 	 * call sigaltstack & ignore results..
717 	 */
718 	lss = &uc.uc_stack;
719 	ss.ss_sp = lss->ss_sp;
720 	ss.ss_size = lss->ss_size;
721 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
722 
723 #ifdef DEBUG
724 	if (ldebug(rt_sigreturn))
725 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
726 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
727 #endif
728 	(void)kern_sigaltstack(td, &ss, NULL);
729 
730 	return (EJUSTRETURN);
731 }
732 
733 /*
734  * MPSAFE
735  */
736 static void
737 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
738 {
739 	args[0] = tf->tf_ebx;
740 	args[1] = tf->tf_ecx;
741 	args[2] = tf->tf_edx;
742 	args[3] = tf->tf_esi;
743 	args[4] = tf->tf_edi;
744 	args[5] = tf->tf_ebp;	/* Unconfirmed */
745 	*params = NULL;		/* no copyin */
746 }
747 
748 /*
749  * If a linux binary is exec'ing something, try this image activator
750  * first.  We override standard shell script execution in order to
751  * be able to modify the interpreter path.  We only do this if a linux
752  * binary is doing the exec, so we do not create an EXEC module for it.
753  */
754 static int	exec_linux_imgact_try(struct image_params *iparams);
755 
756 static int
757 exec_linux_imgact_try(struct image_params *imgp)
758 {
759     const char *head = (const char *)imgp->image_header;
760     char *rpath;
761     int error = -1, len;
762 
763     /*
764      * The interpreter for shell scripts run from a linux binary needs
765      * to be located in /compat/linux if possible in order to recursively
766      * maintain linux path emulation.
767      */
768     if (((const short *)head)[0] == SHELLMAGIC) {
769 	    /*
770 	     * Run our normal shell image activator.  If it succeeds attempt
771 	     * to use the alternate path for the interpreter.  If an alternate
772 	     * path is found, use our stringspace to store it.
773 	     */
774 	    if ((error = exec_shell_imgact(imgp)) == 0) {
775 		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
776 			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0, AT_FDCWD);
777 		    if (rpath != NULL) {
778 			    len = strlen(rpath) + 1;
779 
780 			    if (len <= MAXSHELLCMDLEN) {
781 				    memcpy(imgp->interpreter_name, rpath, len);
782 			    }
783 			    free(rpath, M_TEMP);
784 		    }
785 	    }
786     }
787     return(error);
788 }
789 
790 /*
791  * exec_setregs may initialize some registers differently than Linux
792  * does, thus potentially confusing Linux binaries. If necessary, we
793  * override the exec_setregs default(s) here.
794  */
795 static void
796 exec_linux_setregs(struct thread *td, u_long entry,
797 		   u_long stack, u_long ps_strings)
798 {
799 	static const u_short control = __LINUX_NPXCW__;
800 	struct pcb *pcb = td->td_pcb;
801 
802 	exec_setregs(td, entry, stack, ps_strings);
803 
804 	/* Linux sets %gs to 0, we default to _udatasel */
805 	pcb->pcb_gs = 0; load_gs(0);
806 
807 	/* Linux sets the i387 to extended precision. */
808 	fldcw(&control);
809 }
810 
/*
 * System entry vector for Linux a.out binaries ("Linux a.out").
 * It shares the syscall table, signal/errno translation tables,
 * trampoline and callbacks with elf_linux_sysvec; it differs in using
 * linux_fixup() for stack fixup, having no core dump routine, and
 * carrying the SV_AOUT flag.
 */
struct sysentvec linux_sysvec = {
	.sv_size	= LINUX_SYS_MAXSYSCALL,
	.sv_table	= linux_sysent,
	.sv_mask	= 0,
	.sv_sigsize	= LINUX_SIGTBLSZ,
	.sv_sigtbl	= bsd_to_linux_signal,
	.sv_errsize	= ELAST + 1,
	.sv_errtbl	= bsd_to_linux_errno,
	.sv_transtrap	= translate_traps,
	.sv_fixup	= linux_fixup,
	.sv_sendsig	= linux_sendsig,
	.sv_sigcode	= linux_sigcode,
	.sv_szsigcode	= &linux_szsigcode,
	.sv_prepsyscall	= linux_prepsyscall,
	.sv_name	= "Linux a.out",
	.sv_coredump	= NULL,
	.sv_imgact_try	= exec_linux_imgact_try,
	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
	.sv_pagesize	= PAGE_SIZE,
	.sv_minuser	= VM_MIN_ADDRESS,
	.sv_maxuser	= VM_MAXUSER_ADDRESS,
	.sv_usrstack	= USRSTACK,
	.sv_psstrings	= PS_STRINGS,
	.sv_stackprot	= VM_PROT_ALL,
	.sv_copyout_strings = exec_copyout_strings,
	.sv_setregs	= exec_linux_setregs,
	.sv_fixlimit	= NULL,
	.sv_maxssiz	= NULL,
	.sv_flags	= SV_ABI_LINUX | SV_AOUT | SV_IA32 | SV_ILP32
};
841 
/*
 * System entry vector for Linux ELF binaries ("Linux ELF"); both ELF
 * brands below point at it.  Compared with linux_sysvec it uses
 * elf_linux_fixup() for stack fixup, elf32_coredump() for core dumps,
 * and does not set SV_AOUT.
 */
struct sysentvec elf_linux_sysvec = {
	.sv_size	= LINUX_SYS_MAXSYSCALL,
	.sv_table	= linux_sysent,
	.sv_mask	= 0,
	.sv_sigsize	= LINUX_SIGTBLSZ,
	.sv_sigtbl	= bsd_to_linux_signal,
	.sv_errsize	= ELAST + 1,
	.sv_errtbl	= bsd_to_linux_errno,
	.sv_transtrap	= translate_traps,
	.sv_fixup	= elf_linux_fixup,
	.sv_sendsig	= linux_sendsig,
	.sv_sigcode	= linux_sigcode,
	.sv_szsigcode	= &linux_szsigcode,
	.sv_prepsyscall	= linux_prepsyscall,
	.sv_name	= "Linux ELF",
	.sv_coredump	= elf32_coredump,
	.sv_imgact_try	= exec_linux_imgact_try,
	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
	.sv_pagesize	= PAGE_SIZE,
	.sv_minuser	= VM_MIN_ADDRESS,
	.sv_maxuser	= VM_MAXUSER_ADDRESS,
	.sv_usrstack	= USRSTACK,
	.sv_psstrings	= PS_STRINGS,
	.sv_stackprot	= VM_PROT_ALL,
	.sv_copyout_strings = exec_copyout_strings,
	.sv_setregs	= exec_linux_setregs,
	.sv_fixlimit	= NULL,
	.sv_maxssiz	= NULL,
	.sv_flags	= SV_ABI_LINUX | SV_IA32 | SV_ILP32
};
872 
/*
 * ELF brand for i386 Linux binaries whose interpreter is
 * /lib/ld-linux.so.1.  Uses /compat/linux as emulation path and
 * elf_linux_sysvec as system entry vector.
 */
static Elf32_Brandinfo linux_brand = {
	.brand		= ELFOSABI_LINUX,
	.machine	= EM_386,
	.compat_3_brand	= "Linux",
	.emul_path	= "/compat/linux",
	.interp_path	= "/lib/ld-linux.so.1",
	.sysvec		= &elf_linux_sysvec,
	.interp_newpath	= NULL,
	.flags		= BI_CAN_EXEC_DYN,
};
883 
/*
 * ELF brand for i386 Linux binaries whose interpreter is
 * /lib/ld-linux.so.2; otherwise identical to linux_brand.
 */
static Elf32_Brandinfo linux_glibc2brand = {
	.brand		= ELFOSABI_LINUX,
	.machine	= EM_386,
	.compat_3_brand	= "Linux",
	.emul_path	= "/compat/linux",
	.interp_path	= "/lib/ld-linux.so.2",
	.sysvec		= &elf_linux_sysvec,
	.interp_newpath	= NULL,
	.flags		= BI_CAN_EXEC_DYN,
};
894 
/*
 * NULL-terminated list of the brands above; linux_elf_modevent() walks
 * it to insert the brands on load and to remove them on unload.
 */
Elf32_Brandinfo *linux_brandlist[] = {
	&linux_brand,
	&linux_glibc2brand,
	NULL
};
900 
901 static int
902 linux_elf_modevent(module_t mod, int type, void *data)
903 {
904 	Elf32_Brandinfo **brandinfo;
905 	int error;
906 	struct linux_ioctl_handler **lihp;
907 	struct linux_device_handler **ldhp;
908 
909 	error = 0;
910 
911 	switch(type) {
912 	case MOD_LOAD:
913 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
914 		     ++brandinfo)
915 			if (elf32_insert_brand_entry(*brandinfo) < 0)
916 				error = EINVAL;
917 		if (error == 0) {
918 			SET_FOREACH(lihp, linux_ioctl_handler_set)
919 				linux_ioctl_register_handler(*lihp);
920 			SET_FOREACH(ldhp, linux_device_handler_set)
921 				linux_device_register_handler(*ldhp);
922 			mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
923 			sx_init(&emul_shared_lock, "emuldata->shared lock");
924 			LIST_INIT(&futex_list);
925 			sx_init(&futex_sx, "futex protection lock");
926 			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit,
927 			      NULL, 1000);
928 			linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail, linux_schedtail,
929 			      NULL, 1000);
930 			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec,
931 			      NULL, 1000);
932 			if (bootverbose)
933 				printf("Linux ELF exec handler installed\n");
934 		} else
935 			printf("cannot insert Linux ELF brand handler\n");
936 		break;
937 	case MOD_UNLOAD:
938 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
939 		     ++brandinfo)
940 			if (elf32_brand_inuse(*brandinfo))
941 				error = EBUSY;
942 		if (error == 0) {
943 			for (brandinfo = &linux_brandlist[0];
944 			     *brandinfo != NULL; ++brandinfo)
945 				if (elf32_remove_brand_entry(*brandinfo) < 0)
946 					error = EINVAL;
947 		}
948 		if (error == 0) {
949 			SET_FOREACH(lihp, linux_ioctl_handler_set)
950 				linux_ioctl_unregister_handler(*lihp);
951 			SET_FOREACH(ldhp, linux_device_handler_set)
952 				linux_device_unregister_handler(*ldhp);
953 			mtx_destroy(&emul_lock);
954 			sx_destroy(&emul_shared_lock);
955 			sx_destroy(&futex_sx);
956 			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
957 			EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag);
958 			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
959 			if (bootverbose)
960 				printf("Linux ELF exec handler removed\n");
961 		} else
962 			printf("Could not deinstall ELF interpreter entry\n");
963 		break;
964 	default:
965 		return EOPNOTSUPP;
966 	}
967 	return error;
968 }
969 
/*
 * Module description: name "linuxelf", event handler
 * linux_elf_modevent(), and 0 for the remaining field.
 */
static moduledata_t linux_elf_mod = {
	"linuxelf",
	linux_elf_modevent,
	0
};

/* Register the module at SI_SUB_EXEC with order SI_ORDER_ANY. */
DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
977