xref: /netbsd/sys/arch/i386/i386/trap.c (revision 6550d01e)
1 /*	$NetBSD: trap.c,v 1.260 2010/12/20 00:25:35 matt Exp $	*/
2 
3 /*-
4  * Copyright (c) 1998, 2000, 2005, 2006, 2007, 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Charles M. Hannum.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*-
33  * Copyright (c) 1990 The Regents of the University of California.
34  * All rights reserved.
35  *
36  * This code is derived from software contributed to Berkeley by
37  * the University of Utah, and William Jolitz.
38  *
39  * Redistribution and use in source and binary forms, with or without
40  * modification, are permitted provided that the following conditions
41  * are met:
42  * 1. Redistributions of source code must retain the above copyright
43  *    notice, this list of conditions and the following disclaimer.
44  * 2. Redistributions in binary form must reproduce the above copyright
45  *    notice, this list of conditions and the following disclaimer in the
46  *    documentation and/or other materials provided with the distribution.
47  * 3. Neither the name of the University nor the names of its contributors
48  *    may be used to endorse or promote products derived from this software
49  *    without specific prior written permission.
50  *
51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61  * SUCH DAMAGE.
62  *
63  *	@(#)trap.c	7.4 (Berkeley) 5/13/91
64  */
65 
66 /*
67  * 386 Trap and System call handling
68  */
69 
70 #include <sys/cdefs.h>
71 __KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.260 2010/12/20 00:25:35 matt Exp $");
72 
73 #include "opt_ddb.h"
74 #include "opt_kgdb.h"
75 #include "opt_lockdebug.h"
76 #include "opt_multiprocessor.h"
77 #include "opt_vm86.h"
78 #include "opt_kvm86.h"
79 #include "opt_kstack_dr0.h"
80 #include "opt_xen.h"
81 #include "opt_dtrace.h"
82 
83 #include <sys/param.h>
84 #include <sys/systm.h>
85 #include <sys/proc.h>
86 #include <sys/acct.h>
87 #include <sys/kauth.h>
88 #include <sys/kernel.h>
89 #include <sys/kmem.h>
90 #include <sys/ras.h>
91 #include <sys/signal.h>
92 #include <sys/syscall.h>
93 #include <sys/cpu.h>
94 #include <sys/ucontext.h>
95 #include <sys/sa.h>
96 #include <sys/savar.h>
97 
98 #include <uvm/uvm_extern.h>
99 
100 #include <machine/cpufunc.h>
101 #include <machine/psl.h>
102 #include <machine/reg.h>
103 #include <machine/trap.h>
104 #include <machine/userret.h>
105 #ifdef DDB
106 #include <machine/db_machdep.h>
107 #endif
108 
109 #include "mca.h"
110 #if NMCA > 0
111 #include <machine/mca_machdep.h>
112 #endif
113 
114 #include <x86/nmi.h>
115 
116 #include "isa.h"
117 
118 #ifdef KGDB
119 #include <sys/kgdb.h>
120 #endif
121 
122 #include "npx.h"
123 
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>

/*
 * This is a hook which is initialised by the dtrace module
 * to handle traps which might occur during DTrace probe
 * execution.  It is NULL until a handler is registered, and
 * trap() only calls through it when it is non-NULL.
 */
dtrace_trap_func_t	dtrace_trap_func = NULL;

/*
 * Companion hook for double traps.  It is defined here but not
 * referenced by the code visible in this file.
 * NOTE(review): presumably also installed by the dtrace module -- confirm.
 */
dtrace_doubletrap_func_t	dtrace_doubletrap_func = NULL;
#endif
136 
137 
/* Picks the SIGFPE si_code for an SSE floating point exception. */
static inline int xmm_si_code(struct lwp *);
/* Common C-level handler for exceptions, faults and traps. */
void trap(struct trapframe *);
/* Builds a trapframe from a TSS and passes it to trap(). */
void trap_tss(struct i386tss *, int, int);
/* Defined outside this file; declared __dead, i.e. it does not return. */
void trap_return_fault_return(struct trapframe *) __dead;
142 
/*
 * KVM86MODE expands to kvm86_incall when the kernel is built with the
 * KVM86 option and to the constant 0 otherwise, so the tests on it in
 * trap() are always false in kernels without KVM86.
 */
#ifdef KVM86
#include <machine/kvm86.h>
#define KVM86MODE (kvm86_incall)
#else
#define KVM86MODE (0)
#endif
149 
150 const char * const trap_type[] = {
151 	"privileged instruction fault",		/*  0 T_PRIVINFLT */
152 	"breakpoint trap",			/*  1 T_BPTFLT */
153 	"arithmetic trap",			/*  2 T_ARITHTRAP */
154 	"asynchronous system trap",		/*  3 T_ASTFLT */
155 	"protection fault",			/*  4 T_PROTFLT */
156 	"trace trap",				/*  5 T_TRCTRAP */
157 	"page fault",				/*  6 T_PAGEFLT */
158 	"alignment fault",			/*  7 T_ALIGNFLT */
159 	"integer divide fault",			/*  8 T_DIVIDE */
160 	"non-maskable interrupt",		/*  9 T_NMI */
161 	"overflow trap",			/* 10 T_OFLOW */
162 	"bounds check fault",			/* 11 T_BOUND */
163 	"FPU not available fault",		/* 12 T_DNA */
164 	"double fault",				/* 13 T_DOUBLEFLT */
165 	"FPU operand fetch fault",		/* 14 T_FPOPFLT */
166 	"invalid TSS fault",			/* 15 T_TSSFLT */
167 	"segment not present fault",		/* 16 T_SEGNPFLT */
168 	"stack fault",				/* 17 T_STKFLT */
169 	"machine check fault",			/* 18 T_MCA */
170 	"SSE FP exception",			/* 19 T_XMM */
171 	"reserved trap",			/* 20 T_RESERVED */
172 };
173 int	trap_types = __arraycount(trap_type);
174 
#ifdef DEBUG
/* When non-zero, trap() prints a description of every trap it is handed. */
int	trapdebug = 0;
#endif

/* IDTVEC(foo) expands to the symbol name Xfoo. */
#define	IDTVEC(name)	__CONCAT(X, name)
180 
181 void
182 trap_tss(struct i386tss *tss, int trapno, int code)
183 {
184 	struct trapframe tf;
185 
186 	tf.tf_gs = tss->tss_gs;
187 	tf.tf_fs = tss->tss_fs;
188 	tf.tf_es = tss->__tss_es;
189 	tf.tf_ds = tss->__tss_ds;
190 	tf.tf_edi = tss->__tss_edi;
191 	tf.tf_esi = tss->__tss_esi;
192 	tf.tf_ebp = tss->tss_ebp;
193 	tf.tf_ebx = tss->__tss_ebx;
194 	tf.tf_edx = tss->__tss_edx;
195 	tf.tf_ecx = tss->__tss_ecx;
196 	tf.tf_eax = tss->__tss_eax;
197 	tf.tf_trapno = trapno;
198 	tf.tf_err = code | TC_TSS;
199 	tf.tf_eip = tss->__tss_eip;
200 	tf.tf_cs = tss->__tss_cs;
201 	tf.tf_eflags = tss->__tss_eflags;
202 	tf.tf_esp = tss->tss_esp;
203 	tf.tf_ss = tss->__tss_ss;
204 	trap(&tf);
205 }
206 
207 static inline int
208 xmm_si_code(struct lwp *l)
209 {
210 	struct pcb *pcb;
211 	uint32_t mxcsr, mask;
212 
213 	if (!i386_use_fxsave) {
214 #ifdef DIAGNOSTIC
215 		panic("SSE FP Exception, but no SSE");
216 #endif
217 		return 0;
218 	}
219 	pcb = lwp_getpcb(l);
220 	mxcsr = pcb->pcb_savefpu.sv_xmm.sv_env.en_mxcsr;
221 
222 	/*
223          * Since we only have a single status and control register,
224 	 * we use the exception mask bits to mask disabled exceptions
225 	 */
226 	mask = ~((mxcsr & __INITIAL_MXCSR__) >> 7) & 0xff;
227         switch (mask & mxcsr) {
228 	case EN_SW_INVOP:
229 		return FPE_FLTINV;
230 	case EN_SW_DENORM:
231 	case EN_SW_PRECLOSS:
232 		return FPE_FLTRES;
233 	case EN_SW_ZERODIV:
234 		return FPE_FLTDIV;
235 	case EN_SW_OVERFLOW:
236 		return FPE_FLTOVF;
237 	case EN_SW_UNDERFLOW:
238 		return FPE_FLTUND;
239 	case EN_SW_DATACHAIN:
240 		return FPE_FLTSUB;
241 	case 0:
242 	default:
243 		return 0;
244 	}
245 }
246 
247 static void *
248 onfault_handler(const struct pcb *pcb, const struct trapframe *tf)
249 {
250 	struct onfault_table {
251 		uintptr_t start;
252 		uintptr_t end;
253 		void *handler;
254 	};
255 	extern const struct onfault_table onfault_table[];
256 	const struct onfault_table *p;
257 	uintptr_t pc;
258 
259 	if (pcb->pcb_onfault != NULL) {
260 		return pcb->pcb_onfault;
261 	}
262 
263 	pc = tf->tf_eip;
264 	for (p = onfault_table; p->start; p++) {
265 		if (p->start <= pc && pc < p->end) {
266 			return p->handler;
267 		}
268 	}
269 	return NULL;
270 }
271 
272 static void
273 trap_print(int type, struct trapframe *frame)
274 {
275 	if (frame->tf_trapno < trap_types)
276 		printf("fatal %s", trap_type[frame->tf_trapno]);
277 	else
278 		printf("unknown trap %d", frame->tf_trapno);
279 	printf(" in %s mode\n", (type & T_USER) ? "user" : "supervisor");
280 	printf("trap type %d code %x eip %x cs %x eflags %x cr2 %lx ilevel %x\n",
281 	    type, frame->tf_err, frame->tf_eip, frame->tf_cs,
282 	    frame->tf_eflags, (long)rcr2(), curcpu()->ci_ilevel);
283 }
284 
/*
 * check_dr0(): with KSTACK_CHECK_DR0, panic if %dr6 reports that debug
 * register 0 triggered.  Without that option this is a no-op.
 *
 * Clearing the bit and carrying on was disabled in the original code;
 * the panic is unconditional once the bit is seen.
 */
static void
check_dr0(void)
{
#ifdef KSTACK_CHECK_DR0
	const u_int dr0_hit = 1 << 0;	/* %dr6 status bit for dr0 */

	if ((rdr6() & dr0_hit) != 0)
		panic("trap on DR0: maybe kernel stack overflow\n");
#endif
}
302 
/*
 * trap(frame): exception, fault, and trap interface to BSD kernel.
 *
 * This common code is called from assembly language IDT gate entry routines
 * that prepare a suitable stack frame, and restore this frame after the
 * exception has been processed. Note that the effect is as if the arguments
 * were passed call by reference.
 *
 * The trap number comes from frame->tf_trapno.  When the trap was not
 * taken in kernel mode (and is neither an NMI nor taken during a kvm86
 * call) T_USER is or'ed into it, so every "case T_xxx|T_USER" below is
 * the user-mode flavour of trap T_xxx.
 *
 * Common exits:
 *	we_re_toast	unhandled trap: report it, offer it to the kernel
 *			debuggers, and panic if none of them takes it.
 *	trapsignal	post the prepared ksiginfo through the emulation's
 *			e_trapsignal hook, then userret().
 *	out		userret() only.
 */
void
trap(struct trapframe *frame)
{
	struct lwp *l = curlwp;
	struct proc *p;
	struct pcb *pcb;
	extern char fusubail[], kcopy_fault[], return_address_fault[],
	    IDTVEC(osyscall)[];
	struct trapframe *vframe;
	ksiginfo_t ksi;
	void *onfault;
	int type, error;
	uint32_t cr2;
	bool pfail;

	if (__predict_true(l != NULL)) {
		pcb = lwp_getpcb(l);
		p = l->l_proc;
	} else {
		/*
		 * This can happen e.g. on breakpoints early in boot.
		 */
		pcb = NULL;
		p = NULL;
	}
	type = frame->tf_trapno;

#ifdef DEBUG
	if (trapdebug) {
		printf("trap %d code %x eip %x cs %x eflags %x cr2 %lx cpl %x\n",
		    type, frame->tf_err, frame->tf_eip, frame->tf_cs,
		    frame->tf_eflags, rcr2(), curcpu()->ci_ilevel);
		printf("curlwp %p%s", curlwp, curlwp ? " " : "\n");
		if (curlwp)
			printf("pid %d lid %d\n", l->l_proc->p_pid, l->l_lid);
	}
#endif
	/*
	 * Trap from user mode: tag the type, remember the user register
	 * frame in the LWP, reset the recorded fault address and refresh
	 * the cached credentials.
	 */
	if (type != T_NMI && !KVM86MODE &&
	    !KERNELMODE(frame->tf_cs, frame->tf_eflags)) {
		type |= T_USER;
		l->l_md.md_regs = frame;
		pcb->pcb_cr2 = 0;
		LWP_CACHE_CREDS(l, p);
	}

#ifdef KDTRACE_HOOKS
	/*
	 * A trap can occur while DTrace executes a probe. Before
	 * executing the probe, DTrace blocks re-scheduling and sets
	 * a flag in its per-cpu flags to indicate that it doesn't
	 * want to fault. On returning from the probe, the no-fault
	 * flag is cleared and finally re-scheduling is enabled.
	 *
	 * If the DTrace kernel module has registered a trap handler,
	 * call it and if it returns non-zero, assume that it has
	 * handled the trap and modified the trap frame so that this
	 * function can return normally.
	 *
	 * Only kernel-mode protection and page faults are offered to
	 * the hook: the comparison is against the type without T_USER.
	 */
	if ((type == T_PROTFLT || type == T_PAGEFLT) &&
	    dtrace_trap_func != NULL) {
		if ((*dtrace_trap_func)(frame, type)) {
			return;
		}
	}
#endif

	switch (type) {

	case T_ASTFLT:
		if (KVM86MODE) {
			break;
		}
		/*FALLTHROUGH*/

	default:
	we_re_toast:
		/*
		 * Unhandled trap.  Trace traps get the DR0 check instead
		 * of the usual report.
		 */
		if (type == T_TRCTRAP)
			check_dr0();
		else
			trap_print(type, frame);
#ifdef DDB
		if (kdb_trap(type, 0, frame))
			return;
#endif
#ifdef KGDB
		if (kgdb_trap(type, frame))
			return;
		else {
			/*
			 * If this is a breakpoint, don't panic
			 * if we're not connected.
			 */
			if (type == T_BPTFLT) {
				printf("kgdb: ignored %s\n", trap_type[type]);
				return;
			}
		}
#endif
		panic("trap");
		/*NOTREACHED*/

	case T_PROTFLT:
#ifdef KVM86
		if (KVM86MODE) {
			kvm86_gpfault(frame);
			return;
		}
#endif
		/*FALLTHROUGH*/
	case T_SEGNPFLT:
	case T_ALIGNFLT:
	case T_TSSFLT:
		if (p == NULL)
			goto we_re_toast;
		/* Check for copyin/copyout fault. */
		onfault = onfault_handler(pcb, frame);
		if (onfault != NULL) {
copyefault:
			error = EFAULT;
copyfault:
			/*
			 * Resume at the recovery handler with the error
			 * code in %eax.
			 */
			frame->tf_eip = (uintptr_t)onfault;
			frame->tf_eax = error;
			return;
		}

		/*
		 * Check for failure during return to user mode.
		 * This can happen loading invalid values into the segment
		 * registers, or during the 'iret' itself.
		 *
		 * We do this by looking at the instruction we faulted on.
		 * The specific instructions we recognize only happen when
		 * returning from a trap, syscall, or interrupt.
		 */

kernelfault:
		KSI_INIT_TRAP(&ksi);
		ksi.ksi_signo = SIGSEGV;
		ksi.ksi_code = SEGV_ACCERR;
		ksi.ksi_trap = type;

		switch (*(u_char *)frame->tf_eip) {
		case 0xcf:	/* iret */
			/*
			 * The 'iret' instruction faulted, so we have the
			 * 'user' registers saved after the kernel %eip:%cs:%fl
			 * of the 'iret' and below that the user %eip:%cs:%fl
			 * the 'iret' was processing.
			 * We must delete the 3 words of kernel return address
			 * from the stack to generate a normal stack frame
			 * (eg for sending a SIGSEGV).
			 */
			vframe = (void *)((int *)frame + 3);
			if (KERNELMODE(vframe->tf_cs, vframe->tf_eflags))
				goto we_re_toast;
			memmove(vframe, frame,
			    offsetof(struct trapframe, tf_eip));
			/* Set the faulting address to the user %eip */
			ksi.ksi_addr = (void *)vframe->tf_eip;
			break;
		case 0x8e:
			switch (*(uint32_t *)frame->tf_eip) {
			case 0x8e242c8e:	/* mov (%esp,%gs), then */
			case 0x0424648e:	/* mov 0x4(%esp),%fs */
			case 0x0824448e:	/* mov 0x8(%esp),%es */
			case 0x0c245c8e:	/* mov 0xc(%esp),%ds */
				break;
			default:
				goto we_re_toast;
			}
			/*
			 * We faulted loading one of the user segment registers.
			 * The stack frame containing the user registers is
			 * still valid and is just below the %eip:%cs:%fl of
			 * the kernel fault frame.
			 */
			vframe = (void *)(&frame->tf_eflags + 1);
			if (KERNELMODE(vframe->tf_cs, vframe->tf_eflags))
				goto we_re_toast;
			/* There is no valid address for the fault */
			break;
		default:
			goto we_re_toast;
		}
		/*
		 * We might have faulted trying to execute the
		 * trampoline for a local (nested) signal handler.
		 * Only generate SIGSEGV if the user %cs isn't changed.
		 * (This is only strictly necessary in the 'iret' case.)
		 */
		if (!pmap_exec_fixup(&p->p_vmspace->vm_map, vframe, pcb)) {
			/* Save outer frame for any signal return */
			l->l_md.md_regs = vframe;
			(*p->p_emul->e_trapsignal)(l, &ksi);
		}
		/* Return to user by reloading the user frame */
		trap_return_fault_return(vframe);
		/* NOTREACHED */

	case T_PROTFLT|T_USER:		/* protection fault */
	case T_TSSFLT|T_USER:
	case T_SEGNPFLT|T_USER:
	case T_STKFLT|T_USER:
	case T_ALIGNFLT|T_USER:
		KSI_INIT_TRAP(&ksi);

		ksi.ksi_addr = (void *)rcr2();
		/* Pick the signal and si_code for the specific fault. */
		switch (type) {
		case T_SEGNPFLT|T_USER:
		case T_STKFLT|T_USER:
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_ADRERR;
			break;
		case T_TSSFLT|T_USER:
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			break;
		case T_ALIGNFLT|T_USER:
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_ADRALN;
			break;
		case T_PROTFLT|T_USER:
#ifdef VM86
			if (frame->tf_eflags & PSL_VM) {
				vm86_gpfault(l, type & ~T_USER);
				goto out;
			}
#endif
			/*
			 * If pmap_exec_fixup does something,
			 * let's retry the trap.
			 */
			if (pmap_exec_fixup(&p->p_vmspace->vm_map, frame, pcb)){
				goto out;
			}
			ksi.ksi_signo = SIGSEGV;
			ksi.ksi_code = SEGV_ACCERR;
			break;
		default:
			/* Every case label of the outer switch is listed above. */
			KASSERT(0);
			break;
		}
		goto trapsignal;

	case T_PRIVINFLT|T_USER:	/* privileged instruction fault */
	case T_FPOPFLT|T_USER:		/* coprocessor operand fault */
		KSI_INIT_TRAP(&ksi);
		ksi.ksi_signo = SIGILL;
		ksi.ksi_addr = (void *)rcr2();
		switch (type) {
		case T_PRIVINFLT|T_USER:
			ksi.ksi_code = ILL_PRVOPC;
			break;
		case T_FPOPFLT|T_USER:
			ksi.ksi_code = ILL_COPROC;
			break;
		default:
			ksi.ksi_code = 0;
			break;
		}
		goto trapsignal;

	case T_ASTFLT|T_USER:
		/* Allow process switch. */
		//curcpu()->ci_data.cpu_nast++;
		if (l->l_pflag & LP_OWEUPC) {
			l->l_pflag &= ~LP_OWEUPC;
			ADDUPROF(l);
		}
		/* Allow a forced task switch. */
		if (curcpu()->ci_want_resched) {
			preempt();
		}
		goto out;

	case T_DNA|T_USER: {
		/* Report the reason on the console and send SIGKILL. */
		KSI_INIT_TRAP(&ksi);
		ksi.ksi_signo = SIGKILL;
		ksi.ksi_addr = (void *)frame->tf_eip;
		printf("pid %d killed due to lack of floating point\n",
		    p->p_pid);
		goto trapsignal;
	}

	case T_XMM|T_USER:
	case T_BOUND|T_USER:
	case T_OFLOW|T_USER:
	case T_DIVIDE|T_USER:
	case T_ARITHTRAP|T_USER:
		/* All of these become SIGFPE; only the si_code differs. */
		KSI_INIT_TRAP(&ksi);
		ksi.ksi_signo = SIGFPE;
		ksi.ksi_addr = (void *)frame->tf_eip;
		switch (type) {
		case T_XMM|T_USER:
			ksi.ksi_code = xmm_si_code(l);
			break;
		case T_BOUND|T_USER:
			ksi.ksi_code = FPE_FLTSUB;
			break;
		case T_OFLOW|T_USER:
			ksi.ksi_code = FPE_INTOVF;
			break;
		case T_DIVIDE|T_USER:
			ksi.ksi_code = FPE_INTDIV;
			break;
		case T_ARITHTRAP|T_USER:
			ksi.ksi_code = npxtrap(l);
			break;
		default:
			ksi.ksi_code = 0;
			break;
		}
		goto trapsignal;

	case T_PAGEFLT:
		/* Allow page faults in kernel mode. */
		if (__predict_false(l == NULL))
			goto we_re_toast;

		/*
		 * fusubail is used by [fs]uswintr() to prevent page faulting
		 * from inside the profiling interrupt.
		 */
		onfault = pcb->pcb_onfault;
		if (onfault == fusubail || onfault == return_address_fault) {
			goto copyefault;
		}
		/* No page faults from interrupt context. */
		if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) {
			goto we_re_toast;
		}

		cr2 = rcr2();
		goto faultcommon;

	case T_PAGEFLT|T_USER: {	/* page fault */
		register vaddr_t va;
		register struct vmspace *vm;
		register struct vm_map *map;
		vm_prot_t ftype;
		extern struct vm_map *kernel_map;

		cr2 = rcr2();
		if (l->l_flag & LW_SA) {
			l->l_savp->savp_faultaddr = (vaddr_t)cr2;
			l->l_pflag |= LP_SA_PAGEFAULT;
		}
faultcommon:
		/* Shared by the kernel-mode and user-mode page fault cases. */
		vm = p->p_vmspace;
		if (__predict_false(vm == NULL)) {
			goto we_re_toast;
		}
		pcb->pcb_cr2 = cr2;
		va = trunc_page((vaddr_t)cr2);
		/*
		 * It is only a kernel address space fault iff:
		 *	1. (type & T_USER) == 0  and
		 *	2. pcb_onfault not set or
		 *	3. pcb_onfault set but supervisor space fault
		 * The last can occur during an exec() copyin where the
		 * argument space is lazy-allocated.
		 */
		if (type == T_PAGEFLT && va >= KERNBASE)
			map = kernel_map;
		else
			map = &vm->vm_map;
		/* Derive the access type from the page fault error code. */
		if (frame->tf_err & PGEX_W)
			ftype = VM_PROT_WRITE;
		else if (frame->tf_err & PGEX_X)
			ftype = VM_PROT_EXECUTE;
		else
			ftype = VM_PROT_READ;

#ifdef DIAGNOSTIC
		if (map == kernel_map && va == 0) {
			printf("trap: bad kernel access at %lx\n", va);
			goto we_re_toast;
		}
#endif
		/*
		 * Fault the original page in.  pcb_onfault is cleared
		 * around the call to uvm_fault() and restored afterwards.
		 */
		onfault = pcb->pcb_onfault;
		pcb->pcb_onfault = NULL;
		error = uvm_fault(map, va, ftype);
		pcb->pcb_onfault = onfault;
		if (error == 0) {
			if (map != kernel_map && (void *)va >= vm->vm_maxsaddr)
				uvm_grow(p, va);

			pfail = false;
			/*
			 * Kernel-mode faults only: this loop is left
			 * solely through the return below.
			 */
			while (type == T_PAGEFLT) {
				/*
				 * We need to switch pmap now if we're in
				 * the middle of copyin/out.
				 *
				 * But we don't need to do so for kcopy as
				 * it never touches userspace.
				 */
				kpreempt_disable();
				if (curcpu()->ci_want_pmapload) {
					onfault = onfault_handler(pcb, frame);
					if (onfault != kcopy_fault) {
						pmap_load();
					}
				}
				/*
				 * We need to keep the pmap loaded and
				 * so avoid being preempted until back
				 * into the copy functions.  Disable
				 * interrupts at the hardware level before
				 * re-enabling preemption.  Interrupts
				 * will be re-enabled by 'iret' when
				 * returning back out of the trap stub.
				 * They'll only be re-enabled when the
				 * program counter is once again in
				 * the copy functions, and so visible
				 * to cpu_kpreempt_exit().
				 */
#ifndef XEN
				x86_disable_intr();
#endif
				l->l_nopreempt--;
				if (l->l_nopreempt > 0 || !l->l_dopreempt ||
				    pfail) {
					return;
				}
#ifndef XEN
				x86_enable_intr();
#endif
				/*
				 * If preemption fails for some reason,
				 * don't retry it.  The conditions won't
				 * change under our nose.
				 */
				pfail = kpreempt(0);
			}
			l->l_pflag &= ~LP_SA_PAGEFAULT;
			goto out;
		}
		/* uvm_fault() failed: describe the fault for signal delivery. */
		KSI_INIT_TRAP(&ksi);
		ksi.ksi_trap = type & ~T_USER;
		ksi.ksi_addr = (void *)cr2;
		if (error == EACCES) {
			ksi.ksi_code = SEGV_ACCERR;
			error = EFAULT;
		} else {
			ksi.ksi_code = SEGV_MAPERR;
		}

		if (type == T_PAGEFLT) {
			/*
			 * Kernel-mode fault: hand the error to a recovery
			 * handler if there is one, otherwise treat it
			 * like the other kernel faults.
			 */
			onfault = onfault_handler(pcb, frame);
			if (onfault != NULL)
				goto copyfault;
			printf("uvm_fault(%p, %#lx, %d) -> %#x\n",
			    map, va, ftype, error);
			goto kernelfault;
		}
		if (error == ENOMEM) {
			ksi.ksi_signo = SIGKILL;
			printf("UVM: pid %d (%s), uid %d killed: out of swap\n",
			       p->p_pid, p->p_comm,
			       l->l_cred ?
			       kauth_cred_geteuid(l->l_cred) : -1);
		} else {
			ksi.ksi_signo = SIGSEGV;
		}
		(*p->p_emul->e_trapsignal)(l, &ksi);
		l->l_pflag &= ~LP_SA_PAGEFAULT;
		break;
	}

	case T_TRCTRAP:
		/* Check whether they single-stepped into a lcall. */
		if (frame->tf_eip == (int)IDTVEC(osyscall))
			return;
		if (frame->tf_eip == (int)IDTVEC(osyscall) + 1) {
			frame->tf_eflags &= ~PSL_T;
			return;
		}
		goto we_re_toast;

	case T_BPTFLT|T_USER:		/* bpt instruction fault */
	case T_TRCTRAP|T_USER:		/* trace trap */
		/*
		 * Don't go single-stepping into a RAS.
		 */
		if (p->p_raslist == NULL ||
		    (ras_lookup(p, (void *)frame->tf_eip) == (void *)-1)) {
			KSI_INIT_TRAP(&ksi);
			ksi.ksi_signo = SIGTRAP;
			ksi.ksi_trap = type & ~T_USER;
			if (type == (T_BPTFLT|T_USER))
				ksi.ksi_code = TRAP_BRKPT;
			else
				ksi.ksi_code = TRAP_TRACE;
			ksi.ksi_addr = (void *)frame->tf_eip;
			(*p->p_emul->e_trapsignal)(l, &ksi);
		}
		break;

	case T_NMI:
#if !defined(XEN)
		/* Registered NMI handlers get the first chance. */
		if (nmi_dispatch(frame))
			return;
#if (NISA > 0 || NMCA > 0)
#if defined(KGDB) || defined(DDB)
		/* NMI can be hooked up to a pushbutton for debugging */
		printf ("NMI ... going to debugger\n");
#ifdef KGDB

		if (kgdb_trap(type, frame))
			return;
#endif
#ifdef DDB
		if (kdb_trap(type, 0, frame))
			return;
#endif
#endif /* KGDB || DDB */
		/* machine/parity/power fail/"kitchen sink" faults */

#if NMCA > 0
		/* mca_nmi() takes care to call x86_nmi() if appropriate */
		if (mca_nmi() != 0)
			goto we_re_toast;
		else
			return;
#else /* NISA > 0 */
		if (x86_nmi() != 0)
			goto we_re_toast;
		else
			return;
#endif /* NMCA > 0 */
#endif /* (NISA > 0 || NMCA > 0) */
#endif /* !defined(XEN) */
		;	/* avoid a label at end of compound statement */
	}

	/* Kernel-mode traps that broke out of the switch are done. */
	if ((type & T_USER) == 0)
		return;
out:
	userret(l);
	return;
trapsignal:
	/* Deliver the signal prepared in ksi, then return to user mode. */
	ksi.ksi_trap = type & ~T_USER;
	(*p->p_emul->e_trapsignal)(l, &ksi);
	userret(l);
}
855 
856 /*
857  * startlwp: start of a new LWP.
858  */
859 void
860 startlwp(void *arg)
861 {
862 	ucontext_t *uc = arg;
863 	lwp_t *l = curlwp;
864 	int error;
865 
866 	error = cpu_setmcontext(l, &uc->uc_mcontext, uc->uc_flags);
867 	KASSERT(error == 0);
868 
869 	kmem_free(uc, sizeof(ucontext_t));
870 	userret(l);
871 }
872 
/*
 * upcallret(l): invoke KERNEL_UNLOCK_LAST() on the LWP and then run
 * userret() for it.
 *
 * XXX_SA: This is a terrible name.
 */
void
upcallret(struct lwp *l)
{
	KERNEL_UNLOCK_LAST(l);
	userret(l);
}
882