xref: /dragonfly/sys/platform/vkernel64/x86_64/trap.c (revision dcd37f7d)
1 /*-
2  * Copyright (C) 1994, David Greenman
3  * Copyright (c) 1990, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * the University of Utah, and William Jolitz.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed by the University of
20  *	California, Berkeley and its contributors.
21  * 4. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
38  * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $
39  */
40 
41 /*
42  * x86_64 Trap and System call handling
43  */
44 
45 #include "use_isa.h"
46 
47 #include "opt_ddb.h"
48 #include "opt_ktrace.h"
49 
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/proc.h>
53 #include <sys/pioctl.h>
54 #include <sys/kernel.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/signal2.h>
58 #include <sys/syscall.h>
59 #include <sys/sysctl.h>
60 #include <sys/sysent.h>
61 #include <sys/uio.h>
62 #include <sys/vmmeter.h>
63 #include <sys/malloc.h>
64 #ifdef KTRACE
65 #include <sys/ktrace.h>
66 #endif
67 #include <sys/ktr.h>
68 #include <sys/upcall.h>
69 #include <sys/vkernel.h>
70 #include <sys/sysproto.h>
71 #include <sys/sysunion.h>
72 #include <sys/vmspace.h>
73 
74 #include <vm/vm.h>
75 #include <vm/vm_param.h>
76 #include <sys/lock.h>
77 #include <vm/pmap.h>
78 #include <vm/vm_kern.h>
79 #include <vm/vm_map.h>
80 #include <vm/vm_page.h>
81 #include <vm/vm_extern.h>
82 
83 #include <machine/cpu.h>
84 #include <machine/md_var.h>
85 #include <machine/pcb.h>
86 #include <machine/smp.h>
87 #include <machine/tss.h>
88 #include <machine/globaldata.h>
89 
90 #include <ddb/ddb.h>
91 
92 #include <sys/msgport2.h>
93 #include <sys/thread2.h>
94 #include <sys/mplock2.h>
95 
96 #ifdef SMP
97 
98 #define MAKEMPSAFE(have_mplock)			\
99 	if (have_mplock == 0) {			\
100 		get_mplock();			\
101 		have_mplock = 1;		\
102 	}
103 
104 #else
105 
106 #define MAKEMPSAFE(have_mplock)
107 
108 #endif
109 
110 int (*pmath_emulate) (struct trapframe *);
111 
112 extern int trapwrite (unsigned addr);
113 
114 static int trap_pfault (struct trapframe *, int, vm_offset_t);
115 static void trap_fatal (struct trapframe *, int, vm_offset_t);
116 void dblfault_handler (void);
117 
118 #if 0
119 extern inthand_t IDTVEC(syscall);
120 #endif
121 
122 #define MAX_TRAP_MSG		30
123 static char *trap_msg[] = {
124 	"",					/*  0 unused */
125 	"privileged instruction fault",		/*  1 T_PRIVINFLT */
126 	"",					/*  2 unused */
127 	"breakpoint instruction fault",		/*  3 T_BPTFLT */
128 	"",					/*  4 unused */
129 	"",					/*  5 unused */
130 	"arithmetic trap",			/*  6 T_ARITHTRAP */
131 	"system forced exception",		/*  7 T_ASTFLT */
132 	"",					/*  8 unused */
133 	"general protection fault",		/*  9 T_PROTFLT */
134 	"trace trap",				/* 10 T_TRCTRAP */
135 	"",					/* 11 unused */
136 	"page fault",				/* 12 T_PAGEFLT */
137 	"",					/* 13 unused */
138 	"alignment fault",			/* 14 T_ALIGNFLT */
139 	"",					/* 15 unused */
140 	"",					/* 16 unused */
141 	"",					/* 17 unused */
142 	"integer divide fault",			/* 18 T_DIVIDE */
143 	"non-maskable interrupt trap",		/* 19 T_NMI */
144 	"overflow trap",			/* 20 T_OFLOW */
145 	"FPU bounds check fault",		/* 21 T_BOUND */
146 	"FPU device not available",		/* 22 T_DNA */
147 	"double fault",				/* 23 T_DOUBLEFLT */
148 	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
149 	"invalid TSS fault",			/* 25 T_TSSFLT */
150 	"segment not present fault",		/* 26 T_SEGNPFLT */
151 	"stack fault",				/* 27 T_STKFLT */
152 	"machine check trap",			/* 28 T_MCHK */
153 	"SIMD floating-point exception",	/* 29 T_XMMFLT */
154 	"reserved (unknown) fault",		/* 30 T_RESERVED */
155 };
156 
157 #ifdef DDB
158 static int ddb_on_nmi = 1;
159 SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
160 	&ddb_on_nmi, 0, "Go to DDB on NMI");
161 #endif
162 static int panic_on_nmi = 1;
163 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
164 	&panic_on_nmi, 0, "Panic on NMI");
165 static int fast_release;
166 SYSCTL_INT(_machdep, OID_AUTO, fast_release, CTLFLAG_RW,
167 	&fast_release, 0, "Passive Release was optimal");
168 static int slow_release;
169 SYSCTL_INT(_machdep, OID_AUTO, slow_release, CTLFLAG_RW,
170 	&slow_release, 0, "Passive Release was nonoptimal");
171 #ifdef SMP
172 static int syscall_mpsafe = 1;
173 SYSCTL_INT(_kern, OID_AUTO, syscall_mpsafe, CTLFLAG_RW,
174 	&syscall_mpsafe, 0, "Allow MPSAFE marked syscalls to run without BGL");
175 TUNABLE_INT("kern.syscall_mpsafe", &syscall_mpsafe);
176 static int trap_mpsafe = 1;
177 SYSCTL_INT(_kern, OID_AUTO, trap_mpsafe, CTLFLAG_RW,
178 	&trap_mpsafe, 0, "Allow traps to mostly run without the BGL");
179 TUNABLE_INT("kern.trap_mpsafe", &trap_mpsafe);
180 #endif
181 
182 MALLOC_DEFINE(M_SYSMSG, "sysmsg", "sysmsg structure");
183 extern int max_sysmsg;
184 
185 /*
186  * Passively intercepts the thread switch function to increase the thread
187  * priority from a user priority to a kernel priority, reducing
188  * syscall and trap overhead for the case where no switch occurs.
189  *
190  * Synchronizes td_ucred with p_ucred.  This is used by system calls,
191  * signal handling, faults, AST traps, and anything else that enters the
192  * kernel from userland and provides the kernel with a stable read-only
193  * copy of the process ucred.
194  */
195 static __inline void
196 userenter(struct thread *curtd, struct proc *curp)
197 {
198 	struct ucred *ocred;
199 	struct ucred *ncred;
200 
201 	curtd->td_release = lwkt_passive_release;
202 
203 	if (curtd->td_ucred != curp->p_ucred) {
204 		ncred = crhold(curp->p_ucred);
205 		ocred = curtd->td_ucred;
206 		curtd->td_ucred = ncred;
207 		if (ocred)
208 			crfree(ocred);
209 	}
210 }
211 
212 /*
213  * Handle signals, upcalls, profiling, and other AST's and/or tasks that
214  * must be completed before we can return to or try to return to userland.
215  *
216  * Note that td_sticks is a 64 bit quantity, but there's no point doing 64
217  * arithmatic on the delta calculation so the absolute tick values are
218  * truncated to an integer.
219  */
220 static void
221 userret(struct lwp *lp, struct trapframe *frame, int sticks)
222 {
223 	struct proc *p = lp->lwp_proc;
224 	int sig;
225 
226 	/*
227 	 * Charge system time if profiling.  Note: times are in microseconds.
228 	 * This may do a copyout and block, so do it first even though it
229 	 * means some system time will be charged as user time.
230 	 */
231 	if (p->p_flag & P_PROFIL) {
232 		addupc_task(p, frame->tf_rip,
233 			(u_int)((int)lp->lwp_thread->td_sticks - sticks));
234 	}
235 
236 recheck:
237 	/*
238 	 * If the jungle wants us dead, so be it.
239 	 */
240 	if (lp->lwp_flag & LWP_WEXIT) {
241 		get_mplock();
242 		lwp_exit(0);
243 		rel_mplock(); /* NOT REACHED */
244 	}
245 
246 	/*
247 	 * Block here if we are in a stopped state.
248 	 */
249 	if (p->p_stat == SSTOP) {
250 		get_mplock();
251 		tstop();
252 		rel_mplock();
253 		goto recheck;
254 	}
255 
256 	/*
257 	 * Post any pending upcalls
258 	 */
259 	if (p->p_flag & P_UPCALLPEND) {
260 		get_mplock();
261 		p->p_flag &= ~P_UPCALLPEND;
262 		postupcall(lp);
263 		rel_mplock();
264 		goto recheck;
265 	}
266 
267 	/*
268 	 * Post any pending signals
269 	 */
270 	if ((sig = CURSIG_TRACE(lp)) != 0) {
271 		get_mplock();
272 		postsig(sig);
273 		rel_mplock();
274 		goto recheck;
275 	}
276 
277 	/*
278 	 * block here if we are swapped out, but still process signals
279 	 * (such as SIGKILL).  proc0 (the swapin scheduler) is already
280 	 * aware of our situation, we do not have to wake it up.
281 	 */
282 	if (p->p_flag & P_SWAPPEDOUT) {
283 		get_mplock();
284 		p->p_flag |= P_SWAPWAIT;
285 		swapin_request();
286 		if (p->p_flag & P_SWAPWAIT)
287 			tsleep(p, PCATCH, "SWOUT", 0);
288 		p->p_flag &= ~P_SWAPWAIT;
289 		rel_mplock();
290 		goto recheck;
291 	}
292 
293 	/*
294 	 * Make sure postsig() handled request to restore old signal mask after
295 	 * running signal handler.
296 	 */
297 	KKASSERT((lp->lwp_flag & LWP_OLDMASK) == 0);
298 }
299 
300 /*
301  * Cleanup from userenter and any passive release that might have occured.
302  * We must reclaim the current-process designation before we can return
303  * to usermode.  We also handle both LWKT and USER reschedule requests.
304  */
305 static __inline void
306 userexit(struct lwp *lp)
307 {
308 	struct thread *td = lp->lwp_thread;
309 	/* globaldata_t gd = td->td_gd; */
310 
311 	/*
312 	 * Handle stop requests at kernel priority.  Any requests queued
313 	 * after this loop will generate another AST.
314 	 */
315 	while (lp->lwp_proc->p_stat == SSTOP) {
316 		get_mplock();
317 		tstop();
318 		rel_mplock();
319 	}
320 
321 	/*
322 	 * Reduce our priority in preparation for a return to userland.  If
323 	 * our passive release function was still in place, our priority was
324 	 * never raised and does not need to be reduced.
325 	 */
326 	lwkt_passive_recover(td);
327 
328 	/*
329 	 * Become the current user scheduled process if we aren't already,
330 	 * and deal with reschedule requests and other factors.
331 	 */
332 	lp->lwp_proc->p_usched->acquire_curproc(lp);
333 	/* WARNING: we may have migrated cpu's */
334 	/* gd = td->td_gd; */
335 }
336 
337 #if !defined(KTR_KERNENTRY)
338 #define	KTR_KERNENTRY	KTR_ALL
339 #endif
340 KTR_INFO_MASTER(kernentry);
341 KTR_INFO(KTR_KERNENTRY, kernentry, trap, 0, "pid=%d, tid=%d, trapno=%d, eva=%p",
342 	 sizeof(int) + sizeof(int) + sizeof(int) + sizeof(vm_offset_t));
343 KTR_INFO(KTR_KERNENTRY, kernentry, trap_ret, 0, "pid=%d, tid=%d",
344 	 sizeof(int) + sizeof(int));
345 KTR_INFO(KTR_KERNENTRY, kernentry, syscall, 0, "pid=%d, tid=%d, call=%d",
346 	 sizeof(int) + sizeof(int) + sizeof(int));
347 KTR_INFO(KTR_KERNENTRY, kernentry, syscall_ret, 0, "pid=%d, tid=%d, err=%d",
348 	 sizeof(int) + sizeof(int) + sizeof(int));
349 KTR_INFO(KTR_KERNENTRY, kernentry, fork_ret, 0, "pid=%d, tid=%d",
350 	 sizeof(int) + sizeof(int));
351 
352 /*
353  * Exception, fault, and trap interface to the kernel.
354  * This common code is called from assembly language IDT gate entry
355  * routines that prepare a suitable stack frame, and restore this
356  * frame after the exception has been processed.
357  *
358  * This function is also called from doreti in an interlock to handle ASTs.
359  * For example:  hardwareint->INTROUTINE->(set ast)->doreti->trap
360  *
361  * NOTE!  We have to retrieve the fault address prior to obtaining the
362  * MP lock because get_mplock() may switch out.  YYY cr2 really ought
363  * to be retrieved by the assembly code, not here.
364  *
365  * XXX gd_trap_nesting_level currently prevents lwkt_switch() from panicing
366  * if an attempt is made to switch from a fast interrupt or IPI.  This is
367  * necessary to properly take fatal kernel traps on SMP machines if
368  * get_mplock() has to block.
369  */
370 
371 void
372 user_trap(struct trapframe *frame)
373 {
374 	struct globaldata *gd = mycpu;
375 	struct thread *td = gd->gd_curthread;
376 	struct lwp *lp = td->td_lwp;
377 	struct proc *p;
378 	int sticks = 0;
379 	int i = 0, ucode = 0, type, code;
380 #ifdef SMP
381 	int have_mplock = 0;
382 #endif
383 #ifdef INVARIANTS
384 	int crit_count = td->td_pri & ~TDPRI_MASK;
385 #endif
386 	vm_offset_t eva;
387 
388 	p = td->td_proc;
389 
390 	if (frame->tf_trapno == T_PAGEFLT)
391 		eva = frame->tf_addr;
392 	else
393 		eva = 0;
394 #if 0
395 	kprintf("USER_TRAP AT %08lx xflags %ld trapno %ld eva %08lx\n",
396 		frame->tf_rip, frame->tf_xflags, frame->tf_trapno, eva);
397 #endif
398 
399 	/*
400 	 * Everything coming from user mode runs through user_trap,
401 	 * including system calls.
402 	 */
403 	if (frame->tf_trapno == T_FAST_SYSCALL) {
404 		syscall2(frame);
405 		return;
406 	}
407 
408 	KTR_LOG(kernentry_trap, lp->lwp_proc->p_pid, lp->lwp_tid,
409 		frame->tf_trapno, eva);
410 
411 #ifdef DDB
412 	if (db_active) {
413 		eva = (frame->tf_trapno == T_PAGEFLT ? rcr2() : 0);
414 		++gd->gd_trap_nesting_level;
415 		MAKEMPSAFE(have_mplock);
416 		trap_fatal(frame, TRUE, eva);
417 		--gd->gd_trap_nesting_level;
418 		goto out2;
419 	}
420 #endif
421 
422 	++gd->gd_trap_nesting_level;
423 #ifdef SMP
424 	if (trap_mpsafe == 0)
425 		MAKEMPSAFE(have_mplock);
426 #endif
427 
428 	--gd->gd_trap_nesting_level;
429 
430 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
431 restart:
432 #endif
433 	type = frame->tf_trapno;
434 	code = frame->tf_err;
435 
436 	userenter(td, p);
437 
438 	sticks = (int)td->td_sticks;
439 	lp->lwp_md.md_regs = frame;
440 
441 	switch (type) {
442 	case T_PRIVINFLT:	/* privileged instruction fault */
443 		ucode = type;
444 		i = SIGILL;
445 		break;
446 
447 	case T_BPTFLT:		/* bpt instruction fault */
448 	case T_TRCTRAP:		/* trace trap */
449 		frame->tf_rflags &= ~PSL_T;
450 		i = SIGTRAP;
451 		break;
452 
453 	case T_ARITHTRAP:	/* arithmetic trap */
454 		ucode = code;
455 		i = SIGFPE;
456 		break;
457 
458 	case T_ASTFLT:		/* Allow process switch */
459 		mycpu->gd_cnt.v_soft++;
460 		if (mycpu->gd_reqflags & RQF_AST_OWEUPC) {
461 			atomic_clear_int_nonlocked(&mycpu->gd_reqflags,
462 				    RQF_AST_OWEUPC);
463 			addupc_task(p, p->p_prof.pr_addr,
464 				    p->p_prof.pr_ticks);
465 		}
466 		goto out;
467 
468 		/*
469 		 * The following two traps can happen in
470 		 * vm86 mode, and, if so, we want to handle
471 		 * them specially.
472 		 */
473 	case T_PROTFLT:		/* general protection fault */
474 	case T_STKFLT:		/* stack fault */
475 #if 0
476 		if (frame->tf_eflags & PSL_VM) {
477 			i = vm86_emulate((struct vm86frame *)frame);
478 			if (i == 0)
479 				goto out;
480 			break;
481 		}
482 #endif
483 		/* FALL THROUGH */
484 
485 	case T_SEGNPFLT:	/* segment not present fault */
486 	case T_TSSFLT:		/* invalid TSS fault */
487 	case T_DOUBLEFLT:	/* double fault */
488 	default:
489 		ucode = code + BUS_SEGM_FAULT ;
490 		i = SIGBUS;
491 		break;
492 
493 	case T_PAGEFLT:		/* page fault */
494 		MAKEMPSAFE(have_mplock);
495 		i = trap_pfault(frame, TRUE, eva);
496 		if (i == -1)
497 			goto out;
498 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
499 		if (i == -2)
500 			goto restart;
501 #endif
502 		if (i == 0)
503 			goto out;
504 
505 		ucode = T_PAGEFLT;
506 		break;
507 
508 	case T_DIVIDE:		/* integer divide fault */
509 		ucode = FPE_INTDIV;
510 		i = SIGFPE;
511 		break;
512 
513 #if NISA > 0
514 	case T_NMI:
515 		MAKEMPSAFE(have_mplock);
516 		/* machine/parity/power fail/"kitchen sink" faults */
517 		if (isa_nmi(code) == 0) {
518 #ifdef DDB
519 			/*
520 			 * NMI can be hooked up to a pushbutton
521 			 * for debugging.
522 			 */
523 			if (ddb_on_nmi) {
524 				kprintf ("NMI ... going to debugger\n");
525 				kdb_trap (type, 0, frame);
526 			}
527 #endif /* DDB */
528 			goto out2;
529 		} else if (panic_on_nmi)
530 			panic("NMI indicates hardware failure");
531 		break;
532 #endif /* NISA > 0 */
533 
534 	case T_OFLOW:		/* integer overflow fault */
535 		ucode = FPE_INTOVF;
536 		i = SIGFPE;
537 		break;
538 
539 	case T_BOUND:		/* bounds check fault */
540 		ucode = FPE_FLTSUB;
541 		i = SIGFPE;
542 		break;
543 
544 	case T_DNA:
545 		/*
546 		 * Virtual kernel intercept - pass the DNA exception
547 		 * to the (emulated) virtual kernel if it asked to handle
548 		 * it.  This occurs when the virtual kernel is holding
549 		 * onto the FP context for a different emulated
550 		 * process then the one currently running.
551 		 *
552 		 * We must still call npxdna() since we may have
553 		 * saved FP state that the (emulated) virtual kernel
554 		 * needs to hand over to a different emulated process.
555 		 */
556 		if (lp->lwp_vkernel && lp->lwp_vkernel->ve &&
557 		    (td->td_pcb->pcb_flags & FP_VIRTFP)
558 		) {
559 			npxdna(frame);
560 			break;
561 		}
562 		/*
563 		 * The kernel may have switched out the FP unit's
564 		 * state, causing the user process to take a fault
565 		 * when it tries to use the FP unit.  Restore the
566 		 * state here
567 		 */
568 		if (npxdna(frame))
569 			goto out;
570 		if (!pmath_emulate) {
571 			i = SIGFPE;
572 			ucode = FPE_FPU_NP_TRAP;
573 			break;
574 		}
575 		i = (*pmath_emulate)(frame);
576 		if (i == 0) {
577 			if (!(frame->tf_rflags & PSL_T))
578 				goto out2;
579 			frame->tf_rflags &= ~PSL_T;
580 			i = SIGTRAP;
581 		}
582 		/* else ucode = emulator_only_knows() XXX */
583 		break;
584 
585 	case T_FPOPFLT:		/* FPU operand fetch fault */
586 		ucode = T_FPOPFLT;
587 		i = SIGILL;
588 		break;
589 
590 	case T_XMMFLT:		/* SIMD floating-point exception */
591 		ucode = 0; /* XXX */
592 		i = SIGFPE;
593 		break;
594 	}
595 
596 	/*
597 	 * Virtual kernel intercept - if the fault is directly related to a
598 	 * VM context managed by a virtual kernel then let the virtual kernel
599 	 * handle it.
600 	 */
601 	if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
602 		vkernel_trap(lp, frame);
603 		goto out;
604 	}
605 
606 	/*
607 	 * Translate fault for emulators (e.g. Linux)
608 	 */
609 	if (*p->p_sysent->sv_transtrap)
610 		i = (*p->p_sysent->sv_transtrap)(i, type);
611 
612 	MAKEMPSAFE(have_mplock);
613 	trapsignal(lp, i, ucode);
614 
615 #ifdef DEBUG
616 	if (type <= MAX_TRAP_MSG) {
617 		uprintf("fatal process exception: %s",
618 			trap_msg[type]);
619 		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
620 			uprintf(", fault VA = 0x%lx", (u_long)eva);
621 		uprintf("\n");
622 	}
623 #endif
624 
625 out:
626 #ifdef SMP
627 	KASSERT(td->td_mpcount == have_mplock, ("badmpcount trap/end from %p", (void *)frame->tf_rip));
628 #endif
629 	userret(lp, frame, sticks);
630 	userexit(lp);
631 out2:	;
632 #ifdef SMP
633 	if (have_mplock)
634 		rel_mplock();
635 #endif
636 	KTR_LOG(kernentry_trap_ret, lp->lwp_proc->p_pid, lp->lwp_tid);
637 #ifdef INVARIANTS
638 	KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK),
639 		("syscall: critical section count mismatch! %d/%d",
640 		crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT));
641 #endif
642 }
643 
644 void
645 kern_trap(struct trapframe *frame)
646 {
647 	struct globaldata *gd = mycpu;
648 	struct thread *td = gd->gd_curthread;
649 	struct lwp *lp;
650 	struct proc *p;
651 	int i = 0, ucode = 0, type, code;
652 #ifdef SMP
653 	int have_mplock = 0;
654 #endif
655 #ifdef INVARIANTS
656 	int crit_count = td->td_pri & ~TDPRI_MASK;
657 #endif
658 	vm_offset_t eva;
659 
660 	lp = td->td_lwp;
661 	p = td->td_proc;
662 
663 	if (frame->tf_trapno == T_PAGEFLT)
664 		eva = frame->tf_addr;
665 	else
666 		eva = 0;
667 
668 #ifdef DDB
669 	if (db_active) {
670 		++gd->gd_trap_nesting_level;
671 		MAKEMPSAFE(have_mplock);
672 		trap_fatal(frame, FALSE, eva);
673 		--gd->gd_trap_nesting_level;
674 		goto out2;
675 	}
676 #endif
677 
678 	++gd->gd_trap_nesting_level;
679 
680 #ifdef SMP
681 	if (trap_mpsafe == 0)
682 		MAKEMPSAFE(have_mplock);
683 #endif
684 
685 	--gd->gd_trap_nesting_level;
686 
687 	type = frame->tf_trapno;
688 	code = frame->tf_err;
689 
690 #if 0
691 kernel_trap:
692 #endif
693 	/* kernel trap */
694 
695 	switch (type) {
696 	case T_PAGEFLT:			/* page fault */
697 		MAKEMPSAFE(have_mplock);
698 		trap_pfault(frame, FALSE, eva);
699 		goto out2;
700 
701 	case T_DNA:
702 		/*
703 		 * The kernel may be using npx for copying or other
704 		 * purposes.
705 		 */
706 		panic("kernel NPX should not happen");
707 		if (npxdna(frame))
708 			goto out2;
709 		break;
710 
711 	case T_PROTFLT:		/* general protection fault */
712 	case T_SEGNPFLT:	/* segment not present fault */
713 		/*
714 		 * Invalid segment selectors and out of bounds
715 		 * %eip's and %esp's can be set up in user mode.
716 		 * This causes a fault in kernel mode when the
717 		 * kernel tries to return to user mode.  We want
718 		 * to get this fault so that we can fix the
719 		 * problem here and not have to check all the
720 		 * selectors and pointers when the user changes
721 		 * them.
722 		 */
723 		if (mycpu->gd_intr_nesting_level == 0) {
724 			if (td->td_pcb->pcb_onfault) {
725 				frame->tf_rip =
726 				    (register_t)td->td_pcb->pcb_onfault;
727 				goto out2;
728 			}
729 		}
730 		break;
731 
732 	case T_TSSFLT:
733 		/*
734 		 * PSL_NT can be set in user mode and isn't cleared
735 		 * automatically when the kernel is entered.  This
736 		 * causes a TSS fault when the kernel attempts to
737 		 * `iret' because the TSS link is uninitialized.  We
738 		 * want to get this fault so that we can fix the
739 		 * problem here and not every time the kernel is
740 		 * entered.
741 		 */
742 		if (frame->tf_rflags & PSL_NT) {
743 			frame->tf_rflags &= ~PSL_NT;
744 			goto out2;
745 		}
746 		break;
747 
748 	case T_TRCTRAP:	 /* trace trap */
749 #if 0
750 		if (frame->tf_eip == (int)IDTVEC(syscall)) {
751 			/*
752 			 * We've just entered system mode via the
753 			 * syscall lcall.  Continue single stepping
754 			 * silently until the syscall handler has
755 			 * saved the flags.
756 			 */
757 			goto out2;
758 		}
759 		if (frame->tf_eip == (int)IDTVEC(syscall) + 1) {
760 			/*
761 			 * The syscall handler has now saved the
762 			 * flags.  Stop single stepping it.
763 			 */
764 			frame->tf_eflags &= ~PSL_T;
765 			goto out2;
766 		}
767 #endif
768 #if 0
769 		/*
770 		 * Ignore debug register trace traps due to
771 		 * accesses in the user's address space, which
772 		 * can happen under several conditions such as
773 		 * if a user sets a watchpoint on a buffer and
774 		 * then passes that buffer to a system call.
775 		 * We still want to get TRCTRAPS for addresses
776 		 * in kernel space because that is useful when
777 		 * debugging the kernel.
778 		 */
779 		if (user_dbreg_trap()) {
780 			/*
781 			 * Reset breakpoint bits because the
782 			 * processor doesn't
783 			 */
784 			load_dr6(rdr6() & 0xfffffff0);
785 			goto out2;
786 		}
787 #endif
788 		/*
789 		 * Fall through (TRCTRAP kernel mode, kernel address)
790 		 */
791 	case T_BPTFLT:
792 		/*
793 		 * If DDB is enabled, let it handle the debugger trap.
794 		 * Otherwise, debugger traps "can't happen".
795 		 */
796 #ifdef DDB
797 		MAKEMPSAFE(have_mplock);
798 		if (kdb_trap (type, 0, frame))
799 			goto out2;
800 #endif
801 		break;
802 	case T_DIVIDE:
803 		MAKEMPSAFE(have_mplock);
804 		trap_fatal(frame, FALSE, eva);
805 		goto out2;
806 	case T_NMI:
807 		MAKEMPSAFE(have_mplock);
808 		trap_fatal(frame, FALSE, eva);
809 		goto out2;
810 	case T_SYSCALL80:
811 	case T_FAST_SYSCALL:
812 		/*
813 		 * Ignore this trap generated from a spurious SIGTRAP.
814 		 *
815 		 * single stepping in / syscalls leads to spurious / SIGTRAP
816 		 * so ignore
817 		 *
818 		 * Haiku (c) 2007 Simon 'corecode' Schubert
819 		 */
820 		goto out2;
821 	}
822 
823 	/*
824 	 * Translate fault for emulators (e.g. Linux)
825 	 */
826 	if (*p->p_sysent->sv_transtrap)
827 		i = (*p->p_sysent->sv_transtrap)(i, type);
828 
829 	MAKEMPSAFE(have_mplock);
830 	trapsignal(lp, i, ucode);
831 
832 #ifdef DEBUG
833 	if (type <= MAX_TRAP_MSG) {
834 		uprintf("fatal process exception: %s",
835 			trap_msg[type]);
836 		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
837 			uprintf(", fault VA = 0x%lx", (u_long)eva);
838 		uprintf("\n");
839 	}
840 #endif
841 
842 out2:
843 	;
844 #ifdef SMP
845 	if (have_mplock)
846 		rel_mplock();
847 #endif
848 #ifdef INVARIANTS
849 	KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK),
850 		("syscall: critical section count mismatch! %d/%d",
851 		crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT));
852 #endif
853 }
854 
855 int
856 trap_pfault(struct trapframe *frame, int usermode, vm_offset_t eva)
857 {
858 	vm_offset_t va;
859 	struct vmspace *vm = NULL;
860 	vm_map_t map = 0;
861 	int rv = 0;
862 	vm_prot_t ftype;
863 	thread_t td = curthread;
864 	struct lwp *lp = td->td_lwp;
865 
866 	va = trunc_page(eva);
867 	if (usermode == FALSE) {
868 		/*
869 		 * This is a fault on kernel virtual memory.
870 		 */
871 		map = &kernel_map;
872 	} else {
873 		/*
874 		 * This is a fault on non-kernel virtual memory.
875 		 * vm is initialized above to NULL. If curproc is NULL
876 		 * or curproc->p_vmspace is NULL the fault is fatal.
877 		 */
878 		if (lp != NULL)
879 			vm = lp->lwp_vmspace;
880 
881 		if (vm == NULL)
882 			goto nogo;
883 
884 		map = &vm->vm_map;
885 	}
886 
887 	if (frame->tf_err & PGEX_W)
888 		ftype = VM_PROT_READ | VM_PROT_WRITE;
889 	else
890 		ftype = VM_PROT_READ;
891 
892 	if (map != &kernel_map) {
893 		/*
894 		 * Keep swapout from messing with us during this
895 		 *	critical time.
896 		 */
897 		PHOLD(lp->lwp_proc);
898 
899 		/*
900 		 * Grow the stack if necessary
901 		 */
902 		/* grow_stack returns false only if va falls into
903 		 * a growable stack region and the stack growth
904 		 * fails.  It returns true if va was not within
905 		 * a growable stack region, or if the stack
906 		 * growth succeeded.
907 		 */
908 		if (!grow_stack (lp->lwp_proc, va)) {
909 			rv = KERN_FAILURE;
910 			PRELE(lp->lwp_proc);
911 			goto nogo;
912 		}
913 
914 		/* Fault in the user page: */
915 		rv = vm_fault(map, va, ftype,
916 			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
917 						      : VM_FAULT_NORMAL);
918 
919 		PRELE(lp->lwp_proc);
920 	} else {
921 		/*
922 		 * Don't have to worry about process locking or stacks in the kernel.
923 		 */
924 		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
925 	}
926 
927 	if (rv == KERN_SUCCESS)
928 		return (0);
929 nogo:
930 	if (!usermode) {
931 		if (td->td_gd->gd_intr_nesting_level == 0 &&
932 		    td->td_pcb->pcb_onfault) {
933 			frame->tf_rip = (register_t)td->td_pcb->pcb_onfault;
934 			return (0);
935 		}
936 		trap_fatal(frame, usermode, eva);
937 		return (-1);
938 	}
939 
940 	/*
941 	 * NOTE: on x86_64 we have a tf_addr field in the trapframe, no
942 	 * kludge is needed to pass the fault address to signal handlers.
943 	 */
944 	struct proc *p = td->td_proc;
945 	kprintf("seg-fault accessing address %p rip=%p pid=%d p_comm=%s\n",
946 		(void *)va, (void *)frame->tf_rip, p->p_pid, p->p_comm);
947 	/* Debugger("seg-fault"); */
948 
949 	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
950 }
951 
952 static void
953 trap_fatal(struct trapframe *frame, int usermode, vm_offset_t eva)
954 {
955 	int code, type, ss;
956 	long rsp;
957 
958 	code = frame->tf_xflags;
959 	type = frame->tf_trapno;
960 
961 	if (type <= MAX_TRAP_MSG) {
962 		kprintf("\n\nFatal trap %d: %s while in %s mode\n",
963 			type, trap_msg[type],
964 			(usermode ? "user" : "kernel"));
965 	}
966 #ifdef SMP
967 	/* two separate prints in case of a trap on an unmapped page */
968 	kprintf("mp_lock = %08x; ", mp_lock);
969 	kprintf("cpuid = %d\n", mycpu->gd_cpuid);
970 #endif
971 	if (type == T_PAGEFLT) {
972 		kprintf("fault virtual address	= %p\n", (void *)eva);
973 		kprintf("fault code		= %s %s, %s\n",
974 			usermode ? "user" : "supervisor",
975 			code & PGEX_W ? "write" : "read",
976 			code & PGEX_P ? "protection violation" : "page not present");
977 	}
978 	kprintf("instruction pointer	= 0x%lx:0x%lx\n",
979 	       frame->tf_cs & 0xffff, frame->tf_rip);
980 	if (usermode) {
981 		ss = frame->tf_ss & 0xffff;
982 		rsp = frame->tf_rsp;
983 	} else {
984 		ss = GSEL(GDATA_SEL, SEL_KPL);
985 		rsp = (long)&frame->tf_rsp;
986 	}
987 	kprintf("stack pointer	        = 0x%x:0x%lx\n", ss, rsp);
988 	kprintf("frame pointer	        = 0x%x:0x%lx\n", ss, frame->tf_rbp);
989 	kprintf("processor eflags	= ");
990 	if (frame->tf_rflags & PSL_T)
991 		kprintf("trace trap, ");
992 	if (frame->tf_rflags & PSL_I)
993 		kprintf("interrupt enabled, ");
994 	if (frame->tf_rflags & PSL_NT)
995 		kprintf("nested task, ");
996 	if (frame->tf_rflags & PSL_RF)
997 		kprintf("resume, ");
998 #if 0
999 	if (frame->tf_eflags & PSL_VM)
1000 		kprintf("vm86, ");
1001 #endif
1002 	kprintf("IOPL = %jd\n", (intmax_t)((frame->tf_rflags & PSL_IOPL) >> 12));
1003 	kprintf("current process		= ");
1004 	if (curproc) {
1005 		kprintf("%lu (%s)\n",
1006 		    (u_long)curproc->p_pid, curproc->p_comm ?
1007 		    curproc->p_comm : "");
1008 	} else {
1009 		kprintf("Idle\n");
1010 	}
1011 	kprintf("current thread          = pri %d ", curthread->td_pri);
1012 	if (curthread->td_pri >= TDPRI_CRIT)
1013 		kprintf("(CRIT)");
1014 	kprintf("\n");
1015 #ifdef SMP
1016 /**
1017  *  XXX FIXME:
1018  *	we probably SHOULD have stopped the other CPUs before now!
1019  *	another CPU COULD have been touching cpl at this moment...
1020  */
1021 	kprintf(" <- SMP: XXX");
1022 #endif
1023 	kprintf("\n");
1024 
1025 #ifdef KDB
1026 	if (kdb_trap(&psl))
1027 		return;
1028 #endif
1029 #ifdef DDB
1030 	if ((debugger_on_panic || db_active) && kdb_trap(type, code, frame))
1031 		return;
1032 #endif
1033 	kprintf("trap number		= %d\n", type);
1034 	if (type <= MAX_TRAP_MSG)
1035 		panic("%s", trap_msg[type]);
1036 	else
1037 		panic("unknown/reserved trap");
1038 }
1039 
1040 /*
1041  * Double fault handler. Called when a fault occurs while writing
1042  * a frame for a trap/exception onto the stack. This usually occurs
1043  * when the stack overflows (such is the case with infinite recursion,
1044  * for example).
1045  *
1046  * XXX Note that the current PTD gets replaced by IdlePTD when the
1047  * task switch occurs. This means that the stack that was active at
1048  * the time of the double fault is not available at <kstack> unless
1049  * the machine was idle when the double fault occurred. The downside
1050  * of this is that "trace <ebp>" in ddb won't work.
1051  */
1052 void
1053 dblfault_handler(void)
1054 {
1055 #if JG
1056 	struct mdglobaldata *gd = mdcpu;
1057 #endif
1058 
1059 	kprintf("\nFatal double fault:\n");
1060 #if JG
1061 	kprintf("rip = 0x%lx\n", gd->gd_common_tss.tss_rip);
1062 	kprintf("rsp = 0x%lx\n", gd->gd_common_tss.tss_rsp);
1063 	kprintf("rbp = 0x%lx\n", gd->gd_common_tss.tss_rbp);
1064 #endif
1065 #ifdef SMP
1066 	/* two separate prints in case of a trap on an unmapped page */
1067 	kprintf("mp_lock = %08x; ", mp_lock);
1068 	kprintf("cpuid = %d\n", mycpu->gd_cpuid);
1069 #endif
1070 	panic("double fault");
1071 }
1072 
1073 /*
1074  * Compensate for 386 brain damage (missing URKR).
1075  * This is a little simpler than the pagefault handler in trap() because
1076  * it the page tables have already been faulted in and high addresses
1077  * are thrown out early for other reasons.
1078  */
1079 int
1080 trapwrite(unsigned addr)
1081 {
1082 	struct lwp *lp;
1083 	vm_offset_t va;
1084 	struct vmspace *vm;
1085 	int rv;
1086 
1087 	va = trunc_page((vm_offset_t)addr);
1088 	/*
1089 	 * XXX - MAX is END.  Changed > to >= for temp. fix.
1090 	 */
1091 	if (va >= VM_MAX_USER_ADDRESS)
1092 		return (1);
1093 
1094 	lp = curthread->td_lwp;
1095 	vm = lp->lwp_vmspace;
1096 
1097 	PHOLD(lp->lwp_proc);
1098 
1099 	if (!grow_stack (lp->lwp_proc, va)) {
1100 		PRELE(lp->lwp_proc);
1101 		return (1);
1102 	}
1103 
1104 	/*
1105 	 * fault the data page
1106 	 */
1107 	rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);
1108 
1109 	PRELE(lp->lwp_proc);
1110 
1111 	if (rv != KERN_SUCCESS)
1112 		return 1;
1113 
1114 	return (0);
1115 }
1116 
1117 /*
1118  *	syscall2 -	MP aware system call request C handler
1119  *
1120  *	A system call is essentially treated as a trap except that the
1121  *	MP lock is not held on entry or return.  We are responsible for
1122  *	obtaining the MP lock if necessary and for handling ASTs
1123  *	(e.g. a task switch) prior to return.
1124  *
1125  *	In general, only simple access and manipulation of curproc and
1126  *	the current stack is allowed without having to hold MP lock.
1127  *
1128  *	MPSAFE - note that large sections of this routine are run without
1129  *		 the MP lock.
1130  */
1131 void
1132 syscall2(struct trapframe *frame)
1133 {
1134 	struct thread *td = curthread;
1135 	struct proc *p = td->td_proc;
1136 	struct lwp *lp = td->td_lwp;
1137 	caddr_t params;
1138 	struct sysent *callp;
1139 	register_t orig_tf_rflags;
1140 	int sticks;
1141 	int error;
1142 	int narg;
1143 #ifdef INVARIANTS
1144 	int crit_count = td->td_pri & ~TDPRI_MASK;
1145 #endif
1146 #ifdef SMP
1147 	int have_mplock = 0;
1148 #endif
1149 	register_t *argp;
1150 	u_int code;
1151 	int reg, regcnt;
1152 	union sysunion args;
1153 	register_t *argsdst;
1154 
1155 	mycpu->gd_cnt.v_syscall++;
1156 
1157 	KTR_LOG(kernentry_syscall, lp->lwp_proc->p_pid, lp->lwp_tid,
1158 		frame->tf_eax);
1159 
1160 #ifdef SMP
1161 	KASSERT(td->td_mpcount == 0, ("badmpcount syscall2 from %p", (void *)frame->tf_rip));
1162 	if (syscall_mpsafe == 0)
1163 		MAKEMPSAFE(have_mplock);
1164 #endif
1165 	userenter(td, p);	/* lazy raise our priority */
1166 
1167 	reg = 0;
1168 	regcnt = 6;
1169 	/*
1170 	 * Misc
1171 	 */
1172 	sticks = (int)td->td_sticks;
1173 	orig_tf_rflags = frame->tf_rflags;
1174 
1175 	/*
1176 	 * Virtual kernel intercept - if a VM context managed by a virtual
1177 	 * kernel issues a system call the virtual kernel handles it, not us.
1178 	 * Restore the virtual kernel context and return from its system
1179 	 * call.  The current frame is copied out to the virtual kernel.
1180 	 */
1181 	if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
1182 		vkernel_trap(lp, frame);
1183 		error = EJUSTRETURN;
1184 		goto out;
1185 	}
1186 
1187 	/*
1188 	 * Get the system call parameters and account for time
1189 	 */
1190 	lp->lwp_md.md_regs = frame;
1191 	params = (caddr_t)frame->tf_rsp + sizeof(register_t);
1192 	code = frame->tf_rax;
1193 
1194 	if (p->p_sysent->sv_prepsyscall) {
1195 		(*p->p_sysent->sv_prepsyscall)(
1196 			frame, (int *)(&args.nosys.sysmsg + 1),
1197 			&code, &params);
1198 	} else {
1199 		if (code == SYS_syscall || code == SYS___syscall) {
1200 			code = frame->tf_rdi;
1201 			reg++;
1202 			regcnt--;
1203 		}
1204 	}
1205 
1206 	if (p->p_sysent->sv_mask)
1207 		code &= p->p_sysent->sv_mask;
1208 
1209 	if (code >= p->p_sysent->sv_size)
1210 		callp = &p->p_sysent->sv_table[0];
1211 	else
1212 		callp = &p->p_sysent->sv_table[code];
1213 
1214 	narg = callp->sy_narg & SYF_ARGMASK;
1215 
1216 	/*
1217 	 * On x86_64 we get up to six arguments in registers. The rest are
1218 	 * on the stack. The first six members of 'struct trapframe' happen
1219 	 * to be the registers used to pass arguments, in exactly the right
1220 	 * order.
1221 	 */
1222 	argp = &frame->tf_rdi;
1223 	argp += reg;
1224 	argsdst = (register_t *)(&args.nosys.sysmsg + 1);
1225 	/*
1226 	 * JG can we overflow the space pointed to by 'argsdst'
1227 	 * either with 'bcopy' or with 'copyin'?
1228 	 */
1229 	bcopy(argp, argsdst, sizeof(register_t) * regcnt);
1230 	/*
1231 	 * copyin is MP aware, but the tracing code is not
1232 	 */
1233 	if (narg > regcnt) {
1234 		KASSERT(params != NULL, ("copyin args with no params!"));
1235 		error = copyin(params, &argsdst[regcnt],
1236 			(narg - regcnt) * sizeof(register_t));
1237 		if (error) {
1238 #ifdef KTRACE
1239 			if (KTRPOINT(td, KTR_SYSCALL)) {
1240 				MAKEMPSAFE(have_mplock);
1241 
1242 				ktrsyscall(lp, code, narg,
1243 					(void *)(&args.nosys.sysmsg + 1));
1244 			}
1245 #endif
1246 			goto bad;
1247 		}
1248 	}
1249 
1250 #ifdef KTRACE
1251 	if (KTRPOINT(td, KTR_SYSCALL)) {
1252 		MAKEMPSAFE(have_mplock);
1253 		ktrsyscall(lp, code, narg, (void *)(&args.nosys.sysmsg + 1));
1254 	}
1255 #endif
1256 
1257 	/*
1258 	 * Default return value is 0 (will be copied to %rax).  Double-value
1259 	 * returns use %rax and %rdx.  %rdx is left unchanged for system
1260 	 * calls which return only one result.
1261 	 */
1262 	args.sysmsg_fds[0] = 0;
1263 	args.sysmsg_fds[1] = frame->tf_rdx;
1264 
1265 	/*
1266 	 * The syscall might manipulate the trap frame. If it does it
1267 	 * will probably return EJUSTRETURN.
1268 	 */
1269 	args.sysmsg_frame = frame;
1270 
1271 	STOPEVENT(p, S_SCE, narg);	/* MP aware */
1272 
1273 	/*
1274 	 * NOTE: All system calls run MPSAFE now.  The system call itself
1275 	 *	 is responsible for getting the MP lock.
1276 	 */
1277 	error = (*callp->sy_call)(&args);
1278 
1279 #if 0
1280 	kprintf("system call %d returned %d\n", code, error);
1281 #endif
1282 
1283 out:
1284 	/*
1285 	 * MP SAFE (we may or may not have the MP lock at this point)
1286 	 */
1287 	switch (error) {
1288 	case 0:
1289 		/*
1290 		 * Reinitialize proc pointer `p' as it may be different
1291 		 * if this is a child returning from fork syscall.
1292 		 */
1293 		p = curproc;
1294 		lp = curthread->td_lwp;
1295 		frame->tf_rax = args.sysmsg_fds[0];
1296 		frame->tf_rdx = args.sysmsg_fds[1];
1297 		frame->tf_rflags &= ~PSL_C;
1298 		break;
1299 	case ERESTART:
1300 		/*
1301 		 * Reconstruct pc, we know that 'syscall' is 2 bytes.
1302 		 * We have to do a full context restore so that %r10
1303 		 * (which was holding the value of %rcx) is restored for
1304 		 * the next iteration.
1305 		 */
1306 		frame->tf_rip -= frame->tf_err;
1307 		frame->tf_r10 = frame->tf_rcx;
1308 		break;
1309 	case EJUSTRETURN:
1310 		break;
1311 	case EASYNC:
1312 		panic("Unexpected EASYNC return value (for now)");
1313 	default:
1314 bad:
1315 		if (p->p_sysent->sv_errsize) {
1316 			if (error >= p->p_sysent->sv_errsize)
1317 				error = -1;	/* XXX */
1318 			else
1319 				error = p->p_sysent->sv_errtbl[error];
1320 		}
1321 		frame->tf_rax = error;
1322 		frame->tf_rflags |= PSL_C;
1323 		break;
1324 	}
1325 
1326 	/*
1327 	 * Traced syscall.  trapsignal() is not MP aware.
1328 	 */
1329 	if (orig_tf_rflags & PSL_T) {
1330 		MAKEMPSAFE(have_mplock);
1331 		frame->tf_rflags &= ~PSL_T;
1332 		trapsignal(lp, SIGTRAP, 0);
1333 	}
1334 
1335 	/*
1336 	 * Handle reschedule and other end-of-syscall issues
1337 	 */
1338 	userret(lp, frame, sticks);
1339 
1340 #ifdef KTRACE
1341 	if (KTRPOINT(td, KTR_SYSRET)) {
1342 		MAKEMPSAFE(have_mplock);
1343 		ktrsysret(lp, code, error, args.sysmsg_result);
1344 	}
1345 #endif
1346 
1347 	/*
1348 	 * This works because errno is findable through the
1349 	 * register set.  If we ever support an emulation where this
1350 	 * is not the case, this code will need to be revisited.
1351 	 */
1352 	STOPEVENT(p, S_SCX, code);
1353 
1354 	userexit(lp);
1355 #ifdef SMP
1356 	/*
1357 	 * Release the MP lock if we had to get it
1358 	 */
1359 	KASSERT(td->td_mpcount == have_mplock,
1360 		("badmpcount syscall2/end from %p", (void *)frame->tf_rip));
1361 	if (have_mplock)
1362 		rel_mplock();
1363 #endif
1364 	KTR_LOG(kernentry_syscall_ret, lp->lwp_proc->p_pid, lp->lwp_tid, error);
1365 #ifdef INVARIANTS
1366 	KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK),
1367 		("syscall: critical section count mismatch! %d/%d",
1368 		crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT));
1369 #endif
1370 }
1371 
1372 void
1373 fork_return(struct lwp *lp, struct trapframe *frame)
1374 {
1375 	frame->tf_rax = 0;		/* Child returns zero */
1376 	frame->tf_rflags &= ~PSL_C;	/* success */
1377 	frame->tf_rdx = 1;
1378 
1379 	generic_lwp_return(lp, frame);
1380 	KTR_LOG(kernentry_fork_ret, lp->lwp_proc->p_pid, lp->lwp_tid);
1381 }
1382 
1383 /*
1384  * Simplified back end of syscall(), used when returning from fork()
1385  * or lwp_create() directly into user mode.  MP lock is held on entry and
1386  * should be released on return.  This code will return back into the fork
1387  * trampoline code which then runs doreti.
1388  */
1389 void
1390 generic_lwp_return(struct lwp *lp, struct trapframe *frame)
1391 {
1392 	struct proc *p = lp->lwp_proc;
1393 
1394 	/*
1395 	 * Newly forked processes are given a kernel priority.  We have to
1396 	 * adjust the priority to a normal user priority and fake entry
1397 	 * into the kernel (call userenter()) to install a passive release
1398 	 * function just in case userret() decides to stop the process.  This
1399 	 * can occur when ^Z races a fork.  If we do not install the passive
1400 	 * release function the current process designation will not be
1401 	 * released when the thread goes to sleep.
1402 	 */
1403 	lwkt_setpri_self(TDPRI_USER_NORM);
1404 	userenter(lp->lwp_thread, p);
1405 	userret(lp, frame, 0);
1406 #ifdef KTRACE
1407 	if (KTRPOINT(lp->lwp_thread, KTR_SYSRET))
1408 		ktrsysret(lp, SYS_fork, 0, 0);
1409 #endif
1410 	p->p_flag |= P_PASSIVE_ACQ;
1411 	userexit(lp);
1412 	p->p_flag &= ~P_PASSIVE_ACQ;
1413 #ifdef SMP
1414 	KKASSERT(lp->lwp_thread->td_mpcount == 1);
1415 	rel_mplock();
1416 #endif
1417 }
1418 
1419 /*
1420  * doreti has turned into this.  The frame is directly on the stack.  We
1421  * pull everything else we need (fpu and tls context) from the current
1422  * thread.
1423  *
1424  * Note on fpu interactions: In a virtual kernel, the fpu context for
1425  * an emulated user mode process is not shared with the virtual kernel's
1426  * fpu context, so we only have to 'stack' fpu contexts within the virtual
1427  * kernel itself, and not even then since the signal() contexts that we care
1428  * about save and restore the FPU state (I think anyhow).
1429  *
1430  * vmspace_ctl() returns an error only if it had problems instaling the
1431  * context we supplied or problems copying data to/from our VM space.
1432  */
1433 void
1434 go_user(struct intrframe *frame)
1435 {
1436 	struct trapframe *tf = (void *)&frame->if_rdi;
1437 	int r;
1438 
1439 	/*
1440 	 * Interrupts may be disabled on entry, make sure all signals
1441 	 * can be received before beginning our loop.
1442 	 */
1443 	sigsetmask(0);
1444 
1445 	/*
1446 	 * Switch to the current simulated user process, then call
1447 	 * user_trap() when we break out of it (usually due to a signal).
1448 	 */
1449 	for (;;) {
1450 		/*
1451 		 * Tell the real kernel whether it is ok to use the FP
1452 		 * unit or not.
1453 		 */
1454 		if (mdcpu->gd_npxthread == curthread) {
1455 			tf->tf_xflags &= ~PGEX_FPFAULT;
1456 		} else {
1457 			tf->tf_xflags |= PGEX_FPFAULT;
1458 		}
1459 
1460 		/*
1461 		 * Run emulated user process context.  This call interlocks
1462 		 * with new mailbox signals.
1463 		 *
1464 		 * Set PGEX_U unconditionally, indicating a user frame (the
1465 		 * bit is normally set only by T_PAGEFLT).
1466 		 */
1467 		r = vmspace_ctl(&curproc->p_vmspace->vm_pmap, VMSPACE_CTL_RUN,
1468 				tf, &curthread->td_savevext);
1469 		frame->if_xflags |= PGEX_U;
1470 #if 0
1471 		kprintf("GO USER %d trap %ld EVA %08lx RIP %08lx RSP %08lx XFLAGS %02lx/%02lx\n",
1472 			r, tf->tf_trapno, tf->tf_addr, tf->tf_rip, tf->tf_rsp,
1473 			tf->tf_xflags, frame->if_xflags);
1474 #endif
1475 		if (r < 0) {
1476 			if (errno != EINTR)
1477 				panic("vmspace_ctl failed error %d", errno);
1478 		} else {
1479 			if (tf->tf_trapno) {
1480 				user_trap(tf);
1481 			}
1482 		}
1483 		if (mycpu->gd_reqflags & RQF_AST_MASK) {
1484 			tf->tf_trapno = T_ASTFLT;
1485 			user_trap(tf);
1486 		}
1487 		tf->tf_trapno = 0;
1488 	}
1489 }
1490 
1491 /*
1492  * If PGEX_FPFAULT is set then set FP_VIRTFP in the PCB to force a T_DNA
1493  * fault (which is then passed back to the virtual kernel) if an attempt is
1494  * made to use the FP unit.
1495  *
1496  * XXX this is a fairly big hack.
1497  */
1498 void
1499 set_vkernel_fp(struct trapframe *frame)
1500 {
1501 	struct thread *td = curthread;
1502 
1503 	if (frame->tf_xflags & PGEX_FPFAULT) {
1504 		td->td_pcb->pcb_flags |= FP_VIRTFP;
1505 		if (mdcpu->gd_npxthread == td)
1506 			npxexit();
1507 	} else {
1508 		td->td_pcb->pcb_flags &= ~FP_VIRTFP;
1509 	}
1510 }
1511 
1512 /*
1513  * Called from vkernel_trap() to fixup the vkernel's syscall
1514  * frame for vmspace_ctl() return.
1515  */
1516 void
1517 cpu_vkernel_trap(struct trapframe *frame, int error)
1518 {
1519 	frame->tf_rax = error;
1520 	if (error)
1521 		frame->tf_rflags |= PSL_C;
1522 	else
1523 		frame->tf_rflags &= ~PSL_C;
1524 }
1525