1 /*- 2 * Copyright (c) 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * Copyright (C) 1994, David Greenman 5 * Copyright (c) 2008 The DragonFly Project. 6 * Copyright (c) 2008 Jordan Gordeev. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * the University of Utah, and William Jolitz. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 40 * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $ 41 */ 42 43 /* 44 * x86_64 Trap and System call handling 45 */ 46 47 #include "use_isa.h" 48 49 #include "opt_ddb.h" 50 #include "opt_ktrace.h" 51 52 #include <machine/frame.h> 53 #include <sys/param.h> 54 #include <sys/systm.h> 55 #include <sys/kernel.h> 56 #include <sys/kerneldump.h> 57 #include <sys/proc.h> 58 #include <sys/pioctl.h> 59 #include <sys/types.h> 60 #include <sys/signal2.h> 61 #include <sys/syscall.h> 62 #include <sys/sysctl.h> 63 #include <sys/sysent.h> 64 #include <sys/systm.h> 65 #ifdef KTRACE 66 #include <sys/ktrace.h> 67 #endif 68 #include <sys/ktr.h> 69 #include <sys/sysmsg.h> 70 #include <sys/sysproto.h> 71 #include <sys/sysunion.h> 72 73 #include <vm/pmap.h> 74 #include <vm/vm.h> 75 #include <vm/vm_extern.h> 76 #include <vm/vm_kern.h> 77 #include <vm/vm_param.h> 78 #include <machine/cpu.h> 79 #include <machine/pcb.h> 80 #include <machine/smp.h> 81 #include <machine/thread.h> 82 #include <machine/clock.h> 83 #include <machine/vmparam.h> 84 #include <machine/md_var.h> 85 #include <machine_base/isa/isa_intr.h> 86 #include <machine_base/apic/lapic.h> 87 88 #include <ddb/ddb.h> 89 90 #include <sys/thread2.h> 91 #include <sys/mplock2.h> 92 93 #ifdef SMP 94 95 #define MAKEMPSAFE(have_mplock) \ 96 if (have_mplock == 0) { \ 97 get_mplock(); \ 98 have_mplock = 1; \ 99 } 100 101 #else 102 103 #define MAKEMPSAFE(have_mplock) 104 105 #endif 106 107 extern void trap(struct trapframe *frame); 108 109 static int trap_pfault(struct trapframe *, int); 110 static void trap_fatal(struct trapframe *, vm_offset_t); 111 void dblfault_handler(struct trapframe *frame); 112 113 #define MAX_TRAP_MSG 30 114 static char *trap_msg[] = { 115 "", /* 0 unused */ 116 "privileged instruction fault", /* 1 T_PRIVINFLT */ 117 "", /* 2 unused */ 118 "breakpoint instruction fault", /* 3 T_BPTFLT */ 119 "", /* 4 unused */ 120 "", /* 5 unused */ 121 "arithmetic trap", /* 6 T_ARITHTRAP */ 122 "system forced exception", /* 7 T_ASTFLT */ 123 "", /* 8 unused */ 124 "general protection fault", /* 9 T_PROTFLT */ 125 "trace trap", /* 10 T_TRCTRAP */ 126 "", /* 11 unused */ 127 "page fault", /* 12 T_PAGEFLT */ 128 "", /* 13 unused */ 129 "alignment fault", /* 14 T_ALIGNFLT */ 130 "", /* 15 unused */ 131 "", /* 16 unused */ 132 "", /* 17 unused */ 133 "integer divide fault", /* 18 T_DIVIDE */ 134 "non-maskable interrupt trap", /* 19 T_NMI */ 135 "overflow trap", /* 20 T_OFLOW */ 136 "FPU bounds check fault", /* 21 T_BOUND */ 137 "FPU device not available", /* 22 T_DNA */ 138 "double fault", /* 23 T_DOUBLEFLT */ 139 "FPU operand fetch fault", /* 24 T_FPOPFLT */ 140 "invalid TSS fault", /* 25 T_TSSFLT */ 141 "segment not present fault", /* 26 T_SEGNPFLT */ 142 "stack fault", /* 27 T_STKFLT */ 143 "machine check trap", /* 28 T_MCHK */ 144 "SIMD floating-point exception", /* 29 T_XMMFLT */ 145 "reserved (unknown) fault", /* 30 T_RESERVED */ 146 }; 147 148 #ifdef DDB 149 static int ddb_on_nmi = 1; 150 SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, 151 &ddb_on_nmi, 0, "Go to DDB on NMI"); 152 static int ddb_on_seg_fault = 0; 153 SYSCTL_INT(_machdep, OID_AUTO, ddb_on_seg_fault, CTLFLAG_RW, 154 &ddb_on_seg_fault, 0, "Go to DDB on user seg-fault"); 155 static int freeze_on_seg_fault = 0; 156 SYSCTL_INT(_machdep, OID_AUTO, freeze_on_seg_fault, CTLFLAG_RW, 157 &freeze_on_seg_fault, 0, "Go to DDB on user seg-fault"); 158 #endif 159 static int panic_on_nmi = 1; 160 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, 161 &panic_on_nmi, 0, "Panic on NMI"); 162 static int fast_release; 163 SYSCTL_INT(_machdep, OID_AUTO, fast_release, CTLFLAG_RW, 164 &fast_release, 0, "Passive Release was optimal"); 165 static int slow_release; 166 SYSCTL_INT(_machdep, OID_AUTO, slow_release, CTLFLAG_RW, 167 &slow_release, 0, "Passive Release was nonoptimal"); 168 169 /* 170 * System call debugging records the worst-case system call 171 * overhead (inclusive of blocking), but may be inaccurate. 172 */ 173 /*#define SYSCALL_DEBUG*/ 174 #ifdef SYSCALL_DEBUG 175 uint64_t SysCallsWorstCase[SYS_MAXSYSCALL]; 176 #endif 177 178 /* 179 * Passively intercepts the thread switch function to increase 180 * the thread priority from a user priority to a kernel priority, reducing 181 * syscall and trap overhead for the case where no switch occurs. 182 * 183 * Synchronizes td_ucred with p_ucred. This is used by system calls, 184 * signal handling, faults, AST traps, and anything else that enters the 185 * kernel from userland and provides the kernel with a stable read-only 186 * copy of the process ucred. 187 */ 188 static __inline void 189 userenter(struct thread *curtd, struct proc *curp) 190 { 191 struct ucred *ocred; 192 struct ucred *ncred; 193 194 curtd->td_release = lwkt_passive_release; 195 196 if (curtd->td_ucred != curp->p_ucred) { 197 ncred = crhold(curp->p_ucred); 198 ocred = curtd->td_ucred; 199 curtd->td_ucred = ncred; 200 if (ocred) 201 crfree(ocred); 202 } 203 204 /* 205 * Debugging, remove top two user stack pages to catch kernel faults 206 */ 207 if (freeze_on_seg_fault > 1 && curtd->td_lwp) { 208 pmap_remove(vmspace_pmap(curtd->td_lwp->lwp_vmspace), 209 0x00007FFFFFFFD000LU, 210 0x0000800000000000LU); 211 } 212 } 213 214 /* 215 * Handle signals, upcalls, profiling, and other AST's and/or tasks that 216 * must be completed before we can return to or try to return to userland. 217 * 218 * Note that td_sticks is a 64 bit quantity, but there's no point doing 64 219 * arithmatic on the delta calculation so the absolute tick values are 220 * truncated to an integer. 221 */ 222 static void 223 userret(struct lwp *lp, struct trapframe *frame, int sticks) 224 { 225 struct proc *p = lp->lwp_proc; 226 int sig; 227 228 /* 229 * Charge system time if profiling. Note: times are in microseconds. 230 * This may do a copyout and block, so do it first even though it 231 * means some system time will be charged as user time. 232 */ 233 if (p->p_flags & P_PROFIL) { 234 addupc_task(p, frame->tf_rip, 235 (u_int)((int)lp->lwp_thread->td_sticks - sticks)); 236 } 237 238 recheck: 239 /* 240 * If the jungle wants us dead, so be it. 241 */ 242 if (lp->lwp_mpflags & LWP_MP_WEXIT) { 243 lwkt_gettoken(&p->p_token); 244 lwp_exit(0); 245 lwkt_reltoken(&p->p_token); /* NOT REACHED */ 246 } 247 248 /* 249 * Block here if we are in a stopped state. 250 */ 251 if (p->p_stat == SSTOP || dump_stop_usertds) { 252 lwkt_gettoken(&p->p_token); 253 tstop(); 254 lwkt_reltoken(&p->p_token); 255 goto recheck; 256 } 257 258 /* 259 * Post any pending upcalls. If running a virtual kernel be sure 260 * to restore the virtual kernel's vmspace before posting the upcall. 261 */ 262 if (p->p_flags & (P_SIGVTALRM | P_SIGPROF | P_UPCALLPEND)) { 263 lwkt_gettoken(&p->p_token); 264 if (p->p_flags & P_SIGVTALRM) { 265 p->p_flags &= ~P_SIGVTALRM; 266 ksignal(p, SIGVTALRM); 267 } 268 if (p->p_flags & P_SIGPROF) { 269 p->p_flags &= ~P_SIGPROF; 270 ksignal(p, SIGPROF); 271 } 272 if (p->p_flags & P_UPCALLPEND) { 273 p->p_flags &= ~P_UPCALLPEND; 274 postupcall(lp); 275 } 276 lwkt_reltoken(&p->p_token); 277 goto recheck; 278 } 279 280 /* 281 * Post any pending signals. If running a virtual kernel be sure 282 * to restore the virtual kernel's vmspace before posting the signal. 283 * 284 * WARNING! postsig() can exit and not return. 285 */ 286 if ((sig = CURSIG_TRACE(lp)) != 0) { 287 lwkt_gettoken(&p->p_token); 288 postsig(sig); 289 lwkt_reltoken(&p->p_token); 290 goto recheck; 291 } 292 293 /* 294 * block here if we are swapped out, but still process signals 295 * (such as SIGKILL). proc0 (the swapin scheduler) is already 296 * aware of our situation, we do not have to wake it up. 297 */ 298 if (p->p_flags & P_SWAPPEDOUT) { 299 lwkt_gettoken(&p->p_token); 300 get_mplock(); 301 p->p_flags |= P_SWAPWAIT; 302 swapin_request(); 303 if (p->p_flags & P_SWAPWAIT) 304 tsleep(p, PCATCH, "SWOUT", 0); 305 p->p_flags &= ~P_SWAPWAIT; 306 rel_mplock(); 307 lwkt_reltoken(&p->p_token); 308 goto recheck; 309 } 310 311 /* 312 * Make sure postsig() handled request to restore old signal mask after 313 * running signal handler. 314 */ 315 KKASSERT((lp->lwp_flags & LWP_OLDMASK) == 0); 316 } 317 318 /* 319 * Cleanup from userenter and any passive release that might have occured. 320 * We must reclaim the current-process designation before we can return 321 * to usermode. We also handle both LWKT and USER reschedule requests. 322 */ 323 static __inline void 324 userexit(struct lwp *lp) 325 { 326 struct thread *td = lp->lwp_thread; 327 /* globaldata_t gd = td->td_gd; */ 328 329 /* 330 * Handle stop requests at kernel priority. Any requests queued 331 * after this loop will generate another AST. 332 */ 333 while (lp->lwp_proc->p_stat == SSTOP) { 334 lwkt_gettoken(&lp->lwp_proc->p_token); 335 tstop(); 336 lwkt_reltoken(&lp->lwp_proc->p_token); 337 } 338 339 /* 340 * Reduce our priority in preparation for a return to userland. If 341 * our passive release function was still in place, our priority was 342 * never raised and does not need to be reduced. 343 */ 344 lwkt_passive_recover(td); 345 346 /* 347 * Become the current user scheduled process if we aren't already, 348 * and deal with reschedule requests and other factors. 349 */ 350 lp->lwp_proc->p_usched->acquire_curproc(lp); 351 /* WARNING: we may have migrated cpu's */ 352 /* gd = td->td_gd; */ 353 } 354 355 #if !defined(KTR_KERNENTRY) 356 #define KTR_KERNENTRY KTR_ALL 357 #endif 358 KTR_INFO_MASTER(kernentry); 359 KTR_INFO(KTR_KERNENTRY, kernentry, trap, 0, 360 "TRAP(pid %d, tid %d, trapno %ld, eva %lu)", 361 pid_t pid, lwpid_t tid, register_t trapno, vm_offset_t eva); 362 KTR_INFO(KTR_KERNENTRY, kernentry, trap_ret, 0, "TRAP_RET(pid %d, tid %d)", 363 pid_t pid, lwpid_t tid); 364 KTR_INFO(KTR_KERNENTRY, kernentry, syscall, 0, "SYSC(pid %d, tid %d, nr %ld)", 365 pid_t pid, lwpid_t tid, register_t trapno); 366 KTR_INFO(KTR_KERNENTRY, kernentry, syscall_ret, 0, "SYSRET(pid %d, tid %d, err %d)", 367 pid_t pid, lwpid_t tid, int err); 368 KTR_INFO(KTR_KERNENTRY, kernentry, fork_ret, 0, "FORKRET(pid %d, tid %d)", 369 pid_t pid, lwpid_t tid); 370 371 /* 372 * Exception, fault, and trap interface to the kernel. 373 * This common code is called from assembly language IDT gate entry 374 * routines that prepare a suitable stack frame, and restore this 375 * frame after the exception has been processed. 376 * 377 * This function is also called from doreti in an interlock to handle ASTs. 378 * For example: hardwareint->INTROUTINE->(set ast)->doreti->trap 379 * 380 * NOTE! We have to retrieve the fault address prior to obtaining the 381 * MP lock because get_mplock() may switch out. YYY cr2 really ought 382 * to be retrieved by the assembly code, not here. 383 * 384 * XXX gd_trap_nesting_level currently prevents lwkt_switch() from panicing 385 * if an attempt is made to switch from a fast interrupt or IPI. This is 386 * necessary to properly take fatal kernel traps on SMP machines if 387 * get_mplock() has to block. 388 */ 389 390 void 391 trap(struct trapframe *frame) 392 { 393 struct globaldata *gd = mycpu; 394 struct thread *td = gd->gd_curthread; 395 struct lwp *lp = td->td_lwp; 396 struct proc *p; 397 int sticks = 0; 398 int i = 0, ucode = 0, type, code; 399 #ifdef SMP 400 int have_mplock = 0; 401 #endif 402 #ifdef INVARIANTS 403 int crit_count = td->td_critcount; 404 lwkt_tokref_t curstop = td->td_toks_stop; 405 #endif 406 vm_offset_t eva; 407 408 p = td->td_proc; 409 clear_quickret(); 410 411 #ifdef DDB 412 /* 413 * We need to allow T_DNA faults when the debugger is active since 414 * some dumping paths do large bcopy() which use the floating 415 * point registers for faster copying. 416 */ 417 if (db_active && frame->tf_trapno != T_DNA) { 418 eva = (frame->tf_trapno == T_PAGEFLT ? frame->tf_addr : 0); 419 ++gd->gd_trap_nesting_level; 420 MAKEMPSAFE(have_mplock); 421 trap_fatal(frame, eva); 422 --gd->gd_trap_nesting_level; 423 goto out2; 424 } 425 #endif 426 427 eva = 0; 428 429 if ((frame->tf_rflags & PSL_I) == 0) { 430 /* 431 * Buggy application or kernel code has disabled interrupts 432 * and then trapped. Enabling interrupts now is wrong, but 433 * it is better than running with interrupts disabled until 434 * they are accidentally enabled later. 435 */ 436 type = frame->tf_trapno; 437 if (ISPL(frame->tf_cs) == SEL_UPL) { 438 MAKEMPSAFE(have_mplock); 439 /* JG curproc can be NULL */ 440 kprintf( 441 "pid %ld (%s): trap %d with interrupts disabled\n", 442 (long)curproc->p_pid, curproc->p_comm, type); 443 } else if (type != T_NMI && type != T_BPTFLT && 444 type != T_TRCTRAP) { 445 /* 446 * XXX not quite right, since this may be for a 447 * multiple fault in user mode. 448 */ 449 MAKEMPSAFE(have_mplock); 450 kprintf("kernel trap %d with interrupts disabled\n", 451 type); 452 } 453 cpu_enable_intr(); 454 } 455 456 type = frame->tf_trapno; 457 code = frame->tf_err; 458 459 if (ISPL(frame->tf_cs) == SEL_UPL) { 460 /* user trap */ 461 462 KTR_LOG(kernentry_trap, p->p_pid, lp->lwp_tid, 463 frame->tf_trapno, eva); 464 465 userenter(td, p); 466 467 sticks = (int)td->td_sticks; 468 KASSERT(lp->lwp_md.md_regs == frame, 469 ("Frame mismatch %p %p", lp->lwp_md.md_regs, frame)); 470 471 switch (type) { 472 case T_PRIVINFLT: /* privileged instruction fault */ 473 i = SIGILL; 474 ucode = ILL_PRVOPC; 475 break; 476 477 case T_BPTFLT: /* bpt instruction fault */ 478 case T_TRCTRAP: /* trace trap */ 479 frame->tf_rflags &= ~PSL_T; 480 i = SIGTRAP; 481 ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT); 482 break; 483 484 case T_ARITHTRAP: /* arithmetic trap */ 485 ucode = code; 486 i = SIGFPE; 487 break; 488 489 case T_ASTFLT: /* Allow process switch */ 490 mycpu->gd_cnt.v_soft++; 491 if (mycpu->gd_reqflags & RQF_AST_OWEUPC) { 492 atomic_clear_int(&mycpu->gd_reqflags, 493 RQF_AST_OWEUPC); 494 addupc_task(p, p->p_prof.pr_addr, 495 p->p_prof.pr_ticks); 496 } 497 goto out; 498 499 case T_PROTFLT: /* general protection fault */ 500 i = SIGBUS; 501 ucode = BUS_OBJERR; 502 break; 503 case T_STKFLT: /* stack fault */ 504 case T_SEGNPFLT: /* segment not present fault */ 505 i = SIGBUS; 506 ucode = BUS_ADRERR; 507 break; 508 case T_TSSFLT: /* invalid TSS fault */ 509 case T_DOUBLEFLT: /* double fault */ 510 default: 511 i = SIGBUS; 512 ucode = BUS_OBJERR; 513 break; 514 515 case T_PAGEFLT: /* page fault */ 516 i = trap_pfault(frame, TRUE); 517 if (frame->tf_rip == 0) { 518 kprintf("T_PAGEFLT: Warning %%rip == 0!\n"); 519 while (freeze_on_seg_fault) { 520 tsleep(p, 0, "freeze", hz * 20); 521 } 522 } 523 if (i == -1 || i == 0) 524 goto out; 525 526 527 if (i == SIGSEGV) 528 ucode = SEGV_MAPERR; 529 else { 530 i = SIGSEGV; 531 ucode = SEGV_ACCERR; 532 } 533 break; 534 535 case T_DIVIDE: /* integer divide fault */ 536 ucode = FPE_INTDIV; 537 i = SIGFPE; 538 break; 539 540 #if NISA > 0 541 case T_NMI: 542 MAKEMPSAFE(have_mplock); 543 /* machine/parity/power fail/"kitchen sink" faults */ 544 if (isa_nmi(code) == 0) { 545 #ifdef DDB 546 /* 547 * NMI can be hooked up to a pushbutton 548 * for debugging. 549 */ 550 if (ddb_on_nmi) { 551 kprintf ("NMI ... going to debugger\n"); 552 kdb_trap(type, 0, frame); 553 } 554 #endif /* DDB */ 555 goto out2; 556 } else if (panic_on_nmi) 557 panic("NMI indicates hardware failure"); 558 break; 559 #endif /* NISA > 0 */ 560 561 case T_OFLOW: /* integer overflow fault */ 562 ucode = FPE_INTOVF; 563 i = SIGFPE; 564 break; 565 566 case T_BOUND: /* bounds check fault */ 567 ucode = FPE_FLTSUB; 568 i = SIGFPE; 569 break; 570 571 case T_DNA: 572 /* 573 * Virtual kernel intercept - pass the DNA exception 574 * to the virtual kernel if it asked to handle it. 575 * This occurs when the virtual kernel is holding 576 * onto the FP context for a different emulated 577 * process then the one currently running. 578 * 579 * We must still call npxdna() since we may have 580 * saved FP state that the virtual kernel needs 581 * to hand over to a different emulated process. 582 */ 583 if (lp->lwp_vkernel && lp->lwp_vkernel->ve && 584 (td->td_pcb->pcb_flags & FP_VIRTFP) 585 ) { 586 npxdna(); 587 break; 588 } 589 590 /* 591 * The kernel may have switched out the FP unit's 592 * state, causing the user process to take a fault 593 * when it tries to use the FP unit. Restore the 594 * state here 595 */ 596 if (npxdna()) 597 goto out; 598 i = SIGFPE; 599 ucode = FPE_FPU_NP_TRAP; 600 break; 601 602 case T_FPOPFLT: /* FPU operand fetch fault */ 603 ucode = ILL_COPROC; 604 i = SIGILL; 605 break; 606 607 case T_XMMFLT: /* SIMD floating-point exception */ 608 ucode = 0; /* XXX */ 609 i = SIGFPE; 610 break; 611 } 612 } else { 613 /* kernel trap */ 614 615 switch (type) { 616 case T_PAGEFLT: /* page fault */ 617 trap_pfault(frame, FALSE); 618 goto out2; 619 620 case T_DNA: 621 /* 622 * The kernel is apparently using fpu for copying. 623 * XXX this should be fatal unless the kernel has 624 * registered such use. 625 */ 626 if (npxdna()) 627 goto out2; 628 break; 629 630 case T_STKFLT: /* stack fault */ 631 break; 632 633 case T_PROTFLT: /* general protection fault */ 634 case T_SEGNPFLT: /* segment not present fault */ 635 /* 636 * Invalid segment selectors and out of bounds 637 * %rip's and %rsp's can be set up in user mode. 638 * This causes a fault in kernel mode when the 639 * kernel tries to return to user mode. We want 640 * to get this fault so that we can fix the 641 * problem here and not have to check all the 642 * selectors and pointers when the user changes 643 * them. 644 */ 645 if (mycpu->gd_intr_nesting_level == 0) { 646 if (td->td_pcb->pcb_onfault) { 647 frame->tf_rip = (register_t) 648 td->td_pcb->pcb_onfault; 649 goto out2; 650 } 651 if (frame->tf_rip == (long)doreti_iret) { 652 frame->tf_rip = (long)doreti_iret_fault; 653 goto out2; 654 } 655 } 656 break; 657 658 case T_TSSFLT: 659 /* 660 * PSL_NT can be set in user mode and isn't cleared 661 * automatically when the kernel is entered. This 662 * causes a TSS fault when the kernel attempts to 663 * `iret' because the TSS link is uninitialized. We 664 * want to get this fault so that we can fix the 665 * problem here and not every time the kernel is 666 * entered. 667 */ 668 if (frame->tf_rflags & PSL_NT) { 669 frame->tf_rflags &= ~PSL_NT; 670 goto out2; 671 } 672 break; 673 674 case T_TRCTRAP: /* trace trap */ 675 #if 0 676 if (frame->tf_rip == (int)IDTVEC(syscall)) { 677 /* 678 * We've just entered system mode via the 679 * syscall lcall. Continue single stepping 680 * silently until the syscall handler has 681 * saved the flags. 682 */ 683 goto out2; 684 } 685 if (frame->tf_rip == (int)IDTVEC(syscall) + 1) { 686 /* 687 * The syscall handler has now saved the 688 * flags. Stop single stepping it. 689 */ 690 frame->tf_rflags &= ~PSL_T; 691 goto out2; 692 } 693 #endif 694 695 /* 696 * Ignore debug register trace traps due to 697 * accesses in the user's address space, which 698 * can happen under several conditions such as 699 * if a user sets a watchpoint on a buffer and 700 * then passes that buffer to a system call. 701 * We still want to get TRCTRAPS for addresses 702 * in kernel space because that is useful when 703 * debugging the kernel. 704 */ 705 #if JG 706 if (user_dbreg_trap()) { 707 /* 708 * Reset breakpoint bits because the 709 * processor doesn't 710 */ 711 /* XXX check upper bits here */ 712 load_dr6(rdr6() & 0xfffffff0); 713 goto out2; 714 } 715 #endif 716 /* 717 * FALLTHROUGH (TRCTRAP kernel mode, kernel address) 718 */ 719 case T_BPTFLT: 720 /* 721 * If DDB is enabled, let it handle the debugger trap. 722 * Otherwise, debugger traps "can't happen". 723 */ 724 ucode = TRAP_BRKPT; 725 #ifdef DDB 726 MAKEMPSAFE(have_mplock); 727 if (kdb_trap(type, 0, frame)) 728 goto out2; 729 #endif 730 break; 731 732 #if NISA > 0 733 case T_NMI: 734 MAKEMPSAFE(have_mplock); 735 /* machine/parity/power fail/"kitchen sink" faults */ 736 if (isa_nmi(code) == 0) { 737 #ifdef DDB 738 /* 739 * NMI can be hooked up to a pushbutton 740 * for debugging. 741 */ 742 if (ddb_on_nmi) { 743 kprintf ("NMI ... going to debugger\n"); 744 kdb_trap(type, 0, frame); 745 } 746 #endif /* DDB */ 747 goto out2; 748 } else if (panic_on_nmi == 0) 749 goto out2; 750 /* FALL THROUGH */ 751 #endif /* NISA > 0 */ 752 } 753 MAKEMPSAFE(have_mplock); 754 trap_fatal(frame, 0); 755 goto out2; 756 } 757 758 /* 759 * Virtual kernel intercept - if the fault is directly related to a 760 * VM context managed by a virtual kernel then let the virtual kernel 761 * handle it. 762 */ 763 if (lp->lwp_vkernel && lp->lwp_vkernel->ve) { 764 vkernel_trap(lp, frame); 765 goto out; 766 } 767 768 /* Translate fault for emulators (e.g. Linux) */ 769 if (*p->p_sysent->sv_transtrap) 770 i = (*p->p_sysent->sv_transtrap)(i, type); 771 772 MAKEMPSAFE(have_mplock); 773 trapsignal(lp, i, ucode); 774 775 #ifdef DEBUG 776 if (type <= MAX_TRAP_MSG) { 777 uprintf("fatal process exception: %s", 778 trap_msg[type]); 779 if ((type == T_PAGEFLT) || (type == T_PROTFLT)) 780 uprintf(", fault VA = 0x%lx", frame->tf_addr); 781 uprintf("\n"); 782 } 783 #endif 784 785 out: 786 userret(lp, frame, sticks); 787 userexit(lp); 788 out2: ; 789 #ifdef SMP 790 if (have_mplock) 791 rel_mplock(); 792 #endif 793 if (p != NULL && lp != NULL) 794 KTR_LOG(kernentry_trap_ret, p->p_pid, lp->lwp_tid); 795 #ifdef INVARIANTS 796 KASSERT(crit_count == td->td_critcount, 797 ("trap: critical section count mismatch! %d/%d", 798 crit_count, td->td_pri)); 799 KASSERT(curstop == td->td_toks_stop, 800 ("trap: extra tokens held after trap! %ld/%ld", 801 curstop - &td->td_toks_base, 802 td->td_toks_stop - &td->td_toks_base)); 803 #endif 804 } 805 806 static int 807 trap_pfault(struct trapframe *frame, int usermode) 808 { 809 vm_offset_t va; 810 struct vmspace *vm = NULL; 811 vm_map_t map; 812 int rv = 0; 813 int fault_flags; 814 vm_prot_t ftype; 815 thread_t td = curthread; 816 struct lwp *lp = td->td_lwp; 817 struct proc *p; 818 819 va = trunc_page(frame->tf_addr); 820 if (va >= VM_MIN_KERNEL_ADDRESS) { 821 /* 822 * Don't allow user-mode faults in kernel address space. 823 */ 824 if (usermode) { 825 fault_flags = -1; 826 ftype = -1; 827 goto nogo; 828 } 829 830 map = &kernel_map; 831 } else { 832 /* 833 * This is a fault on non-kernel virtual memory. 834 * vm is initialized above to NULL. If curproc is NULL 835 * or curproc->p_vmspace is NULL the fault is fatal. 836 */ 837 if (lp != NULL) 838 vm = lp->lwp_vmspace; 839 840 if (vm == NULL) { 841 fault_flags = -1; 842 ftype = -1; 843 goto nogo; 844 } 845 846 /* 847 * Debugging, try to catch kernel faults on the user address space when not inside 848 * on onfault (e.g. copyin/copyout) routine. 849 */ 850 if (usermode == 0 && (td->td_pcb == NULL || td->td_pcb->pcb_onfault == NULL)) { 851 if (freeze_on_seg_fault) { 852 kprintf("trap_pfault: user address fault from kernel mode " 853 "%016lx\n", (long)frame->tf_addr); 854 while (freeze_on_seg_fault) { 855 tsleep(&freeze_on_seg_fault, 0, "frzseg", hz * 20); 856 } 857 } 858 } 859 map = &vm->vm_map; 860 } 861 862 /* 863 * PGEX_I is defined only if the execute disable bit capability is 864 * supported and enabled. 865 */ 866 if (frame->tf_err & PGEX_W) 867 ftype = VM_PROT_WRITE; 868 #if JG 869 else if ((frame->tf_err & PGEX_I) && pg_nx != 0) 870 ftype = VM_PROT_EXECUTE; 871 #endif 872 else 873 ftype = VM_PROT_READ; 874 875 if (map != &kernel_map) { 876 /* 877 * Keep swapout from messing with us during this 878 * critical time. 879 */ 880 PHOLD(lp->lwp_proc); 881 882 /* 883 * Issue fault 884 */ 885 fault_flags = 0; 886 if (usermode) 887 fault_flags |= VM_FAULT_BURST; 888 if (ftype & VM_PROT_WRITE) 889 fault_flags |= VM_FAULT_DIRTY; 890 else 891 fault_flags |= VM_FAULT_NORMAL; 892 rv = vm_fault(map, va, ftype, fault_flags); 893 894 PRELE(lp->lwp_proc); 895 } else { 896 /* 897 * Don't have to worry about process locking or stacks in the 898 * kernel. 899 */ 900 fault_flags = VM_FAULT_NORMAL; 901 rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); 902 } 903 if (rv == KERN_SUCCESS) 904 return (0); 905 nogo: 906 if (!usermode) { 907 if (td->td_gd->gd_intr_nesting_level == 0 && 908 td->td_pcb->pcb_onfault) { 909 frame->tf_rip = (register_t)td->td_pcb->pcb_onfault; 910 return (0); 911 } 912 trap_fatal(frame, frame->tf_addr); 913 return (-1); 914 } 915 916 /* 917 * NOTE: on x86_64 we have a tf_addr field in the trapframe, no 918 * kludge is needed to pass the fault address to signal handlers. 919 */ 920 p = td->td_proc; 921 if (td->td_lwp->lwp_vkernel == NULL) { 922 if (bootverbose || freeze_on_seg_fault || ddb_on_seg_fault) { 923 kprintf("seg-fault ft=%04x ff=%04x addr=%p rip=%p " 924 "pid=%d cpu=%d p_comm=%s\n", 925 ftype, fault_flags, 926 (void *)frame->tf_addr, 927 (void *)frame->tf_rip, 928 p->p_pid, mycpu->gd_cpuid, p->p_comm); 929 } 930 #ifdef DDB 931 while (freeze_on_seg_fault) { 932 tsleep(p, 0, "freeze", hz * 20); 933 } 934 if (ddb_on_seg_fault) 935 Debugger("ddb_on_seg_fault"); 936 #endif 937 } 938 939 return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); 940 } 941 942 static void 943 trap_fatal(struct trapframe *frame, vm_offset_t eva) 944 { 945 int code, ss; 946 u_int type; 947 long rsp; 948 struct soft_segment_descriptor softseg; 949 char *msg; 950 951 code = frame->tf_err; 952 type = frame->tf_trapno; 953 sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)], &softseg); 954 955 if (type <= MAX_TRAP_MSG) 956 msg = trap_msg[type]; 957 else 958 msg = "UNKNOWN"; 959 kprintf("\n\nFatal trap %d: %s while in %s mode\n", type, msg, 960 ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); 961 #ifdef SMP 962 /* three separate prints in case of a trap on an unmapped page */ 963 kprintf("cpuid = %d; ", mycpu->gd_cpuid); 964 kprintf("lapic->id = %08x\n", lapic->id); 965 #endif 966 if (type == T_PAGEFLT) { 967 kprintf("fault virtual address = 0x%lx\n", eva); 968 kprintf("fault code = %s %s %s, %s\n", 969 code & PGEX_U ? "user" : "supervisor", 970 code & PGEX_W ? "write" : "read", 971 code & PGEX_I ? "instruction" : "data", 972 code & PGEX_P ? "protection violation" : "page not present"); 973 } 974 kprintf("instruction pointer = 0x%lx:0x%lx\n", 975 frame->tf_cs & 0xffff, frame->tf_rip); 976 if (ISPL(frame->tf_cs) == SEL_UPL) { 977 ss = frame->tf_ss & 0xffff; 978 rsp = frame->tf_rsp; 979 } else { 980 ss = GSEL(GDATA_SEL, SEL_KPL); 981 rsp = (long)&frame->tf_rsp; 982 } 983 kprintf("stack pointer = 0x%x:0x%lx\n", ss, rsp); 984 kprintf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp); 985 kprintf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n", 986 softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); 987 kprintf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n", 988 softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32, 989 softseg.ssd_gran); 990 kprintf("processor eflags = "); 991 if (frame->tf_rflags & PSL_T) 992 kprintf("trace trap, "); 993 if (frame->tf_rflags & PSL_I) 994 kprintf("interrupt enabled, "); 995 if (frame->tf_rflags & PSL_NT) 996 kprintf("nested task, "); 997 if (frame->tf_rflags & PSL_RF) 998 kprintf("resume, "); 999 kprintf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12); 1000 kprintf("current process = "); 1001 if (curproc) { 1002 kprintf("%lu\n", 1003 (u_long)curproc->p_pid); 1004 } else { 1005 kprintf("Idle\n"); 1006 } 1007 kprintf("current thread = pri %d ", curthread->td_pri); 1008 if (curthread->td_critcount) 1009 kprintf("(CRIT)"); 1010 kprintf("\n"); 1011 1012 #ifdef DDB 1013 if ((debugger_on_panic || db_active) && kdb_trap(type, code, frame)) 1014 return; 1015 #endif 1016 kprintf("trap number = %d\n", type); 1017 if (type <= MAX_TRAP_MSG) 1018 panic("%s", trap_msg[type]); 1019 else 1020 panic("unknown/reserved trap"); 1021 } 1022 1023 /* 1024 * Double fault handler. Called when a fault occurs while writing 1025 * a frame for a trap/exception onto the stack. This usually occurs 1026 * when the stack overflows (such is the case with infinite recursion, 1027 * for example). 1028 */ 1029 static __inline 1030 int 1031 in_kstack_guard(register_t rptr) 1032 { 1033 thread_t td = curthread; 1034 1035 if ((char *)rptr >= td->td_kstack && 1036 (char *)rptr < td->td_kstack + PAGE_SIZE) { 1037 return 1; 1038 } 1039 return 0; 1040 } 1041 1042 void 1043 dblfault_handler(struct trapframe *frame) 1044 { 1045 thread_t td = curthread; 1046 1047 if (in_kstack_guard(frame->tf_rsp) || in_kstack_guard(frame->tf_rbp)) { 1048 kprintf("DOUBLE FAULT - KERNEL STACK GUARD HIT!\n"); 1049 if (in_kstack_guard(frame->tf_rsp)) 1050 frame->tf_rsp = (register_t)(td->td_kstack + PAGE_SIZE); 1051 if (in_kstack_guard(frame->tf_rbp)) 1052 frame->tf_rbp = (register_t)(td->td_kstack + PAGE_SIZE); 1053 } else { 1054 kprintf("DOUBLE FAULT\n"); 1055 } 1056 kprintf("\nFatal double fault\n"); 1057 kprintf("rip = 0x%lx\n", frame->tf_rip); 1058 kprintf("rsp = 0x%lx\n", frame->tf_rsp); 1059 kprintf("rbp = 0x%lx\n", frame->tf_rbp); 1060 #ifdef SMP 1061 /* three separate prints in case of a trap on an unmapped page */ 1062 kprintf("cpuid = %d; ", mycpu->gd_cpuid); 1063 kprintf("lapic->id = %08x\n", lapic->id); 1064 #endif 1065 panic("double fault"); 1066 } 1067 1068 /* 1069 * syscall2 - MP aware system call request C handler 1070 * 1071 * A system call is essentially treated as a trap except that the 1072 * MP lock is not held on entry or return. We are responsible for 1073 * obtaining the MP lock if necessary and for handling ASTs 1074 * (e.g. a task switch) prior to return. 1075 * 1076 * MPSAFE 1077 */ 1078 void 1079 syscall2(struct trapframe *frame) 1080 { 1081 struct thread *td = curthread; 1082 struct proc *p = td->td_proc; 1083 struct lwp *lp = td->td_lwp; 1084 caddr_t params; 1085 struct sysent *callp; 1086 register_t orig_tf_rflags; 1087 int sticks; 1088 int error; 1089 int narg; 1090 #ifdef INVARIANTS 1091 int crit_count = td->td_critcount; 1092 #endif 1093 #ifdef SMP 1094 int have_mplock = 0; 1095 #endif 1096 register_t *argp; 1097 u_int code; 1098 int reg, regcnt; 1099 union sysunion args; 1100 register_t *argsdst; 1101 1102 mycpu->gd_cnt.v_syscall++; 1103 1104 #ifdef DIAGNOSTIC 1105 if (ISPL(frame->tf_cs) != SEL_UPL) { 1106 get_mplock(); 1107 panic("syscall"); 1108 /* NOT REACHED */ 1109 } 1110 #endif 1111 1112 KTR_LOG(kernentry_syscall, p->p_pid, lp->lwp_tid, 1113 frame->tf_rax); 1114 1115 userenter(td, p); /* lazy raise our priority */ 1116 1117 reg = 0; 1118 regcnt = 6; 1119 /* 1120 * Misc 1121 */ 1122 sticks = (int)td->td_sticks; 1123 orig_tf_rflags = frame->tf_rflags; 1124 1125 /* 1126 * Virtual kernel intercept - if a VM context managed by a virtual 1127 * kernel issues a system call the virtual kernel handles it, not us. 1128 * Restore the virtual kernel context and return from its system 1129 * call. The current frame is copied out to the virtual kernel. 1130 */ 1131 if (lp->lwp_vkernel && lp->lwp_vkernel->ve) { 1132 vkernel_trap(lp, frame); 1133 error = EJUSTRETURN; 1134 goto out; 1135 } 1136 1137 /* 1138 * Get the system call parameters and account for time 1139 */ 1140 KASSERT(lp->lwp_md.md_regs == frame, 1141 ("Frame mismatch %p %p", lp->lwp_md.md_regs, frame)); 1142 params = (caddr_t)frame->tf_rsp + sizeof(register_t); 1143 code = frame->tf_rax; 1144 1145 if (p->p_sysent->sv_prepsyscall) { 1146 (*p->p_sysent->sv_prepsyscall)( 1147 frame, (int *)(&args.nosys.sysmsg + 1), 1148 &code, ¶ms); 1149 } else { 1150 if (code == SYS_syscall || code == SYS___syscall) { 1151 code = frame->tf_rdi; 1152 reg++; 1153 regcnt--; 1154 } 1155 } 1156 1157 if (p->p_sysent->sv_mask) 1158 code &= p->p_sysent->sv_mask; 1159 1160 if (code >= p->p_sysent->sv_size) 1161 callp = &p->p_sysent->sv_table[0]; 1162 else 1163 callp = &p->p_sysent->sv_table[code]; 1164 1165 narg = callp->sy_narg & SYF_ARGMASK; 1166 1167 /* 1168 * On x86_64 we get up to six arguments in registers. The rest are 1169 * on the stack. The first six members of 'struct trapframe' happen 1170 * to be the registers used to pass arguments, in exactly the right 1171 * order. 1172 */ 1173 argp = &frame->tf_rdi; 1174 argp += reg; 1175 argsdst = (register_t *)(&args.nosys.sysmsg + 1); 1176 /* 1177 * JG can we overflow the space pointed to by 'argsdst' 1178 * either with 'bcopy' or with 'copyin'? 1179 */ 1180 bcopy(argp, argsdst, sizeof(register_t) * regcnt); 1181 /* 1182 * copyin is MP aware, but the tracing code is not 1183 */ 1184 if (narg > regcnt) { 1185 KASSERT(params != NULL, ("copyin args with no params!")); 1186 error = copyin(params, &argsdst[regcnt], 1187 (narg - regcnt) * sizeof(register_t)); 1188 if (error) { 1189 #ifdef KTRACE 1190 if (KTRPOINT(td, KTR_SYSCALL)) { 1191 MAKEMPSAFE(have_mplock); 1192 1193 ktrsyscall(lp, code, narg, 1194 (void *)(&args.nosys.sysmsg + 1)); 1195 } 1196 #endif 1197 goto bad; 1198 } 1199 } 1200 1201 #ifdef KTRACE 1202 if (KTRPOINT(td, KTR_SYSCALL)) { 1203 MAKEMPSAFE(have_mplock); 1204 ktrsyscall(lp, code, narg, (void *)(&args.nosys.sysmsg + 1)); 1205 } 1206 #endif 1207 1208 /* 1209 * Default return value is 0 (will be copied to %rax). Double-value 1210 * returns use %rax and %rdx. %rdx is left unchanged for system 1211 * calls which return only one result. 1212 */ 1213 args.sysmsg_fds[0] = 0; 1214 args.sysmsg_fds[1] = frame->tf_rdx; 1215 1216 /* 1217 * The syscall might manipulate the trap frame. If it does it 1218 * will probably return EJUSTRETURN. 1219 */ 1220 args.sysmsg_frame = frame; 1221 1222 STOPEVENT(p, S_SCE, narg); /* MP aware */ 1223 1224 /* 1225 * NOTE: All system calls run MPSAFE now. The system call itself 1226 * is responsible for getting the MP lock. 1227 */ 1228 #ifdef SYSCALL_DEBUG 1229 uint64_t tscval = rdtsc(); 1230 #endif 1231 error = (*callp->sy_call)(&args); 1232 #ifdef SYSCALL_DEBUG 1233 tscval = rdtsc() - tscval; 1234 tscval = tscval * 1000000 / tsc_frequency; 1235 if (SysCallsWorstCase[code] < tscval) 1236 SysCallsWorstCase[code] = tscval; 1237 #endif 1238 1239 out: 1240 /* 1241 * MP SAFE (we may or may not have the MP lock at this point) 1242 */ 1243 //kprintf("SYSMSG %d ", error); 1244 switch (error) { 1245 case 0: 1246 /* 1247 * Reinitialize proc pointer `p' as it may be different 1248 * if this is a child returning from fork syscall. 1249 */ 1250 p = curproc; 1251 lp = curthread->td_lwp; 1252 frame->tf_rax = args.sysmsg_fds[0]; 1253 frame->tf_rdx = args.sysmsg_fds[1]; 1254 frame->tf_rflags &= ~PSL_C; 1255 break; 1256 case ERESTART: 1257 /* 1258 * Reconstruct pc, we know that 'syscall' is 2 bytes. 1259 * We have to do a full context restore so that %r10 1260 * (which was holding the value of %rcx) is restored for 1261 * the next iteration. 1262 */ 1263 if (frame->tf_err != 0 && frame->tf_err != 2) 1264 kprintf("lp %s:%d frame->tf_err is weird %ld\n", 1265 td->td_comm, lp->lwp_proc->p_pid, frame->tf_err); 1266 frame->tf_rip -= frame->tf_err; 1267 frame->tf_r10 = frame->tf_rcx; 1268 break; 1269 case EJUSTRETURN: 1270 break; 1271 case EASYNC: 1272 panic("Unexpected EASYNC return value (for now)"); 1273 default: 1274 bad: 1275 if (p->p_sysent->sv_errsize) { 1276 if (error >= p->p_sysent->sv_errsize) 1277 error = -1; /* XXX */ 1278 else 1279 error = p->p_sysent->sv_errtbl[error]; 1280 } 1281 frame->tf_rax = error; 1282 frame->tf_rflags |= PSL_C; 1283 break; 1284 } 1285 1286 /* 1287 * Traced syscall. trapsignal() is not MP aware. 1288 */ 1289 if (orig_tf_rflags & PSL_T) { 1290 MAKEMPSAFE(have_mplock); 1291 frame->tf_rflags &= ~PSL_T; 1292 trapsignal(lp, SIGTRAP, TRAP_TRACE); 1293 } 1294 1295 /* 1296 * Handle reschedule and other end-of-syscall issues 1297 */ 1298 userret(lp, frame, sticks); 1299 1300 #ifdef KTRACE 1301 if (KTRPOINT(td, KTR_SYSRET)) { 1302 MAKEMPSAFE(have_mplock); 1303 ktrsysret(lp, code, error, args.sysmsg_result); 1304 } 1305 #endif 1306 1307 /* 1308 * This works because errno is findable through the 1309 * register set. If we ever support an emulation where this 1310 * is not the case, this code will need to be revisited. 1311 */ 1312 STOPEVENT(p, S_SCX, code); 1313 1314 userexit(lp); 1315 #ifdef SMP 1316 /* 1317 * Release the MP lock if we had to get it 1318 */ 1319 if (have_mplock) 1320 rel_mplock(); 1321 #endif 1322 KTR_LOG(kernentry_syscall_ret, p->p_pid, lp->lwp_tid, error); 1323 #ifdef INVARIANTS 1324 KASSERT(crit_count == td->td_critcount, 1325 ("syscall: critical section count mismatch! %d/%d", 1326 crit_count, td->td_pri)); 1327 KASSERT(&td->td_toks_base == td->td_toks_stop, 1328 ("syscall: extra tokens held after trap! %ld", 1329 td->td_toks_stop - &td->td_toks_base)); 1330 #endif 1331 } 1332 1333 /* 1334 * NOTE: mplock not held at any point 1335 */ 1336 void 1337 fork_return(struct lwp *lp, struct trapframe *frame) 1338 { 1339 frame->tf_rax = 0; /* Child returns zero */ 1340 frame->tf_rflags &= ~PSL_C; /* success */ 1341 frame->tf_rdx = 1; 1342 1343 generic_lwp_return(lp, frame); 1344 KTR_LOG(kernentry_fork_ret, lp->lwp_proc->p_pid, lp->lwp_tid); 1345 } 1346 1347 /* 1348 * Simplified back end of syscall(), used when returning from fork() 1349 * directly into user mode. 1350 * 1351 * This code will return back into the fork trampoline code which then 1352 * runs doreti. 1353 * 1354 * NOTE: The mplock is not held at any point. 1355 */ 1356 void 1357 generic_lwp_return(struct lwp *lp, struct trapframe *frame) 1358 { 1359 struct proc *p = lp->lwp_proc; 1360 1361 /* 1362 * Newly forked processes are given a kernel priority. We have to 1363 * adjust the priority to a normal user priority and fake entry 1364 * into the kernel (call userenter()) to install a passive release 1365 * function just in case userret() decides to stop the process. This 1366 * can occur when ^Z races a fork. If we do not install the passive 1367 * release function the current process designation will not be 1368 * released when the thread goes to sleep. 1369 */ 1370 lwkt_setpri_self(TDPRI_USER_NORM); 1371 userenter(lp->lwp_thread, p); 1372 userret(lp, frame, 0); 1373 #ifdef KTRACE 1374 if (KTRPOINT(lp->lwp_thread, KTR_SYSRET)) 1375 ktrsysret(lp, SYS_fork, 0, 0); 1376 #endif 1377 lp->lwp_flags |= LWP_PASSIVE_ACQ; 1378 userexit(lp); 1379 lp->lwp_flags &= ~LWP_PASSIVE_ACQ; 1380 } 1381 1382 /* 1383 * If PGEX_FPFAULT is set then set FP_VIRTFP in the PCB to force a T_DNA 1384 * fault (which is then passed back to the virtual kernel) if an attempt is 1385 * made to use the FP unit. 1386 * 1387 * XXX this is a fairly big hack. 1388 */ 1389 void 1390 set_vkernel_fp(struct trapframe *frame) 1391 { 1392 struct thread *td = curthread; 1393 1394 if (frame->tf_xflags & PGEX_FPFAULT) { 1395 td->td_pcb->pcb_flags |= FP_VIRTFP; 1396 if (mdcpu->gd_npxthread == td) 1397 npxexit(); 1398 } else { 1399 td->td_pcb->pcb_flags &= ~FP_VIRTFP; 1400 } 1401 } 1402 1403 /* 1404 * Called from vkernel_trap() to fixup the vkernel's syscall 1405 * frame for vmspace_ctl() return. 1406 */ 1407 void 1408 cpu_vkernel_trap(struct trapframe *frame, int error) 1409 { 1410 frame->tf_rax = error; 1411 if (error) 1412 frame->tf_rflags |= PSL_C; 1413 else 1414 frame->tf_rflags &= ~PSL_C; 1415 } 1416