1 /*- 2 * Copyright (C) 1994, David Greenman 3 * Copyright (c) 1990, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * the University of Utah, and William Jolitz. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 38 * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $ 39 */ 40 41 /* 42 * x86_64 Trap and System call handling 43 */ 44 45 #include "use_isa.h" 46 47 #include "opt_ddb.h" 48 #include "opt_ktrace.h" 49 50 #include <sys/param.h> 51 #include <sys/systm.h> 52 #include <sys/proc.h> 53 #include <sys/pioctl.h> 54 #include <sys/kernel.h> 55 #include <sys/resourcevar.h> 56 #include <sys/signalvar.h> 57 #include <sys/signal2.h> 58 #include <sys/syscall.h> 59 #include <sys/sysctl.h> 60 #include <sys/sysent.h> 61 #include <sys/uio.h> 62 #include <sys/vmmeter.h> 63 #include <sys/malloc.h> 64 #ifdef KTRACE 65 #include <sys/ktrace.h> 66 #endif 67 #include <sys/ktr.h> 68 #include <sys/upcall.h> 69 #include <sys/vkernel.h> 70 #include <sys/sysproto.h> 71 #include <sys/sysunion.h> 72 #include <sys/vmspace.h> 73 74 #include <vm/vm.h> 75 #include <vm/vm_param.h> 76 #include <sys/lock.h> 77 #include <vm/pmap.h> 78 #include <vm/vm_kern.h> 79 #include <vm/vm_map.h> 80 #include <vm/vm_page.h> 81 #include <vm/vm_extern.h> 82 83 #include <machine/cpu.h> 84 #include <machine/md_var.h> 85 #include <machine/pcb.h> 86 #include <machine/smp.h> 87 #include <machine/tss.h> 88 #include <machine/globaldata.h> 89 90 #include <ddb/ddb.h> 91 92 #include <sys/msgport2.h> 93 #include <sys/thread2.h> 94 #include <sys/mplock2.h> 95 96 #ifdef SMP 97 98 #define MAKEMPSAFE(have_mplock) \ 99 if (have_mplock == 0) { \ 100 get_mplock(); \ 101 have_mplock = 1; \ 102 } 103 104 #else 105 106 #define MAKEMPSAFE(have_mplock) 107 108 #endif 109 110 int (*pmath_emulate) (struct trapframe *); 111 112 extern int trapwrite (unsigned addr); 113 114 static int trap_pfault (struct trapframe *, int, vm_offset_t); 115 static void trap_fatal (struct trapframe *, int, vm_offset_t); 116 void dblfault_handler (void); 117 118 #if 0 119 extern inthand_t IDTVEC(syscall); 120 #endif 121 122 #define MAX_TRAP_MSG 30 123 static char *trap_msg[] = { 124 "", /* 0 unused */ 125 "privileged instruction fault", /* 1 T_PRIVINFLT */ 126 "", /* 2 unused */ 127 "breakpoint instruction fault", /* 3 T_BPTFLT */ 128 "", /* 4 unused */ 129 "", /* 5 unused */ 130 "arithmetic trap", /* 6 T_ARITHTRAP */ 131 "system forced exception", /* 7 T_ASTFLT */ 132 "", /* 8 unused */ 133 "general protection fault", /* 9 T_PROTFLT */ 134 "trace trap", /* 10 T_TRCTRAP */ 135 "", /* 11 unused */ 136 "page fault", /* 12 T_PAGEFLT */ 137 "", /* 13 unused */ 138 "alignment fault", /* 14 T_ALIGNFLT */ 139 "", /* 15 unused */ 140 "", /* 16 unused */ 141 "", /* 17 unused */ 142 "integer divide fault", /* 18 T_DIVIDE */ 143 "non-maskable interrupt trap", /* 19 T_NMI */ 144 "overflow trap", /* 20 T_OFLOW */ 145 "FPU bounds check fault", /* 21 T_BOUND */ 146 "FPU device not available", /* 22 T_DNA */ 147 "double fault", /* 23 T_DOUBLEFLT */ 148 "FPU operand fetch fault", /* 24 T_FPOPFLT */ 149 "invalid TSS fault", /* 25 T_TSSFLT */ 150 "segment not present fault", /* 26 T_SEGNPFLT */ 151 "stack fault", /* 27 T_STKFLT */ 152 "machine check trap", /* 28 T_MCHK */ 153 "SIMD floating-point exception", /* 29 T_XMMFLT */ 154 "reserved (unknown) fault", /* 30 T_RESERVED */ 155 }; 156 157 #ifdef DDB 158 static int ddb_on_nmi = 1; 159 SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, 160 &ddb_on_nmi, 0, "Go to DDB on NMI"); 161 #endif 162 static int panic_on_nmi = 1; 163 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, 164 &panic_on_nmi, 0, "Panic on NMI"); 165 static int fast_release; 166 SYSCTL_INT(_machdep, OID_AUTO, fast_release, CTLFLAG_RW, 167 &fast_release, 0, "Passive Release was optimal"); 168 static int slow_release; 169 SYSCTL_INT(_machdep, OID_AUTO, slow_release, CTLFLAG_RW, 170 &slow_release, 0, "Passive Release was nonoptimal"); 171 #ifdef SMP 172 static int syscall_mpsafe = 1; 173 SYSCTL_INT(_kern, OID_AUTO, syscall_mpsafe, CTLFLAG_RW, 174 &syscall_mpsafe, 0, "Allow MPSAFE marked syscalls to run without BGL"); 175 TUNABLE_INT("kern.syscall_mpsafe", &syscall_mpsafe); 176 static int trap_mpsafe = 1; 177 SYSCTL_INT(_kern, OID_AUTO, trap_mpsafe, CTLFLAG_RW, 178 &trap_mpsafe, 0, "Allow traps to mostly run without the BGL"); 179 TUNABLE_INT("kern.trap_mpsafe", &trap_mpsafe); 180 #endif 181 182 MALLOC_DEFINE(M_SYSMSG, "sysmsg", "sysmsg structure"); 183 extern int max_sysmsg; 184 185 /* 186 * Passively intercepts the thread switch function to increase the thread 187 * priority from a user priority to a kernel priority, reducing 188 * syscall and trap overhead for the case where no switch occurs. 189 * 190 * Synchronizes td_ucred with p_ucred. This is used by system calls, 191 * signal handling, faults, AST traps, and anything else that enters the 192 * kernel from userland and provides the kernel with a stable read-only 193 * copy of the process ucred. 194 */ 195 static __inline void 196 userenter(struct thread *curtd, struct proc *curp) 197 { 198 struct ucred *ocred; 199 struct ucred *ncred; 200 201 curtd->td_release = lwkt_passive_release; 202 203 if (curtd->td_ucred != curp->p_ucred) { 204 ncred = crhold(curp->p_ucred); 205 ocred = curtd->td_ucred; 206 curtd->td_ucred = ncred; 207 if (ocred) 208 crfree(ocred); 209 } 210 } 211 212 /* 213 * Handle signals, upcalls, profiling, and other AST's and/or tasks that 214 * must be completed before we can return to or try to return to userland. 215 * 216 * Note that td_sticks is a 64 bit quantity, but there's no point doing 64 217 * arithmatic on the delta calculation so the absolute tick values are 218 * truncated to an integer. 219 */ 220 static void 221 userret(struct lwp *lp, struct trapframe *frame, int sticks) 222 { 223 struct proc *p = lp->lwp_proc; 224 int sig; 225 226 /* 227 * Charge system time if profiling. Note: times are in microseconds. 228 * This may do a copyout and block, so do it first even though it 229 * means some system time will be charged as user time. 230 */ 231 if (p->p_flag & P_PROFIL) { 232 addupc_task(p, frame->tf_rip, 233 (u_int)((int)lp->lwp_thread->td_sticks - sticks)); 234 } 235 236 recheck: 237 /* 238 * If the jungle wants us dead, so be it. 239 */ 240 if (lp->lwp_flag & LWP_WEXIT) { 241 get_mplock(); 242 lwp_exit(0); 243 rel_mplock(); /* NOT REACHED */ 244 } 245 246 /* 247 * Block here if we are in a stopped state. 248 */ 249 if (p->p_stat == SSTOP) { 250 get_mplock(); 251 tstop(); 252 rel_mplock(); 253 goto recheck; 254 } 255 256 /* 257 * Post any pending upcalls 258 */ 259 if (p->p_flag & P_UPCALLPEND) { 260 get_mplock(); 261 p->p_flag &= ~P_UPCALLPEND; 262 postupcall(lp); 263 rel_mplock(); 264 goto recheck; 265 } 266 267 /* 268 * Post any pending signals 269 */ 270 if ((sig = CURSIG_TRACE(lp)) != 0) { 271 get_mplock(); 272 postsig(sig); 273 rel_mplock(); 274 goto recheck; 275 } 276 277 /* 278 * block here if we are swapped out, but still process signals 279 * (such as SIGKILL). proc0 (the swapin scheduler) is already 280 * aware of our situation, we do not have to wake it up. 281 */ 282 if (p->p_flag & P_SWAPPEDOUT) { 283 get_mplock(); 284 p->p_flag |= P_SWAPWAIT; 285 swapin_request(); 286 if (p->p_flag & P_SWAPWAIT) 287 tsleep(p, PCATCH, "SWOUT", 0); 288 p->p_flag &= ~P_SWAPWAIT; 289 rel_mplock(); 290 goto recheck; 291 } 292 293 /* 294 * Make sure postsig() handled request to restore old signal mask after 295 * running signal handler. 296 */ 297 KKASSERT((lp->lwp_flag & LWP_OLDMASK) == 0); 298 } 299 300 /* 301 * Cleanup from userenter and any passive release that might have occured. 302 * We must reclaim the current-process designation before we can return 303 * to usermode. We also handle both LWKT and USER reschedule requests. 304 */ 305 static __inline void 306 userexit(struct lwp *lp) 307 { 308 struct thread *td = lp->lwp_thread; 309 /* globaldata_t gd = td->td_gd; */ 310 311 /* 312 * Handle stop requests at kernel priority. Any requests queued 313 * after this loop will generate another AST. 314 */ 315 while (lp->lwp_proc->p_stat == SSTOP) { 316 get_mplock(); 317 tstop(); 318 rel_mplock(); 319 } 320 321 /* 322 * Reduce our priority in preparation for a return to userland. If 323 * our passive release function was still in place, our priority was 324 * never raised and does not need to be reduced. 325 */ 326 lwkt_passive_recover(td); 327 328 /* 329 * Become the current user scheduled process if we aren't already, 330 * and deal with reschedule requests and other factors. 331 */ 332 lp->lwp_proc->p_usched->acquire_curproc(lp); 333 /* WARNING: we may have migrated cpu's */ 334 /* gd = td->td_gd; */ 335 } 336 337 #if !defined(KTR_KERNENTRY) 338 #define KTR_KERNENTRY KTR_ALL 339 #endif 340 KTR_INFO_MASTER(kernentry); 341 KTR_INFO(KTR_KERNENTRY, kernentry, trap, 0, "pid=%d, tid=%d, trapno=%d, eva=%p", 342 sizeof(int) + sizeof(int) + sizeof(int) + sizeof(vm_offset_t)); 343 KTR_INFO(KTR_KERNENTRY, kernentry, trap_ret, 0, "pid=%d, tid=%d", 344 sizeof(int) + sizeof(int)); 345 KTR_INFO(KTR_KERNENTRY, kernentry, syscall, 0, "pid=%d, tid=%d, call=%d", 346 sizeof(int) + sizeof(int) + sizeof(int)); 347 KTR_INFO(KTR_KERNENTRY, kernentry, syscall_ret, 0, "pid=%d, tid=%d, err=%d", 348 sizeof(int) + sizeof(int) + sizeof(int)); 349 KTR_INFO(KTR_KERNENTRY, kernentry, fork_ret, 0, "pid=%d, tid=%d", 350 sizeof(int) + sizeof(int)); 351 352 /* 353 * Exception, fault, and trap interface to the kernel. 354 * This common code is called from assembly language IDT gate entry 355 * routines that prepare a suitable stack frame, and restore this 356 * frame after the exception has been processed. 357 * 358 * This function is also called from doreti in an interlock to handle ASTs. 359 * For example: hardwareint->INTROUTINE->(set ast)->doreti->trap 360 * 361 * NOTE! We have to retrieve the fault address prior to obtaining the 362 * MP lock because get_mplock() may switch out. YYY cr2 really ought 363 * to be retrieved by the assembly code, not here. 364 * 365 * XXX gd_trap_nesting_level currently prevents lwkt_switch() from panicing 366 * if an attempt is made to switch from a fast interrupt or IPI. This is 367 * necessary to properly take fatal kernel traps on SMP machines if 368 * get_mplock() has to block. 369 */ 370 371 void 372 user_trap(struct trapframe *frame) 373 { 374 struct globaldata *gd = mycpu; 375 struct thread *td = gd->gd_curthread; 376 struct lwp *lp = td->td_lwp; 377 struct proc *p; 378 int sticks = 0; 379 int i = 0, ucode = 0, type, code; 380 #ifdef SMP 381 int have_mplock = 0; 382 #endif 383 #ifdef INVARIANTS 384 int crit_count = td->td_pri & ~TDPRI_MASK; 385 #endif 386 vm_offset_t eva; 387 388 p = td->td_proc; 389 390 if (frame->tf_trapno == T_PAGEFLT) 391 eva = frame->tf_addr; 392 else 393 eva = 0; 394 #if 0 395 kprintf("USER_TRAP AT %08lx xflags %ld trapno %ld eva %08lx\n", 396 frame->tf_rip, frame->tf_xflags, frame->tf_trapno, eva); 397 #endif 398 399 /* 400 * Everything coming from user mode runs through user_trap, 401 * including system calls. 402 */ 403 if (frame->tf_trapno == T_FAST_SYSCALL) { 404 syscall2(frame); 405 return; 406 } 407 408 KTR_LOG(kernentry_trap, lp->lwp_proc->p_pid, lp->lwp_tid, 409 frame->tf_trapno, eva); 410 411 #ifdef DDB 412 if (db_active) { 413 eva = (frame->tf_trapno == T_PAGEFLT ? rcr2() : 0); 414 ++gd->gd_trap_nesting_level; 415 MAKEMPSAFE(have_mplock); 416 trap_fatal(frame, TRUE, eva); 417 --gd->gd_trap_nesting_level; 418 goto out2; 419 } 420 #endif 421 422 ++gd->gd_trap_nesting_level; 423 #ifdef SMP 424 if (trap_mpsafe == 0) 425 MAKEMPSAFE(have_mplock); 426 #endif 427 428 --gd->gd_trap_nesting_level; 429 430 #if defined(I586_CPU) && !defined(NO_F00F_HACK) 431 restart: 432 #endif 433 type = frame->tf_trapno; 434 code = frame->tf_err; 435 436 userenter(td, p); 437 438 sticks = (int)td->td_sticks; 439 lp->lwp_md.md_regs = frame; 440 441 switch (type) { 442 case T_PRIVINFLT: /* privileged instruction fault */ 443 ucode = type; 444 i = SIGILL; 445 break; 446 447 case T_BPTFLT: /* bpt instruction fault */ 448 case T_TRCTRAP: /* trace trap */ 449 frame->tf_rflags &= ~PSL_T; 450 i = SIGTRAP; 451 break; 452 453 case T_ARITHTRAP: /* arithmetic trap */ 454 ucode = code; 455 i = SIGFPE; 456 break; 457 458 case T_ASTFLT: /* Allow process switch */ 459 mycpu->gd_cnt.v_soft++; 460 if (mycpu->gd_reqflags & RQF_AST_OWEUPC) { 461 atomic_clear_int_nonlocked(&mycpu->gd_reqflags, 462 RQF_AST_OWEUPC); 463 addupc_task(p, p->p_prof.pr_addr, 464 p->p_prof.pr_ticks); 465 } 466 goto out; 467 468 /* 469 * The following two traps can happen in 470 * vm86 mode, and, if so, we want to handle 471 * them specially. 472 */ 473 case T_PROTFLT: /* general protection fault */ 474 case T_STKFLT: /* stack fault */ 475 #if 0 476 if (frame->tf_eflags & PSL_VM) { 477 i = vm86_emulate((struct vm86frame *)frame); 478 if (i == 0) 479 goto out; 480 break; 481 } 482 #endif 483 /* FALL THROUGH */ 484 485 case T_SEGNPFLT: /* segment not present fault */ 486 case T_TSSFLT: /* invalid TSS fault */ 487 case T_DOUBLEFLT: /* double fault */ 488 default: 489 ucode = code + BUS_SEGM_FAULT ; 490 i = SIGBUS; 491 break; 492 493 case T_PAGEFLT: /* page fault */ 494 MAKEMPSAFE(have_mplock); 495 i = trap_pfault(frame, TRUE, eva); 496 if (i == -1) 497 goto out; 498 #if defined(I586_CPU) && !defined(NO_F00F_HACK) 499 if (i == -2) 500 goto restart; 501 #endif 502 if (i == 0) 503 goto out; 504 505 ucode = T_PAGEFLT; 506 break; 507 508 case T_DIVIDE: /* integer divide fault */ 509 ucode = FPE_INTDIV; 510 i = SIGFPE; 511 break; 512 513 #if NISA > 0 514 case T_NMI: 515 MAKEMPSAFE(have_mplock); 516 /* machine/parity/power fail/"kitchen sink" faults */ 517 if (isa_nmi(code) == 0) { 518 #ifdef DDB 519 /* 520 * NMI can be hooked up to a pushbutton 521 * for debugging. 522 */ 523 if (ddb_on_nmi) { 524 kprintf ("NMI ... going to debugger\n"); 525 kdb_trap (type, 0, frame); 526 } 527 #endif /* DDB */ 528 goto out2; 529 } else if (panic_on_nmi) 530 panic("NMI indicates hardware failure"); 531 break; 532 #endif /* NISA > 0 */ 533 534 case T_OFLOW: /* integer overflow fault */ 535 ucode = FPE_INTOVF; 536 i = SIGFPE; 537 break; 538 539 case T_BOUND: /* bounds check fault */ 540 ucode = FPE_FLTSUB; 541 i = SIGFPE; 542 break; 543 544 case T_DNA: 545 /* 546 * Virtual kernel intercept - pass the DNA exception 547 * to the (emulated) virtual kernel if it asked to handle 548 * it. This occurs when the virtual kernel is holding 549 * onto the FP context for a different emulated 550 * process then the one currently running. 551 * 552 * We must still call npxdna() since we may have 553 * saved FP state that the (emulated) virtual kernel 554 * needs to hand over to a different emulated process. 555 */ 556 if (lp->lwp_vkernel && lp->lwp_vkernel->ve && 557 (td->td_pcb->pcb_flags & FP_VIRTFP) 558 ) { 559 npxdna(frame); 560 break; 561 } 562 /* 563 * The kernel may have switched out the FP unit's 564 * state, causing the user process to take a fault 565 * when it tries to use the FP unit. Restore the 566 * state here 567 */ 568 if (npxdna(frame)) 569 goto out; 570 if (!pmath_emulate) { 571 i = SIGFPE; 572 ucode = FPE_FPU_NP_TRAP; 573 break; 574 } 575 i = (*pmath_emulate)(frame); 576 if (i == 0) { 577 if (!(frame->tf_rflags & PSL_T)) 578 goto out2; 579 frame->tf_rflags &= ~PSL_T; 580 i = SIGTRAP; 581 } 582 /* else ucode = emulator_only_knows() XXX */ 583 break; 584 585 case T_FPOPFLT: /* FPU operand fetch fault */ 586 ucode = T_FPOPFLT; 587 i = SIGILL; 588 break; 589 590 case T_XMMFLT: /* SIMD floating-point exception */ 591 ucode = 0; /* XXX */ 592 i = SIGFPE; 593 break; 594 } 595 596 /* 597 * Virtual kernel intercept - if the fault is directly related to a 598 * VM context managed by a virtual kernel then let the virtual kernel 599 * handle it. 600 */ 601 if (lp->lwp_vkernel && lp->lwp_vkernel->ve) { 602 vkernel_trap(lp, frame); 603 goto out; 604 } 605 606 /* 607 * Translate fault for emulators (e.g. Linux) 608 */ 609 if (*p->p_sysent->sv_transtrap) 610 i = (*p->p_sysent->sv_transtrap)(i, type); 611 612 MAKEMPSAFE(have_mplock); 613 trapsignal(lp, i, ucode); 614 615 #ifdef DEBUG 616 if (type <= MAX_TRAP_MSG) { 617 uprintf("fatal process exception: %s", 618 trap_msg[type]); 619 if ((type == T_PAGEFLT) || (type == T_PROTFLT)) 620 uprintf(", fault VA = 0x%lx", (u_long)eva); 621 uprintf("\n"); 622 } 623 #endif 624 625 out: 626 #ifdef SMP 627 KASSERT(td->td_mpcount == have_mplock, ("badmpcount trap/end from %p", (void *)frame->tf_rip)); 628 #endif 629 userret(lp, frame, sticks); 630 userexit(lp); 631 out2: ; 632 #ifdef SMP 633 if (have_mplock) 634 rel_mplock(); 635 #endif 636 KTR_LOG(kernentry_trap_ret, lp->lwp_proc->p_pid, lp->lwp_tid); 637 #ifdef INVARIANTS 638 KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), 639 ("syscall: critical section count mismatch! %d/%d", 640 crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); 641 #endif 642 } 643 644 void 645 kern_trap(struct trapframe *frame) 646 { 647 struct globaldata *gd = mycpu; 648 struct thread *td = gd->gd_curthread; 649 struct lwp *lp; 650 struct proc *p; 651 int i = 0, ucode = 0, type, code; 652 #ifdef SMP 653 int have_mplock = 0; 654 #endif 655 #ifdef INVARIANTS 656 int crit_count = td->td_pri & ~TDPRI_MASK; 657 #endif 658 vm_offset_t eva; 659 660 lp = td->td_lwp; 661 p = td->td_proc; 662 663 if (frame->tf_trapno == T_PAGEFLT) 664 eva = frame->tf_addr; 665 else 666 eva = 0; 667 668 #ifdef DDB 669 if (db_active) { 670 ++gd->gd_trap_nesting_level; 671 MAKEMPSAFE(have_mplock); 672 trap_fatal(frame, FALSE, eva); 673 --gd->gd_trap_nesting_level; 674 goto out2; 675 } 676 #endif 677 678 ++gd->gd_trap_nesting_level; 679 680 #ifdef SMP 681 if (trap_mpsafe == 0) 682 MAKEMPSAFE(have_mplock); 683 #endif 684 685 --gd->gd_trap_nesting_level; 686 687 type = frame->tf_trapno; 688 code = frame->tf_err; 689 690 #if 0 691 kernel_trap: 692 #endif 693 /* kernel trap */ 694 695 switch (type) { 696 case T_PAGEFLT: /* page fault */ 697 MAKEMPSAFE(have_mplock); 698 trap_pfault(frame, FALSE, eva); 699 goto out2; 700 701 case T_DNA: 702 /* 703 * The kernel may be using npx for copying or other 704 * purposes. 705 */ 706 panic("kernel NPX should not happen"); 707 if (npxdna(frame)) 708 goto out2; 709 break; 710 711 case T_PROTFLT: /* general protection fault */ 712 case T_SEGNPFLT: /* segment not present fault */ 713 /* 714 * Invalid segment selectors and out of bounds 715 * %eip's and %esp's can be set up in user mode. 716 * This causes a fault in kernel mode when the 717 * kernel tries to return to user mode. We want 718 * to get this fault so that we can fix the 719 * problem here and not have to check all the 720 * selectors and pointers when the user changes 721 * them. 722 */ 723 if (mycpu->gd_intr_nesting_level == 0) { 724 if (td->td_pcb->pcb_onfault) { 725 frame->tf_rip = 726 (register_t)td->td_pcb->pcb_onfault; 727 goto out2; 728 } 729 } 730 break; 731 732 case T_TSSFLT: 733 /* 734 * PSL_NT can be set in user mode and isn't cleared 735 * automatically when the kernel is entered. This 736 * causes a TSS fault when the kernel attempts to 737 * `iret' because the TSS link is uninitialized. We 738 * want to get this fault so that we can fix the 739 * problem here and not every time the kernel is 740 * entered. 741 */ 742 if (frame->tf_rflags & PSL_NT) { 743 frame->tf_rflags &= ~PSL_NT; 744 goto out2; 745 } 746 break; 747 748 case T_TRCTRAP: /* trace trap */ 749 #if 0 750 if (frame->tf_eip == (int)IDTVEC(syscall)) { 751 /* 752 * We've just entered system mode via the 753 * syscall lcall. Continue single stepping 754 * silently until the syscall handler has 755 * saved the flags. 756 */ 757 goto out2; 758 } 759 if (frame->tf_eip == (int)IDTVEC(syscall) + 1) { 760 /* 761 * The syscall handler has now saved the 762 * flags. Stop single stepping it. 763 */ 764 frame->tf_eflags &= ~PSL_T; 765 goto out2; 766 } 767 #endif 768 #if 0 769 /* 770 * Ignore debug register trace traps due to 771 * accesses in the user's address space, which 772 * can happen under several conditions such as 773 * if a user sets a watchpoint on a buffer and 774 * then passes that buffer to a system call. 775 * We still want to get TRCTRAPS for addresses 776 * in kernel space because that is useful when 777 * debugging the kernel. 778 */ 779 if (user_dbreg_trap()) { 780 /* 781 * Reset breakpoint bits because the 782 * processor doesn't 783 */ 784 load_dr6(rdr6() & 0xfffffff0); 785 goto out2; 786 } 787 #endif 788 /* 789 * Fall through (TRCTRAP kernel mode, kernel address) 790 */ 791 case T_BPTFLT: 792 /* 793 * If DDB is enabled, let it handle the debugger trap. 794 * Otherwise, debugger traps "can't happen". 795 */ 796 #ifdef DDB 797 MAKEMPSAFE(have_mplock); 798 if (kdb_trap (type, 0, frame)) 799 goto out2; 800 #endif 801 break; 802 case T_DIVIDE: 803 MAKEMPSAFE(have_mplock); 804 trap_fatal(frame, FALSE, eva); 805 goto out2; 806 case T_NMI: 807 MAKEMPSAFE(have_mplock); 808 trap_fatal(frame, FALSE, eva); 809 goto out2; 810 case T_SYSCALL80: 811 case T_FAST_SYSCALL: 812 /* 813 * Ignore this trap generated from a spurious SIGTRAP. 814 * 815 * single stepping in / syscalls leads to spurious / SIGTRAP 816 * so ignore 817 * 818 * Haiku (c) 2007 Simon 'corecode' Schubert 819 */ 820 goto out2; 821 } 822 823 /* 824 * Translate fault for emulators (e.g. Linux) 825 */ 826 if (*p->p_sysent->sv_transtrap) 827 i = (*p->p_sysent->sv_transtrap)(i, type); 828 829 MAKEMPSAFE(have_mplock); 830 trapsignal(lp, i, ucode); 831 832 #ifdef DEBUG 833 if (type <= MAX_TRAP_MSG) { 834 uprintf("fatal process exception: %s", 835 trap_msg[type]); 836 if ((type == T_PAGEFLT) || (type == T_PROTFLT)) 837 uprintf(", fault VA = 0x%lx", (u_long)eva); 838 uprintf("\n"); 839 } 840 #endif 841 842 out2: 843 ; 844 #ifdef SMP 845 if (have_mplock) 846 rel_mplock(); 847 #endif 848 #ifdef INVARIANTS 849 KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), 850 ("syscall: critical section count mismatch! %d/%d", 851 crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); 852 #endif 853 } 854 855 int 856 trap_pfault(struct trapframe *frame, int usermode, vm_offset_t eva) 857 { 858 vm_offset_t va; 859 struct vmspace *vm = NULL; 860 vm_map_t map = 0; 861 int rv = 0; 862 vm_prot_t ftype; 863 thread_t td = curthread; 864 struct lwp *lp = td->td_lwp; 865 866 va = trunc_page(eva); 867 if (usermode == FALSE) { 868 /* 869 * This is a fault on kernel virtual memory. 870 */ 871 map = &kernel_map; 872 } else { 873 /* 874 * This is a fault on non-kernel virtual memory. 875 * vm is initialized above to NULL. If curproc is NULL 876 * or curproc->p_vmspace is NULL the fault is fatal. 877 */ 878 if (lp != NULL) 879 vm = lp->lwp_vmspace; 880 881 if (vm == NULL) 882 goto nogo; 883 884 map = &vm->vm_map; 885 } 886 887 if (frame->tf_err & PGEX_W) 888 ftype = VM_PROT_READ | VM_PROT_WRITE; 889 else 890 ftype = VM_PROT_READ; 891 892 if (map != &kernel_map) { 893 /* 894 * Keep swapout from messing with us during this 895 * critical time. 896 */ 897 PHOLD(lp->lwp_proc); 898 899 /* 900 * Grow the stack if necessary 901 */ 902 /* grow_stack returns false only if va falls into 903 * a growable stack region and the stack growth 904 * fails. It returns true if va was not within 905 * a growable stack region, or if the stack 906 * growth succeeded. 907 */ 908 if (!grow_stack (lp->lwp_proc, va)) { 909 rv = KERN_FAILURE; 910 PRELE(lp->lwp_proc); 911 goto nogo; 912 } 913 914 /* Fault in the user page: */ 915 rv = vm_fault(map, va, ftype, 916 (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY 917 : VM_FAULT_NORMAL); 918 919 PRELE(lp->lwp_proc); 920 } else { 921 /* 922 * Don't have to worry about process locking or stacks in the kernel. 923 */ 924 rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); 925 } 926 927 if (rv == KERN_SUCCESS) 928 return (0); 929 nogo: 930 if (!usermode) { 931 if (td->td_gd->gd_intr_nesting_level == 0 && 932 td->td_pcb->pcb_onfault) { 933 frame->tf_rip = (register_t)td->td_pcb->pcb_onfault; 934 return (0); 935 } 936 trap_fatal(frame, usermode, eva); 937 return (-1); 938 } 939 940 /* 941 * NOTE: on x86_64 we have a tf_addr field in the trapframe, no 942 * kludge is needed to pass the fault address to signal handlers. 943 */ 944 struct proc *p = td->td_proc; 945 kprintf("seg-fault accessing address %p rip=%p pid=%d p_comm=%s\n", 946 (void *)va, (void *)frame->tf_rip, p->p_pid, p->p_comm); 947 /* Debugger("seg-fault"); */ 948 949 return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); 950 } 951 952 static void 953 trap_fatal(struct trapframe *frame, int usermode, vm_offset_t eva) 954 { 955 int code, type, ss; 956 long rsp; 957 958 code = frame->tf_xflags; 959 type = frame->tf_trapno; 960 961 if (type <= MAX_TRAP_MSG) { 962 kprintf("\n\nFatal trap %d: %s while in %s mode\n", 963 type, trap_msg[type], 964 (usermode ? "user" : "kernel")); 965 } 966 #ifdef SMP 967 /* two separate prints in case of a trap on an unmapped page */ 968 kprintf("mp_lock = %08x; ", mp_lock); 969 kprintf("cpuid = %d\n", mycpu->gd_cpuid); 970 #endif 971 if (type == T_PAGEFLT) { 972 kprintf("fault virtual address = %p\n", (void *)eva); 973 kprintf("fault code = %s %s, %s\n", 974 usermode ? "user" : "supervisor", 975 code & PGEX_W ? "write" : "read", 976 code & PGEX_P ? "protection violation" : "page not present"); 977 } 978 kprintf("instruction pointer = 0x%lx:0x%lx\n", 979 frame->tf_cs & 0xffff, frame->tf_rip); 980 if (usermode) { 981 ss = frame->tf_ss & 0xffff; 982 rsp = frame->tf_rsp; 983 } else { 984 ss = GSEL(GDATA_SEL, SEL_KPL); 985 rsp = (long)&frame->tf_rsp; 986 } 987 kprintf("stack pointer = 0x%x:0x%lx\n", ss, rsp); 988 kprintf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp); 989 kprintf("processor eflags = "); 990 if (frame->tf_rflags & PSL_T) 991 kprintf("trace trap, "); 992 if (frame->tf_rflags & PSL_I) 993 kprintf("interrupt enabled, "); 994 if (frame->tf_rflags & PSL_NT) 995 kprintf("nested task, "); 996 if (frame->tf_rflags & PSL_RF) 997 kprintf("resume, "); 998 #if 0 999 if (frame->tf_eflags & PSL_VM) 1000 kprintf("vm86, "); 1001 #endif 1002 kprintf("IOPL = %jd\n", (intmax_t)((frame->tf_rflags & PSL_IOPL) >> 12)); 1003 kprintf("current process = "); 1004 if (curproc) { 1005 kprintf("%lu (%s)\n", 1006 (u_long)curproc->p_pid, curproc->p_comm ? 1007 curproc->p_comm : ""); 1008 } else { 1009 kprintf("Idle\n"); 1010 } 1011 kprintf("current thread = pri %d ", curthread->td_pri); 1012 if (curthread->td_pri >= TDPRI_CRIT) 1013 kprintf("(CRIT)"); 1014 kprintf("\n"); 1015 #ifdef SMP 1016 /** 1017 * XXX FIXME: 1018 * we probably SHOULD have stopped the other CPUs before now! 1019 * another CPU COULD have been touching cpl at this moment... 1020 */ 1021 kprintf(" <- SMP: XXX"); 1022 #endif 1023 kprintf("\n"); 1024 1025 #ifdef KDB 1026 if (kdb_trap(&psl)) 1027 return; 1028 #endif 1029 #ifdef DDB 1030 if ((debugger_on_panic || db_active) && kdb_trap(type, code, frame)) 1031 return; 1032 #endif 1033 kprintf("trap number = %d\n", type); 1034 if (type <= MAX_TRAP_MSG) 1035 panic("%s", trap_msg[type]); 1036 else 1037 panic("unknown/reserved trap"); 1038 } 1039 1040 /* 1041 * Double fault handler. Called when a fault occurs while writing 1042 * a frame for a trap/exception onto the stack. This usually occurs 1043 * when the stack overflows (such is the case with infinite recursion, 1044 * for example). 1045 * 1046 * XXX Note that the current PTD gets replaced by IdlePTD when the 1047 * task switch occurs. This means that the stack that was active at 1048 * the time of the double fault is not available at <kstack> unless 1049 * the machine was idle when the double fault occurred. The downside 1050 * of this is that "trace <ebp>" in ddb won't work. 1051 */ 1052 void 1053 dblfault_handler(void) 1054 { 1055 #if JG 1056 struct mdglobaldata *gd = mdcpu; 1057 #endif 1058 1059 kprintf("\nFatal double fault:\n"); 1060 #if JG 1061 kprintf("rip = 0x%lx\n", gd->gd_common_tss.tss_rip); 1062 kprintf("rsp = 0x%lx\n", gd->gd_common_tss.tss_rsp); 1063 kprintf("rbp = 0x%lx\n", gd->gd_common_tss.tss_rbp); 1064 #endif 1065 #ifdef SMP 1066 /* two separate prints in case of a trap on an unmapped page */ 1067 kprintf("mp_lock = %08x; ", mp_lock); 1068 kprintf("cpuid = %d\n", mycpu->gd_cpuid); 1069 #endif 1070 panic("double fault"); 1071 } 1072 1073 /* 1074 * Compensate for 386 brain damage (missing URKR). 1075 * This is a little simpler than the pagefault handler in trap() because 1076 * it the page tables have already been faulted in and high addresses 1077 * are thrown out early for other reasons. 1078 */ 1079 int 1080 trapwrite(unsigned addr) 1081 { 1082 struct lwp *lp; 1083 vm_offset_t va; 1084 struct vmspace *vm; 1085 int rv; 1086 1087 va = trunc_page((vm_offset_t)addr); 1088 /* 1089 * XXX - MAX is END. Changed > to >= for temp. fix. 1090 */ 1091 if (va >= VM_MAX_USER_ADDRESS) 1092 return (1); 1093 1094 lp = curthread->td_lwp; 1095 vm = lp->lwp_vmspace; 1096 1097 PHOLD(lp->lwp_proc); 1098 1099 if (!grow_stack (lp->lwp_proc, va)) { 1100 PRELE(lp->lwp_proc); 1101 return (1); 1102 } 1103 1104 /* 1105 * fault the data page 1106 */ 1107 rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); 1108 1109 PRELE(lp->lwp_proc); 1110 1111 if (rv != KERN_SUCCESS) 1112 return 1; 1113 1114 return (0); 1115 } 1116 1117 /* 1118 * syscall2 - MP aware system call request C handler 1119 * 1120 * A system call is essentially treated as a trap except that the 1121 * MP lock is not held on entry or return. We are responsible for 1122 * obtaining the MP lock if necessary and for handling ASTs 1123 * (e.g. a task switch) prior to return. 1124 * 1125 * In general, only simple access and manipulation of curproc and 1126 * the current stack is allowed without having to hold MP lock. 1127 * 1128 * MPSAFE - note that large sections of this routine are run without 1129 * the MP lock. 1130 */ 1131 void 1132 syscall2(struct trapframe *frame) 1133 { 1134 struct thread *td = curthread; 1135 struct proc *p = td->td_proc; 1136 struct lwp *lp = td->td_lwp; 1137 caddr_t params; 1138 struct sysent *callp; 1139 register_t orig_tf_rflags; 1140 int sticks; 1141 int error; 1142 int narg; 1143 #ifdef INVARIANTS 1144 int crit_count = td->td_pri & ~TDPRI_MASK; 1145 #endif 1146 #ifdef SMP 1147 int have_mplock = 0; 1148 #endif 1149 register_t *argp; 1150 u_int code; 1151 int reg, regcnt; 1152 union sysunion args; 1153 register_t *argsdst; 1154 1155 mycpu->gd_cnt.v_syscall++; 1156 1157 KTR_LOG(kernentry_syscall, lp->lwp_proc->p_pid, lp->lwp_tid, 1158 frame->tf_eax); 1159 1160 #ifdef SMP 1161 KASSERT(td->td_mpcount == 0, ("badmpcount syscall2 from %p", (void *)frame->tf_rip)); 1162 if (syscall_mpsafe == 0) 1163 MAKEMPSAFE(have_mplock); 1164 #endif 1165 userenter(td, p); /* lazy raise our priority */ 1166 1167 reg = 0; 1168 regcnt = 6; 1169 /* 1170 * Misc 1171 */ 1172 sticks = (int)td->td_sticks; 1173 orig_tf_rflags = frame->tf_rflags; 1174 1175 /* 1176 * Virtual kernel intercept - if a VM context managed by a virtual 1177 * kernel issues a system call the virtual kernel handles it, not us. 1178 * Restore the virtual kernel context and return from its system 1179 * call. The current frame is copied out to the virtual kernel. 1180 */ 1181 if (lp->lwp_vkernel && lp->lwp_vkernel->ve) { 1182 vkernel_trap(lp, frame); 1183 error = EJUSTRETURN; 1184 goto out; 1185 } 1186 1187 /* 1188 * Get the system call parameters and account for time 1189 */ 1190 lp->lwp_md.md_regs = frame; 1191 params = (caddr_t)frame->tf_rsp + sizeof(register_t); 1192 code = frame->tf_rax; 1193 1194 if (p->p_sysent->sv_prepsyscall) { 1195 (*p->p_sysent->sv_prepsyscall)( 1196 frame, (int *)(&args.nosys.sysmsg + 1), 1197 &code, ¶ms); 1198 } else { 1199 if (code == SYS_syscall || code == SYS___syscall) { 1200 code = frame->tf_rdi; 1201 reg++; 1202 regcnt--; 1203 } 1204 } 1205 1206 if (p->p_sysent->sv_mask) 1207 code &= p->p_sysent->sv_mask; 1208 1209 if (code >= p->p_sysent->sv_size) 1210 callp = &p->p_sysent->sv_table[0]; 1211 else 1212 callp = &p->p_sysent->sv_table[code]; 1213 1214 narg = callp->sy_narg & SYF_ARGMASK; 1215 1216 /* 1217 * On x86_64 we get up to six arguments in registers. The rest are 1218 * on the stack. The first six members of 'struct trapframe' happen 1219 * to be the registers used to pass arguments, in exactly the right 1220 * order. 1221 */ 1222 argp = &frame->tf_rdi; 1223 argp += reg; 1224 argsdst = (register_t *)(&args.nosys.sysmsg + 1); 1225 /* 1226 * JG can we overflow the space pointed to by 'argsdst' 1227 * either with 'bcopy' or with 'copyin'? 1228 */ 1229 bcopy(argp, argsdst, sizeof(register_t) * regcnt); 1230 /* 1231 * copyin is MP aware, but the tracing code is not 1232 */ 1233 if (narg > regcnt) { 1234 KASSERT(params != NULL, ("copyin args with no params!")); 1235 error = copyin(params, &argsdst[regcnt], 1236 (narg - regcnt) * sizeof(register_t)); 1237 if (error) { 1238 #ifdef KTRACE 1239 if (KTRPOINT(td, KTR_SYSCALL)) { 1240 MAKEMPSAFE(have_mplock); 1241 1242 ktrsyscall(lp, code, narg, 1243 (void *)(&args.nosys.sysmsg + 1)); 1244 } 1245 #endif 1246 goto bad; 1247 } 1248 } 1249 1250 #ifdef KTRACE 1251 if (KTRPOINT(td, KTR_SYSCALL)) { 1252 MAKEMPSAFE(have_mplock); 1253 ktrsyscall(lp, code, narg, (void *)(&args.nosys.sysmsg + 1)); 1254 } 1255 #endif 1256 1257 /* 1258 * Default return value is 0 (will be copied to %rax). Double-value 1259 * returns use %rax and %rdx. %rdx is left unchanged for system 1260 * calls which return only one result. 1261 */ 1262 args.sysmsg_fds[0] = 0; 1263 args.sysmsg_fds[1] = frame->tf_rdx; 1264 1265 /* 1266 * The syscall might manipulate the trap frame. If it does it 1267 * will probably return EJUSTRETURN. 1268 */ 1269 args.sysmsg_frame = frame; 1270 1271 STOPEVENT(p, S_SCE, narg); /* MP aware */ 1272 1273 /* 1274 * NOTE: All system calls run MPSAFE now. The system call itself 1275 * is responsible for getting the MP lock. 1276 */ 1277 error = (*callp->sy_call)(&args); 1278 1279 #if 0 1280 kprintf("system call %d returned %d\n", code, error); 1281 #endif 1282 1283 out: 1284 /* 1285 * MP SAFE (we may or may not have the MP lock at this point) 1286 */ 1287 switch (error) { 1288 case 0: 1289 /* 1290 * Reinitialize proc pointer `p' as it may be different 1291 * if this is a child returning from fork syscall. 1292 */ 1293 p = curproc; 1294 lp = curthread->td_lwp; 1295 frame->tf_rax = args.sysmsg_fds[0]; 1296 frame->tf_rdx = args.sysmsg_fds[1]; 1297 frame->tf_rflags &= ~PSL_C; 1298 break; 1299 case ERESTART: 1300 /* 1301 * Reconstruct pc, we know that 'syscall' is 2 bytes. 1302 * We have to do a full context restore so that %r10 1303 * (which was holding the value of %rcx) is restored for 1304 * the next iteration. 1305 */ 1306 frame->tf_rip -= frame->tf_err; 1307 frame->tf_r10 = frame->tf_rcx; 1308 break; 1309 case EJUSTRETURN: 1310 break; 1311 case EASYNC: 1312 panic("Unexpected EASYNC return value (for now)"); 1313 default: 1314 bad: 1315 if (p->p_sysent->sv_errsize) { 1316 if (error >= p->p_sysent->sv_errsize) 1317 error = -1; /* XXX */ 1318 else 1319 error = p->p_sysent->sv_errtbl[error]; 1320 } 1321 frame->tf_rax = error; 1322 frame->tf_rflags |= PSL_C; 1323 break; 1324 } 1325 1326 /* 1327 * Traced syscall. trapsignal() is not MP aware. 1328 */ 1329 if (orig_tf_rflags & PSL_T) { 1330 MAKEMPSAFE(have_mplock); 1331 frame->tf_rflags &= ~PSL_T; 1332 trapsignal(lp, SIGTRAP, 0); 1333 } 1334 1335 /* 1336 * Handle reschedule and other end-of-syscall issues 1337 */ 1338 userret(lp, frame, sticks); 1339 1340 #ifdef KTRACE 1341 if (KTRPOINT(td, KTR_SYSRET)) { 1342 MAKEMPSAFE(have_mplock); 1343 ktrsysret(lp, code, error, args.sysmsg_result); 1344 } 1345 #endif 1346 1347 /* 1348 * This works because errno is findable through the 1349 * register set. If we ever support an emulation where this 1350 * is not the case, this code will need to be revisited. 1351 */ 1352 STOPEVENT(p, S_SCX, code); 1353 1354 userexit(lp); 1355 #ifdef SMP 1356 /* 1357 * Release the MP lock if we had to get it 1358 */ 1359 KASSERT(td->td_mpcount == have_mplock, 1360 ("badmpcount syscall2/end from %p", (void *)frame->tf_rip)); 1361 if (have_mplock) 1362 rel_mplock(); 1363 #endif 1364 KTR_LOG(kernentry_syscall_ret, lp->lwp_proc->p_pid, lp->lwp_tid, error); 1365 #ifdef INVARIANTS 1366 KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), 1367 ("syscall: critical section count mismatch! %d/%d", 1368 crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); 1369 #endif 1370 } 1371 1372 void 1373 fork_return(struct lwp *lp, struct trapframe *frame) 1374 { 1375 frame->tf_rax = 0; /* Child returns zero */ 1376 frame->tf_rflags &= ~PSL_C; /* success */ 1377 frame->tf_rdx = 1; 1378 1379 generic_lwp_return(lp, frame); 1380 KTR_LOG(kernentry_fork_ret, lp->lwp_proc->p_pid, lp->lwp_tid); 1381 } 1382 1383 /* 1384 * Simplified back end of syscall(), used when returning from fork() 1385 * or lwp_create() directly into user mode. MP lock is held on entry and 1386 * should be released on return. This code will return back into the fork 1387 * trampoline code which then runs doreti. 1388 */ 1389 void 1390 generic_lwp_return(struct lwp *lp, struct trapframe *frame) 1391 { 1392 struct proc *p = lp->lwp_proc; 1393 1394 /* 1395 * Newly forked processes are given a kernel priority. We have to 1396 * adjust the priority to a normal user priority and fake entry 1397 * into the kernel (call userenter()) to install a passive release 1398 * function just in case userret() decides to stop the process. This 1399 * can occur when ^Z races a fork. If we do not install the passive 1400 * release function the current process designation will not be 1401 * released when the thread goes to sleep. 1402 */ 1403 lwkt_setpri_self(TDPRI_USER_NORM); 1404 userenter(lp->lwp_thread, p); 1405 userret(lp, frame, 0); 1406 #ifdef KTRACE 1407 if (KTRPOINT(lp->lwp_thread, KTR_SYSRET)) 1408 ktrsysret(lp, SYS_fork, 0, 0); 1409 #endif 1410 p->p_flag |= P_PASSIVE_ACQ; 1411 userexit(lp); 1412 p->p_flag &= ~P_PASSIVE_ACQ; 1413 #ifdef SMP 1414 KKASSERT(lp->lwp_thread->td_mpcount == 1); 1415 rel_mplock(); 1416 #endif 1417 } 1418 1419 /* 1420 * doreti has turned into this. The frame is directly on the stack. We 1421 * pull everything else we need (fpu and tls context) from the current 1422 * thread. 1423 * 1424 * Note on fpu interactions: In a virtual kernel, the fpu context for 1425 * an emulated user mode process is not shared with the virtual kernel's 1426 * fpu context, so we only have to 'stack' fpu contexts within the virtual 1427 * kernel itself, and not even then since the signal() contexts that we care 1428 * about save and restore the FPU state (I think anyhow). 1429 * 1430 * vmspace_ctl() returns an error only if it had problems instaling the 1431 * context we supplied or problems copying data to/from our VM space. 1432 */ 1433 void 1434 go_user(struct intrframe *frame) 1435 { 1436 struct trapframe *tf = (void *)&frame->if_rdi; 1437 int r; 1438 1439 /* 1440 * Interrupts may be disabled on entry, make sure all signals 1441 * can be received before beginning our loop. 1442 */ 1443 sigsetmask(0); 1444 1445 /* 1446 * Switch to the current simulated user process, then call 1447 * user_trap() when we break out of it (usually due to a signal). 1448 */ 1449 for (;;) { 1450 /* 1451 * Tell the real kernel whether it is ok to use the FP 1452 * unit or not. 1453 */ 1454 if (mdcpu->gd_npxthread == curthread) { 1455 tf->tf_xflags &= ~PGEX_FPFAULT; 1456 } else { 1457 tf->tf_xflags |= PGEX_FPFAULT; 1458 } 1459 1460 /* 1461 * Run emulated user process context. This call interlocks 1462 * with new mailbox signals. 1463 * 1464 * Set PGEX_U unconditionally, indicating a user frame (the 1465 * bit is normally set only by T_PAGEFLT). 1466 */ 1467 r = vmspace_ctl(&curproc->p_vmspace->vm_pmap, VMSPACE_CTL_RUN, 1468 tf, &curthread->td_savevext); 1469 frame->if_xflags |= PGEX_U; 1470 #if 0 1471 kprintf("GO USER %d trap %ld EVA %08lx RIP %08lx RSP %08lx XFLAGS %02lx/%02lx\n", 1472 r, tf->tf_trapno, tf->tf_addr, tf->tf_rip, tf->tf_rsp, 1473 tf->tf_xflags, frame->if_xflags); 1474 #endif 1475 if (r < 0) { 1476 if (errno != EINTR) 1477 panic("vmspace_ctl failed error %d", errno); 1478 } else { 1479 if (tf->tf_trapno) { 1480 user_trap(tf); 1481 } 1482 } 1483 if (mycpu->gd_reqflags & RQF_AST_MASK) { 1484 tf->tf_trapno = T_ASTFLT; 1485 user_trap(tf); 1486 } 1487 tf->tf_trapno = 0; 1488 } 1489 } 1490 1491 /* 1492 * If PGEX_FPFAULT is set then set FP_VIRTFP in the PCB to force a T_DNA 1493 * fault (which is then passed back to the virtual kernel) if an attempt is 1494 * made to use the FP unit. 1495 * 1496 * XXX this is a fairly big hack. 1497 */ 1498 void 1499 set_vkernel_fp(struct trapframe *frame) 1500 { 1501 struct thread *td = curthread; 1502 1503 if (frame->tf_xflags & PGEX_FPFAULT) { 1504 td->td_pcb->pcb_flags |= FP_VIRTFP; 1505 if (mdcpu->gd_npxthread == td) 1506 npxexit(); 1507 } else { 1508 td->td_pcb->pcb_flags &= ~FP_VIRTFP; 1509 } 1510 } 1511 1512 /* 1513 * Called from vkernel_trap() to fixup the vkernel's syscall 1514 * frame for vmspace_ctl() return. 1515 */ 1516 void 1517 cpu_vkernel_trap(struct trapframe *frame, int error) 1518 { 1519 frame->tf_rax = error; 1520 if (error) 1521 frame->tf_rflags |= PSL_C; 1522 else 1523 frame->tf_rflags &= ~PSL_C; 1524 } 1525