1 /*- 2 * Copyright (C) 1994, David Greenman 3 * Copyright (c) 1990, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * the University of Utah, and William Jolitz. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 38 * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $ 39 */ 40 41 /* 42 * x86_64 Trap and System call handling 43 */ 44 45 #include "use_isa.h" 46 47 #include "opt_ddb.h" 48 #include "opt_ktrace.h" 49 50 #include <sys/param.h> 51 #include <sys/systm.h> 52 #include <sys/proc.h> 53 #include <sys/pioctl.h> 54 #include <sys/kernel.h> 55 #include <sys/resourcevar.h> 56 #include <sys/signalvar.h> 57 #include <sys/signal2.h> 58 #include <sys/syscall.h> 59 #include <sys/sysctl.h> 60 #include <sys/sysent.h> 61 #include <sys/vmmeter.h> 62 #include <sys/malloc.h> 63 #ifdef KTRACE 64 #include <sys/ktrace.h> 65 #endif 66 #include <sys/ktr.h> 67 #include <sys/vkernel.h> 68 #include <sys/sysmsg.h> 69 #include <sys/vmspace.h> 70 71 #include <vm/vm.h> 72 #include <vm/vm_param.h> 73 #include <sys/lock.h> 74 #include <vm/pmap.h> 75 #include <vm/vm_kern.h> 76 #include <vm/vm_map.h> 77 #include <vm/vm_page.h> 78 #include <vm/vm_extern.h> 79 80 #include <machine/cpu.h> 81 #include <machine/md_var.h> 82 #include <machine/pcb.h> 83 #include <machine/smp.h> 84 #include <machine/tss.h> 85 #include <machine/globaldata.h> 86 87 #include <ddb/ddb.h> 88 89 #include <sys/msgport2.h> 90 #include <sys/thread2.h> 91 #include <sys/mplock2.h> 92 93 int (*pmath_emulate) (struct trapframe *); 94 95 static int trap_pfault (struct trapframe *, int, vm_offset_t); 96 static void trap_fatal 
			(struct trapframe *, int, vm_offset_t);
void dblfault_handler (void);
extern int vmm_enabled;

/* Rate-limits the seg-fault diagnostic printed by trap_pfault() */
static struct krate segfltrate = { 1 };

#if 0
extern inthand_t IDTVEC(syscall);
#endif

/*
 * Human-readable names for trap numbers, indexed by T_* trap code.
 * Entries marked "unused" correspond to trap numbers not delivered on
 * this platform.  MAX_TRAP_MSG is the largest valid index.
 */
#define MAX_TRAP_MSG		30
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"system forced exception",		/*  7 T_ASTFLT */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
	"SIMD floating-point exception",	/* 29 T_XMMFLT */
	"reserved (unknown) fault",		/* 30 T_RESERVED */
};

/* Tunables controlling NMI behavior; adjustable via sysctl machdep.* */
#ifdef DDB
static int ddb_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
	&ddb_on_nmi, 0, "Go to DDB on NMI");
#endif
static int panic_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
	&panic_on_nmi, 0, "Panic on NMI");

/*
 * Passively intercepts the thread switch function to increase
 * the thread priority from a user priority to a kernel priority, reducing
 * syscall and trap overhead for the case where no switch
 * occurs.
 *
 * Synchronizes td_ucred with p_ucred.  This is used by system calls,
 * signal handling, faults, AST traps, and anything else that enters the
 * kernel from userland and provides the kernel with a stable read-only
 * copy of the process ucred.
 */
static __inline void
userenter(struct thread *curtd, struct proc *curp)
{
	struct ucred *ocred;
	struct ucred *ncred;

	curtd->td_release = lwkt_passive_release;

	/* Refresh the thread's cached ucred if the process cred changed */
	if (curtd->td_ucred != curp->p_ucred) {
		ncred = crhold(curp->p_ucred);
		ocred = curtd->td_ucred;
		curtd->td_ucred = ncred;
		if (ocred)
			crfree(ocred);
	}
}

/*
 * Handle signals, profiling, and other AST's and/or tasks that
 * must be completed before we can return to or try to return to userland.
 *
 * Note that td_sticks is a 64 bit quantity, but there's no point doing 64
 * arithmatic on the delta calculation so the absolute tick values are
 * truncated to an integer.
 *
 * Loops via 'goto recheck' until no more work is pending, since handling
 * one condition (e.g. a stop) may cause another to become pending.
 */
static void
userret(struct lwp *lp, struct trapframe *frame, int sticks)
{
	struct proc *p = lp->lwp_proc;
	int sig;
	int ptok;

	/*
	 * Charge system time if profiling.  Note: times are in microseconds.
	 * This may do a copyout and block, so do it first even though it
	 * means some system time will be charged as user time.
	 */
	if (p->p_flags & P_PROFIL) {
		addupc_task(p, frame->tf_rip,
		    (u_int)((int)lp->lwp_thread->td_sticks - sticks));
	}

recheck:
	/*
	 * Specific on-return-to-usermode checks (LWP_MP_WEXIT,
	 * LWP_MP_VNLRU, etc).
	 */
	if (lp->lwp_mpflags & LWP_MP_URETMASK)
		lwpuserret(lp);

	/*
	 * Block here if we are in a stopped state.
	 */
	if (STOPLWP(p, lp)) {
		lwkt_gettoken(&p->p_token);
		tstop();
		lwkt_reltoken(&p->p_token);
		goto recheck;
	}

	/*
	 * Post any pending upcalls.  If running a virtual kernel be sure
	 * to restore the virtual kernel's vmspace before posting the upcall.
	 */
	if (p->p_flags & (P_SIGVTALRM | P_SIGPROF)) {
		lwkt_gettoken(&p->p_token);
		if (p->p_flags & P_SIGVTALRM) {
			p->p_flags &= ~P_SIGVTALRM;
			ksignal(p, SIGVTALRM);
		}
		if (p->p_flags & P_SIGPROF) {
			p->p_flags &= ~P_SIGPROF;
			ksignal(p, SIGPROF);
		}
		lwkt_reltoken(&p->p_token);
		goto recheck;
	}

	/*
	 * Post any pending signals
	 *
	 * WARNING!  postsig() can exit and not return.
	 */
	if ((sig = CURSIG_LCK_TRACE(lp, &ptok)) != 0) {
		postsig(sig, ptok);
		goto recheck;
	}

	/*
	 * In a multi-threaded program it is possible for a thread to change
	 * signal state during a system call which temporarily changes the
	 * signal mask.  In this case postsig() might not be run and we
	 * have to restore the mask ourselves.
	 */
	if (lp->lwp_flags & LWP_OLDMASK) {
		lp->lwp_flags &= ~LWP_OLDMASK;
		lp->lwp_sigmask = lp->lwp_oldsigmask;
		goto recheck;
	}
}

/*
 * Cleanup from userenter and any passive release that might have occured.
 * We must reclaim the current-process designation before we can return
 * to usermode.  We also handle both LWKT and USER reschedule requests.
 */
static __inline void
userexit(struct lwp *lp)
{
	struct thread *td = lp->lwp_thread;
	/* globaldata_t gd = td->td_gd; */

	/*
	 * Handle stop requests at kernel priority.  Any requests queued
	 * after this loop will generate another AST.
	 */
	while (STOPLWP(lp->lwp_proc, lp)) {
		lwkt_gettoken(&lp->lwp_proc->p_token);
		tstop();
		lwkt_reltoken(&lp->lwp_proc->p_token);
	}

	/*
	 * Reduce our priority in preparation for a return to userland.  If
	 * our passive release function was still in place, our priority was
	 * never raised and does not need to be reduced.
	 */
	lwkt_passive_recover(td);

	/*
	 * Become the current user scheduled process if we aren't already,
	 * and deal with reschedule requests and other factors.
	 */
	lp->lwp_proc->p_usched->acquire_curproc(lp);
	/* WARNING: we may have migrated cpu's */
	/* gd = td->td_gd; */
}

/* KTR tracing of kernel entry/exit events (traps, syscalls, fork returns) */
#if !defined(KTR_KERNENTRY)
#define	KTR_KERNENTRY	KTR_ALL
#endif
KTR_INFO_MASTER(kernentry);
KTR_INFO(KTR_KERNENTRY, kernentry, trap, 0,
	 "TRAP(pid %hd, tid %hd, trapno %ld, eva %lu)",
	 pid_t pid, lwpid_t tid, register_t trapno, vm_offset_t eva);
KTR_INFO(KTR_KERNENTRY, kernentry, trap_ret, 0, "TRAP_RET(pid %hd, tid %hd)",
	 pid_t pid, lwpid_t tid);
KTR_INFO(KTR_KERNENTRY, kernentry, syscall, 0, "SYSC(pid %hd, tid %hd, nr %ld)",
	 pid_t pid, lwpid_t tid, register_t trapno);
KTR_INFO(KTR_KERNENTRY, kernentry, syscall_ret, 0, "SYSRET(pid %hd, tid %hd, err %d)",
	 pid_t pid, lwpid_t tid, int err);
KTR_INFO(KTR_KERNENTRY, kernentry, fork_ret, 0, "FORKRET(pid %hd, tid %hd)",
	 pid_t pid, lwpid_t tid);

/*
 * Exception, fault, and trap interface to the kernel.
 * This common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed.
 *
 * This function is also called from doreti in an interlock to handle ASTs.
 * For example:  hardwareint->INTROUTINE->(set ast)->doreti->trap
 *
 * NOTE!  We have to retrieve the fault address prior to obtaining the
 * MP lock because get_mplock() may switch out.  YYY cr2 really ought
 * to be retrieved by the assembly code, not here.
 *
 * XXX gd_trap_nesting_level currently prevents lwkt_switch() from panicing
 * if an attempt is made to switch from a fast interrupt or IPI.  This is
 * necessary to properly take fatal kernel traps on SMP machines if
 * get_mplock() has to block.
 */

void
user_trap(struct trapframe *frame)
{
	struct globaldata *gd = mycpu;
	struct thread *td = gd->gd_curthread;
	struct lwp *lp = td->td_lwp;
	struct proc *p;
	int sticks = 0;
	int i = 0, ucode = 0, type, code;	/* i = signal to post, 0 = none */
#ifdef INVARIANTS
	int crit_count = td->td_critcount;
	lwkt_tokref_t curstop = td->td_toks_stop;
#endif
	vm_offset_t eva;

	p = td->td_proc;

	/* Fault address is only meaningful for page faults */
	if (frame->tf_trapno == T_PAGEFLT)
		eva = frame->tf_addr;
	else
		eva = 0;
#if 0
	kprintf("USER_TRAP AT %08lx xflags %ld trapno %ld eva %08lx\n",
		frame->tf_rip, frame->tf_xflags, frame->tf_trapno, eva);
#endif

	/*
	 * Everything coming from user mode runs through user_trap,
	 * including system calls.
	 */
	if (frame->tf_trapno == T_FAST_SYSCALL) {
		syscall2(frame);
		return;
	}

	KTR_LOG(kernentry_trap, lp->lwp_proc->p_pid, lp->lwp_tid,
		frame->tf_trapno, eva);

#ifdef DDB
	if (db_active) {
		eva = (frame->tf_trapno == T_PAGEFLT ? rcr2() : 0);
		++gd->gd_trap_nesting_level;
		trap_fatal(frame, TRUE, eva);
		--gd->gd_trap_nesting_level;
		goto out2;
	}
#endif

	type = frame->tf_trapno;
	code = frame->tf_err;

	userenter(td, p);

	sticks = (int)td->td_sticks;
	lp->lwp_md.md_regs = frame;

	/*
	 * Dispatch on the trap type.  Most cases set a signal number in
	 * 'i' and a signal code in 'ucode' to be posted below; cases that
	 * fully handle the trap jump to 'out' (normal return path) or
	 * 'out2' (skip userret/userexit).
	 */
	switch (type) {
	case T_PRIVINFLT:	/* privileged instruction fault */
		i = SIGILL;
		ucode = ILL_PRVOPC;
		break;

	case T_BPTFLT:		/* bpt instruction fault */
	case T_TRCTRAP:		/* trace trap */
		frame->tf_rflags &= ~PSL_T;
		i = SIGTRAP;
		ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
		break;

	case T_ARITHTRAP:	/* arithmetic trap */
		ucode = code;
		i = SIGFPE;
		break;

	case T_ASTFLT:		/* Allow process switch */
		mycpu->gd_cnt.v_soft++;
		if (mycpu->gd_reqflags & RQF_AST_OWEUPC) {
			atomic_clear_int(&mycpu->gd_reqflags, RQF_AST_OWEUPC);
			addupc_task(p, p->p_prof.pr_addr, p->p_prof.pr_ticks);
		}
		goto out;

	/*
	 * The following two traps can happen in
	 * vm86 mode, and, if so, we want to handle
	 * them specially.
	 */
	case T_PROTFLT:		/* general protection fault */
	case T_STKFLT:		/* stack fault */
#if 0
		if (frame->tf_eflags & PSL_VM) {
			i = vm86_emulate((struct vm86frame *)frame);
			if (i == 0)
				goto out;
			break;
		}
#endif
		/* FALL THROUGH */

	case T_SEGNPFLT:	/* segment not present fault */
	case T_TSSFLT:		/* invalid TSS fault */
	case T_DOUBLEFLT:	/* double fault */
	default:
		i = SIGBUS;
		ucode = code + BUS_SEGM_FAULT ;
		break;

	case T_PAGEFLT:		/* page fault */
		/*
		 * trap_pfault() returns 0 on success, -1 if it already
		 * handled a fatal condition, or a signal number to post.
		 */
		i = trap_pfault(frame, TRUE, eva);
		if (i == -1 || i == 0)
			goto out;

		if (i == SIGSEGV)
			ucode = SEGV_MAPERR;
		else {
			i = SIGSEGV;
			ucode = SEGV_ACCERR;
		}
		break;

	case T_DIVIDE:		/* integer divide fault */
		ucode = FPE_INTDIV;
		i = SIGFPE;
		break;

#if NISA > 0
	case T_NMI:
		/* machine/parity/power fail/"kitchen sink" faults */
		if (isa_nmi(code) == 0) {
#ifdef DDB
			/*
			 * NMI can be hooked up to a pushbutton
			 * for debugging.
			 */
			if (ddb_on_nmi) {
				kprintf ("NMI ... going to debugger\n");
				kdb_trap(type, 0, frame);
			}
#endif /* DDB */
			goto out2;
		} else if (panic_on_nmi)
			panic("NMI indicates hardware failure");
		break;
#endif /* NISA > 0 */

	case T_OFLOW:		/* integer overflow fault */
		ucode = FPE_INTOVF;
		i = SIGFPE;
		break;

	case T_BOUND:		/* bounds check fault */
		ucode = FPE_FLTSUB;
		i = SIGFPE;
		break;

	case T_DNA:
		/*
		 * Virtual kernel intercept - pass the DNA exception
		 * to the (emulated) virtual kernel if it asked to handle
		 * it.  This occurs when the virtual kernel is holding
		 * onto the FP context for a different emulated
		 * process then the one currently running.
		 *
		 * We must still call npxdna() since we may have
		 * saved FP state that the (emulated) virtual kernel
		 * needs to hand over to a different emulated process.
		 */
		if (lp->lwp_vkernel && lp->lwp_vkernel->ve &&
		    (td->td_pcb->pcb_flags & FP_VIRTFP)
		) {
			npxdna(frame);
			break;
		}

		/*
		 * The kernel may have switched out the FP unit's
		 * state, causing the user process to take a fault
		 * when it tries to use the FP unit.  Restore the
		 * state here
		 */
		if (npxdna(frame)) {
			gd->gd_cnt.v_trap++;
			goto out;
		}
		if (!pmath_emulate) {
			i = SIGFPE;
			ucode = FPE_FPU_NP_TRAP;
			break;
		}
		i = (*pmath_emulate)(frame);
		if (i == 0) {
			if (!(frame->tf_rflags & PSL_T))
				goto out2;
			frame->tf_rflags &= ~PSL_T;
			i = SIGTRAP;
		}
		/* else ucode = emulator_only_knows() XXX */
		break;

	case T_FPOPFLT:		/* FPU operand fetch fault */
		ucode = T_FPOPFLT;
		i = SIGILL;
		break;

	case T_XMMFLT:		/* SIMD floating-point exception */
		ucode = 0; /* XXX */
		i = SIGFPE;
		break;
	}

	/*
	 * Virtual kernel intercept - if the fault is directly related to a
	 * VM context managed by a virtual kernel then let the virtual kernel
	 * handle it.
	 */
	if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
		vkernel_trap(lp, frame);
		goto out;
	}

	/*
	 * Translate fault for emulators (e.g. Linux)
	 */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	trapsignal(lp, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s",
			trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif

out:
	userret(lp, frame, sticks);
	userexit(lp);
out2:	;
	KTR_LOG(kernentry_trap_ret, lp->lwp_proc->p_pid, lp->lwp_tid);
#ifdef INVARIANTS
	/*
	 * NOTE(review): the message prints crit_count/td_pri rather than
	 * crit_count/td_critcount (the value actually tested) — inherited
	 * oddity, verify before relying on the printed values.
	 */
	KASSERT(crit_count == td->td_critcount,
		("trap: critical section count mismatch! %d/%d",
		crit_count, td->td_pri));
	KASSERT(curstop == td->td_toks_stop,
		("trap: extra tokens held after trap! %ld/%ld",
		curstop - &td->td_toks_base,
		td->td_toks_stop - &td->td_toks_base));
#endif
}

/*
 * Handle traps taken while already running in the kernel.  Unlike
 * user_trap() this does not run userenter()/userret() processing;
 * recoverable cases jump to out2, everything else falls through to
 * trapsignal() or trap_fatal().
 */
void
kern_trap(struct trapframe *frame)
{
	struct globaldata *gd = mycpu;
	struct thread *td = gd->gd_curthread;
	struct lwp *lp;
	struct proc *p;
	int i = 0, ucode = 0, type, code;
#ifdef INVARIANTS
	int crit_count = td->td_critcount;
	lwkt_tokref_t curstop = td->td_toks_stop;
#endif
	vm_offset_t eva;

	lp = td->td_lwp;
	p = td->td_proc;

	/* Fault address is only meaningful for page faults */
	if (frame->tf_trapno == T_PAGEFLT)
		eva = frame->tf_addr;
	else
		eva = 0;

#ifdef DDB
	if (db_active) {
		++gd->gd_trap_nesting_level;
		trap_fatal(frame, FALSE, eva);
		--gd->gd_trap_nesting_level;
		goto out2;
	}
#endif

	type = frame->tf_trapno;
	code = frame->tf_err;

#if 0
kernel_trap:
#endif
	/* kernel trap */

	switch (type) {
	case T_PAGEFLT:			/* page fault */
		trap_pfault(frame, FALSE, eva);
		goto out2;

	case T_DNA:
		/*
		 * The kernel may be using npx for copying or other
		 * purposes.
		 *
		 * NOTE(review): panic() below presumably does not return,
		 * which leaves the npxdna() call unreachable — looks
		 * intentional for the vkernel platform; confirm.
		 */
		panic("kernel NPX should not happen");
		if (npxdna(frame))
			goto out2;
		break;

	case T_PROTFLT:		/* general protection fault */
	case T_SEGNPFLT:	/* segment not present fault */
		/*
		 * Invalid segment selectors and out of bounds
		 * %eip's and %esp's can be set up in user mode.
		 * This causes a fault in kernel mode when the
		 * kernel tries to return to user mode.  We want
		 * to get this fault so that we can fix the
		 * problem here and not have to check all the
		 * selectors and pointers when the user changes
		 * them.
		 */
		if (mycpu->gd_intr_nesting_level == 0) {
			if (td->td_pcb->pcb_onfault) {
				frame->tf_rip =
				    (register_t)td->td_pcb->pcb_onfault;
				goto out2;
			}
		}
		break;

	case T_TSSFLT:
		/*
		 * PSL_NT can be set in user mode and isn't cleared
		 * automatically when the kernel is entered.  This
		 * causes a TSS fault when the kernel attempts to
		 * `iret' because the TSS link is uninitialized.  We
		 * want to get this fault so that we can fix the
		 * problem here and not every time the kernel is
		 * entered.
		 */
		if (frame->tf_rflags & PSL_NT) {
			frame->tf_rflags &= ~PSL_NT;
			goto out2;
		}
		break;

	case T_TRCTRAP:	 /* trace trap */
#if 0
		if (frame->tf_eip == (int)IDTVEC(syscall)) {
			/*
			 * We've just entered system mode via the
			 * syscall lcall.  Continue single stepping
			 * silently until the syscall handler has
			 * saved the flags.
			 */
			goto out2;
		}
		if (frame->tf_eip == (int)IDTVEC(syscall) + 1) {
			/*
			 * The syscall handler has now saved the
			 * flags.  Stop single stepping it.
			 */
			frame->tf_eflags &= ~PSL_T;
			goto out2;
		}
#endif
#if 0
		/*
		 * Ignore debug register trace traps due to
		 * accesses in the user's address space, which
		 * can happen under several conditions such as
		 * if a user sets a watchpoint on a buffer and
		 * then passes that buffer to a system call.
		 * We still want to get TRCTRAPS for addresses
		 * in kernel space because that is useful when
		 * debugging the kernel.
		 */
		if (user_dbreg_trap()) {
			/*
			 * Reset breakpoint bits because the
			 * processor doesn't
			 */
			load_dr6(rdr6() & 0xfffffff0);
			goto out2;
		}
#endif
		/*
		 * Fall through (TRCTRAP kernel mode, kernel address)
		 */
	case T_BPTFLT:
		/*
		 * If DDB is enabled, let it handle the debugger trap.
		 * Otherwise, debugger traps "can't happen".
		 */
#ifdef DDB
		if (kdb_trap (type, 0, frame))
			goto out2;
#endif
		break;
	case T_DIVIDE:
		trap_fatal(frame, FALSE, eva);
		goto out2;
	case T_NMI:
		trap_fatal(frame, FALSE, eva);
		goto out2;
	case T_SYSCALL80:
	case T_FAST_SYSCALL:
		/*
		 * Ignore this trap generated from a spurious SIGTRAP.
		 *
		 * single stepping in / syscalls leads to spurious / SIGTRAP
		 * so ignore
		 *
		 * Haiku (c) 2007 Simon 'corecode' Schubert
		 */
		goto out2;
	}

	/*
	 * Translate fault for emulators (e.g. Linux)
	 */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	gd->gd_cnt.v_trap++;
	trapsignal(lp, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s",
			trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif

out2:
	;
#ifdef INVARIANTS
	KASSERT(crit_count == td->td_critcount,
		("trap: critical section count mismatch! %d/%d",
		crit_count, td->td_pri));
	KASSERT(curstop == td->td_toks_stop,
		("trap: extra tokens held after trap! %ld/%ld",
		curstop - &td->td_toks_base,
		td->td_toks_stop - &td->td_toks_base));
#endif
}

/*
 * Resolve a page fault via vm_fault().
 *
 * Returns 0 on success, -1 when a fatal kernel-mode fault was already
 * reported via trap_fatal(), or a signal number (SIGBUS/SIGSEGV) for
 * the caller to post to the process.
 */
int
trap_pfault(struct trapframe *frame, int usermode, vm_offset_t eva)
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	thread_t td = curthread;
	struct lwp *lp = td->td_lwp;
	int fault_flags;

	va = trunc_page(eva);
	if (usermode == FALSE) {
		/*
		 * This is a fault on kernel virtual memory.
		 */
		map = &kernel_map;
	} else {
		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		if (lp != NULL)
			vm = lp->lwp_vmspace;

		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;
	}

	/* Derive the required protection from the hardware error code */
	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_READ | VM_PROT_WRITE;
	else if (frame->tf_err & PGEX_I)
		ftype = VM_PROT_EXECUTE;
	else
		ftype = VM_PROT_READ;

	if (map != &kernel_map) {
		/*
		 * Keep swapout from messing with us during this
		 *	critical time.
		 */
		PHOLD(lp->lwp_proc);

#if 0
		/*
		 * Grow the stack if necessary
		 */
		/* grow_stack returns false only if va falls into
		 * a growable stack region and the stack growth
		 * fails.  It returns true if va was not within
		 * a growable stack region, or if the stack
		 * growth succeeded.
		 */
		if (!grow_stack (map, va)) {
			rv = KERN_FAILURE;
			PRELE(lp->lwp_proc);
			goto nogo;
		}
#endif

		fault_flags = 0;
		if (usermode)
			fault_flags |= VM_FAULT_BURST | VM_FAULT_USERMODE;
		if (ftype & VM_PROT_WRITE)
			fault_flags |= VM_FAULT_DIRTY;
		else
			fault_flags |= VM_FAULT_NORMAL;
		rv = vm_fault(map, va, ftype, fault_flags);

		PRELE(lp->lwp_proc);
	} else {
		/*
		 * Don't have to worry about process locking or stacks in the kernel.
		 */
		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		/* Kernel-mode fault: try the onfault recovery vector first */
		if (td->td_gd->gd_intr_nesting_level == 0 &&
		    td->td_pcb->pcb_onfault) {
			frame->tf_rip = (register_t)td->td_pcb->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, usermode, eva);
		return (-1);
	}

	/*
	 * NOTE: on x86_64 we have a tf_addr field in the trapframe, no
	 * kludge is needed to pass the fault address to signal handlers.
	 */
	struct proc *p = td->td_proc;
	krateprintf(&segfltrate,
		    "seg-fault accessing address %p "
		    "rip=%p pid=%d p_comm=%s\n",
		    (void *)va,
		    (void *)frame->tf_rip, p->p_pid, p->p_comm);
	/* Debugger("seg-fault"); */

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}

/*
 * Print diagnostics for an unrecoverable trap and panic (or drop into
 * the debugger when DDB/KDB is configured).  Does not return unless a
 * debugger takes over.
 */
static void
trap_fatal(struct trapframe *frame, int usermode, vm_offset_t eva)
{
	int code, type, ss;
	long rsp;

	code = frame->tf_xflags;
	type = frame->tf_trapno;

	if (type <= MAX_TRAP_MSG) {
		kprintf("\n\nFatal trap %d: %s while in %s mode\n",
			type, trap_msg[type],
			(usermode ? "user" : "kernel"));
	}
	/* two separate prints in case of a trap on an unmapped page */
	kprintf("cpuid = %d\n", mycpu->gd_cpuid);
	if (type == T_PAGEFLT) {
		kprintf("fault virtual address = %p\n", (void *)eva);
		kprintf("fault code = %s %s, %s\n",
			usermode ? "user" : "supervisor",
			code & PGEX_W ? "write" : "read",
			code & PGEX_P ? "protection violation" : "page not present");
	}
	kprintf("instruction pointer = 0x%lx:0x%lx\n",
		frame->tf_cs & 0xffff, frame->tf_rip);
	if (usermode) {
		ss = frame->tf_ss & 0xffff;
		rsp = frame->tf_rsp;
	} else {
		ss = GSEL(GDATA_SEL, SEL_KPL);
		rsp = (long)&frame->tf_rsp;
	}
	kprintf("stack pointer = 0x%x:0x%lx\n", ss, rsp);
	kprintf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp);
	kprintf("processor eflags = ");
	if (frame->tf_rflags & PSL_T)
		kprintf("trace trap, ");
	if (frame->tf_rflags & PSL_I)
		kprintf("interrupt enabled, ");
	if (frame->tf_rflags & PSL_NT)
		kprintf("nested task, ");
	if (frame->tf_rflags & PSL_RF)
		kprintf("resume, ");
#if 0
	if (frame->tf_eflags & PSL_VM)
		kprintf("vm86, ");
#endif
	kprintf("IOPL = %jd\n", (intmax_t)((frame->tf_rflags & PSL_IOPL) >> 12));
	kprintf("current process = ");
	if (curproc) {
		kprintf("%lu (%s)\n",
		    (u_long)curproc->p_pid, curproc->p_comm ?
		    curproc->p_comm : "");
	} else {
		kprintf("Idle\n");
	}
	kprintf("current thread = pri %d ", curthread->td_pri);
	if (curthread->td_critcount)
		kprintf("(CRIT)");
	kprintf("\n");
/**
 *  XXX FIXME:
 *	we probably SHOULD have stopped the other CPUs before now!
 *	another CPU COULD have been touching cpl at this moment...
 */
	kprintf(" <- SMP: XXX");
	kprintf("\n");

#ifdef KDB
	if (kdb_trap(&psl))
		return;
#endif
#ifdef DDB
	if ((debugger_on_panic || db_active) && kdb_trap(type, code, frame))
		return;
#endif
	kprintf("trap number = %d\n", type);
	if (type <= MAX_TRAP_MSG)
		panic("%s", trap_msg[type]);
	else
		panic("unknown/reserved trap");
}

/*
 * Double fault handler. Called when a fault occurs while writing
 * a frame for a trap/exception onto the stack.
 * This usually occurs
 * when the stack overflows (such is the case with infinite recursion,
 * for example).
 *
 * XXX Note that the current PTD gets replaced by IdlePTD when the
 * task switch occurs. This means that the stack that was active at
 * the time of the double fault is not available at <kstack> unless
 * the machine was idle when the double fault occurred. The downside
 * of this is that "trace <ebp>" in ddb won't work.
 */
void
dblfault_handler(void)
{
#if 0 /* JG */
	struct mdglobaldata *gd = mdcpu;
#endif

	kprintf("\nFatal double fault:\n");
#if 0 /* JG */
	kprintf("rip = 0x%lx\n", gd->gd_common_tss.tss_rip);
	kprintf("rsp = 0x%lx\n", gd->gd_common_tss.tss_rsp);
	kprintf("rbp = 0x%lx\n", gd->gd_common_tss.tss_rbp);
#endif
	/* two separate prints in case of a trap on an unmapped page */
	kprintf("cpuid = %d\n", mycpu->gd_cpuid);
	panic("double fault");
}

/*
 * syscall2 -	MP aware system call request C handler
 *
 * A system call is essentially treated as a trap except that the
 * MP lock is not held on entry or return.  We are responsible for
 * obtaining the MP lock if necessary and for handling ASTs
 * (e.g. a task switch) prior to return.
 */
void
syscall2(struct trapframe *frame)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct lwp *lp = td->td_lwp;
	struct sysent *callp;
	register_t orig_tf_rflags;
	int sticks;
	int error;
	int narg;
#ifdef INVARIANTS
	int crit_count = td->td_critcount;
	lwkt_tokref_t curstop = td->td_toks_stop;
#endif
	struct sysmsg sysmsg;
	union sysunion *argp;
	u_int code;
	const int regcnt = 6;	/* max syscall args passed in registers */

	mycpu->gd_cnt.v_syscall++;

	KTR_LOG(kernentry_syscall, lp->lwp_proc->p_pid, lp->lwp_tid,
		frame->tf_rax);

	userenter(td, p);	/* lazy raise our priority */

	/*
	 * Misc
	 */
	sticks = (int)td->td_sticks;
	orig_tf_rflags = frame->tf_rflags;

	/*
	 * Virtual kernel intercept - if a VM context managed by a virtual
	 * kernel issues a system call the virtual kernel handles it, not us.
	 * Restore the virtual kernel context and return from its system
	 * call.  The current frame is copied out to the virtual kernel.
	 */
	if (__predict_false(lp->lwp_vkernel && lp->lwp_vkernel->ve)) {
		vkernel_trap(lp, frame);
		error = EJUSTRETURN;
		callp = NULL;
		code = 0;
		goto out;
	}

	/*
	 * Get the system call parameters and account for time
	 */
	lp->lwp_md.md_regs = frame;
	code = frame->tf_rax;

	/* Out-of-range syscall numbers are routed to the nosys handler */
	if (code >= p->p_sysent->sv_size)
		code = SYS___nosys;
	argp = (union sysunion *)&frame->tf_rdi;
	callp = &p->p_sysent->sv_table[code];

	/*
	 * On x86_64 we get up to six arguments in registers.  The rest are
	 * on the stack.  The first six members of 'struct trapframe' happen
	 * to be the registers used to pass arguments, in exactly the right
	 * order.
	 *
	 * Any arguments beyond available argument-passing registers must
	 * be copyin()'d from the user stack.
	 */
	narg = callp->sy_narg;
	if (__predict_false(narg > regcnt)) {
		register_t *argsdst;
		caddr_t params;

		argsdst = (register_t *)&sysmsg.extargs;
		bcopy(argp, argsdst, sizeof(register_t) * regcnt);
		/* skip the return address pushed by the syscall stub */
		params = (caddr_t)frame->tf_rsp + sizeof(register_t);

		KASSERT(params != NULL, ("copyin args with no params!"));
		error = copyin(params, &argsdst[regcnt],
			       (narg - regcnt) * sizeof(register_t));
		argp = (void *)argsdst;
		if (error) {
#ifdef KTRACE
			if (KTRPOINT(td, KTR_SYSCALL)) {
				ktrsyscall(lp, code, narg, argp);
			}
#endif
			goto bad;
		}
	}

#ifdef KTRACE
	if (KTRPOINT(td, KTR_SYSCALL)) {
		ktrsyscall(lp, code, narg, argp);
	}
#endif

	/*
	 * Default return value is 0 (will be copied to %rax).  Double-value
	 * returns use %rax and %rdx.  %rdx is left unchanged for system
	 * calls which return only one result.
	 */
	sysmsg.sysmsg_fds[0] = 0;
	sysmsg.sysmsg_fds[1] = frame->tf_rdx;

	/*
	 * The syscall might manipulate the trap frame. If it does it
	 * will probably return EJUSTRETURN.
	 */
	sysmsg.sysmsg_frame = frame;

	STOPEVENT(p, S_SCE, narg);	/* MP aware */

	/*
	 * NOTE: All system calls run MPSAFE now.  The system call itself
	 *	 is responsible for getting the MP lock.
	 */
	error = (*callp->sy_call)(&sysmsg, argp);

#if 0
	kprintf("system call %d returned %d\n", code, error);
#endif

out:
	/*
	 * MP SAFE (we may or may not have the MP lock at this point)
	 */
	switch (error) {
	case 0:
		/*
		 * Reinitialize proc pointer `p' as it may be different
		 * if this is a child returning from fork syscall.
		 */
		p = curproc;
		lp = curthread->td_lwp;
		frame->tf_rax = sysmsg.sysmsg_fds[0];
		frame->tf_rdx = sysmsg.sysmsg_fds[1];
		frame->tf_rflags &= ~PSL_C;	/* carry clear = success */
		break;
	case ERESTART:
		/*
		 * Reconstruct pc, we know that 'syscall' is 2 bytes.
		 * We have to do a full context restore so that %r10
		 * (which was holding the value of %rcx) is restored for
		 * the next iteration.
		 */
		frame->tf_rip -= frame->tf_err;
		frame->tf_r10 = frame->tf_rcx;
		break;
	case EJUSTRETURN:
		break;
	case EASYNC:
		panic("Unexpected EASYNC return value (for now)");
	default:
bad:
		/* Map the native errno through the emulation's table */
		if (p->p_sysent->sv_errsize) {
			if (error >= p->p_sysent->sv_errsize)
				error = -1;	/* XXX */
			else
				error = p->p_sysent->sv_errtbl[error];
		}
		frame->tf_rax = error;
		frame->tf_rflags |= PSL_C;	/* carry set = error */
		break;
	}

	/*
	 * Traced syscall.  trapsignal() is not MP aware.
	 */
	if (orig_tf_rflags & PSL_T) {
		frame->tf_rflags &= ~PSL_T;
		trapsignal(lp, SIGTRAP, 0);
	}

	/*
	 * Handle reschedule and other end-of-syscall issues
	 */
	userret(lp, frame, sticks);

#ifdef KTRACE
	if (KTRPOINT(td, KTR_SYSRET)) {
		ktrsysret(lp, code, error, sysmsg.sysmsg_result);
	}
#endif

	/*
	 * This works because errno is findable through the
	 * register set.  If we ever support an emulation where this
	 * is not the case, this code will need to be revisited.
	 */
	STOPEVENT(p, S_SCX, code);

	userexit(lp);
	KTR_LOG(kernentry_syscall_ret, lp->lwp_proc->p_pid, lp->lwp_tid, error);
#ifdef INVARIANTS
	/*
	 * NOTE(review): the first KASSERT tests the token stop pointer but
	 * its message/args refer to the critical section count — the
	 * printed values do not correspond to the tested condition; verify
	 * against upstream before trusting the panic text.
	 */
	KASSERT(&td->td_toks_base == td->td_toks_stop,
		("syscall: critical section count mismatch! %d/%d",
		crit_count, td->td_pri));
	KASSERT(curstop == td->td_toks_stop,
		("syscall: extra tokens held after trap! %ld",
		td->td_toks_stop - &td->td_toks_base));
#endif
}

/*
 * Handles the syscall() and __syscall() API
 */
void xsyscall(struct sysmsg *sysmsg, struct nosys_args *uap);

int
sys_xsyscall(struct sysmsg *sysmsg, const struct nosys_args *uap)
{
	struct trapframe *frame;
	struct sysent *callp;
	union sysunion *argp;
	struct thread *td;
	const int regcnt = 5;	/* number of args passed in registers */
	u_int code;
	int error;
	int narg;

	td = curthread;
	frame = sysmsg->sysmsg_frame;
	/* The real syscall number is the first argument (%rdi) */
	code = (u_int)frame->tf_rdi;
	if (code >= td->td_proc->p_sysent->sv_size)
		code = SYS___nosys;
	argp = (union sysunion *)(&frame->tf_rdi + 1);
	callp = &td->td_proc->p_sysent->sv_table[code];
	narg = callp->sy_narg;

	/*
	 * On x86_64 we get up to six arguments in registers.  The rest are
	 * on the stack.  However, for syscall() and __syscall() the syscall
	 * number is inserted as the first argument, so the limit is reduced
	 * by one to five.
1247 */ 1248 if (__predict_false(narg > regcnt)) { 1249 register_t *argsdst; 1250 caddr_t params; 1251 1252 argsdst = (register_t *)&sysmsg->extargs; 1253 bcopy(argp, argsdst, sizeof(register_t) * regcnt); 1254 params = (caddr_t)frame->tf_rsp + sizeof(register_t); 1255 error = copyin(params, &argsdst[regcnt], 1256 (narg - regcnt) * sizeof(register_t)); 1257 argp = (void *)argsdst; 1258 if (error) 1259 return error; 1260 } 1261 1262 #ifdef KTRACE 1263 if (KTRPOINTP(td->td_proc, td, KTR_SYSCALL)) { 1264 ktrsyscall(td->td_lwp, code, narg, argp); 1265 } 1266 #endif 1267 1268 error = (*callp->sy_call)(sysmsg, argp); 1269 1270 #ifdef KTRACE 1271 if (KTRPOINTP(td->td_proc, td, KTR_SYSRET)) { 1272 ktrsysret(td->td_lwp, code, error, sysmsg->sysmsg_result); 1273 } 1274 #endif 1275 1276 return error; 1277 } 1278 1279 /* 1280 * NOTE: mplock not held at any point 1281 */ 1282 void 1283 fork_return(struct lwp *lp, struct trapframe *frame) 1284 { 1285 frame->tf_rax = 0; /* Child returns zero */ 1286 frame->tf_rflags &= ~PSL_C; /* success */ 1287 frame->tf_rdx = 1; 1288 1289 generic_lwp_return(lp, frame); 1290 KTR_LOG(kernentry_fork_ret, lp->lwp_proc->p_pid, lp->lwp_tid); 1291 } 1292 1293 /* 1294 * Simplified back end of syscall(), used when returning from fork() 1295 * directly into user mode. 1296 * 1297 * This code will return back into the fork trampoline code which then 1298 * runs doreti. 1299 * 1300 * NOTE: The mplock is not held at any point. 1301 */ 1302 void 1303 generic_lwp_return(struct lwp *lp, struct trapframe *frame) 1304 { 1305 struct proc *p = lp->lwp_proc; 1306 1307 /* 1308 * Check for exit-race. If one lwp exits the process concurrent with 1309 * another lwp creating a new thread, the two operations may cross 1310 * each other resulting in the newly-created lwp not receiving a 1311 * KILL signal. 1312 */ 1313 if (p->p_flags & P_WEXIT) { 1314 lwpsignal(p, lp, SIGKILL); 1315 } 1316 1317 /* 1318 * Newly forked processes are given a kernel priority. 
We have to 1319 * adjust the priority to a normal user priority and fake entry 1320 * into the kernel (call userenter()) to install a passive release 1321 * function just in case userret() decides to stop the process. This 1322 * can occur when ^Z races a fork. If we do not install the passive 1323 * release function the current process designation will not be 1324 * released when the thread goes to sleep. 1325 */ 1326 lwkt_setpri_self(TDPRI_USER_NORM); 1327 userenter(lp->lwp_thread, p); 1328 userret(lp, frame, 0); 1329 #ifdef KTRACE 1330 if (KTRPOINT(lp->lwp_thread, KTR_SYSRET)) 1331 ktrsysret(lp, SYS_fork, 0, 0); 1332 #endif 1333 lp->lwp_flags |= LWP_PASSIVE_ACQ; 1334 userexit(lp); 1335 lp->lwp_flags &= ~LWP_PASSIVE_ACQ; 1336 } 1337 1338 /* 1339 * doreti has turned into this. The frame is directly on the stack. We 1340 * pull everything else we need (fpu and tls context) from the current 1341 * thread. 1342 * 1343 * Note on fpu interactions: In a virtual kernel, the fpu context for 1344 * an emulated user mode process is not shared with the virtual kernel's 1345 * fpu context, so we only have to 'stack' fpu contexts within the virtual 1346 * kernel itself, and not even then since the signal() contexts that we care 1347 * about save and restore the FPU state (I think anyhow). 1348 * 1349 * vmspace_ctl() returns an error only if it had problems instaling the 1350 * context we supplied or problems copying data to/from our VM space. 1351 */ 1352 void 1353 go_user(struct intrframe *frame) 1354 { 1355 struct trapframe *tf = (void *)&frame->if_rdi; 1356 globaldata_t gd; 1357 int r; 1358 void *id; 1359 1360 /* 1361 * Interrupts may be disabled on entry, make sure all signals 1362 * can be received before beginning our loop. 1363 */ 1364 sigsetmask(0); 1365 1366 /* 1367 * Switch to the current simulated user process, then call 1368 * user_trap() when we break out of it (usually due to a signal). 
1369 */ 1370 for (;;) { 1371 #if 1 1372 /* 1373 * Always make the FPU state correct. This should generally 1374 * be faster because the cost of taking a #NM fault through 1375 * the vkernel to the real kernel is astronomical. 1376 */ 1377 crit_enter(); 1378 tf->tf_xflags &= ~PGEX_FPFAULT; 1379 if (mdcpu->gd_npxthread != curthread) { 1380 if (mdcpu->gd_npxthread) 1381 npxsave(mdcpu->gd_npxthread->td_savefpu); 1382 npxdna(tf); 1383 } 1384 #else 1385 /* 1386 * Tell the real kernel whether it is ok to use the FP 1387 * unit or not, allowing us to take a T_DNA exception 1388 * if the context tries to use the FP. 1389 */ 1390 if (mdcpu->gd_npxthread == curthread) { 1391 tf->tf_xflags &= ~PGEX_FPFAULT; 1392 } else { 1393 tf->tf_xflags |= PGEX_FPFAULT; 1394 } 1395 #endif 1396 1397 /* 1398 * Run emulated user process context. This call interlocks 1399 * with new mailbox signals. 1400 * 1401 * Set PGEX_U unconditionally, indicating a user frame (the 1402 * bit is normally set only by T_PAGEFLT). 1403 */ 1404 if (vmm_enabled) 1405 id = (void *)vtophys(curproc->p_vmspace->vm_pmap.pm_pml4); 1406 else 1407 id = &curproc->p_vmspace->vm_pmap; 1408 1409 /* 1410 * The GDF_VIRTUSER hack helps statclock() figure out who 1411 * the tick belongs to. 1412 */ 1413 gd = mycpu; 1414 gd->gd_flags |= GDF_VIRTUSER; 1415 r = vmspace_ctl(id, VMSPACE_CTL_RUN, tf, 1416 &curthread->td_savevext); 1417 1418 frame->if_xflags |= PGEX_U; 1419 1420 /* 1421 * Immediately save the user FPU state. The vkernel is a 1422 * user program and libraries like libc will use the FP 1423 * unit. 
1424 */ 1425 if (mdcpu->gd_npxthread == curthread) { 1426 npxsave(mdcpu->gd_npxthread->td_savefpu); 1427 } 1428 crit_exit(); 1429 gd->gd_flags &= ~GDF_VIRTUSER; 1430 #if 0 1431 kprintf("GO USER %d trap %ld EVA %08lx RIP %08lx RSP %08lx XFLAGS %02lx/%02lx\n", 1432 r, tf->tf_trapno, tf->tf_addr, tf->tf_rip, tf->tf_rsp, 1433 tf->tf_xflags, frame->if_xflags); 1434 #endif 1435 if (r < 0) { 1436 if (errno != EINTR) 1437 panic("vmspace_ctl failed error %d", errno); 1438 } else { 1439 if (tf->tf_trapno) { 1440 user_trap(tf); 1441 } 1442 } 1443 if (mycpu->gd_reqflags & RQF_AST_MASK) { 1444 tf->tf_trapno = T_ASTFLT; 1445 user_trap(tf); 1446 } 1447 tf->tf_trapno = 0; 1448 } 1449 } 1450 1451 /* 1452 * If PGEX_FPFAULT is set then set FP_VIRTFP in the PCB to force a T_DNA 1453 * fault (which is then passed back to the virtual kernel) if an attempt is 1454 * made to use the FP unit. 1455 * 1456 * XXX this is a fairly big hack. 1457 */ 1458 void 1459 set_vkernel_fp(struct trapframe *frame) 1460 { 1461 struct thread *td = curthread; 1462 1463 if (frame->tf_xflags & PGEX_FPFAULT) { 1464 td->td_pcb->pcb_flags |= FP_VIRTFP; 1465 if (mdcpu->gd_npxthread == td) 1466 npxexit(); 1467 } else { 1468 td->td_pcb->pcb_flags &= ~FP_VIRTFP; 1469 } 1470 } 1471 1472 /* 1473 * Called from vkernel_trap() to fixup the vkernel's syscall 1474 * frame for vmspace_ctl() return. 1475 */ 1476 void 1477 cpu_vkernel_trap(struct trapframe *frame, int error) 1478 { 1479 frame->tf_rax = error; 1480 if (error) 1481 frame->tf_rflags |= PSL_C; 1482 else 1483 frame->tf_rflags &= ~PSL_C; 1484 } 1485