1 /*- 2 * Copyright (c) 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * Copyright (C) 1994, David Greenman 5 * Copyright (c) 2008 The DragonFly Project. 6 * Copyright (c) 2008 Jordan Gordeev. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * the University of Utah, and William Jolitz. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 40 * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $ 41 */ 42 43 /* 44 * x86_64 Trap and System call handling 45 */ 46 47 #include "use_isa.h" 48 49 #include "opt_ddb.h" 50 #include "opt_ktrace.h" 51 52 #include <machine/frame.h> 53 #include <sys/param.h> 54 #include <sys/systm.h> 55 #include <sys/kernel.h> 56 #include <sys/kerneldump.h> 57 #include <sys/proc.h> 58 #include <sys/pioctl.h> 59 #include <sys/types.h> 60 #include <sys/signal2.h> 61 #include <sys/syscall.h> 62 #include <sys/sysctl.h> 63 #include <sys/sysent.h> 64 #include <sys/systm.h> 65 #ifdef KTRACE 66 #include <sys/ktrace.h> 67 #endif 68 #include <sys/ktr.h> 69 #include <sys/sysmsg.h> 70 #include <sys/sysproto.h> 71 #include <sys/sysunion.h> 72 73 #include <vm/pmap.h> 74 #include <vm/vm.h> 75 #include <vm/vm_extern.h> 76 #include <vm/vm_kern.h> 77 #include <vm/vm_param.h> 78 #include <machine/cpu.h> 79 #include <machine/pcb.h> 80 #include <machine/smp.h> 81 #include <machine/thread.h> 82 #include <machine/vmparam.h> 83 #include <machine/md_var.h> 84 #include <machine_base/isa/isa_intr.h> 85 #include <machine_base/apic/lapic.h> 86 87 #include <ddb/ddb.h> 88 89 #include <sys/thread2.h> 90 #include <sys/mplock2.h> 91 92 #ifdef SMP 93 94 #define MAKEMPSAFE(have_mplock) \ 95 if (have_mplock == 0) { \ 96 get_mplock(); \ 97 have_mplock = 1; \ 98 } 99 100 #else 101 102 #define MAKEMPSAFE(have_mplock) 103 104 #endif 105 106 extern void trap(struct trapframe *frame); 107 108 static int trap_pfault(struct trapframe *, int); 109 static void trap_fatal(struct trapframe *, vm_offset_t); 110 void dblfault_handler(struct trapframe *frame); 111 112 #define MAX_TRAP_MSG 30 113 static char *trap_msg[] = { 114 "", /* 0 unused */ 115 "privileged instruction fault", /* 1 T_PRIVINFLT */ 116 "", /* 2 unused */ 117 "breakpoint instruction fault", /* 3 T_BPTFLT */ 118 "", /* 4 unused */ 119 "", /* 5 unused */ 120 "arithmetic trap", /* 6 T_ARITHTRAP */ 121 "system forced exception", /* 7 T_ASTFLT */ 122 "", /* 8 unused */ 123 "general protection fault", /* 9 T_PROTFLT */ 124 "trace trap", /* 10 T_TRCTRAP */ 125 "", /* 11 unused */ 126 "page fault", /* 12 T_PAGEFLT */ 127 "", /* 13 unused */ 128 "alignment fault", /* 14 T_ALIGNFLT */ 129 "", /* 15 unused */ 130 "", /* 16 unused */ 131 "", /* 17 unused */ 132 "integer divide fault", /* 18 T_DIVIDE */ 133 "non-maskable interrupt trap", /* 19 T_NMI */ 134 "overflow trap", /* 20 T_OFLOW */ 135 "FPU bounds check fault", /* 21 T_BOUND */ 136 "FPU device not available", /* 22 T_DNA */ 137 "double fault", /* 23 T_DOUBLEFLT */ 138 "FPU operand fetch fault", /* 24 T_FPOPFLT */ 139 "invalid TSS fault", /* 25 T_TSSFLT */ 140 "segment not present fault", /* 26 T_SEGNPFLT */ 141 "stack fault", /* 27 T_STKFLT */ 142 "machine check trap", /* 28 T_MCHK */ 143 "SIMD floating-point exception", /* 29 T_XMMFLT */ 144 "reserved (unknown) fault", /* 30 T_RESERVED */ 145 }; 146 147 #ifdef DDB 148 static int ddb_on_nmi = 1; 149 SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, 150 &ddb_on_nmi, 0, "Go to DDB on NMI"); 151 static int ddb_on_seg_fault = 0; 152 SYSCTL_INT(_machdep, OID_AUTO, ddb_on_seg_fault, CTLFLAG_RW, 153 &ddb_on_seg_fault, 0, "Go to DDB on user seg-fault"); 154 #endif 155 static int panic_on_nmi = 1; 156 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, 157 &panic_on_nmi, 0, "Panic on NMI"); 158 static int fast_release; 159 SYSCTL_INT(_machdep, OID_AUTO, fast_release, CTLFLAG_RW, 160 &fast_release, 0, "Passive Release was optimal"); 161 static int slow_release; 162 SYSCTL_INT(_machdep, OID_AUTO, slow_release, CTLFLAG_RW, 163 &slow_release, 0, "Passive Release was nonoptimal"); 164 165 /* 166 * Passively intercepts the thread switch function to increase 167 * the thread priority from a user priority to a kernel priority, reducing 168 * syscall and trap overhead for the case where no switch occurs. 169 * 170 * Synchronizes td_ucred with p_ucred. This is used by system calls, 171 * signal handling, faults, AST traps, and anything else that enters the 172 * kernel from userland and provides the kernel with a stable read-only 173 * copy of the process ucred. 174 */ 175 static __inline void 176 userenter(struct thread *curtd, struct proc *curp) 177 { 178 struct ucred *ocred; 179 struct ucred *ncred; 180 181 curtd->td_release = lwkt_passive_release; 182 183 if (curtd->td_ucred != curp->p_ucred) { 184 ncred = crhold(curp->p_ucred); 185 ocred = curtd->td_ucred; 186 curtd->td_ucred = ncred; 187 if (ocred) 188 crfree(ocred); 189 } 190 } 191 192 /* 193 * Handle signals, upcalls, profiling, and other AST's and/or tasks that 194 * must be completed before we can return to or try to return to userland. 195 * 196 * Note that td_sticks is a 64 bit quantity, but there's no point doing 64 197 * arithmatic on the delta calculation so the absolute tick values are 198 * truncated to an integer. 199 */ 200 static void 201 userret(struct lwp *lp, struct trapframe *frame, int sticks) 202 { 203 struct proc *p = lp->lwp_proc; 204 int sig; 205 206 /* 207 * Charge system time if profiling. Note: times are in microseconds. 208 * This may do a copyout and block, so do it first even though it 209 * means some system time will be charged as user time. 210 */ 211 if (p->p_flag & P_PROFIL) { 212 addupc_task(p, frame->tf_rip, 213 (u_int)((int)lp->lwp_thread->td_sticks - sticks)); 214 } 215 216 recheck: 217 /* 218 * If the jungle wants us dead, so be it. 219 */ 220 if (lp->lwp_flag & LWP_WEXIT) { 221 lwkt_gettoken(&p->p_token); 222 lwp_exit(0); 223 lwkt_reltoken(&p->p_token); /* NOT REACHED */ 224 } 225 226 /* 227 * Block here if we are in a stopped state. 228 */ 229 if (p->p_stat == SSTOP || dump_stop_usertds) { 230 get_mplock(); 231 tstop(); 232 rel_mplock(); 233 goto recheck; 234 } 235 236 /* 237 * Post any pending upcalls. If running a virtual kernel be sure 238 * to restore the virtual kernel's vmspace before posting the upcall. 239 */ 240 if (p->p_flag & P_UPCALLPEND) { 241 p->p_flag &= ~P_UPCALLPEND; 242 get_mplock(); 243 postupcall(lp); 244 rel_mplock(); 245 goto recheck; 246 } 247 248 /* 249 * Post any pending signals. If running a virtual kernel be sure 250 * to restore the virtual kernel's vmspace before posting the signal. 251 * 252 * WARNING! postsig() can exit and not return. 253 */ 254 if ((sig = CURSIG_TRACE(lp)) != 0) { 255 get_mplock(); 256 postsig(sig); 257 rel_mplock(); 258 goto recheck; 259 } 260 261 /* 262 * block here if we are swapped out, but still process signals 263 * (such as SIGKILL). proc0 (the swapin scheduler) is already 264 * aware of our situation, we do not have to wake it up. 265 */ 266 if (p->p_flag & P_SWAPPEDOUT) { 267 get_mplock(); 268 p->p_flag |= P_SWAPWAIT; 269 swapin_request(); 270 if (p->p_flag & P_SWAPWAIT) 271 tsleep(p, PCATCH, "SWOUT", 0); 272 p->p_flag &= ~P_SWAPWAIT; 273 rel_mplock(); 274 goto recheck; 275 } 276 277 /* 278 * Make sure postsig() handled request to restore old signal mask after 279 * running signal handler. 280 */ 281 KKASSERT((lp->lwp_flag & LWP_OLDMASK) == 0); 282 } 283 284 /* 285 * Cleanup from userenter and any passive release that might have occured. 286 * We must reclaim the current-process designation before we can return 287 * to usermode. We also handle both LWKT and USER reschedule requests. 288 */ 289 static __inline void 290 userexit(struct lwp *lp) 291 { 292 struct thread *td = lp->lwp_thread; 293 /* globaldata_t gd = td->td_gd;*/ 294 295 /* 296 * Handle stop requests at kernel priority. Any requests queued 297 * after this loop will generate another AST. 298 */ 299 while (lp->lwp_proc->p_stat == SSTOP) { 300 get_mplock(); 301 tstop(); 302 rel_mplock(); 303 } 304 305 /* 306 * Reduce our priority in preparation for a return to userland. If 307 * our passive release function was still in place, our priority was 308 * never raised and does not need to be reduced. 309 */ 310 lwkt_passive_recover(td); 311 312 /* 313 * Become the current user scheduled process if we aren't already, 314 * and deal with reschedule requests and other factors. 315 */ 316 lp->lwp_proc->p_usched->acquire_curproc(lp); 317 /* WARNING: we may have migrated cpu's */ 318 /* gd = td->td_gd; */ 319 } 320 321 #if !defined(KTR_KERNENTRY) 322 #define KTR_KERNENTRY KTR_ALL 323 #endif 324 KTR_INFO_MASTER(kernentry); 325 KTR_INFO(KTR_KERNENTRY, kernentry, trap, 0, "STR", 326 sizeof(long) + sizeof(long) + sizeof(long) + sizeof(vm_offset_t)); 327 KTR_INFO(KTR_KERNENTRY, kernentry, trap_ret, 0, "STR", 328 sizeof(long) + sizeof(long)); 329 KTR_INFO(KTR_KERNENTRY, kernentry, syscall, 0, "STR", 330 sizeof(long) + sizeof(long) + sizeof(long)); 331 KTR_INFO(KTR_KERNENTRY, kernentry, syscall_ret, 0, "STR", 332 sizeof(long) + sizeof(long) + sizeof(long)); 333 KTR_INFO(KTR_KERNENTRY, kernentry, fork_ret, 0, "STR", 334 sizeof(long) + sizeof(long)); 335 336 /* 337 * Exception, fault, and trap interface to the kernel. 338 * This common code is called from assembly language IDT gate entry 339 * routines that prepare a suitable stack frame, and restore this 340 * frame after the exception has been processed. 341 * 342 * This function is also called from doreti in an interlock to handle ASTs. 343 * For example: hardwareint->INTROUTINE->(set ast)->doreti->trap 344 * 345 * NOTE! We have to retrieve the fault address prior to obtaining the 346 * MP lock because get_mplock() may switch out. YYY cr2 really ought 347 * to be retrieved by the assembly code, not here. 348 * 349 * XXX gd_trap_nesting_level currently prevents lwkt_switch() from panicing 350 * if an attempt is made to switch from a fast interrupt or IPI. This is 351 * necessary to properly take fatal kernel traps on SMP machines if 352 * get_mplock() has to block. 353 */ 354 355 void 356 trap(struct trapframe *frame) 357 { 358 struct globaldata *gd = mycpu; 359 struct thread *td = gd->gd_curthread; 360 struct lwp *lp = td->td_lwp; 361 struct proc *p; 362 int sticks = 0; 363 int i = 0, ucode = 0, type, code; 364 #ifdef SMP 365 int have_mplock = 0; 366 #endif 367 #ifdef INVARIANTS 368 int crit_count = td->td_critcount; 369 lwkt_tokref_t curstop = td->td_toks_stop; 370 #endif 371 vm_offset_t eva; 372 373 p = td->td_proc; 374 375 #ifdef DDB 376 /* 377 * We need to allow T_DNA faults when the debugger is active since 378 * some dumping paths do large bcopy() which use the floating 379 * point registers for faster copying. 380 */ 381 if (db_active && frame->tf_trapno != T_DNA) { 382 eva = (frame->tf_trapno == T_PAGEFLT ? frame->tf_addr : 0); 383 ++gd->gd_trap_nesting_level; 384 MAKEMPSAFE(have_mplock); 385 trap_fatal(frame, eva); 386 --gd->gd_trap_nesting_level; 387 goto out2; 388 } 389 #endif 390 391 eva = 0; 392 393 if ((frame->tf_rflags & PSL_I) == 0) { 394 /* 395 * Buggy application or kernel code has disabled interrupts 396 * and then trapped. Enabling interrupts now is wrong, but 397 * it is better than running with interrupts disabled until 398 * they are accidentally enabled later. 399 */ 400 type = frame->tf_trapno; 401 if (ISPL(frame->tf_cs) == SEL_UPL) { 402 MAKEMPSAFE(have_mplock); 403 /* JG curproc can be NULL */ 404 kprintf( 405 "pid %ld (%s): trap %d with interrupts disabled\n", 406 (long)curproc->p_pid, curproc->p_comm, type); 407 } else if (type != T_NMI && type != T_BPTFLT && 408 type != T_TRCTRAP) { 409 /* 410 * XXX not quite right, since this may be for a 411 * multiple fault in user mode. 412 */ 413 MAKEMPSAFE(have_mplock); 414 kprintf("kernel trap %d with interrupts disabled\n", 415 type); 416 } 417 cpu_enable_intr(); 418 } 419 420 type = frame->tf_trapno; 421 code = frame->tf_err; 422 423 if (ISPL(frame->tf_cs) == SEL_UPL) { 424 /* user trap */ 425 426 KTR_LOG(kernentry_trap, p->p_pid, lp->lwp_tid, 427 frame->tf_trapno, eva); 428 429 userenter(td, p); 430 431 sticks = (int)td->td_sticks; 432 KASSERT(lp->lwp_md.md_regs == frame, 433 ("Frame mismatch %p %p", lp->lwp_md.md_regs, frame)); 434 435 switch (type) { 436 case T_PRIVINFLT: /* privileged instruction fault */ 437 ucode = ILL_PRVOPC; 438 i = SIGILL; 439 break; 440 441 case T_BPTFLT: /* bpt instruction fault */ 442 case T_TRCTRAP: /* trace trap */ 443 frame->tf_rflags &= ~PSL_T; 444 ucode = TRAP_TRACE; 445 i = SIGTRAP; 446 break; 447 448 case T_ARITHTRAP: /* arithmetic trap */ 449 ucode = code; 450 i = SIGFPE; 451 #if 0 452 #if JG 453 ucode = fputrap(); 454 #else 455 ucode = code; 456 #endif 457 i = SIGFPE; 458 #endif 459 break; 460 461 case T_ASTFLT: /* Allow process switch */ 462 mycpu->gd_cnt.v_soft++; 463 if (mycpu->gd_reqflags & RQF_AST_OWEUPC) { 464 atomic_clear_int(&mycpu->gd_reqflags, 465 RQF_AST_OWEUPC); 466 addupc_task(p, p->p_prof.pr_addr, 467 p->p_prof.pr_ticks); 468 } 469 goto out; 470 471 case T_PROTFLT: /* general protection fault */ 472 i = SIGBUS; 473 ucode = BUS_OBJERR; 474 break; 475 case T_SEGNPFLT: /* segment not present fault */ 476 i = SIGBUS; 477 ucode = BUS_ADRERR; 478 break; 479 case T_TSSFLT: /* invalid TSS fault */ 480 case T_DOUBLEFLT: /* double fault */ 481 i = SIGBUS; 482 ucode = BUS_OBJERR; 483 default: 484 #if 0 485 ucode = code + BUS_SEGM_FAULT ; /* XXX: ???*/ 486 #endif 487 ucode = BUS_OBJERR; 488 i = SIGBUS; 489 break; 490 491 case T_PAGEFLT: /* page fault */ 492 i = trap_pfault(frame, TRUE); 493 if (frame->tf_rip == 0) 494 kprintf("T_PAGEFLT: Warning %%rip == 0!\n"); 495 if (i == -1) 496 goto out; 497 if (i == 0) 498 goto out; 499 500 #if 0 501 ucode = T_PAGEFLT; 502 #endif 503 if (i == SIGSEGV) 504 ucode = SEGV_MAPERR; 505 else 506 ucode = BUS_ADRERR; 507 break; 508 509 case T_DIVIDE: /* integer divide fault */ 510 ucode = FPE_INTDIV; 511 i = SIGFPE; 512 break; 513 514 #if NISA > 0 515 case T_NMI: 516 MAKEMPSAFE(have_mplock); 517 /* machine/parity/power fail/"kitchen sink" faults */ 518 if (isa_nmi(code) == 0) { 519 #ifdef DDB 520 /* 521 * NMI can be hooked up to a pushbutton 522 * for debugging. 523 */ 524 if (ddb_on_nmi) { 525 kprintf ("NMI ... going to debugger\n"); 526 kdb_trap(type, 0, frame); 527 } 528 #endif /* DDB */ 529 goto out2; 530 } else if (panic_on_nmi) 531 panic("NMI indicates hardware failure"); 532 break; 533 #endif /* NISA > 0 */ 534 535 case T_OFLOW: /* integer overflow fault */ 536 ucode = FPE_INTOVF; 537 i = SIGFPE; 538 break; 539 540 case T_BOUND: /* bounds check fault */ 541 ucode = FPE_FLTSUB; 542 i = SIGFPE; 543 break; 544 545 case T_DNA: 546 /* 547 * Virtual kernel intercept - pass the DNA exception 548 * to the virtual kernel if it asked to handle it. 549 * This occurs when the virtual kernel is holding 550 * onto the FP context for a different emulated 551 * process then the one currently running. 552 * 553 * We must still call npxdna() since we may have 554 * saved FP state that the virtual kernel needs 555 * to hand over to a different emulated process. 556 */ 557 if (lp->lwp_vkernel && lp->lwp_vkernel->ve && 558 (td->td_pcb->pcb_flags & FP_VIRTFP) 559 ) { 560 npxdna(); 561 break; 562 } 563 564 /* 565 * The kernel may have switched out the FP unit's 566 * state, causing the user process to take a fault 567 * when it tries to use the FP unit. Restore the 568 * state here 569 */ 570 if (npxdna()) 571 goto out; 572 i = SIGFPE; 573 ucode = FPE_FPU_NP_TRAP; 574 break; 575 576 case T_FPOPFLT: /* FPU operand fetch fault */ 577 ucode = ILL_COPROC; 578 i = SIGILL; 579 break; 580 581 case T_XMMFLT: /* SIMD floating-point exception */ 582 ucode = 0; /* XXX */ 583 i = SIGFPE; 584 break; 585 } 586 } else { 587 /* kernel trap */ 588 589 switch (type) { 590 case T_PAGEFLT: /* page fault */ 591 trap_pfault(frame, FALSE); 592 goto out2; 593 594 case T_DNA: 595 /* 596 * The kernel is apparently using fpu for copying. 597 * XXX this should be fatal unless the kernel has 598 * registered such use. 599 */ 600 if (npxdna()) 601 goto out2; 602 break; 603 604 case T_STKFLT: /* stack fault */ 605 break; 606 607 case T_PROTFLT: /* general protection fault */ 608 case T_SEGNPFLT: /* segment not present fault */ 609 /* 610 * Invalid segment selectors and out of bounds 611 * %rip's and %rsp's can be set up in user mode. 612 * This causes a fault in kernel mode when the 613 * kernel tries to return to user mode. We want 614 * to get this fault so that we can fix the 615 * problem here and not have to check all the 616 * selectors and pointers when the user changes 617 * them. 618 */ 619 if (mycpu->gd_intr_nesting_level == 0) { 620 if (td->td_pcb->pcb_onfault) { 621 frame->tf_rip = (register_t) 622 td->td_pcb->pcb_onfault; 623 goto out2; 624 } 625 if (frame->tf_rip == (long)doreti_iret) { 626 frame->tf_rip = (long)doreti_iret_fault; 627 goto out2; 628 } 629 } 630 break; 631 632 case T_TSSFLT: 633 /* 634 * PSL_NT can be set in user mode and isn't cleared 635 * automatically when the kernel is entered. This 636 * causes a TSS fault when the kernel attempts to 637 * `iret' because the TSS link is uninitialized. We 638 * want to get this fault so that we can fix the 639 * problem here and not every time the kernel is 640 * entered. 641 */ 642 if (frame->tf_rflags & PSL_NT) { 643 frame->tf_rflags &= ~PSL_NT; 644 goto out2; 645 } 646 break; 647 648 case T_TRCTRAP: /* trace trap */ 649 #if 0 650 if (frame->tf_rip == (int)IDTVEC(syscall)) { 651 /* 652 * We've just entered system mode via the 653 * syscall lcall. Continue single stepping 654 * silently until the syscall handler has 655 * saved the flags. 656 */ 657 goto out2; 658 } 659 if (frame->tf_rip == (int)IDTVEC(syscall) + 1) { 660 /* 661 * The syscall handler has now saved the 662 * flags. Stop single stepping it. 663 */ 664 frame->tf_rflags &= ~PSL_T; 665 goto out2; 666 } 667 #endif 668 669 /* 670 * Ignore debug register trace traps due to 671 * accesses in the user's address space, which 672 * can happen under several conditions such as 673 * if a user sets a watchpoint on a buffer and 674 * then passes that buffer to a system call. 675 * We still want to get TRCTRAPS for addresses 676 * in kernel space because that is useful when 677 * debugging the kernel. 678 */ 679 #if JG 680 if (user_dbreg_trap()) { 681 /* 682 * Reset breakpoint bits because the 683 * processor doesn't 684 */ 685 /* XXX check upper bits here */ 686 load_dr6(rdr6() & 0xfffffff0); 687 goto out2; 688 } 689 #endif 690 /* 691 * FALLTHROUGH (TRCTRAP kernel mode, kernel address) 692 */ 693 case T_BPTFLT: 694 /* 695 * If DDB is enabled, let it handle the debugger trap. 696 * Otherwise, debugger traps "can't happen". 697 */ 698 ucode = TRAP_BRKPT; 699 #ifdef DDB 700 MAKEMPSAFE(have_mplock); 701 if (kdb_trap(type, 0, frame)) 702 goto out2; 703 #endif 704 break; 705 706 #if NISA > 0 707 case T_NMI: 708 MAKEMPSAFE(have_mplock); 709 /* machine/parity/power fail/"kitchen sink" faults */ 710 if (isa_nmi(code) == 0) { 711 #ifdef DDB 712 /* 713 * NMI can be hooked up to a pushbutton 714 * for debugging. 715 */ 716 if (ddb_on_nmi) { 717 kprintf ("NMI ... going to debugger\n"); 718 kdb_trap(type, 0, frame); 719 } 720 #endif /* DDB */ 721 goto out2; 722 } else if (panic_on_nmi == 0) 723 goto out2; 724 /* FALL THROUGH */ 725 #endif /* NISA > 0 */ 726 } 727 MAKEMPSAFE(have_mplock); 728 trap_fatal(frame, 0); 729 goto out2; 730 } 731 732 /* 733 * Virtual kernel intercept - if the fault is directly related to a 734 * VM context managed by a virtual kernel then let the virtual kernel 735 * handle it. 736 */ 737 if (lp->lwp_vkernel && lp->lwp_vkernel->ve) { 738 vkernel_trap(lp, frame); 739 goto out; 740 } 741 742 /* 743 * Translate fault for emulators (e.g. Linux) 744 */ 745 if (*p->p_sysent->sv_transtrap) 746 i = (*p->p_sysent->sv_transtrap)(i, type); 747 748 MAKEMPSAFE(have_mplock); 749 trapsignal(lp, i, ucode); 750 751 #ifdef DEBUG 752 if (type <= MAX_TRAP_MSG) { 753 uprintf("fatal process exception: %s", 754 trap_msg[type]); 755 if ((type == T_PAGEFLT) || (type == T_PROTFLT)) 756 uprintf(", fault VA = 0x%lx", frame->tf_addr); 757 uprintf("\n"); 758 } 759 #endif 760 761 out: 762 userret(lp, frame, sticks); 763 userexit(lp); 764 out2: ; 765 #ifdef SMP 766 if (have_mplock) 767 rel_mplock(); 768 #endif 769 if (p != NULL && lp != NULL) 770 KTR_LOG(kernentry_trap_ret, p->p_pid, lp->lwp_tid); 771 #ifdef INVARIANTS 772 KASSERT(crit_count == td->td_critcount, 773 ("trap: critical section count mismatch! %d/%d", 774 crit_count, td->td_pri)); 775 KASSERT(curstop == td->td_toks_stop, 776 ("trap: extra tokens held after trap! %ld/%ld", 777 curstop - &td->td_toks_base, 778 td->td_toks_stop - &td->td_toks_base)); 779 #endif 780 } 781 782 static int 783 trap_pfault(struct trapframe *frame, int usermode) 784 { 785 vm_offset_t va; 786 struct vmspace *vm = NULL; 787 vm_map_t map; 788 int rv = 0; 789 int fault_flags; 790 vm_prot_t ftype; 791 thread_t td = curthread; 792 struct lwp *lp = td->td_lwp; 793 struct proc *p; 794 795 va = trunc_page(frame->tf_addr); 796 if (va >= VM_MIN_KERNEL_ADDRESS) { 797 /* 798 * Don't allow user-mode faults in kernel address space. 799 */ 800 if (usermode) { 801 fault_flags = -1; 802 ftype = -1; 803 goto nogo; 804 } 805 806 map = &kernel_map; 807 } else { 808 /* 809 * This is a fault on non-kernel virtual memory. 810 * vm is initialized above to NULL. If curproc is NULL 811 * or curproc->p_vmspace is NULL the fault is fatal. 812 */ 813 if (lp != NULL) 814 vm = lp->lwp_vmspace; 815 816 if (vm == NULL) { 817 fault_flags = -1; 818 ftype = -1; 819 goto nogo; 820 } 821 822 map = &vm->vm_map; 823 } 824 825 /* 826 * PGEX_I is defined only if the execute disable bit capability is 827 * supported and enabled. 828 */ 829 if (frame->tf_err & PGEX_W) 830 ftype = VM_PROT_WRITE; 831 #if JG 832 else if ((frame->tf_err & PGEX_I) && pg_nx != 0) 833 ftype = VM_PROT_EXECUTE; 834 #endif 835 else 836 ftype = VM_PROT_READ; 837 838 if (map != &kernel_map) { 839 /* 840 * Keep swapout from messing with us during this 841 * critical time. 842 */ 843 PHOLD(lp->lwp_proc); 844 845 /* 846 * Issue fault 847 */ 848 fault_flags = 0; 849 if (usermode) 850 fault_flags |= VM_FAULT_BURST; 851 if (ftype & VM_PROT_WRITE) 852 fault_flags |= VM_FAULT_DIRTY; 853 else 854 fault_flags |= VM_FAULT_NORMAL; 855 rv = vm_fault(map, va, ftype, fault_flags); 856 857 PRELE(lp->lwp_proc); 858 } else { 859 /* 860 * Don't have to worry about process locking or stacks 861 * in the kernel. 862 */ 863 fault_flags = VM_FAULT_NORMAL; 864 rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); 865 } 866 867 if (rv == KERN_SUCCESS) 868 return (0); 869 nogo: 870 if (!usermode) { 871 if (td->td_gd->gd_intr_nesting_level == 0 && 872 td->td_pcb->pcb_onfault) { 873 frame->tf_rip = (register_t)td->td_pcb->pcb_onfault; 874 return (0); 875 } 876 trap_fatal(frame, frame->tf_addr); 877 return (-1); 878 } 879 880 /* 881 * NOTE: on x86_64 we have a tf_addr field in the trapframe, no 882 * kludge is needed to pass the fault address to signal handlers. 883 */ 884 p = td->td_proc; 885 if (td->td_lwp->lwp_vkernel == NULL) { 886 if (bootverbose) 887 kprintf("seg-fault ft=%04x ff=%04x addr=%p rip=%p " 888 "pid=%d p_comm=%s\n", 889 ftype, fault_flags, 890 (void *)frame->tf_addr, 891 (void *)frame->tf_rip, 892 p->p_pid, p->p_comm); 893 #ifdef DDB 894 if (ddb_on_seg_fault) 895 Debugger("ddb_on_seg_fault"); 896 #endif 897 } 898 899 return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); 900 } 901 902 static void 903 trap_fatal(struct trapframe *frame, vm_offset_t eva) 904 { 905 int code, ss; 906 u_int type; 907 long rsp; 908 struct soft_segment_descriptor softseg; 909 char *msg; 910 911 code = frame->tf_err; 912 type = frame->tf_trapno; 913 sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)], &softseg); 914 915 if (type <= MAX_TRAP_MSG) 916 msg = trap_msg[type]; 917 else 918 msg = "UNKNOWN"; 919 kprintf("\n\nFatal trap %d: %s while in %s mode\n", type, msg, 920 ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); 921 #ifdef SMP 922 /* three separate prints in case of a trap on an unmapped page */ 923 kprintf("cpuid = %d; ", mycpu->gd_cpuid); 924 kprintf("lapic->id = %08x\n", lapic->id); 925 #endif 926 if (type == T_PAGEFLT) { 927 kprintf("fault virtual address = 0x%lx\n", eva); 928 kprintf("fault code = %s %s %s, %s\n", 929 code & PGEX_U ? "user" : "supervisor", 930 code & PGEX_W ? "write" : "read", 931 code & PGEX_I ? "instruction" : "data", 932 code & PGEX_P ? "protection violation" : "page not present"); 933 } 934 kprintf("instruction pointer = 0x%lx:0x%lx\n", 935 frame->tf_cs & 0xffff, frame->tf_rip); 936 if (ISPL(frame->tf_cs) == SEL_UPL) { 937 ss = frame->tf_ss & 0xffff; 938 rsp = frame->tf_rsp; 939 } else { 940 ss = GSEL(GDATA_SEL, SEL_KPL); 941 rsp = (long)&frame->tf_rsp; 942 } 943 kprintf("stack pointer = 0x%x:0x%lx\n", ss, rsp); 944 kprintf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp); 945 kprintf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n", 946 softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); 947 kprintf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n", 948 softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32, 949 softseg.ssd_gran); 950 kprintf("processor eflags = "); 951 if (frame->tf_rflags & PSL_T) 952 kprintf("trace trap, "); 953 if (frame->tf_rflags & PSL_I) 954 kprintf("interrupt enabled, "); 955 if (frame->tf_rflags & PSL_NT) 956 kprintf("nested task, "); 957 if (frame->tf_rflags & PSL_RF) 958 kprintf("resume, "); 959 kprintf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12); 960 kprintf("current process = "); 961 if (curproc) { 962 kprintf("%lu\n", 963 (u_long)curproc->p_pid); 964 } else { 965 kprintf("Idle\n"); 966 } 967 kprintf("current thread = pri %d ", curthread->td_pri); 968 if (curthread->td_critcount) 969 kprintf("(CRIT)"); 970 kprintf("\n"); 971 972 #ifdef DDB 973 if ((debugger_on_panic || db_active) && kdb_trap(type, code, frame)) 974 return; 975 #endif 976 kprintf("trap number = %d\n", type); 977 if (type <= MAX_TRAP_MSG) 978 panic("%s", trap_msg[type]); 979 else 980 panic("unknown/reserved trap"); 981 } 982 983 /* 984 * Double fault handler. Called when a fault occurs while writing 985 * a frame for a trap/exception onto the stack. This usually occurs 986 * when the stack overflows (such is the case with infinite recursion, 987 * for example). 988 */ 989 static __inline 990 int 991 in_kstack_guard(register_t rptr) 992 { 993 thread_t td = curthread; 994 995 if ((char *)rptr >= td->td_kstack && 996 (char *)rptr < td->td_kstack + PAGE_SIZE) { 997 return 1; 998 } 999 return 0; 1000 } 1001 1002 void 1003 dblfault_handler(struct trapframe *frame) 1004 { 1005 thread_t td = curthread; 1006 1007 if (in_kstack_guard(frame->tf_rsp) || in_kstack_guard(frame->tf_rbp)) { 1008 kprintf("DOUBLE FAULT - KERNEL STACK GUARD HIT!\n"); 1009 if (in_kstack_guard(frame->tf_rsp)) 1010 frame->tf_rsp = (register_t)(td->td_kstack + PAGE_SIZE); 1011 if (in_kstack_guard(frame->tf_rbp)) 1012 frame->tf_rbp = (register_t)(td->td_kstack + PAGE_SIZE); 1013 } else { 1014 kprintf("DOUBLE FAULT\n"); 1015 } 1016 kprintf("\nFatal double fault\n"); 1017 kprintf("rip = 0x%lx\n", frame->tf_rip); 1018 kprintf("rsp = 0x%lx\n", frame->tf_rsp); 1019 kprintf("rbp = 0x%lx\n", frame->tf_rbp); 1020 #ifdef SMP 1021 /* three separate prints in case of a trap on an unmapped page */ 1022 kprintf("cpuid = %d; ", mycpu->gd_cpuid); 1023 kprintf("lapic->id = %08x\n", lapic->id); 1024 #endif 1025 panic("double fault"); 1026 } 1027 1028 /* 1029 * syscall2 - MP aware system call request C handler 1030 * 1031 * A system call is essentially treated as a trap except that the 1032 * MP lock is not held on entry or return. We are responsible for 1033 * obtaining the MP lock if necessary and for handling ASTs 1034 * (e.g. a task switch) prior to return. 1035 * 1036 * MPSAFE 1037 */ 1038 void 1039 syscall2(struct trapframe *frame) 1040 { 1041 struct thread *td = curthread; 1042 struct proc *p = td->td_proc; 1043 struct lwp *lp = td->td_lwp; 1044 caddr_t params; 1045 struct sysent *callp; 1046 register_t orig_tf_rflags; 1047 int sticks; 1048 int error; 1049 int narg; 1050 #ifdef INVARIANTS 1051 int crit_count = td->td_critcount; 1052 #endif 1053 #ifdef SMP 1054 int have_mplock = 0; 1055 #endif 1056 register_t *argp; 1057 u_int code; 1058 int reg, regcnt; 1059 union sysunion args; 1060 register_t *argsdst; 1061 1062 mycpu->gd_cnt.v_syscall++; 1063 1064 #ifdef DIAGNOSTIC 1065 if (ISPL(frame->tf_cs) != SEL_UPL) { 1066 get_mplock(); 1067 panic("syscall"); 1068 /* NOT REACHED */ 1069 } 1070 #endif 1071 1072 KTR_LOG(kernentry_syscall, p->p_pid, lp->lwp_tid, 1073 frame->tf_rax); 1074 1075 userenter(td, p); /* lazy raise our priority */ 1076 1077 reg = 0; 1078 regcnt = 6; 1079 /* 1080 * Misc 1081 */ 1082 sticks = (int)td->td_sticks; 1083 orig_tf_rflags = frame->tf_rflags; 1084 1085 /* 1086 * Virtual kernel intercept - if a VM context managed by a virtual 1087 * kernel issues a system call the virtual kernel handles it, not us. 1088 * Restore the virtual kernel context and return from its system 1089 * call. The current frame is copied out to the virtual kernel. 1090 */ 1091 if (lp->lwp_vkernel && lp->lwp_vkernel->ve) { 1092 vkernel_trap(lp, frame); 1093 error = EJUSTRETURN; 1094 goto out; 1095 } 1096 1097 /* 1098 * Get the system call parameters and account for time 1099 */ 1100 KASSERT(lp->lwp_md.md_regs == frame, 1101 ("Frame mismatch %p %p", lp->lwp_md.md_regs, frame)); 1102 params = (caddr_t)frame->tf_rsp + sizeof(register_t); 1103 code = frame->tf_rax; 1104 1105 if (p->p_sysent->sv_prepsyscall) { 1106 (*p->p_sysent->sv_prepsyscall)( 1107 frame, (int *)(&args.nosys.sysmsg + 1), 1108 &code, ¶ms); 1109 } else { 1110 if (code == SYS_syscall || code == SYS___syscall) { 1111 code = frame->tf_rdi; 1112 reg++; 1113 regcnt--; 1114 } 1115 } 1116 1117 if (p->p_sysent->sv_mask) 1118 code &= p->p_sysent->sv_mask; 1119 1120 if (code >= p->p_sysent->sv_size) 1121 callp = &p->p_sysent->sv_table[0]; 1122 else 1123 callp = &p->p_sysent->sv_table[code]; 1124 1125 narg = callp->sy_narg & SYF_ARGMASK; 1126 1127 /* 1128 * On x86_64 we get up to six arguments in registers. The rest are 1129 * on the stack. The first six members of 'struct trapframe' happen 1130 * to be the registers used to pass arguments, in exactly the right 1131 * order. 1132 */ 1133 argp = &frame->tf_rdi; 1134 argp += reg; 1135 argsdst = (register_t *)(&args.nosys.sysmsg + 1); 1136 /* 1137 * JG can we overflow the space pointed to by 'argsdst' 1138 * either with 'bcopy' or with 'copyin'? 1139 */ 1140 bcopy(argp, argsdst, sizeof(register_t) * regcnt); 1141 /* 1142 * copyin is MP aware, but the tracing code is not 1143 */ 1144 if (narg > regcnt) { 1145 KASSERT(params != NULL, ("copyin args with no params!")); 1146 error = copyin(params, &argsdst[regcnt], 1147 (narg - regcnt) * sizeof(register_t)); 1148 if (error) { 1149 #ifdef KTRACE 1150 if (KTRPOINT(td, KTR_SYSCALL)) { 1151 MAKEMPSAFE(have_mplock); 1152 1153 ktrsyscall(lp, code, narg, 1154 (void *)(&args.nosys.sysmsg + 1)); 1155 } 1156 #endif 1157 goto bad; 1158 } 1159 } 1160 1161 #ifdef KTRACE 1162 if (KTRPOINT(td, KTR_SYSCALL)) { 1163 MAKEMPSAFE(have_mplock); 1164 ktrsyscall(lp, code, narg, (void *)(&args.nosys.sysmsg + 1)); 1165 } 1166 #endif 1167 1168 /* 1169 * Default return value is 0 (will be copied to %rax). Double-value 1170 * returns use %rax and %rdx. %rdx is left unchanged for system 1171 * calls which return only one result. 1172 */ 1173 args.sysmsg_fds[0] = 0; 1174 args.sysmsg_fds[1] = frame->tf_rdx; 1175 1176 /* 1177 * The syscall might manipulate the trap frame. If it does it 1178 * will probably return EJUSTRETURN. 1179 */ 1180 args.sysmsg_frame = frame; 1181 1182 STOPEVENT(p, S_SCE, narg); /* MP aware */ 1183 1184 /* 1185 * NOTE: All system calls run MPSAFE now. The system call itself 1186 * is responsible for getting the MP lock. 1187 */ 1188 error = (*callp->sy_call)(&args); 1189 1190 out: 1191 /* 1192 * MP SAFE (we may or may not have the MP lock at this point) 1193 */ 1194 //kprintf("SYSMSG %d ", error); 1195 switch (error) { 1196 case 0: 1197 /* 1198 * Reinitialize proc pointer `p' as it may be different 1199 * if this is a child returning from fork syscall. 1200 */ 1201 p = curproc; 1202 lp = curthread->td_lwp; 1203 frame->tf_rax = args.sysmsg_fds[0]; 1204 frame->tf_rdx = args.sysmsg_fds[1]; 1205 frame->tf_rflags &= ~PSL_C; 1206 break; 1207 case ERESTART: 1208 /* 1209 * Reconstruct pc, we know that 'syscall' is 2 bytes. 1210 * We have to do a full context restore so that %r10 1211 * (which was holding the value of %rcx) is restored for 1212 * the next iteration. 1213 */ 1214 frame->tf_rip -= frame->tf_err; 1215 frame->tf_r10 = frame->tf_rcx; 1216 break; 1217 case EJUSTRETURN: 1218 break; 1219 case EASYNC: 1220 panic("Unexpected EASYNC return value (for now)"); 1221 default: 1222 bad: 1223 if (p->p_sysent->sv_errsize) { 1224 if (error >= p->p_sysent->sv_errsize) 1225 error = -1; /* XXX */ 1226 else 1227 error = p->p_sysent->sv_errtbl[error]; 1228 } 1229 frame->tf_rax = error; 1230 frame->tf_rflags |= PSL_C; 1231 break; 1232 } 1233 1234 /* 1235 * Traced syscall. trapsignal() is not MP aware. 1236 */ 1237 if (orig_tf_rflags & PSL_T) { 1238 MAKEMPSAFE(have_mplock); 1239 frame->tf_rflags &= ~PSL_T; 1240 trapsignal(lp, SIGTRAP, TRAP_TRACE); 1241 } 1242 1243 /* 1244 * Handle reschedule and other end-of-syscall issues 1245 */ 1246 userret(lp, frame, sticks); 1247 1248 #ifdef KTRACE 1249 if (KTRPOINT(td, KTR_SYSRET)) { 1250 MAKEMPSAFE(have_mplock); 1251 ktrsysret(lp, code, error, args.sysmsg_result); 1252 } 1253 #endif 1254 1255 /* 1256 * This works because errno is findable through the 1257 * register set. If we ever support an emulation where this 1258 * is not the case, this code will need to be revisited. 1259 */ 1260 STOPEVENT(p, S_SCX, code); 1261 1262 userexit(lp); 1263 #ifdef SMP 1264 /* 1265 * Release the MP lock if we had to get it 1266 */ 1267 if (have_mplock) 1268 rel_mplock(); 1269 #endif 1270 KTR_LOG(kernentry_syscall_ret, p->p_pid, lp->lwp_tid, error); 1271 #ifdef INVARIANTS 1272 KASSERT(crit_count == td->td_critcount, 1273 ("syscall: critical section count mismatch! %d/%d", 1274 crit_count, td->td_pri)); 1275 KASSERT(&td->td_toks_base == td->td_toks_stop, 1276 ("syscall: extra tokens held after trap! %ld", 1277 td->td_toks_stop - &td->td_toks_base)); 1278 #endif 1279 } 1280 1281 /* 1282 * NOTE: mplock not held at any point 1283 */ 1284 void 1285 fork_return(struct lwp *lp, struct trapframe *frame) 1286 { 1287 frame->tf_rax = 0; /* Child returns zero */ 1288 frame->tf_rflags &= ~PSL_C; /* success */ 1289 frame->tf_rdx = 1; 1290 1291 generic_lwp_return(lp, frame); 1292 KTR_LOG(kernentry_fork_ret, lp->lwp_proc->p_pid, lp->lwp_tid); 1293 } 1294 1295 /* 1296 * Simplified back end of syscall(), used when returning from fork() 1297 * directly into user mode. 1298 * 1299 * This code will return back into the fork trampoline code which then 1300 * runs doreti. 1301 * 1302 * NOTE: The mplock is not held at any point. 1303 */ 1304 void 1305 generic_lwp_return(struct lwp *lp, struct trapframe *frame) 1306 { 1307 struct proc *p = lp->lwp_proc; 1308 1309 /* 1310 * Newly forked processes are given a kernel priority. We have to 1311 * adjust the priority to a normal user priority and fake entry 1312 * into the kernel (call userenter()) to install a passive release 1313 * function just in case userret() decides to stop the process. This 1314 * can occur when ^Z races a fork. If we do not install the passive 1315 * release function the current process designation will not be 1316 * released when the thread goes to sleep. 1317 */ 1318 lwkt_setpri_self(TDPRI_USER_NORM); 1319 userenter(lp->lwp_thread, p); 1320 userret(lp, frame, 0); 1321 #ifdef KTRACE 1322 if (KTRPOINT(lp->lwp_thread, KTR_SYSRET)) 1323 ktrsysret(lp, SYS_fork, 0, 0); 1324 #endif 1325 p->p_flag |= P_PASSIVE_ACQ; 1326 userexit(lp); 1327 p->p_flag &= ~P_PASSIVE_ACQ; 1328 } 1329 1330 /* 1331 * If PGEX_FPFAULT is set then set FP_VIRTFP in the PCB to force a T_DNA 1332 * fault (which is then passed back to the virtual kernel) if an attempt is 1333 * made to use the FP unit. 1334 * 1335 * XXX this is a fairly big hack. 1336 */ 1337 void 1338 set_vkernel_fp(struct trapframe *frame) 1339 { 1340 struct thread *td = curthread; 1341 1342 if (frame->tf_xflags & PGEX_FPFAULT) { 1343 td->td_pcb->pcb_flags |= FP_VIRTFP; 1344 if (mdcpu->gd_npxthread == td) 1345 npxexit(); 1346 } else { 1347 td->td_pcb->pcb_flags &= ~FP_VIRTFP; 1348 } 1349 } 1350 1351 /* 1352 * Called from vkernel_trap() to fixup the vkernel's syscall 1353 * frame for vmspace_ctl() return. 1354 */ 1355 void 1356 cpu_vkernel_trap(struct trapframe *frame, int error) 1357 { 1358 frame->tf_rax = error; 1359 if (error) 1360 frame->tf_rflags |= PSL_C; 1361 else 1362 frame->tf_rflags &= ~PSL_C; 1363 } 1364