/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (C) 1994, David Greenman
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the University of Utah, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * 386 Trap and System call handling
 */

/* Kernel build-option headers (must come before any conditional code). */
#include "opt_clock.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_hwpmc_hooks.h"
#include "opt_isa.h"
#include "opt_kdb.h"
#include "opt_trap.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
/*
 * Software PMC events; fired from the page-fault path (see the
 * PMC_SOFT_CALL_TF() sites in trap_pfault()) on successful faults.
 */
PMC_SOFT_DEFINE( , , page_fault, all);
PMC_SOFT_DEFINE( , , page_fault, read);
PMC_SOFT_DEFINE( , , page_fault, write);
#endif
#include <security/audit/audit.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>

#include <machine/cpu.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/stack.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <machine/vm86.h>

#ifdef POWERFAIL_NMI
#include <sys/syslog.h>
#include <machine/clock.h>
#endif

#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
#endif

/* Entry points called from the assembly IDT stubs. */
void trap(struct trapframe *frame);
void syscall(struct trapframe *frame);

static int trap_pfault(struct trapframe *, bool, vm_offset_t, int *, int *);
static void trap_fatal(struct trapframe *, vm_offset_t);
#ifdef KDTRACE_HOOKS
static bool trap_user_dtrace(struct trapframe *,
    int (**hook)(struct trapframe *));
#endif
void dblfault_handler(void);

extern inthand_t IDTVEC(bpt), IDTVEC(dbg), IDTVEC(int0x80_syscall);
extern uint64_t pg_nx;

/*
 * Per-trap-number attributes: whether interrupts may be re-enabled while
 * handling the trap (ei) and a human-readable name (msg).  Entries not
 * initialized below have msg == NULL and are treated as unknown.
 */
struct trap_data {
	bool ei;
	const char *msg;
};

static const struct trap_data trap_data[] = {
	[T_PRIVINFLT] =	{ .ei = true,	.msg = "privileged instruction fault" },
	[T_BPTFLT] =	{ .ei = false,	.msg = "breakpoint instruction fault" },
	[T_ARITHTRAP] =	{ .ei = true,	.msg = "arithmetic trap" },
	[T_PROTFLT] =	{ .ei = true,	.msg = "general protection fault" },
	[T_TRCTRAP] =	{ .ei = false,	.msg = "debug exception" },
	[T_PAGEFLT] =	{ .ei = true,	.msg = "page fault" },
	[T_ALIGNFLT] =	{ .ei = true,	.msg = "alignment fault" },
	[T_DIVIDE] =	{ .ei = true,	.msg = "integer divide fault" },
	[T_NMI] =	{ .ei = false,	.msg = "non-maskable interrupt trap" },
	[T_OFLOW] =	{ .ei = true,	.msg = "overflow trap" },
	[T_BOUND] =	{ .ei = true,	.msg = "FPU bounds check fault" },
	[T_DNA] =	{ .ei = true,	.msg = "FPU device not available" },
	[T_DOUBLEFLT] =	{ .ei = false,	.msg = "double fault" },
	[T_FPOPFLT] =	{ .ei = true,	.msg = "FPU operand fetch fault" },
	[T_TSSFLT] =	{ .ei = true,	.msg = "invalid TSS fault" },
	[T_SEGNPFLT] =	{ .ei = true,	.msg = "segment not present fault" },
	[T_STKFLT] =	{ .ei = true,	.msg = "stack fault" },
	[T_MCHK] =	{ .ei = true,	.msg = "machine check trap" },
	[T_XMMFLT] =	{ .ei = true,	.msg = "SIMD floating-point exception" },
	[T_DTRACE_RET] ={ .ei = true,	.msg = "DTrace pid return trap" },
};

/*
 * Report whether it is safe to re-enable interrupts while handling the
 * given trap type.  Unknown trap numbers keep interrupts disabled.
 */
static bool
trap_enable_intr(int trapno)
{

	MPASS(trapno > 0);
	if (trapno < nitems(trap_data) && trap_data[trapno].msg != NULL)
		return (trap_data[trapno].ei);
	return (false);
}

/*
 * Return a printable name for a trap number.  Never returns NULL:
 * out-of-range or unnamed trap numbers yield "UNKNOWN".
 */
static const char *
trap_msg(int trapno)
{
	const char *res;
	static const char unkn[] = "UNKNOWN";

	res = NULL;
	if (trapno < nitems(trap_data))
		res = trap_data[trapno].msg;
	if (res == NULL)
		res = unkn;
	return (res);
}

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
int has_f00f_bug = 0;		/* Initialized so that it can be patched. */
#endif

static int uprintf_signal;
SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW,
    &uprintf_signal, 0,
    "Print debugging information on trap signal to ctty");

/*
 * Exception, fault, and trap interface to the FreeBSD kernel.
 * This common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed.
 */

void
trap(struct trapframe *frame)
{
	ksiginfo_t ksi;
	struct thread *td;
	struct proc *p;
	int pf, signo, ucode;
	u_int type;
	register_t addr, dr6;	/* dr6: %dr6 snapshot, 0 unless a debug trap */
	vm_offset_t eva;
#ifdef POWERFAIL_NMI
	static int lastalert = 0;
#endif

	td = curthread;
	p = td->td_proc;
	dr6 = 0;

	VM_CNT_INC(v_trap);
	type = frame->tf_trapno;

	/* All IDT entries use interrupt gates, so we enter with PSL_I clear. */
	KASSERT((read_eflags() & PSL_I) == 0,
	    ("trap: interrupts enabled, type %d frame %p", type, frame));

#ifdef SMP
	/* Handler for NMI IPIs used for stopping CPUs. */
	if (type == T_NMI && ipi_nmi_handler() == 0)
		return;
#endif /* SMP */

#ifdef KDB
	if (kdb_active) {
		kdb_reenter();
		return;
	}
#endif

	if (type == T_RESERVED) {
		trap_fatal(frame, 0);
		return;
	}

	if (type == T_NMI) {
#ifdef HWPMC_HOOKS
		/*
		 * CPU PMCs interrupt using an NMI so we check for that first.
		 * If the HWPMC module is active, 'pmc_hook' will point to
		 * the function to be called.  A non-zero return value from the
		 * hook means that the NMI was consumed by it and that we can
		 * return immediately.
		 */
		if (pmc_intr != NULL &&
		    (*pmc_intr)(frame) != 0)
			return;
#endif
	}

	if (type == T_MCHK) {
		mca_intr();
		return;
	}

#ifdef KDTRACE_HOOKS
	/*
	 * A trap can occur while DTrace executes a probe. Before
	 * executing the probe, DTrace blocks re-scheduling and sets
	 * a flag in its per-cpu flags to indicate that it doesn't
	 * want to fault. On returning from the probe, the no-fault
	 * flag is cleared and finally re-scheduling is enabled.
	 */
	if ((type == T_PROTFLT || type == T_PAGEFLT) &&
	    dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
		return;
#endif

	/*
	 * We must not allow context switches until %cr2 is read.
	 * Also, for some Cyrix CPUs, %cr2 is clobbered by interrupts.
	 * All faults use interrupt gates, so %cr2 can be safely read
	 * now, before optional enable of the interrupts below.
	 */
	if (type == T_PAGEFLT)
		eva = rcr2();

	/*
	 * Buggy application or kernel code has disabled interrupts
	 * and then trapped.  Enabling interrupts now is wrong, but it
	 * is better than running with interrupts disabled until they
	 * are accidentally enabled later.
	 */
	if ((frame->tf_eflags & PSL_I) == 0 && TRAPF_USERMODE(frame) &&
	    (curpcb->pcb_flags & PCB_VM86CALL) == 0)
		uprintf("pid %ld (%s): trap %d with interrupts disabled\n",
		    (long)curproc->p_pid, curthread->td_name, type);

	/*
	 * Conditionally reenable interrupts.  If we hold a spin lock,
	 * then we must not reenable interrupts.  This might be a
	 * spurious page fault.
	 */
	if (trap_enable_intr(type) && td->td_md.md_spinlock_count == 0 &&
	    frame->tf_eip != (int)cpu_switch_load_gs)
		enable_intr();

	if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) {
		/* user trap */

		td->td_pticks = 0;
		td->td_frame = frame;
		addr = frame->tf_eip;
		if (td->td_cowgen != p->p_cowgen)
			thread_cow_update(td);

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			signo = SIGILL;
			ucode = ILL_PRVOPC;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
#ifdef KDTRACE_HOOKS
			if (trap_user_dtrace(frame, &dtrace_pid_probe_ptr))
				return;
#else
			enable_intr();
#endif
			signo = SIGTRAP;
			ucode = TRAP_BRKPT;
			break;

		case T_TRCTRAP:		/* debug exception */
			enable_intr();
user_trctrap_out:
			signo = SIGTRAP;
			ucode = TRAP_TRACE;
			dr6 = rdr6();
			/* Clear PSL_T once a requested single-step completed. */
			if ((dr6 & DBREG_DR6_BS) != 0) {
				PROC_LOCK(td->td_proc);
				if ((td->td_dbgflags & TDB_STEP) != 0) {
					td->td_frame->tf_eflags &= ~PSL_T;
					td->td_dbgflags &= ~TDB_STEP;
				}
				PROC_UNLOCK(td->td_proc);
			}
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
			ucode = npxtrap_x87();
			if (ucode == -1)
				return;
			signo = SIGFPE;
			break;

		/*
		 * The following two traps can happen in vm86 mode,
		 * and, if so, we want to handle them specially.
		 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame->tf_eflags & PSL_VM) {
				signo = vm86_emulate((struct vm86frame *)frame);
				ucode = 0;	/* XXXKIB: better code ? */
				if (signo == SIGTRAP) {
					load_dr6(rdr6() | 0x4000);
					goto user_trctrap_out;
				}
				if (signo == 0)
					goto user;
				break;
			}
			signo = SIGBUS;
			ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
			break;
		case T_SEGNPFLT:	/* segment not present fault */
			signo = SIGBUS;
			ucode = BUS_ADRERR;
			break;
		case T_TSSFLT:		/* invalid TSS fault */
			signo = SIGBUS;
			ucode = BUS_OBJERR;
			break;
		case T_ALIGNFLT:
			signo = SIGBUS;
			ucode = BUS_ADRALN;
			break;
		case T_DOUBLEFLT:	/* double fault */
		default:
			signo = SIGBUS;
			ucode = BUS_OBJERR;
			break;

		case T_PAGEFLT:		/* page fault */
			addr = eva;
			/*
			 * trap_pfault() result: -2 f00f workaround,
			 * -1 fatal (already reported), 0 handled,
			 * 1 deliver signo/ucode.
			 */
			pf = trap_pfault(frame, true, eva, &signo, &ucode);
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
			if (pf == -2) {
				/*
				 * The f00f hack workaround has triggered, so
				 * treat the fault as an illegal instruction
				 * (T_PRIVINFLT) instead of a page fault.
				 */
				type = frame->tf_trapno = T_PRIVINFLT;
				break;
			}
#endif
			if (pf == -1)
				return;
			if (pf == 0)
				goto user;
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			signo = SIGFPE;
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
#define TIMER_FREQ	1193182
#endif
			/* Rate-limit the power-fail warning to once per 10s. */
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(880, hz);
				lastalert = time_second;
			}
			return;
#else /* !POWERFAIL_NMI */
			nmi_handle_intr(type, frame);
			return;
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			signo = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			signo = SIGFPE;
			break;

		case T_DNA:
			KASSERT(PCB_USER_FPU(td->td_pcb),
			    ("kernel FPU ctx has leaked"));
			/* transparent fault (due to context switch "late") */
			if (npxdna())
				return;
			uprintf("pid %d killed due to lack of floating point\n",
			    p->p_pid);
			signo = SIGKILL;
			ucode = 0;
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = ILL_COPROC;
			signo = SIGILL;
			break;

		case T_XMMFLT:		/* SIMD floating-point exception */
			ucode = npxtrap_sse();
			if (ucode == -1)
				return;
			signo = SIGFPE;
			break;
#ifdef KDTRACE_HOOKS
		case T_DTRACE_RET:
			(void)trap_user_dtrace(frame, &dtrace_return_probe_ptr);
			return;
#endif
		}
	} else {
		/* kernel trap */

		KASSERT(cold || td->td_ucred != NULL,
		    ("kernel trap doesn't have ucred"));
		switch (type) {
		case T_PAGEFLT:		/* page fault */
			(void)trap_pfault(frame, false, eva, NULL, NULL);
			return;

		case T_DNA:
			if (PCB_USER_FPU(td->td_pcb))
				panic("Unregistered use of FPU in kernel");
			if (npxdna())
				return;
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
		case T_XMMFLT:		/* SIMD floating-point exception */
		case T_FPOPFLT:		/* FPU operand fetch fault */
			/*
			 * XXXKIB for now disable any FPU traps in kernel
			 * handler registration seems to be overkill
			 */
			trap_fatal(frame, 0);
			return;

		/*
		 * The following two traps can happen in
		 * vm86 mode, and, if so, we want to handle
		 * them specially.
		 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame->tf_eflags & PSL_VM) {
				signo = vm86_emulate((struct vm86frame *)frame);
				if (signo == SIGTRAP) {
					type = T_TRCTRAP;
					load_dr6(rdr6() | 0x4000);
					goto kernel_trctrap;
				}
				if (signo != 0)
					/*
					 * returns to original process
					 */
					vm86_trap((struct vm86frame *)frame);
				return;
			}
			/* FALL THROUGH */
		case T_SEGNPFLT:	/* segment not present fault */
			if (curpcb->pcb_flags & PCB_VM86CALL)
				break;

			/*
			 * Invalid %fs's and %gs's can be created using
			 * procfs or PT_SETREGS or by invalidating the
			 * underlying LDT entry.  This causes a fault
			 * in kernel mode when the kernel attempts to
			 * switch contexts.  Lose the bad context
			 * (XXX) so that we can continue, and generate
			 * a signal.
			 */
			if (frame->tf_eip == (int)cpu_switch_load_gs) {
				curpcb->pcb_gs = 0;
#if 0
				PROC_LOCK(p);
				kern_psignal(p, SIGBUS);
				PROC_UNLOCK(p);
#endif
				return;
			}

			if (td->td_intr_nesting_level != 0)
				break;

			/*
			 * Invalid segment selectors and out of bounds
			 * %eip's and %esp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode.  We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 *
			 * N.B. Comparing to long mode, 32-bit mode
			 * does not push %esp on the trap frame,
			 * because iretl faulted while in ring 0.  As
			 * the consequence, there is no need to fixup
			 * the stack pointer for doreti_iret_fault,
			 * the fixup and the complimentary trap() call
			 * are executed on the main thread stack, not
			 * on the trampoline stack.
			 */
			if (frame->tf_eip == (int)doreti_iret + setidt_disp) {
				frame->tf_eip = (int)doreti_iret_fault +
				    setidt_disp;
				return;
			}
			if (type == T_STKFLT)
				break;

			if (frame->tf_eip == (int)doreti_popl_ds +
			    setidt_disp) {
				frame->tf_eip = (int)doreti_popl_ds_fault +
				    setidt_disp;
				return;
			}
			if (frame->tf_eip == (int)doreti_popl_es +
			    setidt_disp) {
				frame->tf_eip = (int)doreti_popl_es_fault +
				    setidt_disp;
				return;
			}
			if (frame->tf_eip == (int)doreti_popl_fs +
			    setidt_disp) {
				frame->tf_eip = (int)doreti_popl_fs_fault +
				    setidt_disp;
				return;
			}
			if (curpcb->pcb_onfault != NULL) {
				frame->tf_eip = (int)curpcb->pcb_onfault;
				return;
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered.  This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized.  We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame->tf_eflags & PSL_NT) {
				frame->tf_eflags &= ~PSL_NT;
				return;
			}
			break;

		case T_TRCTRAP:		/* debug exception */
kernel_trctrap:
			/* Clear any pending debug events. */
			dr6 = rdr6();
			load_dr6(0);

			/*
			 * Ignore debug register exceptions due to
			 * accesses in the user's address space, which
			 * can happen under several conditions such as
			 * if a user sets a watchpoint on a buffer and
			 * then passes that buffer to a system call.
			 * We still want to get TRCTRAPS for addresses
			 * in kernel space because that is useful when
			 * debugging the kernel.
			 */
			if (user_dbreg_trap(dr6) &&
			    !(curpcb->pcb_flags & PCB_VM86CALL))
				return;

			/*
			 * Malicious user code can configure a debug
			 * register watchpoint to trap on data access
			 * to the top of stack and then execute 'pop
			 * %ss; int 3'.  Due to exception deferral for
			 * 'pop %ss', the CPU will not interrupt 'int
			 * 3' to raise the DB# exception for the debug
			 * register but will postpone the DB# until
			 * execution of the first instruction of the
			 * BP# handler (in kernel mode).  Normally the
			 * previous check would ignore DB# exceptions
			 * for watchpoints on user addresses raised in
			 * kernel mode.  However, some CPU errata
			 * include cases where DB# exceptions do not
			 * properly set bits in %dr6, e.g. Haswell
			 * HSD23 and Skylake-X SKZ24.
			 *
			 * A deferred DB# can also be raised on the
			 * first instructions of system call entry
			 * points or single-step traps via similar use
			 * of 'pop %ss' or 'mov xxx, %ss'.
			 */
			if (frame->tf_eip ==
			    (uintptr_t)IDTVEC(int0x80_syscall) + setidt_disp ||
			    frame->tf_eip == (uintptr_t)IDTVEC(bpt) +
			    setidt_disp ||
			    frame->tf_eip == (uintptr_t)IDTVEC(dbg) +
			    setidt_disp)
				return;
			/*
			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
			 */
		case T_BPTFLT:
			/*
			 * If KDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
#ifdef KDB
			if (kdb_trap(type, dr6, frame))
				return;
#endif
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(880, hz);
				lastalert = time_second;
			}
			return;
#else /* !POWERFAIL_NMI */
			nmi_handle_intr(type, frame);
			return;
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */
		}

		/* Unhandled kernel trap: report registers and panic. */
		trap_fatal(frame, eva);
		return;
	}

	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap != NULL)
		signo = (*p->p_sysent->sv_transtrap)(signo, type);

	ksiginfo_init_trap(&ksi);
	ksi.ksi_signo = signo;
	ksi.ksi_code = ucode;
	ksi.ksi_addr = (void *)addr;
	ksi.ksi_trapno = type;
	if (uprintf_signal) {
		uprintf("pid %d comm %s: signal %d err %x code %d type %d "
		    "addr 0x%x ss 0x%04x esp 0x%08x cs 0x%04x eip 0x%08x "
		    "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
		    p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type,
		    addr, frame->tf_ss, frame->tf_esp, frame->tf_cs,
		    frame->tf_eip,
		    fubyte((void *)(frame->tf_eip + 0)),
		    fubyte((void *)(frame->tf_eip + 1)),
		    fubyte((void *)(frame->tf_eip + 2)),
		    fubyte((void *)(frame->tf_eip + 3)),
		    fubyte((void *)(frame->tf_eip + 4)),
		    fubyte((void *)(frame->tf_eip + 5)),
		    fubyte((void *)(frame->tf_eip + 6)),
		    fubyte((void *)(frame->tf_eip + 7)));
	}
	KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled"));
	trapsignal(td, &ksi);

user:
	userret(td, frame);
	KASSERT(PCB_USER_FPU(td->td_pcb),
	    ("Return from trap with kernel FPU ctx leaked"));
}

/*
 * Handle all details of a page fault.
 * Returns:
 * -2 if the fault was caused by triggered workaround for Intel Pentium
 *    0xf00f bug.
 * -1 if this fault was fatal, typically from kernel mode
 *    (cannot happen, but we need to return something).
 *  0 if this fault was handled by updating either the user or kernel
 *    page table, execution can continue.
 *  1 if this fault was from usermode and it was not handled, a synchronous
 *    signal should be delivered to the thread.  *signo returns the signal
 *    number, *ucode gives si_code.
 */
static int
trap_pfault(struct trapframe *frame, bool usermode, vm_offset_t eva,
    int *signo, int *ucode)
{
	struct thread *td;
	struct proc *p;
	vm_map_t map;
	int rv;
	vm_prot_t ftype;

	/* Kernel-mode callers (usermode == false) may pass NULL outputs. */
	MPASS(!usermode || (signo != NULL && ucode != NULL));

	td = curthread;
	p = td->td_proc;

	if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
		/*
		 * Due to both processor errata and lazy TLB invalidation when
		 * access restrictions are removed from virtual pages, memory
		 * accesses that are allowed by the physical mapping layer may
		 * nonetheless cause one spurious page fault per virtual page.
		 * When the thread is executing a "no faulting" section that
		 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
		 * every page fault is treated as a spurious page fault,
		 * unless it accesses the same virtual address as the most
		 * recent page fault within the same "no faulting" section.
		 */
		if (td->td_md.md_spurflt_addr != eva ||
		    (td->td_pflags & TDP_RESETSPUR) != 0) {
			/*
			 * Do nothing to the TLB.  A stale TLB entry is
			 * flushed automatically by a page fault.
			 */
			td->td_md.md_spurflt_addr = eva;
			td->td_pflags &= ~TDP_RESETSPUR;
			return (0);
		}
	} else {
		/*
		 * If we get a page fault while in a critical section, then
		 * it is most likely a fatal kernel page fault.  The kernel
		 * is already going to panic trying to get a sleep lock to
		 * do the VM lookup, so just consider it a fatal trap so the
		 * kernel can print out a useful trap message and even get
		 * to the debugger.
		 *
		 * If we get a page fault while holding a non-sleepable
		 * lock, then it is most likely a fatal kernel page fault.
		 * If WITNESS is enabled, then it's going to whine about
		 * bogus LORs with various VM locks, so just skip to the
		 * fatal trap handling directly.
		 */
		if (td->td_critnest != 0 ||
		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
		    "Kernel page fault") != 0) {
			trap_fatal(frame, eva);
			return (-1);
		}
	}
	if (eva >= PMAP_TRM_MIN_ADDRESS) {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 * An exception: if the faulting address is the invalid
		 * instruction entry in the IDT, then the Intel Pentium
		 * F00F bug workaround was triggered, and we need to
		 * treat it is as an illegal instruction, and not a page
		 * fault.
		 */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) {
			*ucode = ILL_PRVOPC;
			*signo = SIGILL;
			return (-2);
		}
#endif
		if (usermode) {
			*signo = SIGSEGV;
			*ucode = SEGV_MAPERR;
			return (1);
		}
		trap_fatal(frame, eva);
		return (-1);
	} else {
		map = usermode ? &p->p_vmspace->vm_map : kernel_map;

		/*
		 * Kernel cannot access a user-space address directly
		 * because user pages are not mapped.  Also, page
		 * faults must not be caused during the interrupts.
		 */
		if (!usermode && td->td_intr_nesting_level != 0) {
			trap_fatal(frame, eva);
			return (-1);
		}
	}

	/*
	 * If the trap was caused by errant bits in the PTE then panic.
	 */
	if (frame->tf_err & PGEX_RSV) {
		trap_fatal(frame, eva);
		return (-1);
	}

	/*
	 * Decode the fault type from the page-fault error code.
	 * PGEX_I is defined only if the execute disable bit capability is
	 * supported and enabled.
	 */
	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
		ftype = VM_PROT_EXECUTE;
	else
		ftype = VM_PROT_READ;

	/* Fault in the page. */
	rv = vm_fault_trap(map, eva, ftype, VM_FAULT_NORMAL, signo, ucode);
	if (rv == KERN_SUCCESS) {
#ifdef HWPMC_HOOKS
		if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
			PMC_SOFT_CALL_TF( , , page_fault, all, frame);
			if (ftype == VM_PROT_READ)
				PMC_SOFT_CALL_TF( , , page_fault, read,
				    frame);
			else
				PMC_SOFT_CALL_TF( , , page_fault, write,
				    frame);
		}
#endif
		return (0);
	}
	/* Unresolved user fault: caller delivers *signo/*ucode. */
	if (usermode)
		return (1);
	/* Kernel fault with a registered recovery point (copyin/out). */
	if (td->td_intr_nesting_level == 0 &&
	    curpcb->pcb_onfault != NULL) {
		frame->tf_eip = (int)curpcb->pcb_onfault;
		return (0);
	}
	trap_fatal(frame, eva);
	return (-1);
}

/*
 * Print a detailed register and fault-state dump, then panic.  If KDB is
 * compiled in and debugger_on_trap is set, enter the debugger first and
 * return without panicking when the debugger reports the trap as handled.
 * eva is the faulting address (meaningful for T_PAGEFLT only).
 */
static void
trap_fatal(frame, eva)
	struct trapframe *frame;
	vm_offset_t eva;
{
	int code, ss, esp;
	u_int type;
	struct soft_segment_descriptor softseg;
#ifdef KDB
	bool handled;
#endif

	code = frame->tf_err;
	type = frame->tf_trapno;
	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);

	printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg(type),
	    frame->tf_eflags & PSL_VM ? "vm86" :
	    ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("apic id = %02x\n", PCPU_GET(apic_id));
#endif
	if (type == T_PAGEFLT) {
		printf("fault virtual address = 0x%x\n", eva);
		printf("fault code = %s %s%s, %s\n",
		    code & PGEX_U ? "user" : "supervisor",
		    code & PGEX_W ? "write" : "read",
		    pg_nx != 0 ?
		    (code & PGEX_I ? " instruction" : " data") :
		    "",
		    code & PGEX_RSV ? "reserved bits in PTE" :
		    code & PGEX_P ? "protection violation" : "page not present");
	} else {
		printf("error code = %#x\n", code);
	}
	printf("instruction pointer = 0x%x:0x%x\n",
	    frame->tf_cs & 0xffff, frame->tf_eip);
	if (TF_HAS_STACKREGS(frame)) {
		ss = frame->tf_ss & 0xffff;
		esp = frame->tf_esp;
	} else {
		/* Same-ring trap: no %ss:%esp pushed; use the frame address. */
		ss = GSEL(GDATA_SEL, SEL_KPL);
		esp = (int)&frame->tf_esp;
	}
	printf("stack pointer = 0x%x:0x%x\n", ss, esp);
	printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp);
	printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n",
	    softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
	printf(" = DPL %d, pres %d, def32 %d, gran %d\n",
	    softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
	    softseg.ssd_gran);
	printf("processor eflags = ");
	if (frame->tf_eflags & PSL_T)
		printf("trace trap, ");
	if (frame->tf_eflags & PSL_I)
		printf("interrupt enabled, ");
	if (frame->tf_eflags & PSL_NT)
		printf("nested task, ");
	if (frame->tf_eflags & PSL_RF)
		printf("resume, ");
	if (frame->tf_eflags & PSL_VM)
		printf("vm86, ");
	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
	printf("current process = %d (%s)\n",
	    curproc->p_pid, curthread->td_name);

#ifdef KDB
	if (debugger_on_trap) {
		kdb_why = KDB_WHY_TRAP;
		frame->tf_err = eva;	/* smuggle fault address to ddb */
		handled = kdb_trap(type, 0, frame);
		frame->tf_err = code;	/* restore error code */
		kdb_why = KDB_WHY_UNSET;
		if (handled)
			return;
	}
#endif
	printf("trap number = %d\n", type);
	if (trap_msg(type) != NULL)
		panic("%s", trap_msg(type));
	else
		panic("unknown/reserved trap");
}

#ifdef KDTRACE_HOOKS
/*
 * Invoke a userspace DTrace hook.  The hook pointer is cleared when no
 * userspace probes are enabled, so we must synchronize with DTrace to ensure
 * that a trapping thread is able to call the hook before it is cleared.
 */
static bool
trap_user_dtrace(struct trapframe *frame, int (**hookp)(struct trapframe *))
{
	int (*hook)(struct trapframe *);

	/* Load once; the published pointer may be cleared concurrently. */
	hook = atomic_load_ptr(hookp);
	enable_intr();
	if (hook != NULL)
		return ((hook)(frame) == 0);
	return (false);
}
#endif

/*
 * Double fault handler. Called when a fault occurs while writing
 * a frame for a trap/exception onto the stack. This usually occurs
 * when the stack overflows (such is the case with infinite recursion,
 * for example).
 *
 * XXX Note that the current PTD gets replaced by IdlePTD when the
 * task switch occurs. This means that the stack that was active at
 * the time of the double fault is not available at <kstack> unless
 * the machine was idle when the double fault occurred. The downside
 * of this is that "trace <ebp>" in ddb won't work.
 */
void
dblfault_handler(void)
{
#ifdef KDTRACE_HOOKS
	if (dtrace_doubletrap_func != NULL)
		(*dtrace_doubletrap_func)();
#endif
	/* Registers come from the task-gate TSS, not a trapframe. */
	printf("\nFatal double fault:\n");
	printf("eip = 0x%x\n", PCPU_GET(common_tssp)->tss_eip);
	printf("esp = 0x%x\n", PCPU_GET(common_tssp)->tss_esp);
	printf("ebp = 0x%x\n", PCPU_GET(common_tssp)->tss_ebp);
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("apic id = %02x\n", PCPU_GET(apic_id));
#endif
	panic("double fault");
}

/*
 * Fetch the system call number and arguments from the user stack into
 * td->td_sa and prime td_retval.  Returns 0 on success or EFAULT when a
 * user-memory read fails.
 */
int
cpu_fetch_syscall_args(struct thread *td)
{
	struct proc *p;
	struct trapframe *frame;
	struct syscall_args *sa;
	caddr_t params;
	long tmp;
	int error;
#ifdef COMPAT_43
	u_int32_t eip;
	int cs;
#endif

	p = td->td_proc;
	frame = td->td_frame;
	sa = &td->td_sa;

#ifdef COMPAT_43
	if (__predict_false(frame->tf_cs == 7 && frame->tf_eip == 2)) {
		/*
		 * In lcall $7,$0 after int $0x80.  Convert the user
		 * frame to what it would be for a direct int 0x80 instead
		 * of lcall $7,$0, by popping the lcall return address.
		 */
		error = fueword32((void *)frame->tf_esp, &eip);
		if (error == -1)
			return (EFAULT);
		cs = fuword16((void *)(frame->tf_esp + sizeof(u_int32_t)));
		if (cs == -1)
			return (EFAULT);

		/*
		 * Unwind in-kernel frame after all stack frame pieces
		 * were successfully read.
		 */
		frame->tf_eip = eip;
		frame->tf_cs = cs;
		frame->tf_esp += 2 * sizeof(u_int32_t);
		frame->tf_err = 7;	/* size of lcall $7,$0 */
	}
#endif

	sa->code = frame->tf_eax;
	/* Arguments start one word above the user return address. */
	params = (caddr_t)frame->tf_esp + sizeof(uint32_t);

	/*
	 * Need to check if this is a 32 bit or 64 bit syscall.
	 */
	if (sa->code == SYS_syscall) {
		/*
		 * Code is first argument, followed by actual args.
		 */
		error = fueword(params, &tmp);
		if (error == -1)
			return (EFAULT);
		sa->code = tmp;
		params += sizeof(uint32_t);
	} else if (sa->code == SYS___syscall) {
		/*
		 * Like syscall, but code is a quad, so as to maintain
		 * quad alignment for the rest of the arguments.
		 */
		error = fueword(params, &tmp);
		if (error == -1)
			return (EFAULT);
		sa->code = tmp;
		params += sizeof(quad_t);
	}

	/* Out-of-range numbers dispatch to entry 0 (nosys). */
	if (sa->code >= p->p_sysent->sv_size)
		sa->callp = &p->p_sysent->sv_table[0];
	else
		sa->callp = &p->p_sysent->sv_table[sa->code];
	sa->narg = sa->callp->sy_narg;

	if (params != NULL && sa->narg != 0)
		error = copyin(params, (caddr_t)sa->args,
		    (u_int)(sa->narg * sizeof(uint32_t)));
	else
		error = 0;

	if (error == 0) {
		td->td_retval[0] = 0;
		td->td_retval[1] = frame->tf_edx;
	}

	return (error);
}

#include "../../kern/subr_syscall.c"

/*
 * syscall - system call request C handler.  A system call is
 * essentially treated as a trap by reusing the frame layout.
 */
void
syscall(struct trapframe *frame)
{
	struct thread *td;
	register_t orig_tf_eflags;
	ksiginfo_t ksi;

#ifdef DIAGNOSTIC
	if (!(TRAPF_USERMODE(frame) &&
	    (curpcb->pcb_flags & PCB_VM86CALL) == 0)) {
		panic("syscall");
		/* NOT REACHED */
	}
#endif
	/* Latch eflags before syscallenter() can modify the frame. */
	orig_tf_eflags = frame->tf_eflags;

	td = curthread;
	td->td_frame = frame;

	syscallenter(td);

	/*
	 * Traced syscall.
	 */
	if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
		frame->tf_eflags &= ~PSL_T;
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGTRAP;
		ksi.ksi_code = TRAP_TRACE;
		ksi.ksi_addr = (void *)frame->tf_eip;
		trapsignal(td, &ksi);
	}

	KASSERT(PCB_USER_FPU(td->td_pcb),
	    ("System call %s returning with kernel FPU ctx leaked",
	     syscallname(td->td_proc, td->td_sa.code)));
	KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
	    ("System call %s returning with mangled pcb_save",
	     syscallname(td->td_proc, td->td_sa.code)));

	syscallret(td);
}