1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 2018 The FreeBSD Foundation 5 * Copyright (c) 1992 Terrence R. Lambert. 6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * William Jolitz. 11 * 12 * Portions of this software were developed by A. Joseph Koshy under 13 * sponsorship from the FreeBSD Foundation and Google, Inc. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. All advertising materials mentioning features or use of this software 24 * must display the following acknowledgement: 25 * This product includes software developed by the University of 26 * California, Berkeley and its contributors. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 44 */ 45 46 #include <sys/cdefs.h> 47 __FBSDID("$FreeBSD$"); 48 49 #include "opt_apic.h" 50 #include "opt_atpic.h" 51 #include "opt_cpu.h" 52 #include "opt_ddb.h" 53 #include "opt_inet.h" 54 #include "opt_isa.h" 55 #include "opt_kstack_pages.h" 56 #include "opt_maxmem.h" 57 #include "opt_mp_watchdog.h" 58 #include "opt_perfmon.h" 59 #include "opt_platform.h" 60 61 #include <sys/param.h> 62 #include <sys/proc.h> 63 #include <sys/systm.h> 64 #include <sys/bio.h> 65 #include <sys/buf.h> 66 #include <sys/bus.h> 67 #include <sys/callout.h> 68 #include <sys/cons.h> 69 #include <sys/cpu.h> 70 #include <sys/eventhandler.h> 71 #include <sys/exec.h> 72 #include <sys/imgact.h> 73 #include <sys/kdb.h> 74 #include <sys/kernel.h> 75 #include <sys/ktr.h> 76 #include <sys/linker.h> 77 #include <sys/lock.h> 78 #include <sys/malloc.h> 79 #include <sys/memrange.h> 80 #include <sys/msgbuf.h> 81 #include <sys/mutex.h> 82 #include <sys/pcpu.h> 83 #include <sys/ptrace.h> 84 #include <sys/reboot.h> 85 #include <sys/rwlock.h> 86 #include <sys/sched.h> 87 #include <sys/signalvar.h> 88 #include <sys/smp.h> 89 #include <sys/syscallsubr.h> 90 #include <sys/sysctl.h> 91 #include <sys/sysent.h> 92 #include <sys/sysproto.h> 93 #include <sys/ucontext.h> 94 #include <sys/vmmeter.h> 95 96 #include <vm/vm.h> 97 #include <vm/vm_extern.h> 98 #include <vm/vm_kern.h> 99 #include <vm/vm_page.h> 100 #include <vm/vm_map.h> 101 #include <vm/vm_object.h> 102 #include <vm/vm_pager.h> 103 #include <vm/vm_param.h> 104 #include <vm/vm_phys.h> 105 106 #ifdef DDB 107 #ifndef KDB 108 #error KDB must be enabled in order for DDB to work! 109 #endif 110 #include <ddb/ddb.h> 111 #include <ddb/db_sym.h> 112 #endif 113 114 #include <isa/rtc.h> 115 116 #include <net/netisr.h> 117 118 #include <machine/bootinfo.h> 119 #include <machine/clock.h> 120 #include <machine/cpu.h> 121 #include <machine/cputypes.h> 122 #include <machine/intr_machdep.h> 123 #include <x86/mca.h> 124 #include <machine/md_var.h> 125 #include <machine/metadata.h> 126 #include <machine/mp_watchdog.h> 127 #include <machine/pc/bios.h> 128 #include <machine/pcb.h> 129 #include <machine/pcb_ext.h> 130 #include <machine/proc.h> 131 #include <machine/reg.h> 132 #include <machine/sigframe.h> 133 #include <machine/specialreg.h> 134 #include <machine/sysarch.h> 135 #include <machine/trap.h> 136 #include <x86/ucode.h> 137 #include <machine/vm86.h> 138 #include <x86/init.h> 139 #ifdef PERFMON 140 #include <machine/perfmon.h> 141 #endif 142 #ifdef SMP 143 #include <machine/smp.h> 144 #endif 145 #ifdef FDT 146 #include <x86/fdt.h> 147 #endif 148 149 #ifdef DEV_APIC 150 #include <x86/apicvar.h> 151 #endif 152 153 #ifdef DEV_ISA 154 #include <x86/isa/icu.h> 155 #endif 156 157 /* Sanity check for __curthread() */ 158 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 159 160 register_t init386(int first); 161 void dblfault_handler(void); 162 void identify_cpu(void); 163 164 static void cpu_startup(void *); 165 static void fpstate_drop(struct thread *td); 166 static void get_fpcontext(struct thread *td, mcontext_t *mcp, 167 char *xfpusave, size_t xfpusave_len); 168 static int set_fpcontext(struct thread *td, mcontext_t *mcp, 169 char *xfpustate, size_t xfpustate_len); 170 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 171 172 /* Intel ICH registers */ 173 #define ICH_PMBASE 0x400 174 #define ICH_SMI_EN ICH_PMBASE + 0x30 175 176 int _udatasel, _ucodesel; 177 u_int basemem; 178 static int above4g_allow = 1; 179 static int above24g_allow = 0; 180 181 int cold = 1; 182 183 #ifdef COMPAT_43 184 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 185 #endif 186 #ifdef COMPAT_FREEBSD4 187 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 188 #endif 189 190 long Maxmem = 0; 191 long realmem = 0; 192 193 #ifdef PAE 194 FEATURE(pae, "Physical Address Extensions"); 195 #endif 196 197 /* 198 * The number of PHYSMAP entries must be one less than the number of 199 * PHYSSEG entries because the PHYSMAP entry that spans the largest 200 * physical address that is accessible by ISA DMA is split into two 201 * PHYSSEG entries. 202 */ 203 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 204 205 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; 206 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; 207 208 /* must be 2 less so 0 0 can signal end of chunks */ 209 #define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2) 210 #define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2) 211 212 struct kva_md_info kmi; 213 214 static struct trapframe proc0_tf; 215 struct pcpu __pcpu[MAXCPU]; 216 217 struct mtx icu_lock; 218 219 struct mem_range_softc mem_range_softc; 220 221 extern char start_exceptions[], end_exceptions[]; 222 223 extern struct sysentvec elf32_freebsd_sysvec; 224 225 /* Default init_ops implementation. */ 226 struct init_ops init_ops = { 227 .early_clock_source_init = i8254_init, 228 .early_delay = i8254_delay, 229 #ifdef DEV_APIC 230 .msi_init = msi_init, 231 #endif 232 }; 233 234 static void 235 cpu_startup(dummy) 236 void *dummy; 237 { 238 uintmax_t memsize; 239 char *sysenv; 240 241 /* 242 * On MacBooks, we need to disallow the legacy USB circuit to 243 * generate an SMI# because this can cause several problems, 244 * namely: incorrect CPU frequency detection and failure to 245 * start the APs. 246 * We do this by disabling a bit in the SMI_EN (SMI Control and 247 * Enable register) of the Intel ICH LPC Interface Bridge. 248 */ 249 sysenv = kern_getenv("smbios.system.product"); 250 if (sysenv != NULL) { 251 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 252 strncmp(sysenv, "MacBook3,1", 10) == 0 || 253 strncmp(sysenv, "MacBook4,1", 10) == 0 || 254 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 255 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 256 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 257 strncmp(sysenv, "MacBookPro4,1", 13) == 0 || 258 strncmp(sysenv, "Macmini1,1", 10) == 0) { 259 if (bootverbose) 260 printf("Disabling LEGACY_USB_EN bit on " 261 "Intel ICH.\n"); 262 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 263 } 264 freeenv(sysenv); 265 } 266 267 /* 268 * Good {morning,afternoon,evening,night}. 269 */ 270 startrtclock(); 271 printcpuinfo(); 272 panicifcpuunsupported(); 273 #ifdef PERFMON 274 perfmon_init(); 275 #endif 276 277 /* 278 * Display physical memory if SMBIOS reports reasonable amount. 279 */ 280 memsize = 0; 281 sysenv = kern_getenv("smbios.memory.enabled"); 282 if (sysenv != NULL) { 283 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 284 freeenv(sysenv); 285 } 286 if (memsize < ptoa((uintmax_t)vm_free_count())) 287 memsize = ptoa((uintmax_t)Maxmem); 288 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 289 realmem = atop(memsize); 290 291 /* 292 * Display any holes after the first chunk of extended memory. 293 */ 294 if (bootverbose) { 295 int indx; 296 297 printf("Physical memory chunk(s):\n"); 298 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 299 vm_paddr_t size; 300 301 size = phys_avail[indx + 1] - phys_avail[indx]; 302 printf( 303 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 304 (uintmax_t)phys_avail[indx], 305 (uintmax_t)phys_avail[indx + 1] - 1, 306 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 307 } 308 } 309 310 vm_ksubmap_init(&kmi); 311 312 printf("avail memory = %ju (%ju MB)\n", 313 ptoa((uintmax_t)vm_free_count()), 314 ptoa((uintmax_t)vm_free_count()) / 1048576); 315 316 /* 317 * Set up buffers, so they can be used to read disk labels. 318 */ 319 bufinit(); 320 vm_pager_bufferinit(); 321 cpu_setregs(); 322 } 323 324 /* 325 * Send an interrupt to process. 326 * 327 * Stack is set up to allow sigcode stored 328 * at top to call routine, followed by call 329 * to sigreturn routine below. After sigreturn 330 * resets the signal mask, the stack, and the 331 * frame pointer, it returns to the user 332 * specified pc, psl. 333 */ 334 #ifdef COMPAT_43 335 static void 336 osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 337 { 338 struct osigframe sf, *fp; 339 struct proc *p; 340 struct thread *td; 341 struct sigacts *psp; 342 struct trapframe *regs; 343 int sig; 344 int oonstack; 345 346 td = curthread; 347 p = td->td_proc; 348 PROC_LOCK_ASSERT(p, MA_OWNED); 349 sig = ksi->ksi_signo; 350 psp = p->p_sigacts; 351 mtx_assert(&psp->ps_mtx, MA_OWNED); 352 regs = td->td_frame; 353 oonstack = sigonstack(regs->tf_esp); 354 355 /* Allocate space for the signal handler context. */ 356 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && 357 SIGISMEMBER(psp->ps_sigonstack, sig)) { 358 fp = (struct osigframe *)((uintptr_t)td->td_sigstk.ss_sp + 359 td->td_sigstk.ss_size - sizeof(struct osigframe)); 360 #if defined(COMPAT_43) 361 td->td_sigstk.ss_flags |= SS_ONSTACK; 362 #endif 363 } else 364 fp = (struct osigframe *)regs->tf_esp - 1; 365 366 /* Build the argument list for the signal handler. */ 367 sf.sf_signum = sig; 368 sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; 369 bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); 370 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 371 /* Signal handler installed with SA_SIGINFO. */ 372 sf.sf_arg2 = (register_t)&fp->sf_siginfo; 373 sf.sf_siginfo.si_signo = sig; 374 sf.sf_siginfo.si_code = ksi->ksi_code; 375 sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; 376 sf.sf_addr = 0; 377 } else { 378 /* Old FreeBSD-style arguments. */ 379 sf.sf_arg2 = ksi->ksi_code; 380 sf.sf_addr = (register_t)ksi->ksi_addr; 381 sf.sf_ahu.sf_handler = catcher; 382 } 383 mtx_unlock(&psp->ps_mtx); 384 PROC_UNLOCK(p); 385 386 /* Save most if not all of trap frame. */ 387 sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; 388 sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; 389 sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; 390 sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; 391 sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; 392 sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; 393 sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; 394 sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; 395 sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; 396 sf.sf_siginfo.si_sc.sc_es = regs->tf_es; 397 sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; 398 sf.sf_siginfo.si_sc.sc_gs = rgs(); 399 sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; 400 401 /* Build the signal context to be used by osigreturn(). */ 402 sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; 403 SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); 404 sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; 405 sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; 406 sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; 407 sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; 408 sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; 409 sf.sf_siginfo.si_sc.sc_err = regs->tf_err; 410 411 /* 412 * If we're a vm86 process, we want to save the segment registers. 413 * We also change eflags to be our emulated eflags, not the actual 414 * eflags. 415 */ 416 if (regs->tf_eflags & PSL_VM) { 417 /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ 418 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 419 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 420 421 sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; 422 sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; 423 sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; 424 sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; 425 426 if (vm86->vm86_has_vme == 0) 427 sf.sf_siginfo.si_sc.sc_ps = 428 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 429 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 430 431 /* See sendsig() for comments. */ 432 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 433 } 434 435 /* 436 * Copy the sigframe out to the user's stack. 437 */ 438 if (copyout(&sf, fp, sizeof(*fp)) != 0) { 439 PROC_LOCK(p); 440 sigexit(td, SIGILL); 441 } 442 443 regs->tf_esp = (int)fp; 444 if (p->p_sysent->sv_sigcode_base != 0) { 445 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 446 szosigcode; 447 } else { 448 /* a.out sysentvec does not use shared page */ 449 regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode; 450 } 451 regs->tf_eflags &= ~(PSL_T | PSL_D); 452 regs->tf_cs = _ucodesel; 453 regs->tf_ds = _udatasel; 454 regs->tf_es = _udatasel; 455 regs->tf_fs = _udatasel; 456 load_gs(_udatasel); 457 regs->tf_ss = _udatasel; 458 PROC_LOCK(p); 459 mtx_lock(&psp->ps_mtx); 460 } 461 #endif /* COMPAT_43 */ 462 463 #ifdef COMPAT_FREEBSD4 464 static void 465 freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 466 { 467 struct sigframe4 sf, *sfp; 468 struct proc *p; 469 struct thread *td; 470 struct sigacts *psp; 471 struct trapframe *regs; 472 int sig; 473 int oonstack; 474 475 td = curthread; 476 p = td->td_proc; 477 PROC_LOCK_ASSERT(p, MA_OWNED); 478 sig = ksi->ksi_signo; 479 psp = p->p_sigacts; 480 mtx_assert(&psp->ps_mtx, MA_OWNED); 481 regs = td->td_frame; 482 oonstack = sigonstack(regs->tf_esp); 483 484 /* Save user context. */ 485 bzero(&sf, sizeof(sf)); 486 sf.sf_uc.uc_sigmask = *mask; 487 sf.sf_uc.uc_stack = td->td_sigstk; 488 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 489 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 490 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 491 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 492 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 493 bzero(sf.sf_uc.uc_mcontext.mc_fpregs, 494 sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); 495 bzero(sf.sf_uc.uc_mcontext.__spare__, 496 sizeof(sf.sf_uc.uc_mcontext.__spare__)); 497 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 498 499 /* Allocate space for the signal handler context. */ 500 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 501 SIGISMEMBER(psp->ps_sigonstack, sig)) { 502 sfp = (struct sigframe4 *)((uintptr_t)td->td_sigstk.ss_sp + 503 td->td_sigstk.ss_size - sizeof(struct sigframe4)); 504 #if defined(COMPAT_43) 505 td->td_sigstk.ss_flags |= SS_ONSTACK; 506 #endif 507 } else 508 sfp = (struct sigframe4 *)regs->tf_esp - 1; 509 510 /* Build the argument list for the signal handler. */ 511 sf.sf_signum = sig; 512 sf.sf_ucontext = (register_t)&sfp->sf_uc; 513 bzero(&sf.sf_si, sizeof(sf.sf_si)); 514 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 515 /* Signal handler installed with SA_SIGINFO. */ 516 sf.sf_siginfo = (register_t)&sfp->sf_si; 517 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 518 519 /* Fill in POSIX parts */ 520 sf.sf_si.si_signo = sig; 521 sf.sf_si.si_code = ksi->ksi_code; 522 sf.sf_si.si_addr = ksi->ksi_addr; 523 } else { 524 /* Old FreeBSD-style arguments. */ 525 sf.sf_siginfo = ksi->ksi_code; 526 sf.sf_addr = (register_t)ksi->ksi_addr; 527 sf.sf_ahu.sf_handler = catcher; 528 } 529 mtx_unlock(&psp->ps_mtx); 530 PROC_UNLOCK(p); 531 532 /* 533 * If we're a vm86 process, we want to save the segment registers. 534 * We also change eflags to be our emulated eflags, not the actual 535 * eflags. 536 */ 537 if (regs->tf_eflags & PSL_VM) { 538 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 539 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 540 541 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 542 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 543 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 544 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 545 546 if (vm86->vm86_has_vme == 0) 547 sf.sf_uc.uc_mcontext.mc_eflags = 548 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 549 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 550 551 /* 552 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 553 * syscalls made by the signal handler. This just avoids 554 * wasting time for our lazy fixup of such faults. PSL_NT 555 * does nothing in vm86 mode, but vm86 programs can set it 556 * almost legitimately in probes for old cpu types. 557 */ 558 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 559 } 560 561 /* 562 * Copy the sigframe out to the user's stack. 563 */ 564 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 565 PROC_LOCK(p); 566 sigexit(td, SIGILL); 567 } 568 569 regs->tf_esp = (int)sfp; 570 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 571 szfreebsd4_sigcode; 572 regs->tf_eflags &= ~(PSL_T | PSL_D); 573 regs->tf_cs = _ucodesel; 574 regs->tf_ds = _udatasel; 575 regs->tf_es = _udatasel; 576 regs->tf_fs = _udatasel; 577 regs->tf_ss = _udatasel; 578 PROC_LOCK(p); 579 mtx_lock(&psp->ps_mtx); 580 } 581 #endif /* COMPAT_FREEBSD4 */ 582 583 void 584 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 585 { 586 struct sigframe sf, *sfp; 587 struct proc *p; 588 struct thread *td; 589 struct sigacts *psp; 590 char *sp; 591 struct trapframe *regs; 592 struct segment_descriptor *sdp; 593 char *xfpusave; 594 size_t xfpusave_len; 595 int sig; 596 int oonstack; 597 598 td = curthread; 599 p = td->td_proc; 600 PROC_LOCK_ASSERT(p, MA_OWNED); 601 sig = ksi->ksi_signo; 602 psp = p->p_sigacts; 603 mtx_assert(&psp->ps_mtx, MA_OWNED); 604 #ifdef COMPAT_FREEBSD4 605 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { 606 freebsd4_sendsig(catcher, ksi, mask); 607 return; 608 } 609 #endif 610 #ifdef COMPAT_43 611 if (SIGISMEMBER(psp->ps_osigset, sig)) { 612 osendsig(catcher, ksi, mask); 613 return; 614 } 615 #endif 616 regs = td->td_frame; 617 oonstack = sigonstack(regs->tf_esp); 618 619 if (cpu_max_ext_state_size > sizeof(union savefpu) && use_xsave) { 620 xfpusave_len = cpu_max_ext_state_size - sizeof(union savefpu); 621 xfpusave = __builtin_alloca(xfpusave_len); 622 } else { 623 xfpusave_len = 0; 624 xfpusave = NULL; 625 } 626 627 /* Save user context. */ 628 bzero(&sf, sizeof(sf)); 629 sf.sf_uc.uc_sigmask = *mask; 630 sf.sf_uc.uc_stack = td->td_sigstk; 631 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 632 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 633 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 634 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 635 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 636 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ 637 get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); 638 fpstate_drop(td); 639 /* 640 * Unconditionally fill the fsbase and gsbase into the mcontext. 641 */ 642 sdp = &td->td_pcb->pcb_fsd; 643 sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 | 644 sdp->sd_lobase; 645 sdp = &td->td_pcb->pcb_gsd; 646 sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | 647 sdp->sd_lobase; 648 bzero(sf.sf_uc.uc_mcontext.mc_spare2, 649 sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); 650 651 /* Allocate space for the signal handler context. */ 652 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 653 SIGISMEMBER(psp->ps_sigonstack, sig)) { 654 sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; 655 #if defined(COMPAT_43) 656 td->td_sigstk.ss_flags |= SS_ONSTACK; 657 #endif 658 } else 659 sp = (char *)regs->tf_esp - 128; 660 if (xfpusave != NULL) { 661 sp -= xfpusave_len; 662 sp = (char *)((unsigned int)sp & ~0x3F); 663 sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; 664 } 665 sp -= sizeof(struct sigframe); 666 667 /* Align to 16 bytes. */ 668 sfp = (struct sigframe *)((unsigned int)sp & ~0xF); 669 670 /* Build the argument list for the signal handler. */ 671 sf.sf_signum = sig; 672 sf.sf_ucontext = (register_t)&sfp->sf_uc; 673 bzero(&sf.sf_si, sizeof(sf.sf_si)); 674 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 675 /* Signal handler installed with SA_SIGINFO. */ 676 sf.sf_siginfo = (register_t)&sfp->sf_si; 677 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 678 679 /* Fill in POSIX parts */ 680 sf.sf_si = ksi->ksi_info; 681 sf.sf_si.si_signo = sig; /* maybe a translated signal */ 682 } else { 683 /* Old FreeBSD-style arguments. */ 684 sf.sf_siginfo = ksi->ksi_code; 685 sf.sf_addr = (register_t)ksi->ksi_addr; 686 sf.sf_ahu.sf_handler = catcher; 687 } 688 mtx_unlock(&psp->ps_mtx); 689 PROC_UNLOCK(p); 690 691 /* 692 * If we're a vm86 process, we want to save the segment registers. 693 * We also change eflags to be our emulated eflags, not the actual 694 * eflags. 695 */ 696 if (regs->tf_eflags & PSL_VM) { 697 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 698 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 699 700 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 701 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 702 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 703 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 704 705 if (vm86->vm86_has_vme == 0) 706 sf.sf_uc.uc_mcontext.mc_eflags = 707 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 708 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 709 710 /* 711 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 712 * syscalls made by the signal handler. This just avoids 713 * wasting time for our lazy fixup of such faults. PSL_NT 714 * does nothing in vm86 mode, but vm86 programs can set it 715 * almost legitimately in probes for old cpu types. 716 */ 717 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 718 } 719 720 /* 721 * Copy the sigframe out to the user's stack. 722 */ 723 if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || 724 (xfpusave != NULL && copyout(xfpusave, 725 (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) 726 != 0)) { 727 PROC_LOCK(p); 728 sigexit(td, SIGILL); 729 } 730 731 regs->tf_esp = (int)sfp; 732 regs->tf_eip = p->p_sysent->sv_sigcode_base; 733 if (regs->tf_eip == 0) 734 regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode; 735 regs->tf_eflags &= ~(PSL_T | PSL_D); 736 regs->tf_cs = _ucodesel; 737 regs->tf_ds = _udatasel; 738 regs->tf_es = _udatasel; 739 regs->tf_fs = _udatasel; 740 regs->tf_ss = _udatasel; 741 PROC_LOCK(p); 742 mtx_lock(&psp->ps_mtx); 743 } 744 745 /* 746 * System call to cleanup state after a signal 747 * has been taken. Reset signal mask and 748 * stack state from context left by sendsig (above). 749 * Return to previous pc and psl as specified by 750 * context left by sendsig. Check carefully to 751 * make sure that the user has not modified the 752 * state to gain improper privileges. 753 * 754 * MPSAFE 755 */ 756 #ifdef COMPAT_43 757 int 758 osigreturn(td, uap) 759 struct thread *td; 760 struct osigreturn_args /* { 761 struct osigcontext *sigcntxp; 762 } */ *uap; 763 { 764 struct osigcontext sc; 765 struct trapframe *regs; 766 struct osigcontext *scp; 767 int eflags, error; 768 ksiginfo_t ksi; 769 770 regs = td->td_frame; 771 error = copyin(uap->sigcntxp, &sc, sizeof(sc)); 772 if (error != 0) 773 return (error); 774 scp = ≻ 775 eflags = scp->sc_ps; 776 if (eflags & PSL_VM) { 777 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 778 struct vm86_kernel *vm86; 779 780 /* 781 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 782 * set up the vm86 area, and we can't enter vm86 mode. 783 */ 784 if (td->td_pcb->pcb_ext == 0) 785 return (EINVAL); 786 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 787 if (vm86->vm86_inited == 0) 788 return (EINVAL); 789 790 /* Go back to user mode if both flags are set. */ 791 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 792 ksiginfo_init_trap(&ksi); 793 ksi.ksi_signo = SIGBUS; 794 ksi.ksi_code = BUS_OBJERR; 795 ksi.ksi_addr = (void *)regs->tf_eip; 796 trapsignal(td, &ksi); 797 } 798 799 if (vm86->vm86_has_vme) { 800 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 801 (eflags & VME_USERCHANGE) | PSL_VM; 802 } else { 803 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 804 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 805 (eflags & VM_USERCHANGE) | PSL_VM; 806 } 807 tf->tf_vm86_ds = scp->sc_ds; 808 tf->tf_vm86_es = scp->sc_es; 809 tf->tf_vm86_fs = scp->sc_fs; 810 tf->tf_vm86_gs = scp->sc_gs; 811 tf->tf_ds = _udatasel; 812 tf->tf_es = _udatasel; 813 tf->tf_fs = _udatasel; 814 } else { 815 /* 816 * Don't allow users to change privileged or reserved flags. 817 */ 818 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 819 return (EINVAL); 820 } 821 822 /* 823 * Don't allow users to load a valid privileged %cs. Let the 824 * hardware check for invalid selectors, excess privilege in 825 * other selectors, invalid %eip's and invalid %esp's. 826 */ 827 if (!CS_SECURE(scp->sc_cs)) { 828 ksiginfo_init_trap(&ksi); 829 ksi.ksi_signo = SIGBUS; 830 ksi.ksi_code = BUS_OBJERR; 831 ksi.ksi_trapno = T_PROTFLT; 832 ksi.ksi_addr = (void *)regs->tf_eip; 833 trapsignal(td, &ksi); 834 return (EINVAL); 835 } 836 regs->tf_ds = scp->sc_ds; 837 regs->tf_es = scp->sc_es; 838 regs->tf_fs = scp->sc_fs; 839 } 840 841 /* Restore remaining registers. */ 842 regs->tf_eax = scp->sc_eax; 843 regs->tf_ebx = scp->sc_ebx; 844 regs->tf_ecx = scp->sc_ecx; 845 regs->tf_edx = scp->sc_edx; 846 regs->tf_esi = scp->sc_esi; 847 regs->tf_edi = scp->sc_edi; 848 regs->tf_cs = scp->sc_cs; 849 regs->tf_ss = scp->sc_ss; 850 regs->tf_isp = scp->sc_isp; 851 regs->tf_ebp = scp->sc_fp; 852 regs->tf_esp = scp->sc_sp; 853 regs->tf_eip = scp->sc_pc; 854 regs->tf_eflags = eflags; 855 856 #if defined(COMPAT_43) 857 if (scp->sc_onstack & 1) 858 td->td_sigstk.ss_flags |= SS_ONSTACK; 859 else 860 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 861 #endif 862 kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, 863 SIGPROCMASK_OLD); 864 return (EJUSTRETURN); 865 } 866 #endif /* COMPAT_43 */ 867 868 #ifdef COMPAT_FREEBSD4 869 /* 870 * MPSAFE 871 */ 872 int 873 freebsd4_sigreturn(td, uap) 874 struct thread *td; 875 struct freebsd4_sigreturn_args /* { 876 const ucontext4 *sigcntxp; 877 } */ *uap; 878 { 879 struct ucontext4 uc; 880 struct trapframe *regs; 881 struct ucontext4 *ucp; 882 int cs, eflags, error; 883 ksiginfo_t ksi; 884 885 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 886 if (error != 0) 887 return (error); 888 ucp = &uc; 889 regs = td->td_frame; 890 eflags = ucp->uc_mcontext.mc_eflags; 891 if (eflags & PSL_VM) { 892 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 893 struct vm86_kernel *vm86; 894 895 /* 896 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 897 * set up the vm86 area, and we can't enter vm86 mode. 898 */ 899 if (td->td_pcb->pcb_ext == 0) 900 return (EINVAL); 901 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 902 if (vm86->vm86_inited == 0) 903 return (EINVAL); 904 905 /* Go back to user mode if both flags are set. */ 906 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 907 ksiginfo_init_trap(&ksi); 908 ksi.ksi_signo = SIGBUS; 909 ksi.ksi_code = BUS_OBJERR; 910 ksi.ksi_addr = (void *)regs->tf_eip; 911 trapsignal(td, &ksi); 912 } 913 if (vm86->vm86_has_vme) { 914 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 915 (eflags & VME_USERCHANGE) | PSL_VM; 916 } else { 917 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 918 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 919 (eflags & VM_USERCHANGE) | PSL_VM; 920 } 921 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 922 tf->tf_eflags = eflags; 923 tf->tf_vm86_ds = tf->tf_ds; 924 tf->tf_vm86_es = tf->tf_es; 925 tf->tf_vm86_fs = tf->tf_fs; 926 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 927 tf->tf_ds = _udatasel; 928 tf->tf_es = _udatasel; 929 tf->tf_fs = _udatasel; 930 } else { 931 /* 932 * Don't allow users to change privileged or reserved flags. 933 */ 934 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 935 uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", 936 td->td_proc->p_pid, td->td_name, eflags); 937 return (EINVAL); 938 } 939 940 /* 941 * Don't allow users to load a valid privileged %cs. Let the 942 * hardware check for invalid selectors, excess privilege in 943 * other selectors, invalid %eip's and invalid %esp's. 944 */ 945 cs = ucp->uc_mcontext.mc_cs; 946 if (!CS_SECURE(cs)) { 947 uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", 948 td->td_proc->p_pid, td->td_name, cs); 949 ksiginfo_init_trap(&ksi); 950 ksi.ksi_signo = SIGBUS; 951 ksi.ksi_code = BUS_OBJERR; 952 ksi.ksi_trapno = T_PROTFLT; 953 ksi.ksi_addr = (void *)regs->tf_eip; 954 trapsignal(td, &ksi); 955 return (EINVAL); 956 } 957 958 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 959 } 960 961 #if defined(COMPAT_43) 962 if (ucp->uc_mcontext.mc_onstack & 1) 963 td->td_sigstk.ss_flags |= SS_ONSTACK; 964 else 965 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 966 #endif 967 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 968 return (EJUSTRETURN); 969 } 970 #endif /* COMPAT_FREEBSD4 */ 971 972 /* 973 * MPSAFE 974 */ 975 int 976 sys_sigreturn(td, uap) 977 struct thread *td; 978 struct sigreturn_args /* { 979 const struct __ucontext *sigcntxp; 980 } */ *uap; 981 { 982 ucontext_t uc; 983 struct proc *p; 984 struct trapframe *regs; 985 ucontext_t *ucp; 986 char *xfpustate; 987 size_t xfpustate_len; 988 int cs, eflags, error, ret; 989 ksiginfo_t ksi; 990 991 p = td->td_proc; 992 993 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 994 if (error != 0) 995 return (error); 996 ucp = &uc; 997 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { 998 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, 999 td->td_name, ucp->uc_mcontext.mc_flags); 1000 return (EINVAL); 1001 } 1002 regs = td->td_frame; 1003 eflags = ucp->uc_mcontext.mc_eflags; 1004 if (eflags & PSL_VM) { 1005 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 1006 struct vm86_kernel *vm86; 1007 1008 /* 1009 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 1010 * set up the vm86 area, and we can't enter vm86 mode. 1011 */ 1012 if (td->td_pcb->pcb_ext == 0) 1013 return (EINVAL); 1014 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 1015 if (vm86->vm86_inited == 0) 1016 return (EINVAL); 1017 1018 /* Go back to user mode if both flags are set. */ 1019 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 1020 ksiginfo_init_trap(&ksi); 1021 ksi.ksi_signo = SIGBUS; 1022 ksi.ksi_code = BUS_OBJERR; 1023 ksi.ksi_addr = (void *)regs->tf_eip; 1024 trapsignal(td, &ksi); 1025 } 1026 1027 if (vm86->vm86_has_vme) { 1028 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 1029 (eflags & VME_USERCHANGE) | PSL_VM; 1030 } else { 1031 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 1032 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 1033 (eflags & VM_USERCHANGE) | PSL_VM; 1034 } 1035 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 1036 tf->tf_eflags = eflags; 1037 tf->tf_vm86_ds = tf->tf_ds; 1038 tf->tf_vm86_es = tf->tf_es; 1039 tf->tf_vm86_fs = tf->tf_fs; 1040 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 1041 tf->tf_ds = _udatasel; 1042 tf->tf_es = _udatasel; 1043 tf->tf_fs = _udatasel; 1044 } else { 1045 /* 1046 * Don't allow users to change privileged or reserved flags. 1047 */ 1048 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 1049 uprintf("pid %d (%s): sigreturn eflags = 0x%x\n", 1050 td->td_proc->p_pid, td->td_name, eflags); 1051 return (EINVAL); 1052 } 1053 1054 /* 1055 * Don't allow users to load a valid privileged %cs. Let the 1056 * hardware check for invalid selectors, excess privilege in 1057 * other selectors, invalid %eip's and invalid %esp's. 1058 */ 1059 cs = ucp->uc_mcontext.mc_cs; 1060 if (!CS_SECURE(cs)) { 1061 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", 1062 td->td_proc->p_pid, td->td_name, cs); 1063 ksiginfo_init_trap(&ksi); 1064 ksi.ksi_signo = SIGBUS; 1065 ksi.ksi_code = BUS_OBJERR; 1066 ksi.ksi_trapno = T_PROTFLT; 1067 ksi.ksi_addr = (void *)regs->tf_eip; 1068 trapsignal(td, &ksi); 1069 return (EINVAL); 1070 } 1071 1072 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { 1073 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; 1074 if (xfpustate_len > cpu_max_ext_state_size - 1075 sizeof(union savefpu)) { 1076 uprintf( 1077 "pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", 1078 p->p_pid, td->td_name, xfpustate_len); 1079 return (EINVAL); 1080 } 1081 xfpustate = __builtin_alloca(xfpustate_len); 1082 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, 1083 xfpustate, xfpustate_len); 1084 if (error != 0) { 1085 uprintf( 1086 "pid %d (%s): sigreturn copying xfpustate failed\n", 1087 p->p_pid, td->td_name); 1088 return (error); 1089 } 1090 } else { 1091 xfpustate = NULL; 1092 xfpustate_len = 0; 1093 } 1094 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, 1095 xfpustate_len); 1096 if (ret != 0) 1097 return (ret); 1098 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 1099 } 1100 1101 #if defined(COMPAT_43) 1102 if (ucp->uc_mcontext.mc_onstack & 1) 1103 td->td_sigstk.ss_flags |= SS_ONSTACK; 1104 else 1105 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 1106 #endif 1107 1108 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 1109 return (EJUSTRETURN); 1110 } 1111 1112 #ifdef COMPAT_43 1113 static void 1114 setup_priv_lcall_gate(struct proc *p) 1115 { 1116 struct i386_ldt_args uap; 1117 union descriptor desc; 1118 u_int lcall_addr; 1119 1120 bzero(&uap, sizeof(uap)); 1121 uap.start = 0; 1122 uap.num = 1; 1123 lcall_addr = p->p_sysent->sv_psstrings - sz_lcall_tramp; 1124 bzero(&desc, sizeof(desc)); 1125 desc.sd.sd_type = SDT_MEMERA; 1126 desc.sd.sd_dpl = SEL_UPL; 1127 desc.sd.sd_p = 1; 1128 desc.sd.sd_def32 = 1; 1129 desc.sd.sd_gran = 1; 1130 desc.sd.sd_lolimit = 0xffff; 1131 desc.sd.sd_hilimit = 0xf; 1132 desc.sd.sd_lobase = lcall_addr; 1133 desc.sd.sd_hibase = lcall_addr >> 24; 1134 i386_set_ldt(curthread, &uap, &desc); 1135 } 1136 #endif 1137 1138 /* 1139 * Reset registers to default values on exec. 1140 */ 1141 void 1142 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) 1143 { 1144 struct trapframe *regs; 1145 struct pcb *pcb; 1146 register_t saved_eflags; 1147 1148 regs = td->td_frame; 1149 pcb = td->td_pcb; 1150 1151 /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ 1152 pcb->pcb_gs = _udatasel; 1153 load_gs(_udatasel); 1154 1155 mtx_lock_spin(&dt_lock); 1156 if (td->td_proc->p_md.md_ldt != NULL) 1157 user_ldt_free(td); 1158 else 1159 mtx_unlock_spin(&dt_lock); 1160 1161 #ifdef COMPAT_43 1162 if (td->td_proc->p_sysent->sv_psstrings != 1163 elf32_freebsd_sysvec.sv_psstrings) 1164 setup_priv_lcall_gate(td->td_proc); 1165 #endif 1166 1167 /* 1168 * Reset the fs and gs bases. The values from the old address 1169 * space do not make sense for the new program. In particular, 1170 * gsbase might be the TLS base for the old program but the new 1171 * program has no TLS now. 1172 */ 1173 set_fsbase(td, 0); 1174 set_gsbase(td, 0); 1175 1176 /* Make sure edx is 0x0 on entry. Linux binaries depend on it. */ 1177 saved_eflags = regs->tf_eflags & PSL_T; 1178 bzero((char *)regs, sizeof(struct trapframe)); 1179 regs->tf_eip = imgp->entry_addr; 1180 regs->tf_esp = stack; 1181 regs->tf_eflags = PSL_USER | saved_eflags; 1182 regs->tf_ss = _udatasel; 1183 regs->tf_ds = _udatasel; 1184 regs->tf_es = _udatasel; 1185 regs->tf_fs = _udatasel; 1186 regs->tf_cs = _ucodesel; 1187 1188 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ 1189 regs->tf_ebx = imgp->ps_strings; 1190 1191 /* 1192 * Reset the hardware debug registers if they were in use. 1193 * They won't have any meaning for the newly exec'd process. 1194 */ 1195 if (pcb->pcb_flags & PCB_DBREGS) { 1196 pcb->pcb_dr0 = 0; 1197 pcb->pcb_dr1 = 0; 1198 pcb->pcb_dr2 = 0; 1199 pcb->pcb_dr3 = 0; 1200 pcb->pcb_dr6 = 0; 1201 pcb->pcb_dr7 = 0; 1202 if (pcb == curpcb) { 1203 /* 1204 * Clear the debug registers on the running 1205 * CPU, otherwise they will end up affecting 1206 * the next process we switch to. 1207 */ 1208 reset_dbregs(); 1209 } 1210 pcb->pcb_flags &= ~PCB_DBREGS; 1211 } 1212 1213 pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; 1214 1215 /* 1216 * Drop the FP state if we hold it, so that the process gets a 1217 * clean FP state if it uses the FPU again. 1218 */ 1219 fpstate_drop(td); 1220 } 1221 1222 void 1223 cpu_setregs(void) 1224 { 1225 unsigned int cr0; 1226 1227 cr0 = rcr0(); 1228 1229 /* 1230 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: 1231 * 1232 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT 1233 * instructions. We must set the CR0_MP bit and use the CR0_TS 1234 * bit to control the trap, because setting the CR0_EM bit does 1235 * not cause WAIT instructions to trap. It's important to trap 1236 * WAIT instructions - otherwise the "wait" variants of no-wait 1237 * control instructions would degenerate to the "no-wait" variants 1238 * after FP context switches but work correctly otherwise. It's 1239 * particularly important to trap WAITs when there is no NPX - 1240 * otherwise the "wait" variants would always degenerate. 1241 * 1242 * Try setting CR0_NE to get correct error reporting on 486DX's. 1243 * Setting it should fail or do nothing on lesser processors. 1244 */ 1245 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; 1246 load_cr0(cr0); 1247 load_gs(_udatasel); 1248 } 1249 1250 u_long bootdev; /* not a struct cdev *- encoding is different */ 1251 SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, 1252 CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); 1253 1254 static char bootmethod[16] = "BIOS"; 1255 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, 1256 "System firmware boot method"); 1257 1258 /* 1259 * Initialize 386 and configure to run kernel 1260 */ 1261 1262 /* 1263 * Initialize segments & interrupt table 1264 */ 1265 1266 int _default_ldt; 1267 1268 struct mtx dt_lock; /* lock for GDT and LDT */ 1269 1270 union descriptor gdt0[NGDT]; /* initial global descriptor table */ 1271 union descriptor *gdt = gdt0; /* global descriptor table */ 1272 1273 union descriptor *ldt; /* local descriptor table */ 1274 1275 static struct gate_descriptor idt0[NIDT]; 1276 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ 1277 1278 static struct i386tss *dblfault_tss; 1279 static char *dblfault_stack; 1280 1281 static struct i386tss common_tss0; 1282 1283 vm_offset_t proc0kstack; 1284 1285 /* 1286 * software prototypes -- in more palatable form. 1287 * 1288 * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret 1289 * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) 1290 */ 1291 struct soft_segment_descriptor gdt_segs[] = { 1292 /* GNULL_SEL 0 Null Descriptor */ 1293 { .ssd_base = 0x0, 1294 .ssd_limit = 0x0, 1295 .ssd_type = 0, 1296 .ssd_dpl = SEL_KPL, 1297 .ssd_p = 0, 1298 .ssd_xx = 0, .ssd_xx1 = 0, 1299 .ssd_def32 = 0, 1300 .ssd_gran = 0 }, 1301 /* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ 1302 { .ssd_base = 0x0, 1303 .ssd_limit = 0xfffff, 1304 .ssd_type = SDT_MEMRWA, 1305 .ssd_dpl = SEL_KPL, 1306 .ssd_p = 1, 1307 .ssd_xx = 0, .ssd_xx1 = 0, 1308 .ssd_def32 = 1, 1309 .ssd_gran = 1 }, 1310 /* GUFS_SEL 2 %fs Descriptor for user */ 1311 { .ssd_base = 0x0, 1312 .ssd_limit = 0xfffff, 1313 .ssd_type = SDT_MEMRWA, 1314 .ssd_dpl = SEL_UPL, 1315 .ssd_p = 1, 1316 .ssd_xx = 0, .ssd_xx1 = 0, 1317 .ssd_def32 = 1, 1318 .ssd_gran = 1 }, 1319 /* GUGS_SEL 3 %gs Descriptor for user */ 1320 { .ssd_base = 0x0, 1321 .ssd_limit = 0xfffff, 1322 .ssd_type = SDT_MEMRWA, 1323 .ssd_dpl = SEL_UPL, 1324 .ssd_p = 1, 1325 .ssd_xx = 0, .ssd_xx1 = 0, 1326 .ssd_def32 = 1, 1327 .ssd_gran = 1 }, 1328 /* GCODE_SEL 4 Code Descriptor for kernel */ 1329 { .ssd_base = 0x0, 1330 .ssd_limit = 0xfffff, 1331 .ssd_type = SDT_MEMERA, 1332 .ssd_dpl = SEL_KPL, 1333 .ssd_p = 1, 1334 .ssd_xx = 0, .ssd_xx1 = 0, 1335 .ssd_def32 = 1, 1336 .ssd_gran = 1 }, 1337 /* GDATA_SEL 5 Data Descriptor for kernel */ 1338 { .ssd_base = 0x0, 1339 .ssd_limit = 0xfffff, 1340 .ssd_type = SDT_MEMRWA, 1341 .ssd_dpl = SEL_KPL, 1342 .ssd_p = 1, 1343 .ssd_xx = 0, .ssd_xx1 = 0, 1344 .ssd_def32 = 1, 1345 .ssd_gran = 1 }, 1346 /* GUCODE_SEL 6 Code Descriptor for user */ 1347 { .ssd_base = 0x0, 1348 .ssd_limit = 0xfffff, 1349 .ssd_type = SDT_MEMERA, 1350 .ssd_dpl = SEL_UPL, 1351 .ssd_p = 1, 1352 .ssd_xx = 0, .ssd_xx1 = 0, 1353 .ssd_def32 = 1, 1354 .ssd_gran = 1 }, 1355 /* GUDATA_SEL 7 Data Descriptor for user */ 1356 { .ssd_base = 0x0, 1357 .ssd_limit = 0xfffff, 1358 .ssd_type = SDT_MEMRWA, 1359 .ssd_dpl = SEL_UPL, 1360 .ssd_p = 1, 1361 .ssd_xx = 0, .ssd_xx1 = 0, 1362 .ssd_def32 = 1, 1363 .ssd_gran = 1 }, 1364 /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ 1365 { .ssd_base = 0x400, 1366 .ssd_limit = 0xfffff, 1367 .ssd_type = SDT_MEMRWA, 1368 .ssd_dpl = SEL_KPL, 1369 .ssd_p = 1, 1370 .ssd_xx = 0, .ssd_xx1 = 0, 1371 .ssd_def32 = 1, 1372 .ssd_gran = 1 }, 1373 /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ 1374 { 1375 .ssd_base = 0x0, 1376 .ssd_limit = sizeof(struct i386tss)-1, 1377 .ssd_type = SDT_SYS386TSS, 1378 .ssd_dpl = 0, 1379 .ssd_p = 1, 1380 .ssd_xx = 0, .ssd_xx1 = 0, 1381 .ssd_def32 = 0, 1382 .ssd_gran = 0 }, 1383 /* GLDT_SEL 10 LDT Descriptor */ 1384 { .ssd_base = 0, 1385 .ssd_limit = sizeof(union descriptor) * NLDT - 1, 1386 .ssd_type = SDT_SYSLDT, 1387 .ssd_dpl = SEL_UPL, 1388 .ssd_p = 1, 1389 .ssd_xx = 0, .ssd_xx1 = 0, 1390 .ssd_def32 = 0, 1391 .ssd_gran = 0 }, 1392 /* GUSERLDT_SEL 11 User LDT Descriptor per process */ 1393 { .ssd_base = 0, 1394 .ssd_limit = (512 * sizeof(union descriptor)-1), 1395 .ssd_type = SDT_SYSLDT, 1396 .ssd_dpl = 0, 1397 .ssd_p = 1, 1398 .ssd_xx = 0, .ssd_xx1 = 0, 1399 .ssd_def32 = 0, 1400 .ssd_gran = 0 }, 1401 /* GPANIC_SEL 12 Panic Tss Descriptor */ 1402 { .ssd_base = 0, 1403 .ssd_limit = sizeof(struct i386tss)-1, 1404 .ssd_type = SDT_SYS386TSS, 1405 .ssd_dpl = 0, 1406 .ssd_p = 1, 1407 .ssd_xx = 0, .ssd_xx1 = 0, 1408 .ssd_def32 = 0, 1409 .ssd_gran = 0 }, 1410 /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ 1411 { .ssd_base = 0, 1412 .ssd_limit = 0xfffff, 1413 .ssd_type = SDT_MEMERA, 1414 .ssd_dpl = 0, 1415 .ssd_p = 1, 1416 .ssd_xx = 0, .ssd_xx1 = 0, 1417 .ssd_def32 = 0, 1418 .ssd_gran = 1 }, 1419 /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ 1420 { .ssd_base = 0, 1421 .ssd_limit = 0xfffff, 1422 .ssd_type = SDT_MEMERA, 1423 .ssd_dpl = 0, 1424 .ssd_p = 1, 1425 .ssd_xx = 0, .ssd_xx1 = 0, 1426 .ssd_def32 = 0, 1427 .ssd_gran = 1 }, 1428 /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ 1429 { .ssd_base = 0, 1430 .ssd_limit = 0xfffff, 1431 .ssd_type = SDT_MEMRWA, 1432 .ssd_dpl = 0, 1433 .ssd_p = 1, 1434 .ssd_xx = 0, .ssd_xx1 = 0, 1435 .ssd_def32 = 1, 1436 .ssd_gran = 1 }, 1437 /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ 1438 { .ssd_base = 0, 1439 .ssd_limit = 0xfffff, 1440 .ssd_type = SDT_MEMRWA, 1441 .ssd_dpl = 0, 1442 .ssd_p = 1, 1443 .ssd_xx = 0, .ssd_xx1 = 0, 1444 .ssd_def32 = 0, 1445 .ssd_gran = 1 }, 1446 /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ 1447 { .ssd_base = 0, 1448 .ssd_limit = 0xfffff, 1449 .ssd_type = SDT_MEMRWA, 1450 .ssd_dpl = 0, 1451 .ssd_p = 1, 1452 .ssd_xx = 0, .ssd_xx1 = 0, 1453 .ssd_def32 = 0, 1454 .ssd_gran = 1 }, 1455 /* GNDIS_SEL 18 NDIS Descriptor */ 1456 { .ssd_base = 0x0, 1457 .ssd_limit = 0x0, 1458 .ssd_type = 0, 1459 .ssd_dpl = 0, 1460 .ssd_p = 0, 1461 .ssd_xx = 0, .ssd_xx1 = 0, 1462 .ssd_def32 = 0, 1463 .ssd_gran = 0 }, 1464 }; 1465 1466 static struct soft_segment_descriptor ldt_segs[] = { 1467 /* Null Descriptor - overwritten by call gate */ 1468 { .ssd_base = 0x0, 1469 .ssd_limit = 0x0, 1470 .ssd_type = 0, 1471 .ssd_dpl = 0, 1472 .ssd_p = 0, 1473 .ssd_xx = 0, .ssd_xx1 = 0, 1474 .ssd_def32 = 0, 1475 .ssd_gran = 0 }, 1476 /* Null Descriptor - overwritten by call gate */ 1477 { .ssd_base = 0x0, 1478 .ssd_limit = 0x0, 1479 .ssd_type = 0, 1480 .ssd_dpl = 0, 1481 .ssd_p = 0, 1482 .ssd_xx = 0, .ssd_xx1 = 0, 1483 .ssd_def32 = 0, 1484 .ssd_gran = 0 }, 1485 /* Null Descriptor - overwritten by call gate */ 1486 { .ssd_base = 0x0, 1487 .ssd_limit = 0x0, 1488 .ssd_type = 0, 1489 .ssd_dpl = 0, 1490 .ssd_p = 0, 1491 .ssd_xx = 0, .ssd_xx1 = 0, 1492 .ssd_def32 = 0, 1493 .ssd_gran = 0 }, 1494 /* Code Descriptor for user */ 1495 { .ssd_base = 0x0, 1496 .ssd_limit = 0xfffff, 1497 .ssd_type = SDT_MEMERA, 1498 .ssd_dpl = SEL_UPL, 1499 .ssd_p = 1, 1500 .ssd_xx = 0, .ssd_xx1 = 0, 1501 .ssd_def32 = 1, 1502 .ssd_gran = 1 }, 1503 /* Null Descriptor - overwritten by call gate */ 1504 { .ssd_base = 0x0, 1505 .ssd_limit = 0x0, 1506 .ssd_type = 0, 1507 .ssd_dpl = 0, 1508 .ssd_p = 0, 1509 .ssd_xx = 0, .ssd_xx1 = 0, 1510 .ssd_def32 = 0, 1511 .ssd_gran = 0 }, 1512 /* Data Descriptor for user */ 1513 { .ssd_base = 0x0, 1514 .ssd_limit = 0xfffff, 1515 .ssd_type = SDT_MEMRWA, 1516 .ssd_dpl = SEL_UPL, 1517 .ssd_p = 1, 1518 .ssd_xx = 0, .ssd_xx1 = 0, 1519 .ssd_def32 = 1, 1520 .ssd_gran = 1 }, 1521 }; 1522 1523 uintptr_t setidt_disp; 1524 1525 void 1526 setidt(int idx, inthand_t *func, int typ, int dpl, int selec) 1527 { 1528 uintptr_t off; 1529 1530 off = func != NULL ? (uintptr_t)func + setidt_disp : 0; 1531 setidt_nodisp(idx, off, typ, dpl, selec); 1532 } 1533 1534 void 1535 setidt_nodisp(int idx, uintptr_t off, int typ, int dpl, int selec) 1536 { 1537 struct gate_descriptor *ip; 1538 1539 ip = idt + idx; 1540 ip->gd_looffset = off; 1541 ip->gd_selector = selec; 1542 ip->gd_stkcpy = 0; 1543 ip->gd_xx = 0; 1544 ip->gd_type = typ; 1545 ip->gd_dpl = dpl; 1546 ip->gd_p = 1; 1547 ip->gd_hioffset = ((u_int)off) >> 16 ; 1548 } 1549 1550 extern inthand_t 1551 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), 1552 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), 1553 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), 1554 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), 1555 IDTVEC(xmm), 1556 #ifdef KDTRACE_HOOKS 1557 IDTVEC(dtrace_ret), 1558 #endif 1559 #ifdef XENHVM 1560 IDTVEC(xen_intr_upcall), 1561 #endif 1562 IDTVEC(int0x80_syscall); 1563 1564 #ifdef DDB 1565 /* 1566 * Display the index and function name of any IDT entries that don't use 1567 * the default 'rsvd' entry point. 1568 */ 1569 DB_SHOW_COMMAND(idt, db_show_idt) 1570 { 1571 struct gate_descriptor *ip; 1572 int idx; 1573 uintptr_t func, func_trm; 1574 bool trm; 1575 1576 ip = idt; 1577 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { 1578 if (ip->gd_type == SDT_SYSTASKGT) { 1579 db_printf("%3d\t<TASK>\n", idx); 1580 } else { 1581 func = (ip->gd_hioffset << 16 | ip->gd_looffset); 1582 if (func >= PMAP_TRM_MIN_ADDRESS) { 1583 func_trm = func; 1584 func -= setidt_disp; 1585 trm = true; 1586 } else 1587 trm = false; 1588 if (func != (uintptr_t)&IDTVEC(rsvd)) { 1589 db_printf("%3d\t", idx); 1590 db_printsym(func, DB_STGY_PROC); 1591 if (trm) 1592 db_printf(" (trampoline %#x)", 1593 func_trm); 1594 db_printf("\n"); 1595 } 1596 } 1597 ip++; 1598 } 1599 } 1600 1601 /* Show privileged registers. */ 1602 DB_SHOW_COMMAND(sysregs, db_show_sysregs) 1603 { 1604 uint64_t idtr, gdtr; 1605 1606 idtr = ridt(); 1607 db_printf("idtr\t0x%08x/%04x\n", 1608 (u_int)(idtr >> 16), (u_int)idtr & 0xffff); 1609 gdtr = rgdt(); 1610 db_printf("gdtr\t0x%08x/%04x\n", 1611 (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff); 1612 db_printf("ldtr\t0x%04x\n", rldt()); 1613 db_printf("tr\t0x%04x\n", rtr()); 1614 db_printf("cr0\t0x%08x\n", rcr0()); 1615 db_printf("cr2\t0x%08x\n", rcr2()); 1616 db_printf("cr3\t0x%08x\n", rcr3()); 1617 db_printf("cr4\t0x%08x\n", rcr4()); 1618 if (rcr4() & CR4_XSAVE) 1619 db_printf("xcr0\t0x%016llx\n", rxcr(0)); 1620 if (amd_feature & (AMDID_NX | AMDID_LM)) 1621 db_printf("EFER\t0x%016llx\n", rdmsr(MSR_EFER)); 1622 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) 1623 db_printf("FEATURES_CTL\t0x%016llx\n", 1624 rdmsr(MSR_IA32_FEATURE_CONTROL)); 1625 if ((cpu_vendor_id == CPU_VENDOR_INTEL || 1626 cpu_vendor_id == CPU_VENDOR_AMD) && CPUID_TO_FAMILY(cpu_id) >= 6) 1627 db_printf("DEBUG_CTL\t0x%016llx\n", rdmsr(MSR_DEBUGCTLMSR)); 1628 if (cpu_feature & CPUID_PAT) 1629 db_printf("PAT\t0x%016llx\n", rdmsr(MSR_PAT)); 1630 } 1631 1632 DB_SHOW_COMMAND(dbregs, db_show_dbregs) 1633 { 1634 1635 db_printf("dr0\t0x%08x\n", rdr0()); 1636 db_printf("dr1\t0x%08x\n", rdr1()); 1637 db_printf("dr2\t0x%08x\n", rdr2()); 1638 db_printf("dr3\t0x%08x\n", rdr3()); 1639 db_printf("dr6\t0x%08x\n", rdr6()); 1640 db_printf("dr7\t0x%08x\n", rdr7()); 1641 } 1642 1643 DB_SHOW_COMMAND(frame, db_show_frame) 1644 { 1645 struct trapframe *frame; 1646 1647 frame = have_addr ? (struct trapframe *)addr : curthread->td_frame; 1648 printf("ss %#x esp %#x efl %#x cs %#x eip %#x\n", 1649 frame->tf_ss, frame->tf_esp, frame->tf_eflags, frame->tf_cs, 1650 frame->tf_eip); 1651 printf("err %#x trapno %d\n", frame->tf_err, frame->tf_trapno); 1652 printf("ds %#x es %#x fs %#x\n", 1653 frame->tf_ds, frame->tf_es, frame->tf_fs); 1654 printf("eax %#x ecx %#x edx %#x ebx %#x\n", 1655 frame->tf_eax, frame->tf_ecx, frame->tf_edx, frame->tf_ebx); 1656 printf("ebp %#x esi %#x edi %#x\n", 1657 frame->tf_ebp, frame->tf_esi, frame->tf_edi); 1658 1659 } 1660 #endif 1661 1662 void 1663 sdtossd(sd, ssd) 1664 struct segment_descriptor *sd; 1665 struct soft_segment_descriptor *ssd; 1666 { 1667 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 1668 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 1669 ssd->ssd_type = sd->sd_type; 1670 ssd->ssd_dpl = sd->sd_dpl; 1671 ssd->ssd_p = sd->sd_p; 1672 ssd->ssd_def32 = sd->sd_def32; 1673 ssd->ssd_gran = sd->sd_gran; 1674 } 1675 1676 static int 1677 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, 1678 int *physmap_idxp) 1679 { 1680 uint64_t lim, ign; 1681 int i, insert_idx, physmap_idx; 1682 1683 physmap_idx = *physmap_idxp; 1684 1685 if (length == 0) 1686 return (1); 1687 1688 lim = 0x100000000; /* 4G */ 1689 if (pae_mode && above4g_allow) 1690 lim = above24g_allow ? -1ULL : 0x600000000; /* 24G */ 1691 if (base >= lim) { 1692 printf("%uK of memory above %uGB ignored, pae %d " 1693 "above4g_allow %d above24g_allow %d\n", 1694 (u_int)(length / 1024), (u_int)(lim >> 30), pae_mode, 1695 above4g_allow, above24g_allow); 1696 return (1); 1697 } 1698 if (base + length >= lim) { 1699 ign = base + length - lim; 1700 length -= ign; 1701 printf("%uK of memory above %uGB ignored, pae %d " 1702 "above4g_allow %d above24g_allow %d\n", 1703 (u_int)(ign / 1024), (u_int)(lim >> 30), pae_mode, 1704 above4g_allow, above24g_allow); 1705 } 1706 1707 /* 1708 * Find insertion point while checking for overlap. Start off by 1709 * assuming the new entry will be added to the end. 1710 */ 1711 insert_idx = physmap_idx + 2; 1712 for (i = 0; i <= physmap_idx; i += 2) { 1713 if (base < physmap[i + 1]) { 1714 if (base + length <= physmap[i]) { 1715 insert_idx = i; 1716 break; 1717 } 1718 if (boothowto & RB_VERBOSE) 1719 printf( 1720 "Overlapping memory regions, ignoring second region\n"); 1721 return (1); 1722 } 1723 } 1724 1725 /* See if we can prepend to the next entry. */ 1726 if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { 1727 physmap[insert_idx] = base; 1728 return (1); 1729 } 1730 1731 /* See if we can append to the previous entry. */ 1732 if (insert_idx > 0 && base == physmap[insert_idx - 1]) { 1733 physmap[insert_idx - 1] += length; 1734 return (1); 1735 } 1736 1737 physmap_idx += 2; 1738 *physmap_idxp = physmap_idx; 1739 if (physmap_idx == PHYSMAP_SIZE) { 1740 printf( 1741 "Too many segments in the physical address map, giving up\n"); 1742 return (0); 1743 } 1744 1745 /* 1746 * Move the last 'N' entries down to make room for the new 1747 * entry if needed. 1748 */ 1749 for (i = physmap_idx; i > insert_idx; i -= 2) { 1750 physmap[i] = physmap[i - 2]; 1751 physmap[i + 1] = physmap[i - 1]; 1752 } 1753 1754 /* Insert the new entry. */ 1755 physmap[insert_idx] = base; 1756 physmap[insert_idx + 1] = base + length; 1757 return (1); 1758 } 1759 1760 static int 1761 add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) 1762 { 1763 if (boothowto & RB_VERBOSE) 1764 printf("SMAP type=%02x base=%016llx len=%016llx\n", 1765 smap->type, smap->base, smap->length); 1766 1767 if (smap->type != SMAP_TYPE_MEMORY) 1768 return (1); 1769 1770 return (add_physmap_entry(smap->base, smap->length, physmap, 1771 physmap_idxp)); 1772 } 1773 1774 static void 1775 add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap, 1776 int *physmap_idxp) 1777 { 1778 struct bios_smap *smap, *smapend; 1779 u_int32_t smapsize; 1780 /* 1781 * Memory map from INT 15:E820. 1782 * 1783 * subr_module.c says: 1784 * "Consumer may safely assume that size value precedes data." 1785 * ie: an int32_t immediately precedes SMAP. 1786 */ 1787 smapsize = *((u_int32_t *)smapbase - 1); 1788 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 1789 1790 for (smap = smapbase; smap < smapend; smap++) 1791 if (!add_smap_entry(smap, physmap, physmap_idxp)) 1792 break; 1793 } 1794 1795 static void 1796 basemem_setup(void) 1797 { 1798 1799 if (basemem > 640) { 1800 printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", 1801 basemem); 1802 basemem = 640; 1803 } 1804 1805 pmap_basemem_setup(basemem); 1806 } 1807 1808 /* 1809 * Populate the (physmap) array with base/bound pairs describing the 1810 * available physical memory in the system, then test this memory and 1811 * build the phys_avail array describing the actually-available memory. 1812 * 1813 * If we cannot accurately determine the physical memory map, then use 1814 * value from the 0xE801 call, and failing that, the RTC. 1815 * 1816 * Total memory size may be set by the kernel environment variable 1817 * hw.physmem or the compile-time define MAXMEM. 1818 * 1819 * XXX first should be vm_paddr_t. 1820 */ 1821 static void 1822 getmemsize(int first) 1823 { 1824 int has_smap, off, physmap_idx, pa_indx, da_indx; 1825 u_long memtest; 1826 vm_paddr_t physmap[PHYSMAP_SIZE]; 1827 quad_t dcons_addr, dcons_size, physmem_tunable; 1828 int hasbrokenint12, i, res; 1829 u_int extmem; 1830 struct vm86frame vmf; 1831 struct vm86context vmc; 1832 vm_paddr_t pa; 1833 struct bios_smap *smap, *smapbase; 1834 caddr_t kmdp; 1835 1836 has_smap = 0; 1837 bzero(&vmf, sizeof(vmf)); 1838 bzero(physmap, sizeof(physmap)); 1839 basemem = 0; 1840 1841 /* 1842 * Tell the physical memory allocator about pages used to store 1843 * the kernel and preloaded data. See kmem_bootstrap_free(). 1844 */ 1845 vm_phys_add_seg((vm_paddr_t)KERNLOAD, trunc_page(first)); 1846 1847 TUNABLE_INT_FETCH("hw.above4g_allow", &above4g_allow); 1848 TUNABLE_INT_FETCH("hw.above24g_allow", &above24g_allow); 1849 1850 /* 1851 * Check if the loader supplied an SMAP memory map. If so, 1852 * use that and do not make any VM86 calls. 1853 */ 1854 physmap_idx = 0; 1855 kmdp = preload_search_by_type("elf kernel"); 1856 if (kmdp == NULL) 1857 kmdp = preload_search_by_type("elf32 kernel"); 1858 smapbase = (struct bios_smap *)preload_search_info(kmdp, 1859 MODINFO_METADATA | MODINFOMD_SMAP); 1860 if (smapbase != NULL) { 1861 add_smap_entries(smapbase, physmap, &physmap_idx); 1862 has_smap = 1; 1863 goto have_smap; 1864 } 1865 1866 /* 1867 * Some newer BIOSes have a broken INT 12H implementation 1868 * which causes a kernel panic immediately. In this case, we 1869 * need use the SMAP to determine the base memory size. 1870 */ 1871 hasbrokenint12 = 0; 1872 TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); 1873 if (hasbrokenint12 == 0) { 1874 /* Use INT12 to determine base memory size. */ 1875 vm86_intcall(0x12, &vmf); 1876 basemem = vmf.vmf_ax; 1877 basemem_setup(); 1878 } 1879 1880 /* 1881 * Fetch the memory map with INT 15:E820. Map page 1 R/W into 1882 * the kernel page table so we can use it as a buffer. The 1883 * kernel will unmap this page later. 1884 */ 1885 vmc.npages = 0; 1886 smap = (void *)vm86_addpage(&vmc, 1, PMAP_MAP_LOW + ptoa(1)); 1887 res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); 1888 KASSERT(res != 0, ("vm86_getptr() failed: address not found")); 1889 1890 vmf.vmf_ebx = 0; 1891 do { 1892 vmf.vmf_eax = 0xE820; 1893 vmf.vmf_edx = SMAP_SIG; 1894 vmf.vmf_ecx = sizeof(struct bios_smap); 1895 i = vm86_datacall(0x15, &vmf, &vmc); 1896 if (i || vmf.vmf_eax != SMAP_SIG) 1897 break; 1898 has_smap = 1; 1899 if (!add_smap_entry(smap, physmap, &physmap_idx)) 1900 break; 1901 } while (vmf.vmf_ebx != 0); 1902 1903 have_smap: 1904 /* 1905 * If we didn't fetch the "base memory" size from INT12, 1906 * figure it out from the SMAP (or just guess). 1907 */ 1908 if (basemem == 0) { 1909 for (i = 0; i <= physmap_idx; i += 2) { 1910 if (physmap[i] == 0x00000000) { 1911 basemem = physmap[i + 1] / 1024; 1912 break; 1913 } 1914 } 1915 1916 /* XXX: If we couldn't find basemem from SMAP, just guess. */ 1917 if (basemem == 0) 1918 basemem = 640; 1919 basemem_setup(); 1920 } 1921 1922 if (physmap[1] != 0) 1923 goto physmap_done; 1924 1925 /* 1926 * If we failed to find an SMAP, figure out the extended 1927 * memory size. We will then build a simple memory map with 1928 * two segments, one for "base memory" and the second for 1929 * "extended memory". Note that "extended memory" starts at a 1930 * physical address of 1MB and that both basemem and extmem 1931 * are in units of 1KB. 1932 * 1933 * First, try to fetch the extended memory size via INT 15:E801. 1934 */ 1935 vmf.vmf_ax = 0xE801; 1936 if (vm86_intcall(0x15, &vmf) == 0) { 1937 extmem = vmf.vmf_cx + vmf.vmf_dx * 64; 1938 } else { 1939 /* 1940 * If INT15:E801 fails, this is our last ditch effort 1941 * to determine the extended memory size. Currently 1942 * we prefer the RTC value over INT15:88. 1943 */ 1944 #if 0 1945 vmf.vmf_ah = 0x88; 1946 vm86_intcall(0x15, &vmf); 1947 extmem = vmf.vmf_ax; 1948 #else 1949 extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); 1950 #endif 1951 } 1952 1953 /* 1954 * Special hack for chipsets that still remap the 384k hole when 1955 * there's 16MB of memory - this really confuses people that 1956 * are trying to use bus mastering ISA controllers with the 1957 * "16MB limit"; they only have 16MB, but the remapping puts 1958 * them beyond the limit. 1959 * 1960 * If extended memory is between 15-16MB (16-17MB phys address range), 1961 * chop it to 15MB. 1962 */ 1963 if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) 1964 extmem = 15 * 1024; 1965 1966 physmap[0] = 0; 1967 physmap[1] = basemem * 1024; 1968 physmap_idx = 2; 1969 physmap[physmap_idx] = 0x100000; 1970 physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; 1971 1972 physmap_done: 1973 /* 1974 * Now, physmap contains a map of physical memory. 1975 */ 1976 1977 #ifdef SMP 1978 /* make hole for AP bootstrap code */ 1979 alloc_ap_trampoline(physmap, &physmap_idx); 1980 #endif 1981 1982 /* 1983 * Maxmem isn't the "maximum memory", it's one larger than the 1984 * highest page of the physical address space. It should be 1985 * called something like "Maxphyspage". We may adjust this 1986 * based on ``hw.physmem'' and the results of the memory test. 1987 * 1988 * This is especially confusing when it is much larger than the 1989 * memory size and is displayed as "realmem". 1990 */ 1991 Maxmem = atop(physmap[physmap_idx + 1]); 1992 1993 #ifdef MAXMEM 1994 Maxmem = MAXMEM / 4; 1995 #endif 1996 1997 if (TUNABLE_QUAD_FETCH("hw.physmem", &physmem_tunable)) 1998 Maxmem = atop(physmem_tunable); 1999 2000 /* 2001 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend 2002 * the amount of memory in the system. 2003 */ 2004 if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) 2005 Maxmem = atop(physmap[physmap_idx + 1]); 2006 2007 /* 2008 * The boot memory test is disabled by default, as it takes a 2009 * significant amount of time on large-memory systems, and is 2010 * unfriendly to virtual machines as it unnecessarily touches all 2011 * pages. 2012 * 2013 * A general name is used as the code may be extended to support 2014 * additional tests beyond the current "page present" test. 2015 */ 2016 memtest = 0; 2017 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); 2018 2019 if (atop(physmap[physmap_idx + 1]) != Maxmem && 2020 (boothowto & RB_VERBOSE)) 2021 printf("Physical memory use set to %ldK\n", Maxmem * 4); 2022 2023 /* 2024 * If Maxmem has been increased beyond what the system has detected, 2025 * extend the last memory segment to the new limit. 2026 */ 2027 if (atop(physmap[physmap_idx + 1]) < Maxmem) 2028 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); 2029 2030 /* call pmap initialization to make new kernel address space */ 2031 pmap_bootstrap(first); 2032 2033 /* 2034 * Size up each available chunk of physical memory. 2035 */ 2036 physmap[0] = PAGE_SIZE; /* mask off page 0 */ 2037 pa_indx = 0; 2038 da_indx = 1; 2039 phys_avail[pa_indx++] = physmap[0]; 2040 phys_avail[pa_indx] = physmap[0]; 2041 dump_avail[da_indx] = physmap[0]; 2042 2043 /* 2044 * Get dcons buffer address 2045 */ 2046 if (getenv_quad("dcons.addr", &dcons_addr) == 0 || 2047 getenv_quad("dcons.size", &dcons_size) == 0) 2048 dcons_addr = 0; 2049 2050 /* 2051 * physmap is in bytes, so when converting to page boundaries, 2052 * round up the start address and round down the end address. 2053 */ 2054 for (i = 0; i <= physmap_idx; i += 2) { 2055 vm_paddr_t end; 2056 2057 end = ptoa((vm_paddr_t)Maxmem); 2058 if (physmap[i + 1] < end) 2059 end = trunc_page(physmap[i + 1]); 2060 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 2061 int tmp, page_bad, full; 2062 int *ptr; 2063 2064 full = FALSE; 2065 /* 2066 * block out kernel memory as not available. 2067 */ 2068 if (pa >= KERNLOAD && pa < first) 2069 goto do_dump_avail; 2070 2071 /* 2072 * block out dcons buffer 2073 */ 2074 if (dcons_addr > 0 2075 && pa >= trunc_page(dcons_addr) 2076 && pa < dcons_addr + dcons_size) 2077 goto do_dump_avail; 2078 2079 page_bad = FALSE; 2080 if (memtest == 0) 2081 goto skip_memtest; 2082 2083 /* 2084 * map page into kernel: valid, read/write,non-cacheable 2085 */ 2086 ptr = (int *)pmap_cmap3(pa, PG_V | PG_RW | PG_N); 2087 2088 tmp = *(int *)ptr; 2089 /* 2090 * Test for alternating 1's and 0's 2091 */ 2092 *(volatile int *)ptr = 0xaaaaaaaa; 2093 if (*(volatile int *)ptr != 0xaaaaaaaa) 2094 page_bad = TRUE; 2095 /* 2096 * Test for alternating 0's and 1's 2097 */ 2098 *(volatile int *)ptr = 0x55555555; 2099 if (*(volatile int *)ptr != 0x55555555) 2100 page_bad = TRUE; 2101 /* 2102 * Test for all 1's 2103 */ 2104 *(volatile int *)ptr = 0xffffffff; 2105 if (*(volatile int *)ptr != 0xffffffff) 2106 page_bad = TRUE; 2107 /* 2108 * Test for all 0's 2109 */ 2110 *(volatile int *)ptr = 0x0; 2111 if (*(volatile int *)ptr != 0x0) 2112 page_bad = TRUE; 2113 /* 2114 * Restore original value. 2115 */ 2116 *(int *)ptr = tmp; 2117 2118 skip_memtest: 2119 /* 2120 * Adjust array of valid/good pages. 2121 */ 2122 if (page_bad == TRUE) 2123 continue; 2124 /* 2125 * If this good page is a continuation of the 2126 * previous set of good pages, then just increase 2127 * the end pointer. Otherwise start a new chunk. 2128 * Note that "end" points one higher than end, 2129 * making the range >= start and < end. 2130 * If we're also doing a speculative memory 2131 * test and we at or past the end, bump up Maxmem 2132 * so that we keep going. The first bad page 2133 * will terminate the loop. 2134 */ 2135 if (phys_avail[pa_indx] == pa) { 2136 phys_avail[pa_indx] += PAGE_SIZE; 2137 } else { 2138 pa_indx++; 2139 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 2140 printf( 2141 "Too many holes in the physical address space, giving up\n"); 2142 pa_indx--; 2143 full = TRUE; 2144 goto do_dump_avail; 2145 } 2146 phys_avail[pa_indx++] = pa; /* start */ 2147 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 2148 } 2149 physmem++; 2150 do_dump_avail: 2151 if (dump_avail[da_indx] == pa) { 2152 dump_avail[da_indx] += PAGE_SIZE; 2153 } else { 2154 da_indx++; 2155 if (da_indx == DUMP_AVAIL_ARRAY_END) { 2156 da_indx--; 2157 goto do_next; 2158 } 2159 dump_avail[da_indx++] = pa; /* start */ 2160 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ 2161 } 2162 do_next: 2163 if (full) 2164 break; 2165 } 2166 } 2167 pmap_cmap3(0, 0); 2168 2169 /* 2170 * XXX 2171 * The last chunk must contain at least one page plus the message 2172 * buffer to avoid complicating other code (message buffer address 2173 * calculation, etc.). 2174 */ 2175 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 2176 round_page(msgbufsize) >= phys_avail[pa_indx]) { 2177 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 2178 phys_avail[pa_indx--] = 0; 2179 phys_avail[pa_indx--] = 0; 2180 } 2181 2182 Maxmem = atop(phys_avail[pa_indx]); 2183 2184 /* Trim off space for the message buffer. */ 2185 phys_avail[pa_indx] -= round_page(msgbufsize); 2186 2187 /* Map the message buffer. */ 2188 for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) 2189 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + 2190 off); 2191 } 2192 2193 static void 2194 i386_kdb_init(void) 2195 { 2196 #ifdef DDB 2197 db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab); 2198 #endif 2199 kdb_init(); 2200 #ifdef KDB 2201 if (boothowto & RB_KDB) 2202 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); 2203 #endif 2204 } 2205 2206 static void 2207 fixup_idt(void) 2208 { 2209 struct gate_descriptor *ip; 2210 uintptr_t off; 2211 int x; 2212 2213 for (x = 0; x < NIDT; x++) { 2214 ip = &idt[x]; 2215 if (ip->gd_type != SDT_SYS386IGT && 2216 ip->gd_type != SDT_SYS386TGT) 2217 continue; 2218 off = ip->gd_looffset + (((u_int)ip->gd_hioffset) << 16); 2219 KASSERT(off >= (uintptr_t)start_exceptions && 2220 off < (uintptr_t)end_exceptions, 2221 ("IDT[%d] type %d off %#x", x, ip->gd_type, off)); 2222 off += setidt_disp; 2223 MPASS(off >= PMAP_TRM_MIN_ADDRESS && 2224 off < PMAP_TRM_MAX_ADDRESS); 2225 ip->gd_looffset = off; 2226 ip->gd_hioffset = off >> 16; 2227 } 2228 } 2229 2230 static void 2231 i386_setidt1(void) 2232 { 2233 int x; 2234 2235 /* exceptions */ 2236 for (x = 0; x < NIDT; x++) 2237 setidt(x, &IDTVEC(rsvd), SDT_SYS386IGT, SEL_KPL, 2238 GSEL(GCODE_SEL, SEL_KPL)); 2239 setidt(IDT_DE, &IDTVEC(div), SDT_SYS386IGT, SEL_KPL, 2240 GSEL(GCODE_SEL, SEL_KPL)); 2241 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, 2242 GSEL(GCODE_SEL, SEL_KPL)); 2243 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, 2244 GSEL(GCODE_SEL, SEL_KPL)); 2245 setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, 2246 GSEL(GCODE_SEL, SEL_KPL)); 2247 setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386IGT, SEL_UPL, 2248 GSEL(GCODE_SEL, SEL_KPL)); 2249 setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386IGT, SEL_KPL, 2250 GSEL(GCODE_SEL, SEL_KPL)); 2251 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, 2252 GSEL(GCODE_SEL, SEL_KPL)); 2253 setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386IGT, SEL_KPL, 2254 GSEL(GCODE_SEL, SEL_KPL)); 2255 setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, 2256 SEL_KPL)); 2257 setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386IGT, 2258 SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 2259 setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386IGT, SEL_KPL, 2260 GSEL(GCODE_SEL, SEL_KPL)); 2261 setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386IGT, SEL_KPL, 2262 GSEL(GCODE_SEL, SEL_KPL)); 2263 setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386IGT, SEL_KPL, 2264 GSEL(GCODE_SEL, SEL_KPL)); 2265 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, 2266 GSEL(GCODE_SEL, SEL_KPL)); 2267 setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, 2268 GSEL(GCODE_SEL, SEL_KPL)); 2269 setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386IGT, SEL_KPL, 2270 GSEL(GCODE_SEL, SEL_KPL)); 2271 setidt(IDT_AC, &IDTVEC(align), SDT_SYS386IGT, SEL_KPL, 2272 GSEL(GCODE_SEL, SEL_KPL)); 2273 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386IGT, SEL_KPL, 2274 GSEL(GCODE_SEL, SEL_KPL)); 2275 setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386IGT, SEL_KPL, 2276 GSEL(GCODE_SEL, SEL_KPL)); 2277 setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), 2278 SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); 2279 #ifdef KDTRACE_HOOKS 2280 setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), 2281 SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); 2282 #endif 2283 #ifdef XENHVM 2284 setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), 2285 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 2286 #endif 2287 } 2288 2289 static void 2290 i386_setidt2(void) 2291 { 2292 2293 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, 2294 GSEL(GCODE_SEL, SEL_KPL)); 2295 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, 2296 GSEL(GCODE_SEL, SEL_KPL)); 2297 } 2298 2299 #if defined(DEV_ISA) && !defined(DEV_ATPIC) 2300 static void 2301 i386_setidt3(void) 2302 { 2303 2304 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), 2305 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 2306 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), 2307 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 2308 } 2309 #endif 2310 2311 register_t 2312 init386(int first) 2313 { 2314 struct region_descriptor r_gdt, r_idt; /* table descriptors */ 2315 int gsel_tss, metadata_missing, x, pa; 2316 struct pcpu *pc; 2317 struct xstate_hdr *xhdr; 2318 caddr_t kmdp; 2319 vm_offset_t addend; 2320 size_t ucode_len; 2321 int late_console; 2322 2323 thread0.td_kstack = proc0kstack; 2324 thread0.td_kstack_pages = TD0_KSTACK_PAGES; 2325 2326 /* 2327 * This may be done better later if it gets more high level 2328 * components in it. If so just link td->td_proc here. 2329 */ 2330 proc_linkup0(&proc0, &thread0); 2331 2332 if (bootinfo.bi_modulep) { 2333 metadata_missing = 0; 2334 addend = (vm_paddr_t)bootinfo.bi_modulep < KERNBASE ? 2335 PMAP_MAP_LOW : 0; 2336 preload_metadata = (caddr_t)bootinfo.bi_modulep + addend; 2337 preload_bootstrap_relocate(addend); 2338 } else { 2339 metadata_missing = 1; 2340 } 2341 2342 if (bootinfo.bi_envp != 0) { 2343 addend = (vm_paddr_t)bootinfo.bi_envp < KERNBASE ? 2344 PMAP_MAP_LOW : 0; 2345 init_static_kenv((char *)bootinfo.bi_envp + addend, 0); 2346 } else { 2347 init_static_kenv(NULL, 0); 2348 } 2349 2350 /* 2351 * Re-evaluate CPU features if we loaded a microcode update. 2352 */ 2353 ucode_len = ucode_load_bsp(first); 2354 if (ucode_len != 0) { 2355 identify_cpu(); 2356 first = roundup2(first + ucode_len, PAGE_SIZE); 2357 } 2358 2359 identify_hypervisor(); 2360 2361 /* Init basic tunables, hz etc */ 2362 init_param1(); 2363 2364 /* 2365 * Make gdt memory segments. All segments cover the full 4GB 2366 * of address space and permissions are enforced at page level. 2367 */ 2368 gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); 2369 gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); 2370 gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); 2371 gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); 2372 gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); 2373 gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); 2374 2375 pc = &__pcpu[0]; 2376 gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); 2377 gdt_segs[GPRIV_SEL].ssd_base = (int)pc; 2378 gdt_segs[GPROC0_SEL].ssd_base = (int)&common_tss0; 2379 2380 for (x = 0; x < NGDT; x++) 2381 ssdtosd(&gdt_segs[x], &gdt0[x].sd); 2382 2383 r_gdt.rd_limit = NGDT * sizeof(gdt0[0]) - 1; 2384 r_gdt.rd_base = (int)gdt0; 2385 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); 2386 lgdt(&r_gdt); 2387 2388 pcpu_init(pc, 0, sizeof(struct pcpu)); 2389 for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) 2390 pmap_kenter(pa, pa); 2391 dpcpu_init((void *)first, 0); 2392 first += DPCPU_SIZE; 2393 PCPU_SET(prvspace, pc); 2394 PCPU_SET(curthread, &thread0); 2395 /* Non-late cninit() and printf() can be moved up to here. */ 2396 2397 /* 2398 * Initialize mutexes. 2399 * 2400 * icu_lock: in order to allow an interrupt to occur in a critical 2401 * section, to set pcpu->ipending (etc...) properly, we 2402 * must be able to get the icu lock, so it can't be 2403 * under witness. 2404 */ 2405 mutex_init(); 2406 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); 2407 2408 i386_setidt1(); 2409 2410 r_idt.rd_limit = sizeof(idt0) - 1; 2411 r_idt.rd_base = (int) idt; 2412 lidt(&r_idt); 2413 2414 /* 2415 * Initialize the clock before the console so that console 2416 * initialization can use DELAY(). 2417 */ 2418 clock_init(); 2419 2420 finishidentcpu(); /* Final stage of CPU initialization */ 2421 i386_setidt2(); 2422 pmap_set_nx(); 2423 initializecpu(); /* Initialize CPU registers */ 2424 initializecpucache(); 2425 2426 /* pointer to selector slot for %fs/%gs */ 2427 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); 2428 2429 /* Initialize the tss (except for the final esp0) early for vm86. */ 2430 common_tss0.tss_esp0 = thread0.td_kstack + thread0.td_kstack_pages * 2431 PAGE_SIZE - VM86_STACK_SPACE; 2432 common_tss0.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); 2433 common_tss0.tss_ioopt = sizeof(struct i386tss) << 16; 2434 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 2435 PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); 2436 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); 2437 ltr(gsel_tss); 2438 2439 /* Initialize the PIC early for vm86 calls. */ 2440 #ifdef DEV_ISA 2441 #ifdef DEV_ATPIC 2442 elcr_probe(); 2443 atpic_startup(); 2444 #else 2445 /* Reset and mask the atpics and leave them shut down. */ 2446 atpic_reset(); 2447 2448 /* 2449 * Point the ICU spurious interrupt vectors at the APIC spurious 2450 * interrupt handler. 2451 */ 2452 i386_setidt3(); 2453 #endif 2454 #endif 2455 2456 /* 2457 * The console and kdb should be initialized even earlier than here, 2458 * but some console drivers don't work until after getmemsize(). 2459 * Default to late console initialization to support these drivers. 2460 * This loses mainly printf()s in getmemsize() and early debugging. 2461 */ 2462 late_console = 1; 2463 TUNABLE_INT_FETCH("debug.late_console", &late_console); 2464 if (!late_console) { 2465 cninit(); 2466 i386_kdb_init(); 2467 } 2468 2469 kmdp = preload_search_by_type("elf kernel"); 2470 link_elf_ireloc(kmdp); 2471 2472 vm86_initialize(); 2473 getmemsize(first); 2474 init_param2(physmem); 2475 2476 /* now running on new page tables, configured,and u/iom is accessible */ 2477 2478 if (late_console) 2479 cninit(); 2480 2481 if (metadata_missing) 2482 printf("WARNING: loader(8) metadata is missing!\n"); 2483 2484 if (late_console) 2485 i386_kdb_init(); 2486 2487 msgbufinit(msgbufp, msgbufsize); 2488 npxinit(true); 2489 /* 2490 * Set up thread0 pcb after npxinit calculated pcb + fpu save 2491 * area size. Zero out the extended state header in fpu save 2492 * area. 2493 */ 2494 thread0.td_pcb = get_pcb_td(&thread0); 2495 thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0); 2496 bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); 2497 if (use_xsave) { 2498 xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 2499 1); 2500 xhdr->xstate_bv = xsave_mask; 2501 } 2502 PCPU_SET(curpcb, thread0.td_pcb); 2503 /* Move esp0 in the tss to its final place. */ 2504 /* Note: -16 is so we can grow the trapframe if we came from vm86 */ 2505 common_tss0.tss_esp0 = (vm_offset_t)thread0.td_pcb - VM86_STACK_SPACE; 2506 PCPU_SET(kesp0, common_tss0.tss_esp0); 2507 gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; /* clear busy bit */ 2508 ltr(gsel_tss); 2509 2510 /* transfer to user mode */ 2511 2512 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 2513 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 2514 2515 /* setup proc 0's pcb */ 2516 thread0.td_pcb->pcb_flags = 0; 2517 thread0.td_pcb->pcb_cr3 = pmap_get_kcr3(); 2518 thread0.td_pcb->pcb_ext = 0; 2519 thread0.td_frame = &proc0_tf; 2520 2521 cpu_probe_amdc1e(); 2522 2523 #ifdef FDT 2524 x86_init_fdt(); 2525 #endif 2526 2527 /* Location of kernel stack for locore */ 2528 return ((register_t)thread0.td_pcb); 2529 } 2530 2531 static void 2532 machdep_init_trampoline(void) 2533 { 2534 struct region_descriptor r_gdt, r_idt; 2535 struct i386tss *tss; 2536 char *copyout_buf, *trampoline, *tramp_stack_base; 2537 int x; 2538 2539 gdt = pmap_trm_alloc(sizeof(union descriptor) * NGDT * mp_ncpus, 2540 M_NOWAIT | M_ZERO); 2541 bcopy(gdt0, gdt, sizeof(union descriptor) * NGDT); 2542 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 2543 r_gdt.rd_base = (int)gdt; 2544 lgdt(&r_gdt); 2545 2546 tss = pmap_trm_alloc(sizeof(struct i386tss) * mp_ncpus, 2547 M_NOWAIT | M_ZERO); 2548 bcopy(&common_tss0, tss, sizeof(struct i386tss)); 2549 gdt[GPROC0_SEL].sd.sd_lobase = (int)tss; 2550 gdt[GPROC0_SEL].sd.sd_hibase = (u_int)tss >> 24; 2551 gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; 2552 2553 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); 2554 PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); 2555 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); 2556 PCPU_SET(common_tssp, tss); 2557 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2558 2559 trampoline = pmap_trm_alloc(end_exceptions - start_exceptions, 2560 M_NOWAIT); 2561 bcopy(start_exceptions, trampoline, end_exceptions - start_exceptions); 2562 tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, M_NOWAIT); 2563 PCPU_SET(trampstk, (uintptr_t)tramp_stack_base + TRAMP_STACK_SZ - 2564 VM86_STACK_SPACE); 2565 tss[0].tss_esp0 = PCPU_GET(trampstk); 2566 2567 idt = pmap_trm_alloc(sizeof(idt0), M_NOWAIT | M_ZERO); 2568 bcopy(idt0, idt, sizeof(idt0)); 2569 2570 /* Re-initialize new IDT since the handlers were relocated */ 2571 setidt_disp = trampoline - start_exceptions; 2572 fixup_idt(); 2573 2574 r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1; 2575 r_idt.rd_base = (int)idt; 2576 lidt(&r_idt); 2577 2578 /* dblfault TSS */ 2579 dblfault_tss = pmap_trm_alloc(sizeof(struct i386tss), M_NOWAIT | M_ZERO); 2580 dblfault_stack = pmap_trm_alloc(PAGE_SIZE, M_NOWAIT); 2581 dblfault_tss->tss_esp = dblfault_tss->tss_esp0 = 2582 dblfault_tss->tss_esp1 = dblfault_tss->tss_esp2 = 2583 (int)dblfault_stack + PAGE_SIZE; 2584 dblfault_tss->tss_ss = dblfault_tss->tss_ss0 = dblfault_tss->tss_ss1 = 2585 dblfault_tss->tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); 2586 dblfault_tss->tss_cr3 = pmap_get_kcr3(); 2587 dblfault_tss->tss_eip = (int)dblfault_handler; 2588 dblfault_tss->tss_eflags = PSL_KERNEL; 2589 dblfault_tss->tss_ds = dblfault_tss->tss_es = 2590 dblfault_tss->tss_gs = GSEL(GDATA_SEL, SEL_KPL); 2591 dblfault_tss->tss_fs = GSEL(GPRIV_SEL, SEL_KPL); 2592 dblfault_tss->tss_cs = GSEL(GCODE_SEL, SEL_KPL); 2593 dblfault_tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL); 2594 gdt[GPANIC_SEL].sd.sd_lobase = (int)dblfault_tss; 2595 gdt[GPANIC_SEL].sd.sd_hibase = (u_int)dblfault_tss >> 24; 2596 2597 /* make ldt memory segments */ 2598 ldt = pmap_trm_alloc(sizeof(union descriptor) * NLDT, 2599 M_NOWAIT | M_ZERO); 2600 gdt[GLDT_SEL].sd.sd_lobase = (int)ldt; 2601 gdt[GLDT_SEL].sd.sd_hibase = (u_int)ldt >> 24; 2602 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); 2603 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); 2604 for (x = 0; x < nitems(ldt_segs); x++) 2605 ssdtosd(&ldt_segs[x], &ldt[x].sd); 2606 2607 _default_ldt = GSEL(GLDT_SEL, SEL_KPL); 2608 lldt(_default_ldt); 2609 PCPU_SET(currentldt, _default_ldt); 2610 2611 copyout_buf = pmap_trm_alloc(TRAMP_COPYOUT_SZ, M_NOWAIT); 2612 PCPU_SET(copyout_buf, copyout_buf); 2613 copyout_init_tramp(); 2614 } 2615 SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_SECOND, machdep_init_trampoline, NULL); 2616 2617 #ifdef COMPAT_43 2618 static void 2619 i386_setup_lcall_gate(void) 2620 { 2621 struct sysentvec *sv; 2622 struct user_segment_descriptor desc; 2623 u_int lcall_addr; 2624 2625 sv = &elf32_freebsd_sysvec; 2626 lcall_addr = (uintptr_t)sv->sv_psstrings - sz_lcall_tramp; 2627 2628 bzero(&desc, sizeof(desc)); 2629 desc.sd_type = SDT_MEMERA; 2630 desc.sd_dpl = SEL_UPL; 2631 desc.sd_p = 1; 2632 desc.sd_def32 = 1; 2633 desc.sd_gran = 1; 2634 desc.sd_lolimit = 0xffff; 2635 desc.sd_hilimit = 0xf; 2636 desc.sd_lobase = lcall_addr; 2637 desc.sd_hibase = lcall_addr >> 24; 2638 bcopy(&desc, &ldt[LSYS5CALLS_SEL], sizeof(desc)); 2639 } 2640 SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_ANY, i386_setup_lcall_gate, NULL); 2641 #endif 2642 2643 void 2644 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) 2645 { 2646 2647 pcpu->pc_acpi_id = 0xffffffff; 2648 } 2649 2650 static int 2651 smap_sysctl_handler(SYSCTL_HANDLER_ARGS) 2652 { 2653 struct bios_smap *smapbase; 2654 struct bios_smap_xattr smap; 2655 caddr_t kmdp; 2656 uint32_t *smapattr; 2657 int count, error, i; 2658 2659 /* Retrieve the system memory map from the loader. */ 2660 kmdp = preload_search_by_type("elf kernel"); 2661 if (kmdp == NULL) 2662 kmdp = preload_search_by_type("elf32 kernel"); 2663 smapbase = (struct bios_smap *)preload_search_info(kmdp, 2664 MODINFO_METADATA | MODINFOMD_SMAP); 2665 if (smapbase == NULL) 2666 return (0); 2667 smapattr = (uint32_t *)preload_search_info(kmdp, 2668 MODINFO_METADATA | MODINFOMD_SMAP_XATTR); 2669 count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase); 2670 error = 0; 2671 for (i = 0; i < count; i++) { 2672 smap.base = smapbase[i].base; 2673 smap.length = smapbase[i].length; 2674 smap.type = smapbase[i].type; 2675 if (smapattr != NULL) 2676 smap.xattr = smapattr[i]; 2677 else 2678 smap.xattr = 0; 2679 error = SYSCTL_OUT(req, &smap, sizeof(smap)); 2680 } 2681 return (error); 2682 } 2683 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, 2684 smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); 2685 2686 void 2687 spinlock_enter(void) 2688 { 2689 struct thread *td; 2690 register_t flags; 2691 2692 td = curthread; 2693 if (td->td_md.md_spinlock_count == 0) { 2694 flags = intr_disable(); 2695 td->td_md.md_spinlock_count = 1; 2696 td->td_md.md_saved_flags = flags; 2697 } else 2698 td->td_md.md_spinlock_count++; 2699 critical_enter(); 2700 } 2701 2702 void 2703 spinlock_exit(void) 2704 { 2705 struct thread *td; 2706 register_t flags; 2707 2708 td = curthread; 2709 critical_exit(); 2710 flags = td->td_md.md_saved_flags; 2711 td->td_md.md_spinlock_count--; 2712 if (td->td_md.md_spinlock_count == 0) 2713 intr_restore(flags); 2714 } 2715 2716 #if defined(I586_CPU) && !defined(NO_F00F_HACK) 2717 static void f00f_hack(void *unused); 2718 SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); 2719 2720 static void 2721 f00f_hack(void *unused) 2722 { 2723 struct region_descriptor r_idt; 2724 struct gate_descriptor *new_idt; 2725 vm_offset_t tmp; 2726 2727 if (!has_f00f_bug) 2728 return; 2729 2730 GIANT_REQUIRED; 2731 2732 printf("Intel Pentium detected, installing workaround for F00F bug\n"); 2733 2734 tmp = (vm_offset_t)pmap_trm_alloc(PAGE_SIZE * 3, M_NOWAIT | M_ZERO); 2735 if (tmp == 0) 2736 panic("kmem_malloc returned 0"); 2737 tmp = round_page(tmp); 2738 2739 /* Put the problematic entry (#6) at the end of the lower page. */ 2740 new_idt = (struct gate_descriptor *) 2741 (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); 2742 bcopy(idt, new_idt, sizeof(idt0)); 2743 r_idt.rd_base = (u_int)new_idt; 2744 r_idt.rd_limit = sizeof(idt0) - 1; 2745 lidt(&r_idt); 2746 /* SMP machines do not need the F00F hack. */ 2747 idt = new_idt; 2748 pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ); 2749 } 2750 #endif /* defined(I586_CPU) && !NO_F00F_HACK */ 2751 2752 /* 2753 * Construct a PCB from a trapframe. This is called from kdb_trap() where 2754 * we want to start a backtrace from the function that caused us to enter 2755 * the debugger. We have the context in the trapframe, but base the trace 2756 * on the PCB. The PCB doesn't have to be perfect, as long as it contains 2757 * enough for a backtrace. 2758 */ 2759 void 2760 makectx(struct trapframe *tf, struct pcb *pcb) 2761 { 2762 2763 pcb->pcb_edi = tf->tf_edi; 2764 pcb->pcb_esi = tf->tf_esi; 2765 pcb->pcb_ebp = tf->tf_ebp; 2766 pcb->pcb_ebx = tf->tf_ebx; 2767 pcb->pcb_eip = tf->tf_eip; 2768 pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8; 2769 pcb->pcb_gs = rgs(); 2770 } 2771 2772 int 2773 ptrace_set_pc(struct thread *td, u_long addr) 2774 { 2775 2776 td->td_frame->tf_eip = addr; 2777 return (0); 2778 } 2779 2780 int 2781 ptrace_single_step(struct thread *td) 2782 { 2783 2784 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); 2785 if ((td->td_frame->tf_eflags & PSL_T) == 0) { 2786 td->td_frame->tf_eflags |= PSL_T; 2787 td->td_dbgflags |= TDB_STEP; 2788 } 2789 return (0); 2790 } 2791 2792 int 2793 ptrace_clear_single_step(struct thread *td) 2794 { 2795 2796 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); 2797 td->td_frame->tf_eflags &= ~PSL_T; 2798 td->td_dbgflags &= ~TDB_STEP; 2799 return (0); 2800 } 2801 2802 int 2803 fill_regs(struct thread *td, struct reg *regs) 2804 { 2805 struct pcb *pcb; 2806 struct trapframe *tp; 2807 2808 tp = td->td_frame; 2809 pcb = td->td_pcb; 2810 regs->r_gs = pcb->pcb_gs; 2811 return (fill_frame_regs(tp, regs)); 2812 } 2813 2814 int 2815 fill_frame_regs(struct trapframe *tp, struct reg *regs) 2816 { 2817 2818 regs->r_fs = tp->tf_fs; 2819 regs->r_es = tp->tf_es; 2820 regs->r_ds = tp->tf_ds; 2821 regs->r_edi = tp->tf_edi; 2822 regs->r_esi = tp->tf_esi; 2823 regs->r_ebp = tp->tf_ebp; 2824 regs->r_ebx = tp->tf_ebx; 2825 regs->r_edx = tp->tf_edx; 2826 regs->r_ecx = tp->tf_ecx; 2827 regs->r_eax = tp->tf_eax; 2828 regs->r_eip = tp->tf_eip; 2829 regs->r_cs = tp->tf_cs; 2830 regs->r_eflags = tp->tf_eflags; 2831 regs->r_esp = tp->tf_esp; 2832 regs->r_ss = tp->tf_ss; 2833 regs->r_err = 0; 2834 regs->r_trapno = 0; 2835 return (0); 2836 } 2837 2838 int 2839 set_regs(struct thread *td, struct reg *regs) 2840 { 2841 struct pcb *pcb; 2842 struct trapframe *tp; 2843 2844 tp = td->td_frame; 2845 if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || 2846 !CS_SECURE(regs->r_cs)) 2847 return (EINVAL); 2848 pcb = td->td_pcb; 2849 tp->tf_fs = regs->r_fs; 2850 tp->tf_es = regs->r_es; 2851 tp->tf_ds = regs->r_ds; 2852 tp->tf_edi = regs->r_edi; 2853 tp->tf_esi = regs->r_esi; 2854 tp->tf_ebp = regs->r_ebp; 2855 tp->tf_ebx = regs->r_ebx; 2856 tp->tf_edx = regs->r_edx; 2857 tp->tf_ecx = regs->r_ecx; 2858 tp->tf_eax = regs->r_eax; 2859 tp->tf_eip = regs->r_eip; 2860 tp->tf_cs = regs->r_cs; 2861 tp->tf_eflags = regs->r_eflags; 2862 tp->tf_esp = regs->r_esp; 2863 tp->tf_ss = regs->r_ss; 2864 pcb->pcb_gs = regs->r_gs; 2865 return (0); 2866 } 2867 2868 int 2869 fill_fpregs(struct thread *td, struct fpreg *fpregs) 2870 { 2871 2872 KASSERT(td == curthread || TD_IS_SUSPENDED(td) || 2873 P_SHOULDSTOP(td->td_proc), 2874 ("not suspended thread %p", td)); 2875 npxgetregs(td); 2876 if (cpu_fxsr) 2877 npx_fill_fpregs_xmm(&get_pcb_user_save_td(td)->sv_xmm, 2878 (struct save87 *)fpregs); 2879 else 2880 bcopy(&get_pcb_user_save_td(td)->sv_87, fpregs, 2881 sizeof(*fpregs)); 2882 return (0); 2883 } 2884 2885 int 2886 set_fpregs(struct thread *td, struct fpreg *fpregs) 2887 { 2888 2889 critical_enter(); 2890 if (cpu_fxsr) 2891 npx_set_fpregs_xmm((struct save87 *)fpregs, 2892 &get_pcb_user_save_td(td)->sv_xmm); 2893 else 2894 bcopy(fpregs, &get_pcb_user_save_td(td)->sv_87, 2895 sizeof(*fpregs)); 2896 npxuserinited(td); 2897 critical_exit(); 2898 return (0); 2899 } 2900 2901 /* 2902 * Get machine context. 2903 */ 2904 int 2905 get_mcontext(struct thread *td, mcontext_t *mcp, int flags) 2906 { 2907 struct trapframe *tp; 2908 struct segment_descriptor *sdp; 2909 2910 tp = td->td_frame; 2911 2912 PROC_LOCK(curthread->td_proc); 2913 mcp->mc_onstack = sigonstack(tp->tf_esp); 2914 PROC_UNLOCK(curthread->td_proc); 2915 mcp->mc_gs = td->td_pcb->pcb_gs; 2916 mcp->mc_fs = tp->tf_fs; 2917 mcp->mc_es = tp->tf_es; 2918 mcp->mc_ds = tp->tf_ds; 2919 mcp->mc_edi = tp->tf_edi; 2920 mcp->mc_esi = tp->tf_esi; 2921 mcp->mc_ebp = tp->tf_ebp; 2922 mcp->mc_isp = tp->tf_isp; 2923 mcp->mc_eflags = tp->tf_eflags; 2924 if (flags & GET_MC_CLEAR_RET) { 2925 mcp->mc_eax = 0; 2926 mcp->mc_edx = 0; 2927 mcp->mc_eflags &= ~PSL_C; 2928 } else { 2929 mcp->mc_eax = tp->tf_eax; 2930 mcp->mc_edx = tp->tf_edx; 2931 } 2932 mcp->mc_ebx = tp->tf_ebx; 2933 mcp->mc_ecx = tp->tf_ecx; 2934 mcp->mc_eip = tp->tf_eip; 2935 mcp->mc_cs = tp->tf_cs; 2936 mcp->mc_esp = tp->tf_esp; 2937 mcp->mc_ss = tp->tf_ss; 2938 mcp->mc_len = sizeof(*mcp); 2939 get_fpcontext(td, mcp, NULL, 0); 2940 sdp = &td->td_pcb->pcb_fsd; 2941 mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; 2942 sdp = &td->td_pcb->pcb_gsd; 2943 mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; 2944 mcp->mc_flags = 0; 2945 mcp->mc_xfpustate = 0; 2946 mcp->mc_xfpustate_len = 0; 2947 bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2)); 2948 return (0); 2949 } 2950 2951 /* 2952 * Set machine context. 2953 * 2954 * However, we don't set any but the user modifiable flags, and we won't 2955 * touch the cs selector. 2956 */ 2957 int 2958 set_mcontext(struct thread *td, mcontext_t *mcp) 2959 { 2960 struct trapframe *tp; 2961 char *xfpustate; 2962 int eflags, ret; 2963 2964 tp = td->td_frame; 2965 if (mcp->mc_len != sizeof(*mcp) || 2966 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) 2967 return (EINVAL); 2968 eflags = (mcp->mc_eflags & PSL_USERCHANGE) | 2969 (tp->tf_eflags & ~PSL_USERCHANGE); 2970 if (mcp->mc_flags & _MC_HASFPXSTATE) { 2971 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - 2972 sizeof(union savefpu)) 2973 return (EINVAL); 2974 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); 2975 ret = copyin((void *)mcp->mc_xfpustate, xfpustate, 2976 mcp->mc_xfpustate_len); 2977 if (ret != 0) 2978 return (ret); 2979 } else 2980 xfpustate = NULL; 2981 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); 2982 if (ret != 0) 2983 return (ret); 2984 tp->tf_fs = mcp->mc_fs; 2985 tp->tf_es = mcp->mc_es; 2986 tp->tf_ds = mcp->mc_ds; 2987 tp->tf_edi = mcp->mc_edi; 2988 tp->tf_esi = mcp->mc_esi; 2989 tp->tf_ebp = mcp->mc_ebp; 2990 tp->tf_ebx = mcp->mc_ebx; 2991 tp->tf_edx = mcp->mc_edx; 2992 tp->tf_ecx = mcp->mc_ecx; 2993 tp->tf_eax = mcp->mc_eax; 2994 tp->tf_eip = mcp->mc_eip; 2995 tp->tf_eflags = eflags; 2996 tp->tf_esp = mcp->mc_esp; 2997 tp->tf_ss = mcp->mc_ss; 2998 td->td_pcb->pcb_gs = mcp->mc_gs; 2999 return (0); 3000 } 3001 3002 static void 3003 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, 3004 size_t xfpusave_len) 3005 { 3006 size_t max_len, len; 3007 3008 mcp->mc_ownedfp = npxgetregs(td); 3009 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], 3010 sizeof(mcp->mc_fpstate)); 3011 mcp->mc_fpformat = npxformat(); 3012 if (!use_xsave || xfpusave_len == 0) 3013 return; 3014 max_len = cpu_max_ext_state_size - sizeof(union savefpu); 3015 len = xfpusave_len; 3016 if (len > max_len) { 3017 len = max_len; 3018 bzero(xfpusave + max_len, len - max_len); 3019 } 3020 mcp->mc_flags |= _MC_HASFPXSTATE; 3021 mcp->mc_xfpustate_len = len; 3022 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); 3023 } 3024 3025 static int 3026 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, 3027 size_t xfpustate_len) 3028 { 3029 int error; 3030 3031 if (mcp->mc_fpformat == _MC_FPFMT_NODEV) 3032 return (0); 3033 else if (mcp->mc_fpformat != _MC_FPFMT_387 && 3034 mcp->mc_fpformat != _MC_FPFMT_XMM) 3035 return (EINVAL); 3036 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { 3037 /* We don't care what state is left in the FPU or PCB. */ 3038 fpstate_drop(td); 3039 error = 0; 3040 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || 3041 mcp->mc_ownedfp == _MC_FPOWNED_PCB) { 3042 error = npxsetregs(td, (union savefpu *)&mcp->mc_fpstate, 3043 xfpustate, xfpustate_len); 3044 } else 3045 return (EINVAL); 3046 return (error); 3047 } 3048 3049 static void 3050 fpstate_drop(struct thread *td) 3051 { 3052 3053 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); 3054 critical_enter(); 3055 if (PCPU_GET(fpcurthread) == td) 3056 npxdrop(); 3057 /* 3058 * XXX force a full drop of the npx. The above only drops it if we 3059 * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. 3060 * 3061 * XXX I don't much like npxgetregs()'s semantics of doing a full 3062 * drop. Dropping only to the pcb matches fnsave's behaviour. 3063 * We only need to drop to !PCB_INITDONE in sendsig(). But 3064 * sendsig() is the only caller of npxgetregs()... perhaps we just 3065 * have too many layers. 3066 */ 3067 curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE | 3068 PCB_NPXUSERINITDONE); 3069 critical_exit(); 3070 } 3071 3072 int 3073 fill_dbregs(struct thread *td, struct dbreg *dbregs) 3074 { 3075 struct pcb *pcb; 3076 3077 if (td == NULL) { 3078 dbregs->dr[0] = rdr0(); 3079 dbregs->dr[1] = rdr1(); 3080 dbregs->dr[2] = rdr2(); 3081 dbregs->dr[3] = rdr3(); 3082 dbregs->dr[6] = rdr6(); 3083 dbregs->dr[7] = rdr7(); 3084 } else { 3085 pcb = td->td_pcb; 3086 dbregs->dr[0] = pcb->pcb_dr0; 3087 dbregs->dr[1] = pcb->pcb_dr1; 3088 dbregs->dr[2] = pcb->pcb_dr2; 3089 dbregs->dr[3] = pcb->pcb_dr3; 3090 dbregs->dr[6] = pcb->pcb_dr6; 3091 dbregs->dr[7] = pcb->pcb_dr7; 3092 } 3093 dbregs->dr[4] = 0; 3094 dbregs->dr[5] = 0; 3095 return (0); 3096 } 3097 3098 int 3099 set_dbregs(struct thread *td, struct dbreg *dbregs) 3100 { 3101 struct pcb *pcb; 3102 int i; 3103 3104 if (td == NULL) { 3105 load_dr0(dbregs->dr[0]); 3106 load_dr1(dbregs->dr[1]); 3107 load_dr2(dbregs->dr[2]); 3108 load_dr3(dbregs->dr[3]); 3109 load_dr6(dbregs->dr[6]); 3110 load_dr7(dbregs->dr[7]); 3111 } else { 3112 /* 3113 * Don't let an illegal value for dr7 get set. Specifically, 3114 * check for undefined settings. Setting these bit patterns 3115 * result in undefined behaviour and can lead to an unexpected 3116 * TRCTRAP. 3117 */ 3118 for (i = 0; i < 4; i++) { 3119 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) 3120 return (EINVAL); 3121 if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) 3122 return (EINVAL); 3123 } 3124 3125 pcb = td->td_pcb; 3126 3127 /* 3128 * Don't let a process set a breakpoint that is not within the 3129 * process's address space. If a process could do this, it 3130 * could halt the system by setting a breakpoint in the kernel 3131 * (if ddb was enabled). Thus, we need to check to make sure 3132 * that no breakpoints are being enabled for addresses outside 3133 * process's address space. 3134 * 3135 * XXX - what about when the watched area of the user's 3136 * address space is written into from within the kernel 3137 * ... wouldn't that still cause a breakpoint to be generated 3138 * from within kernel mode? 3139 */ 3140 3141 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { 3142 /* dr0 is enabled */ 3143 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) 3144 return (EINVAL); 3145 } 3146 3147 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { 3148 /* dr1 is enabled */ 3149 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) 3150 return (EINVAL); 3151 } 3152 3153 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { 3154 /* dr2 is enabled */ 3155 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) 3156 return (EINVAL); 3157 } 3158 3159 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { 3160 /* dr3 is enabled */ 3161 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) 3162 return (EINVAL); 3163 } 3164 3165 pcb->pcb_dr0 = dbregs->dr[0]; 3166 pcb->pcb_dr1 = dbregs->dr[1]; 3167 pcb->pcb_dr2 = dbregs->dr[2]; 3168 pcb->pcb_dr3 = dbregs->dr[3]; 3169 pcb->pcb_dr6 = dbregs->dr[6]; 3170 pcb->pcb_dr7 = dbregs->dr[7]; 3171 3172 pcb->pcb_flags |= PCB_DBREGS; 3173 } 3174 3175 return (0); 3176 } 3177 3178 /* 3179 * Return > 0 if a hardware breakpoint has been hit, and the 3180 * breakpoint was in user space. Return 0, otherwise. 3181 */ 3182 int 3183 user_dbreg_trap(register_t dr6) 3184 { 3185 u_int32_t dr7; 3186 u_int32_t bp; /* breakpoint bits extracted from dr6 */ 3187 int nbp; /* number of breakpoints that triggered */ 3188 caddr_t addr[4]; /* breakpoint addresses */ 3189 int i; 3190 3191 bp = dr6 & DBREG_DR6_BMASK; 3192 if (bp == 0) { 3193 /* 3194 * None of the breakpoint bits are set meaning this 3195 * trap was not caused by any of the debug registers 3196 */ 3197 return 0; 3198 } 3199 3200 dr7 = rdr7(); 3201 if ((dr7 & 0x000000ff) == 0) { 3202 /* 3203 * all GE and LE bits in the dr7 register are zero, 3204 * thus the trap couldn't have been caused by the 3205 * hardware debug registers 3206 */ 3207 return 0; 3208 } 3209 3210 nbp = 0; 3211 3212 /* 3213 * at least one of the breakpoints were hit, check to see 3214 * which ones and if any of them are user space addresses 3215 */ 3216 3217 if (bp & 0x01) { 3218 addr[nbp++] = (caddr_t)rdr0(); 3219 } 3220 if (bp & 0x02) { 3221 addr[nbp++] = (caddr_t)rdr1(); 3222 } 3223 if (bp & 0x04) { 3224 addr[nbp++] = (caddr_t)rdr2(); 3225 } 3226 if (bp & 0x08) { 3227 addr[nbp++] = (caddr_t)rdr3(); 3228 } 3229 3230 for (i = 0; i < nbp; i++) { 3231 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 3232 /* 3233 * addr[i] is in user space 3234 */ 3235 return nbp; 3236 } 3237 } 3238 3239 /* 3240 * None of the breakpoints are in user space. 3241 */ 3242 return 0; 3243 } 3244 3245 #ifdef KDB 3246 3247 /* 3248 * Provide inb() and outb() as functions. They are normally only available as 3249 * inline functions, thus cannot be called from the debugger. 3250 */ 3251 3252 /* silence compiler warnings */ 3253 u_char inb_(u_short); 3254 void outb_(u_short, u_char); 3255 3256 u_char 3257 inb_(u_short port) 3258 { 3259 return inb(port); 3260 } 3261 3262 void 3263 outb_(u_short port, u_char data) 3264 { 3265 outb(port, data); 3266 } 3267 3268 #endif /* KDB */ 3269