1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 2003 Peter Wemm. 5 * Copyright (c) 1992 Terrence R. Lambert. 6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * William Jolitz. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. All advertising materials mentioning features or use of this software 21 * must display the following acknowledgement: 22 * This product includes software developed by the University of 23 * California, Berkeley and its contributors. 24 * 4. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_atpic.h" 47 #include "opt_cpu.h" 48 #include "opt_ddb.h" 49 #include "opt_inet.h" 50 #include "opt_isa.h" 51 #include "opt_kstack_pages.h" 52 #include "opt_maxmem.h" 53 #include "opt_mp_watchdog.h" 54 #include "opt_pci.h" 55 #include "opt_platform.h" 56 #include "opt_sched.h" 57 58 #include <sys/param.h> 59 #include <sys/proc.h> 60 #include <sys/systm.h> 61 #include <sys/asan.h> 62 #include <sys/bio.h> 63 #include <sys/buf.h> 64 #include <sys/bus.h> 65 #include <sys/callout.h> 66 #include <sys/cons.h> 67 #include <sys/cpu.h> 68 #include <sys/csan.h> 69 #include <sys/efi.h> 70 #include <sys/eventhandler.h> 71 #include <sys/exec.h> 72 #include <sys/imgact.h> 73 #include <sys/kdb.h> 74 #include <sys/kernel.h> 75 #include <sys/ktr.h> 76 #include <sys/linker.h> 77 #include <sys/lock.h> 78 #include <sys/malloc.h> 79 #include <sys/memrange.h> 80 #include <sys/msan.h> 81 #include <sys/msgbuf.h> 82 #include <sys/mutex.h> 83 #include <sys/pcpu.h> 84 #include <sys/ptrace.h> 85 #include <sys/reboot.h> 86 #include <sys/reg.h> 87 #include <sys/rwlock.h> 88 #include <sys/sched.h> 89 #include <sys/signalvar.h> 90 #ifdef SMP 91 #include <sys/smp.h> 92 #endif 93 #include <sys/syscallsubr.h> 94 #include <sys/sysctl.h> 95 #include <sys/sysent.h> 96 #include <sys/sysproto.h> 97 #include <sys/ucontext.h> 98 #include 
<sys/vmmeter.h> 99 100 #include <vm/vm.h> 101 #include <vm/vm_param.h> 102 #include <vm/vm_extern.h> 103 #include <vm/vm_kern.h> 104 #include <vm/vm_page.h> 105 #include <vm/vm_map.h> 106 #include <vm/vm_object.h> 107 #include <vm/vm_pager.h> 108 #include <vm/vm_phys.h> 109 #include <vm/vm_dumpset.h> 110 111 #ifdef DDB 112 #ifndef KDB 113 #error KDB must be enabled in order for DDB to work! 114 #endif 115 #include <ddb/ddb.h> 116 #include <ddb/db_sym.h> 117 #endif 118 119 #include <net/netisr.h> 120 121 #include <machine/clock.h> 122 #include <machine/cpu.h> 123 #include <machine/cputypes.h> 124 #include <machine/frame.h> 125 #include <machine/intr_machdep.h> 126 #include <x86/mca.h> 127 #include <machine/md_var.h> 128 #include <machine/metadata.h> 129 #include <machine/mp_watchdog.h> 130 #include <machine/pc/bios.h> 131 #include <machine/pcb.h> 132 #include <machine/proc.h> 133 #include <machine/sigframe.h> 134 #include <machine/specialreg.h> 135 #include <machine/trap.h> 136 #include <machine/tss.h> 137 #include <x86/ucode.h> 138 #include <x86/ifunc.h> 139 #ifdef SMP 140 #include <machine/smp.h> 141 #endif 142 #ifdef FDT 143 #include <x86/fdt.h> 144 #endif 145 146 #ifdef DEV_ATPIC 147 #include <x86/isa/icu.h> 148 #else 149 #include <x86/apicvar.h> 150 #endif 151 152 #include <isa/isareg.h> 153 #include <isa/rtc.h> 154 #include <x86/init.h> 155 156 /* Sanity check for __curthread() */ 157 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 158 159 /* 160 * The PTI trampoline stack needs enough space for a hardware trapframe and a 161 * couple of scratch registers, as well as the trapframe left behind after an 162 * iret fault. 
163 */ 164 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) - 165 offsetof(struct pti_frame, pti_rip)); 166 167 extern u_int64_t hammer_time(u_int64_t, u_int64_t); 168 169 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 170 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 171 172 static void cpu_startup(void *); 173 static void get_fpcontext(struct thread *td, mcontext_t *mcp, 174 char *xfpusave, size_t xfpusave_len); 175 static int set_fpcontext(struct thread *td, mcontext_t *mcp, 176 char *xfpustate, size_t xfpustate_len); 177 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 178 179 /* Preload data parse function */ 180 static caddr_t native_parse_preload_data(u_int64_t); 181 182 /* Native function to fetch and parse the e820 map */ 183 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *); 184 185 /* Default init_ops implementation. */ 186 struct init_ops init_ops = { 187 .parse_preload_data = native_parse_preload_data, 188 .early_clock_source_init = i8254_init, 189 .early_delay = i8254_delay, 190 .parse_memmap = native_parse_memmap, 191 }; 192 193 /* 194 * Physical address of the EFI System Table. Stashed from the metadata hints 195 * passed into the kernel and used by the EFI code to call runtime services. 
196 */ 197 vm_paddr_t efi_systbl_phys; 198 199 /* Intel ICH registers */ 200 #define ICH_PMBASE 0x400 201 #define ICH_SMI_EN ICH_PMBASE + 0x30 202 203 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; 204 205 int cold = 1; 206 207 long Maxmem = 0; 208 long realmem = 0; 209 210 struct kva_md_info kmi; 211 212 static struct trapframe proc0_tf; 213 struct region_descriptor r_idt; 214 215 struct pcpu *__pcpu; 216 struct pcpu temp_bsp_pcpu; 217 218 struct mtx icu_lock; 219 220 struct mem_range_softc mem_range_softc; 221 222 struct mtx dt_lock; /* lock for GDT and LDT */ 223 224 void (*vmm_resume_p)(void); 225 226 bool efi_boot; 227 228 static void 229 cpu_startup(dummy) 230 void *dummy; 231 { 232 uintmax_t memsize; 233 char *sysenv; 234 235 /* 236 * On MacBooks, we need to disallow the legacy USB circuit to 237 * generate an SMI# because this can cause several problems, 238 * namely: incorrect CPU frequency detection and failure to 239 * start the APs. 240 * We do this by disabling a bit in the SMI_EN (SMI Control and 241 * Enable register) of the Intel ICH LPC Interface Bridge. 242 */ 243 sysenv = kern_getenv("smbios.system.product"); 244 if (sysenv != NULL) { 245 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 246 strncmp(sysenv, "MacBook3,1", 10) == 0 || 247 strncmp(sysenv, "MacBook4,1", 10) == 0 || 248 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 249 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 250 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 251 strncmp(sysenv, "MacBookPro4,1", 13) == 0 || 252 strncmp(sysenv, "Macmini1,1", 10) == 0) { 253 if (bootverbose) 254 printf("Disabling LEGACY_USB_EN bit on " 255 "Intel ICH.\n"); 256 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 257 } 258 freeenv(sysenv); 259 } 260 261 /* 262 * Good {morning,afternoon,evening,night}. 263 */ 264 startrtclock(); 265 printcpuinfo(); 266 267 /* 268 * Display physical memory if SMBIOS reports reasonable amount. 
269 */ 270 memsize = 0; 271 sysenv = kern_getenv("smbios.memory.enabled"); 272 if (sysenv != NULL) { 273 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 274 freeenv(sysenv); 275 } 276 if (memsize < ptoa((uintmax_t)vm_free_count())) 277 memsize = ptoa((uintmax_t)Maxmem); 278 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 279 realmem = atop(memsize); 280 281 /* 282 * Display any holes after the first chunk of extended memory. 283 */ 284 if (bootverbose) { 285 int indx; 286 287 printf("Physical memory chunk(s):\n"); 288 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 289 vm_paddr_t size; 290 291 size = phys_avail[indx + 1] - phys_avail[indx]; 292 printf( 293 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 294 (uintmax_t)phys_avail[indx], 295 (uintmax_t)phys_avail[indx + 1] - 1, 296 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 297 } 298 } 299 300 vm_ksubmap_init(&kmi); 301 302 printf("avail memory = %ju (%ju MB)\n", 303 ptoa((uintmax_t)vm_free_count()), 304 ptoa((uintmax_t)vm_free_count()) / 1048576); 305 #ifdef DEV_PCI 306 if (bootverbose && intel_graphics_stolen_base != 0) 307 printf("intel stolen mem: base %#jx size %ju MB\n", 308 (uintmax_t)intel_graphics_stolen_base, 309 (uintmax_t)intel_graphics_stolen_size / 1024 / 1024); 310 #endif 311 312 /* 313 * Set up buffers, so they can be used to read disk labels. 314 */ 315 bufinit(); 316 vm_pager_bufferinit(); 317 318 cpu_setregs(); 319 } 320 321 static void 322 late_ifunc_resolve(void *dummy __unused) 323 { 324 link_elf_late_ireloc(); 325 } 326 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL); 327 328 /* 329 * Send an interrupt to process. 330 * 331 * Stack is set up to allow sigcode stored 332 * at top to call routine, followed by call 333 * to sigreturn routine below. After sigreturn 334 * resets the signal mask, the stack, and the 335 * frame pointer, it returns to the user 336 * specified pc, psl. 
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	/* Caller holds both the proc lock and the sigacts mutex. */
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	/*
	 * Reserve kernel stack space for any extended FPU (XSAVE) state
	 * beyond the legacy savefpu area; it is copied out separately.
	 */
	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	/* The trapframe registers start at mc_rdi in the mcontext. */
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	/* Don't leak kernel stack contents via the spare fields. */
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		/* Skip the 128-byte red zone below %rsp (SysV amd64 ABI). */
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
		/* The extended FPU state area must be 64-byte aligned. */
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	/* Drop the locks around the sleepable copyout below. */
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		/* An unwritable stack is fatal to the process. */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/* Arrange to resume in the signal trampoline with a clean frame. */
	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	/* Reacquire the locks in the order the caller expects. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
464 * 465 * MPSAFE 466 */ 467 int 468 sys_sigreturn(td, uap) 469 struct thread *td; 470 struct sigreturn_args /* { 471 const struct __ucontext *sigcntxp; 472 } */ *uap; 473 { 474 ucontext_t uc; 475 struct pcb *pcb; 476 struct proc *p; 477 struct trapframe *regs; 478 ucontext_t *ucp; 479 char *xfpustate; 480 size_t xfpustate_len; 481 long rflags; 482 int cs, error, ret; 483 ksiginfo_t ksi; 484 485 pcb = td->td_pcb; 486 p = td->td_proc; 487 488 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 489 if (error != 0) { 490 uprintf("pid %d (%s): sigreturn copyin failed\n", 491 p->p_pid, td->td_name); 492 return (error); 493 } 494 ucp = &uc; 495 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { 496 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, 497 td->td_name, ucp->uc_mcontext.mc_flags); 498 return (EINVAL); 499 } 500 regs = td->td_frame; 501 rflags = ucp->uc_mcontext.mc_rflags; 502 /* 503 * Don't allow users to change privileged or reserved flags. 504 */ 505 if (!EFL_SECURE(rflags, regs->tf_rflags)) { 506 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid, 507 td->td_name, rflags); 508 return (EINVAL); 509 } 510 511 /* 512 * Don't allow users to load a valid privileged %cs. Let the 513 * hardware check for invalid selectors, excess privilege in 514 * other selectors, invalid %eip's and invalid %esp's. 
515 */ 516 cs = ucp->uc_mcontext.mc_cs; 517 if (!CS_SECURE(cs)) { 518 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid, 519 td->td_name, cs); 520 ksiginfo_init_trap(&ksi); 521 ksi.ksi_signo = SIGBUS; 522 ksi.ksi_code = BUS_OBJERR; 523 ksi.ksi_trapno = T_PROTFLT; 524 ksi.ksi_addr = (void *)regs->tf_rip; 525 trapsignal(td, &ksi); 526 return (EINVAL); 527 } 528 529 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { 530 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; 531 if (xfpustate_len > cpu_max_ext_state_size - 532 sizeof(struct savefpu)) { 533 uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", 534 p->p_pid, td->td_name, xfpustate_len); 535 return (EINVAL); 536 } 537 xfpustate = __builtin_alloca(xfpustate_len); 538 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, 539 xfpustate, xfpustate_len); 540 if (error != 0) { 541 uprintf( 542 "pid %d (%s): sigreturn copying xfpustate failed\n", 543 p->p_pid, td->td_name); 544 return (error); 545 } 546 } else { 547 xfpustate = NULL; 548 xfpustate_len = 0; 549 } 550 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); 551 if (ret != 0) { 552 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", 553 p->p_pid, td->td_name, ret); 554 return (ret); 555 } 556 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); 557 update_pcb_bases(pcb); 558 pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; 559 pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; 560 561 #if defined(COMPAT_43) 562 if (ucp->uc_mcontext.mc_onstack & 1) 563 td->td_sigstk.ss_flags |= SS_ONSTACK; 564 else 565 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 566 #endif 567 568 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 569 return (EJUSTRETURN); 570 } 571 572 #ifdef COMPAT_FREEBSD4 573 int 574 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) 575 { 576 577 return sys_sigreturn(td, (struct sigreturn_args *)uap); 578 } 579 #endif 580 581 /* 582 * Reset the hardware debug registers if they were in use. 
 * They won't have any meaning for the newly exec'd process.
 */
void
x86_clear_dbregs(struct pcb *pcb)
{
	/* Nothing to do unless the old image was using them. */
	if ((pcb->pcb_flags & PCB_DBREGS) == 0)
		return;

	pcb->pcb_dr0 = 0;
	pcb->pcb_dr1 = 0;
	pcb->pcb_dr2 = 0;
	pcb->pcb_dr3 = 0;
	pcb->pcb_dr6 = 0;
	pcb->pcb_dr7 = 0;

	if (pcb == curpcb) {
		/*
		 * Clear the debug registers on the running CPU,
		 * otherwise they will end up affecting the next
		 * process we switch to.
		 */
		reset_dbregs();
	}
	clear_pcb_flags(pcb, PCB_DBREGS);
}

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	/* Preserve only the trace flag across the frame reset. */
	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	/* 16-byte align the stack, keeping an 8-byte slot for the return. */
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;

	x86_clear_dbregs(pcb);

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

/* Dedicated stacks for faults that cannot run on the normal kstack. */
static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

/*
 * Install an interrupt gate: record the handler address, the kernel
 * code selector, the IST stack index, gate type and privilege level.
 */
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}

/* Low-level exception entry points, defined in exception.S. */
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers.
*/ 872 DB_SHOW_COMMAND(sysregs, db_show_sysregs) 873 { 874 struct { 875 uint16_t limit; 876 uint64_t base; 877 } __packed idtr, gdtr; 878 uint16_t ldt, tr; 879 880 __asm __volatile("sidt %0" : "=m" (idtr)); 881 db_printf("idtr\t0x%016lx/%04x\n", 882 (u_long)idtr.base, (u_int)idtr.limit); 883 __asm __volatile("sgdt %0" : "=m" (gdtr)); 884 db_printf("gdtr\t0x%016lx/%04x\n", 885 (u_long)gdtr.base, (u_int)gdtr.limit); 886 __asm __volatile("sldt %0" : "=r" (ldt)); 887 db_printf("ldtr\t0x%04x\n", ldt); 888 __asm __volatile("str %0" : "=r" (tr)); 889 db_printf("tr\t0x%04x\n", tr); 890 db_printf("cr0\t0x%016lx\n", rcr0()); 891 db_printf("cr2\t0x%016lx\n", rcr2()); 892 db_printf("cr3\t0x%016lx\n", rcr3()); 893 db_printf("cr4\t0x%016lx\n", rcr4()); 894 if (rcr4() & CR4_XSAVE) 895 db_printf("xcr0\t0x%016lx\n", rxcr(0)); 896 db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER)); 897 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) 898 db_printf("FEATURES_CTL\t%016lx\n", 899 rdmsr(MSR_IA32_FEATURE_CONTROL)); 900 db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR)); 901 db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT)); 902 db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE)); 903 } 904 905 DB_SHOW_COMMAND(dbregs, db_show_dbregs) 906 { 907 908 db_printf("dr0\t0x%016lx\n", rdr0()); 909 db_printf("dr1\t0x%016lx\n", rdr1()); 910 db_printf("dr2\t0x%016lx\n", rdr2()); 911 db_printf("dr3\t0x%016lx\n", rdr3()); 912 db_printf("dr6\t0x%016lx\n", rdr6()); 913 db_printf("dr7\t0x%016lx\n", rdr7()); 914 } 915 #endif 916 917 void 918 sdtossd(sd, ssd) 919 struct user_segment_descriptor *sd; 920 struct soft_segment_descriptor *ssd; 921 { 922 923 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 924 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 925 ssd->ssd_type = sd->sd_type; 926 ssd->ssd_dpl = sd->sd_dpl; 927 ssd->ssd_p = sd->sd_p; 928 ssd->ssd_long = sd->sd_long; 929 ssd->ssd_def32 = sd->sd_def32; 930 ssd->ssd_gran = sd->sd_gran; 931 } 932 933 void 934 ssdtosd(ssd, sd) 935 struct 
soft_segment_descriptor *ssd; 936 struct user_segment_descriptor *sd; 937 { 938 939 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 940 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; 941 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 942 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 943 sd->sd_type = ssd->ssd_type; 944 sd->sd_dpl = ssd->ssd_dpl; 945 sd->sd_p = ssd->ssd_p; 946 sd->sd_long = ssd->ssd_long; 947 sd->sd_def32 = ssd->ssd_def32; 948 sd->sd_gran = ssd->ssd_gran; 949 } 950 951 void 952 ssdtosyssd(ssd, sd) 953 struct soft_segment_descriptor *ssd; 954 struct system_segment_descriptor *sd; 955 { 956 957 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 958 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; 959 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 960 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 961 sd->sd_type = ssd->ssd_type; 962 sd->sd_dpl = ssd->ssd_dpl; 963 sd->sd_p = ssd->ssd_p; 964 sd->sd_gran = ssd->ssd_gran; 965 } 966 967 u_int basemem; 968 969 static int 970 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, 971 int *physmap_idxp) 972 { 973 int i, insert_idx, physmap_idx; 974 975 physmap_idx = *physmap_idxp; 976 977 if (length == 0) 978 return (1); 979 980 /* 981 * Find insertion point while checking for overlap. Start off by 982 * assuming the new entry will be added to the end. 983 * 984 * NB: physmap_idx points to the next free slot. 985 */ 986 insert_idx = physmap_idx; 987 for (i = 0; i <= physmap_idx; i += 2) { 988 if (base < physmap[i + 1]) { 989 if (base + length <= physmap[i]) { 990 insert_idx = i; 991 break; 992 } 993 if (boothowto & RB_VERBOSE) 994 printf( 995 "Overlapping memory regions, ignoring second region\n"); 996 return (1); 997 } 998 } 999 1000 /* See if we can prepend to the next entry. */ 1001 if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { 1002 physmap[insert_idx] = base; 1003 return (1); 1004 } 1005 1006 /* See if we can append to the previous entry. 
*/
	/*
	 * Coalesce with the preceding segment when the new range starts
	 * exactly where that segment ends.
	 */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
	"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

/*
 * Walk the BIOS SMAP (INT 15h/E820) table and add every plain-memory
 * range to the physmap[] base/bound array via add_physmap_entry().
 * smapsize is the table size in bytes; entries of any type other than
 * SMAP_TYPE_MEMORY are skipped.
 */
void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		/* add_physmap_entry() returns 0 when physmap[] is full. */
		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

/*
 * Walk the UEFI memory map and add every usable descriptor to
 * physmap[].  With RB_VERBOSE, dump the whole map (type, addresses,
 * page count and attribute bits) to the console first.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	/* Printable names indexed by EFI memory descriptor type. */
	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	/* Guard against a corrupt header before dividing by it. */
	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

/*
 * Populate physmap[] from whatever memory map the loader handed us,
 * preferring the UEFI map over the BIOS SMAP when both are present,
 * and record the boot method ("UEFI" or "BIOS") accordingly.
 */
static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		/* Size word stored by the loader just before the table. */
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/* physmap_idx now indexes the last base of the base/bound pairs. */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			/* basemem is kept in KB. */
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* MAXMEM is specified in KB; convert to 4K pages. */
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	/* CMAP1/CADDR1 provide a scratch mapping for the page test below. */
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/* dump_avail[] also covers pages unusable for VM. */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;	/* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	/* Tear down the scratch mapping used by the memory test. */
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer.
*/ 1463 msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]); 1464 } 1465 1466 static caddr_t 1467 native_parse_preload_data(u_int64_t modulep) 1468 { 1469 caddr_t kmdp; 1470 char *envp; 1471 #ifdef DDB 1472 vm_offset_t ksym_start; 1473 vm_offset_t ksym_end; 1474 #endif 1475 1476 preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE); 1477 preload_bootstrap_relocate(KERNBASE); 1478 kmdp = preload_search_by_type("elf kernel"); 1479 if (kmdp == NULL) 1480 kmdp = preload_search_by_type("elf64 kernel"); 1481 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); 1482 envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); 1483 if (envp != NULL) 1484 envp += KERNBASE; 1485 init_static_kenv(envp, 0); 1486 #ifdef DDB 1487 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); 1488 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); 1489 db_fetch_ksymtab(ksym_start, ksym_end, 0); 1490 #endif 1491 efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t); 1492 1493 return (kmdp); 1494 } 1495 1496 static void 1497 amd64_kdb_init(void) 1498 { 1499 kdb_init(); 1500 #ifdef KDB 1501 if (boothowto & RB_KDB) 1502 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); 1503 #endif 1504 } 1505 1506 /* Set up the fast syscall stuff */ 1507 void 1508 amd64_conf_fast_syscall(void) 1509 { 1510 uint64_t msr; 1511 1512 msr = rdmsr(MSR_EFER) | EFER_SCE; 1513 wrmsr(MSR_EFER, msr); 1514 wrmsr(MSR_LSTAR, pti ? 
(u_int64_t)IDTVEC(fast_syscall_pti) : 1515 (u_int64_t)IDTVEC(fast_syscall)); 1516 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); 1517 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | 1518 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); 1519 wrmsr(MSR_STAR, msr); 1520 wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC); 1521 } 1522 1523 void 1524 amd64_bsp_pcpu_init1(struct pcpu *pc) 1525 { 1526 struct user_segment_descriptor *gdt; 1527 1528 PCPU_SET(prvspace, pc); 1529 gdt = *PCPU_PTR(gdt); 1530 PCPU_SET(curthread, &thread0); 1531 PCPU_SET(tssp, PCPU_PTR(common_tss)); 1532 PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); 1533 PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]); 1534 PCPU_SET(fs32p, &gdt[GUFS32_SEL]); 1535 PCPU_SET(gs32p, &gdt[GUGS32_SEL]); 1536 PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); 1537 PCPU_SET(smp_tlb_gen, 1); 1538 } 1539 1540 void 1541 amd64_bsp_pcpu_init2(uint64_t rsp0) 1542 { 1543 1544 PCPU_SET(rsp0, rsp0); 1545 PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) + 1546 PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful); 1547 PCPU_SET(curpcb, thread0.td_pcb); 1548 } 1549 1550 void 1551 amd64_bsp_ist_init(struct pcpu *pc) 1552 { 1553 struct nmi_pcpu *np; 1554 struct amd64tss *tssp; 1555 1556 tssp = &pc->pc_common_tss; 1557 1558 /* doublefault stack space, runs on ist1 */ 1559 np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1; 1560 np->np_pcpu = (register_t)pc; 1561 tssp->tss_ist1 = (long)np; 1562 1563 /* 1564 * NMI stack, runs on ist2. The pcpu pointer is stored just 1565 * above the start of the ist2 stack. 1566 */ 1567 np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1; 1568 np->np_pcpu = (register_t)pc; 1569 tssp->tss_ist2 = (long)np; 1570 1571 /* 1572 * MC# stack, runs on ist3. The pcpu pointer is stored just 1573 * above the start of the ist3 stack. 
1574 */ 1575 np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1; 1576 np->np_pcpu = (register_t)pc; 1577 tssp->tss_ist3 = (long)np; 1578 1579 /* 1580 * DB# stack, runs on ist4. 1581 */ 1582 np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1; 1583 np->np_pcpu = (register_t)pc; 1584 tssp->tss_ist4 = (long)np; 1585 } 1586 1587 u_int64_t 1588 hammer_time(u_int64_t modulep, u_int64_t physfree) 1589 { 1590 caddr_t kmdp; 1591 int gsel_tss, x; 1592 struct pcpu *pc; 1593 struct xstate_hdr *xhdr; 1594 uint64_t cr3, rsp0; 1595 pml4_entry_t *pml4e; 1596 pdp_entry_t *pdpe; 1597 pd_entry_t *pde; 1598 char *env; 1599 struct user_segment_descriptor *gdt; 1600 struct region_descriptor r_gdt; 1601 size_t kstack0_sz; 1602 int late_console; 1603 1604 TSRAW(&thread0, TS_ENTER, __func__, NULL); 1605 1606 /* 1607 * Calculate kernphys by inspecting page table created by loader. 1608 * The assumptions: 1609 * - kernel is mapped at KERNBASE, backed by contiguous phys memory 1610 * aligned at 2M, below 4G (the latter is important for AP startup) 1611 * - there is a 2M hole at KERNBASE 1612 * - kernel is mapped with 2M superpages 1613 * - all participating memory, i.e. 
kernel, modules, metadata,
	 *   page table is accessible by pre-created 1:1 mapping
	 *   (right now loader creates 1:1 mapping for lower 4G, and all
	 *   memory is from there)
	 * - there is a usable memory block right after the end of the
	 *   mapped kernel and all modules/metadata, pointed to by
	 *   physfree, for early allocations
	 */
	/* Walk the loader-built page table down to the kernel's PDE. */
	cr3 = rcr3();
	pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index(
	    (vm_offset_t)hammer_time);
	pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index(
	    (vm_offset_t)hammer_time);
	pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index(
	    (vm_offset_t)hammer_time);
	kernphys = (vm_paddr_t)(*pde & ~PDRMASK) -
	    (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK);

	/* Fix-up for 2M hole */
	physfree += kernphys;
	kernphys += NBPDR;

	kmdp = init_ops.parse_preload_data(modulep);

	/* Booted via UEFI iff the loader passed an EFI memory map. */
	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
	    MODINFOMD_EFI_MAP) != NULL;

	if (!efi_boot) {
		/* Tell the bios to warmboot next time */
		atomic_store_short((u_short *)0x472, 0x1234);
	}

	/* Load CPU microcode update, if one was preloaded. */
	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve thread0's kernel stack out of the early free memory. */
	thread0.td_kstack = physfree - kernphys + KERNSTART;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		/* TSS and LDT slots are system descriptors, set below. */
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (efi_boot)
		vty_set_preferred(VTY_VT);

	/* CPU vulnerability mitigation knobs (old and new spellings). */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
	    &x86_rngds_mitg_enable);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Reinitialize thread0's stack base now that the xsave area size is
	 * known.  Set up thread0's pcb save area after fpuinit calculated fpu
	 * save area size.  Zero out the extended state header in fpu save area.
	 */
	set_top_of_stack_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall!
*/ 1913 rsp0 = thread0.td_md.md_stack_base; 1914 /* Ensure the stack is aligned to 16 bytes */ 1915 rsp0 &= ~0xFul; 1916 PCPU_PTR(common_tss)->tss_rsp0 = rsp0; 1917 amd64_bsp_pcpu_init2(rsp0); 1918 1919 /* transfer to user mode */ 1920 1921 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 1922 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 1923 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); 1924 _ufssel = GSEL(GUFS32_SEL, SEL_UPL); 1925 _ugssel = GSEL(GUGS32_SEL, SEL_UPL); 1926 1927 load_ds(_udatasel); 1928 load_es(_udatasel); 1929 load_fs(_ufssel); 1930 1931 /* setup proc 0's pcb */ 1932 thread0.td_pcb->pcb_flags = 0; 1933 thread0.td_frame = &proc0_tf; 1934 1935 env = kern_getenv("kernelname"); 1936 if (env != NULL) 1937 strlcpy(kernelname, env, sizeof(kernelname)); 1938 1939 kcsan_cpu_init(0); 1940 1941 #ifdef FDT 1942 x86_init_fdt(); 1943 #endif 1944 thread0.td_critnest = 0; 1945 1946 kasan_init(); 1947 kmsan_init(); 1948 1949 TSEXIT(); 1950 1951 /* Location of kernel stack for locore */ 1952 return (thread0.td_md.md_stack_base); 1953 } 1954 1955 void 1956 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) 1957 { 1958 1959 pcpu->pc_acpi_id = 0xffffffff; 1960 } 1961 1962 static int 1963 smap_sysctl_handler(SYSCTL_HANDLER_ARGS) 1964 { 1965 struct bios_smap *smapbase; 1966 struct bios_smap_xattr smap; 1967 caddr_t kmdp; 1968 uint32_t *smapattr; 1969 int count, error, i; 1970 1971 /* Retrieve the system memory map from the loader. 
*/ 1972 kmdp = preload_search_by_type("elf kernel"); 1973 if (kmdp == NULL) 1974 kmdp = preload_search_by_type("elf64 kernel"); 1975 smapbase = (struct bios_smap *)preload_search_info(kmdp, 1976 MODINFO_METADATA | MODINFOMD_SMAP); 1977 if (smapbase == NULL) 1978 return (0); 1979 smapattr = (uint32_t *)preload_search_info(kmdp, 1980 MODINFO_METADATA | MODINFOMD_SMAP_XATTR); 1981 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase); 1982 error = 0; 1983 for (i = 0; i < count; i++) { 1984 smap.base = smapbase[i].base; 1985 smap.length = smapbase[i].length; 1986 smap.type = smapbase[i].type; 1987 if (smapattr != NULL) 1988 smap.xattr = smapattr[i]; 1989 else 1990 smap.xattr = 0; 1991 error = SYSCTL_OUT(req, &smap, sizeof(smap)); 1992 } 1993 return (error); 1994 } 1995 SYSCTL_PROC(_machdep, OID_AUTO, smap, 1996 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 1997 smap_sysctl_handler, "S,bios_smap_xattr", 1998 "Raw BIOS SMAP data"); 1999 2000 static int 2001 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS) 2002 { 2003 struct efi_map_header *efihdr; 2004 caddr_t kmdp; 2005 uint32_t efisize; 2006 2007 kmdp = preload_search_by_type("elf kernel"); 2008 if (kmdp == NULL) 2009 kmdp = preload_search_by_type("elf64 kernel"); 2010 efihdr = (struct efi_map_header *)preload_search_info(kmdp, 2011 MODINFO_METADATA | MODINFOMD_EFI_MAP); 2012 if (efihdr == NULL) 2013 return (0); 2014 efisize = *((uint32_t *)efihdr - 1); 2015 return (SYSCTL_OUT(req, efihdr, efisize)); 2016 } 2017 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, 2018 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 2019 efi_map_sysctl_handler, "S,efi_map_header", 2020 "Raw EFI Memory Map"); 2021 2022 void 2023 spinlock_enter(void) 2024 { 2025 struct thread *td; 2026 register_t flags; 2027 2028 td = curthread; 2029 if (td->td_md.md_spinlock_count == 0) { 2030 flags = intr_disable(); 2031 td->td_md.md_spinlock_count = 1; 2032 td->td_md.md_saved_flags = flags; 2033 critical_enter(); 2034 } else 2035 
td->td_md.md_spinlock_count++;
}

/*
 * Leave a spinlock section: drop the nesting count and, on the last
 * exit, leave the critical section and restore the interrupt state
 * saved by spinlock_enter().
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	/* Only callee-saved registers plus rip/rsp are needed. */
	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

/*
 * Set the instruction pointer of the traced thread; always succeeds.
 * PCB_FULL_IRET forces a full register restore on return to usermode.
 */
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/*
 * Arm single-stepping for the traced thread by setting the trap flag
 * in its saved rflags; TDB_STEP records that we own the flag.
 */
int
ptrace_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
		td->td_frame->tf_rflags |= PSL_T;
		td->td_dbgflags |= TDB_STEP;
	}
	return (0);
}

/* Disarm single-stepping set up by ptrace_single_step(). */
int
ptrace_clear_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	td->td_frame->tf_rflags &= ~PSL_T;
	td->td_dbgflags &= ~TDB_STEP;
	return (0);
}

/* Externalize the thread's trapframe into a struct reg. */
int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	return (fill_frame_regs(tp, regs));
}

/*
 * Copy a trapframe into a struct reg.  Segment registers are only
 * valid in the frame when TF_HASSEGS is set; otherwise report zeros.
 */
int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{

	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9 = tp->tf_r9;
	regs->r_r8 = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	if (tp->tf_flags & TF_HASSEGS) {
		regs->r_ds = tp->tf_ds;
		regs->r_es = tp->tf_es;
		regs->r_fs = tp->tf_fs;
		regs->r_gs = tp->tf_gs;
	} else {
		regs->r_ds = 0;
		regs->r_es = 0;
		regs->r_fs = 0;
		regs->r_gs = 0;
	}
	regs->r_err = 0;
	regs->r_trapno = 0;
	return (0);
}

/*
 * Internalize a struct reg into the thread's trapframe.  Rejects
 * rflags/cs values that would grant the thread extra privilege.
 */
int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9 = regs->r_r9;
	tp->tf_r8 = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	/* Segment register write-back is deliberately disabled. */
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
tp->tf_flags = TF_HASSEGS;
	}
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers: 8 x87 regs, 10 significant bytes (80 bits) each. */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers: 16 x 128-bit XMM regs. */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	/* Never let userland set reserved MXCSR bits. */
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}

/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	/* The target must not be running while we read its FPU state. */
	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	/* Flush live FPU contents to the save area first. */
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}

/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	critical_exit();
	return (0);
}

/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9 = tp->tf_r9;
	mcp->mc_r8 = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	/* GET_MC_CLEAR_RET scrubs syscall return values (rax/rdx/carry). */
	if (flags & GET_MC_CLEAR_RET) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len
= sizeof(*mcp); 2325 get_fpcontext(td, mcp, NULL, 0); 2326 update_pcb_bases(pcb); 2327 mcp->mc_fsbase = pcb->pcb_fsbase; 2328 mcp->mc_gsbase = pcb->pcb_gsbase; 2329 mcp->mc_xfpustate = 0; 2330 mcp->mc_xfpustate_len = 0; 2331 bzero(mcp->mc_spare, sizeof(mcp->mc_spare)); 2332 return (0); 2333 } 2334 2335 /* 2336 * Set machine context. 2337 * 2338 * However, we don't set any but the user modifiable flags, and we won't 2339 * touch the cs selector. 2340 */ 2341 int 2342 set_mcontext(struct thread *td, mcontext_t *mcp) 2343 { 2344 struct pcb *pcb; 2345 struct trapframe *tp; 2346 char *xfpustate; 2347 long rflags; 2348 int ret; 2349 2350 pcb = td->td_pcb; 2351 tp = td->td_frame; 2352 if (mcp->mc_len != sizeof(*mcp) || 2353 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) 2354 return (EINVAL); 2355 rflags = (mcp->mc_rflags & PSL_USERCHANGE) | 2356 (tp->tf_rflags & ~PSL_USERCHANGE); 2357 if (mcp->mc_flags & _MC_HASFPXSTATE) { 2358 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - 2359 sizeof(struct savefpu)) 2360 return (EINVAL); 2361 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); 2362 ret = copyin((void *)mcp->mc_xfpustate, xfpustate, 2363 mcp->mc_xfpustate_len); 2364 if (ret != 0) 2365 return (ret); 2366 } else 2367 xfpustate = NULL; 2368 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); 2369 if (ret != 0) 2370 return (ret); 2371 tp->tf_r15 = mcp->mc_r15; 2372 tp->tf_r14 = mcp->mc_r14; 2373 tp->tf_r13 = mcp->mc_r13; 2374 tp->tf_r12 = mcp->mc_r12; 2375 tp->tf_r11 = mcp->mc_r11; 2376 tp->tf_r10 = mcp->mc_r10; 2377 tp->tf_r9 = mcp->mc_r9; 2378 tp->tf_r8 = mcp->mc_r8; 2379 tp->tf_rdi = mcp->mc_rdi; 2380 tp->tf_rsi = mcp->mc_rsi; 2381 tp->tf_rbp = mcp->mc_rbp; 2382 tp->tf_rbx = mcp->mc_rbx; 2383 tp->tf_rdx = mcp->mc_rdx; 2384 tp->tf_rcx = mcp->mc_rcx; 2385 tp->tf_rax = mcp->mc_rax; 2386 tp->tf_rip = mcp->mc_rip; 2387 tp->tf_rflags = rflags; 2388 tp->tf_rsp = mcp->mc_rsp; 2389 tp->tf_ss = mcp->mc_ss; 2390 tp->tf_flags = mcp->mc_flags; 2391 if (tp->tf_flags & 
TF_HASSEGS) { 2392 tp->tf_ds = mcp->mc_ds; 2393 tp->tf_es = mcp->mc_es; 2394 tp->tf_fs = mcp->mc_fs; 2395 tp->tf_gs = mcp->mc_gs; 2396 } 2397 set_pcb_flags(pcb, PCB_FULL_IRET); 2398 if (mcp->mc_flags & _MC_HASBASES) { 2399 pcb->pcb_fsbase = mcp->mc_fsbase; 2400 pcb->pcb_gsbase = mcp->mc_gsbase; 2401 } 2402 return (0); 2403 } 2404 2405 static void 2406 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, 2407 size_t xfpusave_len) 2408 { 2409 size_t max_len, len; 2410 2411 mcp->mc_ownedfp = fpugetregs(td); 2412 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], 2413 sizeof(mcp->mc_fpstate)); 2414 mcp->mc_fpformat = fpuformat(); 2415 if (!use_xsave || xfpusave_len == 0) 2416 return; 2417 max_len = cpu_max_ext_state_size - sizeof(struct savefpu); 2418 len = xfpusave_len; 2419 if (len > max_len) { 2420 len = max_len; 2421 bzero(xfpusave + max_len, len - max_len); 2422 } 2423 mcp->mc_flags |= _MC_HASFPXSTATE; 2424 mcp->mc_xfpustate_len = len; 2425 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); 2426 } 2427 2428 static int 2429 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, 2430 size_t xfpustate_len) 2431 { 2432 int error; 2433 2434 if (mcp->mc_fpformat == _MC_FPFMT_NODEV) 2435 return (0); 2436 else if (mcp->mc_fpformat != _MC_FPFMT_XMM) 2437 return (EINVAL); 2438 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { 2439 /* We don't care what state is left in the FPU or PCB. */ 2440 fpstate_drop(td); 2441 error = 0; 2442 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || 2443 mcp->mc_ownedfp == _MC_FPOWNED_PCB) { 2444 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate, 2445 xfpustate, xfpustate_len); 2446 } else 2447 return (EINVAL); 2448 return (error); 2449 } 2450 2451 void 2452 fpstate_drop(struct thread *td) 2453 { 2454 2455 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); 2456 critical_enter(); 2457 if (PCPU_GET(fpcurthread) == td) 2458 fpudrop(); 2459 /* 2460 * XXX force a full drop of the fpu. 
The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
	 * have too many layers.
	 */
	clear_pcb_flags(curthread->td_pcb,
	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
	critical_exit();
}

/*
 * Externalize the debug registers.  td == NULL reads the live hardware
 * registers (debugger context); otherwise the saved pcb copies are used.
 */
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (td == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		pcb = td->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	/* dr4/dr5 are aliases of dr6/dr7; dr8-dr15 do not exist. */
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[8] = 0;
	dbregs->dr[9] = 0;
	dbregs->dr[10] = 0;
	dbregs->dr[11] = 0;
	dbregs->dr[12] = 0;
	dbregs->dr[13] = 0;
	dbregs->dr[14] = 0;
	dbregs->dr[15] = 0;
	return (0);
}

/*
 * Internalize debug registers, validating dr6/dr7 first.  td == NULL
 * loads the hardware registers directly; otherwise the values are
 * stored in the pcb and activated via PCB_DBREGS.
 */
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;
	int i;

	if (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * result in undefined behaviour and can lead to an unexpected
		 * TRCTRAP or a general protection fault right here.
		 * Upper bits of dr6 and dr7 must not be set
		 */
		for (i = 0; i < 4; i++) {
			/* R/W field 0b10 is reserved/undefined. */
			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
			/* 8-byte watchpoints are invalid for 32-bit code. */
			if (td->td_frame->tf_cs == _ucode32sel &&
			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
				return (EINVAL);
		}
		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
			return (EINVAL);

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * process's address space.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
			/* dr0 is enabled */
			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
			/* dr1 is enabled */
			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
			/* dr2 is enabled */
			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
			/* dr3 is enabled */
			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		/* Have the context switch path load the debug registers. */
		set_pcb_flags(pcb, PCB_DBREGS);
	}

	return (0);
}

void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
2605 */ 2606 int 2607 user_dbreg_trap(register_t dr6) 2608 { 2609 u_int64_t dr7; 2610 u_int64_t bp; /* breakpoint bits extracted from dr6 */ 2611 int nbp; /* number of breakpoints that triggered */ 2612 caddr_t addr[4]; /* breakpoint addresses */ 2613 int i; 2614 2615 bp = dr6 & DBREG_DR6_BMASK; 2616 if (bp == 0) { 2617 /* 2618 * None of the breakpoint bits are set meaning this 2619 * trap was not caused by any of the debug registers 2620 */ 2621 return 0; 2622 } 2623 2624 dr7 = rdr7(); 2625 if ((dr7 & 0x000000ff) == 0) { 2626 /* 2627 * all GE and LE bits in the dr7 register are zero, 2628 * thus the trap couldn't have been caused by the 2629 * hardware debug registers 2630 */ 2631 return 0; 2632 } 2633 2634 nbp = 0; 2635 2636 /* 2637 * at least one of the breakpoints were hit, check to see 2638 * which ones and if any of them are user space addresses 2639 */ 2640 2641 if (bp & 0x01) { 2642 addr[nbp++] = (caddr_t)rdr0(); 2643 } 2644 if (bp & 0x02) { 2645 addr[nbp++] = (caddr_t)rdr1(); 2646 } 2647 if (bp & 0x04) { 2648 addr[nbp++] = (caddr_t)rdr2(); 2649 } 2650 if (bp & 0x08) { 2651 addr[nbp++] = (caddr_t)rdr3(); 2652 } 2653 2654 for (i = 0; i < nbp; i++) { 2655 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 2656 /* 2657 * addr[i] is in user space 2658 */ 2659 return nbp; 2660 } 2661 } 2662 2663 /* 2664 * None of the breakpoints are in user space. 2665 */ 2666 return 0; 2667 } 2668 2669 /* 2670 * The pcb_flags is only modified by current thread, or by other threads 2671 * when current thread is stopped. However, current thread may change it 2672 * from the interrupt context in cpu_switch(), or in the trap handler. 2673 * When we read-modify-write pcb_flags from C sources, compiler may generate 2674 * code that is not atomic regarding the interrupt handler. If a trap or 2675 * interrupt happens and any flag is modified from the handler, it can be 2676 * clobbered with the cached value later. 
 Therefore, we implement setting
 * and clearing flags with single-instruction functions, which do not race
 * with possible modification of the flags from the trap or interrupt context,
 * because traps and interrupts are executed only on instruction boundary.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	/* Single "orl" so an intervening trap cannot clobber the RMW. */
	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");

}

/*
 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
 * pcb if user space modified the bases.  We must save on the context
 * switch or if the return to usermode happens through the doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which have a consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		/* Block context switches while sampling the bases. */
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

/* Resolve set_pcb_flags at boot based on FSGSBASE support. */
DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}

void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	/* Single "andl" counterpart of set_pcb_flags_raw(). */
	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return inb(port);
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */

#undef memset
#undef memmove
#undef memcpy

void	*memset_std(void *buf, int c, size_t len);
void	*memset_erms(void *buf, int c, size_t len);
void	*memmove_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);

#ifdef KCSAN
/*
 * These fail to build as ifuncs when used with KCSAN.
 */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
#else
/* Select the ERMS ("rep stosb/movsb") variants when the CPU has them. */
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
#endif

void	pagezero_std(void *addr);
void	pagezero_erms(void *addr);
DEFINE_IFUNC(, void , pagezero, (void *))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}