1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 2003 Peter Wemm. 5 * Copyright (c) 1992 Terrence R. Lambert. 6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * William Jolitz. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. All advertising materials mentioning features or use of this software 21 * must display the following acknowledgement: 22 * This product includes software developed by the University of 23 * California, Berkeley and its contributors. 24 * 4. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_atpic.h" 47 #include "opt_cpu.h" 48 #include "opt_ddb.h" 49 #include "opt_inet.h" 50 #include "opt_isa.h" 51 #include "opt_kstack_pages.h" 52 #include "opt_maxmem.h" 53 #include "opt_mp_watchdog.h" 54 #include "opt_pci.h" 55 #include "opt_platform.h" 56 #include "opt_sched.h" 57 58 #include <sys/param.h> 59 #include <sys/proc.h> 60 #include <sys/systm.h> 61 #include <sys/asan.h> 62 #include <sys/bio.h> 63 #include <sys/buf.h> 64 #include <sys/bus.h> 65 #include <sys/callout.h> 66 #include <sys/cons.h> 67 #include <sys/cpu.h> 68 #include <sys/csan.h> 69 #include <sys/efi.h> 70 #include <sys/eventhandler.h> 71 #include <sys/exec.h> 72 #include <sys/imgact.h> 73 #include <sys/kdb.h> 74 #include <sys/kernel.h> 75 #include <sys/ktr.h> 76 #include <sys/linker.h> 77 #include <sys/lock.h> 78 #include <sys/malloc.h> 79 #include <sys/memrange.h> 80 #include <sys/msgbuf.h> 81 #include <sys/mutex.h> 82 #include <sys/pcpu.h> 83 #include <sys/ptrace.h> 84 #include <sys/reboot.h> 85 #include <sys/rwlock.h> 86 #include <sys/sched.h> 87 #include <sys/signalvar.h> 88 #ifdef SMP 89 #include <sys/smp.h> 90 #endif 91 #include <sys/syscallsubr.h> 92 #include <sys/sysctl.h> 93 #include <sys/sysent.h> 94 #include <sys/sysproto.h> 95 #include <sys/ucontext.h> 96 #include <sys/vmmeter.h> 97 98 #include <vm/vm.h> 99 #include 
<vm/vm_param.h> 100 #include <vm/vm_extern.h> 101 #include <vm/vm_kern.h> 102 #include <vm/vm_page.h> 103 #include <vm/vm_map.h> 104 #include <vm/vm_object.h> 105 #include <vm/vm_pager.h> 106 #include <vm/vm_phys.h> 107 #include <vm/vm_dumpset.h> 108 109 #ifdef DDB 110 #ifndef KDB 111 #error KDB must be enabled in order for DDB to work! 112 #endif 113 #include <ddb/ddb.h> 114 #include <ddb/db_sym.h> 115 #endif 116 117 #include <net/netisr.h> 118 119 #include <machine/clock.h> 120 #include <machine/cpu.h> 121 #include <machine/cputypes.h> 122 #include <machine/frame.h> 123 #include <machine/intr_machdep.h> 124 #include <x86/mca.h> 125 #include <machine/md_var.h> 126 #include <machine/metadata.h> 127 #include <machine/mp_watchdog.h> 128 #include <machine/pc/bios.h> 129 #include <machine/pcb.h> 130 #include <machine/proc.h> 131 #include <machine/reg.h> 132 #include <machine/sigframe.h> 133 #include <machine/specialreg.h> 134 #include <machine/trap.h> 135 #include <machine/tss.h> 136 #include <x86/ucode.h> 137 #include <x86/ifunc.h> 138 #ifdef SMP 139 #include <machine/smp.h> 140 #endif 141 #ifdef FDT 142 #include <x86/fdt.h> 143 #endif 144 145 #ifdef DEV_ATPIC 146 #include <x86/isa/icu.h> 147 #else 148 #include <x86/apicvar.h> 149 #endif 150 151 #include <isa/isareg.h> 152 #include <isa/rtc.h> 153 #include <x86/init.h> 154 155 /* Sanity check for __curthread() */ 156 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 157 158 /* 159 * The PTI trampoline stack needs enough space for a hardware trapframe and a 160 * couple of scratch registers, as well as the trapframe left behind after an 161 * iret fault. 
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

/* Entry point from the loader/locore; returns the initial %rsp. */
extern u_int64_t hammer_time(u_int64_t, u_int64_t);

/* A %cs value is acceptable from userland iff it selects user privilege. */
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
/* New %rflags may differ from the old only in user-changeable bits. */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data = native_parse_preload_data,
	.early_clock_source_init = i8254_init,
	.early_delay = i8254_delay,
	.parse_memmap = native_parse_memmap,
};

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
195 */ 196 vm_paddr_t efi_systbl_phys; 197 198 /* Intel ICH registers */ 199 #define ICH_PMBASE 0x400 200 #define ICH_SMI_EN ICH_PMBASE + 0x30 201 202 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; 203 204 int cold = 1; 205 206 long Maxmem = 0; 207 long realmem = 0; 208 209 struct kva_md_info kmi; 210 211 static struct trapframe proc0_tf; 212 struct region_descriptor r_idt; 213 214 struct pcpu *__pcpu; 215 struct pcpu temp_bsp_pcpu; 216 217 struct mtx icu_lock; 218 219 struct mem_range_softc mem_range_softc; 220 221 struct mtx dt_lock; /* lock for GDT and LDT */ 222 223 void (*vmm_resume_p)(void); 224 225 static void 226 cpu_startup(dummy) 227 void *dummy; 228 { 229 uintmax_t memsize; 230 char *sysenv; 231 232 /* 233 * On MacBooks, we need to disallow the legacy USB circuit to 234 * generate an SMI# because this can cause several problems, 235 * namely: incorrect CPU frequency detection and failure to 236 * start the APs. 237 * We do this by disabling a bit in the SMI_EN (SMI Control and 238 * Enable register) of the Intel ICH LPC Interface Bridge. 239 */ 240 sysenv = kern_getenv("smbios.system.product"); 241 if (sysenv != NULL) { 242 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 243 strncmp(sysenv, "MacBook3,1", 10) == 0 || 244 strncmp(sysenv, "MacBook4,1", 10) == 0 || 245 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 246 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 247 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 248 strncmp(sysenv, "MacBookPro4,1", 13) == 0 || 249 strncmp(sysenv, "Macmini1,1", 10) == 0) { 250 if (bootverbose) 251 printf("Disabling LEGACY_USB_EN bit on " 252 "Intel ICH.\n"); 253 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 254 } 255 freeenv(sysenv); 256 } 257 258 /* 259 * Good {morning,afternoon,evening,night}. 260 */ 261 startrtclock(); 262 printcpuinfo(); 263 264 /* 265 * Display physical memory if SMBIOS reports reasonable amount. 
266 */ 267 memsize = 0; 268 sysenv = kern_getenv("smbios.memory.enabled"); 269 if (sysenv != NULL) { 270 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 271 freeenv(sysenv); 272 } 273 if (memsize < ptoa((uintmax_t)vm_free_count())) 274 memsize = ptoa((uintmax_t)Maxmem); 275 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 276 realmem = atop(memsize); 277 278 /* 279 * Display any holes after the first chunk of extended memory. 280 */ 281 if (bootverbose) { 282 int indx; 283 284 printf("Physical memory chunk(s):\n"); 285 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 286 vm_paddr_t size; 287 288 size = phys_avail[indx + 1] - phys_avail[indx]; 289 printf( 290 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 291 (uintmax_t)phys_avail[indx], 292 (uintmax_t)phys_avail[indx + 1] - 1, 293 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 294 } 295 } 296 297 vm_ksubmap_init(&kmi); 298 299 printf("avail memory = %ju (%ju MB)\n", 300 ptoa((uintmax_t)vm_free_count()), 301 ptoa((uintmax_t)vm_free_count()) / 1048576); 302 #ifdef DEV_PCI 303 if (bootverbose && intel_graphics_stolen_base != 0) 304 printf("intel stolen mem: base %#jx size %ju MB\n", 305 (uintmax_t)intel_graphics_stolen_base, 306 (uintmax_t)intel_graphics_stolen_size / 1024 / 1024); 307 #endif 308 309 /* 310 * Set up buffers, so they can be used to read disk labels. 311 */ 312 bufinit(); 313 vm_pager_bufferinit(); 314 315 cpu_setregs(); 316 } 317 318 static void 319 late_ifunc_resolve(void *dummy __unused) 320 { 321 link_elf_late_ireloc(); 322 } 323 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL); 324 325 /* 326 * Send an interrupt to process. 327 * 328 * Stack is set up to allow sigcode stored 329 * at top to call routine, followed by call 330 * to sigreturn routine below. After sigreturn 331 * resets the signal mask, the stack, and the 332 * frame pointer, it returns to the user 333 * specified pc, psl. 
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	/* Caller holds both the proc lock and the sigacts mutex. */
	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	/*
	 * Reserve kernel-stack scratch space for the extended (XSAVE)
	 * FPU state beyond the legacy savefpu area, if the CPU has any.
	 */
	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	/* The trapframe is copied starting at mc_rdi within the mcontext. */
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		/* Leave a 128-byte gap below %rsp (the ABI red zone). */
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		/* Extended FPU state goes below the frame, 64-byte aligned. */
		sp -= xfpusave_len;
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	/* Drop the locks around copyout(); both are retaken below. */
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		/* An unwritable stack is fatal to the process. */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/* Redirect the trapframe so the thread resumes in the sigcode. */
	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to cleanup state after a signal
 * has been taken. Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
461 * 462 * MPSAFE 463 */ 464 int 465 sys_sigreturn(td, uap) 466 struct thread *td; 467 struct sigreturn_args /* { 468 const struct __ucontext *sigcntxp; 469 } */ *uap; 470 { 471 ucontext_t uc; 472 struct pcb *pcb; 473 struct proc *p; 474 struct trapframe *regs; 475 ucontext_t *ucp; 476 char *xfpustate; 477 size_t xfpustate_len; 478 long rflags; 479 int cs, error, ret; 480 ksiginfo_t ksi; 481 482 pcb = td->td_pcb; 483 p = td->td_proc; 484 485 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 486 if (error != 0) { 487 uprintf("pid %d (%s): sigreturn copyin failed\n", 488 p->p_pid, td->td_name); 489 return (error); 490 } 491 ucp = &uc; 492 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { 493 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, 494 td->td_name, ucp->uc_mcontext.mc_flags); 495 return (EINVAL); 496 } 497 regs = td->td_frame; 498 rflags = ucp->uc_mcontext.mc_rflags; 499 /* 500 * Don't allow users to change privileged or reserved flags. 501 */ 502 if (!EFL_SECURE(rflags, regs->tf_rflags)) { 503 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid, 504 td->td_name, rflags); 505 return (EINVAL); 506 } 507 508 /* 509 * Don't allow users to load a valid privileged %cs. Let the 510 * hardware check for invalid selectors, excess privilege in 511 * other selectors, invalid %eip's and invalid %esp's. 
512 */ 513 cs = ucp->uc_mcontext.mc_cs; 514 if (!CS_SECURE(cs)) { 515 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid, 516 td->td_name, cs); 517 ksiginfo_init_trap(&ksi); 518 ksi.ksi_signo = SIGBUS; 519 ksi.ksi_code = BUS_OBJERR; 520 ksi.ksi_trapno = T_PROTFLT; 521 ksi.ksi_addr = (void *)regs->tf_rip; 522 trapsignal(td, &ksi); 523 return (EINVAL); 524 } 525 526 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { 527 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; 528 if (xfpustate_len > cpu_max_ext_state_size - 529 sizeof(struct savefpu)) { 530 uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", 531 p->p_pid, td->td_name, xfpustate_len); 532 return (EINVAL); 533 } 534 xfpustate = __builtin_alloca(xfpustate_len); 535 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, 536 xfpustate, xfpustate_len); 537 if (error != 0) { 538 uprintf( 539 "pid %d (%s): sigreturn copying xfpustate failed\n", 540 p->p_pid, td->td_name); 541 return (error); 542 } 543 } else { 544 xfpustate = NULL; 545 xfpustate_len = 0; 546 } 547 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); 548 if (ret != 0) { 549 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", 550 p->p_pid, td->td_name, ret); 551 return (ret); 552 } 553 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); 554 update_pcb_bases(pcb); 555 pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; 556 pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; 557 558 #if defined(COMPAT_43) 559 if (ucp->uc_mcontext.mc_onstack & 1) 560 td->td_sigstk.ss_flags |= SS_ONSTACK; 561 else 562 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 563 #endif 564 565 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 566 return (EJUSTRETURN); 567 } 568 569 #ifdef COMPAT_FREEBSD4 570 int 571 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) 572 { 573 574 return sys_sigreturn(td, (struct sigreturn_args *)uap); 575 } 576 #endif 577 578 /* 579 * Reset the hardware debug registers if they were in use. 
 * They won't have any meaning for the newly exec'd process.
 */
void
x86_clear_dbregs(struct pcb *pcb)
{
	/* Nothing to do unless this pcb has debug registers active. */
	if ((pcb->pcb_flags & PCB_DBREGS) == 0)
		return;

	pcb->pcb_dr0 = 0;
	pcb->pcb_dr1 = 0;
	pcb->pcb_dr2 = 0;
	pcb->pcb_dr3 = 0;
	pcb->pcb_dr6 = 0;
	pcb->pcb_dr7 = 0;

	if (pcb == curpcb) {
		/*
		 * Clear the debug registers on the running CPU,
		 * otherwise they will end up affecting the next
		 * process we switch to.
		 */
		reset_dbregs();
	}
	clear_pcb_flags(pcb, PCB_DBREGS);
}

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	/* Any private LDT belongs to the old image; release it. */
	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	/* Preserve only the trace flag (for debuggers) across the exec. */
	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	/* 16-byte align the stack, keeping the 8-byte return-slot offset. */
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;

	x86_clear_dbregs(pcb);

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

/* Set the mandatory control-register bits in %cr0. */
void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP. See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

/* Dedicated per-exception stacks, referenced via the TSS IST slots. */
static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

/*
 * Install an interrupt gate in the IDT.
 * idx: vector number; func: handler entry point; typ: gate type;
 * dpl: privilege required to raise the vector; ist: TSS IST stack index.
 */
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		/* Reassemble the 64-bit entry point from the gate halves. */
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers.
*/ 869 DB_SHOW_COMMAND(sysregs, db_show_sysregs) 870 { 871 struct { 872 uint16_t limit; 873 uint64_t base; 874 } __packed idtr, gdtr; 875 uint16_t ldt, tr; 876 877 __asm __volatile("sidt %0" : "=m" (idtr)); 878 db_printf("idtr\t0x%016lx/%04x\n", 879 (u_long)idtr.base, (u_int)idtr.limit); 880 __asm __volatile("sgdt %0" : "=m" (gdtr)); 881 db_printf("gdtr\t0x%016lx/%04x\n", 882 (u_long)gdtr.base, (u_int)gdtr.limit); 883 __asm __volatile("sldt %0" : "=r" (ldt)); 884 db_printf("ldtr\t0x%04x\n", ldt); 885 __asm __volatile("str %0" : "=r" (tr)); 886 db_printf("tr\t0x%04x\n", tr); 887 db_printf("cr0\t0x%016lx\n", rcr0()); 888 db_printf("cr2\t0x%016lx\n", rcr2()); 889 db_printf("cr3\t0x%016lx\n", rcr3()); 890 db_printf("cr4\t0x%016lx\n", rcr4()); 891 if (rcr4() & CR4_XSAVE) 892 db_printf("xcr0\t0x%016lx\n", rxcr(0)); 893 db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER)); 894 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) 895 db_printf("FEATURES_CTL\t%016lx\n", 896 rdmsr(MSR_IA32_FEATURE_CONTROL)); 897 db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR)); 898 db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT)); 899 db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE)); 900 } 901 902 DB_SHOW_COMMAND(dbregs, db_show_dbregs) 903 { 904 905 db_printf("dr0\t0x%016lx\n", rdr0()); 906 db_printf("dr1\t0x%016lx\n", rdr1()); 907 db_printf("dr2\t0x%016lx\n", rdr2()); 908 db_printf("dr3\t0x%016lx\n", rdr3()); 909 db_printf("dr6\t0x%016lx\n", rdr6()); 910 db_printf("dr7\t0x%016lx\n", rdr7()); 911 } 912 #endif 913 914 void 915 sdtossd(sd, ssd) 916 struct user_segment_descriptor *sd; 917 struct soft_segment_descriptor *ssd; 918 { 919 920 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 921 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 922 ssd->ssd_type = sd->sd_type; 923 ssd->ssd_dpl = sd->sd_dpl; 924 ssd->ssd_p = sd->sd_p; 925 ssd->ssd_long = sd->sd_long; 926 ssd->ssd_def32 = sd->sd_def32; 927 ssd->ssd_gran = sd->sd_gran; 928 } 929 930 void 931 ssdtosd(ssd, sd) 932 struct 
soft_segment_descriptor *ssd; 933 struct user_segment_descriptor *sd; 934 { 935 936 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 937 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; 938 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 939 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 940 sd->sd_type = ssd->ssd_type; 941 sd->sd_dpl = ssd->ssd_dpl; 942 sd->sd_p = ssd->ssd_p; 943 sd->sd_long = ssd->ssd_long; 944 sd->sd_def32 = ssd->ssd_def32; 945 sd->sd_gran = ssd->ssd_gran; 946 } 947 948 void 949 ssdtosyssd(ssd, sd) 950 struct soft_segment_descriptor *ssd; 951 struct system_segment_descriptor *sd; 952 { 953 954 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 955 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; 956 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 957 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 958 sd->sd_type = ssd->ssd_type; 959 sd->sd_dpl = ssd->ssd_dpl; 960 sd->sd_p = ssd->ssd_p; 961 sd->sd_gran = ssd->ssd_gran; 962 } 963 964 u_int basemem; 965 966 static int 967 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, 968 int *physmap_idxp) 969 { 970 int i, insert_idx, physmap_idx; 971 972 physmap_idx = *physmap_idxp; 973 974 if (length == 0) 975 return (1); 976 977 /* 978 * Find insertion point while checking for overlap. Start off by 979 * assuming the new entry will be added to the end. 980 * 981 * NB: physmap_idx points to the next free slot. 982 */ 983 insert_idx = physmap_idx; 984 for (i = 0; i <= physmap_idx; i += 2) { 985 if (base < physmap[i + 1]) { 986 if (base + length <= physmap[i]) { 987 insert_idx = i; 988 break; 989 } 990 if (boothowto & RB_VERBOSE) 991 printf( 992 "Overlapping memory regions, ignoring second region\n"); 993 return (1); 994 } 995 } 996 997 /* See if we can prepend to the next entry. */ 998 if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { 999 physmap[insert_idx] = base; 1000 return (1); 1001 } 1002 1003 /* See if we can append to the previous entry. 
 */
	/* Coalesce with the previous entry if the ranges are contiguous. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

/*
 * Walk a BIOS INT 15h E820 memory map of 'smapsize' bytes and add every
 * SMAP_TYPE_MEMORY segment to the physmap[] array via add_physmap_entry().
 * Stops early when add_physmap_entry() reports the array is full.  With
 * RB_VERBOSE each raw entry is printed first.
 */
void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

/*
 * Walk the UEFI memory map provided by the loader and add usable ranges
 * to physmap[].  Only loader code/data, boot-services code/data and
 * ConventionalMemory descriptors are added; all other types are skipped.
 * With RB_VERBOSE the full map, including attribute flags, is dumped.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	/* Guard against a corrupt header before dividing by it. */
	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

/*
 * Fill physmap[] from loader-supplied metadata.  The UEFI map is
 * preferred when present; otherwise the BIOS SMAP is used.  Records the
 * boot method name in 'bootmethod' and panics if the loader provided
 * neither map.
 */
static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		/* The loader stores the SMAP size just before the data. */
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/* Step back to the index of the last base/bound pair. */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			/* NOTE(review): message lacks a trailing '\n'. */
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in real mode mode (e.g. SMP bare metal).
	 */
#ifdef SMP
	mp_bootaddress(physmap, &physmap_idx);
#endif

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;	/* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

/*
 * Locate and relocate the module metadata passed in by the loader, then
 * initialize boothowto, the static kernel environment, the optional DDB
 * symbol table, and the physical address of the EFI system table.
 * Returns the kernel metadata pointer ("elf kernel"/"elf64 kernel").
 */
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

/*
 * Initialize the kernel debugger framework and, if RB_KDB was given on
 * the boot command line, drop into the debugger immediately.
 */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	/* Enable SYSCALL/SYSRET via EFER.SCE. */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ?
	    (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

/*
 * First-stage BSP per-CPU data initialization: record the pcpu area and
 * curthread, point the TSS/LDT/32-bit fs/gs descriptor pointers into the
 * per-CPU GDT, and initialize the user-CR3 load mask and the TLB
 * invalidation generation counter.
 */
void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}

/*
 * Second-stage BSP per-CPU setup: record the kernel stack pointer, the
 * 16-byte-aligned top of the PTI trampoline stack, and thread0's pcb.
 */
void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}

/*
 * Point the TSS IST entries at the dedicated exception stacks (double
 * fault, NMI, MC#, DB#).  For each stack, a struct nmi_pcpu holding the
 * pcpu pointer is placed at the very top and the IST entry points just
 * below it.
 */
void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/* doublefault stack space, runs on ist1 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}

/*
 * Machine-dependent startup for the bootstrap processor, called from
 * locore with the loader module pointer and the first free physical
 * address.  Identifies the CPU, selects PTI/PCID modes before ifunc
 * resolution, builds the GDT/IDT/TSS, processes mitigation tunables,
 * sizes memory via getmemsize(), initializes consoles, the debugger,
 * the FPU and thread0/pcpu state, and returns the top of thread0's
 * kernel stack for locore to switch onto.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kmdp = init_ops.parse_preload_data(modulep);

	/* Load microcode for the BSP; it consumes physical memory. */
	physfree += ucode_load_bsp(physfree + KERNBASE);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve thread0's kernel stack out of the free physical memory. */
	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		/* System descriptors occupy two slots; skip them here. */
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	/* Speculative-execution mitigation tunables (old and new names). */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
	    &x86_rngds_mitg_enable);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Reinitialize thread0's stack base now that the xsave area size is
	 * known.  Set up thread0's pcb save area after fpuinit calculated fpu
	 * save area size.  Zero out the extended state header in fpu save area.
	 */
	set_top_of_stack_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	kasan_init();

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}

/*
 * MD per-CPU initialization hook.  The ACPI id defaults to 0xffffffff
 * here; presumably a not-yet-assigned marker overwritten later — verify
 * against the ACPI CPU attach code.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

/*
 * sysctl handler exporting the raw BIOS SMAP, merged with the optional
 * extended-attribute table, as an array of struct bios_smap_xattr.
 */
static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	/* The loader stores the table size just before the data. */
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		/*
		 * NOTE(review): 'error' is overwritten each iteration and a
		 * SYSCTL_OUT failure does not terminate the loop early; only
		 * the last iteration's status is returned.
		 */
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr",
    "Raw BIOS SMAP data");

/*
 * sysctl handler exporting the raw UEFI memory map (header plus
 * descriptors) exactly as the loader provided it.
 */
static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	/* The loader stores the map size just before the header. */
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header",
    "Raw EFI Memory Map");

/*
 * Enter a spin-lock section: on the first (outermost) acquisition,
 * disable interrupts, remember the previous flags in the thread, and
 * enter a critical section; nested acquisitions just bump the count.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}

/*
 * Leave a spin-lock section: when the outermost section is exited, leave
 * the critical section and restore the interrupt flags saved by
 * spinlock_enter().
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

/*
 * ptrace support: set the debuggee's instruction pointer and request a
 * full register restore (PCB_FULL_IRET) on the return to user mode.
 */
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/*
 * Arm single-stepping by setting the trap flag (PSL_T) in the debuggee's
 * saved rflags; TDB_STEP records that the flag was set on its behalf.
 */
int
ptrace_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
		td->td_frame->tf_rflags |= PSL_T;
		td->td_dbgflags |= TDB_STEP;
	}
	return (0);
}

/* Disarm single-stepping: clear the trap flag and the TDB_STEP marker. */
int
ptrace_clear_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	td->td_frame->tf_rflags &= ~PSL_T;
	td->td_dbgflags &= ~TDB_STEP;
	return (0);
}

/* Externalize a thread's trapframe into a struct reg. */
int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	return (fill_frame_regs(tp, regs));
}

/*
 * Copy a trapframe into the ptrace/core-dump 'struct reg' layout.  The
 * segment registers are only valid when the frame carries TF_HASSEGS;
 * otherwise they are reported as zero.
 */
int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{

	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9 = tp->tf_r9;
	regs->r_r8 = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	if (tp->tf_flags & TF_HASSEGS) {
		regs->r_ds = tp->tf_ds;
		regs->r_es = tp->tf_es;
		regs->r_fs = tp->tf_fs;
		regs->r_gs = tp->tf_gs;
	} else {
		regs->r_ds = 0;
		regs->r_es = 0;
		regs->r_fs = 0;
		regs->r_gs = 0;
	}
	regs->r_err = 0;
	regs->r_trapno = 0;
	return (0);
}

/*
 * Install a struct reg into a thread's trapframe.  rflags is truncated
 * to its low 32 bits and, together with %cs, validated by EFL_SECURE()/
 * CS_SECURE() before anything is written; EINVAL on failure.  The
 * segment-register writes are compiled out (if (0) — XXXKIB).
 */
int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9 = regs->r_r9;
	tp->tf_r8 = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	if (0) {	/* XXXKIB
 */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	/* Mask with the CPU's supported bits to avoid setting reserved ones. */
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

	/* FPU
 registers */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}

/* externalize from td->pcb */
/*
 * Export the thread's FPU state into 'fpregs'.  The target thread must
 * be curthread, suspended, or its process stopped (asserted below), so
 * the state cannot change underneath us.  fpugetregs() flushes any
 * live FPU context into the PCB save area first.  Always returns 0.
 */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}

/* internalize to td->pcb */
/*
 * Install 'fpregs' into the thread's PCB save area.  Runs inside a
 * critical section so a context switch cannot interleave with the
 * update; fpuuserinited() marks the user FPU state as initialized.
 * Always returns 0.
 */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	critical_exit();
	return (0);
}

/*
 * Get machine context.
 *
 * Snapshot the thread's trapframe, FPU state and fs/gs bases into the
 * mcontext.  With GET_MC_CLEAR_RET the syscall return registers
 * (%rax, %rdx) and the carry flag are cleared in the copy, so a
 * context resumed after a successful syscall observes a zero result.
 * Always returns 0.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	/* sigonstack() consults the sigaltstack state; proc lock required. */
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9 = tp->tf_r9;
	mcp->mc_r8 = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	if (flags & GET_MC_CLEAR_RET) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len = sizeof(*mcp);
	/* No extended-state buffer here; caller-visible xfpustate is empty. */
	get_fpcontext(td, mcp, NULL, 0);
	update_pcb_bases(pcb);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}

/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 *
 * Returns EINVAL if the mcontext length or flag bits are malformed, or
 * propagates the error from copying in the extended FPU state.  The
 * untrusted mc_xfpustate_len is bounded before the alloca below.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	/* Merge user-changeable flag bits with the preserved kernel bits. */
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		/* Length was bounded above, so stack allocation is safe. */
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	/* Validate/install FPU state before touching the trapframe. */
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9 = mcp->mc_r9;
	tp->tf_r8 = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags
= rflags; 2355 tp->tf_rsp = mcp->mc_rsp; 2356 tp->tf_ss = mcp->mc_ss; 2357 tp->tf_flags = mcp->mc_flags; 2358 if (tp->tf_flags & TF_HASSEGS) { 2359 tp->tf_ds = mcp->mc_ds; 2360 tp->tf_es = mcp->mc_es; 2361 tp->tf_fs = mcp->mc_fs; 2362 tp->tf_gs = mcp->mc_gs; 2363 } 2364 set_pcb_flags(pcb, PCB_FULL_IRET); 2365 if (mcp->mc_flags & _MC_HASBASES) { 2366 pcb->pcb_fsbase = mcp->mc_fsbase; 2367 pcb->pcb_gsbase = mcp->mc_gsbase; 2368 } 2369 return (0); 2370 } 2371 2372 static void 2373 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, 2374 size_t xfpusave_len) 2375 { 2376 size_t max_len, len; 2377 2378 mcp->mc_ownedfp = fpugetregs(td); 2379 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], 2380 sizeof(mcp->mc_fpstate)); 2381 mcp->mc_fpformat = fpuformat(); 2382 if (!use_xsave || xfpusave_len == 0) 2383 return; 2384 max_len = cpu_max_ext_state_size - sizeof(struct savefpu); 2385 len = xfpusave_len; 2386 if (len > max_len) { 2387 len = max_len; 2388 bzero(xfpusave + max_len, len - max_len); 2389 } 2390 mcp->mc_flags |= _MC_HASFPXSTATE; 2391 mcp->mc_xfpustate_len = len; 2392 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); 2393 } 2394 2395 static int 2396 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, 2397 size_t xfpustate_len) 2398 { 2399 int error; 2400 2401 if (mcp->mc_fpformat == _MC_FPFMT_NODEV) 2402 return (0); 2403 else if (mcp->mc_fpformat != _MC_FPFMT_XMM) 2404 return (EINVAL); 2405 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { 2406 /* We don't care what state is left in the FPU or PCB. 
*/ 2407 fpstate_drop(td); 2408 error = 0; 2409 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || 2410 mcp->mc_ownedfp == _MC_FPOWNED_PCB) { 2411 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate, 2412 xfpustate, xfpustate_len); 2413 } else 2414 return (EINVAL); 2415 return (error); 2416 } 2417 2418 void 2419 fpstate_drop(struct thread *td) 2420 { 2421 2422 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); 2423 critical_enter(); 2424 if (PCPU_GET(fpcurthread) == td) 2425 fpudrop(); 2426 /* 2427 * XXX force a full drop of the fpu. The above only drops it if we 2428 * owned it. 2429 * 2430 * XXX I don't much like fpugetuserregs()'s semantics of doing a full 2431 * drop. Dropping only to the pcb matches fnsave's behaviour. 2432 * We only need to drop to !PCB_INITDONE in sendsig(). But 2433 * sendsig() is the only caller of fpugetuserregs()... perhaps we just 2434 * have too many layers. 2435 */ 2436 clear_pcb_flags(curthread->td_pcb, 2437 PCB_FPUINITDONE | PCB_USERFPUINITDONE); 2438 critical_exit(); 2439 } 2440 2441 int 2442 fill_dbregs(struct thread *td, struct dbreg *dbregs) 2443 { 2444 struct pcb *pcb; 2445 2446 if (td == NULL) { 2447 dbregs->dr[0] = rdr0(); 2448 dbregs->dr[1] = rdr1(); 2449 dbregs->dr[2] = rdr2(); 2450 dbregs->dr[3] = rdr3(); 2451 dbregs->dr[6] = rdr6(); 2452 dbregs->dr[7] = rdr7(); 2453 } else { 2454 pcb = td->td_pcb; 2455 dbregs->dr[0] = pcb->pcb_dr0; 2456 dbregs->dr[1] = pcb->pcb_dr1; 2457 dbregs->dr[2] = pcb->pcb_dr2; 2458 dbregs->dr[3] = pcb->pcb_dr3; 2459 dbregs->dr[6] = pcb->pcb_dr6; 2460 dbregs->dr[7] = pcb->pcb_dr7; 2461 } 2462 dbregs->dr[4] = 0; 2463 dbregs->dr[5] = 0; 2464 dbregs->dr[8] = 0; 2465 dbregs->dr[9] = 0; 2466 dbregs->dr[10] = 0; 2467 dbregs->dr[11] = 0; 2468 dbregs->dr[12] = 0; 2469 dbregs->dr[13] = 0; 2470 dbregs->dr[14] = 0; 2471 dbregs->dr[15] = 0; 2472 return (0); 2473 } 2474 2475 int 2476 set_dbregs(struct thread *td, struct dbreg *dbregs) 2477 { 2478 struct pcb *pcb; 2479 int i; 2480 2481 if 
 (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * result in undefined behaviour and can lead to an unexpected
		 * TRCTRAP or a general protection fault right here.
		 * Upper bits of dr6 and dr7 must not be set
		 */
		for (i = 0; i < 4; i++) {
			/* Access type 0x02 is architecturally undefined. */
			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
			/* 8-byte breakpoints are invalid for 32-bit code. */
			if (td->td_frame->tf_cs == _ucode32sel &&
			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
				return (EINVAL);
		}
		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
			return (EINVAL);

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * process's address space.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
			/* dr0 is enabled */
			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
			/* dr1 is enabled */
			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
			/* dr2 is enabled */
			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
			/* dr3 is enabled */
			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		/* All validated; commit to the PCB. */
		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		/* Tell cpu_switch() to load these on context switch. */
		set_pcb_flags(pcb, PCB_DBREGS);
	}

	return (0);
}

/*
 * Disable all hardware breakpoints and clear the debug registers.
 */
void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(register_t dr6)
{
	u_int64_t dr7;
	u_int64_t bp;	/* breakpoint bits extracted from dr6 */
	int nbp;	/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	bp = dr6 & DBREG_DR6_BMASK;
	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * all GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;

	/*
	 * at least one of the breakpoints were hit, check to see
	 * which ones and if any of them are user space addresses
	 */

	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}

/*
 * The pcb_flags is only modified by current thread, or by other threads
 * when current thread is stopped.  However, current thread may change it
 * from the interrupt context in cpu_switch(), or in the trap handler.
 * When we read-modify-write pcb_flags from C sources, compiler may generate
 * code that is not atomic regarding the interrupt handler.  If a trap or
 * interrupt happens and any flag is modified from the handler, it can be
 * clobbered with the cached value later.  Therefore, we implement setting
 * and clearing flags with single-instruction functions, which do not race
 * with possible modification of the flags from the trap or interrupt context,
 * because traps and interrupts are executed only on instruction boundary.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	/* Single "orl": atomic with respect to traps/interrupts (see above). */
	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

/*
 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
 * pcb if user space modified the bases.  We must save on the context
 * switch or if the return to usermode happens through the doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which have a consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		r = intr_disable();
		/* Re-check: a context switch may have set the flag already. */
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

/*
 * Resolve set_pcb_flags at boot: the FSGSBASE-aware variant is only
 * needed when the CPU lets user space write the segment bases directly.
 */
DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}

/*
 * Clear pcb_flags bits with a single "andl" so the read-modify-write
 * cannot race with flag updates from trap or interrupt context.
 */
void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return inb(port);
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */

/*
 * Undo any macro versions so the definitions below name the real
 * functions, resolved at boot to the plain or ERMS (Enhanced REP
 * MOVSB/STOSB) implementations depending on CPU support.
 */
#undef memset
#undef memmove
#undef memcpy

void *memset_std(void *buf, int c, size_t len);
void *memset_erms(void *buf, int c, size_t len);
void *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);

#ifdef KCSAN
/*
 * These fail to build as ifuncs when used with KCSAN.
 */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
#else
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
#endif

/* Page-zeroing primitive, likewise selected on ERMS support at boot. */
void pagezero_std(void *addr);
void pagezero_erms(void *addr);
DEFINE_IFUNC(, void , pagezero, (void *))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}