1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 2003 Peter Wemm. 5 * Copyright (c) 1992 Terrence R. Lambert. 6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * William Jolitz. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. All advertising materials mentioning features or use of this software 21 * must display the following acknowledgement: 22 * This product includes software developed by the University of 23 * California, Berkeley and its contributors. 24 * 4. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_atpic.h" 47 #include "opt_cpu.h" 48 #include "opt_ddb.h" 49 #include "opt_inet.h" 50 #include "opt_isa.h" 51 #include "opt_kstack_pages.h" 52 #include "opt_maxmem.h" 53 #include "opt_mp_watchdog.h" 54 #include "opt_pci.h" 55 #include "opt_platform.h" 56 #include "opt_sched.h" 57 58 #include <sys/param.h> 59 #include <sys/proc.h> 60 #include <sys/systm.h> 61 #include <sys/bio.h> 62 #include <sys/buf.h> 63 #include <sys/bus.h> 64 #include <sys/callout.h> 65 #include <sys/cons.h> 66 #include <sys/cpu.h> 67 #include <sys/csan.h> 68 #include <sys/efi.h> 69 #include <sys/eventhandler.h> 70 #include <sys/exec.h> 71 #include <sys/imgact.h> 72 #include <sys/kdb.h> 73 #include <sys/kernel.h> 74 #include <sys/ktr.h> 75 #include <sys/linker.h> 76 #include <sys/lock.h> 77 #include <sys/malloc.h> 78 #include <sys/memrange.h> 79 #include <sys/msgbuf.h> 80 #include <sys/mutex.h> 81 #include <sys/pcpu.h> 82 #include <sys/ptrace.h> 83 #include <sys/reboot.h> 84 #include <sys/rwlock.h> 85 #include <sys/sched.h> 86 #include <sys/signalvar.h> 87 #ifdef SMP 88 #include <sys/smp.h> 89 #endif 90 #include <sys/syscallsubr.h> 91 #include <sys/sysctl.h> 92 #include <sys/sysent.h> 93 #include <sys/sysproto.h> 94 #include <sys/ucontext.h> 95 #include <sys/vmmeter.h> 96 97 #include <vm/vm.h> 98 #include <vm/vm_extern.h> 99 #include 
<vm/vm_kern.h> 100 #include <vm/vm_page.h> 101 #include <vm/vm_map.h> 102 #include <vm/vm_object.h> 103 #include <vm/vm_pager.h> 104 #include <vm/vm_param.h> 105 #include <vm/vm_phys.h> 106 107 #ifdef DDB 108 #ifndef KDB 109 #error KDB must be enabled in order for DDB to work! 110 #endif 111 #include <ddb/ddb.h> 112 #include <ddb/db_sym.h> 113 #endif 114 115 #include <net/netisr.h> 116 117 #include <machine/clock.h> 118 #include <machine/cpu.h> 119 #include <machine/cputypes.h> 120 #include <machine/frame.h> 121 #include <machine/intr_machdep.h> 122 #include <x86/mca.h> 123 #include <machine/md_var.h> 124 #include <machine/metadata.h> 125 #include <machine/mp_watchdog.h> 126 #include <machine/pc/bios.h> 127 #include <machine/pcb.h> 128 #include <machine/proc.h> 129 #include <machine/reg.h> 130 #include <machine/sigframe.h> 131 #include <machine/specialreg.h> 132 #include <machine/trap.h> 133 #include <machine/tss.h> 134 #include <x86/ucode.h> 135 #include <x86/ifunc.h> 136 #ifdef SMP 137 #include <machine/smp.h> 138 #endif 139 #ifdef FDT 140 #include <x86/fdt.h> 141 #endif 142 143 #ifdef DEV_ATPIC 144 #include <x86/isa/icu.h> 145 #else 146 #include <x86/apicvar.h> 147 #endif 148 149 #include <isa/isareg.h> 150 #include <isa/rtc.h> 151 #include <x86/init.h> 152 153 /* Sanity check for __curthread() */ 154 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 155 156 /* 157 * The PTI trampoline stack needs enough space for a hardware trapframe and a 158 * couple of scratch registers, as well as the trapframe left behind after an 159 * iret fault. 
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

/* MD early-boot entry point; definition is not in this view. */
extern u_int64_t hammer_time(u_int64_t, u_int64_t);

/*
 * CS_SECURE: the code selector's privilege level is user (ring 3).
 * EFL_SECURE: the new rflags differ from the old only in bits a user
 * process is allowed to change (PSL_USERCHANGE).
 */
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation (bare-metal boot path). */
struct init_ops init_ops = {
	.parse_preload_data =		native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.mp_bootaddress =		mp_bootaddress,
	.start_all_aps =		native_start_all_aps,
#endif
#ifdef DEV_PCI
	.msi_init =			msi_init,
#endif
};

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
200 */ 201 vm_paddr_t efi_systbl_phys; 202 203 /* Intel ICH registers */ 204 #define ICH_PMBASE 0x400 205 #define ICH_SMI_EN ICH_PMBASE + 0x30 206 207 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; 208 209 int cold = 1; 210 211 long Maxmem = 0; 212 long realmem = 0; 213 214 struct kva_md_info kmi; 215 216 static struct trapframe proc0_tf; 217 struct region_descriptor r_idt; 218 219 struct pcpu *__pcpu; 220 struct pcpu temp_bsp_pcpu; 221 222 struct mtx icu_lock; 223 224 struct mem_range_softc mem_range_softc; 225 226 struct mtx dt_lock; /* lock for GDT and LDT */ 227 228 void (*vmm_resume_p)(void); 229 230 static void 231 cpu_startup(dummy) 232 void *dummy; 233 { 234 uintmax_t memsize; 235 char *sysenv; 236 237 /* 238 * On MacBooks, we need to disallow the legacy USB circuit to 239 * generate an SMI# because this can cause several problems, 240 * namely: incorrect CPU frequency detection and failure to 241 * start the APs. 242 * We do this by disabling a bit in the SMI_EN (SMI Control and 243 * Enable register) of the Intel ICH LPC Interface Bridge. 244 */ 245 sysenv = kern_getenv("smbios.system.product"); 246 if (sysenv != NULL) { 247 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 248 strncmp(sysenv, "MacBook3,1", 10) == 0 || 249 strncmp(sysenv, "MacBook4,1", 10) == 0 || 250 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 251 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 252 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 253 strncmp(sysenv, "MacBookPro4,1", 13) == 0 || 254 strncmp(sysenv, "Macmini1,1", 10) == 0) { 255 if (bootverbose) 256 printf("Disabling LEGACY_USB_EN bit on " 257 "Intel ICH.\n"); 258 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 259 } 260 freeenv(sysenv); 261 } 262 263 /* 264 * Good {morning,afternoon,evening,night}. 265 */ 266 startrtclock(); 267 printcpuinfo(); 268 269 /* 270 * Display physical memory if SMBIOS reports reasonable amount. 
271 */ 272 memsize = 0; 273 sysenv = kern_getenv("smbios.memory.enabled"); 274 if (sysenv != NULL) { 275 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 276 freeenv(sysenv); 277 } 278 if (memsize < ptoa((uintmax_t)vm_free_count())) 279 memsize = ptoa((uintmax_t)Maxmem); 280 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 281 realmem = atop(memsize); 282 283 /* 284 * Display any holes after the first chunk of extended memory. 285 */ 286 if (bootverbose) { 287 int indx; 288 289 printf("Physical memory chunk(s):\n"); 290 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 291 vm_paddr_t size; 292 293 size = phys_avail[indx + 1] - phys_avail[indx]; 294 printf( 295 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 296 (uintmax_t)phys_avail[indx], 297 (uintmax_t)phys_avail[indx + 1] - 1, 298 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 299 } 300 } 301 302 vm_ksubmap_init(&kmi); 303 304 printf("avail memory = %ju (%ju MB)\n", 305 ptoa((uintmax_t)vm_free_count()), 306 ptoa((uintmax_t)vm_free_count()) / 1048576); 307 #ifdef DEV_PCI 308 if (bootverbose && intel_graphics_stolen_base != 0) 309 printf("intel stolen mem: base %#jx size %ju MB\n", 310 (uintmax_t)intel_graphics_stolen_base, 311 (uintmax_t)intel_graphics_stolen_size / 1024 / 1024); 312 #endif 313 314 /* 315 * Set up buffers, so they can be used to read disk labels. 316 */ 317 bufinit(); 318 vm_pager_bufferinit(); 319 320 cpu_setregs(); 321 } 322 323 /* 324 * Send an interrupt to process. 325 * 326 * Stack is set up to allow sigcode stored 327 * at top to call routine, followed by call 328 * to sigreturn routine below. After sigreturn 329 * resets the signal mask, the stack, and the 330 * frame pointer, it returns to the user 331 * specified pc, psl. 
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	/* Scratch space for extended FPU state, only when XSAVE is in use. */
	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context into the local sigframe. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		/* Use the alternate signal stack, from its top. */
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		/* Leave 128 bytes below %rsp untouched (ABI red zone). */
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
		/* Extended FPU state area is 64-byte aligned. */
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	/* Drop locks across the (sleepable) copyout below. */
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		/* Stack is unwritable; process cannot take the signal. */
		sigexit(td, SIGILL);
	}

	/* Redirect the trapframe so the return-to-user enters the handler. */
	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
459 * 460 * MPSAFE 461 */ 462 int 463 sys_sigreturn(td, uap) 464 struct thread *td; 465 struct sigreturn_args /* { 466 const struct __ucontext *sigcntxp; 467 } */ *uap; 468 { 469 ucontext_t uc; 470 struct pcb *pcb; 471 struct proc *p; 472 struct trapframe *regs; 473 ucontext_t *ucp; 474 char *xfpustate; 475 size_t xfpustate_len; 476 long rflags; 477 int cs, error, ret; 478 ksiginfo_t ksi; 479 480 pcb = td->td_pcb; 481 p = td->td_proc; 482 483 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 484 if (error != 0) { 485 uprintf("pid %d (%s): sigreturn copyin failed\n", 486 p->p_pid, td->td_name); 487 return (error); 488 } 489 ucp = &uc; 490 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { 491 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, 492 td->td_name, ucp->uc_mcontext.mc_flags); 493 return (EINVAL); 494 } 495 regs = td->td_frame; 496 rflags = ucp->uc_mcontext.mc_rflags; 497 /* 498 * Don't allow users to change privileged or reserved flags. 499 */ 500 if (!EFL_SECURE(rflags, regs->tf_rflags)) { 501 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid, 502 td->td_name, rflags); 503 return (EINVAL); 504 } 505 506 /* 507 * Don't allow users to load a valid privileged %cs. Let the 508 * hardware check for invalid selectors, excess privilege in 509 * other selectors, invalid %eip's and invalid %esp's. 
510 */ 511 cs = ucp->uc_mcontext.mc_cs; 512 if (!CS_SECURE(cs)) { 513 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid, 514 td->td_name, cs); 515 ksiginfo_init_trap(&ksi); 516 ksi.ksi_signo = SIGBUS; 517 ksi.ksi_code = BUS_OBJERR; 518 ksi.ksi_trapno = T_PROTFLT; 519 ksi.ksi_addr = (void *)regs->tf_rip; 520 trapsignal(td, &ksi); 521 return (EINVAL); 522 } 523 524 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { 525 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; 526 if (xfpustate_len > cpu_max_ext_state_size - 527 sizeof(struct savefpu)) { 528 uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", 529 p->p_pid, td->td_name, xfpustate_len); 530 return (EINVAL); 531 } 532 xfpustate = __builtin_alloca(xfpustate_len); 533 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, 534 xfpustate, xfpustate_len); 535 if (error != 0) { 536 uprintf( 537 "pid %d (%s): sigreturn copying xfpustate failed\n", 538 p->p_pid, td->td_name); 539 return (error); 540 } 541 } else { 542 xfpustate = NULL; 543 xfpustate_len = 0; 544 } 545 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); 546 if (ret != 0) { 547 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", 548 p->p_pid, td->td_name, ret); 549 return (ret); 550 } 551 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); 552 update_pcb_bases(pcb); 553 pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; 554 pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; 555 556 #if defined(COMPAT_43) 557 if (ucp->uc_mcontext.mc_onstack & 1) 558 td->td_sigstk.ss_flags |= SS_ONSTACK; 559 else 560 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 561 #endif 562 563 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 564 return (EJUSTRETURN); 565 } 566 567 #ifdef COMPAT_FREEBSD4 568 int 569 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) 570 { 571 572 return sys_sigreturn(td, (struct sigreturn_args *)uap); 573 } 574 #endif 575 576 /* 577 * Reset registers to default values on exec. 
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	/* Discard any user LDT inherited from the previous image. */
	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	/* Only the trace flag survives into the new image. */
	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	/* Make %rsp == 8 (mod 16), as after a call instruction. */
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

/* Establish the baseline %cr0 configuration for this CPU. */
void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

/* Dedicated one-page stacks for fault handlers, 16-byte aligned. */
static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char mce0_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
static char dbg0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

/*
 * Install one IDT gate: vector 'idx' dispatches to 'func' with gate type
 * 'typ', descriptor privilege 'dpl' and interrupt-stack-table slot 'ist'.
 */
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset =
(uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}

/* Assembly entry points for exception/interrupt handlers (and PTI twins). */
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		/* Reassemble the handler address from its split fields. */
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers.
*/ 859 DB_SHOW_COMMAND(sysregs, db_show_sysregs) 860 { 861 struct { 862 uint16_t limit; 863 uint64_t base; 864 } __packed idtr, gdtr; 865 uint16_t ldt, tr; 866 867 __asm __volatile("sidt %0" : "=m" (idtr)); 868 db_printf("idtr\t0x%016lx/%04x\n", 869 (u_long)idtr.base, (u_int)idtr.limit); 870 __asm __volatile("sgdt %0" : "=m" (gdtr)); 871 db_printf("gdtr\t0x%016lx/%04x\n", 872 (u_long)gdtr.base, (u_int)gdtr.limit); 873 __asm __volatile("sldt %0" : "=r" (ldt)); 874 db_printf("ldtr\t0x%04x\n", ldt); 875 __asm __volatile("str %0" : "=r" (tr)); 876 db_printf("tr\t0x%04x\n", tr); 877 db_printf("cr0\t0x%016lx\n", rcr0()); 878 db_printf("cr2\t0x%016lx\n", rcr2()); 879 db_printf("cr3\t0x%016lx\n", rcr3()); 880 db_printf("cr4\t0x%016lx\n", rcr4()); 881 if (rcr4() & CR4_XSAVE) 882 db_printf("xcr0\t0x%016lx\n", rxcr(0)); 883 db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER)); 884 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) 885 db_printf("FEATURES_CTL\t%016lx\n", 886 rdmsr(MSR_IA32_FEATURE_CONTROL)); 887 db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR)); 888 db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT)); 889 db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE)); 890 } 891 892 DB_SHOW_COMMAND(dbregs, db_show_dbregs) 893 { 894 895 db_printf("dr0\t0x%016lx\n", rdr0()); 896 db_printf("dr1\t0x%016lx\n", rdr1()); 897 db_printf("dr2\t0x%016lx\n", rdr2()); 898 db_printf("dr3\t0x%016lx\n", rdr3()); 899 db_printf("dr6\t0x%016lx\n", rdr6()); 900 db_printf("dr7\t0x%016lx\n", rdr7()); 901 } 902 #endif 903 904 void 905 sdtossd(sd, ssd) 906 struct user_segment_descriptor *sd; 907 struct soft_segment_descriptor *ssd; 908 { 909 910 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 911 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 912 ssd->ssd_type = sd->sd_type; 913 ssd->ssd_dpl = sd->sd_dpl; 914 ssd->ssd_p = sd->sd_p; 915 ssd->ssd_long = sd->sd_long; 916 ssd->ssd_def32 = sd->sd_def32; 917 ssd->ssd_gran = sd->sd_gran; 918 } 919 920 void 921 ssdtosd(ssd, sd) 922 struct 
soft_segment_descriptor *ssd;
	struct user_segment_descriptor *sd;
{

	/* Pack the soft descriptor into hardware user-descriptor format. */
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

/* Pack a soft descriptor into a (double-size) system segment descriptor. */
void
ssdtosyssd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct system_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
 * Return a bitmap of the current interrupt requests.  This is 8259-specific
 * and is only suitable for use at probe time.
 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
 * It shouldn't be here.  There should probably be an APIC centric
 * implementation in the apic driver code, if at all.
 */
intrmask_t
isa_irq_pending(void)
{
	u_char irr1;
	u_char irr2;

	irr1 = inb(IO_ICU1);
	irr2 = inb(IO_ICU2);
	return ((irr2 << 8) | irr1);
}
#endif

u_int basemem;

/*
 * Insert [base, base + length) into the sorted physmap[] base/bound array,
 * coalescing with adjacent entries where possible.  Returns 1 on success
 * (including benign skips of empty or overlapping regions), 0 when the
 * table is full.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry.
 */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

/*
 * Walk a BIOS INT 15h/E820 system memory map and feed each RAM entry
 * into add_physmap_entry(), stopping early if the physmap fills up.
 */
void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		/* Only plain RAM entries are usable. */
		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

/*
 * Walk the UEFI memory map and feed each usable-memory descriptor into
 * add_physmap_entry(); optionally dump the whole map when booting verbose.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	/* Printable names indexed by EFI memory descriptor type. */
	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	/* Guard against a malformed header before dividing. */
	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

/* "UEFI" or "BIOS", set by native_parse_memmap() below. */
static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

/*
 * Locate the firmware-provided memory map in the loader metadata and
 * populate physmap[], preferring the EFI map over the BIOS SMAP when
 * both are present.
 */
static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	/* Fill physmap[] from the firmware memory map (EFI or SMAP). */
	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/* physmap_idx now indexes the base of the last segment. */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* Compile-time cap; MAXMEM is in KB, so /4 yields 4K pages. */
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in real mode mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress)
		init_ops.mp_bootaddress(physmap, &physmap_idx);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	/* CMAP1/CADDR1 provide a scratch mapping for the memory test. */
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/* dump_avail[] also covers pages skipped above. */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;	/* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	/* Tear down the scratch mapping used by the memory test. */
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer.
 */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

/*
 * Locate and digest the metadata the loader placed in memory: relocate
 * the preload area, pick up boot flags, the static kernel environment,
 * optional DDB symbol table bounds, and the EFI system table address.
 * Returns the "elf kernel" (or "elf64 kernel") module pointer.
 */
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;	/* physical -> kernel VA */
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

/*
 * Initialize the kernel debugger and drop into it at boot if the
 * RB_KDB boot flag was given.
 */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/*
 * Set up the fast syscall stuff: enable SYSCALL/SYSRET in EFER, point
 * LSTAR/CSTAR at the 64-bit and 32-bit entry vectors (PTI variants when
 * page-table isolation is on), load the STAR segment selectors, and set
 * the RFLAGS mask applied on syscall entry.
 */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	/* These flag bits are cleared on syscall entry. */
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

/*
 * First-stage per-CPU setup for the BSP: publish the pcpu area and
 * wire up the curthread, TSS, LDT and 32-bit segment pointers from the
 * BSP's GDT.
 */
void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
}

/*
 * Second-stage BSP pcpu setup: record the kernel stack top and the
 * (16-byte aligned) PTI trampoline stack, and set curpcb to thread0's.
 */
void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}

/*
 * Point the TSS interrupt stack table entries at the dedicated
 * double-fault, NMI, MC# and DB# stacks.  A struct nmi_pcpu holding
 * the pcpu pointer is placed at the top of each stack.
 */
void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/* doublefault stack space, runs on ist1 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}

/*
 * Machine-dependent early boot entry point, reached with the loader's
 * metadata pointer and the first free physical address.  Performs CPU
 * identification, GDT/IDT/TSS setup, pcpu and thread0 initialization,
 * memory sizing, console/debugger bring-up and FPU init, then returns
 * the kernel stack location used by locore to switch context.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kmdp = init_ops.parse_preload_data(modulep);

	/* Load microcode for the BSP; it consumes physical memory. */
	physfree += ucode_load_bsp(physfree + KERNBASE);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve thread0's kernel stack out of the free physical space. */
	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		/* System descriptors occupy two slots; skip them here. */
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	/* First fill every vector with the reserved-trap handler. */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	/* Speculative-execution mitigation tunables (old and new names). */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

#ifdef DEV_PCI
        /* This call might adjust phys_avail[]. */
        pci_early_quirks();
#endif

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Set up thread0 pcb save area after fpuinit calculated fpu save
	 * area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	__pcpu[0].pc_common_tss.tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	cpu_probe_amdc1e();

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}

/*
 * MD pcpu initialization hook; the ACPI id is filled in later by the
 * platform code (0xffffffff marks it as not yet known).
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

/*
 * sysctl handler exporting the raw BIOS SMAP (plus extended attributes
 * when present) as an array of struct bios_smap_xattr.
 */
static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader.
 */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	/* Size word immediately precedes the SMAP data. */
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr",
    "Raw BIOS SMAP data");

/*
 * sysctl handler exporting the raw EFI memory map as passed in by the
 * loader (header plus descriptors); size word precedes the data.
 */
static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header",
    "Raw EFI Memory Map");

/*
 * Enter a spinlock section: on the outermost entry, disable interrupts,
 * save the previous flags in the thread, and enter a critical section.
 * Nested entries only bump the per-thread count.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}

/*
 * Leave a spinlock section; the outermost exit leaves the critical
 * section and restores the interrupt flags saved by spinlock_enter().
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

/*
 * Set the instruction pointer in a traced thread's frame; the full
 * iret return path is forced so the change takes effect.
 */
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/*
 * Arm single-stepping for a traced thread by setting the trap flag in
 * its saved RFLAGS (and remembering that we did via TDB_STEP).
 */
int
ptrace_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
		td->td_frame->tf_rflags |= PSL_T;
		td->td_dbgflags |= TDB_STEP;
	}
	return (0);
}

/* Disarm single-stepping set up by ptrace_single_step(). */
int
ptrace_clear_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	td->td_frame->tf_rflags &= ~PSL_T;
	td->td_dbgflags &= ~TDB_STEP;
	return (0);
}

/* Copy a thread's trapframe registers into a struct reg. */
int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	return (fill_frame_regs(tp, regs));
}

int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{

	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9  = tp->tf_r9;
	regs->r_r8  = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	/* Segment registers are only valid when the frame carries them. */
	if (tp->tf_flags & TF_HASSEGS) {
		regs->r_ds = tp->tf_ds;
		regs->r_es = tp->tf_es;
		regs->r_fs = tp->tf_fs;
		regs->r_gs = tp->tf_gs;
	} else {
		regs->r_ds = 0;
		regs->r_es = 0;
		regs->r_fs = 0;
		regs->r_gs = 0;
	}
	regs->r_err = 0;
	regs->r_trapno = 0;
	return (0);
}

/*
 * Install a struct reg into a thread's trapframe.  RFLAGS and %cs are
 * vetted with EFL_SECURE()/CS_SECURE() first so userland cannot gain
 * privileged bits; EINVAL is returned on a failed check.
 */
int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9  = regs->r_r9;
	tp->tf_r8  = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	/* Segment register write-back intentionally disabled. */
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers (80-bit x87 values, 10 bytes each) */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	/* Only accept MXCSR mask bits the CPU actually supports. */
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}

/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	/* Target must not be running, or its FPU state may be stale. */
	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}

/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	critical_exit();
	return (0);
}

/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9 = tp->tf_r9;
	mcp->mc_r8 = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	/* Optionally present a cleared syscall return to the caller. */
	if (flags & GET_MC_CLEAR_RET) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len
= sizeof(*mcp); 2293 get_fpcontext(td, mcp, NULL, 0); 2294 update_pcb_bases(pcb); 2295 mcp->mc_fsbase = pcb->pcb_fsbase; 2296 mcp->mc_gsbase = pcb->pcb_gsbase; 2297 mcp->mc_xfpustate = 0; 2298 mcp->mc_xfpustate_len = 0; 2299 bzero(mcp->mc_spare, sizeof(mcp->mc_spare)); 2300 return (0); 2301 } 2302 2303 /* 2304 * Set machine context. 2305 * 2306 * However, we don't set any but the user modifiable flags, and we won't 2307 * touch the cs selector. 2308 */ 2309 int 2310 set_mcontext(struct thread *td, mcontext_t *mcp) 2311 { 2312 struct pcb *pcb; 2313 struct trapframe *tp; 2314 char *xfpustate; 2315 long rflags; 2316 int ret; 2317 2318 pcb = td->td_pcb; 2319 tp = td->td_frame; 2320 if (mcp->mc_len != sizeof(*mcp) || 2321 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) 2322 return (EINVAL); 2323 rflags = (mcp->mc_rflags & PSL_USERCHANGE) | 2324 (tp->tf_rflags & ~PSL_USERCHANGE); 2325 if (mcp->mc_flags & _MC_HASFPXSTATE) { 2326 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - 2327 sizeof(struct savefpu)) 2328 return (EINVAL); 2329 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); 2330 ret = copyin((void *)mcp->mc_xfpustate, xfpustate, 2331 mcp->mc_xfpustate_len); 2332 if (ret != 0) 2333 return (ret); 2334 } else 2335 xfpustate = NULL; 2336 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); 2337 if (ret != 0) 2338 return (ret); 2339 tp->tf_r15 = mcp->mc_r15; 2340 tp->tf_r14 = mcp->mc_r14; 2341 tp->tf_r13 = mcp->mc_r13; 2342 tp->tf_r12 = mcp->mc_r12; 2343 tp->tf_r11 = mcp->mc_r11; 2344 tp->tf_r10 = mcp->mc_r10; 2345 tp->tf_r9 = mcp->mc_r9; 2346 tp->tf_r8 = mcp->mc_r8; 2347 tp->tf_rdi = mcp->mc_rdi; 2348 tp->tf_rsi = mcp->mc_rsi; 2349 tp->tf_rbp = mcp->mc_rbp; 2350 tp->tf_rbx = mcp->mc_rbx; 2351 tp->tf_rdx = mcp->mc_rdx; 2352 tp->tf_rcx = mcp->mc_rcx; 2353 tp->tf_rax = mcp->mc_rax; 2354 tp->tf_rip = mcp->mc_rip; 2355 tp->tf_rflags = rflags; 2356 tp->tf_rsp = mcp->mc_rsp; 2357 tp->tf_ss = mcp->mc_ss; 2358 tp->tf_flags = mcp->mc_flags; 2359 if (tp->tf_flags & 
TF_HASSEGS) { 2360 tp->tf_ds = mcp->mc_ds; 2361 tp->tf_es = mcp->mc_es; 2362 tp->tf_fs = mcp->mc_fs; 2363 tp->tf_gs = mcp->mc_gs; 2364 } 2365 set_pcb_flags(pcb, PCB_FULL_IRET); 2366 if (mcp->mc_flags & _MC_HASBASES) { 2367 pcb->pcb_fsbase = mcp->mc_fsbase; 2368 pcb->pcb_gsbase = mcp->mc_gsbase; 2369 } 2370 return (0); 2371 } 2372 2373 static void 2374 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, 2375 size_t xfpusave_len) 2376 { 2377 size_t max_len, len; 2378 2379 mcp->mc_ownedfp = fpugetregs(td); 2380 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], 2381 sizeof(mcp->mc_fpstate)); 2382 mcp->mc_fpformat = fpuformat(); 2383 if (!use_xsave || xfpusave_len == 0) 2384 return; 2385 max_len = cpu_max_ext_state_size - sizeof(struct savefpu); 2386 len = xfpusave_len; 2387 if (len > max_len) { 2388 len = max_len; 2389 bzero(xfpusave + max_len, len - max_len); 2390 } 2391 mcp->mc_flags |= _MC_HASFPXSTATE; 2392 mcp->mc_xfpustate_len = len; 2393 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); 2394 } 2395 2396 static int 2397 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, 2398 size_t xfpustate_len) 2399 { 2400 int error; 2401 2402 if (mcp->mc_fpformat == _MC_FPFMT_NODEV) 2403 return (0); 2404 else if (mcp->mc_fpformat != _MC_FPFMT_XMM) 2405 return (EINVAL); 2406 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { 2407 /* We don't care what state is left in the FPU or PCB. */ 2408 fpstate_drop(td); 2409 error = 0; 2410 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || 2411 mcp->mc_ownedfp == _MC_FPOWNED_PCB) { 2412 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate, 2413 xfpustate, xfpustate_len); 2414 } else 2415 return (EINVAL); 2416 return (error); 2417 } 2418 2419 void 2420 fpstate_drop(struct thread *td) 2421 { 2422 2423 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); 2424 critical_enter(); 2425 if (PCPU_GET(fpcurthread) == td) 2426 fpudrop(); 2427 /* 2428 * XXX force a full drop of the fpu. 
The above only drops it if we 2429 * owned it. 2430 * 2431 * XXX I don't much like fpugetuserregs()'s semantics of doing a full 2432 * drop. Dropping only to the pcb matches fnsave's behaviour. 2433 * We only need to drop to !PCB_INITDONE in sendsig(). But 2434 * sendsig() is the only caller of fpugetuserregs()... perhaps we just 2435 * have too many layers. 2436 */ 2437 clear_pcb_flags(curthread->td_pcb, 2438 PCB_FPUINITDONE | PCB_USERFPUINITDONE); 2439 critical_exit(); 2440 } 2441 2442 int 2443 fill_dbregs(struct thread *td, struct dbreg *dbregs) 2444 { 2445 struct pcb *pcb; 2446 2447 if (td == NULL) { 2448 dbregs->dr[0] = rdr0(); 2449 dbregs->dr[1] = rdr1(); 2450 dbregs->dr[2] = rdr2(); 2451 dbregs->dr[3] = rdr3(); 2452 dbregs->dr[6] = rdr6(); 2453 dbregs->dr[7] = rdr7(); 2454 } else { 2455 pcb = td->td_pcb; 2456 dbregs->dr[0] = pcb->pcb_dr0; 2457 dbregs->dr[1] = pcb->pcb_dr1; 2458 dbregs->dr[2] = pcb->pcb_dr2; 2459 dbregs->dr[3] = pcb->pcb_dr3; 2460 dbregs->dr[6] = pcb->pcb_dr6; 2461 dbregs->dr[7] = pcb->pcb_dr7; 2462 } 2463 dbregs->dr[4] = 0; 2464 dbregs->dr[5] = 0; 2465 dbregs->dr[8] = 0; 2466 dbregs->dr[9] = 0; 2467 dbregs->dr[10] = 0; 2468 dbregs->dr[11] = 0; 2469 dbregs->dr[12] = 0; 2470 dbregs->dr[13] = 0; 2471 dbregs->dr[14] = 0; 2472 dbregs->dr[15] = 0; 2473 return (0); 2474 } 2475 2476 int 2477 set_dbregs(struct thread *td, struct dbreg *dbregs) 2478 { 2479 struct pcb *pcb; 2480 int i; 2481 2482 if (td == NULL) { 2483 load_dr0(dbregs->dr[0]); 2484 load_dr1(dbregs->dr[1]); 2485 load_dr2(dbregs->dr[2]); 2486 load_dr3(dbregs->dr[3]); 2487 load_dr6(dbregs->dr[6]); 2488 load_dr7(dbregs->dr[7]); 2489 } else { 2490 /* 2491 * Don't let an illegal value for dr7 get set. Specifically, 2492 * check for undefined settings. Setting these bit patterns 2493 * result in undefined behaviour and can lead to an unexpected 2494 * TRCTRAP or a general protection fault right here. 
2495 * Upper bits of dr6 and dr7 must not be set 2496 */ 2497 for (i = 0; i < 4; i++) { 2498 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) 2499 return (EINVAL); 2500 if (td->td_frame->tf_cs == _ucode32sel && 2501 DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8) 2502 return (EINVAL); 2503 } 2504 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 || 2505 (dbregs->dr[7] & 0xffffffff00000000ul) != 0) 2506 return (EINVAL); 2507 2508 pcb = td->td_pcb; 2509 2510 /* 2511 * Don't let a process set a breakpoint that is not within the 2512 * process's address space. If a process could do this, it 2513 * could halt the system by setting a breakpoint in the kernel 2514 * (if ddb was enabled). Thus, we need to check to make sure 2515 * that no breakpoints are being enabled for addresses outside 2516 * process's address space. 2517 * 2518 * XXX - what about when the watched area of the user's 2519 * address space is written into from within the kernel 2520 * ... wouldn't that still cause a breakpoint to be generated 2521 * from within kernel mode? 
2522 */ 2523 2524 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { 2525 /* dr0 is enabled */ 2526 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) 2527 return (EINVAL); 2528 } 2529 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { 2530 /* dr1 is enabled */ 2531 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) 2532 return (EINVAL); 2533 } 2534 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { 2535 /* dr2 is enabled */ 2536 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) 2537 return (EINVAL); 2538 } 2539 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { 2540 /* dr3 is enabled */ 2541 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) 2542 return (EINVAL); 2543 } 2544 2545 pcb->pcb_dr0 = dbregs->dr[0]; 2546 pcb->pcb_dr1 = dbregs->dr[1]; 2547 pcb->pcb_dr2 = dbregs->dr[2]; 2548 pcb->pcb_dr3 = dbregs->dr[3]; 2549 pcb->pcb_dr6 = dbregs->dr[6]; 2550 pcb->pcb_dr7 = dbregs->dr[7]; 2551 2552 set_pcb_flags(pcb, PCB_DBREGS); 2553 } 2554 2555 return (0); 2556 } 2557 2558 void 2559 reset_dbregs(void) 2560 { 2561 2562 load_dr7(0); /* Turn off the control bits first */ 2563 load_dr0(0); 2564 load_dr1(0); 2565 load_dr2(0); 2566 load_dr3(0); 2567 load_dr6(0); 2568 } 2569 2570 /* 2571 * Return > 0 if a hardware breakpoint has been hit, and the 2572 * breakpoint was in user space. Return 0, otherwise. 
2573 */ 2574 int 2575 user_dbreg_trap(register_t dr6) 2576 { 2577 u_int64_t dr7; 2578 u_int64_t bp; /* breakpoint bits extracted from dr6 */ 2579 int nbp; /* number of breakpoints that triggered */ 2580 caddr_t addr[4]; /* breakpoint addresses */ 2581 int i; 2582 2583 bp = dr6 & DBREG_DR6_BMASK; 2584 if (bp == 0) { 2585 /* 2586 * None of the breakpoint bits are set meaning this 2587 * trap was not caused by any of the debug registers 2588 */ 2589 return 0; 2590 } 2591 2592 dr7 = rdr7(); 2593 if ((dr7 & 0x000000ff) == 0) { 2594 /* 2595 * all GE and LE bits in the dr7 register are zero, 2596 * thus the trap couldn't have been caused by the 2597 * hardware debug registers 2598 */ 2599 return 0; 2600 } 2601 2602 nbp = 0; 2603 2604 /* 2605 * at least one of the breakpoints were hit, check to see 2606 * which ones and if any of them are user space addresses 2607 */ 2608 2609 if (bp & 0x01) { 2610 addr[nbp++] = (caddr_t)rdr0(); 2611 } 2612 if (bp & 0x02) { 2613 addr[nbp++] = (caddr_t)rdr1(); 2614 } 2615 if (bp & 0x04) { 2616 addr[nbp++] = (caddr_t)rdr2(); 2617 } 2618 if (bp & 0x08) { 2619 addr[nbp++] = (caddr_t)rdr3(); 2620 } 2621 2622 for (i = 0; i < nbp; i++) { 2623 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 2624 /* 2625 * addr[i] is in user space 2626 */ 2627 return nbp; 2628 } 2629 } 2630 2631 /* 2632 * None of the breakpoints are in user space. 2633 */ 2634 return 0; 2635 } 2636 2637 /* 2638 * The pcb_flags is only modified by current thread, or by other threads 2639 * when current thread is stopped. However, current thread may change it 2640 * from the interrupt context in cpu_switch(), or in the trap handler. 2641 * When we read-modify-write pcb_flags from C sources, compiler may generate 2642 * code that is not atomic regarding the interrupt handler. If a trap or 2643 * interrupt happens and any flag is modified from the handler, it can be 2644 * clobbered with the cached value later. 
 * Therefore, we implement setting
 * and clearing flags with single-instruction functions, which do not race
 * with possible modification of the flags from the trap or interrupt context,
 * because traps and interrupts are executed only on instruction boundary.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	/* Single-instruction read-modify-write: safe against interrupts. */
	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");

}

/*
 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
 * pcb if user space modified the bases.  We must save on the context
 * switch or if the return to usermode happens through the doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which have a consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		r = intr_disable();
		/* Re-check under disabled interrupts before saving bases. */
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

/*
 * Resolve set_pcb_flags at boot: use the FSGSBASE-aware variant only
 * when the CPU supports the RDFSBASE/WRFSBASE instruction family.
 */
DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}

void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	/* Single-instruction read-modify-write: safe against interrupts. */
	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return inb(port);
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */

/* Undo any macro definitions so the ifunc symbols below are real. */
#undef memset
#undef memmove
#undef memcpy

/* Plain and ERMS (rep movsb/stosb) implementations, selected at boot. */
void *memset_std(void *buf, int c, size_t len);
void *memset_erms(void *buf, int c, size_t len);
void *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);

#ifdef KCSAN
/*
 * These fail to build as ifuncs when used with KCSAN.
 */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
#else
/* Select ERMS variants when the CPU advertises fast rep-string ops. */
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
#endif

/* pagezero: zero one page; ERMS variant chosen when available. */
void pagezero_std(void *addr);
void pagezero_erms(void *addr);
DEFINE_IFUNC(, void , pagezero, (void *))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}