1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 2003 Peter Wemm. 5 * Copyright (c) 1992 Terrence R. Lambert. 6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * William Jolitz. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. All advertising materials mentioning features or use of this software 21 * must display the following acknowledgement: 22 * This product includes software developed by the University of 23 * California, Berkeley and its contributors. 24 * 4. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_atpic.h" 47 #include "opt_cpu.h" 48 #include "opt_ddb.h" 49 #include "opt_inet.h" 50 #include "opt_isa.h" 51 #include "opt_kstack_pages.h" 52 #include "opt_maxmem.h" 53 #include "opt_mp_watchdog.h" 54 #include "opt_pci.h" 55 #include "opt_platform.h" 56 #include "opt_sched.h" 57 58 #include <sys/param.h> 59 #include <sys/proc.h> 60 #include <sys/systm.h> 61 #include <sys/bio.h> 62 #include <sys/buf.h> 63 #include <sys/bus.h> 64 #include <sys/callout.h> 65 #include <sys/cons.h> 66 #include <sys/cpu.h> 67 #include <sys/csan.h> 68 #include <sys/efi.h> 69 #include <sys/eventhandler.h> 70 #include <sys/exec.h> 71 #include <sys/imgact.h> 72 #include <sys/kdb.h> 73 #include <sys/kernel.h> 74 #include <sys/ktr.h> 75 #include <sys/linker.h> 76 #include <sys/lock.h> 77 #include <sys/malloc.h> 78 #include <sys/memrange.h> 79 #include <sys/msgbuf.h> 80 #include <sys/mutex.h> 81 #include <sys/pcpu.h> 82 #include <sys/ptrace.h> 83 #include <sys/reboot.h> 84 #include <sys/rwlock.h> 85 #include <sys/sched.h> 86 #include <sys/signalvar.h> 87 #ifdef SMP 88 #include <sys/smp.h> 89 #endif 90 #include <sys/syscallsubr.h> 91 #include <sys/sysctl.h> 92 #include <sys/sysent.h> 93 #include <sys/sysproto.h> 94 #include <sys/ucontext.h> 95 #include <sys/vmmeter.h> 96 97 #include <vm/vm.h> 98 #include <vm/vm_extern.h> 99 #include 
<vm/vm_kern.h> 100 #include <vm/vm_page.h> 101 #include <vm/vm_map.h> 102 #include <vm/vm_object.h> 103 #include <vm/vm_pager.h> 104 #include <vm/vm_param.h> 105 #include <vm/vm_phys.h> 106 107 #ifdef DDB 108 #ifndef KDB 109 #error KDB must be enabled in order for DDB to work! 110 #endif 111 #include <ddb/ddb.h> 112 #include <ddb/db_sym.h> 113 #endif 114 115 #include <net/netisr.h> 116 117 #include <machine/clock.h> 118 #include <machine/cpu.h> 119 #include <machine/cputypes.h> 120 #include <machine/frame.h> 121 #include <machine/intr_machdep.h> 122 #include <x86/mca.h> 123 #include <machine/md_var.h> 124 #include <machine/metadata.h> 125 #include <machine/mp_watchdog.h> 126 #include <machine/pc/bios.h> 127 #include <machine/pcb.h> 128 #include <machine/proc.h> 129 #include <machine/reg.h> 130 #include <machine/sigframe.h> 131 #include <machine/specialreg.h> 132 #include <machine/trap.h> 133 #include <machine/tss.h> 134 #include <x86/ucode.h> 135 #include <x86/ifunc.h> 136 #ifdef SMP 137 #include <machine/smp.h> 138 #endif 139 #ifdef FDT 140 #include <x86/fdt.h> 141 #endif 142 143 #ifdef DEV_ATPIC 144 #include <x86/isa/icu.h> 145 #else 146 #include <x86/apicvar.h> 147 #endif 148 149 #include <isa/isareg.h> 150 #include <isa/rtc.h> 151 #include <x86/init.h> 152 153 /* Sanity check for __curthread() */ 154 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 155 156 /* 157 * The PTI trampoline stack needs enough space for a hardware trapframe and a 158 * couple of scratch registers, as well as the trapframe left behind after an 159 * iret fault. 
160 */ 161 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) - 162 offsetof(struct pti_frame, pti_rip)); 163 164 extern u_int64_t hammer_time(u_int64_t, u_int64_t); 165 166 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 167 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 168 169 static void cpu_startup(void *); 170 static void get_fpcontext(struct thread *td, mcontext_t *mcp, 171 char *xfpusave, size_t xfpusave_len); 172 static int set_fpcontext(struct thread *td, mcontext_t *mcp, 173 char *xfpustate, size_t xfpustate_len); 174 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 175 176 /* Preload data parse function */ 177 static caddr_t native_parse_preload_data(u_int64_t); 178 179 /* Native function to fetch and parse the e820 map */ 180 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *); 181 182 /* Default init_ops implementation. */ 183 struct init_ops init_ops = { 184 .parse_preload_data = native_parse_preload_data, 185 .early_clock_source_init = i8254_init, 186 .early_delay = i8254_delay, 187 .parse_memmap = native_parse_memmap, 188 #ifdef SMP 189 .mp_bootaddress = mp_bootaddress, 190 .start_all_aps = native_start_all_aps, 191 #endif 192 #ifdef DEV_PCI 193 .msi_init = msi_init, 194 #endif 195 }; 196 197 /* 198 * Physical address of the EFI System Table. Stashed from the metadata hints 199 * passed into the kernel and used by the EFI code to call runtime services. 
200 */ 201 vm_paddr_t efi_systbl_phys; 202 203 /* Intel ICH registers */ 204 #define ICH_PMBASE 0x400 205 #define ICH_SMI_EN ICH_PMBASE + 0x30 206 207 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; 208 209 int cold = 1; 210 211 long Maxmem = 0; 212 long realmem = 0; 213 214 struct kva_md_info kmi; 215 216 static struct trapframe proc0_tf; 217 struct region_descriptor r_idt; 218 219 struct pcpu *__pcpu; 220 struct pcpu temp_bsp_pcpu; 221 222 struct mtx icu_lock; 223 224 struct mem_range_softc mem_range_softc; 225 226 struct mtx dt_lock; /* lock for GDT and LDT */ 227 228 void (*vmm_resume_p)(void); 229 230 static void 231 cpu_startup(dummy) 232 void *dummy; 233 { 234 uintmax_t memsize; 235 char *sysenv; 236 237 /* 238 * On MacBooks, we need to disallow the legacy USB circuit to 239 * generate an SMI# because this can cause several problems, 240 * namely: incorrect CPU frequency detection and failure to 241 * start the APs. 242 * We do this by disabling a bit in the SMI_EN (SMI Control and 243 * Enable register) of the Intel ICH LPC Interface Bridge. 244 */ 245 sysenv = kern_getenv("smbios.system.product"); 246 if (sysenv != NULL) { 247 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 248 strncmp(sysenv, "MacBook3,1", 10) == 0 || 249 strncmp(sysenv, "MacBook4,1", 10) == 0 || 250 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 251 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 252 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 253 strncmp(sysenv, "MacBookPro4,1", 13) == 0 || 254 strncmp(sysenv, "Macmini1,1", 10) == 0) { 255 if (bootverbose) 256 printf("Disabling LEGACY_USB_EN bit on " 257 "Intel ICH.\n"); 258 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 259 } 260 freeenv(sysenv); 261 } 262 263 /* 264 * Good {morning,afternoon,evening,night}. 265 */ 266 startrtclock(); 267 printcpuinfo(); 268 269 /* 270 * Display physical memory if SMBIOS reports reasonable amount. 
271 */ 272 memsize = 0; 273 sysenv = kern_getenv("smbios.memory.enabled"); 274 if (sysenv != NULL) { 275 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 276 freeenv(sysenv); 277 } 278 if (memsize < ptoa((uintmax_t)vm_free_count())) 279 memsize = ptoa((uintmax_t)Maxmem); 280 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 281 realmem = atop(memsize); 282 283 /* 284 * Display any holes after the first chunk of extended memory. 285 */ 286 if (bootverbose) { 287 int indx; 288 289 printf("Physical memory chunk(s):\n"); 290 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 291 vm_paddr_t size; 292 293 size = phys_avail[indx + 1] - phys_avail[indx]; 294 printf( 295 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 296 (uintmax_t)phys_avail[indx], 297 (uintmax_t)phys_avail[indx + 1] - 1, 298 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 299 } 300 } 301 302 vm_ksubmap_init(&kmi); 303 304 printf("avail memory = %ju (%ju MB)\n", 305 ptoa((uintmax_t)vm_free_count()), 306 ptoa((uintmax_t)vm_free_count()) / 1048576); 307 #ifdef DEV_PCI 308 if (bootverbose && intel_graphics_stolen_base != 0) 309 printf("intel stolen mem: base %#jx size %ju MB\n", 310 (uintmax_t)intel_graphics_stolen_base, 311 (uintmax_t)intel_graphics_stolen_size / 1024 / 1024); 312 #endif 313 314 /* 315 * Set up buffers, so they can be used to read disk labels. 316 */ 317 bufinit(); 318 vm_pager_bufferinit(); 319 320 cpu_setregs(); 321 } 322 323 static void 324 late_ifunc_resolve(void *dummy __unused) 325 { 326 link_elf_late_ireloc(); 327 } 328 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL); 329 330 /* 331 * Send an interrupt to process. 332 * 333 * Stack is set up to allow sigcode stored 334 * at top to call routine, followed by call 335 * to sigreturn routine below. After sigreturn 336 * resets the signal mask, the stack, and the 337 * frame pointer, it returns to the user 338 * specified pc, psl. 
/*
 * Deliver signal 'ksi' to the current thread by building a struct sigframe
 * (saved ucontext + siginfo) on the user stack and redirecting the trapframe
 * so that userland resumes in the signal trampoline, which invokes 'catcher'.
 * Called with the proc lock and psp->ps_mtx held; both are dropped around the
 * copyout and re-acquired before return.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	/*
	 * If XSAVE is in use and the extended state area is larger than the
	 * legacy savefpu, stage the extra state in a temporary stack buffer
	 * so it can be copied out after the sigframe.
	 */
	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	/* The trapframe layout matches mcontext starting at mc_rdi. */
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		/* Use the alternate signal stack, from its top. */
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		/* Step below the 128-byte amd64 ABI red zone. */
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		/* Extended FPU state goes below the frame, 64-byte aligned. */
		sp -= xfpusave_len;
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	/* Drop the locks across the (sleepable) copyout to user memory. */
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		/* Unwritable stack: the process cannot take the signal. */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/* Resume userland at the signal trampoline with user selectors. */
	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
466 * 467 * MPSAFE 468 */ 469 int 470 sys_sigreturn(td, uap) 471 struct thread *td; 472 struct sigreturn_args /* { 473 const struct __ucontext *sigcntxp; 474 } */ *uap; 475 { 476 ucontext_t uc; 477 struct pcb *pcb; 478 struct proc *p; 479 struct trapframe *regs; 480 ucontext_t *ucp; 481 char *xfpustate; 482 size_t xfpustate_len; 483 long rflags; 484 int cs, error, ret; 485 ksiginfo_t ksi; 486 487 pcb = td->td_pcb; 488 p = td->td_proc; 489 490 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 491 if (error != 0) { 492 uprintf("pid %d (%s): sigreturn copyin failed\n", 493 p->p_pid, td->td_name); 494 return (error); 495 } 496 ucp = &uc; 497 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { 498 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, 499 td->td_name, ucp->uc_mcontext.mc_flags); 500 return (EINVAL); 501 } 502 regs = td->td_frame; 503 rflags = ucp->uc_mcontext.mc_rflags; 504 /* 505 * Don't allow users to change privileged or reserved flags. 506 */ 507 if (!EFL_SECURE(rflags, regs->tf_rflags)) { 508 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid, 509 td->td_name, rflags); 510 return (EINVAL); 511 } 512 513 /* 514 * Don't allow users to load a valid privileged %cs. Let the 515 * hardware check for invalid selectors, excess privilege in 516 * other selectors, invalid %eip's and invalid %esp's. 
517 */ 518 cs = ucp->uc_mcontext.mc_cs; 519 if (!CS_SECURE(cs)) { 520 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid, 521 td->td_name, cs); 522 ksiginfo_init_trap(&ksi); 523 ksi.ksi_signo = SIGBUS; 524 ksi.ksi_code = BUS_OBJERR; 525 ksi.ksi_trapno = T_PROTFLT; 526 ksi.ksi_addr = (void *)regs->tf_rip; 527 trapsignal(td, &ksi); 528 return (EINVAL); 529 } 530 531 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { 532 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; 533 if (xfpustate_len > cpu_max_ext_state_size - 534 sizeof(struct savefpu)) { 535 uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", 536 p->p_pid, td->td_name, xfpustate_len); 537 return (EINVAL); 538 } 539 xfpustate = __builtin_alloca(xfpustate_len); 540 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, 541 xfpustate, xfpustate_len); 542 if (error != 0) { 543 uprintf( 544 "pid %d (%s): sigreturn copying xfpustate failed\n", 545 p->p_pid, td->td_name); 546 return (error); 547 } 548 } else { 549 xfpustate = NULL; 550 xfpustate_len = 0; 551 } 552 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); 553 if (ret != 0) { 554 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", 555 p->p_pid, td->td_name, ret); 556 return (ret); 557 } 558 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); 559 update_pcb_bases(pcb); 560 pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; 561 pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; 562 563 #if defined(COMPAT_43) 564 if (ucp->uc_mcontext.mc_onstack & 1) 565 td->td_sigstk.ss_flags |= SS_ONSTACK; 566 else 567 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 568 #endif 569 570 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 571 return (EJUSTRETURN); 572 } 573 574 #ifdef COMPAT_FREEBSD4 575 int 576 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) 577 { 578 579 return sys_sigreturn(td, (struct sigreturn_args *)uap); 580 } 581 #endif 582 583 /* 584 * Reset registers to default values on exec. 
/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	/* Any per-process LDT from the old image is discarded. */
	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	/* Preserve only the trace flag across the frame reset. */
	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	/* 16-byte align the stack, then re-bias by 8 (return-address slot). */
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

/* Set the mandatory CR0 control bits on the current CPU. */
void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

/*
 * Dedicated stacks for the fatal-exception handlers (presumably wired into
 * the IST slots of the corresponding IDT gates — set up elsewhere).
 */
static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char mce0_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
static char dbg0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

/*
 * Install an interrupt gate: store handler 'func' at IDT slot 'idx' with
 * gate type 'typ', required privilege 'dpl', and interrupt-stack-table
 * index 'ist'.  The handler offset is split across the lo/hi fields of
 * the gate descriptor.
 */
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}

/* Assembly entry points for exceptions/interrupts (and PTI variants). */
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		/* Reassemble the handler address from the split fields. */
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
*/ 866 DB_SHOW_COMMAND(sysregs, db_show_sysregs) 867 { 868 struct { 869 uint16_t limit; 870 uint64_t base; 871 } __packed idtr, gdtr; 872 uint16_t ldt, tr; 873 874 __asm __volatile("sidt %0" : "=m" (idtr)); 875 db_printf("idtr\t0x%016lx/%04x\n", 876 (u_long)idtr.base, (u_int)idtr.limit); 877 __asm __volatile("sgdt %0" : "=m" (gdtr)); 878 db_printf("gdtr\t0x%016lx/%04x\n", 879 (u_long)gdtr.base, (u_int)gdtr.limit); 880 __asm __volatile("sldt %0" : "=r" (ldt)); 881 db_printf("ldtr\t0x%04x\n", ldt); 882 __asm __volatile("str %0" : "=r" (tr)); 883 db_printf("tr\t0x%04x\n", tr); 884 db_printf("cr0\t0x%016lx\n", rcr0()); 885 db_printf("cr2\t0x%016lx\n", rcr2()); 886 db_printf("cr3\t0x%016lx\n", rcr3()); 887 db_printf("cr4\t0x%016lx\n", rcr4()); 888 if (rcr4() & CR4_XSAVE) 889 db_printf("xcr0\t0x%016lx\n", rxcr(0)); 890 db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER)); 891 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) 892 db_printf("FEATURES_CTL\t%016lx\n", 893 rdmsr(MSR_IA32_FEATURE_CONTROL)); 894 db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR)); 895 db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT)); 896 db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE)); 897 } 898 899 DB_SHOW_COMMAND(dbregs, db_show_dbregs) 900 { 901 902 db_printf("dr0\t0x%016lx\n", rdr0()); 903 db_printf("dr1\t0x%016lx\n", rdr1()); 904 db_printf("dr2\t0x%016lx\n", rdr2()); 905 db_printf("dr3\t0x%016lx\n", rdr3()); 906 db_printf("dr6\t0x%016lx\n", rdr6()); 907 db_printf("dr7\t0x%016lx\n", rdr7()); 908 } 909 #endif 910 911 void 912 sdtossd(sd, ssd) 913 struct user_segment_descriptor *sd; 914 struct soft_segment_descriptor *ssd; 915 { 916 917 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 918 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 919 ssd->ssd_type = sd->sd_type; 920 ssd->ssd_dpl = sd->sd_dpl; 921 ssd->ssd_p = sd->sd_p; 922 ssd->ssd_long = sd->sd_long; 923 ssd->ssd_def32 = sd->sd_def32; 924 ssd->ssd_gran = sd->sd_gran; 925 } 926 927 void 928 ssdtosd(ssd, sd) 929 struct 
soft_segment_descriptor *ssd; 930 struct user_segment_descriptor *sd; 931 { 932 933 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 934 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; 935 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 936 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 937 sd->sd_type = ssd->ssd_type; 938 sd->sd_dpl = ssd->ssd_dpl; 939 sd->sd_p = ssd->ssd_p; 940 sd->sd_long = ssd->ssd_long; 941 sd->sd_def32 = ssd->ssd_def32; 942 sd->sd_gran = ssd->ssd_gran; 943 } 944 945 void 946 ssdtosyssd(ssd, sd) 947 struct soft_segment_descriptor *ssd; 948 struct system_segment_descriptor *sd; 949 { 950 951 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 952 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; 953 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 954 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 955 sd->sd_type = ssd->ssd_type; 956 sd->sd_dpl = ssd->ssd_dpl; 957 sd->sd_p = ssd->ssd_p; 958 sd->sd_gran = ssd->ssd_gran; 959 } 960 961 #if !defined(DEV_ATPIC) && defined(DEV_ISA) 962 #include <isa/isavar.h> 963 #include <isa/isareg.h> 964 /* 965 * Return a bitmap of the current interrupt requests. This is 8259-specific 966 * and is only suitable for use at probe time. 967 * This is only here to pacify sio. It is NOT FATAL if this doesn't work. 968 * It shouldn't be here. There should probably be an APIC centric 969 * implementation in the apic driver code, if at all. 970 */ 971 intrmask_t 972 isa_irq_pending(void) 973 { 974 u_char irr1; 975 u_char irr2; 976 977 irr1 = inb(IO_ICU1); 978 irr2 = inb(IO_ICU2); 979 return ((irr2 << 8) | irr1); 980 } 981 #endif 982 983 u_int basemem; 984 985 static int 986 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, 987 int *physmap_idxp) 988 { 989 int i, insert_idx, physmap_idx; 990 991 physmap_idx = *physmap_idxp; 992 993 if (length == 0) 994 return (1); 995 996 /* 997 * Find insertion point while checking for overlap. Start off by 998 * assuming the new entry will be added to the end. 
999 * 1000 * NB: physmap_idx points to the next free slot. 1001 */ 1002 insert_idx = physmap_idx; 1003 for (i = 0; i <= physmap_idx; i += 2) { 1004 if (base < physmap[i + 1]) { 1005 if (base + length <= physmap[i]) { 1006 insert_idx = i; 1007 break; 1008 } 1009 if (boothowto & RB_VERBOSE) 1010 printf( 1011 "Overlapping memory regions, ignoring second region\n"); 1012 return (1); 1013 } 1014 } 1015 1016 /* See if we can prepend to the next entry. */ 1017 if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { 1018 physmap[insert_idx] = base; 1019 return (1); 1020 } 1021 1022 /* See if we can append to the previous entry. */ 1023 if (insert_idx > 0 && base == physmap[insert_idx - 1]) { 1024 physmap[insert_idx - 1] += length; 1025 return (1); 1026 } 1027 1028 physmap_idx += 2; 1029 *physmap_idxp = physmap_idx; 1030 if (physmap_idx == PHYS_AVAIL_ENTRIES) { 1031 printf( 1032 "Too many segments in the physical address map, giving up\n"); 1033 return (0); 1034 } 1035 1036 /* 1037 * Move the last 'N' entries down to make room for the new 1038 * entry if needed. 1039 */ 1040 for (i = (physmap_idx - 2); i > insert_idx; i -= 2) { 1041 physmap[i] = physmap[i - 2]; 1042 physmap[i + 1] = physmap[i - 1]; 1043 } 1044 1045 /* Insert the new entry. 
 */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

/*
 * Walk the BIOS SMAP (INT 15h AX=E820h) table handed to us by the loader
 * and feed every plain-memory segment into the physmap[] base/bound array.
 * smapsize is the table size in bytes; the table is a packed array of
 * struct bios_smap entries.  Stops early if physmap[] runs out of slots
 * (add_physmap_entry() returns 0).
 */
void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		/* Only plain usable RAM goes into physmap[]. */
		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

/*
 * Walk the UEFI memory map handed to us by the loader and feed every
 * usable descriptor (loader/boot-services code+data and conventional
 * memory) into the physmap[] base/bound array.  Optionally dumps the
 * whole map, with decoded type names and attribute flags, when booted
 * verbose.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	/* Printable names indexed by EFI memory descriptor type. */
	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.  The descriptor array follows the header,
	 * aligned up to a 16-byte boundary.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	/* Guard the division below against a corrupt header. */
	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	/*
	 * descriptor_size may be larger than sizeof(struct efi_md), so
	 * step with efi_next_descriptor() rather than plain p++.
	 */
	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

/* "UEFI" or "BIOS", set below; exported read-only via machdep.bootmethod. */
static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

/*
 * Build physmap[] from whatever firmware memory map the loader supplied:
 * prefer the UEFI map, fall back to the BIOS SMAP, panic if neither is
 * present.  Also records which firmware path was used in bootmethod[].
 */
static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		/* Size word stored by the loader just before the table. */
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	/* physmap[] holds (start, end) byte-address pairs at even/odd indices. */
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/* Step back so physmap_idx names the last valid (start,end) pair. */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;	/* in KB */
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* MAXMEM is specified in KB; convert to 4K pages. */
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in real mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress)
		init_ops.mp_bootaddress(physmap, &physmap_idx);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	/* Scratch PTE used to map each page under test at CADDR1. */
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			/* Preserve the page's original contents. */
			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/* dump_avail[] also covers pages excluded above. */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;	/* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

/*
 * Locate the loader-supplied preload metadata, hook up the static kernel
 * environment and (with DDB) the kernel symbol table, and record the EFI
 * system table address.  Returns the "elf kernel" module pointer used by
 * the rest of early boot.
 */
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;	/* physical -> KVA */
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

/*
 * Initialize the kernel debugger framework and, if -d was given at boot,
 * drop into it immediately.
 */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	/* Enable the SYSCALL/SYSRET instructions. */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ?
	    (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	/* RFLAGS bits cleared on syscall entry. */
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

/*
 * First-stage per-CPU data setup for the BSP: point the pcpu pointers
 * (curthread, TSS, LDT, 32-bit fs/gs descriptors, etc.) into the freshly
 * built GDT and pcpu area.
 */
void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}

/*
 * Second-stage BSP pcpu setup, run once thread0's kernel stack is known:
 * record the ring-0 stack pointers (normal and PTI trampoline, the latter
 * 16-byte aligned) and the current PCB.
 */
void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}

/*
 * Wire up the per-CPU interrupt stack table (IST) entries in the TSS.
 * Each dedicated stack stores the pcpu pointer in its topmost slot
 * (struct nmi_pcpu) so the corresponding handlers can find per-CPU data.
 */
void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/* doublefault stack space, runs on ist1 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}

/*
 * Machine-dependent kernel initialization, called from locore on the BSP
 * before mi_startup().  Parses loader metadata, identifies the CPU, builds
 * the GDT/IDT/TSS, bootstraps pmap and the physical memory map, configures
 * fast syscalls and speculative-execution mitigations, and initializes the
 * console and debugger.  Returns the top of thread0's kernel stack for
 * locore to switch onto.  NOTE: statement order here is load-bearing.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kmdp = init_ops.parse_preload_data(modulep);

	/* Apply any CPU microcode update supplied by the loader. */
	physfree += ucode_load_bsp(physfree + KERNBASE);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve thread0's kernel stack out of the space above the kernel. */
	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		/* System descriptors occupy two GDT slots; skip them here. */
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ?
	    &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	/* Speculative-execution mitigation knobs (old and new names). */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
	    &x86_rngds_mitg_enable);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Reinitialize thread0's stack base now that the xsave area size is
	 * known.  Set up thread0's pcb save area after fpuinit calculated fpu
	 * save area size.  Zero out the extended state header in fpu save area.
	 */
	set_top_of_stack_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	cpu_probe_amdc1e();

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}

/*
 * MD pcpu initialization hook; the ACPI id is filled in later (by the
 * MADT/ACPI code) so start it out as an invalid sentinel.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

/*
 * sysctl handler exporting the raw BIOS SMAP table (plus optional extended
 * attributes) as an array of struct bios_smap_xattr.
 */
static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader.
 */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	/* Table size in bytes is stored by the loader just before the data. */
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr",
    "Raw BIOS SMAP data");

/*
 * sysctl handler exporting the raw UEFI memory map (header plus descriptor
 * array) exactly as provided by the loader.
 */
static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	/* Size in bytes is stored by the loader just before the header. */
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header",
    "Raw EFI Memory Map");

/*
 * Enter a spinlock section: on the first (outermost) acquisition, disable
 * interrupts, remember the previous interrupt state, and enter a critical
 * section.  Nested acquisitions just bump the per-thread count.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}

/*
 * Leave a spinlock section; on the outermost release, exit the critical
 * section and restore the interrupt state saved by spinlock_enter().
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

/*
 * ptrace support: redirect the traced thread's user %rip.  PCB_FULL_IRET
 * forces a full iret return path so the change takes effect.
 */
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/*
 * ptrace support: arm single-stepping by setting the trap flag (PSL_T)
 * in the thread's saved user rflags.  Idempotent: TDB_STEP records that
 * we set the flag so it can be cleared later.
 */
int
ptrace_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
		td->td_frame->tf_rflags |= PSL_T;
		td->td_dbgflags |= TDB_STEP;
	}
	return (0);
}

/* ptrace support: disarm single-stepping (inverse of ptrace_single_step). */
int
ptrace_clear_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	td->td_frame->tf_rflags &= ~PSL_T;
	td->td_dbgflags &= ~TDB_STEP;
	return (0);
}

/* Copy a thread's saved user register state into a struct reg. */
int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	return (fill_frame_regs(tp, regs));
}

/*
 * Copy a trapframe into a struct reg.  Segment registers are only valid
 * in the frame when TF_HASSEGS is set; otherwise they are reported as 0.
 */
int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{

	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9 = tp->tf_r9;
	regs->r_r8 = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	if (tp->tf_flags & TF_HASSEGS) {
		regs->r_ds = tp->tf_ds;
		regs->r_es = tp->tf_es;
		regs->r_fs = tp->tf_fs;
		regs->r_gs = tp->tf_gs;
	} else {
		regs->r_ds = 0;
		regs->r_es = 0;
		regs->r_fs = 0;
		regs->r_gs = 0;
	}
	regs->r_err = 0;
	regs->r_trapno = 0;
	return (0);
}

/*
 * Install a struct reg into a thread's trapframe.  Rejects rflags or %cs
 * values that would let the thread escalate privilege (EFL_SECURE /
 * CS_SECURE checks).  Segment register restore is currently disabled.
 */
int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9 = regs->r_r9;
	tp->tf_r8 = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/* XXX check all this stuff! */
/* externalize from sv_xmm */
/*
 * Convert the in-kernel FXSAVE-layout FPU state (struct savefpu) into the
 * userland-visible struct fpreg.  Copies control/status words, the eight
 * 80-bit x87 registers (10 bytes each), and the sixteen 128-bit XMM
 * registers.
 */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/* internalize from fpregs into sv_xmm */
/*
 * Inverse of fill_fpregs_xmm(): install a userland struct fpreg into the
 * kernel FXSAVE area.  The incoming mxcsr is masked with cpu_mxcsr_mask
 * so userland cannot set reserved MXCSR bits (which would fault on
 * FXRSTOR).
 */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE
registers */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}

/*
 * Externalize the FPU state from td->pcb into *fpregs (ptrace
 * PT_GETFPREGS).  The target thread must be stopped or be curthread.
 */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	/* Flush any live FPU contents into the pcb save area first. */
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}

/* Internalize *fpregs into td->pcb (ptrace PT_SETFPREGS). */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	/* Block context switches while the save area is rewritten. */
	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	critical_exit();
	return (0);
}

/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	/* sigonstack() consults signal state; hold the proc lock for it. */
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9 = tp->tf_r9;
	mcp->mc_r8 = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	/*
	 * GET_MC_CLEAR_RET: report the context as if the interrupted
	 * syscall returned success (%rax/%rdx zero, carry clear).
	 */
	if (flags & GET_MC_CLEAR_RET) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp, NULL, 0);
	/* Refresh pcb_{fs,gs}base from the MSRs before exporting them. */
	update_pcb_bases(pcb);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}

/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	/* Merge user-modifiable rflags bits with the current kernel ones. */
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		/* Length was bounded just above, so the alloca is safe. */
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9 = mcp->mc_r9;
	tp->tf_r8 = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	tp->tf_flags = mcp->mc_flags;
	if (tp->tf_flags &
TF_HASSEGS) { 2373 tp->tf_ds = mcp->mc_ds; 2374 tp->tf_es = mcp->mc_es; 2375 tp->tf_fs = mcp->mc_fs; 2376 tp->tf_gs = mcp->mc_gs; 2377 } 2378 set_pcb_flags(pcb, PCB_FULL_IRET); 2379 if (mcp->mc_flags & _MC_HASBASES) { 2380 pcb->pcb_fsbase = mcp->mc_fsbase; 2381 pcb->pcb_gsbase = mcp->mc_gsbase; 2382 } 2383 return (0); 2384 } 2385 2386 static void 2387 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, 2388 size_t xfpusave_len) 2389 { 2390 size_t max_len, len; 2391 2392 mcp->mc_ownedfp = fpugetregs(td); 2393 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], 2394 sizeof(mcp->mc_fpstate)); 2395 mcp->mc_fpformat = fpuformat(); 2396 if (!use_xsave || xfpusave_len == 0) 2397 return; 2398 max_len = cpu_max_ext_state_size - sizeof(struct savefpu); 2399 len = xfpusave_len; 2400 if (len > max_len) { 2401 len = max_len; 2402 bzero(xfpusave + max_len, len - max_len); 2403 } 2404 mcp->mc_flags |= _MC_HASFPXSTATE; 2405 mcp->mc_xfpustate_len = len; 2406 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); 2407 } 2408 2409 static int 2410 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, 2411 size_t xfpustate_len) 2412 { 2413 int error; 2414 2415 if (mcp->mc_fpformat == _MC_FPFMT_NODEV) 2416 return (0); 2417 else if (mcp->mc_fpformat != _MC_FPFMT_XMM) 2418 return (EINVAL); 2419 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { 2420 /* We don't care what state is left in the FPU or PCB. */ 2421 fpstate_drop(td); 2422 error = 0; 2423 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || 2424 mcp->mc_ownedfp == _MC_FPOWNED_PCB) { 2425 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate, 2426 xfpustate, xfpustate_len); 2427 } else 2428 return (EINVAL); 2429 return (error); 2430 } 2431 2432 void 2433 fpstate_drop(struct thread *td) 2434 { 2435 2436 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); 2437 critical_enter(); 2438 if (PCPU_GET(fpcurthread) == td) 2439 fpudrop(); 2440 /* 2441 * XXX force a full drop of the fpu. 
The above only drops it if we 2442 * owned it. 2443 * 2444 * XXX I don't much like fpugetuserregs()'s semantics of doing a full 2445 * drop. Dropping only to the pcb matches fnsave's behaviour. 2446 * We only need to drop to !PCB_INITDONE in sendsig(). But 2447 * sendsig() is the only caller of fpugetuserregs()... perhaps we just 2448 * have too many layers. 2449 */ 2450 clear_pcb_flags(curthread->td_pcb, 2451 PCB_FPUINITDONE | PCB_USERFPUINITDONE); 2452 critical_exit(); 2453 } 2454 2455 int 2456 fill_dbregs(struct thread *td, struct dbreg *dbregs) 2457 { 2458 struct pcb *pcb; 2459 2460 if (td == NULL) { 2461 dbregs->dr[0] = rdr0(); 2462 dbregs->dr[1] = rdr1(); 2463 dbregs->dr[2] = rdr2(); 2464 dbregs->dr[3] = rdr3(); 2465 dbregs->dr[6] = rdr6(); 2466 dbregs->dr[7] = rdr7(); 2467 } else { 2468 pcb = td->td_pcb; 2469 dbregs->dr[0] = pcb->pcb_dr0; 2470 dbregs->dr[1] = pcb->pcb_dr1; 2471 dbregs->dr[2] = pcb->pcb_dr2; 2472 dbregs->dr[3] = pcb->pcb_dr3; 2473 dbregs->dr[6] = pcb->pcb_dr6; 2474 dbregs->dr[7] = pcb->pcb_dr7; 2475 } 2476 dbregs->dr[4] = 0; 2477 dbregs->dr[5] = 0; 2478 dbregs->dr[8] = 0; 2479 dbregs->dr[9] = 0; 2480 dbregs->dr[10] = 0; 2481 dbregs->dr[11] = 0; 2482 dbregs->dr[12] = 0; 2483 dbregs->dr[13] = 0; 2484 dbregs->dr[14] = 0; 2485 dbregs->dr[15] = 0; 2486 return (0); 2487 } 2488 2489 int 2490 set_dbregs(struct thread *td, struct dbreg *dbregs) 2491 { 2492 struct pcb *pcb; 2493 int i; 2494 2495 if (td == NULL) { 2496 load_dr0(dbregs->dr[0]); 2497 load_dr1(dbregs->dr[1]); 2498 load_dr2(dbregs->dr[2]); 2499 load_dr3(dbregs->dr[3]); 2500 load_dr6(dbregs->dr[6]); 2501 load_dr7(dbregs->dr[7]); 2502 } else { 2503 /* 2504 * Don't let an illegal value for dr7 get set. Specifically, 2505 * check for undefined settings. Setting these bit patterns 2506 * result in undefined behaviour and can lead to an unexpected 2507 * TRCTRAP or a general protection fault right here. 
		 * Upper bits of dr6 and dr7 must not be set
		 */
		for (i = 0; i < 4; i++) {
			/* Access type 0b10 in DR7 is architecturally reserved. */
			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
			/* 8-byte watchpoints are not valid for 32-bit code. */
			if (td->td_frame->tf_cs == _ucode32sel &&
			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
				return (EINVAL);
		}
		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
			return (EINVAL);

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * process's address space.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
			/* dr0 is enabled */
			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
			/* dr1 is enabled */
			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
			/* dr2 is enabled */
			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
			/* dr3 is enabled */
			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		/* Have context switches load the debug registers for td. */
		set_pcb_flags(pcb, PCB_DBREGS);
	}

	return (0);
}

/* Clear all hardware debug registers on this CPU. */
void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
2586 */ 2587 int 2588 user_dbreg_trap(register_t dr6) 2589 { 2590 u_int64_t dr7; 2591 u_int64_t bp; /* breakpoint bits extracted from dr6 */ 2592 int nbp; /* number of breakpoints that triggered */ 2593 caddr_t addr[4]; /* breakpoint addresses */ 2594 int i; 2595 2596 bp = dr6 & DBREG_DR6_BMASK; 2597 if (bp == 0) { 2598 /* 2599 * None of the breakpoint bits are set meaning this 2600 * trap was not caused by any of the debug registers 2601 */ 2602 return 0; 2603 } 2604 2605 dr7 = rdr7(); 2606 if ((dr7 & 0x000000ff) == 0) { 2607 /* 2608 * all GE and LE bits in the dr7 register are zero, 2609 * thus the trap couldn't have been caused by the 2610 * hardware debug registers 2611 */ 2612 return 0; 2613 } 2614 2615 nbp = 0; 2616 2617 /* 2618 * at least one of the breakpoints were hit, check to see 2619 * which ones and if any of them are user space addresses 2620 */ 2621 2622 if (bp & 0x01) { 2623 addr[nbp++] = (caddr_t)rdr0(); 2624 } 2625 if (bp & 0x02) { 2626 addr[nbp++] = (caddr_t)rdr1(); 2627 } 2628 if (bp & 0x04) { 2629 addr[nbp++] = (caddr_t)rdr2(); 2630 } 2631 if (bp & 0x08) { 2632 addr[nbp++] = (caddr_t)rdr3(); 2633 } 2634 2635 for (i = 0; i < nbp; i++) { 2636 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 2637 /* 2638 * addr[i] is in user space 2639 */ 2640 return nbp; 2641 } 2642 } 2643 2644 /* 2645 * None of the breakpoints are in user space. 2646 */ 2647 return 0; 2648 } 2649 2650 /* 2651 * The pcb_flags is only modified by current thread, or by other threads 2652 * when current thread is stopped. However, current thread may change it 2653 * from the interrupt context in cpu_switch(), or in the trap handler. 2654 * When we read-modify-write pcb_flags from C sources, compiler may generate 2655 * code that is not atomic regarding the interrupt handler. If a trap or 2656 * interrupt happens and any flag is modified from the handler, it can be 2657 * clobbered with the cached value later. 
Therefore, we implement setting 2658 * and clearing flags with single-instruction functions, which do not race 2659 * with possible modification of the flags from the trap or interrupt context, 2660 * because traps and interrupts are executed only on instruction boundary. 2661 */ 2662 void 2663 set_pcb_flags_raw(struct pcb *pcb, const u_int flags) 2664 { 2665 2666 __asm __volatile("orl %1,%0" 2667 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags) 2668 : "cc", "memory"); 2669 2670 } 2671 2672 /* 2673 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs 2674 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into 2675 * pcb if user space modified the bases. We must save on the context 2676 * switch or if the return to usermode happens through the doreti. 2677 * 2678 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET, 2679 * which have a consequence that the base MSRs must be saved each time 2680 * the PCB_FULL_IRET flag is set. We disable interrupts to sync with 2681 * context switches. 2682 */ 2683 static void 2684 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags) 2685 { 2686 register_t r; 2687 2688 if (curpcb == pcb && 2689 (flags & PCB_FULL_IRET) != 0 && 2690 (pcb->pcb_flags & PCB_FULL_IRET) == 0) { 2691 r = intr_disable(); 2692 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) { 2693 if (rfs() == _ufssel) 2694 pcb->pcb_fsbase = rdfsbase(); 2695 if (rgs() == _ugssel) 2696 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE); 2697 } 2698 set_pcb_flags_raw(pcb, flags); 2699 intr_restore(r); 2700 } else { 2701 set_pcb_flags_raw(pcb, flags); 2702 } 2703 } 2704 2705 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int)) 2706 { 2707 2708 return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ? 
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}

/* Atomically clear flag bits in pcb_flags; see set_pcb_flags_raw(). */
void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return inb(port);
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */

#undef memset
#undef memmove
#undef memcpy

void *memset_std(void *buf, int c, size_t len);
void *memset_erms(void *buf, int c, size_t len);
void *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);

#ifdef KCSAN
/*
 * These fail to build as ifuncs when used with KCSAN.
 */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
#else
/* Resolvers: prefer the ERMS variants when the CPU advertises ERMS. */
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
#endif

void pagezero_std(void *addr);
void pagezero_erms(void *addr);
/* Resolver for the page-zeroing primitive, same ERMS selection. */
DEFINE_IFUNC(, void , pagezero, (void *))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}