/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_pci.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#include <vm/vm_phys.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =		native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.mp_bootaddress =		mp_bootaddress,
	.start_all_aps =		native_start_all_aps,
#endif
#ifdef DEV_PCI
	.msi_init =			msi_init,
#endif
};

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define	ICH_PMBASE	0x400
#define	ICH_SMI_EN	ICH_PMBASE + 0x30

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define	PHYS_AVAIL_ARRAY_END	(nitems(phys_avail) - 2)
#define	DUMP_AVAIL_ARRAY_END	(nitems(dump_avail) - 2)

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_gdt, r_idt;

struct pcpu __pcpu[MAXCPU];

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

static void
cpu_startup(dummy)
	void *dummy;
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by call
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}
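	/*
	 * Any extended FPU state beyond the legacy struct savefpu is staged
	 * in the stack buffer allocated above and copied out to the user
	 * stack (at mc_xfpustate) together with the sigframe below.
	 */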
	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(td, uap)
	struct thread *td;
	struct sigreturn_args /* {
		const struct __ucontext *sigcntxp;
	} */ *uap;
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
			    "pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	update_pcb_bases(pcb);
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char mce0_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
static char dbg0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

struct amd64tss common_tss[MAXCPU];
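/*
 * The statically allocated dblfault/mce/nmi/dbg stacks above back the BSP's
 * IST entries: hammer_time() points tss_ist1 through tss_ist4 at them so the
 * corresponding exceptions are always delivered on a known-good stack.
 */
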
/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing),
	IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

void
sdtossd(sd, ssd)
	struct user_segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{

	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_long = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}

void
ssdtosd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct user_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_long = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran = ssd->ssd_gran;
}

void
ssdtosyssd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct system_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_gran = ssd->ssd_gran;
}

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
 * Return a bitmap of the current interrupt requests.  This is 8259-specific
 * and is only suitable for use at probe time.
 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
 * It shouldn't be here.  There should probably be an APIC centric
 * implementation in the apic driver code, if at all.
 */
intrmask_t
isa_irq_pending(void)
{
	u_char irr1;
	u_char irr2;

	irr1 = inb(IO_ICU1);
	irr2 = inb(IO_ICU2);
	return ((irr2 << 8) | irr1);
}
#endif

u_int basemem;

static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}
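	/*
	 * physmap[] holds (base, end) pairs kept sorted by base address,
	 * which is why the entries above insert_idx were slid down by one
	 * pair before the new entry is written below.
	 */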
	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}
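	/* At this point basemem holds the size, in KB, of memory below 1MB. */
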
	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in real mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress)
		init_ops.mp_bootaddress(physmap, &physmap_idx);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write, non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one page past the last
			 * valid page, making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we're at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);
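	/*
	 * The pages just trimmed from the top of the last chunk hold the
	 * kernel message buffer; map them through the direct map below.
	 */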
	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct nmi_pcpu *np;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kmdp = init_ops.parse_preload_data(modulep);

	physfree += ucode_load_bsp(physfree + KERNBASE);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu[0];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */
	PCPU_SET(tssp, &common_tss[0]);
	PCPU_SET(commontssp, &common_tss[0]);
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist2 = (long) np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist3 = (long) np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist4 = (long) np;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * Temporarily forge some valid pointer to PCB, for exception
	 * handlers.  It is reinitialized properly below after FPU is
	 * set up.  Also set up td_critnest to short-cut the page
	 * fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
	 * area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = (vm_offset_t)thread0.td_pcb;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	common_tss[0].tss_rsp0 = rsp0;
	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	cpu_probe_amdc1e();

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	TSEXIT();

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;
*/ 1907 kmdp = preload_search_by_type("elf kernel"); 1908 if (kmdp == NULL) 1909 kmdp = preload_search_by_type("elf64 kernel"); 1910 smapbase = (struct bios_smap *)preload_search_info(kmdp, 1911 MODINFO_METADATA | MODINFOMD_SMAP); 1912 if (smapbase == NULL) 1913 return (0); 1914 smapattr = (uint32_t *)preload_search_info(kmdp, 1915 MODINFO_METADATA | MODINFOMD_SMAP_XATTR); 1916 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase); 1917 error = 0; 1918 for (i = 0; i < count; i++) { 1919 smap.base = smapbase[i].base; 1920 smap.length = smapbase[i].length; 1921 smap.type = smapbase[i].type; 1922 if (smapattr != NULL) 1923 smap.xattr = smapattr[i]; 1924 else 1925 smap.xattr = 0; 1926 error = SYSCTL_OUT(req, &smap, sizeof(smap)); 1927 } 1928 return (error); 1929 } 1930 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, 1931 smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); 1932 1933 static int 1934 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS) 1935 { 1936 struct efi_map_header *efihdr; 1937 caddr_t kmdp; 1938 uint32_t efisize; 1939 1940 kmdp = preload_search_by_type("elf kernel"); 1941 if (kmdp == NULL) 1942 kmdp = preload_search_by_type("elf64 kernel"); 1943 efihdr = (struct efi_map_header *)preload_search_info(kmdp, 1944 MODINFO_METADATA | MODINFOMD_EFI_MAP); 1945 if (efihdr == NULL) 1946 return (0); 1947 efisize = *((uint32_t *)efihdr - 1); 1948 return (SYSCTL_OUT(req, efihdr, efisize)); 1949 } 1950 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, 1951 efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map"); 1952 1953 void 1954 spinlock_enter(void) 1955 { 1956 struct thread *td; 1957 register_t flags; 1958 1959 td = curthread; 1960 if (td->td_md.md_spinlock_count == 0) { 1961 flags = intr_disable(); 1962 td->td_md.md_spinlock_count = 1; 1963 td->td_md.md_saved_flags = flags; 1964 critical_enter(); 1965 } else 1966 td->td_md.md_spinlock_count++; 1967 } 1968 1969 void 1970 spinlock_exit(void) 1971 { 1972 struct thread *td; 1973 register_t flags; 1974 1975 td = curthread; 1976 flags = td->td_md.md_saved_flags; 1977 td->td_md.md_spinlock_count--; 1978 if (td->td_md.md_spinlock_count == 0) { 1979 critical_exit(); 1980 intr_restore(flags); 1981 } 1982 } 1983 1984 /* 1985 * Construct a PCB from a trapframe. This is called from kdb_trap() where 1986 * we want to start a backtrace from the function that caused us to enter 1987 * the debugger. We have the context in the trapframe, but base the trace 1988 * on the PCB. The PCB doesn't have to be perfect, as long as it contains 1989 * enough for a backtrace. 
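 *
 * A minimal illustration of the intended use (assuming kdb(4)'s saved
 * trapframe pointer, kdb_frame):
 *
 *	struct pcb pcb;
 *
 *	makectx(kdb_frame, &pcb);
 *	(unwind starting from pcb.pcb_rip, pcb.pcb_rbp and pcb.pcb_rsp)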
1990 */ 1991 void 1992 makectx(struct trapframe *tf, struct pcb *pcb) 1993 { 1994 1995 pcb->pcb_r12 = tf->tf_r12; 1996 pcb->pcb_r13 = tf->tf_r13; 1997 pcb->pcb_r14 = tf->tf_r14; 1998 pcb->pcb_r15 = tf->tf_r15; 1999 pcb->pcb_rbp = tf->tf_rbp; 2000 pcb->pcb_rbx = tf->tf_rbx; 2001 pcb->pcb_rip = tf->tf_rip; 2002 pcb->pcb_rsp = tf->tf_rsp; 2003 } 2004 2005 int 2006 ptrace_set_pc(struct thread *td, unsigned long addr) 2007 { 2008 2009 td->td_frame->tf_rip = addr; 2010 set_pcb_flags(td->td_pcb, PCB_FULL_IRET); 2011 return (0); 2012 } 2013 2014 int 2015 ptrace_single_step(struct thread *td) 2016 { 2017 2018 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); 2019 if ((td->td_frame->tf_rflags & PSL_T) == 0) { 2020 td->td_frame->tf_rflags |= PSL_T; 2021 td->td_dbgflags |= TDB_STEP; 2022 } 2023 return (0); 2024 } 2025 2026 int 2027 ptrace_clear_single_step(struct thread *td) 2028 { 2029 2030 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); 2031 td->td_frame->tf_rflags &= ~PSL_T; 2032 td->td_dbgflags &= ~TDB_STEP; 2033 return (0); 2034 } 2035 2036 int 2037 fill_regs(struct thread *td, struct reg *regs) 2038 { 2039 struct trapframe *tp; 2040 2041 tp = td->td_frame; 2042 return (fill_frame_regs(tp, regs)); 2043 } 2044 2045 int 2046 fill_frame_regs(struct trapframe *tp, struct reg *regs) 2047 { 2048 2049 regs->r_r15 = tp->tf_r15; 2050 regs->r_r14 = tp->tf_r14; 2051 regs->r_r13 = tp->tf_r13; 2052 regs->r_r12 = tp->tf_r12; 2053 regs->r_r11 = tp->tf_r11; 2054 regs->r_r10 = tp->tf_r10; 2055 regs->r_r9 = tp->tf_r9; 2056 regs->r_r8 = tp->tf_r8; 2057 regs->r_rdi = tp->tf_rdi; 2058 regs->r_rsi = tp->tf_rsi; 2059 regs->r_rbp = tp->tf_rbp; 2060 regs->r_rbx = tp->tf_rbx; 2061 regs->r_rdx = tp->tf_rdx; 2062 regs->r_rcx = tp->tf_rcx; 2063 regs->r_rax = tp->tf_rax; 2064 regs->r_rip = tp->tf_rip; 2065 regs->r_cs = tp->tf_cs; 2066 regs->r_rflags = tp->tf_rflags; 2067 regs->r_rsp = tp->tf_rsp; 2068 regs->r_ss = tp->tf_ss; 2069 if (tp->tf_flags & TF_HASSEGS) { 2070 regs->r_ds = tp->tf_ds; 2071 regs->r_es = tp->tf_es; 2072 regs->r_fs = tp->tf_fs; 2073 regs->r_gs = tp->tf_gs; 2074 } else { 2075 regs->r_ds = 0; 2076 regs->r_es = 0; 2077 regs->r_fs = 0; 2078 regs->r_gs = 0; 2079 } 2080 regs->r_err = 0; 2081 regs->r_trapno = 0; 2082 return (0); 2083 } 2084 2085 int 2086 set_regs(struct thread *td, struct reg *regs) 2087 { 2088 struct trapframe *tp; 2089 register_t rflags; 2090 2091 tp = td->td_frame; 2092 rflags = regs->r_rflags & 0xffffffff; 2093 if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs)) 2094 return (EINVAL); 2095 tp->tf_r15 = regs->r_r15; 2096 tp->tf_r14 = regs->r_r14; 2097 tp->tf_r13 = regs->r_r13; 2098 tp->tf_r12 = regs->r_r12; 2099 tp->tf_r11 = regs->r_r11; 2100 tp->tf_r10 = regs->r_r10; 2101 tp->tf_r9 = regs->r_r9; 2102 tp->tf_r8 = regs->r_r8; 2103 tp->tf_rdi = regs->r_rdi; 2104 tp->tf_rsi = regs->r_rsi; 2105 tp->tf_rbp = regs->r_rbp; 2106 tp->tf_rbx = regs->r_rbx; 2107 tp->tf_rdx = regs->r_rdx; 2108 tp->tf_rcx = regs->r_rcx; 2109 tp->tf_rax = regs->r_rax; 2110 tp->tf_rip = regs->r_rip; 2111 tp->tf_cs = regs->r_cs; 2112 tp->tf_rflags = rflags; 2113 tp->tf_rsp = regs->r_rsp; 2114 tp->tf_ss = regs->r_ss; 2115 if (0) { /* XXXKIB */ 2116 tp->tf_ds = regs->r_ds; 2117 tp->tf_es = regs->r_es; 2118 tp->tf_fs = regs->r_fs; 2119 tp->tf_gs = regs->r_gs; 2120 tp->tf_flags = TF_HASSEGS; 2121 } 2122 set_pcb_flags(td->td_pcb, PCB_FULL_IRET); 2123 return (0); 2124 } 2125 2126 /* XXX check all this stuff! 
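 * In particular, the conversions below assume the FXSAVE/XSAVE layout of
 * struct savefpu: eight x87 registers kept in 16-byte slots of which only
 * the low 10 bytes are architecturally defined (hence the 10-byte
 * bcopy()s), and sixteen 128-bit %xmm registers, matching the fpr_acc[]
 * and fpr_xacc[] arrays of struct fpreg.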
*/ 2127 /* externalize from sv_xmm */ 2128 static void 2129 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs) 2130 { 2131 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; 2132 struct envxmm *penv_xmm = &sv_xmm->sv_env; 2133 int i; 2134 2135 /* pcb -> fpregs */ 2136 bzero(fpregs, sizeof(*fpregs)); 2137 2138 /* FPU control/status */ 2139 penv_fpreg->en_cw = penv_xmm->en_cw; 2140 penv_fpreg->en_sw = penv_xmm->en_sw; 2141 penv_fpreg->en_tw = penv_xmm->en_tw; 2142 penv_fpreg->en_opcode = penv_xmm->en_opcode; 2143 penv_fpreg->en_rip = penv_xmm->en_rip; 2144 penv_fpreg->en_rdp = penv_xmm->en_rdp; 2145 penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr; 2146 penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask; 2147 2148 /* FPU registers */ 2149 for (i = 0; i < 8; ++i) 2150 bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10); 2151 2152 /* SSE registers */ 2153 for (i = 0; i < 16; ++i) 2154 bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16); 2155 } 2156 2157 /* internalize from fpregs into sv_xmm */ 2158 static void 2159 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm) 2160 { 2161 struct envxmm *penv_xmm = &sv_xmm->sv_env; 2162 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; 2163 int i; 2164 2165 /* fpregs -> pcb */ 2166 /* FPU control/status */ 2167 penv_xmm->en_cw = penv_fpreg->en_cw; 2168 penv_xmm->en_sw = penv_fpreg->en_sw; 2169 penv_xmm->en_tw = penv_fpreg->en_tw; 2170 penv_xmm->en_opcode = penv_fpreg->en_opcode; 2171 penv_xmm->en_rip = penv_fpreg->en_rip; 2172 penv_xmm->en_rdp = penv_fpreg->en_rdp; 2173 penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr; 2174 penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask; 2175 2176 /* FPU registers */ 2177 for (i = 0; i < 8; ++i) 2178 bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10); 2179 2180 /* SSE registers */ 2181 for (i = 0; i < 16; ++i) 2182 bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16); 2183 } 2184 2185 /* externalize from td->pcb */ 2186 int 2187 fill_fpregs(struct thread *td, struct fpreg *fpregs) 2188 { 2189 2190 KASSERT(td == curthread || TD_IS_SUSPENDED(td) || 2191 P_SHOULDSTOP(td->td_proc), 2192 ("not suspended thread %p", td)); 2193 fpugetregs(td); 2194 fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs); 2195 return (0); 2196 } 2197 2198 /* internalize to td->pcb */ 2199 int 2200 set_fpregs(struct thread *td, struct fpreg *fpregs) 2201 { 2202 2203 critical_enter(); 2204 set_fpregs_xmm(fpregs, get_pcb_user_save_td(td)); 2205 fpuuserinited(td); 2206 critical_exit(); 2207 return (0); 2208 } 2209 2210 /* 2211 * Get machine context. 
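 *
 * The mcontext_t filled in here is the machine-dependent part of a
 * ucontext_t; it is the same structure the getcontext(2) and
 * swapcontext(2) paths hand back to user space, and the same type a
 * signal handler receives inside its ucontext_t argument.  An
 * illustrative userland round trip is sketched after set_mcontext()
 * below.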
2212 */ 2213 int 2214 get_mcontext(struct thread *td, mcontext_t *mcp, int flags) 2215 { 2216 struct pcb *pcb; 2217 struct trapframe *tp; 2218 2219 pcb = td->td_pcb; 2220 tp = td->td_frame; 2221 PROC_LOCK(curthread->td_proc); 2222 mcp->mc_onstack = sigonstack(tp->tf_rsp); 2223 PROC_UNLOCK(curthread->td_proc); 2224 mcp->mc_r15 = tp->tf_r15; 2225 mcp->mc_r14 = tp->tf_r14; 2226 mcp->mc_r13 = tp->tf_r13; 2227 mcp->mc_r12 = tp->tf_r12; 2228 mcp->mc_r11 = tp->tf_r11; 2229 mcp->mc_r10 = tp->tf_r10; 2230 mcp->mc_r9 = tp->tf_r9; 2231 mcp->mc_r8 = tp->tf_r8; 2232 mcp->mc_rdi = tp->tf_rdi; 2233 mcp->mc_rsi = tp->tf_rsi; 2234 mcp->mc_rbp = tp->tf_rbp; 2235 mcp->mc_rbx = tp->tf_rbx; 2236 mcp->mc_rcx = tp->tf_rcx; 2237 mcp->mc_rflags = tp->tf_rflags; 2238 if (flags & GET_MC_CLEAR_RET) { 2239 mcp->mc_rax = 0; 2240 mcp->mc_rdx = 0; 2241 mcp->mc_rflags &= ~PSL_C; 2242 } else { 2243 mcp->mc_rax = tp->tf_rax; 2244 mcp->mc_rdx = tp->tf_rdx; 2245 } 2246 mcp->mc_rip = tp->tf_rip; 2247 mcp->mc_cs = tp->tf_cs; 2248 mcp->mc_rsp = tp->tf_rsp; 2249 mcp->mc_ss = tp->tf_ss; 2250 mcp->mc_ds = tp->tf_ds; 2251 mcp->mc_es = tp->tf_es; 2252 mcp->mc_fs = tp->tf_fs; 2253 mcp->mc_gs = tp->tf_gs; 2254 mcp->mc_flags = tp->tf_flags; 2255 mcp->mc_len = sizeof(*mcp); 2256 get_fpcontext(td, mcp, NULL, 0); 2257 update_pcb_bases(pcb); 2258 mcp->mc_fsbase = pcb->pcb_fsbase; 2259 mcp->mc_gsbase = pcb->pcb_gsbase; 2260 mcp->mc_xfpustate = 0; 2261 mcp->mc_xfpustate_len = 0; 2262 bzero(mcp->mc_spare, sizeof(mcp->mc_spare)); 2263 return (0); 2264 } 2265 2266 /* 2267 * Set machine context. 2268 * 2269 * However, we don't set any but the user modifiable flags, and we won't 2270 * touch the cs selector. 2271 */ 2272 int 2273 set_mcontext(struct thread *td, mcontext_t *mcp) 2274 { 2275 struct pcb *pcb; 2276 struct trapframe *tp; 2277 char *xfpustate; 2278 long rflags; 2279 int ret; 2280 2281 pcb = td->td_pcb; 2282 tp = td->td_frame; 2283 if (mcp->mc_len != sizeof(*mcp) || 2284 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) 2285 return (EINVAL); 2286 rflags = (mcp->mc_rflags & PSL_USERCHANGE) | 2287 (tp->tf_rflags & ~PSL_USERCHANGE); 2288 if (mcp->mc_flags & _MC_HASFPXSTATE) { 2289 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - 2290 sizeof(struct savefpu)) 2291 return (EINVAL); 2292 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); 2293 ret = copyin((void *)mcp->mc_xfpustate, xfpustate, 2294 mcp->mc_xfpustate_len); 2295 if (ret != 0) 2296 return (ret); 2297 } else 2298 xfpustate = NULL; 2299 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); 2300 if (ret != 0) 2301 return (ret); 2302 tp->tf_r15 = mcp->mc_r15; 2303 tp->tf_r14 = mcp->mc_r14; 2304 tp->tf_r13 = mcp->mc_r13; 2305 tp->tf_r12 = mcp->mc_r12; 2306 tp->tf_r11 = mcp->mc_r11; 2307 tp->tf_r10 = mcp->mc_r10; 2308 tp->tf_r9 = mcp->mc_r9; 2309 tp->tf_r8 = mcp->mc_r8; 2310 tp->tf_rdi = mcp->mc_rdi; 2311 tp->tf_rsi = mcp->mc_rsi; 2312 tp->tf_rbp = mcp->mc_rbp; 2313 tp->tf_rbx = mcp->mc_rbx; 2314 tp->tf_rdx = mcp->mc_rdx; 2315 tp->tf_rcx = mcp->mc_rcx; 2316 tp->tf_rax = mcp->mc_rax; 2317 tp->tf_rip = mcp->mc_rip; 2318 tp->tf_rflags = rflags; 2319 tp->tf_rsp = mcp->mc_rsp; 2320 tp->tf_ss = mcp->mc_ss; 2321 tp->tf_flags = mcp->mc_flags; 2322 if (tp->tf_flags & TF_HASSEGS) { 2323 tp->tf_ds = mcp->mc_ds; 2324 tp->tf_es = mcp->mc_es; 2325 tp->tf_fs = mcp->mc_fs; 2326 tp->tf_gs = mcp->mc_gs; 2327 } 2328 set_pcb_flags(pcb, PCB_FULL_IRET); 2329 if (mcp->mc_flags & _MC_HASBASES) { 2330 pcb->pcb_fsbase = mcp->mc_fsbase; 2331 pcb->pcb_gsbase = mcp->mc_gsbase; 2332 } 2333 return (0); 2334 } 
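/*
 * Illustrative only, not compiled into the kernel: a userland view of the
 * machine context handled by get_mcontext()/set_mcontext() above.  The
 * getcontext(2)/setcontext(2) family ultimately round-trips the mcontext_t
 * embedded in a ucontext_t; the sketch below swaps between two contexts in
 * the usual ucontext(3) fashion.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <ucontext.h>

static ucontext_t main_ctx, func_ctx;
static char func_stack[65536];

static void
hello(void)
{

	printf("running on the alternate context\n");
	/* Returning resumes uc_link, i.e. main_ctx. */
}

int
main(void)
{

	if (getcontext(&func_ctx) != 0)
		abort();
	func_ctx.uc_stack.ss_sp = func_stack;
	func_ctx.uc_stack.ss_size = sizeof(func_stack);
	func_ctx.uc_link = &main_ctx;
	makecontext(&func_ctx, hello, 0);

	/* Save the current context, run hello(), then come back here. */
	if (swapcontext(&main_ctx, &func_ctx) != 0)
		abort();
	printf("back on the original context\n");
	return (0);
}
#endif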
2335 2336 static void 2337 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, 2338 size_t xfpusave_len) 2339 { 2340 size_t max_len, len; 2341 2342 mcp->mc_ownedfp = fpugetregs(td); 2343 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], 2344 sizeof(mcp->mc_fpstate)); 2345 mcp->mc_fpformat = fpuformat(); 2346 if (!use_xsave || xfpusave_len == 0) 2347 return; 2348 max_len = cpu_max_ext_state_size - sizeof(struct savefpu); 2349 len = xfpusave_len; 2350 if (len > max_len) { 2351 len = max_len; 2352 bzero(xfpusave + max_len, len - max_len); 2353 } 2354 mcp->mc_flags |= _MC_HASFPXSTATE; 2355 mcp->mc_xfpustate_len = len; 2356 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); 2357 } 2358 2359 static int 2360 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, 2361 size_t xfpustate_len) 2362 { 2363 int error; 2364 2365 if (mcp->mc_fpformat == _MC_FPFMT_NODEV) 2366 return (0); 2367 else if (mcp->mc_fpformat != _MC_FPFMT_XMM) 2368 return (EINVAL); 2369 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { 2370 /* We don't care what state is left in the FPU or PCB. */ 2371 fpstate_drop(td); 2372 error = 0; 2373 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || 2374 mcp->mc_ownedfp == _MC_FPOWNED_PCB) { 2375 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate, 2376 xfpustate, xfpustate_len); 2377 } else 2378 return (EINVAL); 2379 return (error); 2380 } 2381 2382 void 2383 fpstate_drop(struct thread *td) 2384 { 2385 2386 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); 2387 critical_enter(); 2388 if (PCPU_GET(fpcurthread) == td) 2389 fpudrop(); 2390 /* 2391 * XXX force a full drop of the fpu. The above only drops it if we 2392 * owned it. 2393 * 2394 * XXX I don't much like fpugetuserregs()'s semantics of doing a full 2395 * drop. Dropping only to the pcb matches fnsave's behaviour. 2396 * We only need to drop to !PCB_INITDONE in sendsig(). But 2397 * sendsig() is the only caller of fpugetuserregs()... perhaps we just 2398 * have too many layers. 2399 */ 2400 clear_pcb_flags(curthread->td_pcb, 2401 PCB_FPUINITDONE | PCB_USERFPUINITDONE); 2402 critical_exit(); 2403 } 2404 2405 int 2406 fill_dbregs(struct thread *td, struct dbreg *dbregs) 2407 { 2408 struct pcb *pcb; 2409 2410 if (td == NULL) { 2411 dbregs->dr[0] = rdr0(); 2412 dbregs->dr[1] = rdr1(); 2413 dbregs->dr[2] = rdr2(); 2414 dbregs->dr[3] = rdr3(); 2415 dbregs->dr[6] = rdr6(); 2416 dbregs->dr[7] = rdr7(); 2417 } else { 2418 pcb = td->td_pcb; 2419 dbregs->dr[0] = pcb->pcb_dr0; 2420 dbregs->dr[1] = pcb->pcb_dr1; 2421 dbregs->dr[2] = pcb->pcb_dr2; 2422 dbregs->dr[3] = pcb->pcb_dr3; 2423 dbregs->dr[6] = pcb->pcb_dr6; 2424 dbregs->dr[7] = pcb->pcb_dr7; 2425 } 2426 dbregs->dr[4] = 0; 2427 dbregs->dr[5] = 0; 2428 dbregs->dr[8] = 0; 2429 dbregs->dr[9] = 0; 2430 dbregs->dr[10] = 0; 2431 dbregs->dr[11] = 0; 2432 dbregs->dr[12] = 0; 2433 dbregs->dr[13] = 0; 2434 dbregs->dr[14] = 0; 2435 dbregs->dr[15] = 0; 2436 return (0); 2437 } 2438 2439 int 2440 set_dbregs(struct thread *td, struct dbreg *dbregs) 2441 { 2442 struct pcb *pcb; 2443 int i; 2444 2445 if (td == NULL) { 2446 load_dr0(dbregs->dr[0]); 2447 load_dr1(dbregs->dr[1]); 2448 load_dr2(dbregs->dr[2]); 2449 load_dr3(dbregs->dr[3]); 2450 load_dr6(dbregs->dr[6]); 2451 load_dr7(dbregs->dr[7]); 2452 } else { 2453 /* 2454 * Don't let an illegal value for dr7 get set. Specifically, 2455 * check for undefined settings. 
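 * (The access-type value 0x02 rejected below is the I/O-breakpoint
 * encoding, which is reserved unless CR4.DE is set, and the 8-byte
 * length encoding is defined only for 64-bit code, hence the extra
 * check against _ucode32sel.)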
Setting these bit patterns 2456 * result in undefined behaviour and can lead to an unexpected 2457 * TRCTRAP or a general protection fault right here. 2458 * Upper bits of dr6 and dr7 must not be set 2459 */ 2460 for (i = 0; i < 4; i++) { 2461 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) 2462 return (EINVAL); 2463 if (td->td_frame->tf_cs == _ucode32sel && 2464 DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8) 2465 return (EINVAL); 2466 } 2467 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 || 2468 (dbregs->dr[7] & 0xffffffff00000000ul) != 0) 2469 return (EINVAL); 2470 2471 pcb = td->td_pcb; 2472 2473 /* 2474 * Don't let a process set a breakpoint that is not within the 2475 * process's address space. If a process could do this, it 2476 * could halt the system by setting a breakpoint in the kernel 2477 * (if ddb was enabled). Thus, we need to check to make sure 2478 * that no breakpoints are being enabled for addresses outside 2479 * process's address space. 2480 * 2481 * XXX - what about when the watched area of the user's 2482 * address space is written into from within the kernel 2483 * ... wouldn't that still cause a breakpoint to be generated 2484 * from within kernel mode? 2485 */ 2486 2487 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { 2488 /* dr0 is enabled */ 2489 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) 2490 return (EINVAL); 2491 } 2492 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { 2493 /* dr1 is enabled */ 2494 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) 2495 return (EINVAL); 2496 } 2497 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { 2498 /* dr2 is enabled */ 2499 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) 2500 return (EINVAL); 2501 } 2502 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { 2503 /* dr3 is enabled */ 2504 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) 2505 return (EINVAL); 2506 } 2507 2508 pcb->pcb_dr0 = dbregs->dr[0]; 2509 pcb->pcb_dr1 = dbregs->dr[1]; 2510 pcb->pcb_dr2 = dbregs->dr[2]; 2511 pcb->pcb_dr3 = dbregs->dr[3]; 2512 pcb->pcb_dr6 = dbregs->dr[6]; 2513 pcb->pcb_dr7 = dbregs->dr[7]; 2514 2515 set_pcb_flags(pcb, PCB_DBREGS); 2516 } 2517 2518 return (0); 2519 } 2520 2521 void 2522 reset_dbregs(void) 2523 { 2524 2525 load_dr7(0); /* Turn off the control bits first */ 2526 load_dr0(0); 2527 load_dr1(0); 2528 load_dr2(0); 2529 load_dr3(0); 2530 load_dr6(0); 2531 } 2532 2533 /* 2534 * Return > 0 if a hardware breakpoint has been hit, and the 2535 * breakpoint was in user space. Return 0, otherwise. 
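 *
 * For reference, the B0-B3 bits of %dr6 examined below record which of
 * the four address registers triggered the #DB exception; a caller such
 * as the kernel debug-trap path can use a non-zero return value to
 * attribute the trap to the traced user process rather than to the
 * kernel itself.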
2536 */ 2537 int 2538 user_dbreg_trap(register_t dr6) 2539 { 2540 u_int64_t dr7; 2541 u_int64_t bp; /* breakpoint bits extracted from dr6 */ 2542 int nbp; /* number of breakpoints that triggered */ 2543 caddr_t addr[4]; /* breakpoint addresses */ 2544 int i; 2545 2546 bp = dr6 & DBREG_DR6_BMASK; 2547 if (bp == 0) { 2548 /* 2549 * None of the breakpoint bits are set meaning this 2550 * trap was not caused by any of the debug registers 2551 */ 2552 return 0; 2553 } 2554 2555 dr7 = rdr7(); 2556 if ((dr7 & 0x000000ff) == 0) { 2557 /* 2558 * all GE and LE bits in the dr7 register are zero, 2559 * thus the trap couldn't have been caused by the 2560 * hardware debug registers 2561 */ 2562 return 0; 2563 } 2564 2565 nbp = 0; 2566 2567 /* 2568 * at least one of the breakpoints were hit, check to see 2569 * which ones and if any of them are user space addresses 2570 */ 2571 2572 if (bp & 0x01) { 2573 addr[nbp++] = (caddr_t)rdr0(); 2574 } 2575 if (bp & 0x02) { 2576 addr[nbp++] = (caddr_t)rdr1(); 2577 } 2578 if (bp & 0x04) { 2579 addr[nbp++] = (caddr_t)rdr2(); 2580 } 2581 if (bp & 0x08) { 2582 addr[nbp++] = (caddr_t)rdr3(); 2583 } 2584 2585 for (i = 0; i < nbp; i++) { 2586 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 2587 /* 2588 * addr[i] is in user space 2589 */ 2590 return nbp; 2591 } 2592 } 2593 2594 /* 2595 * None of the breakpoints are in user space. 2596 */ 2597 return 0; 2598 } 2599 2600 /* 2601 * The pcb_flags is only modified by current thread, or by other threads 2602 * when current thread is stopped. However, current thread may change it 2603 * from the interrupt context in cpu_switch(), or in the trap handler. 2604 * When we read-modify-write pcb_flags from C sources, compiler may generate 2605 * code that is not atomic regarding the interrupt handler. If a trap or 2606 * interrupt happens and any flag is modified from the handler, it can be 2607 * clobbered with the cached value later. Therefore, we implement setting 2608 * and clearing flags with single-instruction functions, which do not race 2609 * with possible modification of the flags from the trap or interrupt context, 2610 * because traps and interrupts are executed only on instruction boundary. 2611 */ 2612 void 2613 set_pcb_flags_raw(struct pcb *pcb, const u_int flags) 2614 { 2615 2616 __asm __volatile("orl %1,%0" 2617 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags) 2618 : "cc", "memory"); 2619 2620 } 2621 2622 /* 2623 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs 2624 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into 2625 * pcb if user space modified the bases. We must save on the context 2626 * switch or if the return to usermode happens through the doreti. 2627 * 2628 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET, 2629 * which have a consequence that the base MSRs must be saved each time 2630 * the PCB_FULL_IRET flag is set. We disable interrupts to sync with 2631 * context switches. 
2632 */ 2633 static void 2634 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags) 2635 { 2636 register_t r; 2637 2638 if (curpcb == pcb && 2639 (flags & PCB_FULL_IRET) != 0 && 2640 (pcb->pcb_flags & PCB_FULL_IRET) == 0) { 2641 r = intr_disable(); 2642 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) { 2643 if (rfs() == _ufssel) 2644 pcb->pcb_fsbase = rdfsbase(); 2645 if (rgs() == _ugssel) 2646 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE); 2647 } 2648 set_pcb_flags_raw(pcb, flags); 2649 intr_restore(r); 2650 } else { 2651 set_pcb_flags_raw(pcb, flags); 2652 } 2653 } 2654 2655 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int), static) 2656 { 2657 2658 return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ? 2659 set_pcb_flags_fsgsbase : set_pcb_flags_raw); 2660 } 2661 2662 void 2663 clear_pcb_flags(struct pcb *pcb, const u_int flags) 2664 { 2665 2666 __asm __volatile("andl %1,%0" 2667 : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags) 2668 : "cc", "memory"); 2669 } 2670 2671 #ifdef KDB 2672 2673 /* 2674 * Provide inb() and outb() as functions. They are normally only available as 2675 * inline functions, thus cannot be called from the debugger. 2676 */ 2677 2678 /* silence compiler warnings */ 2679 u_char inb_(u_short); 2680 void outb_(u_short, u_char); 2681 2682 u_char 2683 inb_(u_short port) 2684 { 2685 return inb(port); 2686 } 2687 2688 void 2689 outb_(u_short port, u_char data) 2690 { 2691 outb(port, data); 2692 } 2693 2694 #endif /* KDB */ 2695 2696 #undef memset 2697 #undef memmove 2698 #undef memcpy 2699 2700 void *memset_std(void *buf, int c, size_t len); 2701 void *memset_erms(void *buf, int c, size_t len); 2702 DEFINE_IFUNC(, void *, memset, (void *, int, size_t), static) 2703 { 2704 2705 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 2706 memset_erms : memset_std); 2707 } 2708 2709 void *memmove_std(void * _Nonnull dst, const void * _Nonnull src, 2710 size_t len); 2711 void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src, 2712 size_t len); 2713 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull, 2714 size_t), static) 2715 { 2716 2717 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 2718 memmove_erms : memmove_std); 2719 } 2720 2721 void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src, 2722 size_t len); 2723 void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src, 2724 size_t len); 2725 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t), 2726 static) 2727 { 2728 2729 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 2730 memcpy_erms : memcpy_std); 2731 } 2732 2733 void pagezero_std(void *addr); 2734 void pagezero_erms(void *addr); 2735 DEFINE_IFUNC(, void , pagezero, (void *), static) 2736 { 2737 2738 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 2739 pagezero_erms : pagezero_std); 2740 } 2741
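/*
 * Illustrative only, not compiled into the kernel: the DEFINE_IFUNC()
 * uses above resolve memset/memmove/memcpy/pagezero once, based on a
 * CPUID feature bit, instead of testing the flag on every call.  A rough
 * userland analogue is sketched below, assuming a compiler that provides
 * <cpuid.h>; the buf_clear names are hypothetical.  CPUID leaf 7,
 * subleaf 0, EBX bit 9 is the ERMS ("Enhanced REP MOVSB/STOSB") flag the
 * kernel variants key off.
 */
#if 0
#include <cpuid.h>
#include <stdio.h>
#include <string.h>

static void *
buf_clear_plain(void *buf, size_t len)
{

	return (memset(buf, 0, len));
}

static void *
buf_clear_erms(void *buf, size_t len)
{

	/* A real ERMS variant would use "rep stosb"; memset stands in here. */
	return (memset(buf, 0, len));
}

/* Resolved once, the way DEFINE_IFUNC() resolves the kernel routines. */
static void *(*buf_clear)(void *, size_t);

static void
resolve_buf_clear(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) != 0 &&
	    (ebx & (1u << 9)) != 0)
		buf_clear = buf_clear_erms;
	else
		buf_clear = buf_clear_plain;
}

int
main(void)
{
	char page[4096];

	resolve_buf_clear();
	buf_clear(page, sizeof(page));
	printf("cleared %zu bytes with the %s variant\n", sizeof(page),
	    buf_clear == buf_clear_erms ? "erms" : "plain");
	return (0);
}
#endif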