1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 2003 Peter Wemm. 5 * Copyright (c) 1992 Terrence R. Lambert. 6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * William Jolitz. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. All advertising materials mentioning features or use of this software 21 * must display the following acknowledgement: 22 * This product includes software developed by the University of 23 * California, Berkeley and its contributors. 24 * 4. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_atpic.h" 47 #include "opt_cpu.h" 48 #include "opt_ddb.h" 49 #include "opt_inet.h" 50 #include "opt_isa.h" 51 #include "opt_kstack_pages.h" 52 #include "opt_maxmem.h" 53 #include "opt_pci.h" 54 #include "opt_platform.h" 55 #include "opt_sched.h" 56 57 #include <sys/param.h> 58 #include <sys/proc.h> 59 #include <sys/systm.h> 60 #include <sys/asan.h> 61 #include <sys/bio.h> 62 #include <sys/buf.h> 63 #include <sys/bus.h> 64 #include <sys/callout.h> 65 #include <sys/cons.h> 66 #include <sys/cpu.h> 67 #include <sys/csan.h> 68 #include <sys/efi.h> 69 #include <sys/eventhandler.h> 70 #include <sys/exec.h> 71 #include <sys/imgact.h> 72 #include <sys/kdb.h> 73 #include <sys/kernel.h> 74 #include <sys/ktr.h> 75 #include <sys/linker.h> 76 #include <sys/lock.h> 77 #include <sys/malloc.h> 78 #include <sys/memrange.h> 79 #include <sys/msan.h> 80 #include <sys/msgbuf.h> 81 #include <sys/mutex.h> 82 #include <sys/pcpu.h> 83 #include <sys/ptrace.h> 84 #include <sys/reboot.h> 85 #include <sys/reg.h> 86 #include <sys/rwlock.h> 87 #include <sys/sched.h> 88 #include <sys/signalvar.h> 89 #ifdef SMP 90 #include <sys/smp.h> 91 #endif 92 #include <sys/syscallsubr.h> 93 #include <sys/sysctl.h> 94 #include <sys/sysent.h> 95 #include <sys/sysproto.h> 96 #include <sys/ucontext.h> 97 #include <sys/vmmeter.h> 98 99 #include <vm/vm.h> 
100 #include <vm/vm_param.h> 101 #include <vm/vm_extern.h> 102 #include <vm/vm_kern.h> 103 #include <vm/vm_page.h> 104 #include <vm/vm_map.h> 105 #include <vm/vm_object.h> 106 #include <vm/vm_pager.h> 107 #include <vm/vm_phys.h> 108 #include <vm/vm_dumpset.h> 109 110 #ifdef DDB 111 #ifndef KDB 112 #error KDB must be enabled in order for DDB to work! 113 #endif 114 #include <ddb/ddb.h> 115 #include <ddb/db_sym.h> 116 #endif 117 118 #include <net/netisr.h> 119 120 #include <dev/smbios/smbios.h> 121 122 #include <machine/clock.h> 123 #include <machine/cpu.h> 124 #include <machine/cputypes.h> 125 #include <machine/frame.h> 126 #include <machine/intr_machdep.h> 127 #include <x86/mca.h> 128 #include <machine/md_var.h> 129 #include <machine/metadata.h> 130 #include <machine/pc/bios.h> 131 #include <machine/pcb.h> 132 #include <machine/proc.h> 133 #include <machine/sigframe.h> 134 #include <machine/specialreg.h> 135 #include <machine/trap.h> 136 #include <machine/tss.h> 137 #include <x86/ucode.h> 138 #include <x86/ifunc.h> 139 #ifdef SMP 140 #include <machine/smp.h> 141 #endif 142 #ifdef FDT 143 #include <x86/fdt.h> 144 #endif 145 146 #ifdef DEV_ATPIC 147 #include <x86/isa/icu.h> 148 #else 149 #include <x86/apicvar.h> 150 #endif 151 152 #include <isa/isareg.h> 153 #include <isa/rtc.h> 154 #include <x86/init.h> 155 156 /* Sanity check for __curthread() */ 157 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 158 159 /* 160 * The PTI trampoline stack needs enough space for a hardware trapframe and a 161 * couple of scratch registers, as well as the trapframe left behind after an 162 * iret fault. 163 */ 164 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) - 165 offsetof(struct pti_frame, pti_rip)); 166 167 extern u_int64_t hammer_time(u_int64_t, u_int64_t); 168 169 static void cpu_startup(void *); 170 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 171 172 /* Probe 8254 PIT and TSC. 
 */
static void native_clock_source_init(void);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/*
 * Default init_ops implementation.  A hypervisor/platform port may install
 * its own hooks; the natives below handle a BIOS/UEFI boot.
 */
struct init_ops init_ops = {
	.parse_preload_data = native_parse_preload_data,
	.early_clock_source_init = native_clock_source_init,
	.early_delay = i8254_delay,
	.parse_memmap = native_parse_memmap,
};

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
/*
 * NOTE(review): expansion is unparenthesized; safe for the plain-argument
 * uses below, but parenthesize before reusing in a larger expression.
 */
#define ICH_SMI_EN	ICH_PMBASE + 0x30

int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;			/* cleared once the machine is fully up */

long Maxmem = 0;		/* highest physical page number + 1, in pages */
long realmem = 0;		/* physical memory reported at boot, in pages */
int late_console = 1;

struct kva_md_info kmi;

struct region_descriptor r_idt;

struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

bool efi_boot;

/*
 * Late boot-time CPU/memory announcement and quirk handling; registered
 * above as a SYSINIT at SI_SUB_CPU/SI_ORDER_FIRST.
 */
static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		/* SMBIOS reports KB; shift to bytes. */
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	/* Fall back to the kernel's own accounting if SMBIOS is too low. */
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

/* Perform late ifunc relocation of the kernel linker image. */
static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);

void
cpu_setregs(void)
{
	register_t cr0;

	TSENTER();
	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP. See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	TSENTER2("load_cr0");
	load_cr0(cr0);
	TSEXIT2("load_cr0");
	TSEXIT();
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

/* Dedicated IST stacks for faults that must not trust the current stack. */
static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
	/* GNULL_SEL	0 Null Descriptor */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0x0,
	  .ssd_type = 0,
	  .ssd_dpl = 0,
	  .ssd_p = 0,
	  .ssd_long = 0,
	  .ssd_def32 = 0,
	  .ssd_gran = 0 },
	/* GNULL2_SEL	1 Null Descriptor */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0x0,
	  .ssd_type = 0,
	  .ssd_dpl = 0,
	  .ssd_p = 0,
	  .ssd_long = 0,
	  .ssd_def32 = 0,
	  .ssd_gran = 0 },
	/* GUFS32_SEL	2 32 bit %gs Descriptor for user */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0xfffff,
	  .ssd_type = SDT_MEMRWA,
	  .ssd_dpl = SEL_UPL,
	  .ssd_p = 1,
	  .ssd_long = 0,
	  .ssd_def32 = 1,
	  .ssd_gran = 1 },
	/* GUGS32_SEL	3 32 bit %fs Descriptor for user */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0xfffff,
	  .ssd_type = SDT_MEMRWA,
	  .ssd_dpl = SEL_UPL,
	  .ssd_p = 1,
	  .ssd_long = 0,
	  .ssd_def32 = 1,
	  .ssd_gran = 1 },
	/* GCODE_SEL	4 Code Descriptor for kernel */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0xfffff,
	  .ssd_type = SDT_MEMERA,
	  .ssd_dpl = SEL_KPL,
	  .ssd_p = 1,
	  .ssd_long = 1,
	  .ssd_def32 = 0,
	  .ssd_gran = 1 },
	/* GDATA_SEL	5 Data Descriptor for kernel */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0xfffff,
	  .ssd_type = SDT_MEMRWA,
	  .ssd_dpl = SEL_KPL,
	  .ssd_p = 1,
	  .ssd_long = 1,
	  .ssd_def32 = 0,
	  .ssd_gran = 1 },
	/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0xfffff,
	  .ssd_type = SDT_MEMERA,
	  .ssd_dpl = SEL_UPL,
	  .ssd_p = 1,
	  .ssd_long = 0,
	  .ssd_def32 = 1,
	  .ssd_gran = 1 },
	/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0xfffff,
	  .ssd_type = SDT_MEMRWA,
	  .ssd_dpl = SEL_UPL,
	  .ssd_p = 1,
	  .ssd_long = 0,
	  .ssd_def32 = 1,
	  .ssd_gran = 1 },
	/* GUCODE_SEL	8 64 bit Code Descriptor for user */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0xfffff,
	  .ssd_type = SDT_MEMERA,
	  .ssd_dpl = SEL_UPL,
	  .ssd_p = 1,
	  .ssd_long = 1,
	  .ssd_def32 = 0,
	  .ssd_gran = 1 },
	/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
	{ .ssd_base = 0x0,
	  .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	  .ssd_type = SDT_SYSTSS,
	  .ssd_dpl = SEL_KPL,
	  .ssd_p = 1,
	  .ssd_long = 0,
	  .ssd_def32 = 0,
	  .ssd_gran = 0 },
	/* Actually, the TSS is a system descriptor which is double size */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0x0,
	  .ssd_type = 0,
	  .ssd_dpl = 0,
	  .ssd_p = 0,
	  .ssd_long = 0,
	  .ssd_def32 = 0,
	  .ssd_gran = 0 },
	/* GUSERLDT_SEL	11 LDT Descriptor */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0x0,
	  .ssd_type = 0,
	  .ssd_dpl = 0,
	  .ssd_p = 0,
	  .ssd_long = 0,
	  .ssd_def32 = 0,
	  .ssd_gran = 0 },
	/* GUSERLDT_SEL	12 LDT Descriptor, double size */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0x0,
	  .ssd_type = 0,
	  .ssd_dpl = 0,
	  .ssd_p = 0,
	  .ssd_long = 0,
	  .ssd_def32 = 0,
	  .ssd_gran = 0 },
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

/*
 * Install an interrupt gate: point IDT slot 'idx' at handler 'func' with
 * gate type 'typ', privilege level 'dpl' and IST stack index 'ist'
 * (0 means "use the current stack").
 */
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		/* Reassemble the split 16/48-bit handler offset. */
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers.
 */
DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

/* Show the hardware debug registers dr0-dr3 (breakpoints), dr6 and dr7. */
DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

/*
 * Unpack a hardware user segment descriptor into the software-friendly
 * soft_segment_descriptor form (split base/limit fields recombined).
 */
void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_long = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}

/*
 * Pack a software segment descriptor into the hardware user segment
 * descriptor format (base split 24/8, limit split 16/4).
 */
void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_long = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran = ssd->ssd_gran;
}

/*
 * Pack a software segment descriptor into the (double-sized) hardware
 * system segment descriptor format used for TSS/LDT; the high base field
 * here is 40 bits wide, and there are no long/def32 bits to copy.
 */
void
ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_gran = ssd->ssd_gran;
}

u_int basemem;

/*
 * Insert [base, base + length) into the sorted physmap base/bound array,
 * merging with an adjacent entry when possible.  Returns 1 when the entry
 * was handled (inserted, merged, or deliberately ignored as an overlap or
 * zero-length region) and 0 when the physmap table is full.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

/*
 * Walk a BIOS INT 15h/E820 system memory map and feed the plain-memory
 * entries into the physmap array.
 */
void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

/*
 * Walk a UEFI memory map and feed the usable entries into the physmap
 * array, optionally dumping each descriptor when booting verbose.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
		    physmap, physmap_idx))
			break;
	}
}

/*
 * Default parse_memmap hook: prefer the loader-supplied EFI map, fall back
 * to the BIOS E820 smap, and record which boot method was used.
 */
static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * 'first' bounds the kernel/preload image: pages in
	 * [kernphys, first) are excluded from phys_avail below.
	 */
	TSENTER();
	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/* Point physmap_idx at the last used pair rather than the next free slot. */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
	TSEXIT();
}

/*
 * Default parse_preload_data hook: locate the loader metadata for the
 * kernel image, pull boothowto / the static environment / the symbol
 * table / the EFI system table address out of it, and return the module
 * metadata pointer (kmdp) for later queries.
 */
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

/* Default early_clock_source_init hook: probe/initialize the 8254 PIT. */
static void
native_clock_source_init(void)
{
	i8254_init();
}

/* Initialize the kernel debugger and enter it now if RB_KDB was given. */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	/* Enable SYSCALL/SYSRET via EFER.SCE, then point MSR_LSTAR at the
	 * (PTI-aware) 64-bit entry. */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ?
(u_int64_t)IDTVEC(fast_syscall_pti) : 1191 (u_int64_t)IDTVEC(fast_syscall)); 1192 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); 1193 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | 1194 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); 1195 wrmsr(MSR_STAR, msr); 1196 wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC); 1197 } 1198 1199 void 1200 amd64_bsp_pcpu_init1(struct pcpu *pc) 1201 { 1202 struct user_segment_descriptor *gdt; 1203 1204 PCPU_SET(prvspace, pc); 1205 gdt = *PCPU_PTR(gdt); 1206 PCPU_SET(curthread, &thread0); 1207 PCPU_SET(tssp, PCPU_PTR(common_tss)); 1208 PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); 1209 PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]); 1210 PCPU_SET(fs32p, &gdt[GUFS32_SEL]); 1211 PCPU_SET(gs32p, &gdt[GUGS32_SEL]); 1212 PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); 1213 PCPU_SET(smp_tlb_gen, 1); 1214 } 1215 1216 void 1217 amd64_bsp_pcpu_init2(uint64_t rsp0) 1218 { 1219 1220 PCPU_SET(rsp0, rsp0); 1221 PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) + 1222 PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful); 1223 PCPU_SET(curpcb, thread0.td_pcb); 1224 } 1225 1226 void 1227 amd64_bsp_ist_init(struct pcpu *pc) 1228 { 1229 struct nmi_pcpu *np; 1230 struct amd64tss *tssp; 1231 1232 tssp = &pc->pc_common_tss; 1233 1234 /* doublefault stack space, runs on ist1 */ 1235 np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1; 1236 np->np_pcpu = (register_t)pc; 1237 tssp->tss_ist1 = (long)np; 1238 1239 /* 1240 * NMI stack, runs on ist2. The pcpu pointer is stored just 1241 * above the start of the ist2 stack. 1242 */ 1243 np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1; 1244 np->np_pcpu = (register_t)pc; 1245 tssp->tss_ist2 = (long)np; 1246 1247 /* 1248 * MC# stack, runs on ist3. The pcpu pointer is stored just 1249 * above the start of the ist3 stack. 
1250 */ 1251 np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1; 1252 np->np_pcpu = (register_t)pc; 1253 tssp->tss_ist3 = (long)np; 1254 1255 /* 1256 * DB# stack, runs on ist4. 1257 */ 1258 np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1; 1259 np->np_pcpu = (register_t)pc; 1260 tssp->tss_ist4 = (long)np; 1261 } 1262 1263 /* 1264 * Calculate the kernel load address by inspecting page table created by loader. 1265 * The assumptions: 1266 * - kernel is mapped at KERNBASE, backed by contiguous phys memory 1267 * aligned at 2M, below 4G (the latter is important for AP startup) 1268 * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M) 1269 * - kernel is mapped with 2M superpages 1270 * - all participating memory, i.e. kernel, modules, metadata, 1271 * page table is accessible by pre-created 1:1 mapping 1272 * (right now loader creates 1:1 mapping for lower 4G, and all 1273 * memory is from there) 1274 * - there is a usable memory block right after the end of the 1275 * mapped kernel and all modules/metadata, pointed to by 1276 * physfree, for early allocations 1277 */ 1278 vm_paddr_t __nosanitizeaddress __nosanitizememory 1279 amd64_loadaddr(void) 1280 { 1281 pml4_entry_t *pml4e; 1282 pdp_entry_t *pdpe; 1283 pd_entry_t *pde; 1284 uint64_t cr3; 1285 1286 cr3 = rcr3(); 1287 pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART); 1288 pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART); 1289 pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART); 1290 return (*pde & PG_FRAME); 1291 } 1292 1293 u_int64_t 1294 hammer_time(u_int64_t modulep, u_int64_t physfree) 1295 { 1296 caddr_t kmdp; 1297 int gsel_tss, x; 1298 struct pcpu *pc; 1299 uint64_t rsp0; 1300 char *env; 1301 struct user_segment_descriptor *gdt; 1302 struct region_descriptor r_gdt; 1303 size_t kstack0_sz; 1304 1305 TSRAW(&thread0, TS_ENTER, __func__, NULL); 1306 1307 kernphys = amd64_loadaddr(); 1308 1309 physfree += kernphys; 1310 1311 kmdp = 
init_ops.parse_preload_data(modulep); 1312 1313 efi_boot = preload_search_info(kmdp, MODINFO_METADATA | 1314 MODINFOMD_EFI_MAP) != NULL; 1315 1316 if (!efi_boot) { 1317 /* Tell the bios to warmboot next time */ 1318 atomic_store_short((u_short *)0x472, 0x1234); 1319 } 1320 1321 physfree += ucode_load_bsp(physfree - kernphys + KERNSTART); 1322 physfree = roundup2(physfree, PAGE_SIZE); 1323 1324 identify_cpu1(); 1325 identify_hypervisor(); 1326 identify_hypervisor_smbios(); 1327 identify_cpu_fixup_bsp(); 1328 identify_cpu2(); 1329 initializecpucache(); 1330 1331 /* 1332 * Check for pti, pcid, and invpcid before ifuncs are 1333 * resolved, to correctly select the implementation for 1334 * pmap_activate_sw_mode(). 1335 */ 1336 pti = pti_get_default(); 1337 TUNABLE_INT_FETCH("vm.pmap.pti", &pti); 1338 TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); 1339 if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { 1340 invpcid_works = (cpu_stdext_feature & 1341 CPUID_STDEXT_INVPCID) != 0; 1342 } else { 1343 pmap_pcid_enabled = 0; 1344 } 1345 1346 /* 1347 * Now we can do small core initialization, after the PCID 1348 * CPU features and user knobs are evaluated. 1349 */ 1350 TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround", 1351 &pmap_pcid_invlpg_workaround_uena); 1352 cpu_init_small_core(); 1353 1354 if ((cpu_feature2 & CPUID2_XSAVE) != 0) { 1355 use_xsave = 1; 1356 TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave); 1357 } 1358 1359 link_elf_ireloc(kmdp); 1360 1361 /* 1362 * This may be done better later if it gets more high level 1363 * components in it. If so just link td->td_proc here. 
1364 */ 1365 proc_linkup0(&proc0, &thread0); 1366 1367 /* Init basic tunables, hz etc */ 1368 init_param1(); 1369 1370 thread0.td_kstack = physfree - kernphys + KERNSTART; 1371 thread0.td_kstack_pages = kstack_pages; 1372 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; 1373 bzero((void *)thread0.td_kstack, kstack0_sz); 1374 physfree += kstack0_sz; 1375 1376 /* 1377 * Initialize enough of thread0 for delayed invalidation to 1378 * work very early. Rely on thread0.td_base_pri 1379 * zero-initialization, it is reset to PVM at proc0_init(). 1380 */ 1381 pmap_thread_init_invl_gen(&thread0); 1382 1383 pc = &temp_bsp_pcpu; 1384 pcpu_init(pc, 0, sizeof(struct pcpu)); 1385 gdt = &temp_bsp_pcpu.pc_gdt[0]; 1386 1387 /* 1388 * make gdt memory segments 1389 */ 1390 for (x = 0; x < NGDT; x++) { 1391 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && 1392 x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1) 1393 ssdtosd(&gdt_segs[x], &gdt[x]); 1394 } 1395 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss; 1396 ssdtosyssd(&gdt_segs[GPROC0_SEL], 1397 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); 1398 1399 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 1400 r_gdt.rd_base = (long)gdt; 1401 lgdt(&r_gdt); 1402 1403 wrmsr(MSR_FSBASE, 0); /* User value */ 1404 wrmsr(MSR_GSBASE, (u_int64_t)pc); 1405 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ 1406 1407 dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0); 1408 physfree += DPCPU_SIZE; 1409 amd64_bsp_pcpu_init1(pc); 1410 /* Non-late cninit() and printf() can be moved up to here. */ 1411 1412 /* 1413 * Initialize mutexes. 1414 * 1415 * icu_lock: in order to allow an interrupt to occur in a critical 1416 * section, to set pcpu->ipending (etc...) properly, we 1417 * must be able to get the icu lock, so it can't be 1418 * under witness. 
1419 */ 1420 mutex_init(); 1421 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); 1422 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); 1423 1424 /* exceptions */ 1425 for (x = 0; x < NIDT; x++) 1426 setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT, 1427 SEL_KPL, 0); 1428 setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT, 1429 SEL_KPL, 0); 1430 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4); 1431 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2); 1432 setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT, 1433 SEL_UPL, 0); 1434 setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT, 1435 SEL_UPL, 0); 1436 setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT, 1437 SEL_KPL, 0); 1438 setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT, 1439 SEL_KPL, 0); 1440 setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT, 1441 SEL_KPL, 0); 1442 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); 1443 setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm), 1444 SDT_SYSIGT, SEL_KPL, 0); 1445 setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT, 1446 SEL_KPL, 0); 1447 setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing), 1448 SDT_SYSIGT, SEL_KPL, 0); 1449 setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT, 1450 SEL_KPL, 0); 1451 setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT, 1452 SEL_KPL, 0); 1453 setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT, 1454 SEL_KPL, 0); 1455 setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT, 1456 SEL_KPL, 0); 1457 setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT, 1458 SEL_KPL, 0); 1459 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3); 1460 setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT, 1461 SEL_KPL, 0); 1462 #ifdef KDTRACE_HOOKS 1463 setidt(IDT_DTRACE_RET, pti ? 
&IDTVEC(dtrace_ret_pti) : 1464 &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0); 1465 #endif 1466 #ifdef XENHVM 1467 setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) : 1468 &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0); 1469 #endif 1470 r_idt.rd_limit = sizeof(idt0) - 1; 1471 r_idt.rd_base = (long) idt; 1472 lidt(&r_idt); 1473 1474 /* 1475 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4) 1476 * transition). 1477 * Once bootblocks have updated, we can test directly for 1478 * efi_systbl != NULL here... 1479 */ 1480 if (efi_boot) 1481 vty_set_preferred(VTY_VT); 1482 1483 TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable); 1484 TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable); 1485 1486 TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable); 1487 TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable); 1488 1489 TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush", 1490 &syscall_ret_l1d_flush_mode); 1491 1492 TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable); 1493 TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable); 1494 1495 TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable); 1496 1497 TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable", 1498 &x86_rngds_mitg_enable); 1499 1500 finishidentcpu(); /* Final stage of CPU initialization */ 1501 1502 /* 1503 * Initialize the clock before the console so that console 1504 * initialization can use DELAY(). 1505 */ 1506 clock_init(); 1507 1508 initializecpu(); /* Initialize CPU registers */ 1509 1510 amd64_bsp_ist_init(pc); 1511 1512 /* Set the IO permission bitmap (empty due to tss seg limit) */ 1513 pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) + 1514 IOPERM_BITMAP_SIZE; 1515 1516 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 1517 ltr(gsel_tss); 1518 1519 amd64_conf_fast_syscall(); 1520 1521 /* 1522 * We initialize the PCB pointer early so that exception 1523 * handlers will work. 
Also set up td_critnest to short-cut 1524 * the page fault handler. 1525 */ 1526 cpu_max_ext_state_size = sizeof(struct savefpu); 1527 set_top_of_stack_td(&thread0); 1528 thread0.td_pcb = get_pcb_td(&thread0); 1529 thread0.td_critnest = 1; 1530 1531 /* 1532 * The console and kdb should be initialized even earlier than here, 1533 * but some console drivers don't work until after getmemsize(). 1534 * Default to late console initialization to support these drivers. 1535 * This loses mainly printf()s in getmemsize() and early debugging. 1536 */ 1537 TUNABLE_INT_FETCH("debug.late_console", &late_console); 1538 if (!late_console) { 1539 cninit(); 1540 amd64_kdb_init(); 1541 } 1542 1543 getmemsize(kmdp, physfree); 1544 init_param2(physmem); 1545 1546 /* now running on new page tables, configured,and u/iom is accessible */ 1547 1548 #ifdef DEV_PCI 1549 /* This call might adjust phys_avail[]. */ 1550 pci_early_quirks(); 1551 #endif 1552 1553 if (late_console) 1554 cninit(); 1555 1556 /* 1557 * Dump the boot metadata. We have to wait for cninit() since console 1558 * output is required. If it's grossly incorrect the kernel will never 1559 * make it this far. 1560 */ 1561 if (getenv_is_true("debug.dump_modinfo_at_boot")) 1562 preload_dump(); 1563 1564 #ifdef DEV_ISA 1565 #ifdef DEV_ATPIC 1566 elcr_probe(); 1567 atpic_startup(); 1568 #else 1569 /* Reset and mask the atpics and leave them shut down. */ 1570 atpic_reset(); 1571 1572 /* 1573 * Point the ICU spurious interrupt vectors at the APIC spurious 1574 * interrupt handler. 1575 */ 1576 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); 1577 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); 1578 #endif 1579 #else 1580 #error "have you forgotten the isa device?" 1581 #endif 1582 1583 if (late_console) 1584 amd64_kdb_init(); 1585 1586 msgbufinit(msgbufp, msgbufsize); 1587 fpuinit(); 1588 1589 /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ 1590 rsp0 = thread0.td_md.md_stack_base; 1591 /* Ensure the stack is aligned to 16 bytes */ 1592 rsp0 &= ~0xFul; 1593 PCPU_PTR(common_tss)->tss_rsp0 = rsp0; 1594 amd64_bsp_pcpu_init2(rsp0); 1595 1596 /* transfer to user mode */ 1597 1598 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 1599 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 1600 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); 1601 _ufssel = GSEL(GUFS32_SEL, SEL_UPL); 1602 _ugssel = GSEL(GUGS32_SEL, SEL_UPL); 1603 1604 load_ds(_udatasel); 1605 load_es(_udatasel); 1606 load_fs(_ufssel); 1607 1608 /* setup proc 0's pcb */ 1609 thread0.td_pcb->pcb_flags = 0; 1610 1611 env = kern_getenv("kernelname"); 1612 if (env != NULL) 1613 strlcpy(kernelname, env, sizeof(kernelname)); 1614 1615 kcsan_cpu_init(0); 1616 1617 #ifdef FDT 1618 x86_init_fdt(); 1619 #endif 1620 thread0.td_critnest = 0; 1621 1622 kasan_init(); 1623 kmsan_init(); 1624 1625 TSEXIT(); 1626 1627 /* Location of kernel stack for locore */ 1628 return (thread0.td_md.md_stack_base); 1629 } 1630 1631 void 1632 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) 1633 { 1634 1635 pcpu->pc_acpi_id = 0xffffffff; 1636 } 1637 1638 static int 1639 smap_sysctl_handler(SYSCTL_HANDLER_ARGS) 1640 { 1641 struct bios_smap *smapbase; 1642 struct bios_smap_xattr smap; 1643 caddr_t kmdp; 1644 uint32_t *smapattr; 1645 int count, error, i; 1646 1647 /* Retrieve the system memory map from the loader. 
*/ 1648 kmdp = preload_search_by_type("elf kernel"); 1649 if (kmdp == NULL) 1650 kmdp = preload_search_by_type("elf64 kernel"); 1651 smapbase = (struct bios_smap *)preload_search_info(kmdp, 1652 MODINFO_METADATA | MODINFOMD_SMAP); 1653 if (smapbase == NULL) 1654 return (0); 1655 smapattr = (uint32_t *)preload_search_info(kmdp, 1656 MODINFO_METADATA | MODINFOMD_SMAP_XATTR); 1657 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase); 1658 error = 0; 1659 for (i = 0; i < count; i++) { 1660 smap.base = smapbase[i].base; 1661 smap.length = smapbase[i].length; 1662 smap.type = smapbase[i].type; 1663 if (smapattr != NULL) 1664 smap.xattr = smapattr[i]; 1665 else 1666 smap.xattr = 0; 1667 error = SYSCTL_OUT(req, &smap, sizeof(smap)); 1668 } 1669 return (error); 1670 } 1671 SYSCTL_PROC(_machdep, OID_AUTO, smap, 1672 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 1673 smap_sysctl_handler, "S,bios_smap_xattr", 1674 "Raw BIOS SMAP data"); 1675 1676 static int 1677 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS) 1678 { 1679 struct efi_map_header *efihdr; 1680 caddr_t kmdp; 1681 uint32_t efisize; 1682 1683 kmdp = preload_search_by_type("elf kernel"); 1684 if (kmdp == NULL) 1685 kmdp = preload_search_by_type("elf64 kernel"); 1686 efihdr = (struct efi_map_header *)preload_search_info(kmdp, 1687 MODINFO_METADATA | MODINFOMD_EFI_MAP); 1688 if (efihdr == NULL) 1689 return (0); 1690 efisize = *((uint32_t *)efihdr - 1); 1691 return (SYSCTL_OUT(req, efihdr, efisize)); 1692 } 1693 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, 1694 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 1695 efi_map_sysctl_handler, "S,efi_map_header", 1696 "Raw EFI Memory Map"); 1697 1698 void 1699 spinlock_enter(void) 1700 { 1701 struct thread *td; 1702 register_t flags; 1703 1704 td = curthread; 1705 if (td->td_md.md_spinlock_count == 0) { 1706 flags = intr_disable(); 1707 td->td_md.md_spinlock_count = 1; 1708 td->td_md.md_saved_flags = flags; 1709 critical_enter(); 1710 } else 1711 
td->td_md.md_spinlock_count++; 1712 } 1713 1714 void 1715 spinlock_exit(void) 1716 { 1717 struct thread *td; 1718 register_t flags; 1719 1720 td = curthread; 1721 flags = td->td_md.md_saved_flags; 1722 td->td_md.md_spinlock_count--; 1723 if (td->td_md.md_spinlock_count == 0) { 1724 critical_exit(); 1725 intr_restore(flags); 1726 } 1727 } 1728 1729 /* 1730 * Construct a PCB from a trapframe. This is called from kdb_trap() where 1731 * we want to start a backtrace from the function that caused us to enter 1732 * the debugger. We have the context in the trapframe, but base the trace 1733 * on the PCB. The PCB doesn't have to be perfect, as long as it contains 1734 * enough for a backtrace. 1735 */ 1736 void 1737 makectx(struct trapframe *tf, struct pcb *pcb) 1738 { 1739 1740 pcb->pcb_r12 = tf->tf_r12; 1741 pcb->pcb_r13 = tf->tf_r13; 1742 pcb->pcb_r14 = tf->tf_r14; 1743 pcb->pcb_r15 = tf->tf_r15; 1744 pcb->pcb_rbp = tf->tf_rbp; 1745 pcb->pcb_rbx = tf->tf_rbx; 1746 pcb->pcb_rip = tf->tf_rip; 1747 pcb->pcb_rsp = tf->tf_rsp; 1748 } 1749 1750 /* 1751 * The pcb_flags is only modified by current thread, or by other threads 1752 * when current thread is stopped. However, current thread may change it 1753 * from the interrupt context in cpu_switch(), or in the trap handler. 1754 * When we read-modify-write pcb_flags from C sources, compiler may generate 1755 * code that is not atomic regarding the interrupt handler. If a trap or 1756 * interrupt happens and any flag is modified from the handler, it can be 1757 * clobbered with the cached value later. Therefore, we implement setting 1758 * and clearing flags with single-instruction functions, which do not race 1759 * with possible modification of the flags from the trap or interrupt context, 1760 * because traps and interrupts are executed only on instruction boundary. 
1761 */ 1762 void 1763 set_pcb_flags_raw(struct pcb *pcb, const u_int flags) 1764 { 1765 1766 __asm __volatile("orl %1,%0" 1767 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags) 1768 : "cc", "memory"); 1769 1770 } 1771 1772 /* 1773 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs 1774 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into 1775 * pcb if user space modified the bases. We must save on the context 1776 * switch or if the return to usermode happens through the doreti. 1777 * 1778 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET, 1779 * which have a consequence that the base MSRs must be saved each time 1780 * the PCB_FULL_IRET flag is set. We disable interrupts to sync with 1781 * context switches. 1782 */ 1783 static void 1784 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags) 1785 { 1786 register_t r; 1787 1788 if (curpcb == pcb && 1789 (flags & PCB_FULL_IRET) != 0 && 1790 (pcb->pcb_flags & PCB_FULL_IRET) == 0) { 1791 r = intr_disable(); 1792 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) { 1793 if (rfs() == _ufssel) 1794 pcb->pcb_fsbase = rdfsbase(); 1795 if (rgs() == _ugssel) 1796 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE); 1797 } 1798 set_pcb_flags_raw(pcb, flags); 1799 intr_restore(r); 1800 } else { 1801 set_pcb_flags_raw(pcb, flags); 1802 } 1803 } 1804 1805 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int)) 1806 { 1807 1808 return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ? 1809 set_pcb_flags_fsgsbase : set_pcb_flags_raw); 1810 } 1811 1812 void 1813 clear_pcb_flags(struct pcb *pcb, const u_int flags) 1814 { 1815 1816 __asm __volatile("andl %1,%0" 1817 : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags) 1818 : "cc", "memory"); 1819 } 1820 1821 #ifdef KDB 1822 1823 /* 1824 * Provide inb() and outb() as functions. They are normally only available as 1825 * inline functions, thus cannot be called from the debugger. 
1826 */ 1827 1828 /* silence compiler warnings */ 1829 u_char inb_(u_short); 1830 void outb_(u_short, u_char); 1831 1832 u_char 1833 inb_(u_short port) 1834 { 1835 return inb(port); 1836 } 1837 1838 void 1839 outb_(u_short port, u_char data) 1840 { 1841 outb(port, data); 1842 } 1843 1844 #endif /* KDB */ 1845 1846 #undef memset 1847 #undef memmove 1848 #undef memcpy 1849 1850 void *memset_std(void *buf, int c, size_t len); 1851 void *memset_erms(void *buf, int c, size_t len); 1852 void *memmove_std(void * _Nonnull dst, const void * _Nonnull src, 1853 size_t len); 1854 void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src, 1855 size_t len); 1856 void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src, 1857 size_t len); 1858 void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src, 1859 size_t len); 1860 1861 #ifdef KCSAN 1862 /* 1863 * These fail to build as ifuncs when used with KCSAN. 1864 */ 1865 void * 1866 memset(void *buf, int c, size_t len) 1867 { 1868 1869 return (memset_std(buf, c, len)); 1870 } 1871 1872 void * 1873 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len) 1874 { 1875 1876 return (memmove_std(dst, src, len)); 1877 } 1878 1879 void * 1880 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len) 1881 { 1882 1883 return (memcpy_std(dst, src, len)); 1884 } 1885 #else 1886 DEFINE_IFUNC(, void *, memset, (void *, int, size_t)) 1887 { 1888 1889 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 1890 memset_erms : memset_std); 1891 } 1892 1893 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull, 1894 size_t)) 1895 { 1896 1897 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 1898 memmove_erms : memmove_std); 1899 } 1900 1901 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t)) 1902 { 1903 1904 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 
1905 memcpy_erms : memcpy_std); 1906 } 1907 #endif 1908 1909 void pagezero_std(void *addr); 1910 void pagezero_erms(void *addr); 1911 DEFINE_IFUNC(, void , pagezero, (void *)) 1912 { 1913 1914 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 1915 pagezero_erms : pagezero_std); 1916 } 1917