1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 2003 Peter Wemm. 5 * Copyright (c) 1992 Terrence R. Lambert. 6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * William Jolitz. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. All advertising materials mentioning features or use of this software 21 * must display the following acknowledgement: 22 * This product includes software developed by the University of 23 * California, Berkeley and its contributors. 24 * 4. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 41 */ 42 43 #include <sys/cdefs.h> 44 #include "opt_atpic.h" 45 #include "opt_cpu.h" 46 #include "opt_ddb.h" 47 #include "opt_inet.h" 48 #include "opt_isa.h" 49 #include "opt_kstack_pages.h" 50 #include "opt_maxmem.h" 51 #include "opt_pci.h" 52 #include "opt_platform.h" 53 #include "opt_sched.h" 54 55 #include <sys/param.h> 56 #include <sys/proc.h> 57 #include <sys/systm.h> 58 #include <sys/asan.h> 59 #include <sys/bio.h> 60 #include <sys/buf.h> 61 #include <sys/bus.h> 62 #include <sys/callout.h> 63 #include <sys/cons.h> 64 #include <sys/cpu.h> 65 #include <sys/csan.h> 66 #include <sys/efi.h> 67 #include <sys/eventhandler.h> 68 #include <sys/exec.h> 69 #include <sys/imgact.h> 70 #include <sys/kdb.h> 71 #include <sys/kernel.h> 72 #include <sys/ktr.h> 73 #include <sys/linker.h> 74 #include <sys/lock.h> 75 #include <sys/malloc.h> 76 #include <sys/memrange.h> 77 #include <sys/msan.h> 78 #include <sys/msgbuf.h> 79 #include <sys/mutex.h> 80 #include <sys/pcpu.h> 81 #include <sys/ptrace.h> 82 #include <sys/reboot.h> 83 #include <sys/reg.h> 84 #include <sys/rwlock.h> 85 #include <sys/sched.h> 86 #include <sys/signalvar.h> 87 #ifdef SMP 88 #include <sys/smp.h> 89 #endif 90 #include <sys/syscallsubr.h> 91 #include <sys/sysctl.h> 92 #include <sys/sysent.h> 93 #include <sys/sysproto.h> 94 #include <sys/ucontext.h> 95 #include <sys/vmmeter.h> 96 97 #include <vm/vm.h> 98 #include <vm/vm_param.h> 
99 #include <vm/vm_extern.h> 100 #include <vm/vm_kern.h> 101 #include <vm/vm_page.h> 102 #include <vm/vm_map.h> 103 #include <vm/vm_object.h> 104 #include <vm/vm_pager.h> 105 #include <vm/vm_phys.h> 106 #include <vm/vm_dumpset.h> 107 108 #ifdef DDB 109 #ifndef KDB 110 #error KDB must be enabled in order for DDB to work! 111 #endif 112 #include <ddb/ddb.h> 113 #include <ddb/db_sym.h> 114 #endif 115 116 #include <net/netisr.h> 117 118 #include <dev/smbios/smbios.h> 119 120 #include <machine/clock.h> 121 #include <machine/cpu.h> 122 #include <machine/cputypes.h> 123 #include <machine/frame.h> 124 #include <machine/intr_machdep.h> 125 #include <x86/mca.h> 126 #include <machine/md_var.h> 127 #include <machine/metadata.h> 128 #include <machine/pc/bios.h> 129 #include <machine/pcb.h> 130 #include <machine/proc.h> 131 #include <machine/sigframe.h> 132 #include <machine/specialreg.h> 133 #include <machine/trap.h> 134 #include <machine/tss.h> 135 #include <x86/ucode.h> 136 #include <x86/ifunc.h> 137 #ifdef SMP 138 #include <machine/smp.h> 139 #endif 140 #ifdef FDT 141 #include <x86/fdt.h> 142 #endif 143 144 #ifdef DEV_ATPIC 145 #include <x86/isa/icu.h> 146 #else 147 #include <x86/apicvar.h> 148 #endif 149 150 #include <isa/isareg.h> 151 #include <isa/rtc.h> 152 #include <x86/init.h> 153 154 /* Sanity check for __curthread() */ 155 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 156 157 /* 158 * The PTI trampoline stack needs enough space for a hardware trapframe and a 159 * couple of scratch registers, as well as the trapframe left behind after an 160 * iret fault. 161 */ 162 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) - 163 offsetof(struct pti_frame, pti_rip)); 164 165 extern u_int64_t hammer_time(u_int64_t, u_int64_t); 166 167 static void cpu_startup(void *); 168 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 169 170 /* Probe 8254 PIT and TSC. 
 */
static void native_clock_source_init(void);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/*
 * Default init_ops implementation: boot directly on hardware (or a fully
 * hardware-emulating hypervisor).  Alternate environments (e.g. Xen) may
 * install their own hooks before these are called.
 */
struct init_ops init_ops = {
	.parse_preload_data = native_parse_preload_data,
	.early_clock_source_init = native_clock_source_init,
	.early_delay = i8254_delay,
	.parse_memmap = native_parse_memmap,
};

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
/*
 * NOTE(review): expansion is deliberately kept as in the historical source,
 * but it is unparenthesized -- do not use ICH_SMI_EN in a larger expression.
 */
#define ICH_SMI_EN	ICH_PMBASE + 0x30

/* User segment selectors, published for use by sendsig()/sysarch() etc. */
int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;		/* nonzero until scheduling of normal threads begins */

long Maxmem = 0;	/* highest physical page number + 1 */
long realmem = 0;	/* physical page count reported at boot */
int late_console = 1;	/* set to 0 if the console was initialized early */

struct kva_md_info kmi;

struct region_descriptor r_idt;

struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

bool efi_boot;		/* true if booted via UEFI (see native_parse_memmap) */

/*
 * cpu_startup: SI_SUB_CPU SYSINIT hook.  Applies the MacBook SMI quirk,
 * starts the RTC, prints CPU/memory information, initializes the kernel
 * submaps and buffer cache, and finalizes BSP control registers.
 */
static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 * Fall back to the kernel's own Maxmem figure when SMBIOS is
	 * absent or implausibly small.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		/* SMBIOS value is in KB; shift to bytes. */
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

/*
 * Second pass of ifunc relocation, after all linker sets are available.
 * Runs at SI_ORDER_ANY so every earlier SI_SUB_CPU hook has completed.
 */
static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);

/*
 * Establish the baseline CR0 configuration for this CPU.
 */
void
cpu_setregs(void)
{
	register_t cr0;

	TSENTER();
	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP. See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	TSENTER2("load_cr0");
	load_cr0(cr0);
	TSEXIT2("load_cr0");
	TSEXIT();
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

/*
 * Dedicated BSP stacks for faults that must not run on the (possibly
 * corrupt) current kernel stack; each is referenced via an IST slot in
 * amd64_bsp_ist_init() below.
 */
static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
	/* GNULL_SEL	0 Null Descriptor */
	{	.ssd_base = 0x0,
		.ssd_limit = 0x0,
		.ssd_type = 0,
		.ssd_dpl = 0,
		.ssd_p = 0,
		.ssd_long = 0,
		.ssd_def32 = 0,
		.ssd_gran = 0		},
	/* GNULL2_SEL	1 Null Descriptor */
	{	.ssd_base = 0x0,
		.ssd_limit = 0x0,
		.ssd_type = 0,
		.ssd_dpl = 0,
		.ssd_p = 0,
		.ssd_long = 0,
		.ssd_def32 = 0,
		.ssd_gran = 0		},
	/* GUFS32_SEL	2 32 bit %gs Descriptor for user */
	{	.ssd_base = 0x0,
		.ssd_limit = 0xfffff,
		.ssd_type = SDT_MEMRWA,
		.ssd_dpl = SEL_UPL,
		.ssd_p = 1,
		.ssd_long = 0,
		.ssd_def32 = 1,
		.ssd_gran = 1		},
	/* GUGS32_SEL	3 32 bit %fs Descriptor for user */
	{	.ssd_base = 0x0,
		.ssd_limit = 0xfffff,
		.ssd_type = SDT_MEMRWA,
		.ssd_dpl = SEL_UPL,
		.ssd_p = 1,
		.ssd_long = 0,
		.ssd_def32 = 1,
		.ssd_gran = 1		},
	/* GCODE_SEL	4 Code Descriptor for kernel */
	{	.ssd_base = 0x0,
		.ssd_limit = 0xfffff,
		.ssd_type = SDT_MEMERA,
		.ssd_dpl = SEL_KPL,
		.ssd_p = 1,
		.ssd_long = 1,
		.ssd_def32 = 0,
		.ssd_gran = 1		},
	/* GDATA_SEL	5 Data Descriptor for kernel */
	{	.ssd_base = 0x0,
		.ssd_limit = 0xfffff,
		.ssd_type = SDT_MEMRWA,
		.ssd_dpl = SEL_KPL,
		.ssd_p = 1,
		.ssd_long = 1,
		.ssd_def32 = 0,
		.ssd_gran = 1		},
	/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
	{	.ssd_base = 0x0,
		.ssd_limit = 0xfffff,
		.ssd_type = SDT_MEMERA,
		.ssd_dpl = SEL_UPL,
		.ssd_p = 1,
		.ssd_long = 0,
		.ssd_def32 = 1,
		.ssd_gran = 1		},
	/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
	{	.ssd_base = 0x0,
		.ssd_limit = 0xfffff,
		.ssd_type = SDT_MEMRWA,
		.ssd_dpl = SEL_UPL,
		.ssd_p = 1,
		.ssd_long = 0,
		.ssd_def32 = 1,
		.ssd_gran = 1		},
	/* GUCODE_SEL	8 64 bit Code Descriptor for user */
	{	.ssd_base = 0x0,
		.ssd_limit = 0xfffff,
		.ssd_type = SDT_MEMERA,
		.ssd_dpl = SEL_UPL,
		.ssd_p = 1,
		.ssd_long = 1,
		.ssd_def32 = 0,
		.ssd_gran = 1		},
	/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
	{	.ssd_base = 0x0,
		.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
		.ssd_type = SDT_SYSTSS,
		.ssd_dpl = SEL_KPL,
		.ssd_p = 1,
		.ssd_long = 0,
		.ssd_def32 = 0,
		.ssd_gran = 0		},
	/* Actually, the TSS is a system descriptor which is double size */
	{	.ssd_base = 0x0,
		.ssd_limit = 0x0,
		.ssd_type = 0,
		.ssd_dpl = 0,
		.ssd_p = 0,
		.ssd_long = 0,
		.ssd_def32 = 0,
		.ssd_gran = 0		},
	/* GUSERLDT_SEL	11 LDT Descriptor */
	{	.ssd_base = 0x0,
		.ssd_limit = 0x0,
		.ssd_type = 0,
		.ssd_dpl = 0,
		.ssd_p = 0,
		.ssd_long = 0,
		.ssd_def32 = 0,
		.ssd_gran = 0		},
	/* GUSERLDT_SEL	12 LDT Descriptor, double size */
	{	.ssd_base = 0x0,
		.ssd_limit = 0x0,
		.ssd_type = 0,
		.ssd_dpl = 0,
		.ssd_p = 0,
		.ssd_long = 0,
		.ssd_def32 = 0,
		.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

/*
 * setidt: install an interrupt gate for vector 'idx' pointing at handler
 * 'func', with gate type 'typ', descriptor privilege level 'dpl', and
 * interrupt-stack-table slot 'ist' (0 = no stack switch).
 */
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	/* Split the 64-bit handler address across the gate's offset fields. */
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

/* Exception/trap entry points defined in exception.S (and PTI variants). */
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		/* Reassemble the handler address from the gate fields. */
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	/* Reading XCR0 without CR4.XSAVE set would fault; guard it. */
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

/* Show the x86 debug registers DR0-DR3 (breakpoints), DR6 and DR7. */
DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

/*
 * sdtossd: unpack a hardware user segment descriptor 'sd' into the
 * software-friendly form 'ssd' (inverse of ssdtosd below).
 */
void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_long = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
{

	/* Pack a software segment descriptor into hardware (user) format. */
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_long = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran = ssd->ssd_gran;
}

/*
 * ssdtosyssd: pack a software segment descriptor into hardware system
 * descriptor format (16-byte TSS/LDT descriptor with a 40-bit high base).
 */
void
ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_gran = ssd->ssd_gran;
}

u_int basemem;		/* amount of conventional ("base") memory, in KB */

/*
 * add_physmap_entry: insert the region [base, base + length) into the
 * sorted (physmap) array of base/bound pairs.  Returns 1 on success or
 * when the entry was dropped (zero length, overlap), 0 when the array
 * is full and the caller should stop adding entries.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

/*
 * bios_add_smap_entries: walk a BIOS INT 15h/E820 system memory map and
 * add each SMAP_TYPE_MEMORY range to (physmap) via add_physmap_entry().
 */
void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

/*
 * add_efi_map_entries: walk a UEFI GetMemoryMap() descriptor array and
 * add the usable entry types to (physmap); verbosely decode each
 * descriptor's type and attributes when RB_VERBOSE is set.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	/* Printable names indexed by EFI memory descriptor type. */
	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	/* Descriptors start after the header, rounded up to 16 bytes. */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	/*
	 * Step by descriptor_size, not sizeof(*p): firmware may use a
	 * larger descriptor than the one this kernel was built against.
	 */
	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
		    physmap, physmap_idx))
			break;
	}
}

/*
 * native_parse_memmap: default init_ops.parse_memmap hook.  Prefers the
 * loader-supplied UEFI memory map; falls back to the BIOS E820 SMAP.
 * Panics if the loader provided neither.
 */
static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	TSENTER();
	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/* Back up to index the last populated base/bound pair. */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* MAXMEM is specified in KB; convert to 4KB pages. */
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			/* Save the word we are about to scribble over. */
			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/* dump_avail also covers kernel/dcons pages. */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/*
	 * Map the message buffer.
	 */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
	TSEXIT();
}

/*
 * Parse the preload metadata handed to us by the loader: relocate the
 * metadata pointers to KERNBASE, locate the kernel module entry, and
 * extract boothowto, the static environment, the DDB symbol table
 * range and the EFI system table physical address.  Returns the kernel
 * module metadata pointer (kmdp).
 */
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;	/* loader stored a physical address */
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

/* Native clock source bring-up: initialize the i8254 timer. */
static void
native_clock_source_init(void)
{
	i8254_init();
}

/*
 * Initialize the kernel debugger framework and, if RB_KDB was given
 * on the boot flags, drop into the debugger immediately.
 */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	/* Enable SYSCALL/SYSRET via EFER.SCE. */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	/* 64-bit entry point; PTI variant trampolines if page table
	 * isolation is enabled. */
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	/* Entry point for 32-bit compat SYSCALL. */
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	/* Kernel and user segment selectors loaded on SYSCALL/SYSRET. */
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	/* RFLAGS bits cleared on kernel entry. */
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

/*
 * First-stage BSP per-CPU initialization: wire up the pcpu pointers
 * (curthread, TSS, LDT, 32-bit %fs/%gs descriptor slots) into the
 * just-initialized pcpu area.
 */
void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}

/*
 * Second-stage BSP per-CPU initialization: record the kernel stack
 * pointer, the (16-byte aligned) top of the PTI trampoline stack, and
 * thread0's pcb.
 */
void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}

/*
 * Populate the IST entries of the BSP TSS with the dedicated exception
 * stacks.  Each stack has the pcpu pointer stored in the struct
 * nmi_pcpu slot placed just below its top.
 */
void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/* doublefault stack space, runs on ist1 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}

/*
 * Calculate the kernel load address by inspecting page table created by loader.
 * The assumptions:
 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
 *   aligned at 2M, below 4G (the latter is important for AP startup)
 * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
 * - kernel is mapped with 2M superpages
 * - all participating memory, i.e. kernel, modules, metadata,
 *   page table is accessible by pre-created 1:1 mapping
 *   (right now loader creates 1:1 mapping for lower 4G, and all
 *   memory is from there)
 * - there is a usable memory block right after the end of the
 *   mapped kernel and all modules/metadata, pointed to by
 *   physfree, for early allocations
 */
vm_paddr_t __nosanitizeaddress __nosanitizememory
amd64_loadaddr(void)
{
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	uint64_t cr3;

	/* Walk PML4 -> PDP -> PD for KERNSTART; the 2M PDE frame is
	 * the kernel's physical load address. */
	cr3 = rcr3();
	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
	return (*pde & PG_FRAME);
}

/*
 * Early machine-dependent initialization for the boot processor.
 * Parses the loader metadata, identifies the CPU, builds the GDT/IDT
 * and TSS, sizes physical memory (getmemsize()) and brings up the
 * console and debugger.  Returns the initial kernel stack base, which
 * locore switches to (see the return at the end).  Statement order
 * here is significant throughout.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	uint64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kernphys = amd64_loadaddr();

	/* physfree arrives kernel-relative; make it a physical address. */
	physfree += kernphys;

	kmdp = init_ops.parse_preload_data(modulep);

	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
	    MODINFOMD_EFI_MAP) != NULL;

	if (!efi_boot) {
		/* Tell the bios to warmboot next time */
		atomic_store_short((u_short *)0x472, 0x1234);
	}

	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_hypervisor_smbios();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	/*
	 * Now we can do small core initialization, after the PCID
	 * CPU features and user knobs are evaluated.
	 */
	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
	    &pmap_pcid_invlpg_workaround_uena);
	cpu_init_small_core();

	if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
		use_xsave = 1;
		TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve thread0's kernel stack out of the early allocation area. */
	thread0.td_kstack = physfree - kernphys + KERNSTART;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		/* Skip the two-slot system descriptors (TSS, LDT),
		 * which are filled with ssdtosyssd() instead. */
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (efi_boot)
		vty_set_preferred(VTY_VT);

	/* Speculative-execution mitigation knobs (old and new names). */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
	    &x86_rngds_mitg_enable);

	finishidentcpu();	/* Final stage of CPU initialization */

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/* make an initial tss so cpu can get interrupt stack on syscall!
	 */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	kasan_init();
	kmsan_init();

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}

/*
 * MD hook called during per-CPU area setup; marks the ACPI CPU id as
 * unknown (0xffffffff).  NOTE(review): presumably overwritten later by
 * ACPI/platform code -- not visible in this file.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

/*
 * Sysctl handler exporting the raw BIOS SMAP (with extended attributes
 * when present) from the loader-provided preload metadata.
 */
static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	/* The metadata record's byte size is stored in the uint32_t
	 * immediately preceding the table. */
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr",
    "Raw BIOS SMAP data");

/*
 * Sysctl handler exporting the raw EFI memory map from the
 * loader-provided preload metadata; its size is taken from the
 * uint32_t stored just before the header.
 */
static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header",
    "Raw EFI Memory Map");

/*
 * Recursive spinlock support: the first acquisition disables
 * interrupts (saving the flags for spinlock_exit()) and enters a
 * critical section; nested acquisitions only bump the count.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}

/*
 * Release one level of spinlock nesting; the outermost release
 * leaves the critical section and restores the saved interrupt state.
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	/* Copy the callee-saved registers plus %rip/%rsp. */
	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

/*
 * The pcb_flags is only modified by current thread, or by other threads
 * when current thread is stopped.  However, current thread may change it
 * from the interrupt context in cpu_switch(), or in the trap handler.
 * When we read-modify-write pcb_flags from C sources, compiler may generate
 * code that is not atomic regarding the interrupt handler.  If a trap or
 * interrupt happens and any flag is modified from the handler, it can be
 * clobbered with the cached value later.  Therefore, we implement setting
 * and clearing flags with single-instruction functions, which do not race
 * with possible modification of the flags from the trap or interrupt context,
 * because traps and interrupts are executed only on instruction boundary.
 */
/* Set flag bits in pcb_flags with a single "orl" instruction. */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");

}

/*
 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
 * pcb if user space modified the bases.  We must save on the context
 * switch or if the return to usermode happens through the doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which have a consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		r = intr_disable();
		/* Re-check with interrupts off: a trap may have set
		 * PCB_FULL_IRET (and saved the bases) in between. */
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

/* Resolve set_pcb_flags() at boot based on the FSGSBASE CPU feature. */
DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}

/* Clear flag bits in pcb_flags with a single "andl" instruction. */
void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return inb(port);
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */

#undef memset
#undef memmove
#undef memcpy

/*
 * Assembly implementations of the bulk memory primitives; the _erms
 * variants are selected by the ifuncs below when the CPU advertises
 * the ERMS (Enhanced REP MOVSB/STOSB) feature.
 */
void *memset_std(void *buf, int c, size_t len);
void *memset_erms(void *buf, int c, size_t len);
void *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);

#ifdef KCSAN
/*
 * These fail to build as ifuncs when used with KCSAN.
 */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
#else
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
#endif

/* Page-zeroing primitive, likewise selected on the ERMS feature. */
void pagezero_std(void *addr);
void pagezero_erms(void *addr);
DEFINE_IFUNC(, void , pagezero, (void *))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}