/*	$OpenBSD: cpu.c,v 1.185 2024/04/03 02:01:21 guenther Exp $	*/
/* $NetBSD: cpu.c,v 1.1 2003/04/26 18:39:26 fvdl Exp $ */

/*-
 * Copyright (c) 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by RedBack Networks Inc.
 *
 * Author: Bill Sommerfeld
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1999 Stefan Grefen
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the NetBSD
 *      Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR AND CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "lapic.h"
#include "ioapic.h"
#include "vmm.h"
#include "pctr.h"
#include "pvbus.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/timeout.h>
#include <sys/systm.h>
#include <sys/device.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/atomic.h>
#include <sys/user.h>

#include <uvm/uvm_extern.h>

#include <machine/codepatch.h>
#include <machine/cpu_full.h>
#include <machine/cpufunc.h>
#include <machine/cpuvar.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/mpbiosvar.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#include <machine/segments.h>
#include <machine/gdt.h>
#include <machine/pio.h>
#include <machine/vmmvar.h>

#if NLAPIC > 0
#include <machine/i82489reg.h>
#include <machine/i82489var.h>
#endif

#if NIOAPIC > 0
#include <machine/i82093var.h>
#endif

#if NPCTR > 0
#include <machine/pctr.h>
#endif

#if NPVBUS > 0
#include <dev/pv/pvvar.h>
#endif

#include <dev/ic/mc146818reg.h>
#include <amd64/isa/nvram.h>
#include <dev/isa/isareg.h>

#ifdef HIBERNATE
#include <sys/hibernate.h>
#include <machine/hibernate.h>
#endif /* HIBERNATE */

/* #define CPU_DEBUG */

#ifdef CPU_DEBUG
#define DPRINTF(x...)	do { printf(x); } while(0)
#else
#define DPRINTF(x...)
#endif /* CPU_DEBUG */

int	cpu_match(struct device *, void *, void *);
void	cpu_attach(struct device *, struct device *, void *);
int	cpu_activate(struct device *, int);
void	patinit(struct cpu_info *ci);
#if NVMM > 0
void	cpu_init_vmm(struct cpu_info *ci);
#endif /* NVMM > 0 */

struct cpu_softc {
	struct device sc_dev;		/* device tree glue */
	struct cpu_info *sc_info;	/* pointer to CPU info */
};

void	replacesmap(void);
void	replacemeltdown(void);
void	replacemds(void);

extern long _stac;
extern long _clac;

int cpuid_level = 0;		/* MIN cpuid(0).eax */
char cpu_vendor[16] = { 0 };	/* CPU0's cpuid(0).e[bdc]x, \0 */
int cpu_id = 0;			/* cpuid(1).eax */
int cpu_ebxfeature = 0;		/* cpuid(1).ebx */
int cpu_ecxfeature = 0;		/* cpuid(1).ecx */
int cpu_feature = 0;		/* cpuid(1).edx */
int cpu_perf_eax = 0;		/* cpuid(0xa).eax */
int cpu_perf_ebx = 0;		/* cpuid(0xa).ebx */
int cpu_perf_edx = 0;		/* cpuid(0xa).edx */
int cpu_apmi_edx = 0;		/* cpuid(0x80000007).edx */
int ecpu_ecxfeature = 0;	/* cpuid(0x80000001).ecx */
int cpu_meltdown = 0;
int cpu_use_xsaves = 0;
int need_retpoline = 1;		/* most systems need retpoline */

void
replacesmap(void)
{
	static int replacedone = 0;
	int s;

	if (replacedone)
		return;
	replacedone = 1;

	s = splhigh();

	codepatch_replace(CPTAG_STAC, &_stac, 3);
	codepatch_replace(CPTAG_CLAC, &_clac, 3);

	splx(s);
}

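/*
 * Choose and codepatch the Meltdown/Spectre mitigations for this machine:
 * turn on enhanced IBRS where available, patch out the retpoline thunks
 * when they are not needed, disable the IBPB codepatch on CPUs without
 * IBPB, and wire up the Meltdown (KPTI) trampoline and the swapgs fences
 * described below.
 */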
void
replacemeltdown(void)
{
	static int replacedone = 0;
	struct cpu_info *ci = &cpu_info_primary;
	int swapgs_vuln = 0, ibrs = 0, s, ibpb = 0;

	if (ci->ci_vendor == CPUV_INTEL) {
		int family = ci->ci_family;
		int model = ci->ci_model;

		swapgs_vuln = 1;
		if (family == 0x6 &&
		    (model == 0x37 || model == 0x4a || model == 0x4c ||
		     model == 0x4d || model == 0x5a || model == 0x5d ||
		     model == 0x6e || model == 0x65 || model == 0x75)) {
			/* Silvermont, Airmont */
			swapgs_vuln = 0;
		} else if (family == 0x6 && (model == 0x85 || model == 0x57)) {
			/* KnightsLanding */
			swapgs_vuln = 0;
		}
		if ((ci->ci_feature_sefflags_edx & SEFF0EDX_ARCH_CAP) &&
		    (rdmsr(MSR_ARCH_CAPABILITIES) & ARCH_CAP_IBRS_ALL)) {
			ibrs = 2;
		} else if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBRS) {
			ibrs = 1;
		}
		if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBRS)
			ibpb = 1;
	} else if (ci->ci_vendor == CPUV_AMD &&
	    ci->ci_pnfeatset >= 0x80000008) {
		if (ci->ci_feature_amdspec_ebx & CPUIDEBX_IBRS_ALWAYSON) {
			ibrs = 2;
		} else if ((ci->ci_feature_amdspec_ebx & CPUIDEBX_IBRS) &&
		    (ci->ci_feature_amdspec_ebx & CPUIDEBX_IBRS_PREF)) {
			ibrs = 1;
		}
		if (ci->ci_feature_amdspec_ebx & CPUIDEBX_IBPB)
			ibpb = 1;
	}

	/* Enhanced IBRS: turn it on once on each CPU and don't touch again */
	if (ibrs == 2)
		wrmsr(MSR_SPEC_CTRL, SPEC_CTRL_IBRS);

	if (replacedone)
		return;
	replacedone = 1;

	s = splhigh();

	/* If we don't have IBRS/IBPB, then don't use IBPB */
	if (ibpb == 0)
		codepatch_nop(CPTAG_IBPB_NOP);

	if (ibrs == 2 || (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT)) {
		extern const char _jmprax, _jmpr11, _jmpr13;
		extern const short _jmprax_len, _jmpr11_len, _jmpr13_len;

		codepatch_replace(CPTAG_RETPOLINE_RAX, &_jmprax, _jmprax_len);
		codepatch_replace(CPTAG_RETPOLINE_R11, &_jmpr11, _jmpr11_len);
		codepatch_replace(CPTAG_RETPOLINE_R13, &_jmpr13, _jmpr13_len);
		need_retpoline = 0;
	}

	if (!cpu_meltdown)
		codepatch_nop(CPTAG_MELTDOWN_NOP);
	else {
		extern long alltraps_kern_meltdown;

		/* eliminate conditional branch in alltraps */
		codepatch_jmp(CPTAG_MELTDOWN_ALLTRAPS, &alltraps_kern_meltdown);

		/* enable reuse of PCID for U-K page tables */
		if (pmap_use_pcid) {
			extern long _pcid_set_reuse;
			DPRINTF("%s: codepatching PCID use\n", __func__);
			codepatch_replace(CPTAG_PCID_SET_REUSE,
			    &_pcid_set_reuse, PCID_SET_REUSE_SIZE);
		}
	}

	/*
	 * CVE-2019-1125: if the CPU has SMAP and it's not vulnerable to
	 * Meltdown, then it's protected both from speculatively mis-skipping
	 * the swapgs during interrupts of userspace and from speculatively
	 * mis-taking a swapgs during interrupts while already in the kernel
	 * as the speculative path will fault from SMAP.  Warning: enabling
	 * WRGSBASE would break this 'protection'.
	 *
	 * Otherwise, if the CPU's swapgs can't be speculated over and it
	 * _is_ vulnerable to Meltdown then the %cr3 change will serialize
	 * user->kern transitions, but we still need to mitigate the
	 * already-in-kernel cases.
	 */
	if (!cpu_meltdown && (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMAP)) {
		codepatch_nop(CPTAG_FENCE_SWAPGS_MIS_TAKEN);
		codepatch_nop(CPTAG_FENCE_NO_SAFE_SMAP);
	} else if (!swapgs_vuln && cpu_meltdown) {
		codepatch_nop(CPTAG_FENCE_SWAPGS_MIS_TAKEN);
	}
	splx(s);
}

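/*
 * Select the MDS (Microarchitectural Data Sampling) mitigation: CPUs that
 * report MDS_NO get the handling code patched out, CPUs with MD_CLEAR
 * microcode use VERW, and older Intel parts get a model-specific flush
 * handler plus a per-CPU scratch buffer.
 */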
void
replacemds(void)
{
	static int replacedone = 0;
	extern long mds_handler_bdw, mds_handler_ivb, mds_handler_skl;
	extern long mds_handler_skl_sse, mds_handler_skl_avx;
	extern long mds_handler_silvermont, mds_handler_knights;
	struct cpu_info *ci = &cpu_info_primary;
	CPU_INFO_ITERATOR cii;
	void *handler = NULL, *vmm_handler = NULL;
	const char *type;
	int use_verw = 0, s;
	uint32_t cap = 0;

	/* ci_mds_tmp must be 32byte aligned for AVX instructions */
	CTASSERT((offsetof(struct cpu_info, ci_mds_tmp) -
	    offsetof(struct cpu_info, ci_PAGEALIGN)) % 32 == 0);

	if (replacedone)
		return;
	replacedone = 1;

	if (ci->ci_vendor != CPUV_INTEL)
		goto notintel;	/* VERW only needed on Intel */

	if ((ci->ci_feature_sefflags_edx & SEFF0EDX_ARCH_CAP))
		cap = rdmsr(MSR_ARCH_CAPABILITIES);

	if (cap & ARCH_CAP_MDS_NO) {
		/* Unaffected, nop out the handling code */
	} else if (ci->ci_feature_sefflags_edx & SEFF0EDX_MD_CLEAR) {
		/* new firmware, use VERW */
		use_verw = 1;
	} else {
		int family = ci->ci_family;
		int model = ci->ci_model;
		int stepping = CPUID2STEPPING(ci->ci_signature);

		if (family == 0x6 &&
		    (model == 0x2e || model == 0x1e || model == 0x1f ||
		     model == 0x1a || model == 0x2f || model == 0x25 ||
		     model == 0x2c || model == 0x2d || model == 0x2a ||
		     model == 0x3e || model == 0x3a)) {
			/* Nehalem, SandyBridge, IvyBridge */
			handler = vmm_handler = &mds_handler_ivb;
			type = "IvyBridge";
			CPU_INFO_FOREACH(cii, ci) {
				ci->ci_mds_buf = malloc(672, M_DEVBUF,
				    M_WAITOK);
				memset(ci->ci_mds_buf, 0, 16);
			}
		} else if (family == 0x6 &&
		    (model == 0x3f || model == 0x3c || model == 0x45 ||
		     model == 0x46 || model == 0x56 || model == 0x4f ||
		     model == 0x47 || model == 0x3d)) {
			/* Haswell and Broadwell */
			handler = vmm_handler = &mds_handler_bdw;
			type = "Broadwell";
			CPU_INFO_FOREACH(cii, ci) {
				ci->ci_mds_buf = malloc(1536, M_DEVBUF,
				    M_WAITOK);
			}
		} else if (family == 0x6 &&
		    ((model == 0x55 && stepping <= 5) || model == 0x4e ||
		    model == 0x5e || (model == 0x8e && stepping <= 0xb) ||
		    (model == 0x9e && stepping <= 0xc))) {
			/*
			 * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
			 * CascadeLake
			 */
			/* XXX mds_handler_skl_avx512 */
			if (xgetbv(0) & XFEATURE_AVX) {
				handler = &mds_handler_skl_avx;
				type = "Skylake AVX";
			} else {
				handler = &mds_handler_skl_sse;
				type = "Skylake SSE";
			}
			vmm_handler = &mds_handler_skl;
			CPU_INFO_FOREACH(cii, ci) {
				vaddr_t b64;
				b64 = (vaddr_t)malloc(6 * 1024 + 64 + 63,
				    M_DEVBUF, M_WAITOK);
				ci->ci_mds_buf = (void *)((b64 + 63) & ~63);
				memset(ci->ci_mds_buf, 0, 64);
			}
		} else if (family == 0x6 &&
		    (model == 0x37 || model == 0x4a || model == 0x4c ||
		     model == 0x4d || model == 0x5a || model == 0x5d ||
		     model == 0x6e || model == 0x65 || model == 0x75)) {
			/* Silvermont, Airmont */
			handler = vmm_handler = &mds_handler_silvermont;
			type = "Silvermont";
			CPU_INFO_FOREACH(cii, ci) {
				ci->ci_mds_buf = malloc(256, M_DEVBUF,
				    M_WAITOK);
				memset(ci->ci_mds_buf, 0, 16);
			}
		} else if (family == 0x6 && (model == 0x85 || model == 0x57)) {
			handler = vmm_handler = &mds_handler_knights;
			type = "KnightsLanding";
			CPU_INFO_FOREACH(cii, ci) {
				vaddr_t b64;
				b64 = (vaddr_t)malloc(1152 + 63, M_DEVBUF,
				    M_WAITOK);
				ci->ci_mds_buf = (void *)((b64 + 63) & ~63);
			}
		}
	}

	/* Register File Data Sampling (RFDS) also has a VERW workaround */
	if ((cap & ARCH_CAP_RFDS_NO) == 0 && (cap & ARCH_CAP_RFDS_CLEAR))
		use_verw = 1;

	if (handler != NULL) {
		printf("cpu0: using %s MDS workaround%s\n", type, "");
		s = splhigh();
		codepatch_call(CPTAG_MDS, handler);
		codepatch_call(CPTAG_MDS_VMM, vmm_handler);
		splx(s);
	} else if (use_verw) {
		/*
		 * The new firmware enhances L1D_FLUSH MSR to flush MDS too,
		 * but keep the verw if affected by RFDS
		 */
		if ((cap & ARCH_CAP_RFDS_NO) == 0 && (cap & ARCH_CAP_RFDS_CLEAR)) {
			type = "";
		} else if (cpu_info_primary.ci_vmm_cap.vcc_vmx.vmx_has_l1_flush_msr == 1) {
			s = splhigh();
			codepatch_nop(CPTAG_MDS_VMM);
			splx(s);
			type = " (except on vmm entry)";
		} else {
			type = "";
		}
		printf("cpu0: using %s MDS workaround%s\n", "VERW", type);
	} else {
notintel:
		s = splhigh();
		codepatch_nop(CPTAG_MDS);
		codepatch_nop(CPTAG_MDS_VMM);
		splx(s);
	}
}

#ifdef MULTIPROCESSOR
int mp_cpu_start(struct cpu_info *);
void mp_cpu_start_cleanup(struct cpu_info *);
struct cpu_functions mp_cpu_funcs = { mp_cpu_start, NULL,
    mp_cpu_start_cleanup };
#endif /* MULTIPROCESSOR */

const struct cfattach cpu_ca = {
	sizeof(struct cpu_softc), cpu_match, cpu_attach, NULL, cpu_activate
};

struct cfdriver cpu_cd = {
	NULL, "cpu", DV_DULL
};

/*
 * Statically-allocated CPU info for the primary CPU (or the only
 * CPU, on uniprocessors).  The CPU info list is initialized to
 * point at it.
 */
struct cpu_info_full cpu_info_full_primary = { .cif_cpu = { .ci_self = &cpu_info_primary } };

struct cpu_info *cpu_info_list = &cpu_info_primary;

#ifdef MULTIPROCESSOR
/*
 * Array of CPU info structures.  Must be statically-allocated because
 * curproc, etc. are used early.
 */
struct cpu_info *cpu_info[MAXCPUS] = { &cpu_info_primary };

void	cpu_hatch(void *);
void	cpu_boot_secondary(struct cpu_info *ci);
void	cpu_start_secondary(struct cpu_info *ci);
#endif

int
cpu_match(struct device *parent, void *match, void *aux)
{
	struct cfdata *cf = match;
	struct cpu_attach_args *caa = aux;

	if (strcmp(caa->caa_name, cf->cf_driver->cd_name) != 0)
		return 0;

	if (cf->cf_unit >= MAXCPUS)
		return 0;

	return 1;
}

void	cpu_idle_mwait_cycle(void);
void	cpu_init_mwait(struct cpu_softc *, struct cpu_info *);

u_int	cpu_mwait_size, cpu_mwait_states;

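/*
 * MWAIT-based idle loop.  cpu_init_mwait() probes CPUID leaf 0x5 for the
 * monitor line size and supported C-substates; if they look sane,
 * cpu_idle_mwait_cycle() is installed as the idle cycle function and uses
 * MONITOR/MWAIT on ci_mwait so other CPUs can wake us without an IPI.
 */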
void
cpu_idle_mwait_cycle(void)
{
	struct cpu_info *ci = curcpu();

	if ((read_rflags() & PSL_I) == 0)
		panic("idle with interrupts blocked!");

	/* something already queued? */
	if (!cpu_is_idle(ci))
		return;

	/*
	 * About to idle; setting the MWAIT_IN_IDLE bit tells
	 * cpu_unidle() that it can't be a no-op and tells cpu_kick()
	 * that it doesn't need to use an IPI.  We also set the
	 * MWAIT_KEEP_IDLING bit: those routines clear it to stop
	 * the mwait.  Once they're set, we do a final check of the
	 * queue, in case another cpu called setrunqueue() and added
	 * something to the queue and called cpu_unidle() between
	 * the check in sched_idle() and here.
	 */
	atomic_setbits_int(&ci->ci_mwait, MWAIT_IDLING | MWAIT_ONLY);
	if (cpu_is_idle(ci)) {
		monitor(&ci->ci_mwait, 0, 0);
		if ((ci->ci_mwait & MWAIT_IDLING) == MWAIT_IDLING)
			mwait(0, 0);
	}

	/* done idling; let cpu_kick() know that an IPI is required */
	atomic_clearbits_int(&ci->ci_mwait, MWAIT_IDLING);
}

void
cpu_init_mwait(struct cpu_softc *sc, struct cpu_info *ci)
{
	unsigned int smallest, largest, extensions, c_substates;

	if ((cpu_ecxfeature & CPUIDECX_MWAIT) == 0 || ci->ci_cpuid_level < 0x5)
		return;

	/* get the monitor granularity */
	CPUID(0x5, smallest, largest, extensions, cpu_mwait_states);
	smallest &= 0xffff;
	largest  &= 0xffff;

	/* mask out states C6/C7 in 31:24 for CHT45 errata */
	if (ci->ci_vendor == CPUV_INTEL &&
	    ci->ci_family == 0x06 && ci->ci_model == 0x4c)
		cpu_mwait_states &= 0x00ffffff;

	printf("%s: mwait min=%u, max=%u", sc->sc_dev.dv_xname,
	    smallest, largest);
	if (extensions & 0x1) {
		if (cpu_mwait_states > 0) {
			c_substates = cpu_mwait_states;
			printf(", C-substates=%u", 0xf & c_substates);
			while ((c_substates >>= 4) > 0)
				printf(".%u", 0xf & c_substates);
		}
		if (extensions & 0x2)
			printf(", IBE");
	} else {
		/* substates not supported, forge the default: just C1 */
		cpu_mwait_states = 1 << 4;
	}

	/* paranoia: check the values */
	if (smallest < sizeof(int) || largest < smallest ||
	    (largest & (sizeof(int)-1)))
		printf(" (bogus)");
	else
		cpu_mwait_size = largest;
	printf("\n");

	/* enable use of mwait; may be overridden by acpicpu later */
	if (cpu_mwait_size > 0)
		cpu_idle_cycle_fcn = &cpu_idle_mwait_cycle;
}

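/*
 * Attach a CPU: allocate a cpu_info (for APs), record APIC and ACPI ids,
 * set up the idle PCB and stack, then identify and start the CPU
 * according to its role (single, boot, or application processor).
 */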
void
cpu_attach(struct device *parent, struct device *self, void *aux)
{
	struct cpu_softc *sc = (void *) self;
	struct cpu_attach_args *caa = aux;
	struct cpu_info *ci;
#if defined(MULTIPROCESSOR)
	int cpunum = sc->sc_dev.dv_unit;
	vaddr_t kstack;
	struct pcb *pcb;
#endif

	/*
	 * If we're an Application Processor, allocate a cpu_info
	 * structure, otherwise use the primary's.
	 */
	if (caa->cpu_role == CPU_ROLE_AP) {
		struct cpu_info_full *cif;

		cif = km_alloc(sizeof *cif, &kv_any, &kp_zero, &kd_waitok);
		ci = &cif->cif_cpu;
#if defined(MULTIPROCESSOR)
		ci->ci_tss = &cif->cif_tss;
		ci->ci_gdt = &cif->cif_gdt;
		memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE);
		cpu_enter_pages(cif);
		if (cpu_info[cpunum] != NULL)
			panic("cpu at apic id %d already attached?", cpunum);
		cpu_info[cpunum] = ci;
#endif
#ifdef TRAPLOG
		ci->ci_tlog_base = malloc(sizeof(struct tlog),
		    M_DEVBUF, M_WAITOK);
#endif
	} else {
		ci = &cpu_info_primary;
#if defined(MULTIPROCESSOR)
		if (caa->cpu_apicid != lapic_cpu_number()) {
			panic("%s: running cpu is at apic %d"
			    " instead of at expected %d",
			    sc->sc_dev.dv_xname, lapic_cpu_number(), caa->cpu_apicid);
		}
#endif
	}

	ci->ci_self = ci;
	sc->sc_info = ci;

	ci->ci_dev = self;
	ci->ci_apicid = caa->cpu_apicid;
	ci->ci_acpi_proc_id = caa->cpu_acpi_proc_id;
#ifdef MULTIPROCESSOR
	ci->ci_cpuid = cpunum;
#else
	ci->ci_cpuid = 0;	/* False for APs, but they're not used anyway */
#endif
	ci->ci_func = caa->cpu_func;
	ci->ci_handled_intr_level = IPL_NONE;

#ifndef SMALL_KERNEL
	strlcpy(ci->ci_sensordev.xname, ci->ci_dev->dv_xname,
	    sizeof(ci->ci_sensordev.xname));
#endif

#if defined(MULTIPROCESSOR)
	/*
	 * Allocate UPAGES contiguous pages for the idle PCB and stack.
	 */
	kstack = (vaddr_t)km_alloc(USPACE, &kv_any, &kp_dirty, &kd_nowait);
	if (kstack == 0) {
		if (caa->cpu_role != CPU_ROLE_AP) {
			panic("cpu_attach: unable to allocate idle stack for"
			    " primary");
		}
		printf("%s: unable to allocate idle stack\n",
		    sc->sc_dev.dv_xname);
		return;
	}
	pcb = ci->ci_idle_pcb = (struct pcb *) kstack;
	memset(pcb, 0, USPACE);

	pcb->pcb_kstack = kstack + USPACE - 16;
	pcb->pcb_rbp = pcb->pcb_rsp = kstack + USPACE - 16;
	pcb->pcb_pmap = pmap_kernel();
	pcb->pcb_cr3 = pcb->pcb_pmap->pm_pdirpa;
#endif

	/* further PCB init done later. */

	printf(": ");

	switch (caa->cpu_role) {
	case CPU_ROLE_SP:
		printf("(uniprocessor)\n");
		atomic_setbits_int(&ci->ci_flags,
		    CPUF_PRESENT | CPUF_SP | CPUF_PRIMARY);
		cpu_intr_init(ci);
		identifycpu(ci);
		cpu_fix_msrs(ci);
#ifdef MTRR
		mem_range_attach();
#endif /* MTRR */
		/* XXX SP fpuinit(ci) is done earlier */
		cpu_init(ci);
		cpu_init_mwait(sc, ci);
		break;

	case CPU_ROLE_BP:
		printf("apid %d (boot processor)\n", caa->cpu_apicid);
		atomic_setbits_int(&ci->ci_flags,
		    CPUF_PRESENT | CPUF_BSP | CPUF_PRIMARY);
		cpu_intr_init(ci);
		identifycpu(ci);
		cpu_fix_msrs(ci);
#ifdef MTRR
		mem_range_attach();
#endif /* MTRR */

#if NLAPIC > 0
		/*
		 * Enable local apic
		 */
		lapic_enable();
		lapic_calibrate_timer(ci);
#endif
		/* XXX BP fpuinit(ci) is done earlier */
		cpu_init(ci);

#if NIOAPIC > 0
		ioapic_bsp_id = caa->cpu_apicid;
#endif
		cpu_init_mwait(sc, ci);
		break;

	case CPU_ROLE_AP:
		/*
		 * report on an AP
		 */
		printf("apid %d (application processor)\n", caa->cpu_apicid);

#if defined(MULTIPROCESSOR)
		cpu_intr_init(ci);
		cpu_start_secondary(ci);
		clockqueue_init(&ci->ci_queue);
		sched_init_cpu(ci);
		ncpus++;
		if (ci->ci_flags & CPUF_PRESENT) {
			ci->ci_next = cpu_info_list->ci_next;
			cpu_info_list->ci_next = ci;
		}
#else
		printf("%s: not started\n", sc->sc_dev.dv_xname);
#endif
		break;

	default:
		panic("unknown processor type??");
	}

#if defined(MULTIPROCESSOR)
	if (mp_verbose) {
		printf("%s: kstack at 0x%lx for %d bytes\n",
		    sc->sc_dev.dv_xname, kstack, USPACE);
		printf("%s: idle pcb at %p, idle sp at 0x%llx\n",
		    sc->sc_dev.dv_xname, pcb, pcb->pcb_rsp);
	}
#endif
#if NVMM > 0
	cpu_init_vmm(ci);
#endif /* NVMM > 0 */

#ifndef SMALL_KERNEL
	if (ci->ci_sensordev.sensors_count > 0)
		sensordev_install(&ci->ci_sensordev);
#endif
}

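/*
 * Codepatch the FPU save/restore paths with the best supported variant:
 * xsaves/xrstors when available, otherwise xsaveopt or plain xsave/xrstor.
 */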
static void
replacexsave(int xsave_ext)
{
	extern long _xrstor, _xrstors, _xsave, _xsaves, _xsaveopt;
	static int replacedone = 0;
	int s;

	if (replacedone)
		return;
	replacedone = 1;

	s = splhigh();
	codepatch_replace(CPTAG_XRSTORS,
	    (xsave_ext & XSAVE_XSAVES) ? &_xrstors : &_xrstor, 4);
	codepatch_replace(CPTAG_XRSTOR, &_xrstor, 4);
	codepatch_replace(CPTAG_XSAVE,
	    (xsave_ext & XSAVE_XSAVES) ? &_xsaves :
	    (xsave_ext & XSAVE_XSAVEOPT) ? &_xsaveopt : &_xsave, 4);
	splx(s);
}


/*
 * Initialize the processor appropriately.
 */

void
cpu_init(struct cpu_info *ci)
{
	struct savefpu *sfp;
	u_int cr4;

	/* configure the CPU if needed */
	if (ci->cpu_setup != NULL)
		(*ci->cpu_setup)(ci);

	cr4 = rcr4() | CR4_DEFAULT;
	if (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMEP)
		cr4 |= CR4_SMEP;
	if (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMAP)
		cr4 |= CR4_SMAP;
	if (ci->ci_feature_sefflags_ecx & SEFF0ECX_UMIP)
		cr4 |= CR4_UMIP;
	if ((cpu_ecxfeature & CPUIDECX_XSAVE) && ci->ci_cpuid_level >= 0xd)
		cr4 |= CR4_OSXSAVE;
	if (pg_xo)
		cr4 |= CR4_PKE;
	if (pmap_use_pcid)
		cr4 |= CR4_PCIDE;
	lcr4(cr4);

	if ((cpu_ecxfeature & CPUIDECX_XSAVE) && ci->ci_cpuid_level >= 0xd) {
		u_int32_t eax, ebx, ecx, edx;

		xsave_mask = XFEATURE_X87 | XFEATURE_SSE;
		CPUID_LEAF(0xd, 0, eax, ebx, ecx, edx);
		xsave_mask |= eax & XFEATURE_AVX;
		xsetbv(0, xsave_mask);
		CPUID_LEAF(0xd, 0, eax, ebx, ecx, edx);
		if (CPU_IS_PRIMARY(ci)) {
			fpu_save_len = ebx;
			KASSERT(fpu_save_len <= sizeof(struct savefpu));
		} else {
			KASSERT(ebx == fpu_save_len);
		}

		/* check for xsaves, xsaveopt, and supervisor features */
		CPUID_LEAF(0xd, 1, eax, ebx, ecx, edx);
		/* Disable XSAVES on AMD family 17h due to Erratum 1386 */
		if (ci->ci_vendor == CPUV_AMD &&
		    ci->ci_family == 0x17) {
			eax &= ~XSAVE_XSAVES;
		}
		if (eax & XSAVE_XSAVES) {
#ifndef SMALL_KERNEL
			if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT)
				xsave_mask |= ecx & XFEATURE_CET_U;
#endif
			if (xsave_mask & XFEATURE_XSS_MASK) {
				wrmsr(MSR_XSS, xsave_mask & XFEATURE_XSS_MASK);
				CPUID_LEAF(0xd, 1, eax, ebx, ecx, edx);
				KASSERT(ebx <= sizeof(struct savefpu));
			}
			if (CPU_IS_PRIMARY(ci))
				cpu_use_xsaves = 1;
		}

		replacexsave(eax);
	}

	if (CPU_IS_PRIMARY(ci)) {
		/* Clean our FPU save area */
		sfp = fpu_cleandata;
		memset(sfp, 0, fpu_save_len);
		sfp->fp_fxsave.fx_fcw = __INITIAL_NPXCW__;
		sfp->fp_fxsave.fx_mxcsr = __INITIAL_MXCSR__;
		xrstor_user(sfp, xsave_mask);
		if (cpu_use_xsaves || !xsave_mask)
			fpusave(sfp);
		else {
			/* must not use xsaveopt here */
			xsave(sfp, xsave_mask);
		}
	} else {
		fpureset();
	}

#if NVMM > 0
	/* Re-enable VMM if needed */
	if (ci->ci_flags & CPUF_VMM)
		start_vmm_on_cpu(ci);
#endif /* NVMM > 0 */

#ifdef MULTIPROCESSOR
	atomic_setbits_int(&ci->ci_flags, CPUF_RUNNING);
	/*
	 * Big hammer: flush all TLB entries, including ones from PTEs
	 * with the G bit set.  This should only be necessary if TLB
	 * shootdown falls far behind.
	 */
	cr4 = rcr4();
	lcr4(cr4 & ~CR4_PGE);
	lcr4(cr4);

	/* Check if TSC is synchronized. */
	if (cold && !CPU_IS_PRIMARY(ci))
		tsc_test_sync_ap(ci);
#endif
}

#if NVMM > 0
/*
 * cpu_init_vmm
 *
 * Initializes per-cpu VMM state
 *
 * Parameters:
 *  ci: the cpu for which state is being initialized
 */
void
cpu_init_vmm(struct cpu_info *ci)
{
	/*
	 * Allocate a per-cpu VMXON region for VMX CPUs
	 */
	if (ci->ci_vmm_flags & CI_VMM_VMX) {
		ci->ci_vmxon_region = (struct vmxon_region *)malloc(PAGE_SIZE,
		    M_DEVBUF, M_WAITOK | M_ZERO);
		if (!pmap_extract(pmap_kernel(), (vaddr_t)ci->ci_vmxon_region,
		    &ci->ci_vmxon_region_pa))
			panic("Can't locate VMXON region in phys mem");
		ci->ci_vmcs_pa = VMX_VMCS_PA_CLEAR;
		rw_init(&ci->ci_vmcs_lock, "vmcslock");
	}
}
#endif /* NVMM > 0 */

#ifdef MULTIPROCESSOR
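/*
 * Start all application processors that have been attached and are
 * present: give each a fresh random seed and release it to run via
 * cpu_boot_secondary().
 */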
void
cpu_boot_secondary_processors(void)
{
	struct cpu_info *ci;
	u_long i;

	for (i=0; i < MAXCPUS; i++) {
		ci = cpu_info[i];
		if (ci == NULL)
			continue;
		if (ci->ci_idle_pcb == NULL)
			continue;
		if ((ci->ci_flags & CPUF_PRESENT) == 0)
			continue;
		if (ci->ci_flags & (CPUF_BSP | CPUF_SP | CPUF_PRIMARY))
			continue;
		ci->ci_randseed = (arc4random() & 0x7fffffff) + 1;
		cpu_boot_secondary(ci);
	}
}

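/*
 * Bring up one AP: map the MP trampoline, start the CPU, and wait for it
 * to flag itself CPUF_PRESENT; then request identifycpu() on it and check
 * that its TSC is synchronized with ours before tearing the mappings down.
 */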
void
cpu_start_secondary(struct cpu_info *ci)
{
	int i;
	u_long s;

	atomic_setbits_int(&ci->ci_flags, CPUF_AP);

	pmap_kenter_pa(MP_TRAMPOLINE, MP_TRAMPOLINE, PROT_READ | PROT_EXEC);
	pmap_kenter_pa(MP_TRAMP_DATA, MP_TRAMP_DATA, PROT_READ | PROT_WRITE);

	CPU_STARTUP(ci);

	/*
	 * wait for it to become ready
	 */
	for (i = 100000; (!(ci->ci_flags & CPUF_PRESENT)) && i>0;i--) {
		delay(10);
	}
	if (! (ci->ci_flags & CPUF_PRESENT)) {
		printf("%s: failed to become ready\n", ci->ci_dev->dv_xname);
#if defined(MPDEBUG) && defined(DDB)
		printf("dropping into debugger; continue from here to resume boot\n");
		db_enter();
#endif
	}

	if ((ci->ci_flags & CPUF_IDENTIFIED) == 0) {
		atomic_setbits_int(&ci->ci_flags, CPUF_IDENTIFY);

		/* wait for it to identify */
		for (i = 2000000; (ci->ci_flags & CPUF_IDENTIFY) && i > 0; i--)
			delay(10);

		if (ci->ci_flags & CPUF_IDENTIFY)
			printf("%s: failed to identify\n",
			    ci->ci_dev->dv_xname);
	}

	if (ci->ci_flags & CPUF_IDENTIFIED) {
		/*
		 * Test if TSCs are synchronized.  Invalidate cache to
		 * minimize possible cache effects.  Disable interrupts to
		 * try to rule out external interference.
		 */
		s = intr_disable();
		wbinvd();
		tsc_test_sync_bp(curcpu());
		intr_restore(s);
	}

	CPU_START_CLEANUP(ci);

	pmap_kremove(MP_TRAMPOLINE, PAGE_SIZE);
	pmap_kremove(MP_TRAMP_DATA, PAGE_SIZE);
}

void
cpu_boot_secondary(struct cpu_info *ci)
{
	int i;
	u_long s;

	atomic_setbits_int(&ci->ci_flags, CPUF_GO);

	for (i = 100000; (!(ci->ci_flags & CPUF_RUNNING)) && i>0;i--) {
		delay(10);
	}
	if (! (ci->ci_flags & CPUF_RUNNING)) {
		printf("cpu failed to start\n");
#if defined(MPDEBUG) && defined(DDB)
		printf("dropping into debugger; continue from here to resume boot\n");
		db_enter();
#endif
	} else if (cold) {
		/* Test if TSCs are synchronized again. */
		s = intr_disable();
		wbinvd();
		tsc_test_sync_bp(curcpu());
		intr_restore(s);
	}
}

/*
 * The CPU ends up here when it's ready to run
 * This is called from code in mptramp.s; at this point, we are running
 * in the idle pcb/idle stack of the new cpu.  When this function returns,
 * this processor will enter the idle loop and start looking for work.
 *
 * XXX should share some of this with init386 in machdep.c
 */
void
cpu_hatch(void *v)
{
	struct cpu_info *ci = (struct cpu_info *)v;
	int s;

	{
		uint32_t vendor[4];
		int level;

		CPUID(0, level, vendor[0], vendor[2], vendor[1]);
		vendor[3] = 0;
		cpu_set_vendor(ci, level, (const char *)vendor);
	}

	cpu_init_msrs(ci);

#ifdef DEBUG
	if (ci->ci_flags & CPUF_PRESENT)
		panic("%s: already running!?", ci->ci_dev->dv_xname);
#endif
	atomic_setbits_int(&ci->ci_flags, CPUF_PRESENT);

	lapic_enable();
	cpu_ucode_apply(ci);
	cpu_tsx_disable(ci);

	if ((ci->ci_flags & CPUF_IDENTIFIED) == 0) {
		/*
		 * We need to wait until we can identify, otherwise dmesg
		 * output will be messy.
		 */
		while ((ci->ci_flags & CPUF_IDENTIFY) == 0)
			delay(10);

		identifycpu(ci);

		/* Prevent identifycpu() from running again */
		atomic_setbits_int(&ci->ci_flags, CPUF_IDENTIFIED);

		/* Signal we're done */
		atomic_clearbits_int(&ci->ci_flags, CPUF_IDENTIFY);
	}

	/* These have to run after identifycpu() */
	cpu_fix_msrs(ci);

	/*
	 * Test if our TSC is synchronized for the first time.
	 * Note that interrupts are off at this point.
	 */
	wbinvd();
	tsc_test_sync_ap(ci);

	while ((ci->ci_flags & CPUF_GO) == 0)
		delay(10);
#ifdef HIBERNATE
	if ((ci->ci_flags & CPUF_PARK) != 0) {
		if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT)
			lcr4(rcr4() & ~CR4_CET);
		atomic_clearbits_int(&ci->ci_flags, CPUF_PARK);
		hibernate_drop_to_real_mode();
	}
#endif /* HIBERNATE */

#ifdef DEBUG
	if (ci->ci_flags & CPUF_RUNNING)
		panic("%s: already running!?", ci->ci_dev->dv_xname);
#endif

	cpu_init_idt();
	lapic_set_lvt();
	gdt_init_cpu(ci);
	fpuinit(ci);

	lldt(0);

	cpu_init(ci);
#if NPVBUS > 0
	pvbus_init_cpu();
#endif

	/* Re-initialise memory range handling on AP */
	if (mem_range_softc.mr_op != NULL)
		mem_range_softc.mr_op->initAP(&mem_range_softc);

	s = splhigh();
	lcr8(0);
	intr_enable();
	splx(s);

	lapic_startclock();

	sched_toidle();
}

#if defined(DDB)

#include <ddb/db_output.h>
#include <machine/db_machdep.h>

/*
 * Dump cpu information from ddb.
 */
void
cpu_debug_dump(void)
{
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;

	db_printf("addr		dev	id	flags	ipis	curproc\n");
	CPU_INFO_FOREACH(cii, ci) {
		db_printf("%p	%s	%u	%x	%x	%10p\n",
		    ci,
		    ci->ci_dev == NULL ? "BOOT" : ci->ci_dev->dv_xname,
		    ci->ci_cpuid,
		    ci->ci_flags, ci->ci_ipis,
		    ci->ci_curproc);
	}
}
#endif

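/*
 * Kick an AP out of reset per the MP spec quoted below: set the CMOS
 * shutdown code and the warm reset vector to point at the trampoline,
 * then send the INIT and STARTUP IPIs.
 */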
int
mp_cpu_start(struct cpu_info *ci)
{
	unsigned short dwordptr[2];

	/*
	 * "The BSP must initialize CMOS shutdown code to 0Ah ..."
	 */

	outb(IO_RTC, NVRAM_RESET);
	outb(IO_RTC+1, NVRAM_RESET_JUMP);

	/*
	 * "and the warm reset vector (DWORD based at 40:67) to point
	 * to the AP startup code ..."
	 */

	dwordptr[0] = 0;
	dwordptr[1] = MP_TRAMPOLINE >> 4;

	pmap_kenter_pa(0, 0, PROT_READ | PROT_WRITE);
	memcpy((u_int8_t *) 0x467, dwordptr, 4);
	pmap_kremove(0, PAGE_SIZE);

#if NLAPIC > 0
	/*
	 * ... prior to executing the following sequence:"
	 */

	if (ci->ci_flags & CPUF_AP) {
		x86_ipi_init(ci->ci_apicid);

		delay(10000);

		if (cpu_feature & CPUID_APIC) {
			x86_ipi(MP_TRAMPOLINE/PAGE_SIZE, ci->ci_apicid,
			    LAPIC_DLMODE_STARTUP);
			delay(200);

			x86_ipi(MP_TRAMPOLINE/PAGE_SIZE, ci->ci_apicid,
			    LAPIC_DLMODE_STARTUP);
			delay(200);
		}
	}
#endif
	return 0;
}

void
mp_cpu_start_cleanup(struct cpu_info *ci)
{
	/*
	 * Ensure the NVRAM reset byte contains something vaguely sane.
	 */

	outb(IO_RTC, NVRAM_RESET);
	outb(IO_RTC+1, NVRAM_RESET_RST);
}
#endif /* MULTIPROCESSOR */

typedef void (vector)(void);
extern vector Xsyscall_meltdown, Xsyscall, Xsyscall32;

void
cpu_init_msrs(struct cpu_info *ci)
{
	wrmsr(MSR_STAR,
	    ((uint64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((uint64_t)GSEL(GUDATA_SEL-1, SEL_UPL) << 48));
	wrmsr(MSR_LSTAR, cpu_meltdown ? (uint64_t)Xsyscall_meltdown :
	    (uint64_t)Xsyscall);
	wrmsr(MSR_CSTAR, 0);
	wrmsr(MSR_SFMASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_AC);

	wrmsr(MSR_FSBASE, 0);
	wrmsr(MSR_GSBASE, (u_int64_t)ci);
	wrmsr(MSR_KERNELGSBASE, 0);
	patinit(ci);
}

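/*
 * Per-vendor MSR fixups, run on each CPU after identifycpu(): enable fast
 * string operations and try to lock down Silicon Debug on Intel, apply
 * errata and make LFENCE dispatch-serializing on AMD, and enable CET IBT
 * when the CPU supports it.
 */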
void
cpu_fix_msrs(struct cpu_info *ci)
{
	int family = ci->ci_family;
	uint64_t msr, nmsr;

	if (ci->ci_vendor == CPUV_INTEL) {
		if ((family > 6 || (family == 6 && ci->ci_model >= 0xd)) &&
		    rdmsr_safe(MSR_MISC_ENABLE, &msr) == 0 &&
		    (msr & MISC_ENABLE_FAST_STRINGS) == 0) {
			msr |= MISC_ENABLE_FAST_STRINGS;
			wrmsr(MSR_MISC_ENABLE, msr);
			DPRINTF("%s: enabled fast strings\n", ci->ci_dev->dv_xname);

			/*
			 * Attempt to disable Silicon Debug and lock the configuration
			 * if it's enabled and unlocked.
			 */
			if (cpu_ecxfeature & CPUIDECX_SDBG) {
				msr = rdmsr(IA32_DEBUG_INTERFACE);
				if ((msr & IA32_DEBUG_INTERFACE_ENABLE) &&
				    (msr & IA32_DEBUG_INTERFACE_LOCK) == 0) {
					msr &= IA32_DEBUG_INTERFACE_MASK;
					msr |= IA32_DEBUG_INTERFACE_LOCK;
					wrmsr(IA32_DEBUG_INTERFACE, msr);
				} else if (msr & IA32_DEBUG_INTERFACE_ENABLE)
					printf("%s: cannot disable silicon debug\n",
					    ci->ci_dev->dv_xname);
			}
		}
	}

	if (ci->ci_vendor == CPUV_AMD) {
		/* Apply AMD errata */
		amd64_errata(ci);

		/*
		 * "Mitigation G-2" per AMD's Whitepaper "Software Techniques
		 * for Managing Speculation on AMD Processors"
		 *
		 * By setting MSR C001_1029[1]=1, LFENCE becomes a dispatch
		 * serializing instruction.
		 *
		 * This MSR is available on all AMD families >= 10h, except 11h
		 * where LFENCE is always serializing.
		 */
		if (family >= 0x10 && family != 0x11) {
			nmsr = msr = rdmsr(MSR_DE_CFG);
			nmsr |= DE_CFG_SERIALIZE_LFENCE;
			if (msr != nmsr)
				wrmsr(MSR_DE_CFG, nmsr);
		}
		if (family == 0x17 && ci->ci_model >= 0x31 &&
		    (cpu_ecxfeature & CPUIDECX_HV) == 0) {
			nmsr = msr = rdmsr(MSR_DE_CFG);
			nmsr |= DE_CFG_SERIALIZE_9;
			if (msr != nmsr)
				wrmsr(MSR_DE_CFG, nmsr);
		}
	}

#ifndef SMALL_KERNEL
	if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT) {
		msr = rdmsr(MSR_S_CET);
		wrmsr(MSR_S_CET, (msr & ~MSR_CET_NO_TRACK_EN) | MSR_CET_ENDBR_EN);
		lcr4(rcr4() | CR4_CET);
	}
#endif
}

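/*
 * Disable Intel TSX (RTM) via the TSX_CTRL MSR when the CPU exposes it,
 * and hide TSX from CPUID.  This runs from cpu_hatch() before
 * identifycpu(), so it reads CPUID leaf 7 directly.
 */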
void
cpu_tsx_disable(struct cpu_info *ci)
{
	uint64_t msr;
	uint32_t dummy, sefflags_edx;

	/* this runs before identifycpu() populates ci_feature_sefflags_edx */
	if (ci->ci_cpuid_level < 0x07)
		return;
	CPUID_LEAF(0x7, 0, dummy, dummy, dummy, sefflags_edx);

	if (ci->ci_vendor == CPUV_INTEL &&
	    (sefflags_edx & SEFF0EDX_ARCH_CAP)) {
		msr = rdmsr(MSR_ARCH_CAPABILITIES);
		if (msr & ARCH_CAP_TSX_CTRL) {
			msr = rdmsr(MSR_TSX_CTRL);
			msr |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_TSX_CPUID_CLEAR;
			wrmsr(MSR_TSX_CTRL, msr);
		}
	}
}

void
patinit(struct cpu_info *ci)
{
	extern int	pmap_pg_wc;
	u_int64_t	reg;

	if ((cpu_feature & CPUID_PAT) == 0)
		return;
	/*
	 * Set up PAT bits.
	 * The default pat table is the following:
	 * WB, WT, UC-, UC, WB, WT, UC-, UC
	 * We change it to:
	 * WB, WC, UC-, UC, WB, WC, UC-, UC
	 * i.e change the WT bit to be WC.
	 */
	reg = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
	    PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
	    PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
	    PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);

	wrmsr(MSR_CR_PAT, reg);
	pmap_pg_wc = PG_WC;
}

struct timeout rdrand_tmo;
void rdrand(void *);

void
rdrand(void *v)
{
	struct timeout *tmo = v;
	extern int	has_rdrand, has_rdseed;
	union {
		uint64_t u64;
		uint32_t u32[2];
	} r, t;
	uint64_t tsc;
	uint8_t valid = 0;

	tsc = rdtsc();
	if (has_rdseed)
		__asm volatile(
		    "rdseed	%0\n\t"
		    "setc	%1\n"
		    : "=r" (r.u64), "=qm" (valid) );
	if (has_rdrand && (has_rdseed == 0 || valid == 0))
		__asm volatile(
		    "rdrand	%0\n\t"
		    "setc	%1\n"
		    : "=r" (r.u64), "=qm" (valid) );

	t.u64 = tsc;
	t.u64 ^= r.u64;
	t.u64 ^= valid;			/* potential rdrand empty */
	if (has_rdrand)
		t.u64 += rdtsc();	/* potential vmexit latency */

	enqueue_randomness(t.u32[0]);
	enqueue_randomness(t.u32[1]);

	if (tmo)
		timeout_add_msec(tmo, 10);
}

int
cpu_activate(struct device *self, int act)
{
	struct cpu_softc *sc = (struct cpu_softc *)self;

	switch (act) {
	case DVACT_RESUME:
		if (sc->sc_info->ci_cpuid == 0)
			rdrand(NULL);
#if NPCTR > 0
		pctr_resume(sc->sc_info);
#endif
		break;
	}

	return (0);
}

/*
 * cpu_enter_pages
 *
 * Requests mapping of various special pages required in the Intel Meltdown
 * case (to be entered into the U-K page table):
 *
 *  1 tss+gdt page for each CPU
 *  1 trampoline stack page for each CPU
 *
 * The cpu_info_full struct for each CPU straddles these pages. The offset
 * into 'cif' is calculated below, for each page. For more information,
 * consult the definition of struct cpu_info_full in cpu_full.h
 *
 * On CPUs unaffected by Meltdown, this function still configures 'cif' but
 * the calls to pmap_enter_special become no-ops.
 *
 * Parameters:
 *  cif : the cpu_info_full structure describing a CPU whose pages are to be
 *   entered into the special meltdown U-K page table.
 */
void
cpu_enter_pages(struct cpu_info_full *cif)
{
	vaddr_t va;
	paddr_t pa;

	/* The TSS+GDT need to be readable */
	va = (vaddr_t)cif;
	pmap_extract(pmap_kernel(), va, &pa);
	pmap_enter_special(va, pa, PROT_READ);
	DPRINTF("%s: entered tss+gdt page at va 0x%llx pa 0x%llx\n", __func__,
	    (uint64_t)va, (uint64_t)pa);

	/* The trampoline stack page needs to be read/write */
	va = (vaddr_t)&cif->cif_tramp_stack;
	pmap_extract(pmap_kernel(), va, &pa);
	pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
	DPRINTF("%s: entered t.stack page at va 0x%llx pa 0x%llx\n", __func__,
	    (uint64_t)va, (uint64_t)pa);

	cif->cif_tss.tss_rsp0 = va + sizeof(cif->cif_tramp_stack) - 16;
	DPRINTF("%s: cif_tss.tss_rsp0 = 0x%llx\n" ,__func__,
	    (uint64_t)cif->cif_tss.tss_rsp0);
	cif->cif_cpu.ci_intr_rsp = cif->cif_tss.tss_rsp0 -
	    sizeof(struct iretq_frame);

#define	SETUP_IST_SPECIAL_STACK(ist, cif, member) do {			\
	(cif)->cif_tss.tss_ist[(ist)] = (vaddr_t)&(cif)->member +	\
	    sizeof((cif)->member) - 16;					\
	(cif)->member[nitems((cif)->member) - 2] = (int64_t)&(cif)->cif_cpu; \
} while (0)

	SETUP_IST_SPECIAL_STACK(0, cif, cif_dblflt_stack);
	SETUP_IST_SPECIAL_STACK(1, cif, cif_nmi_stack);

	/* an empty iomap, by setting its offset to the TSS limit */
	cif->cif_tss.tss_iobase = sizeof(cif->cif_tss);
}

#ifdef MULTIPROCESSOR
int
wbinvd_on_all_cpus(void)
{
	x86_broadcast_ipi(X86_IPI_WBINVD);
	wbinvd();
	return 0;
}
#endif