1 /* $OpenBSD: cpu.c,v 1.195 2024/11/07 17:24:42 bluhm Exp $ */
2 /* $NetBSD: cpu.c,v 1.1 2003/04/26 18:39:26 fvdl Exp $ */
3
4 /*-
5 * Copyright (c) 2000 The NetBSD Foundation, Inc.
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by RedBack Networks Inc.
10 *
11 * Author: Bill Sommerfeld
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
26 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 * POSSIBILITY OF SUCH DAMAGE.
33 */
34
35 /*
36 * Copyright (c) 1999 Stefan Grefen
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the NetBSD
49 * Foundation, Inc. and its contributors.
50 * 4. Neither the name of The NetBSD Foundation nor the names of its
51 * contributors may be used to endorse or promote products derived
52 * from this software without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY
55 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR AND CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 */
66
67 #include "lapic.h"
68 #include "ioapic.h"
69 #include "vmm.h"
70 #include "pctr.h"
71 #include "pvbus.h"
72
73 #include <sys/param.h>
74 #include <sys/proc.h>
75 #include <sys/timeout.h>
76 #include <sys/systm.h>
77 #include <sys/device.h>
78 #include <sys/malloc.h>
79 #include <sys/memrange.h>
80 #include <sys/atomic.h>
81 #include <sys/user.h>
82
83 #include <uvm/uvm_extern.h>
84
85 #include <machine/codepatch.h>
86 #include <machine/cpu_full.h>
87 #include <machine/cpufunc.h>
88 #include <machine/cpuvar.h>
89 #include <machine/pmap.h>
90 #include <machine/vmparam.h>
91 #include <machine/mpbiosvar.h>
92 #include <machine/pcb.h>
93 #include <machine/specialreg.h>
94 #include <machine/segments.h>
95 #include <machine/gdt.h>
96 #include <machine/pio.h>
97 #include <machine/vmmvar.h>
98
99 #if NLAPIC > 0
100 #include <machine/i82489reg.h>
101 #include <machine/i82489var.h>
102 #endif
103
104 #if NIOAPIC > 0
105 #include <machine/i82093var.h>
106 #endif
107
108 #if NPCTR > 0
109 #include <machine/pctr.h>
110 #endif
111
112 #if NPVBUS > 0
113 #include <dev/pv/pvvar.h>
114 #endif
115
116 #include <dev/ic/mc146818reg.h>
117 #include <amd64/isa/nvram.h>
118 #include <dev/isa/isareg.h>
119
120 #ifdef HIBERNATE
121 #include <sys/hibernate.h>
122 #include <machine/hibernate.h>
123 #endif /* HIBERNATE */
124
125 /* #define CPU_DEBUG */
126
127 #ifdef CPU_DEBUG
128 #define DPRINTF(x...) do { printf(x); } while(0)
129 #else
130 #define DPRINTF(x...)
131 #endif /* CPU_DEBUG */
132
133 int cpu_match(struct device *, void *, void *);
134 void cpu_attach(struct device *, struct device *, void *);
135 int cpu_activate(struct device *, int);
136 void patinit(struct cpu_info *ci);
137 #if NVMM > 0
138 void cpu_init_vmm(struct cpu_info *ci);
139 #endif /* NVMM > 0 */
140
141 struct cpu_softc {
142 struct device sc_dev; /* device tree glue */
143 struct cpu_info *sc_info; /* pointer to CPU info */
144 };
145
146 void replacesmap(void);
147 void replacemeltdown(void);
148 void replacemds(void);
149
150 extern long _stac;
151 extern long _clac;
152
153 int cpuid_level = 0; /* MIN cpuid(0).eax */
154 char cpu_vendor[16] = { 0 }; /* CPU0's cpuid(0).e[bdc]x, \0 */
155 int cpu_id = 0; /* cpuid(1).eax */
156 int cpu_ebxfeature = 0; /* cpuid(1).ebx */
157 int cpu_ecxfeature = 0; /* INTERSECTION(cpuid(1).ecx) */
158 int cpu_feature = 0; /* cpuid(1).edx */
159 int ecpu_ecxfeature = 0; /* cpuid(0x80000001).ecx */
160 int cpu_sev_guestmode = 0;
161 int cpu_meltdown = 0;
162 int cpu_use_xsaves = 0;
163 int need_retpoline = 1; /* most systems need retpoline */
164
165 void
166 replacesmap(void)
167 {
168 static int replacedone = 0;
169 int s;
170
171 if (replacedone)
172 return;
173 replacedone = 1;
174
175 s = splhigh();
176
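/*
 * stac and clac are 3-byte instructions; they are patched in over the
 * 3-byte placeholders at the STAC/CLAC codepatch tags once SMAP
 * support is known.
 */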
177 codepatch_replace(CPTAG_STAC, &_stac, 3);
178 codepatch_replace(CPTAG_CLAC, &_clac, 3);
179
180 splx(s);
181 }
182
183 void
184 replacemeltdown(void)
185 {
186 static int replacedone = 0;
187 struct cpu_info *ci = &cpu_info_primary;
188 int swapgs_vuln = 0, ibrs = 0, s, ibpb = 0;
189
190 if (ci->ci_vendor == CPUV_INTEL) {
191 int family = ci->ci_family;
192 int model = ci->ci_model;
193
194 swapgs_vuln = 1;
195 if (family == 0x6 &&
196 (model == 0x37 || model == 0x4a || model == 0x4c ||
197 model == 0x4d || model == 0x5a || model == 0x5d ||
198 model == 0x6e || model == 0x65 || model == 0x75)) {
199 /* Silvermont, Airmont */
200 swapgs_vuln = 0;
201 } else if (family == 0x6 && (model == 0x85 || model == 0x57)) {
202 /* KnightsLanding */
203 swapgs_vuln = 0;
204 }
205 if ((ci->ci_feature_sefflags_edx & SEFF0EDX_ARCH_CAP) &&
206 (rdmsr(MSR_ARCH_CAPABILITIES) & ARCH_CAP_IBRS_ALL)) {
207 ibrs = 2;
208 } else if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBRS) {
209 ibrs = 1;
210 }
211 if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBRS)
212 ibpb = 1;
213 } else if (ci->ci_vendor == CPUV_AMD &&
214 ci->ci_pnfeatset >= 0x80000008) {
215 if (ci->ci_feature_amdspec_ebx & CPUIDEBX_IBRS_ALWAYSON) {
216 ibrs = 2;
217 } else if ((ci->ci_feature_amdspec_ebx & CPUIDEBX_IBRS) &&
218 (ci->ci_feature_amdspec_ebx & CPUIDEBX_IBRS_PREF)) {
219 ibrs = 1;
220 }
221 if (ci->ci_feature_amdspec_ebx & CPUIDEBX_IBPB)
222 ibpb = 1;
223 }
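/*
 * ibrs: 0 = no IBRS support, 1 = basic IBRS (retpolines are kept),
 * 2 = enhanced/always-on IBRS (retpolines can be patched away below).
 * ibpb: an indirect branch prediction barrier is available.
 */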
224
225 /* Enhanced IBRS: turn it on once on each CPU and don't touch again */
226 if (ibrs == 2)
227 wrmsr(MSR_SPEC_CTRL, SPEC_CTRL_IBRS);
228
229 if (replacedone)
230 return;
231 replacedone = 1;
232
233 s = splhigh();
234
235 /* If we don't have IBRS/IBPB, then don't use IBPB */
236 if (ibpb == 0)
237 codepatch_nop(CPTAG_IBPB_NOP);
238
239 if (ibrs == 2 || (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT)) {
240 extern const char _jmprax, _jmpr11, _jmpr13;
241 extern const short _jmprax_len, _jmpr11_len, _jmpr13_len;
242
243 codepatch_replace(CPTAG_RETPOLINE_RAX, &_jmprax, _jmprax_len);
244 codepatch_replace(CPTAG_RETPOLINE_R11, &_jmpr11, _jmpr11_len);
245 codepatch_replace(CPTAG_RETPOLINE_R13, &_jmpr13, _jmpr13_len);
246 need_retpoline = 0;
247 }
248
249 if (!cpu_meltdown)
250 codepatch_nop(CPTAG_MELTDOWN_NOP);
251 else {
252 extern long alltraps_kern_meltdown;
253
254 /* eliminate conditional branch in alltraps */
255 codepatch_jmp(CPTAG_MELTDOWN_ALLTRAPS, &alltraps_kern_meltdown);
256
257 /* enable reuse of PCID for U-K page tables */
258 if (pmap_use_pcid) {
259 extern long _pcid_set_reuse;
260 DPRINTF("%s: codepatching PCID use\n", __func__);
261 codepatch_replace(CPTAG_PCID_SET_REUSE,
262 &_pcid_set_reuse, PCID_SET_REUSE_SIZE);
263 }
264 }
265
266 /*
267 * CVE-2019-1125: if the CPU has SMAP and it's not vulnerable to
268 * Meltdown, then it's protected both from speculatively mis-skipping
269 * the swapgs during interrupts of userspace and from speculatively
270 * mis-taking a swapgs during interrupts while already in the kernel
271 * as the speculative path will fault from SMAP. Warning: enabling
272 * WRGSBASE would break this 'protection'.
273 *
274 * Otherwise, if the CPU's swapgs can't be speculated over and it
275 * _is_ vulnerable to Meltdown then the %cr3 change will serialize
276 * user->kern transitions, but we still need to mitigate the
277 * already-in-kernel cases.
278 */
279 if (!cpu_meltdown && (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMAP)) {
280 codepatch_nop(CPTAG_FENCE_SWAPGS_MIS_TAKEN);
281 codepatch_nop(CPTAG_FENCE_NO_SAFE_SMAP);
282 } else if (!swapgs_vuln && cpu_meltdown) {
283 codepatch_nop(CPTAG_FENCE_SWAPGS_MIS_TAKEN);
284 }
285 splx(s);
286 }
287
288 void
289 replacemds(void)
290 {
291 static int replacedone = 0;
292 extern long mds_handler_bdw, mds_handler_ivb, mds_handler_skl;
293 extern long mds_handler_skl_sse, mds_handler_skl_avx;
294 extern long mds_handler_skl_avx512;
295 extern long mds_handler_silvermont, mds_handler_knights;
296 struct cpu_info *ci = &cpu_info_primary;
297 CPU_INFO_ITERATOR cii;
298 void *handler = NULL, *vmm_handler = NULL;
299 const char *type;
300 int use_verw = 0, s;
301 uint32_t cap = 0;
302
303 /* ci_mds_tmp must be 64-byte aligned for AVX-512 instructions */
304 CTASSERT((offsetof(struct cpu_info, ci_mds_tmp) -
305 offsetof(struct cpu_info, ci_PAGEALIGN)) % 64 == 0);
306
307 if (replacedone)
308 return;
309 replacedone = 1;
310
311 if (ci->ci_vendor != CPUV_INTEL)
312 goto notintel; /* VERW only needed on Intel */
313
314 if ((ci->ci_feature_sefflags_edx & SEFF0EDX_ARCH_CAP))
315 cap = rdmsr(MSR_ARCH_CAPABILITIES);
316
317 if (cap & ARCH_CAP_MDS_NO) {
318 /* Unaffected, nop out the handling code */
319 } else if (ci->ci_feature_sefflags_edx & SEFF0EDX_MD_CLEAR) {
320 /* new firmware, use VERW */
321 use_verw = 1;
322 } else {
323 int family = ci->ci_family;
324 int model = ci->ci_model;
325 int stepping = CPUID2STEPPING(ci->ci_signature);
326
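/*
 * No MD_CLEAR microcode: fall back to the per-microarchitecture
 * software sequences. Each handler needs a per-CPU scratch buffer
 * of a specific size (and alignment, for the Skylake and Knights
 * variants), allocated here.
 */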
327 if (family == 0x6 &&
328 (model == 0x2e || model == 0x1e || model == 0x1f ||
329 model == 0x1a || model == 0x2f || model == 0x25 ||
330 model == 0x2c || model == 0x2d || model == 0x2a ||
331 model == 0x3e || model == 0x3a)) {
332 /* Nehalem, SandyBridge, IvyBridge */
333 handler = vmm_handler = &mds_handler_ivb;
334 type = "IvyBridge";
335 CPU_INFO_FOREACH(cii, ci) {
336 ci->ci_mds_buf = malloc(672, M_DEVBUF,
337 M_WAITOK);
338 memset(ci->ci_mds_buf, 0, 16);
339 }
340 } else if (family == 0x6 &&
341 (model == 0x3f || model == 0x3c || model == 0x45 ||
342 model == 0x46 || model == 0x56 || model == 0x4f ||
343 model == 0x47 || model == 0x3d)) {
344 /* Haswell and Broadwell */
345 handler = vmm_handler = &mds_handler_bdw;
346 type = "Broadwell";
347 CPU_INFO_FOREACH(cii, ci) {
348 ci->ci_mds_buf = malloc(1536, M_DEVBUF,
349 M_WAITOK);
350 }
351 } else if (family == 0x6 &&
352 ((model == 0x55 && stepping <= 5) || model == 0x4e ||
353 model == 0x5e || (model == 0x8e && stepping <= 0xb) ||
354 (model == 0x9e && stepping <= 0xc))) {
355 /*
356 * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
357 * CascadeLake
358 */
359 if (xgetbv(0) & XFEATURE_AVX512) {
360 handler = &mds_handler_skl_avx512;
361 type = "Skylake AVX-512";
362 } else if (xgetbv(0) & XFEATURE_AVX) {
363 handler = &mds_handler_skl_avx;
364 type = "Skylake AVX";
365 } else {
366 handler = &mds_handler_skl_sse;
367 type = "Skylake SSE";
368 }
369 vmm_handler = &mds_handler_skl;
370 CPU_INFO_FOREACH(cii, ci) {
371 vaddr_t b64;
372 b64 = (vaddr_t)malloc(6 * 1024 + 64 + 63,
373 M_DEVBUF, M_WAITOK);
374 ci->ci_mds_buf = (void *)((b64 + 63) & ~63);
375 memset(ci->ci_mds_buf, 0, 64);
376 }
377 } else if (family == 0x6 &&
378 (model == 0x37 || model == 0x4a || model == 0x4c ||
379 model == 0x4d || model == 0x5a || model == 0x5d ||
380 model == 0x6e || model == 0x65 || model == 0x75)) {
381 /* Silvermont, Airmont */
382 handler = vmm_handler = &mds_handler_silvermont;
383 type = "Silvermont";
384 CPU_INFO_FOREACH(cii, ci) {
385 ci->ci_mds_buf = malloc(256, M_DEVBUF,
386 M_WAITOK);
387 memset(ci->ci_mds_buf, 0, 16);
388 }
389 } else if (family == 0x6 && (model == 0x85 || model == 0x57)) {
390 handler = vmm_handler = &mds_handler_knights;
391 type = "KnightsLanding";
392 CPU_INFO_FOREACH(cii, ci) {
393 vaddr_t b64;
394 b64 = (vaddr_t)malloc(1152 + 63, M_DEVBUF,
395 M_WAITOK);
396 ci->ci_mds_buf = (void *)((b64 + 63) & ~63);
397 }
398 }
399 }
400
401 /* Register File Data Sampling (RFDS) also has a VERW workaround */
402 if ((cap & ARCH_CAP_RFDS_NO) == 0 && (cap & ARCH_CAP_RFDS_CLEAR))
403 use_verw = 1;
404
405 if (handler != NULL) {
406 printf("cpu0: using %s MDS workaround%s\n", type, "");
407 s = splhigh();
408 codepatch_call(CPTAG_MDS, handler);
409 codepatch_call(CPTAG_MDS_VMM, vmm_handler);
410 splx(s);
411 } else if (use_verw) {
412 /*
413 * The new firmware enhances L1D_FLUSH MSR to flush MDS too,
414 * but keep the verw if affected by RFDS
415 */
416 if ((cap & ARCH_CAP_RFDS_NO) == 0 && (cap & ARCH_CAP_RFDS_CLEAR)) {
417 type = "";
418 } else if (cpu_info_primary.ci_vmm_cap.vcc_vmx.vmx_has_l1_flush_msr == 1) {
419 s = splhigh();
420 codepatch_nop(CPTAG_MDS_VMM);
421 splx(s);
422 type = " (except on vmm entry)";
423 } else {
424 type = "";
425 }
426 printf("cpu0: using %s MDS workaround%s\n", "VERW", type);
427 } else {
428 notintel:
429 s = splhigh();
430 codepatch_nop(CPTAG_MDS);
431 codepatch_nop(CPTAG_MDS_VMM);
432 splx(s);
433 }
434 }
435
436 #ifdef MULTIPROCESSOR
437 int mp_cpu_start(struct cpu_info *);
438 void mp_cpu_start_cleanup(struct cpu_info *);
439 struct cpu_functions mp_cpu_funcs = { mp_cpu_start, NULL,
440 mp_cpu_start_cleanup };
441 #endif /* MULTIPROCESSOR */
442
443 const struct cfattach cpu_ca = {
444 sizeof(struct cpu_softc), cpu_match, cpu_attach, NULL, cpu_activate
445 };
446
447 struct cfdriver cpu_cd = {
448 NULL, "cpu", DV_DULL
449 };
450
451 /*
452 * Statically-allocated CPU info for the primary CPU (or the only
453 * CPU, on uniprocessors). The CPU info list is initialized to
454 * point at it.
455 */
456 struct cpu_info_full cpu_info_full_primary = { .cif_cpu = { .ci_self = &cpu_info_primary } };
457
458 struct cpu_info *cpu_info_list = &cpu_info_primary;
459
460 #ifdef MULTIPROCESSOR
461 /*
462 * Array of CPU info structures. Must be statically-allocated because
463 * curproc, etc. are used early.
464 */
465 struct cpu_info *cpu_info[MAXCPUS] = { &cpu_info_primary };
466
467 void cpu_hatch(void *);
468 void cpu_boot_secondary(struct cpu_info *ci);
469 void cpu_start_secondary(struct cpu_info *ci);
470 #endif
471
472 int
473 cpu_match(struct device *parent, void *match, void *aux)
474 {
475 struct cfdata *cf = match;
476 struct cpu_attach_args *caa = aux;
477
478 if (strcmp(caa->caa_name, cf->cf_driver->cd_name) != 0)
479 return 0;
480
481 if (cf->cf_unit >= MAXCPUS)
482 return 0;
483
484 return 1;
485 }
486
487 void cpu_idle_mwait_cycle(void);
488 void cpu_init_mwait(struct cpu_softc *, struct cpu_info *);
489
490 u_int cpu_mwait_size, cpu_mwait_states;
491
492 void
493 cpu_idle_mwait_cycle(void)
494 {
495 struct cpu_info *ci = curcpu();
496
497 if ((read_rflags() & PSL_I) == 0)
498 panic("idle with interrupts blocked!");
499
500 /* something already queued? */
501 if (!cpu_is_idle(ci))
502 return;
503
504 /*
505 * About to idle; setting the MWAIT_IN_IDLE bit tells
506 * cpu_unidle() that it can't be a no-op and tells cpu_kick()
507 * that it doesn't need to use an IPI. We also set the
508 * MWAIT_KEEP_IDLING bit: those routines clear it to stop
509 * the mwait. Once they're set, we do a final check of the
510 * queue, in case another cpu called setrunqueue() and added
511 * something to the queue and called cpu_unidle() between
512 * the check in sched_idle() and here.
513 */
514 atomic_setbits_int(&ci->ci_mwait, MWAIT_IDLING | MWAIT_ONLY);
515 if (cpu_is_idle(ci)) {
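/*
 * monitor() arms address monitoring on ci_mwait; mwait() then stalls
 * until that cache line is written (cpu_unidle()/cpu_kick() clearing
 * the bits) or an interrupt arrives.
 */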
516 monitor(&ci->ci_mwait, 0, 0);
517 if ((ci->ci_mwait & MWAIT_IDLING) == MWAIT_IDLING)
518 mwait(0, 0);
519 }
520
521 /* done idling; let cpu_kick() know that an IPI is required */
522 atomic_clearbits_int(&ci->ci_mwait, MWAIT_IDLING);
523 }
524
525 void
526 cpu_init_mwait(struct cpu_softc *sc, struct cpu_info *ci)
527 {
528 unsigned int smallest, largest, extensions, c_substates;
529
530 if ((cpu_ecxfeature & CPUIDECX_MWAIT) == 0 || ci->ci_cpuid_level < 0x5)
531 return;
532
533 /* get the monitor granularity */
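/*
 * CPUID leaf 5: EAX = smallest monitor-line size, EBX = largest,
 * ECX bit 0 = extensions enumerated (EDX valid), ECX bit 1 = IBE
 * (interrupts as break events), EDX = supported C-substates,
 * four bits per C-state.
 */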
534 CPUID(0x5, smallest, largest, extensions, cpu_mwait_states);
535 smallest &= 0xffff;
536 largest &= 0xffff;
537
538 /* mask out states C6/C7 in 31:24 for CHT45 errata */
539 if (ci->ci_vendor == CPUV_INTEL &&
540 ci->ci_family == 0x06 && ci->ci_model == 0x4c)
541 cpu_mwait_states &= 0x00ffffff;
542
543 printf("%s: mwait min=%u, max=%u", sc->sc_dev.dv_xname,
544 smallest, largest);
545 if (extensions & 0x1) {
546 if (cpu_mwait_states > 0) {
547 c_substates = cpu_mwait_states;
548 printf(", C-substates=%u", 0xf & c_substates);
549 while ((c_substates >>= 4) > 0)
550 printf(".%u", 0xf & c_substates);
551 }
552 if (extensions & 0x2)
553 printf(", IBE");
554 } else {
555 /* substates not supported, forge the default: just C1 */
556 cpu_mwait_states = 1 << 4;
557 }
558
559 /* paranoia: check the values */
560 if (smallest < sizeof(int) || largest < smallest ||
561 (largest & (sizeof(int)-1)))
562 printf(" (bogus)");
563 else
564 cpu_mwait_size = largest;
565 printf("\n");
566
567 /* enable use of mwait; may be overridden by acpicpu later */
568 if (cpu_mwait_size > 0)
569 cpu_idle_cycle_fcn = &cpu_idle_mwait_cycle;
570 }
571
572 void
573 cpu_attach(struct device *parent, struct device *self, void *aux)
574 {
575 struct cpu_softc *sc = (void *) self;
576 struct cpu_attach_args *caa = aux;
577 struct cpu_info *ci;
578 #if defined(MULTIPROCESSOR)
579 int cpunum = sc->sc_dev.dv_unit;
580 vaddr_t kstack;
581 struct pcb *pcb;
582 #endif
583
584 /*
585 * If we're an Application Processor, allocate a cpu_info
586 * structure, otherwise use the primary's.
587 */
588 if (caa->cpu_role == CPU_ROLE_AP) {
589 struct cpu_info_full *cif;
590
591 cif = km_alloc(sizeof *cif, &kv_any, &kp_zero, &kd_waitok);
592 ci = &cif->cif_cpu;
593 #if defined(MULTIPROCESSOR)
594 ci->ci_tss = &cif->cif_tss;
595 ci->ci_gdt = &cif->cif_gdt;
596 memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE);
597 cpu_enter_pages(cif);
598 if (cpu_info[cpunum] != NULL)
599 panic("cpu at apic id %d already attached?", cpunum);
600 cpu_info[cpunum] = ci;
601 #endif
602 #ifdef TRAPLOG
603 ci->ci_tlog_base = malloc(sizeof(struct tlog),
604 M_DEVBUF, M_WAITOK);
605 #endif
606 } else {
607 ci = &cpu_info_primary;
608 #if defined(MULTIPROCESSOR)
609 if (caa->cpu_apicid != lapic_cpu_number()) {
610 panic("%s: running cpu is at apic %d"
611 " instead of at expected %d",
612 sc->sc_dev.dv_xname, lapic_cpu_number(), caa->cpu_apicid);
613 }
614 #endif
615 }
616
617 ci->ci_self = ci;
618 sc->sc_info = ci;
619
620 ci->ci_dev = self;
621 ci->ci_apicid = caa->cpu_apicid;
622 ci->ci_acpi_proc_id = caa->cpu_acpi_proc_id;
623 #ifdef MULTIPROCESSOR
624 ci->ci_cpuid = cpunum;
625 #else
626 ci->ci_cpuid = 0; /* False for APs, but they're not used anyway */
627 #endif
628 ci->ci_func = caa->cpu_func;
629 ci->ci_handled_intr_level = IPL_NONE;
630
631 #ifndef SMALL_KERNEL
632 strlcpy(ci->ci_sensordev.xname, ci->ci_dev->dv_xname,
633 sizeof(ci->ci_sensordev.xname));
634 #endif
635
636 #if defined(MULTIPROCESSOR)
637 /*
638 * Allocate UPAGES contiguous pages for the idle PCB and stack.
639 */
640 kstack = (vaddr_t)km_alloc(USPACE, &kv_any, &kp_dirty, &kd_nowait);
641 if (kstack == 0) {
642 if (caa->cpu_role != CPU_ROLE_AP) {
643 panic("cpu_attach: unable to allocate idle stack for"
644 " primary");
645 }
646 printf("%s: unable to allocate idle stack\n",
647 sc->sc_dev.dv_xname);
648 return;
649 }
650 pcb = ci->ci_idle_pcb = (struct pcb *) kstack;
651 memset(pcb, 0, USPACE);
652
653 pcb->pcb_kstack = kstack + USPACE - 16;
654 pcb->pcb_rbp = pcb->pcb_rsp = kstack + USPACE - 16;
655 pcb->pcb_pmap = pmap_kernel();
656 pcb->pcb_cr3 = pcb->pcb_pmap->pm_pdirpa;
657 #endif
658
659 /* further PCB init done later. */
660
661 printf(": ");
662
663 switch (caa->cpu_role) {
664 case CPU_ROLE_SP:
665 printf("(uniprocessor)\n");
666 atomic_setbits_int(&ci->ci_flags,
667 CPUF_PRESENT | CPUF_SP | CPUF_PRIMARY);
668 cpu_intr_init(ci);
669 identifycpu(ci);
670 cpu_fix_msrs(ci);
671 #ifdef MTRR
672 mem_range_attach();
673 #endif /* MTRR */
674 /* XXX SP fpuinit(ci) is done earlier */
675 cpu_init(ci);
676 cpu_init_mwait(sc, ci);
677 break;
678
679 case CPU_ROLE_BP:
680 printf("apid %d (boot processor)\n", caa->cpu_apicid);
681 atomic_setbits_int(&ci->ci_flags,
682 CPUF_PRESENT | CPUF_BSP | CPUF_PRIMARY);
683 cpu_intr_init(ci);
684 identifycpu(ci);
685 cpu_fix_msrs(ci);
686 #ifdef MTRR
687 mem_range_attach();
688 #endif /* MTRR */
689
690 #if NLAPIC > 0
691 /*
692 * Enable local apic
693 */
694 lapic_enable();
695 lapic_calibrate_timer(ci);
696 #endif
697 /* XXX BP fpuinit(ci) is done earlier */
698 cpu_init(ci);
699
700 #if NIOAPIC > 0
701 ioapic_bsp_id = caa->cpu_apicid;
702 #endif
703 cpu_init_mwait(sc, ci);
704 break;
705
706 case CPU_ROLE_AP:
707 /*
708 * report on an AP
709 */
710 printf("apid %d (application processor)\n", caa->cpu_apicid);
711
712 #if defined(MULTIPROCESSOR)
713 cpu_intr_init(ci);
714 cpu_start_secondary(ci);
715 clockqueue_init(&ci->ci_queue);
716 sched_init_cpu(ci);
717 ncpus++;
718 if (ci->ci_flags & CPUF_PRESENT) {
719 ci->ci_next = cpu_info_list->ci_next;
720 cpu_info_list->ci_next = ci;
721 }
722 #else
723 printf("%s: not started\n", sc->sc_dev.dv_xname);
724 #endif
725 break;
726
727 default:
728 panic("unknown processor type??");
729 }
730
731 #if defined(MULTIPROCESSOR)
732 if (mp_verbose) {
733 printf("%s: kstack at 0x%lx for %d bytes\n",
734 sc->sc_dev.dv_xname, kstack, USPACE);
735 printf("%s: idle pcb at %p, idle sp at 0x%llx\n",
736 sc->sc_dev.dv_xname, pcb, pcb->pcb_rsp);
737 }
738 #endif
739 #if NVMM > 0
740 cpu_init_vmm(ci);
741 #endif /* NVMM > 0 */
742
743 #ifndef SMALL_KERNEL
744 if (ci->ci_sensordev.sensors_count > 0)
745 sensordev_install(&ci->ci_sensordev);
746 #endif
747 }
748
749 static void
750 replacexsave(int xsave_ext)
751 {
752 extern long _xrstor, _xrstors, _xsave, _xsaves, _xsaveopt;
753 static int replacedone = 0;
754 int s;
755
756 if (replacedone)
757 return;
758 replacedone = 1;
759
760 s = splhigh();
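/*
 * Patch in the strongest 4-byte save/restore instructions the CPU
 * supports: XSAVES/XRSTORS when available, otherwise XSAVEOPT on the
 * save side, else plain XSAVE/XRSTOR.
 */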
761 codepatch_replace(CPTAG_XRSTORS,
762 (xsave_ext & XSAVE_XSAVES) ? &_xrstors : &_xrstor, 4);
763 codepatch_replace(CPTAG_XRSTOR, &_xrstor, 4);
764 codepatch_replace(CPTAG_XSAVE,
765 (xsave_ext & XSAVE_XSAVES) ? &_xsaves :
766 (xsave_ext & XSAVE_XSAVEOPT) ? &_xsaveopt : &_xsave, 4);
767 splx(s);
768 }
769
770
771 /*
772 * Initialize the processor appropriately.
773 */
774
775 void
776 cpu_init(struct cpu_info *ci)
777 {
778 struct savefpu *sfp;
779 u_int cr4;
780
781 /* configure the CPU if needed */
782 if (ci->cpu_setup != NULL)
783 (*ci->cpu_setup)(ci);
784
785 cr4 = rcr4() | CR4_DEFAULT;
786 if (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMEP)
787 cr4 |= CR4_SMEP;
788 if (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMAP)
789 cr4 |= CR4_SMAP;
790 if (ci->ci_feature_sefflags_ecx & SEFF0ECX_UMIP)
791 cr4 |= CR4_UMIP;
792 if ((cpu_ecxfeature & CPUIDECX_XSAVE) && ci->ci_cpuid_level >= 0xd)
793 cr4 |= CR4_OSXSAVE;
794 if (pg_xo)
795 cr4 |= CR4_PKE;
796 if (pmap_use_pcid)
797 cr4 |= CR4_PCIDE;
798 lcr4(cr4);
799
800 if ((cpu_ecxfeature & CPUIDECX_XSAVE) && ci->ci_cpuid_level >= 0xd) {
801 u_int32_t eax, ebx, ecx, edx;
802
803 xsave_mask = XFEATURE_X87 | XFEATURE_SSE;
804 CPUID_LEAF(0xd, 0, eax, ebx, ecx, edx);
805 xsave_mask |= eax & XFEATURE_AVX;
806 xsave_mask |= eax & XFEATURE_AVX512;
807 xsetbv(0, xsave_mask);
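/*
 * Re-read leaf 0xd: EBX now reports the save area size needed for
 * the features just enabled in XCR0.
 */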
808 CPUID_LEAF(0xd, 0, eax, ebx, ecx, edx);
809 if (CPU_IS_PRIMARY(ci)) {
810 fpu_save_len = ebx;
811 KASSERT(fpu_save_len <= sizeof(struct savefpu));
812 } else {
813 KASSERT(ebx == fpu_save_len);
814 }
815
816 /* check for xsaves, xsaveopt, and supervisor features */
817 CPUID_LEAF(0xd, 1, eax, ebx, ecx, edx);
818 /* Disable XSAVES on AMD family 17h due to Erratum 1386 */
819 if (ci->ci_vendor == CPUV_AMD &&
820 ci->ci_family == 0x17) {
821 eax &= ~XSAVE_XSAVES;
822 }
823 if (eax & XSAVE_XSAVES) {
824 #ifndef SMALL_KERNEL
825 if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT)
826 xsave_mask |= ecx & XFEATURE_CET_U;
827 #endif
828 if (xsave_mask & XFEATURE_XSS_MASK) {
829 wrmsr(MSR_XSS, xsave_mask & XFEATURE_XSS_MASK);
830 CPUID_LEAF(0xd, 1, eax, ebx, ecx, edx);
831 KASSERT(ebx <= sizeof(struct savefpu));
832 }
833 if (CPU_IS_PRIMARY(ci))
834 cpu_use_xsaves = 1;
835 }
836
837 replacexsave(eax);
838 }
839
840 if (CPU_IS_PRIMARY(ci)) {
841 /* Clean our FPU save area */
842 sfp = fpu_cleandata;
843 memset(sfp, 0, fpu_save_len);
844 sfp->fp_fxsave.fx_fcw = __INITIAL_NPXCW__;
845 sfp->fp_fxsave.fx_mxcsr = __INITIAL_MXCSR__;
846 xrstor_user(sfp, xsave_mask);
847 if (cpu_use_xsaves || !xsave_mask)
848 fpusave(sfp);
849 else {
850 /* must not use xsaveopt here */
851 xsave(sfp, xsave_mask);
852 }
853 } else {
854 fpureset();
855 }
856
857 #if NVMM > 0
858 /* Re-enable VMM if needed */
859 if (ci->ci_flags & CPUF_VMM)
860 start_vmm_on_cpu(ci);
861 #endif /* NVMM > 0 */
862
863 #ifdef MULTIPROCESSOR
864 atomic_setbits_int(&ci->ci_flags, CPUF_RUNNING);
865 /*
866 * Big hammer: flush all TLB entries, including ones from PTEs
867 * with the G bit set. This should only be necessary if TLB
868 * shootdown falls far behind.
869 */
870 cr4 = rcr4();
871 lcr4(cr4 & ~CR4_PGE);
872 lcr4(cr4);
873
874 /* Check if TSC is synchronized. */
875 if (cold && !CPU_IS_PRIMARY(ci))
876 tsc_test_sync_ap(ci);
877 #endif
878 }
879
880 #if NVMM > 0
881 /*
882 * cpu_init_vmm
883 *
884 * Initializes per-cpu VMM state
885 *
886 * Parameters:
887 * ci: the cpu for which state is being initialized
888 */
889 void
890 cpu_init_vmm(struct cpu_info *ci)
891 {
892 uint64_t msr;
893
894 /*
895 * Detect VMX specific features and initialize VMX-related state.
896 */
897 if (ci->ci_vmm_flags & CI_VMM_VMX) {
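/*
 * VMXON takes the physical address of a page-aligned region, so
 * allocate a page and look up where it landed.
 */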
898 ci->ci_vmxon_region = (struct vmxon_region *)malloc(PAGE_SIZE,
899 M_DEVBUF, M_WAITOK | M_ZERO);
900 if (!pmap_extract(pmap_kernel(), (vaddr_t)ci->ci_vmxon_region,
901 &ci->ci_vmxon_region_pa))
902 panic("Can't locate VMXON region in phys mem");
903
904 ci->ci_vmcs_pa = VMX_VMCS_PA_CLEAR;
905 rw_init(&ci->ci_vmcs_lock, "vmcslock");
906
907 if (rdmsr_safe(IA32_VMX_EPT_VPID_CAP, &msr) == 0 &&
908 msr & IA32_EPT_VPID_CAP_INVEPT_CONTEXT)
909 ci->ci_vmm_cap.vcc_vmx.vmx_invept_mode =
910 IA32_VMX_INVEPT_SINGLE_CTX;
911 else
912 ci->ci_vmm_cap.vcc_vmx.vmx_invept_mode =
913 IA32_VMX_INVEPT_GLOBAL_CTX;
914 }
915 }
916 #endif /* NVMM > 0 */
917
918 #ifdef MULTIPROCESSOR
919 void
920 cpu_boot_secondary_processors(void)
921 {
922 struct cpu_info *ci;
923 u_long i;
924
925 for (i=0; i < MAXCPUS; i++) {
926 ci = cpu_info[i];
927 if (ci == NULL)
928 continue;
929 if (ci->ci_idle_pcb == NULL)
930 continue;
931 if ((ci->ci_flags & CPUF_PRESENT) == 0)
932 continue;
933 if (ci->ci_flags & (CPUF_BSP | CPUF_SP | CPUF_PRIMARY))
934 continue;
935 ci->ci_randseed = (arc4random() & 0x7fffffff) + 1;
936 cpu_boot_secondary(ci);
937 }
938 }
939
940 void
941 cpu_start_secondary(struct cpu_info *ci)
942 {
943 int i;
944 u_long s;
945
946 atomic_setbits_int(&ci->ci_flags, CPUF_AP);
947
948 pmap_kenter_pa(MP_TRAMPOLINE, MP_TRAMPOLINE, PROT_READ | PROT_EXEC);
949 pmap_kenter_pa(MP_TRAMP_DATA, MP_TRAMP_DATA, PROT_READ | PROT_WRITE);
950
951 CPU_STARTUP(ci);
952
953 /*
954 * wait for it to become ready
955 */
956 for (i = 100000; (!(ci->ci_flags & CPUF_PRESENT)) && i>0;i--) {
957 delay(10);
958 }
959 if (! (ci->ci_flags & CPUF_PRESENT)) {
960 printf("%s: failed to become ready\n", ci->ci_dev->dv_xname);
961 #if defined(MPDEBUG) && defined(DDB)
962 printf("dropping into debugger; continue from here to resume boot\n");
963 db_enter();
964 #endif
965 }
966
967 if ((ci->ci_flags & CPUF_IDENTIFIED) == 0) {
968 atomic_setbits_int(&ci->ci_flags, CPUF_IDENTIFY);
969
970 /* wait for it to identify */
971 for (i = 2000000; (ci->ci_flags & CPUF_IDENTIFY) && i > 0; i--)
972 delay(10);
973
974 if (ci->ci_flags & CPUF_IDENTIFY)
975 printf("%s: failed to identify\n",
976 ci->ci_dev->dv_xname);
977 }
978
979 if (ci->ci_flags & CPUF_IDENTIFIED) {
980 /*
981 * Test if TSCs are synchronized. Invalidate cache to
982 * minimize possible cache effects. Disable interrupts to
983 * try to rule out external interference.
984 */
985 s = intr_disable();
986 wbinvd();
987 tsc_test_sync_bp(curcpu());
988 intr_restore(s);
989 }
990
991 CPU_START_CLEANUP(ci);
992
993 pmap_kremove(MP_TRAMPOLINE, PAGE_SIZE);
994 pmap_kremove(MP_TRAMP_DATA, PAGE_SIZE);
995 }
996
997 void
998 cpu_boot_secondary(struct cpu_info *ci)
999 {
1000 int i;
1001 u_long s;
1002
1003 atomic_setbits_int(&ci->ci_flags, CPUF_GO);
1004
1005 for (i = 100000; (!(ci->ci_flags & CPUF_RUNNING)) && i>0;i--) {
1006 delay(10);
1007 }
1008 if (! (ci->ci_flags & CPUF_RUNNING)) {
1009 printf("cpu failed to start\n");
1010 #if defined(MPDEBUG) && defined(DDB)
1011 printf("dropping into debugger; continue from here to resume boot\n");
1012 db_enter();
1013 #endif
1014 } else if (cold) {
1015 /* Test if TSCs are synchronized again. */
1016 s = intr_disable();
1017 wbinvd();
1018 tsc_test_sync_bp(curcpu());
1019 intr_restore(s);
1020 }
1021 }
1022
1023 /*
1024 * The CPU ends up here when it's ready to run
1025 * This is called from code in mptramp.s; at this point, we are running
1026 * in the idle pcb/idle stack of the new cpu. When this function returns,
1027 * this processor will enter the idle loop and start looking for work.
1028 *
1029 * XXX should share some of this with init386 in machdep.c
1030 */
1031 void
1032 cpu_hatch(void *v)
1033 {
1034 struct cpu_info *ci = (struct cpu_info *)v;
1035 int s;
1036
1037 {
1038 uint32_t vendor[4];
1039 int level;
1040
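/*
 * CPUID leaf 0 returns the vendor string in EBX, EDX, ECX order;
 * store the words so the 12 bytes read back contiguously.
 */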
1041 CPUID(0, level, vendor[0], vendor[2], vendor[1]);
1042 vendor[3] = 0;
1043 cpu_set_vendor(ci, level, (const char *)vendor);
1044 }
1045
1046 cpu_init_msrs(ci);
1047
1048 #ifdef DEBUG
1049 if (ci->ci_flags & CPUF_PRESENT)
1050 panic("%s: already running!?", ci->ci_dev->dv_xname);
1051 #endif
1052 atomic_setbits_int(&ci->ci_flags, CPUF_PRESENT);
1053
1054 lapic_enable();
1055 cpu_ucode_apply(ci);
1056 cpu_tsx_disable(ci);
1057
1058 if ((ci->ci_flags & CPUF_IDENTIFIED) == 0) {
1059 /*
1060 * We need to wait until we can identify, otherwise dmesg
1061 * output will be messy.
1062 */
1063 while ((ci->ci_flags & CPUF_IDENTIFY) == 0)
1064 delay(10);
1065
1066 identifycpu(ci);
1067
1068 /* Prevent identifycpu() from running again */
1069 atomic_setbits_int(&ci->ci_flags, CPUF_IDENTIFIED);
1070
1071 /* Signal we're done */
1072 atomic_clearbits_int(&ci->ci_flags, CPUF_IDENTIFY);
1073 }
1074
1075 /* These have to run after identifycpu() */
1076 cpu_fix_msrs(ci);
1077
1078 /*
1079 * Test if our TSC is synchronized for the first time.
1080 * Note that interrupts are off at this point.
1081 */
1082 wbinvd();
1083 tsc_test_sync_ap(ci);
1084
1085 while ((ci->ci_flags & CPUF_GO) == 0)
1086 delay(10);
1087 #ifdef HIBERNATE
1088 if ((ci->ci_flags & CPUF_PARK) != 0) {
1089 if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT)
1090 lcr4(rcr4() & ~CR4_CET);
1091 atomic_clearbits_int(&ci->ci_flags, CPUF_PARK);
1092 hibernate_drop_to_real_mode();
1093 }
1094 #endif /* HIBERNATE */
1095
1096 #ifdef DEBUG
1097 if (ci->ci_flags & CPUF_RUNNING)
1098 panic("%s: already running!?", ci->ci_dev->dv_xname);
1099 #endif
1100
1101 cpu_init_idt();
1102 lapic_set_lvt();
1103 gdt_init_cpu(ci);
1104 fpuinit(ci);
1105
1106 lldt(0);
1107
1108 cpu_init(ci);
1109 #if NPVBUS > 0
1110 pvbus_init_cpu();
1111 #endif
1112
1113 /* Re-initialise memory range handling on AP */
1114 if (mem_range_softc.mr_op != NULL)
1115 mem_range_softc.mr_op->initAP(&mem_range_softc);
1116
1117 s = splhigh();
1118 lcr8(0);
1119 intr_enable();
1120 splx(s);
1121
1122 lapic_startclock();
1123
1124 sched_toidle();
1125 }
1126
1127 #if defined(DDB)
1128
1129 #include <ddb/db_output.h>
1130 #include <machine/db_machdep.h>
1131
1132 /*
1133 * Dump cpu information from ddb.
1134 */
1135 void
1136 cpu_debug_dump(void)
1137 {
1138 struct cpu_info *ci;
1139 CPU_INFO_ITERATOR cii;
1140
1141 db_printf("addr dev id flags ipis curproc\n");
1142 CPU_INFO_FOREACH(cii, ci) {
1143 db_printf("%p %s %u %x %x %10p\n",
1144 ci,
1145 ci->ci_dev == NULL ? "BOOT" : ci->ci_dev->dv_xname,
1146 ci->ci_cpuid,
1147 ci->ci_flags, ci->ci_ipis,
1148 ci->ci_curproc);
1149 }
1150 }
1151 #endif
1152
1153 int
1154 mp_cpu_start(struct cpu_info *ci)
1155 {
1156 unsigned short dwordptr[2];
1157
1158 /*
1159 * "The BSP must initialize CMOS shutdown code to 0Ah ..."
1160 */
1161
1162 outb(IO_RTC, NVRAM_RESET);
1163 outb(IO_RTC+1, NVRAM_RESET_JUMP);
1164
1165 /*
1166 * "and the warm reset vector (DWORD based at 40:67) to point
1167 * to the AP startup code ..."
1168 */
1169
1170 dwordptr[0] = 0;
1171 dwordptr[1] = MP_TRAMPOLINE >> 4;
1172
1173 pmap_kenter_pa(0, 0, PROT_READ | PROT_WRITE);
1174 memcpy((u_int8_t *) 0x467, dwordptr, 4);
1175 pmap_kremove(0, PAGE_SIZE);
1176
1177 #if NLAPIC > 0
1178 /*
1179 * ... prior to executing the following sequence:"
1180 */
1181
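/*
 * Classic INIT-SIPI-SIPI: one INIT IPI, then two STARTUP IPIs whose
 * vector encodes the page number of the real-mode trampoline.
 */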
1182 if (ci->ci_flags & CPUF_AP) {
1183 x86_ipi_init(ci->ci_apicid);
1184
1185 delay(10000);
1186
1187 if (cpu_feature & CPUID_APIC) {
1188 x86_ipi(MP_TRAMPOLINE/PAGE_SIZE, ci->ci_apicid,
1189 LAPIC_DLMODE_STARTUP);
1190 delay(200);
1191
1192 x86_ipi(MP_TRAMPOLINE/PAGE_SIZE, ci->ci_apicid,
1193 LAPIC_DLMODE_STARTUP);
1194 delay(200);
1195 }
1196 }
1197 #endif
1198 return 0;
1199 }
1200
1201 void
1202 mp_cpu_start_cleanup(struct cpu_info *ci)
1203 {
1204 /*
1205 * Ensure the NVRAM reset byte contains something vaguely sane.
1206 */
1207
1208 outb(IO_RTC, NVRAM_RESET);
1209 outb(IO_RTC+1, NVRAM_RESET_RST);
1210 }
1211 #endif /* MULTIPROCESSOR */
1212
1213 typedef void (vector)(void);
1214 extern vector Xsyscall_meltdown, Xsyscall, Xsyscall32;
1215
1216 void
1217 cpu_init_msrs(struct cpu_info *ci)
1218 {
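/*
 * STAR[47:32] is the kernel CS/SS selector base loaded by syscall;
 * STAR[63:48] is the base sysret derives the user SS (+8) and 64-bit
 * CS (+16) from, hence the GUDATA_SEL-1 bias.
 */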
1219 wrmsr(MSR_STAR,
1220 ((uint64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1221 ((uint64_t)GSEL(GUDATA_SEL-1, SEL_UPL) << 48));
1222 wrmsr(MSR_LSTAR, cpu_meltdown ? (uint64_t)Xsyscall_meltdown :
1223 (uint64_t)Xsyscall);
1224 wrmsr(MSR_CSTAR, 0);
1225 wrmsr(MSR_SFMASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_AC);
1226
1227 wrmsr(MSR_FSBASE, 0);
1228 wrmsr(MSR_GSBASE, (u_int64_t)ci);
1229 wrmsr(MSR_KERNELGSBASE, 0);
1230 patinit(ci);
1231 }
1232
1233 void
1234 cpu_fix_msrs(struct cpu_info *ci)
1235 {
1236 int family = ci->ci_family;
1237 uint64_t msr, nmsr;
1238
1239 if (ci->ci_vendor == CPUV_INTEL) {
1240 if ((family > 6 || (family == 6 && ci->ci_model >= 0xd)) &&
1241 rdmsr_safe(MSR_MISC_ENABLE, &msr) == 0 &&
1242 (msr & MISC_ENABLE_FAST_STRINGS) == 0) {
1243 msr |= MISC_ENABLE_FAST_STRINGS;
1244 wrmsr(MSR_MISC_ENABLE, msr);
1245 DPRINTF("%s: enabled fast strings\n", ci->ci_dev->dv_xname);
1246
1247 /*
1248 * Attempt to disable Silicon Debug and lock the configuration
1249 * if it's enabled and unlocked.
1250 */
1251 if (cpu_ecxfeature & CPUIDECX_SDBG) {
1252 msr = rdmsr(IA32_DEBUG_INTERFACE);
1253 if ((msr & IA32_DEBUG_INTERFACE_ENABLE) &&
1254 (msr & IA32_DEBUG_INTERFACE_LOCK) == 0) {
1255 msr &= IA32_DEBUG_INTERFACE_MASK;
1256 msr |= IA32_DEBUG_INTERFACE_LOCK;
1257 wrmsr(IA32_DEBUG_INTERFACE, msr);
1258 } else if (msr & IA32_DEBUG_INTERFACE_ENABLE)
1259 printf("%s: cannot disable silicon debug\n",
1260 ci->ci_dev->dv_xname);
1261 }
1262 }
1263 }
1264
1265 if (ci->ci_vendor == CPUV_AMD) {
1266 /* Apply AMD errata */
1267 amd64_errata(ci);
1268
1269 /*
1270 * "Mitigation G-2" per AMD's Whitepaper "Software Techniques
1271 * for Managing Speculation on AMD Processors"
1272 *
1273 * By setting MSR C001_1029[1]=1, LFENCE becomes a dispatch
1274 * serializing instruction.
1275 *
1276 * This MSR is available on all AMD families >= 10h, except 11h
1277 * where LFENCE is always serializing.
1278 */
1279 if (family >= 0x10 && family != 0x11) {
1280 nmsr = msr = rdmsr(MSR_DE_CFG);
1281 nmsr |= DE_CFG_SERIALIZE_LFENCE;
1282 if (msr != nmsr)
1283 wrmsr(MSR_DE_CFG, nmsr);
1284 }
1285 if (family == 0x17 && ci->ci_model >= 0x31 &&
1286 (cpu_ecxfeature & CPUIDECX_HV) == 0) {
1287 nmsr = msr = rdmsr(MSR_DE_CFG);
1288 nmsr |= DE_CFG_SERIALIZE_9;
1289 if (msr != nmsr)
1290 wrmsr(MSR_DE_CFG, nmsr);
1291 }
1292 }
1293
1294 #ifndef SMALL_KERNEL
1295 if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT) {
1296 msr = rdmsr(MSR_S_CET);
1297 wrmsr(MSR_S_CET, (msr & ~MSR_CET_NO_TRACK_EN) | MSR_CET_ENDBR_EN);
1298 lcr4(rcr4() | CR4_CET);
1299 }
1300 #endif
1301 }
1302
1303 void
1304 cpu_tsx_disable(struct cpu_info *ci)
1305 {
1306 uint64_t msr;
1307 uint32_t dummy, sefflags_edx;
1308
1309 /* this runs before identifycpu() populates ci_feature_sefflags_edx */
1310 if (ci->ci_cpuid_level < 0x07)
1311 return;
1312 CPUID_LEAF(0x7, 0, dummy, dummy, dummy, sefflags_edx);
1313
1314 if (ci->ci_vendor == CPUV_INTEL &&
1315 (sefflags_edx & SEFF0EDX_ARCH_CAP)) {
1316 msr = rdmsr(MSR_ARCH_CAPABILITIES);
1317 if (msr & ARCH_CAP_TSX_CTRL) {
1318 msr = rdmsr(MSR_TSX_CTRL);
1319 msr |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_TSX_CPUID_CLEAR;
1320 wrmsr(MSR_TSX_CTRL, msr);
1321 }
1322 }
1323 }
1324
1325 void
1326 patinit(struct cpu_info *ci)
1327 {
1328 extern int pmap_pg_wc;
1329 u_int64_t reg;
1330
1331 if ((cpu_feature & CPUID_PAT) == 0)
1332 return;
1333 /*
1334 * Set up PAT bits.
1335 * The default pat table is the following:
1336 * WB, WT, UC-, UC, WB, WT, UC-, UC
1337 * We change it to:
1338 * WB, WC, UC-, UC, WB, WC, UC-, UC
1339 * i.e change the WT bit to be WC.
1340 */
1341 reg = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
1342 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
1343 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
1344 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
1345
1346 wrmsr(MSR_CR_PAT, reg);
1347 pmap_pg_wc = PG_WC;
1348 }
1349
1350 struct timeout rdrand_tmo;
1351 void rdrand(void *);
1352
1353 void
1354 rdrand(void *v)
1355 {
1356 struct timeout *tmo = v;
1357 extern int has_rdrand, has_rdseed;
1358 union {
1359 uint64_t u64;
1360 uint32_t u32[2];
1361 } r, t;
1362 uint64_t tsc;
1363 uint8_t valid = 0;
1364
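/*
 * rdseed/rdrand set CF when they return a value; setc captures that
 * into 'valid'. Prefer rdseed and fall back to rdrand if it fails or
 * is absent.
 */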
1365 tsc = rdtsc();
1366 if (has_rdseed)
1367 __asm volatile(
1368 "rdseed %0\n\t"
1369 "setc %1\n"
1370 : "=r" (r.u64), "=qm" (valid) );
1371 if (has_rdrand && (has_rdseed == 0 || valid == 0))
1372 __asm volatile(
1373 "rdrand %0\n\t"
1374 "setc %1\n"
1375 : "=r" (r.u64), "=qm" (valid) );
1376
1377 t.u64 = tsc;
1378 t.u64 ^= r.u64;
1379 t.u64 ^= valid; /* potential rdrand empty */
1380 if (has_rdrand)
1381 t.u64 += rdtsc(); /* potential vmexit latency */
1382
1383 enqueue_randomness(t.u32[0]);
1384 enqueue_randomness(t.u32[1]);
1385
1386 if (tmo)
1387 timeout_add_msec(tmo, 10);
1388 }
1389
1390 int
1391 cpu_activate(struct device *self, int act)
1392 {
1393 struct cpu_softc *sc = (struct cpu_softc *)self;
1394
1395 switch (act) {
1396 case DVACT_RESUME:
1397 if (sc->sc_info->ci_cpuid == 0)
1398 rdrand(NULL);
1399 #if NPCTR > 0
1400 pctr_resume(sc->sc_info);
1401 #endif
1402 break;
1403 }
1404
1405 return (0);
1406 }
1407
1408 /*
1409 * cpu_enter_pages
1410 *
1411 * Requests mapping of various special pages required in the Intel Meltdown
1412 * case (to be entered into the U-K page table):
1413 *
1414 * 1 tss+gdt page for each CPU
1415 * 1 trampoline stack page for each CPU
1416 *
1417 * The cpu_info_full struct for each CPU straddles these pages. The offset into
1418 * 'cif' is calculated below, for each page. For more information, consult
1419 * the definition of struct cpu_info_full in cpu_full.h
1420 *
1421 * On CPUs unaffected by Meltdown, this function still configures 'cif' but
1422 * the calls to pmap_enter_special become no-ops.
1423 *
1424 * Parameters:
1425 * cif : the cpu_info_full structure describing a CPU whose pages are to be
1426 * entered into the special meltdown U-K page table.
1427 */
1428 void
1429 cpu_enter_pages(struct cpu_info_full *cif)
1430 {
1431 vaddr_t va;
1432 paddr_t pa;
1433
1434 /* The TSS+GDT need to be readable */
1435 va = (vaddr_t)cif;
1436 pmap_extract(pmap_kernel(), va, &pa);
1437 pmap_enter_special(va, pa, PROT_READ);
1438 DPRINTF("%s: entered tss+gdt page at va 0x%llx pa 0x%llx\n", __func__,
1439 (uint64_t)va, (uint64_t)pa);
1440
1441 /* The trampoline stack page needs to be read/write */
1442 va = (vaddr_t)&cif->cif_tramp_stack;
1443 pmap_extract(pmap_kernel(), va, &pa);
1444 pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
1445 DPRINTF("%s: entered t.stack page at va 0x%llx pa 0x%llx\n", __func__,
1446 (uint64_t)va, (uint64_t)pa);
1447
1448 cif->cif_tss.tss_rsp0 = va + sizeof(cif->cif_tramp_stack) - 16;
1449 DPRINTF("%s: cif_tss.tss_rsp0 = 0x%llx\n" ,__func__,
1450 (uint64_t)cif->cif_tss.tss_rsp0);
1451 cif->cif_cpu.ci_intr_rsp = cif->cif_tss.tss_rsp0 -
1452 sizeof(struct iretq_frame);
1453
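/*
 * Point an IST entry at the top of a dedicated per-CPU stack and
 * stash a pointer to the cpu_info just below that top so the trap
 * entry code can find it.
 */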
1454 #define SETUP_IST_SPECIAL_STACK(ist, cif, member) do { \
1455 (cif)->cif_tss.tss_ist[(ist)] = (vaddr_t)&(cif)->member + \
1456 sizeof((cif)->member) - 16; \
1457 (cif)->member[nitems((cif)->member) - 2] = (int64_t)&(cif)->cif_cpu; \
1458 } while (0)
1459
1460 SETUP_IST_SPECIAL_STACK(0, cif, cif_dblflt_stack);
1461 SETUP_IST_SPECIAL_STACK(1, cif, cif_nmi_stack);
1462
1463 /* an empty iomap, by setting its offset to the TSS limit */
1464 cif->cif_tss.tss_iobase = sizeof(cif->cif_tss);
1465 }
1466
1467 #ifdef MULTIPROCESSOR
1468 int
1469 wbinvd_on_all_cpus(void)
1470 {
1471 x86_broadcast_ipi(X86_IPI_WBINVD);
1472 wbinvd();
1473 return 0;
1474 }
1475
1476 volatile long wbinvd_wait __attribute__((section(".kudata")));
1477
1478 void
1479 wbinvd_on_all_cpus_acked(void)
1480 {
1481 struct cpu_info *ci, *self = curcpu();
1482 CPU_INFO_ITERATOR cii;
1483 long wait = 0;
1484 u_int64_t mask = 0;
1485 int s;
1486
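/*
 * wbinvd_wait counts the remote CPUs still flushing; each one's
 * WBINVD IPI handler is expected to decrement it. The CAS loop keeps
 * concurrent callers from overlapping a broadcast.
 */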
1487 CPU_INFO_FOREACH(cii, ci) {
1488 if (ci == self)
1489 continue;
1490 mask |= (1ULL << ci->ci_cpuid);
1491 wait++;
1492 }
1493
1494 KASSERT(wait > 0);
1495
1496 s = splvm();
1497 while (atomic_cas_ulong(&wbinvd_wait, 0, wait) != 0) {
1498 while (wbinvd_wait != 0) {
1499 CPU_BUSY_CYCLE();
1500 }
1501 }
1502
1503 CPU_INFO_FOREACH(cii, ci) {
1504 if ((mask & (1ULL << ci->ci_cpuid)) == 0)
1505 continue;
1506 if (x86_fast_ipi(ci, LAPIC_IPI_WBINVD) != 0)
1507 panic("%s: ipi failed", __func__);
1508 }
1509 splx(s);
1510
1511 wbinvd();
1512
1513 while (wbinvd_wait != 0)
1514 CPU_BUSY_CYCLE();
1515 }
1516 #endif /* MULTIPROCESSOR */
1517
1518 int cpu_suspended;
1519 int cpu_wakeups;
1520
1521 #ifdef SUSPEND
1522
1523 void
1524 cpu_suspend_cycle(void)
1525 {
1526 if (cpu_suspend_cycle_fcn)
1527 cpu_suspend_cycle_fcn();
1528 else
1529 cpu_idle_cycle_fcn();
1530 }
1531
1532 int
1533 cpu_suspend_primary(void)
1534 {
1535 struct cpu_info *ci = curcpu();
1536
1537 /* Mask clock interrupts. */
1538 local_pic.pic_hwmask(&local_pic, 0);
1539
1540 /*
1541 * All non-wakeup interrupts should be masked at this point;
1542 * re-enable interrupts such that wakeup interrupts actually
1543 * wake us up. Set a flag such that drivers can tell we're
1544 * suspended and change their behaviour accordingly. They can
1545 * wake us up by clearing the flag.
1546 */
1547 cpu_suspended = 1;
1548 ci->ci_ilevel = IPL_NONE;
1549 intr_enable();
1550
1551 while (cpu_suspended) {
1552 cpu_suspend_cycle();
1553 cpu_wakeups++;
1554 }
1555
1556 intr_disable();
1557 ci->ci_ilevel = IPL_HIGH;
1558
1559 /* Unmask clock interrupts. */
1560 local_pic.pic_hwunmask(&local_pic, 0);
1561
1562 return 0;
1563 }
1564
1565 #endif
1566