1 /*	$OpenBSD: cpu.c,v 1.185 2024/04/03 02:01:21 guenther Exp $	*/
2 /* $NetBSD: cpu.c,v 1.1 2003/04/26 18:39:26 fvdl Exp $ */
3 
4 /*-
5  * Copyright (c) 2000 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by RedBack Networks Inc.
10  *
11  * Author: Bill Sommerfeld
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
26  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  * POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 /*
36  * Copyright (c) 1999 Stefan Grefen
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. All advertising materials mentioning features or use of this software
47  *    must display the following acknowledgement:
48  *      This product includes software developed by the NetBSD
49  *      Foundation, Inc. and its contributors.
50  * 4. Neither the name of The NetBSD Foundation nor the names of its
51  *    contributors may be used to endorse or promote products derived
52  *    from this software without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY
55  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR AND CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  */
66 
67 #include "lapic.h"
68 #include "ioapic.h"
69 #include "vmm.h"
70 #include "pctr.h"
71 #include "pvbus.h"
72 
73 #include <sys/param.h>
74 #include <sys/proc.h>
75 #include <sys/timeout.h>
76 #include <sys/systm.h>
77 #include <sys/device.h>
78 #include <sys/malloc.h>
79 #include <sys/memrange.h>
80 #include <sys/atomic.h>
81 #include <sys/user.h>
82 
83 #include <uvm/uvm_extern.h>
84 
85 #include <machine/codepatch.h>
86 #include <machine/cpu_full.h>
87 #include <machine/cpufunc.h>
88 #include <machine/cpuvar.h>
89 #include <machine/pmap.h>
90 #include <machine/vmparam.h>
91 #include <machine/mpbiosvar.h>
92 #include <machine/pcb.h>
93 #include <machine/specialreg.h>
94 #include <machine/segments.h>
95 #include <machine/gdt.h>
96 #include <machine/pio.h>
97 #include <machine/vmmvar.h>
98 
99 #if NLAPIC > 0
100 #include <machine/i82489reg.h>
101 #include <machine/i82489var.h>
102 #endif
103 
104 #if NIOAPIC > 0
105 #include <machine/i82093var.h>
106 #endif
107 
108 #if NPCTR > 0
109 #include <machine/pctr.h>
110 #endif
111 
112 #if NPVBUS > 0
113 #include <dev/pv/pvvar.h>
114 #endif
115 
116 #include <dev/ic/mc146818reg.h>
117 #include <amd64/isa/nvram.h>
118 #include <dev/isa/isareg.h>
119 
120 #ifdef HIBERNATE
121 #include <sys/hibernate.h>
122 #include <machine/hibernate.h>
123 #endif /* HIBERNATE */
124 
125 /* #define CPU_DEBUG */
126 
127 #ifdef CPU_DEBUG
128 #define DPRINTF(x...)	do { printf(x); } while(0)
129 #else
130 #define DPRINTF(x...)
131 #endif /* CPU_DEBUG */
132 
133 int     cpu_match(struct device *, void *, void *);
134 void    cpu_attach(struct device *, struct device *, void *);
135 int     cpu_activate(struct device *, int);
136 void	patinit(struct cpu_info *ci);
137 #if NVMM > 0
138 void	cpu_init_vmm(struct cpu_info *ci);
139 #endif /* NVMM > 0 */
140 
141 struct cpu_softc {
142 	struct device sc_dev;		/* device tree glue */
143 	struct cpu_info *sc_info;	/* pointer to CPU info */
144 };
145 
146 void	replacesmap(void);
147 void	replacemeltdown(void);
148 void	replacemds(void);
149 
150 extern long _stac;
151 extern long _clac;
152 
153 int cpuid_level = 0;		/* MIN cpuid(0).eax */
154 char cpu_vendor[16] = { 0 };	/* CPU0's cpuid(0).e[bdc]x, \0 */
155 int cpu_id = 0;			/* cpuid(1).eax */
156 int cpu_ebxfeature = 0;		/* cpuid(1).ebx */
157 int cpu_ecxfeature = 0;		/* cpuid(1).ecx */
158 int cpu_feature = 0;		/* cpuid(1).edx */
159 int cpu_perf_eax = 0;		/* cpuid(0xa).eax */
160 int cpu_perf_ebx = 0;		/* cpuid(0xa).ebx */
161 int cpu_perf_edx = 0;		/* cpuid(0xa).edx */
162 int cpu_apmi_edx = 0;		/* cpuid(0x80000007).edx */
163 int ecpu_ecxfeature = 0;	/* cpuid(0x80000001).ecx */
164 int cpu_meltdown = 0;
165 int cpu_use_xsaves = 0;
166 int need_retpoline = 1;		/* most systems need retpoline */
167 
168 void
169 replacesmap(void)
170 {
171 	static int replacedone = 0;
172 	int s;
173 
174 	if (replacedone)
175 		return;
176 	replacedone = 1;
177 
178 	s = splhigh();
179 
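	/*
	 * stac and clac are 3-byte instructions; they are patched over
	 * the placeholder bytes at the STAC/CLAC codepatch tags so that
	 * the kernel raises and lowers SMAP around its intended
	 * userspace accesses.
	 */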
180 	codepatch_replace(CPTAG_STAC, &_stac, 3);
181 	codepatch_replace(CPTAG_CLAC, &_clac, 3);
182 
183 	splx(s);
184 }
185 
186 void
187 replacemeltdown(void)
188 {
189 	static int replacedone = 0;
190 	struct cpu_info *ci = &cpu_info_primary;
191 	int swapgs_vuln = 0, ibrs = 0, s, ibpb = 0;
192 
193 	if (ci->ci_vendor == CPUV_INTEL) {
194 		int family = ci->ci_family;
195 		int model = ci->ci_model;
196 
197 		swapgs_vuln = 1;
198 		if (family == 0x6 &&
199 		    (model == 0x37 || model == 0x4a || model == 0x4c ||
200 		     model == 0x4d || model == 0x5a || model == 0x5d ||
201 		     model == 0x6e || model == 0x65 || model == 0x75)) {
202 			/* Silvermont, Airmont */
203 			swapgs_vuln = 0;
204 		} else if (family == 0x6 && (model == 0x85 || model == 0x57)) {
205 			/* KnightsLanding */
206 			swapgs_vuln = 0;
207 		}
208 		if ((ci->ci_feature_sefflags_edx & SEFF0EDX_ARCH_CAP) &&
209 		    (rdmsr(MSR_ARCH_CAPABILITIES) & ARCH_CAP_IBRS_ALL)) {
210 			ibrs = 2;
211 		} else if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBRS) {
212 			ibrs = 1;
213 		}
214 		if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBRS)
215 			ibpb = 1;
216 	} else if (ci->ci_vendor == CPUV_AMD &&
217 	    ci->ci_pnfeatset >= 0x80000008) {
218 		if (ci->ci_feature_amdspec_ebx & CPUIDEBX_IBRS_ALWAYSON) {
219 			ibrs = 2;
220 		} else if ((ci->ci_feature_amdspec_ebx & CPUIDEBX_IBRS) &&
221 		    (ci->ci_feature_amdspec_ebx & CPUIDEBX_IBRS_PREF)) {
222 			ibrs = 1;
223 		}
224 		if (ci->ci_feature_amdspec_ebx & CPUIDEBX_IBPB)
225 			ibpb = 1;
226 	}
227 
228 	/* Enhanced IBRS: turn it on once on each CPU and don't touch again */
229 	if (ibrs == 2)
230 		wrmsr(MSR_SPEC_CTRL, SPEC_CTRL_IBRS);
231 
232 	if (replacedone)
233 		return;
234 	replacedone = 1;
235 
236 	s = splhigh();
237 
238 	/* If we don't have IBRS/IBPB, then don't use IBPB */
239 	if (ibpb == 0)
240 		codepatch_nop(CPTAG_IBPB_NOP);
241 
242 	if (ibrs == 2 || (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT)) {
243 		extern const char _jmprax, _jmpr11, _jmpr13;
244 		extern const short _jmprax_len, _jmpr11_len, _jmpr13_len;
245 
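		/*
		 * With enhanced IBRS or IBT enforcement the retpoline
		 * thunks are not needed, so patch them into plain
		 * indirect jumps through %rax/%r11/%r13 (the _jmp* stubs)
		 * and note that retpolines are no longer in use.
		 */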
246 		codepatch_replace(CPTAG_RETPOLINE_RAX, &_jmprax, _jmprax_len);
247 		codepatch_replace(CPTAG_RETPOLINE_R11, &_jmpr11, _jmpr11_len);
248 		codepatch_replace(CPTAG_RETPOLINE_R13, &_jmpr13, _jmpr13_len);
249 		need_retpoline = 0;
250 	}
251 
252 	if (!cpu_meltdown)
253 		codepatch_nop(CPTAG_MELTDOWN_NOP);
254 	else {
255 		extern long alltraps_kern_meltdown;
256 
257 		/* eliminate conditional branch in alltraps */
258 		codepatch_jmp(CPTAG_MELTDOWN_ALLTRAPS, &alltraps_kern_meltdown);
259 
260 		/* enable reuse of PCID for U-K page tables */
261 		if (pmap_use_pcid) {
262 			extern long _pcid_set_reuse;
263 			DPRINTF("%s: codepatching PCID use\n", __func__);
264 			codepatch_replace(CPTAG_PCID_SET_REUSE,
265 			    &_pcid_set_reuse, PCID_SET_REUSE_SIZE);
266 		}
267 	}
268 
269 	/*
270 	 * CVE-2019-1125: if the CPU has SMAP and it's not vulnerable to
271 	 * Meltdown, then it's protected both from speculatively mis-skipping
272 	 * the swapgs during interrupts of userspace and from speculatively
273 	 * mis-taking a swapgs during interrupts while already in the kernel
274 	 * as the speculative path will fault from SMAP.  Warning: enabling
275 	 * WRGSBASE would break this 'protection'.
276 	 *
277 	 * Otherwise, if the CPU's swapgs can't be speculated over and it
278 	 * _is_ vulnerable to Meltdown then the %cr3 change will serialize
279 	 * user->kern transitions, but we still need to mitigate the
280 	 * already-in-kernel cases.
281 	 */
282 	if (!cpu_meltdown && (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMAP)) {
283 		codepatch_nop(CPTAG_FENCE_SWAPGS_MIS_TAKEN);
284 		codepatch_nop(CPTAG_FENCE_NO_SAFE_SMAP);
285 	} else if (!swapgs_vuln && cpu_meltdown) {
286 		codepatch_nop(CPTAG_FENCE_SWAPGS_MIS_TAKEN);
287 	}
288 	splx(s);
289 }
290 
291 void
292 replacemds(void)
293 {
294 	static int replacedone = 0;
295 	extern long mds_handler_bdw, mds_handler_ivb, mds_handler_skl;
296 	extern long mds_handler_skl_sse, mds_handler_skl_avx;
297 	extern long mds_handler_silvermont, mds_handler_knights;
298 	struct cpu_info *ci = &cpu_info_primary;
299 	CPU_INFO_ITERATOR cii;
300 	void *handler = NULL, *vmm_handler = NULL;
301 	const char *type;
302 	int use_verw = 0, s;
303 	uint32_t cap = 0;
304 
305 	/* ci_mds_tmp must be 32-byte aligned for AVX instructions */
306 	CTASSERT((offsetof(struct cpu_info, ci_mds_tmp) -
307 		  offsetof(struct cpu_info, ci_PAGEALIGN)) % 32 == 0);
308 
309 	if (replacedone)
310 		return;
311 	replacedone = 1;
312 
313 	if (ci->ci_vendor != CPUV_INTEL)
314 		goto notintel;	/* VERW only needed on Intel */
315 
316 	if ((ci->ci_feature_sefflags_edx & SEFF0EDX_ARCH_CAP))
317 		cap = rdmsr(MSR_ARCH_CAPABILITIES);
318 
319 	if (cap & ARCH_CAP_MDS_NO) {
320 		/* Unaffected, nop out the handling code */
321 	} else if (ci->ci_feature_sefflags_edx & SEFF0EDX_MD_CLEAR) {
322 		/* new firmware, use VERW */
323 		use_verw = 1;
324 	} else {
325 		int family = ci->ci_family;
326 		int model = ci->ci_model;
327 		int stepping = CPUID2STEPPING(ci->ci_signature);
328 
329 		if (family == 0x6 &&
330 		    (model == 0x2e || model == 0x1e || model == 0x1f ||
331 		     model == 0x1a || model == 0x2f || model == 0x25 ||
332 		     model == 0x2c || model == 0x2d || model == 0x2a ||
333 		     model == 0x3e || model == 0x3a)) {
334 			/* Nehalem, SandyBridge, IvyBridge */
335 			handler = vmm_handler = &mds_handler_ivb;
336 			type = "IvyBridge";
337 			CPU_INFO_FOREACH(cii, ci) {
338 				ci->ci_mds_buf = malloc(672, M_DEVBUF,
339 				    M_WAITOK);
340 				memset(ci->ci_mds_buf, 0, 16);
341 			}
342 		} else if (family == 0x6 &&
343 		    (model == 0x3f || model == 0x3c || model == 0x45 ||
344 		     model == 0x46 || model == 0x56 || model == 0x4f ||
345 		     model == 0x47 || model == 0x3d)) {
346 			/* Haswell and Broadwell */
347 			handler = vmm_handler = &mds_handler_bdw;
348 			type = "Broadwell";
349 			CPU_INFO_FOREACH(cii, ci) {
350 				ci->ci_mds_buf = malloc(1536, M_DEVBUF,
351 				    M_WAITOK);
352 			}
353 		} else if (family == 0x6 &&
354 		    ((model == 0x55 && stepping <= 5) || model == 0x4e ||
355 		    model == 0x5e || (model == 0x8e && stepping <= 0xb) ||
356 		    (model == 0x9e && stepping <= 0xc))) {
357 			/*
358 			 * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
359 			 * CascadeLake
360 			 */
361 			/* XXX mds_handler_skl_avx512 */
362 			if (xgetbv(0) & XFEATURE_AVX) {
363 				handler = &mds_handler_skl_avx;
364 				type = "Skylake AVX";
365 			} else {
366 				handler = &mds_handler_skl_sse;
367 				type = "Skylake SSE";
368 			}
369 			vmm_handler = &mds_handler_skl;
370 			CPU_INFO_FOREACH(cii, ci) {
371 				vaddr_t b64;
372 				b64 = (vaddr_t)malloc(6 * 1024 + 64 + 63,
373 				    M_DEVBUF, M_WAITOK);
374 				ci->ci_mds_buf = (void *)((b64 + 63) & ~63);
375 				memset(ci->ci_mds_buf, 0, 64);
376 			}
377 		} else if (family == 0x6 &&
378 		    (model == 0x37 || model == 0x4a || model == 0x4c ||
379 		     model == 0x4d || model == 0x5a || model == 0x5d ||
380 		     model == 0x6e || model == 0x65 || model == 0x75)) {
381 			/* Silvermont, Airmont */
382 			handler = vmm_handler = &mds_handler_silvermont;
383 			type = "Silvermont";
384 			CPU_INFO_FOREACH(cii, ci) {
385 				ci->ci_mds_buf = malloc(256, M_DEVBUF,
386 				    M_WAITOK);
387 				memset(ci->ci_mds_buf, 0, 16);
388 			}
389 		} else if (family == 0x6 && (model == 0x85 || model == 0x57)) {
390 			handler = vmm_handler = &mds_handler_knights;
391 			type = "KnightsLanding";
392 			CPU_INFO_FOREACH(cii, ci) {
393 				vaddr_t b64;
394 				b64 = (vaddr_t)malloc(1152 + 63, M_DEVBUF,
395 				    M_WAITOK);
396 				ci->ci_mds_buf = (void *)((b64 + 63) & ~63);
397 			}
398 		}
399 	}
400 
401 	/* Register File Data Sampling (RFDS) also has a VERW workaround */
402 	if ((cap & ARCH_CAP_RFDS_NO) == 0 && (cap & ARCH_CAP_RFDS_CLEAR))
403 		use_verw = 1;
404 
405 	if (handler != NULL) {
406 		printf("cpu0: using %s MDS workaround%s\n", type, "");
407 		s = splhigh();
408 		codepatch_call(CPTAG_MDS, handler);
409 		codepatch_call(CPTAG_MDS_VMM, vmm_handler);
410 		splx(s);
411 	} else if (use_verw) {
412 		/*
413 		 * The new firmware enhances L1D_FLUSH MSR to flush MDS too,
414 		 * but keep the verw if affected by RFDS
415 		 */
416 		if ((cap & ARCH_CAP_RFDS_NO) == 0 && (cap & ARCH_CAP_RFDS_CLEAR)) {
417 			type = "";
418 		} else if (cpu_info_primary.ci_vmm_cap.vcc_vmx.vmx_has_l1_flush_msr == 1) {
419 			s = splhigh();
420 			codepatch_nop(CPTAG_MDS_VMM);
421 			splx(s);
422 			type = " (except on vmm entry)";
423 		} else {
424 			type = "";
425 		}
426 		printf("cpu0: using %s MDS workaround%s\n", "VERW", type);
427 	} else {
428 notintel:
429 		s = splhigh();
430 		codepatch_nop(CPTAG_MDS);
431 		codepatch_nop(CPTAG_MDS_VMM);
432 		splx(s);
433 	}
434 }
435 
436 #ifdef MULTIPROCESSOR
437 int mp_cpu_start(struct cpu_info *);
438 void mp_cpu_start_cleanup(struct cpu_info *);
439 struct cpu_functions mp_cpu_funcs = { mp_cpu_start, NULL,
440 				      mp_cpu_start_cleanup };
441 #endif /* MULTIPROCESSOR */
442 
443 const struct cfattach cpu_ca = {
444 	sizeof(struct cpu_softc), cpu_match, cpu_attach, NULL, cpu_activate
445 };
446 
447 struct cfdriver cpu_cd = {
448 	NULL, "cpu", DV_DULL
449 };
450 
451 /*
452  * Statically-allocated CPU info for the primary CPU (or the only
453  * CPU, on uniprocessors).  The CPU info list is initialized to
454  * point at it.
455  */
456 struct cpu_info_full cpu_info_full_primary = { .cif_cpu = { .ci_self = &cpu_info_primary } };
457 
458 struct cpu_info *cpu_info_list = &cpu_info_primary;
459 
460 #ifdef MULTIPROCESSOR
461 /*
462  * Array of CPU info structures.  Must be statically-allocated because
463  * curproc, etc. are used early.
464  */
465 struct cpu_info *cpu_info[MAXCPUS] = { &cpu_info_primary };
466 
467 void    	cpu_hatch(void *);
468 void    	cpu_boot_secondary(struct cpu_info *ci);
469 void    	cpu_start_secondary(struct cpu_info *ci);
470 #endif
471 
472 int
473 cpu_match(struct device *parent, void *match, void *aux)
474 {
475 	struct cfdata *cf = match;
476 	struct cpu_attach_args *caa = aux;
477 
478 	if (strcmp(caa->caa_name, cf->cf_driver->cd_name) != 0)
479 		return 0;
480 
481 	if (cf->cf_unit >= MAXCPUS)
482 		return 0;
483 
484 	return 1;
485 }
486 
487 void	cpu_idle_mwait_cycle(void);
488 void	cpu_init_mwait(struct cpu_softc *, struct cpu_info *);
489 
490 u_int	cpu_mwait_size, cpu_mwait_states;
491 
492 void
493 cpu_idle_mwait_cycle(void)
494 {
495 	struct cpu_info *ci = curcpu();
496 
497 	if ((read_rflags() & PSL_I) == 0)
498 		panic("idle with interrupts blocked!");
499 
500 	/* something already queued? */
501 	if (!cpu_is_idle(ci))
502 		return;
503 
504 	/*
505 	 * About to idle; setting the MWAIT_IN_IDLE bit tells
506 	 * cpu_unidle() that it can't be a no-op and tells cpu_kick()
507 	 * that it doesn't need to use an IPI.  We also set the
508 	 * MWAIT_KEEP_IDLING bit: those routines clear it to stop
509 	 * the mwait.  Once they're set, we do a final check of the
510 	 * queue, in case another cpu called setrunqueue() and added
511 	 * something to the queue and called cpu_unidle() between
512 	 * the check in sched_idle() and here.
513 	 */
514 	atomic_setbits_int(&ci->ci_mwait, MWAIT_IDLING | MWAIT_ONLY);
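	/*
	 * monitor() arms the hardware monitor on ci_mwait and mwait()
	 * then stalls until that cache line is written (for example by
	 * cpu_unidle() clearing MWAIT_KEEP_IDLING) or an interrupt
	 * arrives; the MWAIT_IDLING re-check in between catches a
	 * wakeup that raced with arming the monitor.
	 */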
515 	if (cpu_is_idle(ci)) {
516 		monitor(&ci->ci_mwait, 0, 0);
517 		if ((ci->ci_mwait & MWAIT_IDLING) == MWAIT_IDLING)
518 			mwait(0, 0);
519 	}
520 
521 	/* done idling; let cpu_kick() know that an IPI is required */
522 	atomic_clearbits_int(&ci->ci_mwait, MWAIT_IDLING);
523 }
524 
525 void
526 cpu_init_mwait(struct cpu_softc *sc, struct cpu_info *ci)
527 {
528 	unsigned int smallest, largest, extensions, c_substates;
529 
530 	if ((cpu_ecxfeature & CPUIDECX_MWAIT) == 0 || ci->ci_cpuid_level < 0x5)
531 		return;
532 
533 	/* get the monitor granularity */
534 	CPUID(0x5, smallest, largest, extensions, cpu_mwait_states);
535 	smallest &= 0xffff;
536 	largest  &= 0xffff;
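	/*
	 * CPUID leaf 5: eax[15:0] is the smallest and ebx[15:0] the
	 * largest monitor-line size, ecx bit 0 marks the extension
	 * fields as valid, ecx bit 1 is IBE, and edx holds one nibble
	 * per C-state giving its number of MWAIT sub-states, e.g.
	 * edx 0x00000120 would mean 2 C1 sub-states and 1 C2 sub-state.
	 */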
537 
538 	/* mask out states C6/C7 in 31:24 for CHT45 errata */
539 	if (ci->ci_vendor == CPUV_INTEL &&
540 	    ci->ci_family == 0x06 && ci->ci_model == 0x4c)
541 		cpu_mwait_states &= 0x00ffffff;
542 
543 	printf("%s: mwait min=%u, max=%u", sc->sc_dev.dv_xname,
544 	    smallest, largest);
545 	if (extensions & 0x1) {
546 		if (cpu_mwait_states > 0) {
547 			c_substates = cpu_mwait_states;
548 			printf(", C-substates=%u", 0xf & c_substates);
549 			while ((c_substates >>= 4) > 0)
550 				printf(".%u", 0xf & c_substates);
551 		}
552 		if (extensions & 0x2)
553 			printf(", IBE");
554 	} else {
555 		/* substates not supported, forge the default: just C1 */
556 		cpu_mwait_states = 1 << 4;
557 	}
558 
559 	/* paranoia: check the values */
560 	if (smallest < sizeof(int) || largest < smallest ||
561 	    (largest & (sizeof(int)-1)))
562 		printf(" (bogus)");
563 	else
564 		cpu_mwait_size = largest;
565 	printf("\n");
566 
567 	/* enable use of mwait; may be overridden by acpicpu later */
568 	if (cpu_mwait_size > 0)
569 		cpu_idle_cycle_fcn = &cpu_idle_mwait_cycle;
570 }
571 
572 void
573 cpu_attach(struct device *parent, struct device *self, void *aux)
574 {
575 	struct cpu_softc *sc = (void *) self;
576 	struct cpu_attach_args *caa = aux;
577 	struct cpu_info *ci;
578 #if defined(MULTIPROCESSOR)
579 	int cpunum = sc->sc_dev.dv_unit;
580 	vaddr_t kstack;
581 	struct pcb *pcb;
582 #endif
583 
584 	/*
585 	 * If we're an Application Processor, allocate a cpu_info
586 	 * structure, otherwise use the primary's.
587 	 */
588 	if (caa->cpu_role == CPU_ROLE_AP) {
589 		struct cpu_info_full *cif;
590 
591 		cif = km_alloc(sizeof *cif, &kv_any, &kp_zero, &kd_waitok);
592 		ci = &cif->cif_cpu;
593 #if defined(MULTIPROCESSOR)
594 		ci->ci_tss = &cif->cif_tss;
595 		ci->ci_gdt = &cif->cif_gdt;
596 		memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE);
597 		cpu_enter_pages(cif);
598 		if (cpu_info[cpunum] != NULL)
599 			panic("cpu at apic id %d already attached?", cpunum);
600 		cpu_info[cpunum] = ci;
601 #endif
602 #ifdef TRAPLOG
603 		ci->ci_tlog_base = malloc(sizeof(struct tlog),
604 		    M_DEVBUF, M_WAITOK);
605 #endif
606 	} else {
607 		ci = &cpu_info_primary;
608 #if defined(MULTIPROCESSOR)
609 		if (caa->cpu_apicid != lapic_cpu_number()) {
610 			panic("%s: running cpu is at apic %d"
611 			    " instead of at expected %d",
612 			    sc->sc_dev.dv_xname, lapic_cpu_number(), caa->cpu_apicid);
613 		}
614 #endif
615 	}
616 
617 	ci->ci_self = ci;
618 	sc->sc_info = ci;
619 
620 	ci->ci_dev = self;
621 	ci->ci_apicid = caa->cpu_apicid;
622 	ci->ci_acpi_proc_id = caa->cpu_acpi_proc_id;
623 #ifdef MULTIPROCESSOR
624 	ci->ci_cpuid = cpunum;
625 #else
626 	ci->ci_cpuid = 0;	/* False for APs, but they're not used anyway */
627 #endif
628 	ci->ci_func = caa->cpu_func;
629 	ci->ci_handled_intr_level = IPL_NONE;
630 
631 #ifndef SMALL_KERNEL
632 	strlcpy(ci->ci_sensordev.xname, ci->ci_dev->dv_xname,
633 	    sizeof(ci->ci_sensordev.xname));
634 #endif
635 
636 #if defined(MULTIPROCESSOR)
637 	/*
638 	 * Allocate UPAGES contiguous pages for the idle PCB and stack.
639 	 */
640 	kstack = (vaddr_t)km_alloc(USPACE, &kv_any, &kp_dirty, &kd_nowait);
641 	if (kstack == 0) {
642 		if (caa->cpu_role != CPU_ROLE_AP) {
643 			panic("cpu_attach: unable to allocate idle stack for"
644 			    " primary");
645 		}
646 		printf("%s: unable to allocate idle stack\n",
647 		    sc->sc_dev.dv_xname);
648 		return;
649 	}
650 	pcb = ci->ci_idle_pcb = (struct pcb *) kstack;
651 	memset(pcb, 0, USPACE);
652 
653 	pcb->pcb_kstack = kstack + USPACE - 16;
654 	pcb->pcb_rbp = pcb->pcb_rsp = kstack + USPACE - 16;
655 	pcb->pcb_pmap = pmap_kernel();
656 	pcb->pcb_cr3 = pcb->pcb_pmap->pm_pdirpa;
657 #endif
658 
659 	/* further PCB init done later. */
660 
661 	printf(": ");
662 
663 	switch (caa->cpu_role) {
664 	case CPU_ROLE_SP:
665 		printf("(uniprocessor)\n");
666 		atomic_setbits_int(&ci->ci_flags,
667 		    CPUF_PRESENT | CPUF_SP | CPUF_PRIMARY);
668 		cpu_intr_init(ci);
669 		identifycpu(ci);
670 		cpu_fix_msrs(ci);
671 #ifdef MTRR
672 		mem_range_attach();
673 #endif /* MTRR */
674 		/* XXX SP fpuinit(ci) is done earlier */
675 		cpu_init(ci);
676 		cpu_init_mwait(sc, ci);
677 		break;
678 
679 	case CPU_ROLE_BP:
680 		printf("apid %d (boot processor)\n", caa->cpu_apicid);
681 		atomic_setbits_int(&ci->ci_flags,
682 		    CPUF_PRESENT | CPUF_BSP | CPUF_PRIMARY);
683 		cpu_intr_init(ci);
684 		identifycpu(ci);
685 		cpu_fix_msrs(ci);
686 #ifdef MTRR
687 		mem_range_attach();
688 #endif /* MTRR */
689 
690 #if NLAPIC > 0
691 		/*
692 		 * Enable local apic
693 		 */
694 		lapic_enable();
695 		lapic_calibrate_timer(ci);
696 #endif
697 		/* XXX BP fpuinit(ci) is done earlier */
698 		cpu_init(ci);
699 
700 #if NIOAPIC > 0
701 		ioapic_bsp_id = caa->cpu_apicid;
702 #endif
703 		cpu_init_mwait(sc, ci);
704 		break;
705 
706 	case CPU_ROLE_AP:
707 		/*
708 		 * report on an AP
709 		 */
710 		printf("apid %d (application processor)\n", caa->cpu_apicid);
711 
712 #if defined(MULTIPROCESSOR)
713 		cpu_intr_init(ci);
714 		cpu_start_secondary(ci);
715 		clockqueue_init(&ci->ci_queue);
716 		sched_init_cpu(ci);
717 		ncpus++;
718 		if (ci->ci_flags & CPUF_PRESENT) {
719 			ci->ci_next = cpu_info_list->ci_next;
720 			cpu_info_list->ci_next = ci;
721 		}
722 #else
723 		printf("%s: not started\n", sc->sc_dev.dv_xname);
724 #endif
725 		break;
726 
727 	default:
728 		panic("unknown processor type??");
729 	}
730 
731 #if defined(MULTIPROCESSOR)
732 	if (mp_verbose) {
733 		printf("%s: kstack at 0x%lx for %d bytes\n",
734 		    sc->sc_dev.dv_xname, kstack, USPACE);
735 		printf("%s: idle pcb at %p, idle sp at 0x%llx\n",
736 		    sc->sc_dev.dv_xname, pcb, pcb->pcb_rsp);
737 	}
738 #endif
739 #if NVMM > 0
740 	cpu_init_vmm(ci);
741 #endif /* NVMM > 0 */
742 
743 #ifndef SMALL_KERNEL
744 	if (ci->ci_sensordev.sensors_count > 0)
745 		sensordev_install(&ci->ci_sensordev);
746 #endif
747 }
748 
749 static void
750 replacexsave(int xsave_ext)
751 {
752 	extern long _xrstor, _xrstors, _xsave, _xsaves, _xsaveopt;
753 	static int replacedone = 0;
754 	int s;
755 
756 	if (replacedone)
757 		return;
758 	replacedone = 1;
759 
760 	s = splhigh();
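	/*
	 * Pick the most capable variant the CPU offers: XSAVES/XRSTORS
	 * if available, else XSAVEOPT for the save side, else plain
	 * XSAVE/XRSTOR.  The 64-bit forms of all of these fit the
	 * 4-byte patch sites used below.
	 */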
761 	codepatch_replace(CPTAG_XRSTORS,
762 	    (xsave_ext & XSAVE_XSAVES) ? &_xrstors : &_xrstor, 4);
763 	codepatch_replace(CPTAG_XRSTOR, &_xrstor, 4);
764 	codepatch_replace(CPTAG_XSAVE,
765 	    (xsave_ext & XSAVE_XSAVES) ? &_xsaves :
766 	    (xsave_ext & XSAVE_XSAVEOPT) ? &_xsaveopt : &_xsave, 4);
767 	splx(s);
768 }
769 
770 
771 /*
772  * Initialize the processor appropriately.
773  */
774 
775 void
776 cpu_init(struct cpu_info *ci)
777 {
778 	struct savefpu *sfp;
779 	u_int cr4;
780 
781 	/* configure the CPU if needed */
782 	if (ci->cpu_setup != NULL)
783 		(*ci->cpu_setup)(ci);
784 
785 	cr4 = rcr4() | CR4_DEFAULT;
786 	if (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMEP)
787 		cr4 |= CR4_SMEP;
788 	if (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMAP)
789 		cr4 |= CR4_SMAP;
790 	if (ci->ci_feature_sefflags_ecx & SEFF0ECX_UMIP)
791 		cr4 |= CR4_UMIP;
792 	if ((cpu_ecxfeature & CPUIDECX_XSAVE) && ci->ci_cpuid_level >= 0xd)
793 		cr4 |= CR4_OSXSAVE;
794 	if (pg_xo)
795 		cr4 |= CR4_PKE;
796 	if (pmap_use_pcid)
797 		cr4 |= CR4_PCIDE;
798 	lcr4(cr4);
799 
800 	if ((cpu_ecxfeature & CPUIDECX_XSAVE) && ci->ci_cpuid_level >= 0xd) {
801 		u_int32_t eax, ebx, ecx, edx;
802 
803 		xsave_mask = XFEATURE_X87 | XFEATURE_SSE;
804 		CPUID_LEAF(0xd, 0, eax, ebx, ecx, edx);
805 		xsave_mask |= eax & XFEATURE_AVX;
806 		xsetbv(0, xsave_mask);
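		/*
		 * CPUID(0xd, 0).ebx reports the save area size needed
		 * for the features currently enabled in XCR0, so it is
		 * re-read here after the xsetbv() above.
		 */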
807 		CPUID_LEAF(0xd, 0, eax, ebx, ecx, edx);
808 		if (CPU_IS_PRIMARY(ci)) {
809 			fpu_save_len = ebx;
810 			KASSERT(fpu_save_len <= sizeof(struct savefpu));
811 		} else {
812 			KASSERT(ebx == fpu_save_len);
813 		}
814 
815 		/* check for xsaves, xsaveopt, and supervisor features */
816 		CPUID_LEAF(0xd, 1, eax, ebx, ecx, edx);
817 		/* Disable XSAVES on AMD family 17h due to Erratum 1386 */
818 		if (ci->ci_vendor == CPUV_AMD &&
819 		    ci->ci_family == 0x17) {
820 			eax &= ~XSAVE_XSAVES;
821 		}
822 		if (eax & XSAVE_XSAVES) {
823 #ifndef SMALL_KERNEL
824 			if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT)
825 				xsave_mask |= ecx & XFEATURE_CET_U;
826 #endif
827 			if (xsave_mask & XFEATURE_XSS_MASK) {
828 				wrmsr(MSR_XSS, xsave_mask & XFEATURE_XSS_MASK);
829 				CPUID_LEAF(0xd, 1, eax, ebx, ecx, edx);
830 				KASSERT(ebx <= sizeof(struct savefpu));
831 			}
832 			if (CPU_IS_PRIMARY(ci))
833 				cpu_use_xsaves = 1;
834 		}
835 
836 		replacexsave(eax);
837 	}
838 
839 	if (CPU_IS_PRIMARY(ci)) {
840 		/* Clean our FPU save area */
841 		sfp = fpu_cleandata;
842 		memset(sfp, 0, fpu_save_len);
843 		sfp->fp_fxsave.fx_fcw = __INITIAL_NPXCW__;
844 		sfp->fp_fxsave.fx_mxcsr = __INITIAL_MXCSR__;
845 		xrstor_user(sfp, xsave_mask);
846 		if (cpu_use_xsaves || !xsave_mask)
847 			fpusave(sfp);
848 		else {
849 			/* must not use xsaveopt here */
850 			xsave(sfp, xsave_mask);
851 		}
852 	} else {
853 		fpureset();
854 	}
855 
856 #if NVMM > 0
857 	/* Re-enable VMM if needed */
858 	if (ci->ci_flags & CPUF_VMM)
859 		start_vmm_on_cpu(ci);
860 #endif /* NVMM > 0 */
861 
862 #ifdef MULTIPROCESSOR
863 	atomic_setbits_int(&ci->ci_flags, CPUF_RUNNING);
864 	/*
865 	 * Big hammer: flush all TLB entries, including ones from PTEs
866 	 * with the G bit set.  This should only be necessary if TLB
867 	 * shootdown falls far behind.
868 	 */
869 	cr4 = rcr4();
870 	lcr4(cr4 & ~CR4_PGE);
871 	lcr4(cr4);
872 
873 	/* Check if TSC is synchronized. */
874 	if (cold && !CPU_IS_PRIMARY(ci))
875 		tsc_test_sync_ap(ci);
876 #endif
877 }
878 
879 #if NVMM > 0
880 /*
881  * cpu_init_vmm
882  *
883  * Initializes per-cpu VMM state
884  *
885  * Parameters:
886  *  ci: the cpu for which state is being initialized
887  */
888 void
889 cpu_init_vmm(struct cpu_info *ci)
890 {
891 	/*
892 	 * Allocate a per-cpu VMXON region for VMX CPUs
893 	 */
894 	if (ci->ci_vmm_flags & CI_VMM_VMX) {
895 		ci->ci_vmxon_region = (struct vmxon_region *)malloc(PAGE_SIZE,
896 		    M_DEVBUF, M_WAITOK | M_ZERO);
897 		if (!pmap_extract(pmap_kernel(), (vaddr_t)ci->ci_vmxon_region,
898 		    &ci->ci_vmxon_region_pa))
899 			panic("Can't locate VMXON region in phys mem");
900 		ci->ci_vmcs_pa = VMX_VMCS_PA_CLEAR;
901 		rw_init(&ci->ci_vmcs_lock, "vmcslock");
902 	}
903 }
904 #endif /* NVMM > 0 */
905 
906 #ifdef MULTIPROCESSOR
907 void
908 cpu_boot_secondary_processors(void)
909 {
910 	struct cpu_info *ci;
911 	u_long i;
912 
913 	for (i=0; i < MAXCPUS; i++) {
914 		ci = cpu_info[i];
915 		if (ci == NULL)
916 			continue;
917 		if (ci->ci_idle_pcb == NULL)
918 			continue;
919 		if ((ci->ci_flags & CPUF_PRESENT) == 0)
920 			continue;
921 		if (ci->ci_flags & (CPUF_BSP | CPUF_SP | CPUF_PRIMARY))
922 			continue;
923 		ci->ci_randseed = (arc4random() & 0x7fffffff) + 1;
924 		cpu_boot_secondary(ci);
925 	}
926 }
927 
928 void
929 cpu_start_secondary(struct cpu_info *ci)
930 {
931 	int i;
932 	u_long s;
933 
934 	atomic_setbits_int(&ci->ci_flags, CPUF_AP);
935 
936 	pmap_kenter_pa(MP_TRAMPOLINE, MP_TRAMPOLINE, PROT_READ | PROT_EXEC);
937 	pmap_kenter_pa(MP_TRAMP_DATA, MP_TRAMP_DATA, PROT_READ | PROT_WRITE);
938 
939 	CPU_STARTUP(ci);
940 
941 	/*
942 	 * wait for it to become ready
943 	 */
944 	for (i = 100000; (!(ci->ci_flags & CPUF_PRESENT)) && i>0;i--) {
945 		delay(10);
946 	}
947 	if (! (ci->ci_flags & CPUF_PRESENT)) {
948 		printf("%s: failed to become ready\n", ci->ci_dev->dv_xname);
949 #if defined(MPDEBUG) && defined(DDB)
950 		printf("dropping into debugger; continue from here to resume boot\n");
951 		db_enter();
952 #endif
953 	}
954 
955 	if ((ci->ci_flags & CPUF_IDENTIFIED) == 0) {
956 		atomic_setbits_int(&ci->ci_flags, CPUF_IDENTIFY);
957 
958 		/* wait for it to identify */
959 		for (i = 2000000; (ci->ci_flags & CPUF_IDENTIFY) && i > 0; i--)
960 			delay(10);
961 
962 		if (ci->ci_flags & CPUF_IDENTIFY)
963 			printf("%s: failed to identify\n",
964 			    ci->ci_dev->dv_xname);
965 	}
966 
967 	if (ci->ci_flags & CPUF_IDENTIFIED) {
968 		/*
969 		 * Test if TSCs are synchronized.  Invalidate cache to
970 		 * minimize possible cache effects.  Disable interrupts to
971 		 * try to rule out external interference.
972 		 */
973 		s = intr_disable();
974 		wbinvd();
975 		tsc_test_sync_bp(curcpu());
976 		intr_restore(s);
977 	}
978 
979 	CPU_START_CLEANUP(ci);
980 
981 	pmap_kremove(MP_TRAMPOLINE, PAGE_SIZE);
982 	pmap_kremove(MP_TRAMP_DATA, PAGE_SIZE);
983 }
984 
985 void
986 cpu_boot_secondary(struct cpu_info *ci)
987 {
988 	int i;
989 	u_long s;
990 
991 	atomic_setbits_int(&ci->ci_flags, CPUF_GO);
992 
993 	for (i = 100000; (!(ci->ci_flags & CPUF_RUNNING)) && i>0;i--) {
994 		delay(10);
995 	}
996 	if (! (ci->ci_flags & CPUF_RUNNING)) {
997 		printf("cpu failed to start\n");
998 #if defined(MPDEBUG) && defined(DDB)
999 		printf("dropping into debugger; continue from here to resume boot\n");
1000 		db_enter();
1001 #endif
1002 	} else if (cold) {
1003 		/* Test if TSCs are synchronized again. */
1004 		s = intr_disable();
1005 		wbinvd();
1006 		tsc_test_sync_bp(curcpu());
1007 		intr_restore(s);
1008 	}
1009 }
1010 
1011 /*
1012  * The CPU ends up here when it's ready to run.
1013  * This is called from code in mptramp.s; at this point, we are running
1014  * in the idle pcb/idle stack of the new cpu.  When this function returns,
1015  * this processor will enter the idle loop and start looking for work.
1016  *
1017  * XXX should share some of this with init386 in machdep.c
1018  */
1019 void
1020 cpu_hatch(void *v)
1021 {
1022 	struct cpu_info *ci = (struct cpu_info *)v;
1023 	int s;
1024 
1025 	{
1026 		uint32_t vendor[4];
1027 		int level;
1028 
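		/*
		 * cpuid(0) returns the vendor string in %ebx, %edx, %ecx
		 * order, so %edx and %ecx are swapped into vendor[1] and
		 * vendor[2] to reassemble it; vendor[3] supplies the NUL.
		 */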
1029 		CPUID(0, level, vendor[0], vendor[2], vendor[1]);
1030 		vendor[3] = 0;
1031 		cpu_set_vendor(ci, level, (const char *)vendor);
1032 	}
1033 
1034 	cpu_init_msrs(ci);
1035 
1036 #ifdef DEBUG
1037 	if (ci->ci_flags & CPUF_PRESENT)
1038 		panic("%s: already running!?", ci->ci_dev->dv_xname);
1039 #endif
1040 	atomic_setbits_int(&ci->ci_flags, CPUF_PRESENT);
1041 
1042 	lapic_enable();
1043 	cpu_ucode_apply(ci);
1044 	cpu_tsx_disable(ci);
1045 
1046 	if ((ci->ci_flags & CPUF_IDENTIFIED) == 0) {
1047 		/*
1048 		 * We need to wait until we can identify, otherwise dmesg
1049 		 * output will be messy.
1050 		 */
1051 		while ((ci->ci_flags & CPUF_IDENTIFY) == 0)
1052 			delay(10);
1053 
1054 		identifycpu(ci);
1055 
1056 		/* Prevent identifycpu() from running again */
1057 		atomic_setbits_int(&ci->ci_flags, CPUF_IDENTIFIED);
1058 
1059 		/* Signal we're done */
1060 		atomic_clearbits_int(&ci->ci_flags, CPUF_IDENTIFY);
1061 	}
1062 
1063 	/* These have to run after identifycpu() */
1064 	cpu_fix_msrs(ci);
1065 
1066 	/*
1067 	 * Test if our TSC is synchronized for the first time.
1068 	 * Note that interrupts are off at this point.
1069 	 */
1070 	wbinvd();
1071 	tsc_test_sync_ap(ci);
1072 
1073 	while ((ci->ci_flags & CPUF_GO) == 0)
1074 		delay(10);
1075 #ifdef HIBERNATE
1076 	if ((ci->ci_flags & CPUF_PARK) != 0) {
1077 		if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT)
1078 			lcr4(rcr4() & ~CR4_CET);
1079 		atomic_clearbits_int(&ci->ci_flags, CPUF_PARK);
1080 		hibernate_drop_to_real_mode();
1081 	}
1082 #endif /* HIBERNATE */
1083 
1084 #ifdef DEBUG
1085 	if (ci->ci_flags & CPUF_RUNNING)
1086 		panic("%s: already running!?", ci->ci_dev->dv_xname);
1087 #endif
1088 
1089 	cpu_init_idt();
1090 	lapic_set_lvt();
1091 	gdt_init_cpu(ci);
1092 	fpuinit(ci);
1093 
1094 	lldt(0);
1095 
1096 	cpu_init(ci);
1097 #if NPVBUS > 0
1098 	pvbus_init_cpu();
1099 #endif
1100 
1101 	/* Re-initialise memory range handling on AP */
1102 	if (mem_range_softc.mr_op != NULL)
1103 		mem_range_softc.mr_op->initAP(&mem_range_softc);
1104 
1105 	s = splhigh();
1106 	lcr8(0);
1107 	intr_enable();
1108 	splx(s);
1109 
1110 	lapic_startclock();
1111 
1112 	sched_toidle();
1113 }
1114 
1115 #if defined(DDB)
1116 
1117 #include <ddb/db_output.h>
1118 #include <machine/db_machdep.h>
1119 
1120 /*
1121  * Dump cpu information from ddb.
1122  */
1123 void
1124 cpu_debug_dump(void)
1125 {
1126 	struct cpu_info *ci;
1127 	CPU_INFO_ITERATOR cii;
1128 
1129 	db_printf("addr		dev	id	flags	ipis	curproc\n");
1130 	CPU_INFO_FOREACH(cii, ci) {
1131 		db_printf("%p	%s	%u	%x	%x	%10p\n",
1132 		    ci,
1133 		    ci->ci_dev == NULL ? "BOOT" : ci->ci_dev->dv_xname,
1134 		    ci->ci_cpuid,
1135 		    ci->ci_flags, ci->ci_ipis,
1136 		    ci->ci_curproc);
1137 	}
1138 }
1139 #endif
1140 
1141 int
1142 mp_cpu_start(struct cpu_info *ci)
1143 {
1144 	unsigned short dwordptr[2];
1145 
1146 	/*
1147 	 * "The BSP must initialize CMOS shutdown code to 0Ah ..."
1148 	 */
1149 
1150 	outb(IO_RTC, NVRAM_RESET);
1151 	outb(IO_RTC+1, NVRAM_RESET_JUMP);
1152 
1153 	/*
1154 	 * "and the warm reset vector (DWORD based at 40:67) to point
1155 	 * to the AP startup code ..."
1156 	 */
1157 
1158 	dwordptr[0] = 0;
1159 	dwordptr[1] = MP_TRAMPOLINE >> 4;
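	/*
	 * 40:67 in real-mode segment:offset form is linear address
	 * 0x40 * 16 + 0x67 = 0x467.  The vector is stored offset first,
	 * so the AP resumes at (MP_TRAMPOLINE >> 4):0000, i.e. at
	 * MP_TRAMPOLINE itself (which is page aligned, so the shift is
	 * exact).
	 */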
1160 
1161 	pmap_kenter_pa(0, 0, PROT_READ | PROT_WRITE);
1162 	memcpy((u_int8_t *) 0x467, dwordptr, 4);
1163 	pmap_kremove(0, PAGE_SIZE);
1164 
1165 #if NLAPIC > 0
1166 	/*
1167 	 * ... prior to executing the following sequence:"
1168 	 */
1169 
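	/*
	 * Standard INIT/SIPI/SIPI startup: send INIT, wait 10ms, then
	 * two STARTUP IPIs whose vector is the page number of the
	 * trampoline (MP_TRAMPOLINE / PAGE_SIZE), with short delays in
	 * between.
	 */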
1170 	if (ci->ci_flags & CPUF_AP) {
1171 		x86_ipi_init(ci->ci_apicid);
1172 
1173 		delay(10000);
1174 
1175 		if (cpu_feature & CPUID_APIC) {
1176 			x86_ipi(MP_TRAMPOLINE/PAGE_SIZE, ci->ci_apicid,
1177 			    LAPIC_DLMODE_STARTUP);
1178 			delay(200);
1179 
1180 			x86_ipi(MP_TRAMPOLINE/PAGE_SIZE, ci->ci_apicid,
1181 			    LAPIC_DLMODE_STARTUP);
1182 			delay(200);
1183 		}
1184 	}
1185 #endif
1186 	return 0;
1187 }
1188 
1189 void
1190 mp_cpu_start_cleanup(struct cpu_info *ci)
1191 {
1192 	/*
1193 	 * Ensure the NVRAM reset byte contains something vaguely sane.
1194 	 */
1195 
1196 	outb(IO_RTC, NVRAM_RESET);
1197 	outb(IO_RTC+1, NVRAM_RESET_RST);
1198 }
1199 #endif	/* MULTIPROCESSOR */
1200 
1201 typedef void (vector)(void);
1202 extern vector Xsyscall_meltdown, Xsyscall, Xsyscall32;
1203 
1204 void
1205 cpu_init_msrs(struct cpu_info *ci)
1206 {
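	/*
	 * STAR[47:32] is the syscall base selector (kernel %cs, with
	 * %ss at +8); STAR[63:48] is the sysret base, from which the
	 * 64-bit user %cs and %ss are derived at +16 and +8, hence
	 * GUDATA_SEL-1 below.  LSTAR is the 64-bit syscall entry point
	 * and SFMASK lists the rflags bits cleared on entry.
	 */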
1207 	wrmsr(MSR_STAR,
1208 	    ((uint64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1209 	    ((uint64_t)GSEL(GUDATA_SEL-1, SEL_UPL) << 48));
1210 	wrmsr(MSR_LSTAR, cpu_meltdown ? (uint64_t)Xsyscall_meltdown :
1211 	    (uint64_t)Xsyscall);
1212 	wrmsr(MSR_CSTAR, 0);
1213 	wrmsr(MSR_SFMASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_AC);
1214 
1215 	wrmsr(MSR_FSBASE, 0);
1216 	wrmsr(MSR_GSBASE, (u_int64_t)ci);
1217 	wrmsr(MSR_KERNELGSBASE, 0);
1218 	patinit(ci);
1219 }
1220 
1221 void
1222 cpu_fix_msrs(struct cpu_info *ci)
1223 {
1224 	int family = ci->ci_family;
1225 	uint64_t msr, nmsr;
1226 
1227 	if (ci->ci_vendor == CPUV_INTEL) {
1228 		if ((family > 6 || (family == 6 && ci->ci_model >= 0xd)) &&
1229 		    rdmsr_safe(MSR_MISC_ENABLE, &msr) == 0 &&
1230 		    (msr & MISC_ENABLE_FAST_STRINGS) == 0) {
1231 			msr |= MISC_ENABLE_FAST_STRINGS;
1232 			wrmsr(MSR_MISC_ENABLE, msr);
1233 			DPRINTF("%s: enabled fast strings\n", ci->ci_dev->dv_xname);
1234 		}
1235 
1236 		/*
1237 		 * Attempt to disable Silicon Debug and lock the configuration
1238 		 * if it's enabled and unlocked.
1239 		 */
1240 		if (cpu_ecxfeature & CPUIDECX_SDBG) {
1241 			msr = rdmsr(IA32_DEBUG_INTERFACE);
1242 			if ((msr & IA32_DEBUG_INTERFACE_ENABLE) &&
1243 			    (msr & IA32_DEBUG_INTERFACE_LOCK) == 0) {
1244 				msr &= IA32_DEBUG_INTERFACE_MASK;
1245 				msr |= IA32_DEBUG_INTERFACE_LOCK;
1246 				wrmsr(IA32_DEBUG_INTERFACE, msr);
1247 			} else if (msr & IA32_DEBUG_INTERFACE_ENABLE)
1248 				printf("%s: cannot disable silicon debug\n",
1249 				    ci->ci_dev->dv_xname);
1250 		}
1251 	}
1252 
1253 	if (ci->ci_vendor == CPUV_AMD) {
1254 		/* Apply AMD errata */
1255 		amd64_errata(ci);
1256 
1257 		/*
1258 		 * "Mitigation G-2" per AMD's Whitepaper "Software Techniques
1259 		 * for Managing Speculation on AMD Processors"
1260 		 *
1261 		 * By setting MSR C001_1029[1]=1, LFENCE becomes a dispatch
1262 		 * serializing instruction.
1263 		 *
1264 		 * This MSR is available on all AMD families >= 10h, except 11h
1265 		 * where LFENCE is always serializing.
1266 		 */
1267 		if (family >= 0x10 && family != 0x11) {
1268 			nmsr = msr = rdmsr(MSR_DE_CFG);
1269 			nmsr |= DE_CFG_SERIALIZE_LFENCE;
1270 			if (msr != nmsr)
1271 				wrmsr(MSR_DE_CFG, nmsr);
1272 		}
1273 		if (family == 0x17 && ci->ci_model >= 0x31 &&
1274 		    (cpu_ecxfeature & CPUIDECX_HV) == 0) {
1275 			nmsr = msr = rdmsr(MSR_DE_CFG);
1276 			nmsr |= DE_CFG_SERIALIZE_9;
1277 			if (msr != nmsr)
1278 				wrmsr(MSR_DE_CFG, nmsr);
1279 		}
1280 	}
1281 
1282 #ifndef SMALL_KERNEL
1283 	if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT) {
1284 		msr = rdmsr(MSR_S_CET);
1285 		wrmsr(MSR_S_CET, (msr & ~MSR_CET_NO_TRACK_EN) | MSR_CET_ENDBR_EN);
1286 		lcr4(rcr4() | CR4_CET);
1287 	}
1288 #endif
1289 }
1290 
1291 void
1292 cpu_tsx_disable(struct cpu_info *ci)
1293 {
1294 	uint64_t msr;
1295 	uint32_t dummy, sefflags_edx;
1296 
1297 	/* this runs before identifycpu() populates ci_feature_sefflags_edx */
1298 	if (ci->ci_cpuid_level < 0x07)
1299 		return;
1300 	CPUID_LEAF(0x7, 0, dummy, dummy, dummy, sefflags_edx);
1301 
1302 	if (ci->ci_vendor == CPUV_INTEL &&
1303 	    (sefflags_edx & SEFF0EDX_ARCH_CAP)) {
1304 		msr = rdmsr(MSR_ARCH_CAPABILITIES);
1305 		if (msr & ARCH_CAP_TSX_CTRL) {
1306 			msr = rdmsr(MSR_TSX_CTRL);
1307 			msr |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_TSX_CPUID_CLEAR;
1308 			wrmsr(MSR_TSX_CTRL, msr);
1309 		}
1310 	}
1311 }
1312 
1313 void
1314 patinit(struct cpu_info *ci)
1315 {
1316 	extern int	pmap_pg_wc;
1317 	u_int64_t	reg;
1318 
1319 	if ((cpu_feature & CPUID_PAT) == 0)
1320 		return;
1321 	/*
1322 	 * Set up PAT bits.
1323 	 * The default pat table is the following:
1324 	 * WB, WT, UC-, UC, WB, WT, UC-, UC
1325 	 * We change it to:
1326 	 * WB, WC, UC-, UC, WB, WC, UC-, UC
1327 	 * i.e change the WT bit to be WC.
1328 	 */
1329 	reg = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
1330 	    PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
1331 	    PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
1332 	    PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
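	/*
	 * Assuming PATENTRY(i, t) places memory type t in byte i of the
	 * MSR, the architectural type encodings (UC=0, WC=1, WT=4,
	 * WB=6, UC-=7) make the value computed above 0x0007010600070106.
	 */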
1333 
1334 	wrmsr(MSR_CR_PAT, reg);
1335 	pmap_pg_wc = PG_WC;
1336 }
1337 
1338 struct timeout rdrand_tmo;
1339 void rdrand(void *);
1340 
1341 void
1342 rdrand(void *v)
1343 {
1344 	struct timeout *tmo = v;
1345 	extern int	has_rdrand, has_rdseed;
1346 	union {
1347 		uint64_t u64;
1348 		uint32_t u32[2];
1349 	} r, t;
1350 	uint64_t tsc;
1351 	uint8_t valid = 0;
1352 
1353 	tsc = rdtsc();
1354 	if (has_rdseed)
1355 		__asm volatile(
1356 		    "rdseed	%0\n\t"
1357 		    "setc	%1\n"
1358 		    : "=r" (r.u64), "=qm" (valid) );
1359 	if (has_rdrand && (has_rdseed == 0 || valid == 0))
1360 		__asm volatile(
1361 		    "rdrand	%0\n\t"
1362 		    "setc	%1\n"
1363 		    : "=r" (r.u64), "=qm" (valid) );
1364 
1365 	t.u64 = tsc;
1366 	t.u64 ^= r.u64;
1367 	t.u64 ^= valid;			/* potential rdrand empty */
1368 	if (has_rdrand)
1369 		t.u64 += rdtsc();	/* potential vmexit latency */
1370 
1371 	enqueue_randomness(t.u32[0]);
1372 	enqueue_randomness(t.u32[1]);
1373 
1374 	if (tmo)
1375 		timeout_add_msec(tmo, 10);
1376 }
1377 
1378 int
1379 cpu_activate(struct device *self, int act)
1380 {
1381 	struct cpu_softc *sc = (struct cpu_softc *)self;
1382 
1383 	switch (act) {
1384 	case DVACT_RESUME:
1385 		if (sc->sc_info->ci_cpuid == 0)
1386 			rdrand(NULL);
1387 #if NPCTR > 0
1388 		pctr_resume(sc->sc_info);
1389 #endif
1390 		break;
1391 	}
1392 
1393 	return (0);
1394 }
1395 
1396 /*
1397  * cpu_enter_pages
1398  *
1399  * Requests mapping of various special pages required in the Intel Meltdown
1400  * case (to be entered into the U-K page table):
1401  *
1402  *  1 tss+gdt page for each CPU
1403  *  1 trampoline stack page for each CPU
1404  *
1405  * The cpu_info_full struct for each CPU straddles these pages. The offset into
1406  * 'cif' is calculated below, for each page. For more information, consult
1407  * the definition of struct cpu_info_full in cpu_full.h
1408  *
1409  * On CPUs unaffected by Meltdown, this function still configures 'cif' but
1410  * the calls to pmap_enter_special become no-ops.
1411  *
1412  * Parameters:
1413  *  cif : the cpu_info_full structure describing a CPU whose pages are to be
1414  *    entered into the special meltdown U-K page table.
1415  */
1416 void
1417 cpu_enter_pages(struct cpu_info_full *cif)
1418 {
1419 	vaddr_t va;
1420 	paddr_t pa;
1421 
1422 	/* The TSS+GDT need to be readable */
1423 	va = (vaddr_t)cif;
1424 	pmap_extract(pmap_kernel(), va, &pa);
1425 	pmap_enter_special(va, pa, PROT_READ);
1426 	DPRINTF("%s: entered tss+gdt page at va 0x%llx pa 0x%llx\n", __func__,
1427 	   (uint64_t)va, (uint64_t)pa);
1428 
1429 	/* The trampoline stack page needs to be read/write */
1430 	va = (vaddr_t)&cif->cif_tramp_stack;
1431 	pmap_extract(pmap_kernel(), va, &pa);
1432 	pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
1433 	DPRINTF("%s: entered t.stack page at va 0x%llx pa 0x%llx\n", __func__,
1434 	   (uint64_t)va, (uint64_t)pa);
1435 
1436 	cif->cif_tss.tss_rsp0 = va + sizeof(cif->cif_tramp_stack) - 16;
1437 	DPRINTF("%s: cif_tss.tss_rsp0 = 0x%llx\n" ,__func__,
1438 	    (uint64_t)cif->cif_tss.tss_rsp0);
1439 	cif->cif_cpu.ci_intr_rsp = cif->cif_tss.tss_rsp0 -
1440 	    sizeof(struct iretq_frame);
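	/*
	 * tss_rsp0 points 16 bytes below the top of the trampoline
	 * stack; ci_intr_rsp sits one struct iretq_frame lower,
	 * presumably leaving room for the frame the interrupt entry
	 * trampoline builds before switching to the real kernel stack.
	 */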
1441 
1442 #define	SETUP_IST_SPECIAL_STACK(ist, cif, member) do {			\
1443 	(cif)->cif_tss.tss_ist[(ist)] = (vaddr_t)&(cif)->member +	\
1444 	    sizeof((cif)->member) - 16;					\
1445 	(cif)->member[nitems((cif)->member) - 2] = (int64_t)&(cif)->cif_cpu; \
1446 } while (0)
1447 
1448 	SETUP_IST_SPECIAL_STACK(0, cif, cif_dblflt_stack);
1449 	SETUP_IST_SPECIAL_STACK(1, cif, cif_nmi_stack);
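	/*
	 * The first IST slot gets the double-fault stack and the second
	 * the NMI stack.  The macro above leaves the top 16 bytes of
	 * each stack unused and stores a pointer to this CPU's cpu_info
	 * just below, presumably so handlers running on these stacks
	 * can find their cpu_info even when %gs cannot be trusted.
	 */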
1450 
1451 	/* an empty iomap, by setting its offset to the TSS limit */
1452 	cif->cif_tss.tss_iobase = sizeof(cif->cif_tss);
1453 }
1454 
1455 #ifdef MULTIPROCESSOR
1456 int
1457 wbinvd_on_all_cpus(void)
1458 {
1459 	x86_broadcast_ipi(X86_IPI_WBINVD);
1460 	wbinvd();
1461 	return 0;
1462 }
1463 #endif
1464