/*-
 * Copyright (c) 1982, 1986 The Regents of the University of California.
 * Copyright (c) 1989, 1990 William Jolitz
 * Copyright (c) 1994 John Dyson
 * Copyright (c) 2008-2018 The DragonFly Project.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_machdep.c	7.3 (Berkeley) 5/13/91
 *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
 * $FreeBSD: src/sys/i386/i386/vm_machdep.c,v 1.132.2.9 2003/01/25 19:02:23 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/interrupt.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>
#include <sys/lwp.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/smp.h>
#include <machine/pcb.h>
#include <machine/pcb_ext.h>
#include <machine/segments.h>
#include <machine/globaldata.h>	/* npxthread */
#include <machine/specialreg.h>
#include <machine/vmm.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>

#include <bus/isa/isa.h>

static void	cpu_reset_real (void);

static int spectre_mitigation = -1;
static int spectre_support = 0;
static int spectre_mode = 0;
SYSCTL_INT(_machdep, OID_AUTO, spectre_mode, CTLFLAG_RD,
	&spectre_mode, 0, "current Spectre enablements");

static int mds_mitigation = -1;
static int mds_support = 0;
static int mds_mode = 0;
SYSCTL_INT(_machdep, OID_AUTO, mds_mode, CTLFLAG_RD,
	&mds_mode, 0, "current MDS enablements");

/*
 * Finish a fork operation, with lwp lp2 nearly set up.
 * Copy and update the pcb and set up the stack so that the child is
 * ready to run and return to user mode.
 */
void
cpu_fork(struct lwp *lp1, struct lwp *lp2, int flags)
{
	struct pcb *pcb2;
	struct pmap *pmap2;

	if ((flags & RFPROC) == 0) {
		if ((flags & RFMEM) == 0) {
			/*
			 * Unshare user LDT.  > 1 test is MPSAFE.  While
			 * it can potentially race a 2->1 transition, the
			 * worst that happens is that we do an unnecessary
			 * ldt replacement.
			 */
			struct pcb *pcb1 = lp1->lwp_thread->td_pcb;
			struct pcb_ldt *pcb_ldt = pcb1->pcb_ldt;

			if (pcb_ldt && pcb_ldt->ldt_refcnt > 1) {
				pcb_ldt = user_ldt_alloc(pcb1,pcb_ldt->ldt_len);
				user_ldt_free(pcb1);
				pcb1->pcb_ldt = pcb_ldt;
				set_user_ldt(pcb1);
			}
		}
		return;
	}

	/* Ensure that lp1's pcb is up to date. */
	if (mdcpu->gd_npxthread == lp1->lwp_thread)
		npxsave(lp1->lwp_thread->td_savefpu);

	/*
	 * Copy lp1's PCB.  This really only applies to the
	 * debug registers and FP state, but it's faster to just copy the
	 * whole thing.  Because we only save the PCB at switchout time,
	 * the register state may not be current.
	 */
	pcb2 = lp2->lwp_thread->td_pcb;
	*pcb2 = *lp1->lwp_thread->td_pcb;

	/*
	 * Create a new fresh stack for the new process.
	 * Copy the trap frame for the return to user mode as if from a
	 * syscall.  This copies the user mode register values.
	 *
	 * pcb_rsp must allocate an additional call-return pointer below
	 * the trap frame which will be restored by cpu_heavy_restore from
	 * PCB_RIP, and the thread's td_sp pointer must allocate an
	 * additional two quadwords below the pcb_rsp call-return pointer to
	 * hold the LWKT restore function pointer and rflags.
	 *
	 * The LWKT restore function pointer must be set to cpu_heavy_restore,
	 * which is our standard heavy-weight process switch-in function.
	 * YYY eventually we should shortcut fork_return and fork_trampoline
	 * to use the LWKT restore function directly so we can get rid of
	 * all the extra crap we are setting up.
	 */
	lp2->lwp_md.md_regs = (struct trapframe *)pcb2 - 1;
	bcopy(lp1->lwp_md.md_regs, lp2->lwp_md.md_regs, sizeof(*lp2->lwp_md.md_regs));

	/*
	 * Set registers for trampoline to user mode.  Leave space for the
	 * return address on stack.  These are the kernel mode register values.
	 *
	 * Set the new pmap CR3.  If the new process uses isolated VM spaces,
	 * also set the isolated CR3.
	 */
	pmap2 = vmspace_pmap(lp2->lwp_proc->p_vmspace);
	pcb2->pcb_cr3 = vtophys(pmap2->pm_pml4);
	if ((pcb2->pcb_flags & PCB_ISOMMU) && pmap2->pm_pmlpv_iso) {
		pcb2->pcb_cr3_iso = vtophys(pmap2->pm_pml4_iso);
	} else {
		pcb2->pcb_flags &= ~PCB_ISOMMU;
		pcb2->pcb_cr3_iso = 0;
	}

#if 0
	/*
	 * Per-process spectre mitigation (future)
	 */
	pcb2->pcb_flags &= ~(PCB_IBRS1 | PCB_IBRS2);
	switch (spectre_mitigation) {
	case 1:
		pcb2->pcb_flags |= PCB_IBRS1;
		break;
	case 2:
		pcb2->pcb_flags |= PCB_IBRS2;
		break;
	default:
		break;
	}
#endif

	pcb2->pcb_rbx = (unsigned long)fork_return;	/* fork_trampoline argument */
	pcb2->pcb_rbp = 0;
	pcb2->pcb_rsp = (unsigned long)lp2->lwp_md.md_regs - sizeof(void *);
	pcb2->pcb_r12 = (unsigned long)lp2;		/* fork_trampoline argument */
	pcb2->pcb_r13 = 0;
	pcb2->pcb_r14 = 0;
	pcb2->pcb_r15 = 0;
	pcb2->pcb_rip = (unsigned long)fork_trampoline;
	lp2->lwp_thread->td_sp = (char *)(pcb2->pcb_rsp - sizeof(void *));
	*(u_int64_t *)lp2->lwp_thread->td_sp = PSL_USER;
	lp2->lwp_thread->td_sp -= sizeof(void *);
	*(void **)lp2->lwp_thread->td_sp = (void *)cpu_heavy_restore;
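	/*
	 * Summary of the kernel stack just built, from higher to lower
	 * addresses:
	 *
	 *	lwp_md.md_regs	copied user trapframe
	 *	pcb_rsp		call-return slot, loaded from PCB_RIP
	 *			(fork_trampoline) by cpu_heavy_restore
	 *	pcb_rsp - 8	rflags image (PSL_USER)
	 *	td_sp		LWKT restore function pointer
	 *			(cpu_heavy_restore), used on first switch-in
	 */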

	/*
	 * pcb2->pcb_ldt:	duplicated below, if necessary.
	 * pcb2->pcb_savefpu:	cloned above.
	 * pcb2->pcb_flags:	cloned above.
	 * pcb2->pcb_onfault:	cloned above (always NULL here).
	 * pcb2->pcb_onfault_sp: cloned above (don't care).
	 */

	/*
	 * XXX don't copy the i/o pages.  this should probably be fixed.
	 */
	pcb2->pcb_ext = NULL;

	/* Copy the LDT, if necessary. */
	if (pcb2->pcb_ldt != NULL) {
		if (flags & RFMEM) {
			atomic_add_int(&pcb2->pcb_ldt->ldt_refcnt, 1);
		} else {
			pcb2->pcb_ldt = user_ldt_alloc(pcb2,
						       pcb2->pcb_ldt->ldt_len);
		}
	}
	bcopy(&lp1->lwp_thread->td_tls, &lp2->lwp_thread->td_tls,
	      sizeof(lp2->lwp_thread->td_tls));
	/*
	 * Now, cpu_switch() can schedule the new lwp.
	 * pcb_rsp is loaded pointing to the cpu_switch() stack frame
	 * containing the return address when exiting cpu_switch.
	 * This will normally be to fork_trampoline(), which will have
	 * %rbx loaded with the new lwp's pointer.  fork_trampoline()
	 * will set up a stack to call fork_return(lp, frame); to complete
	 * the return to user-mode.
	 */
}

/*
 * Prepare new lwp to return to the address specified in params.
 */
int
cpu_prepare_lwp(struct lwp *lp, struct lwp_params *params)
{
	struct trapframe *regs = lp->lwp_md.md_regs;
	void *bad_return = NULL;
	int error;

	regs->tf_rip = (long)params->lwp_func;
	regs->tf_rsp = (long)params->lwp_stack;
	/* Set up argument for function call */
	regs->tf_rdi = (long)params->lwp_arg;

	/*
	 * Set up a fake return address.  As the lwp function may never
	 * return, we simply copy out a NULL pointer and force the lwp to
	 * receive a SIGSEGV if it returns anyway.
	 */
	regs->tf_rsp -= sizeof(void *);
	error = copyout(&bad_return, (void *)regs->tf_rsp, sizeof(bad_return));
	if (error)
		return (error);

	if (lp->lwp_proc->p_vmm) {
		lp->lwp_thread->td_pcb->pcb_cr3 = KPML4phys;
		cpu_set_fork_handler(lp,
		    (void (*)(void *, struct trapframe *))vmm_lwp_return, lp);
	} else {
		cpu_set_fork_handler(lp,
		    (void (*)(void *, struct trapframe *))generic_lwp_return, lp);
	}
	return (0);
}

/*
 * Intercept the return address from a freshly forked process that has NOT
 * been scheduled yet.
 *
 * This is needed to make kernel threads stay in kernel mode.
 */
void
cpu_set_fork_handler(struct lwp *lp, void (*func)(void *, struct trapframe *),
		     void *arg)
{
	/*
	 * Note that the trap frame follows the args, so the function
	 * is really called like this:  func(arg, frame);
	 */
	lp->lwp_thread->td_pcb->pcb_rbx = (long)func;	/* function */
	lp->lwp_thread->td_pcb->pcb_r12 = (long)arg;	/* first arg */
}

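/*
 * Set up a pure kernel (LWKT) thread.  On its first switch-in the thread
 * runs cpu_kthread_restore, which calls func(arg); rfunc is left on the
 * stack as the return address should func ever return, and is normally
 * an exit/cleanup function.
 */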
void
cpu_set_thread_handler(thread_t td, void (*rfunc)(void), void *func, void *arg)
{
	td->td_pcb->pcb_rbx = (long)func;
	td->td_pcb->pcb_r12 = (long)arg;
	td->td_switch = cpu_lwkt_switch;
	td->td_sp -= sizeof(void *);
	*(void **)td->td_sp = rfunc;	/* exit function on return */
	td->td_sp -= sizeof(void *);
	*(void **)td->td_sp = cpu_kthread_restore;
}

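/*
 * Machine-dependent portion of lwp exit: disable any hardware breakpoints
 * still armed in the pcb, account the context switch, pull the thread off
 * its sleep queue and the run queue, and finish in cpu_thread_exit(),
 * which does not return.
 */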
void
cpu_lwp_exit(void)
{
	struct thread *td = curthread;
	struct pcb *pcb;

	pcb = td->td_pcb;

	/* Some x86 functionality was dropped */
	KKASSERT(pcb->pcb_ext == NULL);

	/*
	 * disable all hardware breakpoints
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		reset_dbregs();
		pcb->pcb_flags &= ~PCB_DBREGS;
	}
	td->td_gd->gd_cnt.v_swtch++;

	crit_enter_quick(td);
	if (td->td_flags & TDF_TSLEEPQ)
		tsleep_remove(td);
	lwkt_deschedule_self(td);
	lwkt_remove_tdallq(td);
	cpu_thread_exit();
}

/*
 * Terminate the current thread.  The caller must have already acquired
 * the thread's rwlock and placed it on a reap list or otherwise notified
 * a reaper of its existence.  We set a special assembly switch function
 * which releases td_rwlock after it has cleaned up the MMU state and
 * switched out the stack.
 *
 * Must be called from a critical section and with the thread descheduled.
 */
void
cpu_thread_exit(void)
{
	npxexit();
	curthread->td_switch = cpu_exit_switch;
	curthread->td_flags |= TDF_EXITING;
	lwkt_switch();
	panic("cpu_thread_exit: lwkt_switch() unexpectedly returned");
}

void
cpu_reset(void)
{
	cpu_reset_real();
}

static void
cpu_reset_real(void)
{
	/*
	 * Attempt to do a CPU reset via the keyboard controller.  Do not
	 * turn off GateA20, as any machine that fails to do the reset here
	 * would then end up in no man's land.
	 */

#if !defined(BROKEN_KEYBOARD_RESET)
	outb(IO_KBD + 4, 0xFE);
	DELAY(500000);	/* wait 0.5 sec to see if that did it */
	kprintf("Keyboard reset did not work, attempting CPU shutdown\n");
	DELAY(1000000);	/* wait 1 sec for kprintf to complete */
#endif
#if 0 /* JG */
	/* force a shutdown by unmapping entire address space ! */
	bzero((caddr_t) PTD, PAGE_SIZE);
#endif

	/* "good night, sweet prince .... <THUNK!>" */
	cpu_invltlb();
	/* NOTREACHED */
	while(1);
}

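/*
 * Software interrupt handler which runs any pending busdma callbacks.
 * Registered at boot by swi_vm_setup() below.
 */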
static void
swi_vm(void *arg, void *frame)
{
	if (busdma_swi_pending != 0)
		busdma_swi();
}

static void
swi_vm_setup(void *arg)
{
	register_swi_mp(SWI_VM, swi_vm, NULL, "swi_vm", NULL, 0);
}

SYSINIT(swi_vm_setup, SI_BOOT2_MACHDEP, SI_ORDER_ANY, swi_vm_setup, NULL);

/*
 * NOTE: This routine is also called after a successful microcode
 *	 reload on cpu 0.
 */
void mitigation_vm_setup(void *arg);

/*
 * Check for IBPB and IBRS support.
 *
 * These bits also specify the desired modes in the spectre_mitigation
 * sysctl.
 */
#define IBRS_SUPPORTED		0x0001
#define STIBP_SUPPORTED		0x0002
#define IBPB_SUPPORTED		0x0004
#define IBRS_AUTO_SUPPORTED	0x0008
#define STIBP_AUTO_SUPPORTED	0x0010
#define IBRS_PREFERRED_REQUEST	0x0020

static
int
spectre_check_support(void)
{
	uint32_t p[4];
	int rv = 0;

	/*
	 * Spectre mitigation hw bits
	 *
	 * IBRS		Indirect Branch Restricted Speculation   (isolation)
	 * STIBP	Single Thread Indirect Branch Prediction (isolation)
	 * IBPB		Branch Prediction Barrier                (barrier)
	 *
	 * IBRS and STIBP must be toggled (enabled on entry to kernel,
	 * disabled on exit, as well as disabled during any MWAIT/HLT).
	 * When *_AUTO bits are available, IBRS and STIBP may be left
	 * turned on and do not have to be toggled on kernel entry/exit.
	 * Be sure to clear before going idle (else hyperthread performance
	 * will drop).
	 *
	 * All of this has enormous overhead.  IBPB in particular, and the
	 * non-auto modes, are disabled by default.
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL) {
		p[0] = 0;
		p[1] = 0;
		p[2] = 0;
		p[3] = 0;
		cpuid_count(7, 0, p);
		if (p[3] & CPUID_STDEXT3_IBPB)
			rv |= IBRS_SUPPORTED | IBPB_SUPPORTED;
		if (p[3] & CPUID_STDEXT3_STIBP)
			rv |= STIBP_SUPPORTED;

		/*
		 * 0x80000008 p[1] bit 12 indicates IBPB support
		 *
		 * This bit might be set even though STDEXT3_IBPB is not set.
		 */
		p[0] = 0;
		p[1] = 0;
		p[2] = 0;
		p[3] = 0;
		do_cpuid(0x80000008U, p);
		if (p[1] & CPUID_CAPEX_IBPB)
			rv |= IBPB_SUPPORTED;
	} else if (cpu_vendor_id == CPU_VENDOR_AMD) {
		/*
		 * 0x80000008
		 *	p[1] bit 12 indicates IBPB support
		 *	p[1] bit 14 indicates IBRS support
		 *	p[1] bit 15 indicates STIBP support
		 *
		 *	p[1] bit 16 indicates IBRS auto support
		 *	p[1] bit 17 indicates STIBP auto support
		 *	p[1] bit 18 indicates processor prefers using
		 *		IBRS instead of retpoline.
		 */
		p[0] = 0;
		p[1] = 0;
		p[2] = 0;
		p[3] = 0;
		do_cpuid(0x80000008U, p);
		if (p[1] & CPUID_CAPEX_IBPB)
			rv |= IBPB_SUPPORTED;
		if (p[1] & CPUID_CAPEX_IBRS)
			rv |= IBRS_SUPPORTED;
		if (p[1] & CPUID_CAPEX_STIBP)
			rv |= STIBP_SUPPORTED;

		if (p[1] & CPUID_CAPEX_IBRS_ALWAYSON)
			rv |= IBRS_AUTO_SUPPORTED;
		if (p[1] & CPUID_CAPEX_STIBP_ALWAYSON)
			rv |= STIBP_AUTO_SUPPORTED;
		if (p[1] & CPUID_CAPEX_PREFER_IBRS)
			rv |= IBRS_PREFERRED_REQUEST;
	}

	return rv;
}

/*
 * Iterate CPUs and adjust MSR for global operations, since
 * the KMMU* code won't do it if spectre_mitigation is 0 or 2.
 */
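/* A mode must be both requested via sysctl and supported by the cpu */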
#define CHECK(flag)	(spectre_mitigation & spectre_support & (flag))

static
void
spectre_sysctl_changed(void)
{
	globaldata_t save_gd;
	struct trampframe *tr;
	int spec_ctrl;
	int spec_mask;
	int mode;
	int n;

	spec_mask = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP |
		    SPEC_CTRL_DUMMY_ENABLE | SPEC_CTRL_DUMMY_IBPB;

	/*
	 * Fixup state
	 */
	mode = 0;
	save_gd = mycpu;
	for (n = 0; n < ncpus; ++n) {
		lwkt_setcpu_self(globaldata_find(n));
		cpu_ccfence();
		tr = &pscpu->trampoline;

		/*
		 * Make sure we are cleaned out.
		 *
		 * XXX cleanup, reusing globals inside the loop (they get
		 * set to the same thing each loop)
		 *
		 * [0] kernel entry (idle exit)
		 * [1] kernel exit  (idle entry)
		 */
		tr->tr_pcb_spec_ctrl[0] &= ~spec_mask;
		tr->tr_pcb_spec_ctrl[1] &= ~spec_mask;

		/*
		 * Don't try to parse if not available
		 */
		if (spectre_mitigation < 0)
			continue;

		/*
		 * IBRS mode.  Auto overrides toggling.
		 *
		 * Only set the ENABLE flag if we have to toggle something
		 * on entry and exit.
		 */
		spec_ctrl = 0;
		if (CHECK(IBRS_AUTO_SUPPORTED)) {
			spec_ctrl |= SPEC_CTRL_IBRS;
			mode |= IBRS_AUTO_SUPPORTED;
		} else if (CHECK(IBRS_SUPPORTED)) {
			spec_ctrl |= SPEC_CTRL_IBRS | SPEC_CTRL_DUMMY_ENABLE;
			mode |= IBRS_SUPPORTED;
		}
		if (CHECK(STIBP_AUTO_SUPPORTED)) {
			spec_ctrl |= SPEC_CTRL_STIBP;
			mode |= STIBP_AUTO_SUPPORTED;
		} else if (CHECK(STIBP_SUPPORTED)) {
			spec_ctrl |= SPEC_CTRL_STIBP | SPEC_CTRL_DUMMY_ENABLE;
			mode |= STIBP_SUPPORTED;
		}

		/*
		 * IBPB requested and supported.
		 */
		if (CHECK(IBPB_SUPPORTED)) {
			spec_ctrl |= SPEC_CTRL_DUMMY_IBPB;
			mode |= IBPB_SUPPORTED;
		}

		/*
		 * Update the MSR if the cpu supports the modes to ensure
		 * proper disablement if the user disabled the mode.
		 */
		if (spectre_support & (IBRS_SUPPORTED | IBRS_AUTO_SUPPORTED |
				    STIBP_SUPPORTED | STIBP_AUTO_SUPPORTED)) {
			wrmsr(MSR_SPEC_CTRL,
			      spec_ctrl & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
		}

		/*
		 * Update spec_ctrl fields in the trampoline.
		 *
		 * [0] on-kernel-entry (on-idle-exit)
		 * [1] on-kernel-exit  (on-idle-entry)
		 *
		 * When auto mode is supported we leave the bit set, otherwise
		 * we clear the bits.
		 */
		tr->tr_pcb_spec_ctrl[0] |= spec_ctrl;
		if (CHECK(IBRS_AUTO_SUPPORTED) == 0)
			spec_ctrl &= ~SPEC_CTRL_IBRS;
		if (CHECK(STIBP_AUTO_SUPPORTED) == 0)
			spec_ctrl &= ~SPEC_CTRL_STIBP;
		tr->tr_pcb_spec_ctrl[1] |= spec_ctrl;

		/*
		 * Make sure we set this on the first loop.  It will be
		 * the same value on remaining loops.
		 */
		spectre_mode = mode;
	}
	lwkt_setcpu_self(save_gd);
	cpu_ccfence();

	/*
	 * Console message on mitigation mode change
	 */
	kprintf("Spectre: support=(");
	if (spectre_support == 0) {
		kprintf(" none");
	} else {
		if (spectre_support & IBRS_SUPPORTED)
			kprintf(" IBRS");
		if (spectre_support & STIBP_SUPPORTED)
			kprintf(" STIBP");
		if (spectre_support & IBPB_SUPPORTED)
			kprintf(" IBPB");
		if (spectre_support & IBRS_AUTO_SUPPORTED)
			kprintf(" IBRS_AUTO");
		if (spectre_support & STIBP_AUTO_SUPPORTED)
			kprintf(" STIBP_AUTO");
		if (spectre_support & IBRS_PREFERRED_REQUEST)
			kprintf(" IBRS_REQUESTED");
	}
	kprintf(" ) req=%04x operating=(", (uint16_t)spectre_mitigation);
	if (spectre_mode == 0) {
		kprintf(" none");
	} else {
		if (spectre_mode & IBRS_SUPPORTED)
			kprintf(" IBRS");
		if (spectre_mode & STIBP_SUPPORTED)
			kprintf(" STIBP");
		if (spectre_mode & IBPB_SUPPORTED)
			kprintf(" IBPB");
		if (spectre_mode & IBRS_AUTO_SUPPORTED)
			kprintf(" IBRS_AUTO");
		if (spectre_mode & STIBP_AUTO_SUPPORTED)
			kprintf(" STIBP_AUTO");
		if (spectre_mode & IBRS_PREFERRED_REQUEST)
			kprintf(" IBRS_REQUESTED");
	}
	kprintf(" )\n");
}

#undef CHECK

/*
 * User changes sysctl value
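 *
 * The new value is parsed as a space or comma separated list of the
 * keywords accepted below (NONE, IBRS, IBRS_AUTO, STIBP, STIBP_AUTO,
 * IBPB), for example:
 *
 *	sysctl machdep.spectre_mitigation="IBRS_AUTO STIBP_AUTO"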
 */
static int
sysctl_spectre_mitigation(SYSCTL_HANDLER_ARGS)
{
	char buf[128];
	char *ptr;
	char *iter;
	size_t len;
	int spectre;
	int error = 0;
	int loop = 0;

	/*
	 * Return current operating mode or support.
	 */
	if (oidp->oid_kind & CTLFLAG_WR)
		spectre = spectre_mode;
	else
		spectre = spectre_support;

	spectre &= (IBRS_SUPPORTED | IBRS_AUTO_SUPPORTED |
		    STIBP_SUPPORTED | STIBP_AUTO_SUPPORTED |
		    IBPB_SUPPORTED);
	while (spectre) {
		if (error)
			break;
		if (loop++) {
			error = SYSCTL_OUT(req, " ", 1);
			if (error)
				break;
		}
		if (spectre & IBRS_SUPPORTED) {
			spectre &= ~IBRS_SUPPORTED;
			error = SYSCTL_OUT(req, "IBRS", 4);
		} else
		if (spectre & IBRS_AUTO_SUPPORTED) {
			spectre &= ~IBRS_AUTO_SUPPORTED;
			error = SYSCTL_OUT(req, "IBRS_AUTO", 9);
		} else
		if (spectre & STIBP_SUPPORTED) {
			spectre &= ~STIBP_SUPPORTED;
			error = SYSCTL_OUT(req, "STIBP", 5);
		} else
		if (spectre & STIBP_AUTO_SUPPORTED) {
			spectre &= ~STIBP_AUTO_SUPPORTED;
			error = SYSCTL_OUT(req, "STIBP_AUTO", 10);
		} else
		if (spectre & IBPB_SUPPORTED) {
			spectre &= ~IBPB_SUPPORTED;
			error = SYSCTL_OUT(req, "IBPB", 4);
		}
	}
	if (loop == 0) {
		error = SYSCTL_OUT(req, "NONE", 4);
	}

	if (error || req->newptr == NULL)
		return error;
	if ((oidp->oid_kind & CTLFLAG_WR) == 0)
		return error;

	/*
	 * Change current operating mode
	 */
	len = req->newlen - req->newidx;
	if (len >= sizeof(buf)) {
		error = EINVAL;
		len = 0;
	} else {
		error = SYSCTL_IN(req, buf, len);
	}
	buf[len] = 0;
	iter = &buf[0];
	spectre = 0;

	while (error == 0 && iter) {
		ptr = strsep(&iter, " ,\t\r\n");
		if (*ptr == 0)
			continue;
		if (strcasecmp(ptr, "NONE") == 0)
			spectre |= 0;
		else if (strcasecmp(ptr, "IBRS") == 0)
			spectre |= IBRS_SUPPORTED;
		else if (strcasecmp(ptr, "IBRS_AUTO") == 0)
			spectre |= IBRS_AUTO_SUPPORTED;
		else if (strcasecmp(ptr, "STIBP") == 0)
			spectre |= STIBP_SUPPORTED;
		else if (strcasecmp(ptr, "STIBP_AUTO") == 0)
			spectre |= STIBP_AUTO_SUPPORTED;
		else if (strcasecmp(ptr, "IBPB") == 0)
			spectre |= IBPB_SUPPORTED;
		else
			error = ENOENT;
	}
	if (error == 0) {
		spectre_mitigation = spectre;
		spectre_sysctl_changed();
	}
	return error;
}

SYSCTL_PROC(_machdep, OID_AUTO, spectre_mitigation,
	CTLTYPE_STRING | CTLFLAG_RW,
	0, 0, sysctl_spectre_mitigation, "A", "Spectre exploit mitigation");
SYSCTL_PROC(_machdep, OID_AUTO, spectre_support,
	CTLTYPE_STRING | CTLFLAG_RD,
	0, 0, sysctl_spectre_mitigation, "A", "Spectre supported features");

/*
 * NOTE: Called at SI_BOOT2_MACHDEP and also when the microcode is
 *	 updated.  Microcode updates must be applied to all cpus
 *	 for support to be recognized.
 */
static void
spectre_vm_setup(void *arg)
{
	int inconsistent = 0;
	int supmask;

	/*
	 * Fetch tunable in auto mode
	 */
	if (spectre_mitigation < 0) {
		TUNABLE_INT_FETCH("machdep.spectre_mitigation",
				  &spectre_mitigation);
	}

	if ((supmask = spectre_check_support()) != 0) {
		/*
		 * Must be supported on all cpus before we
		 * can enable it.  Returns silently if it
		 * isn't.
		 *
		 * NOTE! arg != NULL indicates we were called
		 *	 from cpuctl after a successful microcode
		 *	 update.
		 */
		if (arg != NULL) {
			globaldata_t save_gd;
			int n;

			save_gd = mycpu;
			for (n = 0; n < ncpus; ++n) {
				lwkt_setcpu_self(globaldata_find(n));
				cpu_ccfence();
				if (spectre_check_support() !=
				    supmask) {
					inconsistent = 1;
					break;
				}
			}
			lwkt_setcpu_self(save_gd);
			cpu_ccfence();
		}
	}

	/*
	 * Be silent while microcode is being loaded on various CPUs,
	 * until all done.
	 */
	if (inconsistent) {
		spectre_mitigation = -1;
		spectre_support = 0;
		return;
	}

	/*
	 * IBRS support
	 */
	spectre_support = supmask;

	/*
	 * Enable spectre_mitigation, set defaults if -1, adjust
	 * tuned value according to support if not.
	 *
	 * NOTE!  We do not enable IBPB for user->kernel transitions
	 *	  by default, so this code is commented out for now.
	 */
	if (spectre_support) {
		if (spectre_mitigation < 0) {
			spectre_mitigation = 0;

			/*
			 * IBRS toggling not currently recommended as a
			 * default.
			 */
			if (spectre_support & IBRS_AUTO_SUPPORTED)
				spectre_mitigation |= IBRS_AUTO_SUPPORTED;
			else if (spectre_support & IBRS_SUPPORTED)
				spectre_mitigation |= 0;

			/*
			 * STIBP toggling not currently recommended as a
			 * default.
			 */
			if (spectre_support & STIBP_AUTO_SUPPORTED)
				spectre_mitigation |= STIBP_AUTO_SUPPORTED;
			else if (spectre_support & STIBP_SUPPORTED)
				spectre_mitigation |= 0;

			/*
			 * IBPB adds enormous (~2uS) overhead to system
			 * calls etc, we do not enable it by default.
			 */
			if (spectre_support & IBPB_SUPPORTED)
				spectre_mitigation |= 0;
		}
	} else {
		spectre_mitigation = -1;
	}

	/*
	 * Disallow sysctl changes when there is no support (otherwise
	 * the wrmsr will cause a protection fault).
	 */
	if (spectre_mitigation < 0)
		sysctl___machdep_spectre_mitigation.oid_kind &= ~CTLFLAG_WR;
	else
		sysctl___machdep_spectre_mitigation.oid_kind |= CTLFLAG_WR;

	spectre_sysctl_changed();
}

#define MDS_AVX512_4VNNIW_SUPPORTED	0x0001
#define MDS_AVX512_4FMAPS_SUPPORTED	0x0002
#define MDS_MD_CLEAR_SUPPORTED		0x0004
#define MDS_TSX_FORCE_ABORT_SUPPORTED	0x0008
#define MDS_NOT_REQUIRED		0x8000

static
int
mds_check_support(void)
{
	uint64_t msr;
	uint32_t p[4];
	int rv = 0;

	/*
	 * MDS mitigation hw bits
	 *
	 * MD_CLEAR	Use the microcode-supported VERW instruction.  This
	 *		is the only mode we really support.
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL) {
		p[0] = 0;
		p[1] = 0;
		p[2] = 0;
		p[3] = 0;
		cpuid_count(7, 0, p);

		/*
		 * Some hypervisors fail to implement
		 * MSR_IA32_ARCH_CAPABILITIES.
		 */
		if (p[3] & CPUID_STDEXT3_ARCH_CAP) {
			msr = 0;
			if (rdmsr_safe(MSR_IA32_ARCH_CAPABILITIES, &msr)) {
				kprintf("Warning: MSR_IA32_ARCH_CAPABILITIES "
					"cannot be accessed\n");
			}
			if (msr & IA32_ARCH_CAP_MDS_NO)
				rv = MDS_NOT_REQUIRED;
		}
		if (p[3] & CPUID_STDEXT3_AVX5124VNNIW)
			rv |= MDS_AVX512_4VNNIW_SUPPORTED;
		if (p[3] & CPUID_STDEXT3_AVX5124FMAPS)
			rv |= MDS_AVX512_4FMAPS_SUPPORTED;
		if (p[3] & CPUID_STDEXT3_MD_CLEAR)
			rv |= MDS_MD_CLEAR_SUPPORTED;
		if (p[3] & CPUID_STDEXT3_TSXFA)
			rv |= MDS_TSX_FORCE_ABORT_SUPPORTED;
	} else {
		rv = MDS_NOT_REQUIRED;
	}

	return rv;
}

/*
 * Iterate CPUs and adjust MSR for global operations, since
 * the KMMU* code won't do it if mds_mitigation is 0 or 2.
 */
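/* A mode must be both requested via sysctl and supported by the cpu */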
#define CHECK(flag)	(mds_mitigation & mds_support & (flag))

static
void
mds_sysctl_changed(void)
{
	globaldata_t save_gd;
	struct trampframe *tr;
	int spec_ctrl;
	int spec_mask;
	int mode;
	int n;

	spec_mask = SPEC_CTRL_MDS_ENABLE;

	/*
	 * Fixup state
	 */
	mode = 0;
	save_gd = mycpu;
	for (n = 0; n < ncpus; ++n) {
		lwkt_setcpu_self(globaldata_find(n));
		cpu_ccfence();
		tr = &pscpu->trampoline;

		/*
		 * Make sure we are cleaned out.
		 *
		 * XXX cleanup, reusing globals inside the loop (they get
		 * set to the same thing each loop)
		 *
		 * [0] kernel entry (idle exit)
		 * [1] kernel exit  (idle entry)
		 */
		tr->tr_pcb_spec_ctrl[0] &= ~spec_mask;
		tr->tr_pcb_spec_ctrl[1] &= ~spec_mask;

		/*
		 * Don't try to parse if not available
		 */
		if (mds_mitigation < 0)
			continue;

		spec_ctrl = 0;
		if (CHECK(MDS_MD_CLEAR_SUPPORTED)) {
			spec_ctrl |= SPEC_CTRL_MDS_ENABLE;
			mode |= MDS_MD_CLEAR_SUPPORTED;
		}

		/*
		 * Update spec_ctrl fields in the trampoline.
		 *
		 * [0] on-kernel-entry (on-idle-exit)
		 * [1] on-kernel-exit  (on-idle-entry)
		 *
		 * The MDS stuff is only needed on kernel-exit or idle-entry
		 */
		/* tr->tr_pcb_spec_ctrl[0] |= spec_ctrl; */
		tr->tr_pcb_spec_ctrl[1] |= spec_ctrl;

		/*
		 * Make sure we set this on the first loop.  It will be
		 * the same value on remaining loops.
		 */
		mds_mode = mode;
	}
	lwkt_setcpu_self(save_gd);
	cpu_ccfence();

	/*
	 * Console message on mitigation mode change
	 */
	kprintf("MDS: support=(");
	if (mds_support == 0) {
		kprintf(" none");
	} else {
		if (mds_support & MDS_AVX512_4VNNIW_SUPPORTED)
			kprintf(" AVX512_4VNNIW");
		if (mds_support & MDS_AVX512_4FMAPS_SUPPORTED)
			kprintf(" AVX512_4FMAPS");
		if (mds_support & MDS_MD_CLEAR_SUPPORTED)
			kprintf(" MD_CLEAR");
		if (mds_support & MDS_TSX_FORCE_ABORT_SUPPORTED)
			kprintf(" TSX_FORCE_ABORT");
		if (mds_support & MDS_NOT_REQUIRED)
			kprintf(" MDS_NOT_REQUIRED");
	}
	kprintf(" ) req=%04x operating=(", (uint16_t)mds_mitigation);
	if (mds_mode == 0) {
		kprintf(" none");
	} else {
		if (mds_mode & MDS_AVX512_4VNNIW_SUPPORTED)
			kprintf(" AVX512_4VNNIW");
		if (mds_mode & MDS_AVX512_4FMAPS_SUPPORTED)
			kprintf(" AVX512_4FMAPS");
		if (mds_mode & MDS_MD_CLEAR_SUPPORTED)
			kprintf(" MD_CLEAR");
		if (mds_mode & MDS_TSX_FORCE_ABORT_SUPPORTED)
			kprintf(" TSX_FORCE_ABORT");
		if (mds_mode & MDS_NOT_REQUIRED)
			kprintf(" MDS_NOT_REQUIRED");
	}
	kprintf(" )\n");
}

#undef CHECK

/*
 * User changes sysctl value
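 *
 * The new value is parsed as a space or comma separated list of the
 * keywords accepted below (NONE, AVX512_4VNNIW, AVX512_4FMAPS, MD_CLEAR,
 * TSX_FORCE_ABORT, MDS_NOT_REQUIRED), for example:
 *
 *	sysctl machdep.mds_mitigation=MD_CLEAR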
 */
static int
sysctl_mds_mitigation(SYSCTL_HANDLER_ARGS)
{
	char buf[128];
	char *ptr;
	char *iter;
	size_t len;
	int mds;
	int error = 0;
	int loop = 0;

	/*
	 * Return current operating mode or support.
	 */
	if (oidp->oid_kind & CTLFLAG_WR)
		mds = mds_mode;
	else
		mds = mds_support;

	mds &= MDS_AVX512_4VNNIW_SUPPORTED |
	       MDS_AVX512_4FMAPS_SUPPORTED |
	       MDS_MD_CLEAR_SUPPORTED |
	       MDS_TSX_FORCE_ABORT_SUPPORTED |
	       MDS_NOT_REQUIRED;

	while (mds) {
		if (error)
			break;
		if (loop++) {
			error = SYSCTL_OUT(req, " ", 1);
			if (error)
				break;
		}
		if (mds & MDS_AVX512_4VNNIW_SUPPORTED) {
			mds &= ~MDS_AVX512_4VNNIW_SUPPORTED;
			error = SYSCTL_OUT(req, "AVX512_4VNNIW", 13);
		} else
		if (mds & MDS_AVX512_4FMAPS_SUPPORTED) {
			mds &= ~MDS_AVX512_4FMAPS_SUPPORTED;
			error = SYSCTL_OUT(req, "AVX512_4FMAPS", 13);
		} else
		if (mds & MDS_MD_CLEAR_SUPPORTED) {
			mds &= ~MDS_MD_CLEAR_SUPPORTED;
			error = SYSCTL_OUT(req, "MD_CLEAR", 8);
		} else
		if (mds & MDS_TSX_FORCE_ABORT_SUPPORTED) {
			mds &= ~MDS_TSX_FORCE_ABORT_SUPPORTED;
			error = SYSCTL_OUT(req, "TSX_FORCE_ABORT", 15);
		} else
		if (mds & MDS_NOT_REQUIRED) {
			mds &= ~MDS_NOT_REQUIRED;
			error = SYSCTL_OUT(req, "MDS_NOT_REQUIRED", 16);
		}
	}
	if (loop == 0) {
		error = SYSCTL_OUT(req, "NONE", 4);
	}

	if (error || req->newptr == NULL)
		return error;
	if ((oidp->oid_kind & CTLFLAG_WR) == 0)
		return error;

	/*
	 * Change current operating mode
	 */
	len = req->newlen - req->newidx;
	if (len >= sizeof(buf)) {
		error = EINVAL;
		len = 0;
	} else {
		error = SYSCTL_IN(req, buf, len);
	}
	buf[len] = 0;
	iter = &buf[0];
	mds = 0;

	while (error == 0 && iter) {
		ptr = strsep(&iter, " ,\t\r\n");
		if (*ptr == 0)
			continue;
		if (strcasecmp(ptr, "NONE") == 0)
			mds |= 0;
		else if (strcasecmp(ptr, "AVX512_4VNNIW") == 0)
			mds |= MDS_AVX512_4VNNIW_SUPPORTED;
		else if (strcasecmp(ptr, "AVX512_4FMAPS") == 0)
			mds |= MDS_AVX512_4FMAPS_SUPPORTED;
		else if (strcasecmp(ptr, "MD_CLEAR") == 0)
			mds |= MDS_MD_CLEAR_SUPPORTED;
		else if (strcasecmp(ptr, "TSX_FORCE_ABORT") == 0)
			mds |= MDS_TSX_FORCE_ABORT_SUPPORTED;
		else if (strcasecmp(ptr, "MDS_NOT_REQUIRED") == 0)
			mds |= MDS_NOT_REQUIRED;
		else
			error = ENOENT;
	}
	if (error == 0) {
		mds_mitigation = mds;
		mds_sysctl_changed();
	}
	return error;
}

SYSCTL_PROC(_machdep, OID_AUTO, mds_mitigation,
	CTLTYPE_STRING | CTLFLAG_RW,
	0, 0, sysctl_mds_mitigation, "A", "MDS exploit mitigation");
SYSCTL_PROC(_machdep, OID_AUTO, mds_support,
	CTLTYPE_STRING | CTLFLAG_RD,
	0, 0, sysctl_mds_mitigation, "A", "MDS supported features");

/*
 * NOTE: Called at SI_BOOT2_MACHDEP and also when the microcode is
 *	 updated.  Microcode updates must be applied to all cpus
 *	 for support to be recognized.
 */
static void
mds_vm_setup(void *arg)
{
	int inconsistent = 0;
	int supmask;

	/*
	 * Fetch tunable in auto mode
	 */
	if (mds_mitigation < 0) {
		TUNABLE_INT_FETCH("machdep.mds_mitigation", &mds_mitigation);
	}

	if ((supmask = mds_check_support()) != 0) {
		/*
		 * Must be supported on all cpus before we
		 * can enable it.  Returns silently if it
		 * isn't.
		 *
		 * NOTE! arg != NULL indicates we were called
		 *	 from cpuctl after a successful microcode
		 *	 update.
		 */
		if (arg != NULL) {
			globaldata_t save_gd;
			int n;

			save_gd = mycpu;
			for (n = 0; n < ncpus; ++n) {
				lwkt_setcpu_self(globaldata_find(n));
				cpu_ccfence();
				if (mds_check_support() != supmask) {
					inconsistent = 1;
					break;
				}
			}
			lwkt_setcpu_self(save_gd);
			cpu_ccfence();
		}
	}

	/*
	 * Be silent while microcode is being loaded on various CPUs,
	 * until all done.
	 */
	if (inconsistent) {
		mds_mitigation = -1;
		mds_support = 0;
		return;
	}

	/*
	 * MDS support
	 */
	mds_support = supmask;

	/*
	 * Enable mds_mitigation, set defaults if -1, adjust
	 * tuned value according to support if not.
	 *
	 * NOTE!  MDS is not enabled by default.
	 */
	if (mds_support) {
		if (mds_mitigation < 0) {
			mds_mitigation = 0;

			if ((mds_support & MDS_NOT_REQUIRED) == 0 &&
			    (mds_support & MDS_MD_CLEAR_SUPPORTED)) {
				/* mds_mitigation |= MDS_MD_CLEAR_SUPPORTED; */
			}
		}
	} else {
		mds_mitigation = -1;
	}

	/*
	 * Disallow sysctl changes when there is no support (otherwise
	 * the wrmsr will cause a protection fault).
	 */
	if (mds_mitigation < 0)
		sysctl___machdep_mds_mitigation.oid_kind &= ~CTLFLAG_WR;
	else
		sysctl___machdep_mds_mitigation.oid_kind |= CTLFLAG_WR;

	mds_sysctl_changed();
}

/*
 * NOTE: Called at SI_BOOT2_MACHDEP and also when the microcode is
 *	 updated.  Microcode updates must be applied to all cpus
 *	 for support to be recognized.
 */
void
mitigation_vm_setup(void *arg)
{
	spectre_vm_setup(arg);
	mds_vm_setup(arg);
}

SYSINIT(mitigation_vm_setup, SI_BOOT2_MACHDEP, SI_ORDER_ANY,
	mitigation_vm_setup, NULL);

/*
 * platform-specific vmspace initialization (nothing for x86_64)
 */
void
cpu_vmspace_alloc(struct vmspace *vm __unused)
{
}

void
cpu_vmspace_free(struct vmspace *vm __unused)
{
}

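/*
 * Validate that the kernel virtual address range [saddr, eaddr) lies
 * within KVA, is backed by mapped pages, and passes kernacc() for the
 * requested protection.  Returns 0 on success or EFAULT.
 */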
int
kvm_access_check(vm_offset_t saddr, vm_offset_t eaddr, int prot)
{
	vm_offset_t addr;

	if (saddr < KvaStart)
		return EFAULT;
	if (eaddr >= KvaEnd)
		return EFAULT;
	for (addr = saddr; addr < eaddr; addr += PAGE_SIZE) {
		if (pmap_kextract(addr) == 0)
			return EFAULT;
	}
	if (!kernacc((caddr_t)saddr, eaddr - saddr, prot))
		return EFAULT;
	return 0;
}

#if 0

void _test_frame_enter(struct trapframe *frame);
void _test_frame_exit(struct trapframe *frame);

void
_test_frame_enter(struct trapframe *frame)
{
	thread_t td = curthread;

	if (ISPL(frame->tf_cs) == SEL_UPL) {
		KKASSERT(td->td_lwp);
		KASSERT(td->td_lwp->lwp_md.md_regs == frame,
			("_test_frame_enter: Frame mismatch %p %p",
			td->td_lwp->lwp_md.md_regs, frame));
		td->td_lwp->lwp_saveusp = (void *)frame->tf_rsp;
		td->td_lwp->lwp_saveupc = (void *)frame->tf_rip;
	}
	if ((char *)frame < td->td_kstack ||
	    (char *)frame > td->td_kstack + td->td_kstack_size) {
		panic("_test_frame_enter: frame not on kstack %p kstack=%p",
			frame, td->td_kstack);
	}
}

void
_test_frame_exit(struct trapframe *frame)
{
	thread_t td = curthread;

	if (ISPL(frame->tf_cs) == SEL_UPL) {
		KKASSERT(td->td_lwp);
		KASSERT(td->td_lwp->lwp_md.md_regs == frame,
			("_test_frame_exit: Frame mismatch %p %p",
			td->td_lwp->lwp_md.md_regs, frame));
		if (td->td_lwp->lwp_saveusp != (void *)frame->tf_rsp) {
			kprintf("_test_frame_exit: %s:%d usp mismatch %p/%p\n",
				td->td_comm, td->td_proc->p_pid,
				td->td_lwp->lwp_saveusp,
				(void *)frame->tf_rsp);
		}
		if (td->td_lwp->lwp_saveupc != (void *)frame->tf_rip) {
			kprintf("_test_frame_exit: %s:%d upc mismatch %p/%p\n",
				td->td_comm, td->td_proc->p_pid,
				td->td_lwp->lwp_saveupc,
				(void *)frame->tf_rip);
		}

		/*
		 * adulterate the fields to catch entries that
		 * don't run through test_frame_enter
		 */
		td->td_lwp->lwp_saveusp =
			(void *)~(intptr_t)td->td_lwp->lwp_saveusp;
		td->td_lwp->lwp_saveupc =
			(void *)~(intptr_t)td->td_lwp->lwp_saveupc;
	}
	if ((char *)frame < td->td_kstack ||
	    (char *)frame > td->td_kstack + td->td_kstack_size) {
		panic("_test_frame_exit: frame not on kstack %p kstack=%p",
			frame, td->td_kstack);
	}
}

#endif