/*-
 * Copyright (c) 1982, 1986 The Regents of the University of California.
 * Copyright (c) 1989, 1990 William Jolitz
 * Copyright (c) 1994 John Dyson
 * Copyright (c) 2008-2018 The DragonFly Project.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_machdep.c	7.3 (Berkeley) 5/13/91
 *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
 * $FreeBSD: src/sys/i386/i386/vm_machdep.c,v 1.132.2.9 2003/01/25 19:02:23 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/interrupt.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>
#include <sys/lwp.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/smp.h>
#include <machine/pcb.h>
#include <machine/pcb_ext.h>
#include <machine/segments.h>
#include <machine/globaldata.h>	/* npxthread */
#include <machine/specialreg.h>
#include <machine/vmm.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

#include <bus/isa/isa.h>

static void	cpu_reset_real (void);

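/*
 * spectre_mitigation holds the requested set of mitigation modes, using
 * the *_SUPPORTED flag bits defined below (-1 until configured, or when
 * no support is present).  spectre_support holds the set of modes
 * detected in hardware via CPUID.
 */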
static int spectre_mitigation = -1;
static int spectre_support = 0;

static int spectre_mode = 0;
SYSCTL_INT(_machdep, OID_AUTO, spectre_mode, CTLFLAG_RD,
	&spectre_mode, 0, "current Spectre enablements");

/*
 * Finish a fork operation, with lwp lp2 nearly set up.
 * Copy and update the pcb, and set up the stack so that the child is
 * ready to run and return to user mode.
 */
void
cpu_fork(struct lwp *lp1, struct lwp *lp2, int flags)
{
	struct pcb *pcb2;
	struct pmap *pmap2;

	if ((flags & RFPROC) == 0) {
		if ((flags & RFMEM) == 0) {
			/*
			 * Unshare user LDT.  > 1 test is MPSAFE.  While
			 * it can potentially race a 2->1 transition, the
			 * worst that happens is that we do an unnecessary
			 * ldt replacement.
			 */
			struct pcb *pcb1 = lp1->lwp_thread->td_pcb;
			struct pcb_ldt *pcb_ldt = pcb1->pcb_ldt;

			if (pcb_ldt && pcb_ldt->ldt_refcnt > 1) {
				pcb_ldt = user_ldt_alloc(pcb1,pcb_ldt->ldt_len);
				user_ldt_free(pcb1);
				pcb1->pcb_ldt = pcb_ldt;
				set_user_ldt(pcb1);
			}
		}
		return;
	}

	/* Ensure that lp1's pcb is up to date. */
	if (mdcpu->gd_npxthread == lp1->lwp_thread)
		npxsave(lp1->lwp_thread->td_savefpu);

	/*
	 * Copy lp1's PCB.  This really only applies to the
	 * debug registers and FP state, but it's faster to just copy the
	 * whole thing.  Because we only save the PCB at switchout time,
	 * the register state may not be current.
	 */
	pcb2 = lp2->lwp_thread->td_pcb;
	*pcb2 = *lp1->lwp_thread->td_pcb;

	/*
	 * Create a new fresh stack for the new process.
	 * Copy the trap frame for the return to user mode as if from a
	 * syscall.  This copies the user mode register values.
	 *
	 * pcb_rsp must allocate an additional call-return pointer below
	 * the trap frame which will be restored by cpu_heavy_restore from
	 * PCB_RIP, and the thread's td_sp pointer must allocate an
	 * additional two quadwords below the pcb_rsp call-return pointer to
	 * hold the LWKT restore function pointer and rflags.
	 *
	 * The LWKT restore function pointer must be set to cpu_heavy_restore,
	 * which is our standard heavy-weight process switch-in function.
	 * YYY eventually we should shortcut fork_return and fork_trampoline
	 * to use the LWKT restore function directly so we can get rid of
	 * all the extra crap we are setting up.
	 */
	lp2->lwp_md.md_regs = (struct trapframe *)pcb2 - 1;
	bcopy(lp1->lwp_md.md_regs, lp2->lwp_md.md_regs, sizeof(*lp2->lwp_md.md_regs));

	/*
	 * Set registers for trampoline to user mode.  Leave space for the
	 * return address on stack.  These are the kernel mode register values.
	 *
	 * Set the new pmap CR3.  If the new process uses isolated VM spaces,
	 * also set the isolated CR3.
	 */
	pmap2 = vmspace_pmap(lp2->lwp_proc->p_vmspace);
	pcb2->pcb_cr3 = vtophys(pmap2->pm_pml4);
	if ((pcb2->pcb_flags & PCB_ISOMMU) && pmap2->pm_pmlpv_iso) {
		pcb2->pcb_cr3_iso = vtophys(pmap2->pm_pml4_iso);
	} else {
		pcb2->pcb_flags &= ~PCB_ISOMMU;
		pcb2->pcb_cr3_iso = 0;
	}

#if 0
	/*
	 * Per-process spectre mitigation (future)
	 */
	pcb2->pcb_flags &= ~(PCB_IBRS1 | PCB_IBRS2);
	switch (spectre_mitigation) {
	case 1:
		pcb2->pcb_flags |= PCB_IBRS1;
		break;
	case 2:
		pcb2->pcb_flags |= PCB_IBRS2;
		break;
	default:
		break;
	}
#endif

	pcb2->pcb_rbx = (unsigned long)fork_return;	/* fork_trampoline argument */
	pcb2->pcb_rbp = 0;
	pcb2->pcb_rsp = (unsigned long)lp2->lwp_md.md_regs - sizeof(void *);
	pcb2->pcb_r12 = (unsigned long)lp2;		/* fork_trampoline argument */
	pcb2->pcb_r13 = 0;
	pcb2->pcb_r14 = 0;
	pcb2->pcb_r15 = 0;
	pcb2->pcb_rip = (unsigned long)fork_trampoline;
	lp2->lwp_thread->td_sp = (char *)(pcb2->pcb_rsp - sizeof(void *));
	*(u_int64_t *)lp2->lwp_thread->td_sp = PSL_USER;
	lp2->lwp_thread->td_sp -= sizeof(void *);
	*(void **)lp2->lwp_thread->td_sp = (void *)cpu_heavy_restore;

	/*
	 * pcb2->pcb_ldt:	duplicated below, if necessary.
	 * pcb2->pcb_savefpu:	cloned above.
	 * pcb2->pcb_flags:	cloned above
	 * pcb2->pcb_onfault:	cloned above (always NULL here).
	 * pcb2->pcb_onfault_sp:cloned above (don't care)
	 */

	/*
	 * XXX don't copy the i/o pages.  this should probably be fixed.
	 */
	pcb2->pcb_ext = NULL;

	/* Copy the LDT, if necessary. */
	if (pcb2->pcb_ldt != NULL) {
		if (flags & RFMEM) {
			atomic_add_int(&pcb2->pcb_ldt->ldt_refcnt, 1);
		} else {
			pcb2->pcb_ldt = user_ldt_alloc(pcb2,
						       pcb2->pcb_ldt->ldt_len);
		}
	}
	bcopy(&lp1->lwp_thread->td_tls, &lp2->lwp_thread->td_tls,
	      sizeof(lp2->lwp_thread->td_tls));
	/*
	 * Now, cpu_switch() can schedule the new lwp.
	 * pcb_rsp is loaded pointing to the cpu_switch() stack frame
	 * containing the return address when exiting cpu_switch.
	 * This will normally be to fork_trampoline(), which will have
	 * %rbx loaded with the new lwp's pointer.  fork_trampoline()
	 * will set up a stack to call fork_return(lp, frame); to complete
	 * the return to user-mode.
	 */
}

/*
 * Prepare new lwp to return to the address specified in params.
 */
int
cpu_prepare_lwp(struct lwp *lp, struct lwp_params *params)
{
	struct trapframe *regs = lp->lwp_md.md_regs;
	void *bad_return = NULL;
	int error;

	regs->tf_rip = (long)params->lwp_func;
	regs->tf_rsp = (long)params->lwp_stack;
	/* Set up argument for function call */
	regs->tf_rdi = (long)params->lwp_arg;

	/*
	 * Set up fake return address.  As the lwp function may never return,
	 * we simply copy out a NULL pointer and force the lwp to receive
	 * a SIGSEGV if it returns anyway.
	 */
	regs->tf_rsp -= sizeof(void *);
	error = copyout(&bad_return, (void *)regs->tf_rsp, sizeof(bad_return));
	if (error)
		return (error);

	if (lp->lwp_proc->p_vmm) {
		lp->lwp_thread->td_pcb->pcb_cr3 = KPML4phys;
		cpu_set_fork_handler(lp,
		    (void (*)(void *, struct trapframe *))vmm_lwp_return, lp);
	} else {
		cpu_set_fork_handler(lp,
		    (void (*)(void *, struct trapframe *))generic_lwp_return, lp);
	}
	return (0);
}

/*
 * Intercept the return address from a freshly forked process that has NOT
 * been scheduled yet.
 *
 * This is needed to make kernel threads stay in kernel mode.
 */
void
cpu_set_fork_handler(struct lwp *lp, void (*func)(void *, struct trapframe *),
		     void *arg)
{
	/*
	 * Note that the trap frame follows the args, so the function
	 * is really called like this:  func(arg, frame);
	 */
	lp->lwp_thread->td_pcb->pcb_rbx = (long)func;	/* function */
	lp->lwp_thread->td_pcb->pcb_r12 = (long)arg;	/* first arg */
}

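/*
 * Set up a kernel thread's initial state so that its first switch-in
 * runs func(arg).  cpu_kthread_restore sits on top of the stack and is
 * popped first by the LWKT switch code; rfunc is pushed below it as the
 * return address and acts as the thread's exit function if func returns.
 */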
void
cpu_set_thread_handler(thread_t td, void (*rfunc)(void), void *func, void *arg)
{
	td->td_pcb->pcb_rbx = (long)func;
	td->td_pcb->pcb_r12 = (long)arg;
	td->td_switch = cpu_lwkt_switch;
	td->td_sp -= sizeof(void *);
	*(void **)td->td_sp = rfunc;	/* exit function on return */
	td->td_sp -= sizeof(void *);
	*(void **)td->td_sp = cpu_kthread_restore;
}

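/*
 * Machine-dependent lwp exit: clear any hardware breakpoints, account
 * for the context switch, remove the thread from its sleep queue and
 * run queues, and finish up in cpu_thread_exit().
 */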
void
cpu_lwp_exit(void)
{
	struct thread *td = curthread;
	struct pcb *pcb;

	pcb = td->td_pcb;

	/* Some x86 functionality was dropped */
	KKASSERT(pcb->pcb_ext == NULL);

	/*
	 * disable all hardware breakpoints
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		reset_dbregs();
		pcb->pcb_flags &= ~PCB_DBREGS;
	}
	td->td_gd->gd_cnt.v_swtch++;

	crit_enter_quick(td);
	if (td->td_flags & TDF_TSLEEPQ)
		tsleep_remove(td);
	lwkt_deschedule_self(td);
	lwkt_remove_tdallq(td);
	cpu_thread_exit();
}

/*
 * Terminate the current thread.  The caller must have already acquired
 * the thread's rwlock and placed it on a reap list or otherwise notified
 * a reaper of its existence.  We set a special assembly switch function
 * which releases td_rwlock after it has cleaned up the MMU state and
 * switched out the stack.
 *
 * Must be called from a critical section and with the thread descheduled.
 */
void
cpu_thread_exit(void)
{
	npxexit();
	curthread->td_switch = cpu_exit_switch;
	curthread->td_flags |= TDF_EXITING;
	lwkt_switch();
	panic("cpu_thread_exit: lwkt_switch() unexpectedly returned");
}

void
cpu_reset(void)
{
	cpu_reset_real();
}

static void
cpu_reset_real(void)
{
	/*
	 * Attempt to do a CPU reset via the keyboard controller,
	 * do not turn off the GateA20, as any machine that fails
	 * to do the reset here would then end up in no man's land.
	 */

#if !defined(BROKEN_KEYBOARD_RESET)
	outb(IO_KBD + 4, 0xFE);
	DELAY(500000);	/* wait 0.5 sec to see if that did it */
	kprintf("Keyboard reset did not work, attempting CPU shutdown\n");
	DELAY(1000000);	/* wait 1 sec for kprintf to complete */
#endif
#if 0 /* JG */
	/* force a shutdown by unmapping entire address space ! */
	bzero((caddr_t) PTD, PAGE_SIZE);
#endif

	/* "good night, sweet prince .... <THUNK!>" */
	cpu_invltlb();
	/* NOTREACHED */
	while(1);
}

/*
 * Convert kernel VA to physical address
 */
vm_paddr_t
kvtop(void *addr)
{
	vm_paddr_t pa;

	pa = pmap_kextract((vm_offset_t)addr);
	if (pa == 0)
		panic("kvtop: zero page frame");
	return (pa);
}

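/*
 * Software interrupt handler for the VM SWI, registered at boot below.
 * Currently it only drains pending busdma work.
 */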
static void
swi_vm(void *arg, void *frame)
{
	if (busdma_swi_pending != 0)
		busdma_swi();
}

static void
swi_vm_setup(void *arg)
{
	register_swi_mp(SWI_VM, swi_vm, NULL, "swi_vm", NULL, 0);
}

SYSINIT(swi_vm_setup, SI_BOOT2_MACHDEP, SI_ORDER_ANY, swi_vm_setup, NULL);

/*
 * NOTE: This routine is also called after a successful microcode
 *	 reload on cpu 0.
 */
void spectre_vm_setup(void *arg);

/*
 * Check for IBPB and IBRS support
 *
 * These bits also specify desired modes in the spectre_mitigation sysctl.
 */
#define IBRS_SUPPORTED		0x0001
#define STIBP_SUPPORTED		0x0002
#define IBPB_SUPPORTED		0x0004
#define IBRS_AUTO_SUPPORTED	0x0008
#define STIBP_AUTO_SUPPORTED	0x0010
#define IBRS_PREFERRED_REQUEST	0x0020

static
int
spectre_check_support(void)
{
	uint32_t p[4];
	int rv = 0;

	/*
	 * Spectre mitigation hw bits
	 *
	 * IBRS		Indirect Branch Restricted Speculation   (isolation)
	 * STIBP	Single Thread Indirect Branch Prediction (isolation)
	 * IBPB		Branch Prediction Barrier		 (barrier)
	 *
	 * IBRS and STIBP must be toggled (enabled on entry to kernel,
	 * disabled on exit, as well as disabled during any MWAIT/HLT).
	 * When *_AUTO bits are available, IBRS and STIBP may be left
	 * turned on and do not have to be toggled on kernel entry/exit.
	 *
	 * All this shit has enormous overhead.  IBPB in particular, and
	 * non-auto modes are disabled by default.
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL) {
		p[0] = 0;
		p[1] = 0;
		p[2] = 0;
		p[3] = 0;
		cpuid_count(7, 0, p);
		if (p[3] & CPUID_7_0_I3_SPEC_CTRL)
			rv |= IBRS_SUPPORTED | IBPB_SUPPORTED;
		if (p[3] & CPUID_7_0_I3_STIBP)
			rv |= STIBP_SUPPORTED;

		/*
		 * 0x80000008 p[1] bit 12 indicates IBPB support
		 *
		 * This bit might be set even though SPEC_CTRL is not set.
		 */
		p[0] = 0;
		p[1] = 0;
		p[2] = 0;
		p[3] = 0;
		do_cpuid(0x80000008U, p);
		if (p[1] & CPUID_INTEL_80000008_I1_IBPB_SUPPORT)
			rv |= IBPB_SUPPORTED;
	} else if (cpu_vendor_id == CPU_VENDOR_AMD) {
		/*
		 * 0x80000008 p[1] bit 12 indicates IBPB support
		 *	      p[1] bit 14 indicates IBRS support
		 *	      p[1] bit 15 indicates STIBP support
		 *
		 *	      p[1] bit 16 indicates IBRS auto support
		 *	      p[1] bit 17 indicates STIBP auto support
		 *	      p[1] bit 18 indicates processor prefers using
		 *		IBRS instead of retpoline.
		 */
		p[0] = 0;
		p[1] = 0;
		p[2] = 0;
		p[3] = 0;
		do_cpuid(0x80000008U, p);
		if (p[1] & CPUID_AMD_80000008_I1_IBPB_SUPPORT)
			rv |= IBPB_SUPPORTED;
		if (p[1] & CPUID_AMD_80000008_I1_IBRS_SUPPORT)
			rv |= IBRS_SUPPORTED;
		if (p[1] & CPUID_AMD_80000008_I1_STIBP_SUPPORT)
			rv |= STIBP_SUPPORTED;

		if (p[1] & CPUID_AMD_80000008_I1_IBRS_AUTO)
			rv |= IBRS_AUTO_SUPPORTED;
		if (p[1] & CPUID_AMD_80000008_I1_STIBP_AUTO)
			rv |= STIBP_AUTO_SUPPORTED;
		if (p[1] & CPUID_AMD_80000008_I1_IBRS_REQUESTED)
			rv |= IBRS_PREFERRED_REQUEST;
	}

	return rv;
}

/*
 * Iterate CPUs and adjust MSR for global operations, since
 * the KMMU* code won't do it if spectre_mitigation is 0 or 2.
 */
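/* A mode is operative only when it is both requested and supported */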
#define CHECK(flag)	(spectre_mitigation & spectre_support & (flag))

static
void
spectre_sysctl_changed(void)
{
	globaldata_t save_gd;
	struct trampframe *tr;
	int spec_ctrl;
	int mode;
	int n;

	/*
	 * Fixup state
	 */
	mode = 0;
	save_gd = mycpu;
	for (n = 0; n < ncpus; ++n) {
		lwkt_setcpu_self(globaldata_find(n));
		cpu_ccfence();
		tr = &pscpu->trampoline;

		/*
		 * Make sure we are cleaned out.
		 *
		 * XXX cleanup, reusing globals inside the loop (they get
		 * set to the same thing each loop)
		 */
		tr->tr_pcb_spec_ctrl[0] = 0;	/* kernel entry (idle exit) */
		tr->tr_pcb_spec_ctrl[1] = 0;	/* kernel exit  (idle entry) */

		/*
		 * Don't try to parse if not available
		 */
		if (spectre_mitigation < 0)
			continue;

		/*
		 * IBRS mode.  Auto overrides toggling.
		 *
		 * Only set the ENABLE flag if we have to toggle something
		 * on entry and exit.
		 */
		spec_ctrl = 0;
		if (CHECK(IBRS_AUTO_SUPPORTED)) {
			spec_ctrl |= SPEC_CTRL_IBRS;
			mode |= IBRS_AUTO_SUPPORTED;
		} else if (CHECK(IBRS_SUPPORTED)) {
			spec_ctrl |= SPEC_CTRL_IBRS | SPEC_CTRL_DUMMY_ENABLE;
			mode |= IBRS_SUPPORTED;
		}
		if (CHECK(STIBP_AUTO_SUPPORTED)) {
			spec_ctrl |= SPEC_CTRL_STIBP;
			mode |= STIBP_AUTO_SUPPORTED;
		} else if (CHECK(STIBP_SUPPORTED)) {
			spec_ctrl |= SPEC_CTRL_STIBP | SPEC_CTRL_DUMMY_ENABLE;
			mode |= STIBP_SUPPORTED;
		}

		/*
		 * IBPB requested and supported.
		 */
		if (CHECK(IBPB_SUPPORTED)) {
			spec_ctrl |= SPEC_CTRL_DUMMY_IBPB;
			mode |= IBPB_SUPPORTED;
		}

		/*
		 * Update the MSR if the cpu supports the modes to ensure
		 * proper disablement if the user disabled the mode.
		 */
		if (spectre_support & (IBRS_SUPPORTED | IBRS_AUTO_SUPPORTED |
				    STIBP_SUPPORTED | STIBP_AUTO_SUPPORTED)) {
			wrmsr(MSR_SPEC_CTRL,
			      spec_ctrl & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
		}

		/*
		 * Update spec_ctrl fields in the trampoline.
		 *
		 * [0] on-kernel-entry (on-idle-exit)
		 * [1] on-kernel-exit  (on-idle-entry)
		 *
		 * When auto mode is supported we leave the bit set, otherwise
		 * we clear the bits.
		 */
		tr->tr_pcb_spec_ctrl[0] = spec_ctrl;
		if (CHECK(IBRS_AUTO_SUPPORTED) == 0)
			spec_ctrl &= ~SPEC_CTRL_IBRS;
		if (CHECK(STIBP_AUTO_SUPPORTED) == 0)
			spec_ctrl &= ~SPEC_CTRL_STIBP;
		tr->tr_pcb_spec_ctrl[1] = spec_ctrl;

		/*
		 * Make sure we set this on the first loop.  It will be
		 * the same value on remaining loops.
		 */
		spectre_mode = mode;
	}
	lwkt_setcpu_self(save_gd);
	cpu_ccfence();

	/*
	 * Console message on mitigation mode change
	 */
	kprintf("Spectre: support=(");
	if (spectre_support == 0) {
		kprintf(" none");
	} else {
		if (spectre_support & IBRS_SUPPORTED)
			kprintf(" IBRS");
		if (spectre_support & STIBP_SUPPORTED)
			kprintf(" STIBP");
		if (spectre_support & IBPB_SUPPORTED)
			kprintf(" IBPB");
		if (spectre_support & IBRS_AUTO_SUPPORTED)
			kprintf(" IBRS_AUTO");
		if (spectre_support & STIBP_AUTO_SUPPORTED)
			kprintf(" STIBP_AUTO");
		if (spectre_support & IBRS_PREFERRED_REQUEST)
			kprintf(" IBRS_REQUESTED");
	}
	kprintf(" ) req=%04x operating=(", (uint16_t)spectre_mitigation);
	if (spectre_mode == 0) {
		kprintf(" none");
	} else {
		if (spectre_mode & IBRS_SUPPORTED)
			kprintf(" IBRS");
		if (spectre_mode & STIBP_SUPPORTED)
			kprintf(" STIBP");
		if (spectre_mode & IBPB_SUPPORTED)
			kprintf(" IBPB");
		if (spectre_mode & IBRS_AUTO_SUPPORTED)
			kprintf(" IBRS_AUTO");
		if (spectre_mode & STIBP_AUTO_SUPPORTED)
			kprintf(" STIBP_AUTO");
		if (spectre_mode & IBRS_PREFERRED_REQUEST)
			kprintf(" IBRS_REQUESTED");
	}
	kprintf(" )\n");
}

/*
 * User changes sysctl value
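 *
 * The new value is parsed as a whitespace- or comma-separated list of
 * the tokens NONE, IBRS, IBRS_AUTO, STIBP, STIBP_AUTO, and IBPB, e.g.
 *
 *	sysctl machdep.spectre_mitigation="IBRS_AUTO STIBP_AUTO"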
 */
static int
sysctl_spectre_mitigation(SYSCTL_HANDLER_ARGS)
{
	char buf[128];
	char *ptr;
	char *iter;
	size_t len;
	int spectre;
	int error = 0;
	int loop = 0;

	/*
	 * Return current operating mode or support.
	 */
	if (oidp->oid_kind & CTLFLAG_WR)
		spectre = spectre_mode;
	else
		spectre = spectre_support;

	spectre &= (IBRS_SUPPORTED | IBRS_AUTO_SUPPORTED |
		    STIBP_SUPPORTED | STIBP_AUTO_SUPPORTED |
		    IBPB_SUPPORTED);
	while (spectre) {
		if (error)
			break;
		if (loop++) {
			error = SYSCTL_OUT(req, " ", 1);
			if (error)
				break;
		}
		if (spectre & IBRS_SUPPORTED) {
			spectre &= ~IBRS_SUPPORTED;
			error = SYSCTL_OUT(req, "IBRS", 4);
		} else
		if (spectre & IBRS_AUTO_SUPPORTED) {
			spectre &= ~IBRS_AUTO_SUPPORTED;
			error = SYSCTL_OUT(req, "IBRS_AUTO", 9);
		} else
		if (spectre & STIBP_SUPPORTED) {
			spectre &= ~STIBP_SUPPORTED;
			error = SYSCTL_OUT(req, "STIBP", 5);
		} else
		if (spectre & STIBP_AUTO_SUPPORTED) {
			spectre &= ~STIBP_AUTO_SUPPORTED;
			error = SYSCTL_OUT(req, "STIBP_AUTO", 10);
		} else
		if (spectre & IBPB_SUPPORTED) {
			spectre &= ~IBPB_SUPPORTED;
			error = SYSCTL_OUT(req, "IBPB", 4);
		}
	}
	if (loop == 0) {
		error = SYSCTL_OUT(req, "NONE", 4);
	}

	if (error || req->newptr == NULL)
		return error;
	if ((oidp->oid_kind & CTLFLAG_WR) == 0)
		return error;

	/*
	 * Change current operating mode
	 */
	len = req->newlen - req->newidx;
	if (len >= sizeof(buf)) {
		error = EINVAL;
		len = 0;
	} else {
		error = SYSCTL_IN(req, buf, len);
	}
	buf[len] = 0;
	iter = &buf[0];
	spectre = 0;

	while (error == 0 && iter) {
		ptr = strsep(&iter, " ,\t\r\n");
		if (*ptr == 0)
			continue;
		if (strcasecmp(ptr, "NONE") == 0)
			spectre |= 0;
		else if (strcasecmp(ptr, "IBRS") == 0)
			spectre |= IBRS_SUPPORTED;
		else if (strcasecmp(ptr, "IBRS_AUTO") == 0)
			spectre |= IBRS_AUTO_SUPPORTED;
		else if (strcasecmp(ptr, "STIBP") == 0)
			spectre |= STIBP_SUPPORTED;
		else if (strcasecmp(ptr, "STIBP_AUTO") == 0)
			spectre |= STIBP_AUTO_SUPPORTED;
		else if (strcasecmp(ptr, "IBPB") == 0)
			spectre |= IBPB_SUPPORTED;
		else
			error = ENOENT;
	}
	if (error == 0) {
		spectre_mitigation = spectre;
		spectre_sysctl_changed();
	}
	return error;
}

SYSCTL_PROC(_machdep, OID_AUTO, spectre_mitigation,
	CTLTYPE_STRING | CTLFLAG_RW,
	0, 0, sysctl_spectre_mitigation, "A", "Spectre exploit mitigation");
SYSCTL_PROC(_machdep, OID_AUTO, spectre_support,
	CTLTYPE_STRING | CTLFLAG_RD,
	0, 0, sysctl_spectre_mitigation, "A", "Spectre supported features");

/*
 * NOTE: Called at SI_BOOT2_MACHDEP and also when the microcode is
 *	 updated.  Microcode updates must be applied to all cpus
 *	 for support to be recognized.
 */
void
spectre_vm_setup(void *arg)
{
	int inconsistent = 0;
	int supmask;

	/*
	 * Fetch tunable in auto mode
	 */
	if (spectre_mitigation < 0) {
		TUNABLE_INT_FETCH("machdep.spectre_mitigation",
				  &spectre_mitigation);
	}

	if ((supmask = spectre_check_support()) != 0) {
		/*
		 * Must be supported on all cpus before we
		 * can enable it.  Returns silently if it
		 * isn't.
		 *
		 * NOTE! arg != NULL indicates we were called
		 *	 from cpuctl after a successful microcode
		 *	 update.
		 */
		if (arg != NULL) {
			globaldata_t save_gd;
			int n;

			save_gd = mycpu;
			for (n = 0; n < ncpus; ++n) {
				lwkt_setcpu_self(globaldata_find(n));
				cpu_ccfence();
				if (spectre_check_support() !=
				    supmask) {
					inconsistent = 1;
					break;
				}
			}
			lwkt_setcpu_self(save_gd);
			cpu_ccfence();
		}
	}

	/*
	 * Be silent while microcode is being loaded on various CPUs,
	 * until all of them are done.
	 */
	if (inconsistent) {
		spectre_mitigation = -1;
		spectre_support = 0;
		return;
	}

	/*
	 * Record the full set of detected support bits
	 */
	spectre_support = supmask;

	/*
	 * Enable spectre_mitigation: set defaults if it is -1, otherwise
	 * adjust the tuned value according to support.
	 *
	 * NOTE!  We do not enable IBPB for user->kernel transitions
	 *	  by default, so this code is commented out for now.
	 */
	if (spectre_support) {
		if (spectre_mitigation < 0) {
			spectre_mitigation = 0;

			/*
			 * IBRS toggling not currently recommended as a
			 * default.
			 */
			if (spectre_support & IBRS_AUTO_SUPPORTED)
				spectre_mitigation |= IBRS_AUTO_SUPPORTED;
			else if (spectre_support & IBRS_SUPPORTED)
				spectre_mitigation |= 0;

			/*
			 * STIBP toggling not currently recommended as a
			 * default.
			 */
			if (spectre_support & STIBP_AUTO_SUPPORTED)
				spectre_mitigation |= STIBP_AUTO_SUPPORTED;
			else if (spectre_support & STIBP_SUPPORTED)
				spectre_mitigation |= 0;

			/*
			 * IBPB adds enormous (~2uS) overhead to system
			 * calls etc, we do not enable it by default.
			 */
			if (spectre_support & IBPB_SUPPORTED)
				spectre_mitigation |= 0;
		}
	} else {
		spectre_mitigation = -1;
	}

	/*
	 * Disallow sysctl changes when there is no support (otherwise
	 * the wrmsr will cause a protection fault).
	 */
	if (spectre_mitigation < 0)
		sysctl___machdep_spectre_mitigation.oid_kind &= ~CTLFLAG_WR;
	else
		sysctl___machdep_spectre_mitigation.oid_kind |= CTLFLAG_WR;

	spectre_sysctl_changed();
}

SYSINIT(spectre_vm_setup, SI_BOOT2_MACHDEP, SI_ORDER_ANY,
	spectre_vm_setup, NULL);

/*
 * platform-specific vmspace initialization (nothing for x86_64)
 */
void
cpu_vmspace_alloc(struct vmspace *vm __unused)
{
}

void
cpu_vmspace_free(struct vmspace *vm __unused)
{
}

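/*
 * Verify that the kernel virtual range [saddr, eaddr) lies within the
 * kernel VA space, is fully mapped, and is accessible with the requested
 * protection.  Returns 0 on success or EFAULT.
 */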
int
kvm_access_check(vm_offset_t saddr, vm_offset_t eaddr, int prot)
{
	vm_offset_t addr;

	if (saddr < KvaStart)
		return EFAULT;
	if (eaddr >= KvaEnd)
		return EFAULT;
	for (addr = saddr; addr < eaddr; addr += PAGE_SIZE) {
		if (pmap_kextract(addr) == 0)
			return EFAULT;
	}
	if (!kernacc((caddr_t)saddr, eaddr - saddr, prot))
		return EFAULT;
	return 0;
}

#if 0

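/*
 * Debug-only trap frame consistency checks (normally compiled out).
 * _test_frame_enter/_test_frame_exit verify that a user-mode trapframe
 * matches the lwp's saved md_regs and lies on the kernel stack, and
 * poison the saved usp/upc on exit to catch paths that bypass the
 * entry check.
 */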
void _test_frame_enter(struct trapframe *frame);
void _test_frame_exit(struct trapframe *frame);

void
_test_frame_enter(struct trapframe *frame)
{
	thread_t td = curthread;

	if (ISPL(frame->tf_cs) == SEL_UPL) {
		KKASSERT(td->td_lwp);
		KASSERT(td->td_lwp->lwp_md.md_regs == frame,
			("_test_frame_enter: Frame mismatch %p %p",
			td->td_lwp->lwp_md.md_regs, frame));
		td->td_lwp->lwp_saveusp = (void *)frame->tf_rsp;
		td->td_lwp->lwp_saveupc = (void *)frame->tf_rip;
	}
	if ((char *)frame < td->td_kstack ||
	    (char *)frame > td->td_kstack + td->td_kstack_size) {
		panic("_test_frame_enter: frame not on kstack %p kstack=%p",
			frame, td->td_kstack);
	}
}

void
_test_frame_exit(struct trapframe *frame)
{
	thread_t td = curthread;

	if (ISPL(frame->tf_cs) == SEL_UPL) {
		KKASSERT(td->td_lwp);
		KASSERT(td->td_lwp->lwp_md.md_regs == frame,
			("_test_frame_exit: Frame mismatch %p %p",
			td->td_lwp->lwp_md.md_regs, frame));
		if (td->td_lwp->lwp_saveusp != (void *)frame->tf_rsp) {
			kprintf("_test_frame_exit: %s:%d usp mismatch %p/%p\n",
				td->td_comm, td->td_proc->p_pid,
				td->td_lwp->lwp_saveusp,
				(void *)frame->tf_rsp);
		}
		if (td->td_lwp->lwp_saveupc != (void *)frame->tf_rip) {
			kprintf("_test_frame_exit: %s:%d upc mismatch %p/%p\n",
				td->td_comm, td->td_proc->p_pid,
				td->td_lwp->lwp_saveupc,
				(void *)frame->tf_rip);
		}

		/*
		 * adulterate the fields to catch entries that
		 * don't run through test_frame_enter
		 */
		td->td_lwp->lwp_saveusp =
			(void *)~(intptr_t)td->td_lwp->lwp_saveusp;
		td->td_lwp->lwp_saveupc =
			(void *)~(intptr_t)td->td_lwp->lwp_saveupc;
	}
	if ((char *)frame < td->td_kstack ||
	    (char *)frame > td->td_kstack + td->td_kstack_size) {
		panic("_test_frame_exit: frame not on kstack %p kstack=%p",
			frame, td->td_kstack);
	}
}

#endif
978