xref: /illumos-gate/usr/src/uts/intel/io/vmm/amd/svm.c (revision 0dd92943)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * This file and its contents are supplied under the terms of the
31  * Common Development and Distribution License ("CDDL"), version 1.0.
32  * You may only use this file in accordance with the terms of version
33  * 1.0 of the CDDL.
34  *
35  * A full copy of the text of the CDDL should have accompanied this
36  * source.  A copy of the CDDL is also available via the Internet at
37  * http://www.illumos.org/license/CDDL.
38  *
39  * Copyright 2018 Joyent, Inc.
40  * Copyright 2022 Oxide Computer Company
41  */
42 
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/kernel.h>
49 #include <sys/kmem.h>
50 #include <sys/pcpu.h>
51 #include <sys/proc.h>
52 #include <sys/sysctl.h>
53 
54 #include <sys/x86_archext.h>
55 #include <sys/trap.h>
56 
57 #include <machine/cpufunc.h>
58 #include <machine/psl.h>
59 #include <machine/md_var.h>
60 #include <machine/reg.h>
61 #include <machine/specialreg.h>
62 #include <machine/vmm.h>
63 #include <machine/vmm_dev.h>
64 #include <sys/vmm_instruction_emul.h>
65 #include <sys/vmm_vm.h>
66 #include <sys/vmm_kernel.h>
67 
68 #include "vmm_lapic.h"
69 #include "vmm_stat.h"
70 #include "vmm_ioport.h"
71 #include "vatpic.h"
72 #include "vlapic.h"
73 #include "vlapic_priv.h"
74 
75 #include "vmcb.h"
76 #include "svm.h"
77 #include "svm_softc.h"
78 #include "svm_msr.h"
79 
80 SYSCTL_DECL(_hw_vmm);
81 SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
82     NULL);
83 
84 /*
85  * SVM CPUID function 0x8000_000A, edx bit decoding.
86  */
87 #define	AMD_CPUID_SVM_NP		BIT(0)  /* Nested paging or RVI */
88 #define	AMD_CPUID_SVM_LBR		BIT(1)  /* Last branch virtualization */
89 #define	AMD_CPUID_SVM_SVML		BIT(2)  /* SVM lock */
90 #define	AMD_CPUID_SVM_NRIP_SAVE		BIT(3)  /* Next RIP is saved */
91 #define	AMD_CPUID_SVM_TSC_RATE		BIT(4)  /* TSC rate control. */
92 #define	AMD_CPUID_SVM_VMCB_CLEAN	BIT(5)  /* VMCB state caching */
93 #define	AMD_CPUID_SVM_FLUSH_BY_ASID	BIT(6)  /* Flush by ASID */
94 #define	AMD_CPUID_SVM_DECODE_ASSIST	BIT(7)  /* Decode assist */
95 #define	AMD_CPUID_SVM_PAUSE_INC		BIT(10) /* Pause intercept filter. */
96 #define	AMD_CPUID_SVM_PAUSE_FTH		BIT(12) /* Pause filter threshold */
97 #define	AMD_CPUID_SVM_AVIC		BIT(13)	/* AVIC present */
98 
99 #define	VMCB_CACHE_DEFAULT	(VMCB_CACHE_ASID	|	\
100 				VMCB_CACHE_IOPM		|	\
101 				VMCB_CACHE_I		|	\
102 				VMCB_CACHE_TPR		|	\
103 				VMCB_CACHE_CR2		|	\
104 				VMCB_CACHE_CR		|	\
105 				VMCB_CACHE_DR		|	\
106 				VMCB_CACHE_DT		|	\
107 				VMCB_CACHE_SEG		|	\
108 				VMCB_CACHE_NP)
109 
110 static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT;
111 SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean,
112     0, NULL);
113 
114 /* SVM features advertised by CPUID.8000000AH:EDX */
115 static uint32_t svm_feature = ~0U;	/* AMD SVM features. */
116 
117 static int disable_npf_assist;
118 
119 static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
120 static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
121 static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
122 
123 static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
124 static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val);
125 static void flush_asid(struct svm_softc *sc, int vcpuid);
126 
127 static __inline bool
128 flush_by_asid(void)
129 {
130 	return ((svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID) != 0);
131 }
132 
133 static __inline bool
134 decode_assist(void)
135 {
136 	return ((svm_feature & AMD_CPUID_SVM_DECODE_ASSIST) != 0);
137 }
138 
139 static int
140 svm_cleanup(void)
141 {
142 	/* This is taken care of by the hma registration */
143 	return (0);
144 }
145 
146 static int
147 svm_init(void)
148 {
149 	vmcb_clean &= VMCB_CACHE_DEFAULT;
150 
151 	svm_msr_init();
152 
153 	return (0);
154 }
155 
156 static void
157 svm_restore(void)
158 {
159 	/* No-op on illumos */
160 }
161 
162 /* Pentium compatible MSRs */
163 #define	MSR_PENTIUM_START	0
164 #define	MSR_PENTIUM_END		0x1FFF
165 /* AMD 6th generation and Intel compatible MSRs */
166 #define	MSR_AMD6TH_START	0xC0000000UL
167 #define	MSR_AMD6TH_END		0xC0001FFFUL
168 /* AMD 7th and 8th generation compatible MSRs */
169 #define	MSR_AMD7TH_START	0xC0010000UL
170 #define	MSR_AMD7TH_END		0xC0011FFFUL
171 
172 /*
173  * Get the index and bit position for an MSR in the permission bitmap.
174  * Two bits are used for each MSR: the lower bit for read, the higher for write.
175  */
176 static int
177 svm_msr_index(uint64_t msr, int *index, int *bit)
178 {
179 	uint32_t base, off;
180 
181 	*index = -1;
182 	*bit = (msr % 4) * 2;
183 	base = 0;
184 
185 	if (msr <= MSR_PENTIUM_END) {
186 		*index = msr / 4;
187 		return (0);
188 	}
189 
190 	base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
191 	if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
192 		off = (msr - MSR_AMD6TH_START);
193 		*index = (off + base) / 4;
194 		return (0);
195 	}
196 
197 	base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
198 	if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
199 		off = (msr - MSR_AMD7TH_START);
200 		*index = (off + base) / 4;
201 		return (0);
202 	}
203 
204 	return (EINVAL);
205 }
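/*
 * Worked example (added for illustration): for MSR_LSTAR (0xc0000082),
 * svm_msr_index() yields bit = (0x82 % 4) * 2 = 4 and, with base = 0x2000
 * entries for the Pentium range, index = (0x82 + 0x2000) / 4 = 0x820.  The
 * read permission then lives at bit 4 of perm_bitmap[0x820] and the write
 * permission at bit 5, matching the two-bits-per-MSR layout described above.
 */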
206 
207 /*
208  * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
209  */
210 static void
211 svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
212 {
213 	int index, bit, error;
214 
215 	error = svm_msr_index(msr, &index, &bit);
216 	KASSERT(error == 0, ("%s: invalid msr %lx", __func__, msr));
217 	KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
218 	    ("%s: invalid index %d for msr %lx", __func__, index, msr));
219 	KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
220 	    "msr %lx", __func__, bit, msr));
221 
222 	if (read)
223 		perm_bitmap[index] &= ~(1UL << bit);
224 
225 	if (write)
226 		perm_bitmap[index] &= ~(2UL << bit);
227 }
228 
229 static void
230 svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
231 {
232 
233 	svm_msr_perm(perm_bitmap, msr, true, true);
234 }
235 
236 static void
237 svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
238 {
239 
240 	svm_msr_perm(perm_bitmap, msr, true, false);
241 }
242 
243 static __inline int
244 svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
245 {
246 	struct vmcb_ctrl *ctrl;
247 
248 	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
249 
250 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
251 	return (ctrl->intercept[idx] & bitmask ? 1 : 0);
252 }
253 
254 static __inline void
255 svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
256     int enabled)
257 {
258 	struct vmcb_ctrl *ctrl;
259 	uint32_t oldval;
260 
261 	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
262 
263 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
264 	oldval = ctrl->intercept[idx];
265 
266 	if (enabled)
267 		ctrl->intercept[idx] |= bitmask;
268 	else
269 		ctrl->intercept[idx] &= ~bitmask;
270 
271 	if (ctrl->intercept[idx] != oldval) {
272 		svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
273 	}
274 }
275 
276 static __inline void
277 svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
278 {
279 
280 	svm_set_intercept(sc, vcpu, off, bitmask, 0);
281 }
282 
283 static __inline void
284 svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
285 {
286 
287 	svm_set_intercept(sc, vcpu, off, bitmask, 1);
288 }
289 
290 static void
291 vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
292     uint64_t msrpm_base_pa, uint64_t np_pml4)
293 {
294 	struct vmcb_ctrl *ctrl;
295 	struct vmcb_state *state;
296 	uint32_t mask;
297 	int n;
298 
299 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
300 	state = svm_get_vmcb_state(sc, vcpu);
301 
302 	ctrl->iopm_base_pa = iopm_base_pa;
303 	ctrl->msrpm_base_pa = msrpm_base_pa;
304 
305 	/* Enable nested paging */
306 	ctrl->np_ctrl = NP_ENABLE;
307 	ctrl->n_cr3 = np_pml4;
308 
309 	/*
310 	 * Intercept accesses to the control registers that are not shadowed
311 	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
312 	 */
313 	for (n = 0; n < 16; n++) {
314 		mask = (BIT(n) << 16) | BIT(n);
315 		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
316 			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
317 		else
318 			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
319 	}
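	/*
	 * Illustrative note (added for clarity): each mask pairs a read
	 * intercept bit (low 16 bits) with the matching write intercept bit
	 * (high 16 bits).  For %cr8 (n == 8), for example, the mask is
	 * (BIT(8) << 16) | BIT(8) == 0x01000100, and both intercepts are left
	 * disabled because %cr8 accesses are reflected into V_TPR while the
	 * guest is running.
	 */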
320 
321 	/*
322 	 * Selectively intercept writes to %cr0.  This triggers on operations
323 	 * which would change bits other than TS or MP.
324 	 */
325 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
326 	    VMCB_INTCPT_CR0_WRITE);
327 
328 	/*
329 	 * Intercept everything when tracing guest exceptions; otherwise
330 	 * just intercept the machine check exception.
331 	 */
332 	if (vcpu_trace_exceptions(sc->vm, vcpu)) {
333 		for (n = 0; n < 32; n++) {
334 			/*
335 			 * Skip unimplemented vectors in the exception bitmap.
336 			 */
337 			if (n == 2 || n == 9) {
338 				continue;
339 			}
340 			svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
341 		}
342 	} else {
343 		svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
344 	}
345 
346 	/* Intercept various events (e.g. I/O, MSR, and CPUID accesses) */
347 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
348 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
349 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
350 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
351 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
352 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
353 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
354 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
355 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
356 	    VMCB_INTCPT_FERR_FREEZE);
357 
358 	/* Enable exit-on-hlt by default */
359 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT);
360 
361 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
362 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);
363 
364 	/* Intercept privileged invalidation instructions. */
365 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVD);
366 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVLPGA);
367 
368 	/*
369 	 * Intercept all virtualization-related instructions.
370 	 *
371 	 * From section "Canonicalization and Consistency Checks" in APMv2
372 	 * the VMRUN intercept bit must be set to pass the consistency check.
373 	 */
374 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
375 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMMCALL);
376 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMLOAD);
377 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMSAVE);
378 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_STGI);
379 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_CLGI);
380 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_SKINIT);
381 	if (vcpu_trap_wbinvd(sc->vm, vcpu) != 0) {
382 		svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT,
383 		    VMCB_INTCPT_WBINVD);
384 	}
385 
386 	/*
387 	 * The ASID will be set to a non-zero value just before VMRUN.
388 	 */
389 	ctrl->asid = 0;
390 
391 	/*
392 	 * Section 15.21.1, Interrupt Masking in EFLAGS
393 	 * Section 15.21.2, Virtualizing APIC.TPR
394 	 *
395 	 * This must be set for %rflags and %cr8 isolation of guest and host.
396 	 */
397 	ctrl->v_intr_ctrl |= V_INTR_MASKING;
398 
399 	/* Enable Last Branch Record aka LBR for debugging */
400 	ctrl->misc_ctrl |= LBR_VIRT_ENABLE;
401 	state->dbgctl = BIT(0);
402 
403 	/* EFER_SVM must always be set when the guest is executing */
404 	state->efer = EFER_SVM;
405 
406 	/* Set up the PAT to power-on state */
407 	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
408 	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
409 	    PAT_VALUE(2, PAT_UNCACHED)		|
410 	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
411 	    PAT_VALUE(4, PAT_WRITE_BACK)	|
412 	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
413 	    PAT_VALUE(6, PAT_UNCACHED)		|
414 	    PAT_VALUE(7, PAT_UNCACHEABLE);
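	/*
	 * For reference: with PAT_VALUE(i, v) placing each attribute in byte
	 * 'i', the assignment above should yield the architectural power-on
	 * value 0x0007040600070406 (WB, WT, UC-, UC in entries 0-3, repeated
	 * for entries 4-7).
	 */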
415 
416 	/* Set up DR6/7 to power-on state */
417 	state->dr6 = DBREG_DR6_RESERVED1;
418 	state->dr7 = DBREG_DR7_RESERVED1;
419 }
420 
421 /*
422  * Initialize a virtual machine.
423  */
424 static void *
425 svm_vminit(struct vm *vm)
426 {
427 	struct svm_softc *svm_sc;
428 	struct svm_vcpu *vcpu;
429 	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
430 	int i;
431 	uint16_t maxcpus;
432 
433 	svm_sc = kmem_zalloc(sizeof (*svm_sc), KM_SLEEP);
434 	VERIFY3U(((uintptr_t)svm_sc & PAGE_MASK), ==, 0);
435 
436 	svm_sc->msr_bitmap = vmm_contig_alloc(SVM_MSR_BITMAP_SIZE);
437 	if (svm_sc->msr_bitmap == NULL)
438 		panic("contigmalloc of SVM MSR bitmap failed");
439 	svm_sc->iopm_bitmap = vmm_contig_alloc(SVM_IO_BITMAP_SIZE);
440 	if (svm_sc->iopm_bitmap == NULL)
441 		panic("contigmalloc of SVM IO bitmap failed");
442 
443 	svm_sc->vm = vm;
444 	svm_sc->nptp = vmspace_table_root(vm_get_vmspace(vm));
445 
446 	/*
447 	 * Intercept read and write accesses to all MSRs.
448 	 */
449 	memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);
450 
451 	/*
452 	 * Access to the following MSRs is redirected to the VMCB when the
453 	 * guest is executing. Therefore it is safe to allow the guest to
454 	 * read/write these MSRs directly without hypervisor involvement.
455 	 */
456 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
457 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
458 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
459 
460 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
461 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
462 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
463 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
464 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
465 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
466 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
467 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
468 
469 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
470 
471 	/*
472 	 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
473 	 */
474 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
475 
476 	/* Intercept access to all I/O ports. */
477 	memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);
478 
479 	iopm_pa = vtophys(svm_sc->iopm_bitmap);
480 	msrpm_pa = vtophys(svm_sc->msr_bitmap);
481 	pml4_pa = svm_sc->nptp;
482 	maxcpus = vm_get_maxcpus(svm_sc->vm);
483 	for (i = 0; i < maxcpus; i++) {
484 		vcpu = svm_get_vcpu(svm_sc, i);
485 		vcpu->nextrip = ~0;
486 		vcpu->lastcpu = NOCPU;
487 		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
488 		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
489 		svm_msr_guest_init(svm_sc, i);
490 	}
491 	return (svm_sc);
492 }
493 
494 /*
495  * Collateral for a generic SVM VM-exit.
496  */
497 static void
498 vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
499 {
500 
501 	vme->exitcode = VM_EXITCODE_SVM;
502 	vme->u.svm.exitcode = code;
503 	vme->u.svm.exitinfo1 = info1;
504 	vme->u.svm.exitinfo2 = info2;
505 }
506 
507 static int
508 svm_cpl(struct vmcb_state *state)
509 {
510 
511 	/*
512 	 * From APMv2:
513 	 *   "Retrieve the CPL from the CPL field in the VMCB, not
514 	 *    from any segment DPL"
515 	 */
516 	return (state->cpl);
517 }
518 
519 static enum vm_cpu_mode
520 svm_vcpu_mode(struct vmcb *vmcb)
521 {
522 	struct vmcb_state *state;
523 
524 	state = &vmcb->state;
525 
526 	if (state->efer & EFER_LMA) {
527 		struct vmcb_segment *seg;
528 
529 		/*
530 		 * Per section 4.8.1 of APMv2, check whether the code segment
531 		 * has the Long (L) attribute set in its descriptor.
532 		 */
533 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
534 		if (seg->attrib & VMCB_CS_ATTRIB_L)
535 			return (CPU_MODE_64BIT);
536 		else
537 			return (CPU_MODE_COMPATIBILITY);
538 	} else if (state->cr0 & CR0_PE) {
539 		return (CPU_MODE_PROTECTED);
540 	} else {
541 		return (CPU_MODE_REAL);
542 	}
543 }
544 
545 static enum vm_paging_mode
546 svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
547 {
548 
549 	if ((cr0 & CR0_PG) == 0)
550 		return (PAGING_MODE_FLAT);
551 	if ((cr4 & CR4_PAE) == 0)
552 		return (PAGING_MODE_32);
553 	if (efer & EFER_LME)
554 		return (PAGING_MODE_64);
555 	else
556 		return (PAGING_MODE_PAE);
557 }
558 
559 /*
560  * ins/outs utility routines
561  */
562 
563 static void
564 svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
565 {
566 	struct vmcb_state *state;
567 
568 	state = &vmcb->state;
569 	paging->cr3 = state->cr3;
570 	paging->cpl = svm_cpl(state);
571 	paging->cpu_mode = svm_vcpu_mode(vmcb);
572 	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
573 	    state->efer);
574 }
575 
576 #define	UNHANDLED 0
577 
578 /*
579  * Handle guest I/O intercept.
580  */
581 static int
582 svm_handle_inout(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
583 {
584 	struct vmcb_ctrl *ctrl;
585 	struct vmcb_state *state;
586 	struct vm_inout *inout;
587 	struct vie *vie;
588 	uint64_t info1;
589 	struct vm_guest_paging paging;
590 
591 	state = svm_get_vmcb_state(svm_sc, vcpu);
592 	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
593 	inout = &vmexit->u.inout;
594 	info1 = ctrl->exitinfo1;
595 
596 	inout->bytes = (info1 >> 4) & 0x7;
597 	inout->flags = 0;
598 	inout->flags |= (info1 & BIT(0)) ? INOUT_IN : 0;
599 	inout->flags |= (info1 & BIT(3)) ? INOUT_REP : 0;
600 	inout->flags |= (info1 & BIT(2)) ? INOUT_STR : 0;
601 	inout->port = (uint16_t)(info1 >> 16);
602 	inout->eax = (uint32_t)(state->rax);
603 
604 	if ((inout->flags & INOUT_STR) != 0) {
605 		/*
606 		 * The effective segment number in EXITINFO1[12:10] is populated
607 		 * only if the processor has the DecodeAssist capability.
608 		 *
609 		 * This is not specified explicitly in APMv2 but can be verified
610 		 * empirically.
611 		 */
612 		if (!decode_assist()) {
613 			/*
614 			 * Without decoding assistance, force the task of
615 			 * emulating the ins/outs on userspace.
616 			 */
617 			vmexit->exitcode = VM_EXITCODE_INST_EMUL;
618 			bzero(&vmexit->u.inst_emul,
619 			    sizeof (vmexit->u.inst_emul));
620 			return (UNHANDLED);
621 		}
622 
623 		/*
624 		 * Bits 7-9 encode the address size of ins/outs operations where
625 		 * the 1/2/4 values correspond to 16/32/64 bit sizes.
626 		 */
627 		inout->addrsize = 2 * ((info1 >> 7) & 0x7);
628 		VERIFY(inout->addrsize == 2 || inout->addrsize == 4 ||
629 		    inout->addrsize == 8);
630 
631 		if (inout->flags & INOUT_IN) {
632 			/*
633 			 * For INS instructions, %es (encoded as 0) is the
634 			 * implied segment for the operation.
635 			 */
636 			inout->segment = 0;
637 		} else {
638 			/*
639 			 * Bits 10-12 encode the segment for OUTS.
640 			 * This value follows the standard x86 segment order.
641 			 */
642 			inout->segment = (info1 >> 10) & 0x7;
643 		}
644 	}
645 
646 	vmexit->exitcode = VM_EXITCODE_INOUT;
647 	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
648 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
649 	vie_init_inout(vie, inout, vmexit->inst_length, &paging);
650 
651 	/* The in/out emulation will handle advancing %rip */
652 	vmexit->inst_length = 0;
653 
654 	return (UNHANDLED);
655 }
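/*
 * Worked example (hypothetical EXITINFO1 value, added for illustration): a
 * 32-bit guest running 'rep outsb' with %dx == 0x3f8 would report
 * info1 == 0x03f80d1c: port 0x3f8 in bits 31:16, segment %ds (3) in bits
 * 12:10, 32-bit addressing (2) in bits 9:7, REP (bit 3), STR (bit 2), a
 * 1-byte operand size in bits 6:4, and direction OUT (bit 0 clear).  The
 * decode above then produces bytes = 1, flags = INOUT_REP | INOUT_STR,
 * addrsize = 4, segment = 3, and port = 0x3f8.
 */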
656 
657 static int
658 npf_fault_type(uint64_t exitinfo1)
659 {
660 
661 	if (exitinfo1 & VMCB_NPF_INFO1_W)
662 		return (PROT_WRITE);
663 	else if (exitinfo1 & VMCB_NPF_INFO1_ID)
664 		return (PROT_EXEC);
665 	else
666 		return (PROT_READ);
667 }
668 
669 static bool
670 svm_npf_emul_fault(uint64_t exitinfo1)
671 {
672 	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
673 		return (false);
674 	}
675 
676 	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
677 		return (false);
678 	}
679 
680 	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
681 		return (false);
682 	}
683 
684 	return (true);
685 }
686 
687 static void
688 svm_handle_mmio_emul(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
689     uint64_t gpa)
690 {
691 	struct vmcb_ctrl *ctrl;
692 	struct vmcb *vmcb;
693 	struct vie *vie;
694 	struct vm_guest_paging paging;
695 	struct vmcb_segment *seg;
696 	char *inst_bytes = NULL;
697 	uint8_t inst_len = 0;
698 
699 	vmcb = svm_get_vmcb(svm_sc, vcpu);
700 	ctrl = &vmcb->ctrl;
701 
702 	vmexit->exitcode = VM_EXITCODE_MMIO_EMUL;
703 	vmexit->u.mmio_emul.gpa = gpa;
704 	vmexit->u.mmio_emul.gla = VIE_INVALID_GLA;
705 	svm_paging_info(vmcb, &paging);
706 
707 	switch (paging.cpu_mode) {
708 	case CPU_MODE_REAL:
709 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
710 		vmexit->u.mmio_emul.cs_base = seg->base;
711 		vmexit->u.mmio_emul.cs_d = 0;
712 		break;
713 	case CPU_MODE_PROTECTED:
714 	case CPU_MODE_COMPATIBILITY:
715 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
716 		vmexit->u.mmio_emul.cs_base = seg->base;
717 
718 		/*
719 		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
720 		 */
721 		vmexit->u.mmio_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ?
722 		    1 : 0;
723 		break;
724 	default:
725 		vmexit->u.mmio_emul.cs_base = 0;
726 		vmexit->u.mmio_emul.cs_d = 0;
727 		break;
728 	}
729 
730 	/*
731 	 * Copy the instruction bytes into 'vie' if available.
732 	 */
733 	if (decode_assist() && !disable_npf_assist) {
734 		inst_len = ctrl->inst_len;
735 		inst_bytes = (char *)ctrl->inst_bytes;
736 	}
737 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
738 	vie_init_mmio(vie, inst_bytes, inst_len, &paging, gpa);
739 }
740 
741 /*
742  * Do not allow CD, NW, or invalid high bits to be asserted in the value of cr0
743  * which is live in the guest.  They are visible via the shadow instead.
744  */
745 #define	SVM_CR0_MASK	~(CR0_CD | CR0_NW | 0xffffffff00000000)
746 
747 static void
748 svm_set_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t val, bool guest_write)
749 {
750 	struct vmcb_state *state;
751 	struct svm_regctx *regctx;
752 	uint64_t masked, old, diff;
753 
754 	state = svm_get_vmcb_state(svm_sc, vcpu);
755 	regctx = svm_get_guest_regctx(svm_sc, vcpu);
756 
757 	old = state->cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
758 	diff = old ^ val;
759 
760 	/* No further work needed if register contents remain the same */
761 	if (diff == 0) {
762 		return;
763 	}
764 
765 	/* Flush the TLB if the paging or write-protect bits are changing */
766 	if ((diff & CR0_PG) != 0 || (diff & CR0_WP) != 0) {
767 		flush_asid(svm_sc, vcpu);
768 	}
769 
770 	/*
771 	 * If the change in %cr0 is due to a guest action (via interception)
772 	 * then other CPU state updates may be required.
773 	 */
774 	if (guest_write) {
775 		if ((diff & CR0_PG) != 0) {
776 			uint64_t efer = state->efer;
777 
778 			/* Keep the long-mode state in EFER in sync */
779 			if ((val & CR0_PG) != 0 && (efer & EFER_LME) != 0) {
780 				state->efer |= EFER_LMA;
781 			}
782 			if ((val & CR0_PG) == 0 && (efer & EFER_LME) != 0) {
783 				state->efer &= ~EFER_LMA;
784 			}
785 		}
786 	}
787 
788 	masked = val & SVM_CR0_MASK;
789 	regctx->sctx_cr0_shadow = val;
790 	state->cr0 = masked;
791 	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_CR);
792 
793 	if ((masked ^ val) != 0) {
794 		/*
795 		 * The guest has set bits in %cr0 which we are masking out and
796 		 * exposing via shadow.
797 		 *
798 		 * We must intercept %cr0 reads in order to make the shadowed
799 		 * view available to the guest.
800 		 *
801 		 * Writes to %cr0 must also be intercepted (unconditionally,
802 		 * unlike the VMCB_INTCPT_CR0_WRITE mechanism) so we can catch
803 		 * if/when the guest clears those shadowed bits.
804 		 */
805 		svm_enable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
806 		    BIT(0) | BIT(16));
807 	} else {
808 		/*
809 		 * When no bits remain in %cr0 which require shadowing, the
810 		 * unconditional intercept of reads/writes to %cr0 can be
811 		 * disabled.
812 		 *
813 		 * The selective write intercept (VMCB_INTCPT_CR0_WRITE) remains
814 		 * in place so we can be notified of operations which change
815 		 * bits other than TS or MP.
816 		 */
817 		svm_disable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
818 		    BIT(0) | BIT(16));
819 	}
820 	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_I);
821 }
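/*
 * Illustrative example (not part of the original logic): if the guest writes
 * CR0_PE | CR0_PG | CR0_CD to %cr0, then 'masked' drops CR0_CD, the VMCB sees
 * only CR0_PE | CR0_PG, the full value is retained in sctx_cr0_shadow, and the
 * unconditional %cr0 read/write intercepts (BIT(0) | BIT(16)) are enabled so
 * that subsequent reads observe the shadowed CD bit.
 */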
822 
823 static void
824 svm_get_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t *val)
825 {
826 	struct vmcb *vmcb;
827 	struct svm_regctx *regctx;
828 
829 	vmcb = svm_get_vmcb(svm_sc, vcpu);
830 	regctx = svm_get_guest_regctx(svm_sc, vcpu);
831 
832 	/*
833 	 * Include the %cr0 bits which exist only in the shadow along with those
834 	 * in the running vCPU state.
835 	 */
836 	*val = vmcb->state.cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
837 }
838 
839 static void
840 svm_handle_cr0_read(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
841 {
842 	uint64_t val;
843 	int err __maybe_unused;
844 
845 	svm_get_cr0(svm_sc, vcpu, &val);
846 	err = svm_setreg(svm_sc, vcpu, reg, val);
847 	ASSERT(err == 0);
848 }
849 
850 static void
851 svm_handle_cr0_write(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
852 {
853 	struct vmcb_state *state;
854 	uint64_t val;
855 	int err __maybe_unused;
856 
857 	state = svm_get_vmcb_state(svm_sc, vcpu);
858 
859 	err = svm_getreg(svm_sc, vcpu, reg, &val);
860 	ASSERT(err == 0);
861 
862 	if ((val & CR0_NW) != 0 && (val & CR0_CD) == 0) {
863 		/* NW without CD is nonsensical */
864 		vm_inject_gp(svm_sc->vm, vcpu);
865 		return;
866 	}
867 	if ((val & CR0_PG) != 0 && (val & CR0_PE) == 0) {
868 		/* PG requires PE */
869 		vm_inject_gp(svm_sc->vm, vcpu);
870 		return;
871 	}
872 	if ((state->cr0 & CR0_PG) == 0 && (val & CR0_PG) != 0) {
873 		/* When enabling paging, PAE must be enabled if LME is. */
874 		if ((state->efer & EFER_LME) != 0 &&
875 		    (state->cr4 & CR4_PAE) == 0) {
876 			vm_inject_gp(svm_sc->vm, vcpu);
877 			return;
878 		}
879 	}
880 
881 	svm_set_cr0(svm_sc, vcpu, val, true);
882 }
883 
884 static void
885 svm_inst_emul_other(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
886 {
887 	struct vie *vie;
888 	struct vm_guest_paging paging;
889 
890 	/* Let the instruction emulation (hopefully in-kernel) handle it */
891 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
892 	bzero(&vmexit->u.inst_emul, sizeof (vmexit->u.inst_emul));
893 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
894 	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
895 	vie_init_other(vie, &paging);
896 
897 	/* The instruction emulation will handle advancing %rip */
898 	vmexit->inst_length = 0;
899 }
900 
901 static void
902 svm_update_virqinfo(struct svm_softc *sc, int vcpu)
903 {
904 	struct vm *vm;
905 	struct vlapic *vlapic;
906 	struct vmcb_ctrl *ctrl;
907 
908 	vm = sc->vm;
909 	vlapic = vm_lapic(vm, vcpu);
910 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
911 
912 	/* Update %cr8 in the emulated vlapic */
913 	vlapic_set_cr8(vlapic, ctrl->v_tpr);
914 
915 	/* Virtual interrupt injection is not used. */
916 	KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid "
917 	    "v_intr_vector %d", __func__, ctrl->v_intr_vector));
918 }
919 
920 CTASSERT(VMCB_EVENTINJ_TYPE_INTR	== VM_INTINFO_HWINTR);
921 CTASSERT(VMCB_EVENTINJ_TYPE_NMI		== VM_INTINFO_NMI);
922 CTASSERT(VMCB_EVENTINJ_TYPE_EXCEPTION	== VM_INTINFO_HWEXCP);
923 CTASSERT(VMCB_EVENTINJ_TYPE_INTn	== VM_INTINFO_SWINTR);
924 CTASSERT(VMCB_EVENTINJ_EC_VALID		== VM_INTINFO_DEL_ERRCODE);
925 CTASSERT(VMCB_EVENTINJ_VALID		== VM_INTINFO_VALID);
926 
927 /*
928  * Store SVM-specific event injection info for later handling.  This depends on
929  * the bhyve-internal event definitions matching those in the VMCB, as ensured
930  * by the above CTASSERTs.
931  */
932 static void
933 svm_stash_intinfo(struct svm_softc *svm_sc, int vcpu, uint64_t intinfo)
934 {
935 	ASSERT(VMCB_EXITINTINFO_VALID(intinfo));
936 
937 	/*
938 	 * If stashing a pending NMI injection, ensure that it bears the
939 	 * correct vector which exit_intinfo expects.
940 	 */
941 	if (VM_INTINFO_TYPE(intinfo) == VM_INTINFO_NMI) {
942 		intinfo &= ~VM_INTINFO_MASK_VECTOR;
943 		intinfo |= IDT_NMI;
944 	}
945 
946 	VERIFY0(vm_exit_intinfo(svm_sc->vm, vcpu, intinfo));
947 }
948 
949 static void
950 svm_save_exitintinfo(struct svm_softc *svm_sc, int vcpu)
951 {
952 	struct vmcb_ctrl *ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
953 	uint64_t intinfo = ctrl->exitintinfo;
954 
955 	if (VMCB_EXITINTINFO_VALID(intinfo)) {
956 		/*
957 		 * If a #VMEXIT happened during event delivery then record the
958 		 * event that was being delivered.
959 		 */
960 		vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
961 
962 		svm_stash_intinfo(svm_sc, vcpu, intinfo);
963 	}
964 }
965 
966 static __inline int
967 vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
968 {
969 
970 	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
971 	    VMCB_INTCPT_VINTR));
972 }
973 
974 static void
975 svm_enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
976 {
977 	struct vmcb_ctrl *ctrl;
978 	struct vmcb_state *state;
979 
980 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
981 	state = svm_get_vmcb_state(sc, vcpu);
982 
983 	if ((ctrl->v_irq & V_IRQ) != 0 && ctrl->v_intr_vector == 0) {
984 		KASSERT(ctrl->v_intr_prio & V_IGN_TPR,
985 		    ("%s: invalid v_ign_tpr", __func__));
986 		KASSERT(vintr_intercept_enabled(sc, vcpu),
987 		    ("%s: vintr intercept should be enabled", __func__));
988 		return;
989 	}
990 
991 	/*
992 	 * We use V_IRQ in conjunction with the VINTR intercept to trap into the
993 	 * hypervisor as soon as a virtual interrupt can be delivered.
994 	 *
995 	 * Since injected events are not subject to intercept checks we need to
996 	 * ensure that the V_IRQ is not actually going to be delivered on VM
997 	 * entry.
998 	 */
999 	VERIFY((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
1000 	    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow);
1001 
1002 	ctrl->v_irq |= V_IRQ;
1003 	ctrl->v_intr_prio |= V_IGN_TPR;
1004 	ctrl->v_intr_vector = 0;
1005 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1006 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
1007 }
1008 
1009 static void
1010 svm_disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
1011 {
1012 	struct vmcb_ctrl *ctrl;
1013 
1014 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1015 
1016 	if ((ctrl->v_irq & V_IRQ) == 0 && ctrl->v_intr_vector == 0) {
1017 		KASSERT(!vintr_intercept_enabled(sc, vcpu),
1018 		    ("%s: vintr intercept should be disabled", __func__));
1019 		return;
1020 	}
1021 
1022 	ctrl->v_irq &= ~V_IRQ;
1023 	ctrl->v_intr_vector = 0;
1024 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1025 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
1026 }
1027 
1028 /*
1029  * Once an NMI is injected it blocks delivery of further NMIs until the handler
1030  * executes an IRET. The IRET intercept is enabled when an NMI is injected to
1031  * to track when the vcpu is done handling the NMI.
1032  */
1033 static int
1034 svm_nmi_blocked(struct svm_softc *sc, int vcpu)
1035 {
1036 	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1037 	    VMCB_INTCPT_IRET));
1038 }
1039 
1040 static void
1041 svm_clear_nmi_blocking(struct svm_softc *sc, int vcpu)
1042 {
1043 	struct vmcb_ctrl *ctrl;
1044 
1045 	KASSERT(svm_nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
1046 	/*
1047 	 * When the IRET intercept is cleared the vcpu will attempt to execute
1048 	 * the "iret" when it runs next. However, it is possible to inject
1049 	 * another NMI into the vcpu before the "iret" has actually executed.
1050 	 *
1051 	 * For example, if the "iret" encounters a #NPF when accessing the stack
1052 	 * it will trap back into the hypervisor. If an NMI is pending for
1053 	 * the vcpu it will be injected into the guest.
1054 	 *
1055 	 * XXX this needs to be fixed
1056 	 */
1057 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1058 
1059 	/*
1060 	 * Set an interrupt shadow to prevent an NMI from being immediately
1061 	 * injected on the next VMRUN.
1062 	 */
1063 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1064 	ctrl->intr_shadow = 1;
1065 }
1066 
1067 static void
1068 svm_inject_event(struct vmcb_ctrl *ctrl, uint64_t info)
1069 {
1070 	ASSERT(VM_INTINFO_PENDING(info));
1071 
1072 	uint8_t vector = VM_INTINFO_VECTOR(info);
1073 	uint32_t type = VM_INTINFO_TYPE(info);
1074 
1075 	/*
1076 	 * Correct behavior depends on bhyve intinfo event types lining up with
1077 	 * those defined by AMD for event injection in the VMCB.  The CTASSERTs
1078 	 * above svm_save_exitintinfo() ensure it.
1079 	 */
1080 	switch (type) {
1081 	case VM_INTINFO_NMI:
1082 		/* Ensure vector for injected event matches its type (NMI) */
1083 		vector = IDT_NMI;
1084 		break;
1085 	case VM_INTINFO_HWINTR:
1086 	case VM_INTINFO_SWINTR:
1087 		break;
1088 	case VM_INTINFO_HWEXCP:
1089 		if (vector == IDT_NMI) {
1090 			/*
1091 			 * NMIs are expected to be injected with
1092 			 * VMCB_EVENTINJ_TYPE_NMI, rather than as an exception
1093 			 * with the NMI vector.
1094 			 */
1095 			type = VM_INTINFO_NMI;
1096 		}
1097 		VERIFY(vector < 32);
1098 		break;
1099 	default:
1100 		/*
1101 		 * Since there is no strong validation for injected event types
1102 		 * at this point, fall back to a software interrupt for those we
1103 		 * do not recognize.
1104 		 */
1105 		type = VM_INTINFO_SWINTR;
1106 		break;
1107 	}
1108 
1109 	ctrl->eventinj = VMCB_EVENTINJ_VALID | type | vector;
1110 	if (VM_INTINFO_HAS_ERRCODE(info)) {
1111 		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
1112 		ctrl->eventinj |= (uint64_t)VM_INTINFO_ERRCODE(info) << 32;
1113 	}
1114 }
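/*
 * Encoding sketch (added for illustration, assuming the usual VMCB EVENTINJ
 * layout of vector in bits 7:0, type in bits 10:8, EV in bit 11, V in bit 31):
 * injecting #GP (vector 13) with an error code would yield
 * VMCB_EVENTINJ_VALID | VM_INTINFO_HWEXCP | 13 | VMCB_EVENTINJ_EC_VALID in the
 * low word (0x80000b0d), with the error code placed in bits 63:32.
 */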
1115 
1116 static void
1117 svm_inject_nmi(struct svm_softc *sc, int vcpu)
1118 {
1119 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1120 
1121 	ASSERT(!svm_nmi_blocked(sc, vcpu));
1122 
1123 	ctrl->eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_NMI;
1124 	vm_nmi_clear(sc->vm, vcpu);
1125 
1126 	/*
1127 	 * Virtual NMI blocking is now in effect.
1128 	 *
1129 	 * Not only does this block a subsequent NMI injection from taking
1130 	 * place, it also configures an intercept on the IRET so we can track
1131 	 * when the next injection can take place.
1132 	 */
1133 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1134 }
1135 
1136 static void
1137 svm_inject_irq(struct svm_softc *sc, int vcpu, int vector)
1138 {
1139 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1140 
1141 	ASSERT(vector >= 0 && vector <= 255);
1142 
1143 	ctrl->eventinj = VMCB_EVENTINJ_VALID | vector;
1144 }
1145 
1146 #define	EFER_MBZ_BITS	0xFFFFFFFFFFFF0200UL
1147 
1148 static vm_msr_result_t
1149 svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval)
1150 {
1151 	struct vmcb_state *state = svm_get_vmcb_state(sc, vcpu);
1152 	uint64_t lma;
1153 	int error;
1154 
1155 	newval &= ~0xFE;		/* clear the Read-As-Zero (RAZ) bits */
1156 
1157 	if (newval & EFER_MBZ_BITS) {
1158 		return (VMR_GP);
1159 	}
1160 
1161 	/* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
1162 	const uint64_t changed = state->efer ^ newval;
1163 	if (changed & EFER_LME) {
1164 		if (state->cr0 & CR0_PG) {
1165 			return (VMR_GP);
1166 		}
1167 	}
1168 
1169 	/* EFER.LMA = EFER.LME & CR0.PG */
1170 	if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) {
1171 		lma = EFER_LMA;
1172 	} else {
1173 		lma = 0;
1174 	}
1175 	if ((newval & EFER_LMA) != lma) {
1176 		return (VMR_GP);
1177 	}
1178 
1179 	if ((newval & EFER_NXE) != 0 &&
1180 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) {
1181 		return (VMR_GP);
1182 	}
1183 	if ((newval & EFER_FFXSR) != 0 &&
1184 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) {
1185 		return (VMR_GP);
1186 	}
1187 	if ((newval & EFER_TCE) != 0 &&
1188 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) {
1189 		return (VMR_GP);
1190 	}
1191 
1192 	/*
1193 	 * Until bhyve has proper support for long-mode segment limits, just
1194 	 * toss a #GP at the guest if they attempt to use it.
1195 	 */
1196 	if (newval & EFER_LMSLE) {
1197 		return (VMR_GP);
1198 	}
1199 
1200 	error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
1201 	VERIFY0(error);
1202 	return (VMR_OK);
1203 }
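/*
 * Consistency example (added for illustration): a guest that already has
 * CR0.PG set and attempts to toggle EFER.LME via WRMSR hits the
 * 'changed & EFER_LME' check above and receives a #GP; likewise a write whose
 * EFER.LMA bit disagrees with EFER.LME & CR0.PG is rejected, per the APMv2
 * long-mode consistency checks.
 */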
1204 
1205 static int
1206 svm_handle_msr(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
1207     bool is_wrmsr)
1208 {
1209 	struct vmcb_state *state = svm_get_vmcb_state(svm_sc, vcpu);
1210 	struct svm_regctx *ctx = svm_get_guest_regctx(svm_sc, vcpu);
1211 	const uint32_t ecx = ctx->sctx_rcx;
1212 	vm_msr_result_t res;
1213 	uint64_t val = 0;
1214 
1215 	if (is_wrmsr) {
1216 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
1217 		val = ctx->sctx_rdx << 32 | (uint32_t)state->rax;
1218 
1219 		if (vlapic_owned_msr(ecx)) {
1220 			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);
1221 
1222 			res = vlapic_wrmsr(vlapic, ecx, val);
1223 		} else if (ecx == MSR_EFER) {
1224 			res = svm_write_efer(svm_sc, vcpu, val);
1225 		} else {
1226 			res = svm_wrmsr(svm_sc, vcpu, ecx, val);
1227 		}
1228 	} else {
1229 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
1230 
1231 		if (vlapic_owned_msr(ecx)) {
1232 			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);
1233 
1234 			res = vlapic_rdmsr(vlapic, ecx, &val);
1235 		} else {
1236 			res = svm_rdmsr(svm_sc, vcpu, ecx, &val);
1237 		}
1238 	}
1239 
1240 	switch (res) {
1241 	case VMR_OK:
1242 		/* Store rdmsr result in the appropriate registers */
1243 		if (!is_wrmsr) {
1244 			state->rax = (uint32_t)val;
1245 			ctx->sctx_rdx = val >> 32;
1246 		}
1247 		return (1);
1248 	case VMR_GP:
1249 		vm_inject_gp(svm_sc->vm, vcpu);
1250 		return (1);
1251 	case VMR_UNHANLDED:
1252 		vmexit->exitcode = is_wrmsr ?
1253 		    VM_EXITCODE_WRMSR : VM_EXITCODE_RDMSR;
1254 		vmexit->u.msr.code = ecx;
1255 		vmexit->u.msr.wval = val;
1256 		return (0);
1257 	default:
1258 		panic("unexpected msr result %u\n", res);
1259 	}
1260 }
1261 
1262 /*
1263  * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
1264  * that are due to instruction intercepts as well as MSR and IOIO intercepts
1265  * and exceptions caused by INT3, INTO and BOUND instructions.
1266  *
1267  * Return 1 if the nRIP is valid and 0 otherwise.
1268  */
1269 static int
1270 nrip_valid(uint64_t exitcode)
1271 {
1272 	switch (exitcode) {
1273 	case 0x00 ... 0x0F:	/* read of CR0 through CR15 */
1274 	case 0x10 ... 0x1F:	/* write of CR0 through CR15 */
1275 	case 0x20 ... 0x2F:	/* read of DR0 through DR15 */
1276 	case 0x30 ... 0x3F:	/* write of DR0 through DR15 */
1277 	case 0x43:		/* INT3 */
1278 	case 0x44:		/* INTO */
1279 	case 0x45:		/* BOUND */
1280 	case 0x65 ... 0x7C:	/* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
1281 	case 0x80 ... 0x8D:	/* VMEXIT_VMRUN ... VMEXIT_XSETBV */
1282 		return (1);
1283 	default:
1284 		return (0);
1285 	}
1286 }
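/*
 * For example (using the exit codes from the VMCB definitions): VMCB_EXIT_IO
 * (0x7b) and VMCB_EXIT_MSR (0x7c) fall within the 0x65 ... 0x7C range above,
 * so the nRIP-derived inst_length computed in svm_vmexit() is trusted for I/O
 * and MSR intercepts.
 */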
1287 
1288 static int
1289 svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
1290 {
1291 	struct vmcb *vmcb;
1292 	struct vmcb_state *state;
1293 	struct vmcb_ctrl *ctrl;
1294 	struct svm_regctx *ctx;
1295 	uint64_t code, info1, info2;
1296 	int handled;
1297 
1298 	ctx = svm_get_guest_regctx(svm_sc, vcpu);
1299 	vmcb = svm_get_vmcb(svm_sc, vcpu);
1300 	state = &vmcb->state;
1301 	ctrl = &vmcb->ctrl;
1302 
1303 	handled = 0;
1304 	code = ctrl->exitcode;
1305 	info1 = ctrl->exitinfo1;
1306 	info2 = ctrl->exitinfo2;
1307 
1308 	vmexit->exitcode = VM_EXITCODE_BOGUS;
1309 	vmexit->rip = state->rip;
1310 	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
1311 
1312 	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
1313 
1314 	/*
1315 	 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
1316 	 * in an inconsistent state and can trigger assertions that would
1317 	 * never happen otherwise.
1318 	 */
1319 	if (code == VMCB_EXIT_INVALID) {
1320 		vm_exit_svm(vmexit, code, info1, info2);
1321 		return (0);
1322 	}
1323 
1324 	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
1325 	    "injection valid bit is set %lx", __func__, ctrl->eventinj));
1326 
1327 	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
1328 	    ("invalid inst_length %d: code (%lx), info1 (%lx), info2 (%lx)",
1329 	    vmexit->inst_length, code, info1, info2));
1330 
1331 	svm_update_virqinfo(svm_sc, vcpu);
1332 	svm_save_exitintinfo(svm_sc, vcpu);
1333 
1334 	switch (code) {
1335 	case VMCB_EXIT_CR0_READ:
1336 		if (VMCB_CRx_INFO1_VALID(info1) != 0) {
1337 			svm_handle_cr0_read(svm_sc, vcpu,
1338 			    vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
1339 			handled = 1;
1340 		} else {
1341 			/*
1342 			 * If SMSW is used to read the contents of %cr0, then
1343 			 * the VALID bit will not be set in `info1`, since the
1344 			 * handling is different from the mov-to-reg case.
1345 			 *
1346 			 * Punt to the instruction emulation to handle it.
1347 			 */
1348 			svm_inst_emul_other(svm_sc, vcpu, vmexit);
1349 		}
1350 		break;
1351 	case VMCB_EXIT_CR0_WRITE:
1352 	case VMCB_EXIT_CR0_SEL_WRITE:
1353 		if (VMCB_CRx_INFO1_VALID(info1) != 0) {
1354 			svm_handle_cr0_write(svm_sc, vcpu,
1355 			    vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
1356 			handled = 1;
1357 		} else {
1358 			/*
1359 			 * Writes to %cr0 without VALID being set in `info1` are
1360 			 * initiated by the LMSW and CLTS instructions.  While
1361 			 * LMSW (like SMSW) sees little use in modern OSes and
1362 			 * bootloaders, CLTS is still used for handling FPU
1363 			 * state transitions.
1364 			 *
1365 			 * Punt to the instruction emulation to handle them.
1366 			 */
1367 			svm_inst_emul_other(svm_sc, vcpu, vmexit);
1368 		}
1369 		break;
1370 	case VMCB_EXIT_IRET:
1371 		/*
1372 		 * Restart execution at "iret" but with the intercept cleared.
1373 		 */
1374 		vmexit->inst_length = 0;
1375 		svm_clear_nmi_blocking(svm_sc, vcpu);
1376 		handled = 1;
1377 		break;
1378 	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
1379 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
1380 		svm_disable_intr_window_exiting(svm_sc, vcpu);
1381 		handled = 1;
1382 		break;
1383 	case VMCB_EXIT_INTR:	/* external interrupt */
1384 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
1385 		handled = 1;
1386 		break;
1387 	case VMCB_EXIT_NMI:
1388 	case VMCB_EXIT_SMI:
1389 	case VMCB_EXIT_INIT:
1390 		/*
1391 		 * For external NMI/SMI and physical INIT interrupts, simply
1392 		 * continue execution, as those host events will be handled by
1393 		 * the physical CPU.
1394 		 */
1395 		handled = 1;
1396 		break;
1397 	case VMCB_EXIT_EXCP0 ... VMCB_EXIT_EXCP31: {
1398 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
1399 
1400 		const uint8_t idtvec = code - VMCB_EXIT_EXCP0;
1401 		uint32_t errcode = 0;
1402 		bool reflect = true;
1403 		bool errcode_valid = false;
1404 
1405 		switch (idtvec) {
1406 		case IDT_MC:
1407 			/* The host will handle the MCE itself. */
1408 			reflect = false;
1409 			vmm_call_trap(T_MCE);
1410 			break;
1411 		case IDT_PF:
1412 			VERIFY0(svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
1413 			    info2));
1414 			/* fallthru */
1415 		case IDT_NP:
1416 		case IDT_SS:
1417 		case IDT_GP:
1418 		case IDT_AC:
1419 		case IDT_TS:
1420 			errcode_valid = true;
1421 			errcode = info1;
1422 			break;
1423 
1424 		case IDT_DF:
1425 			errcode_valid = true;
1426 			break;
1427 
1428 		case IDT_BP:
1429 		case IDT_OF:
1430 		case IDT_BR:
1431 			/*
1432 			 * The 'nrip' field is populated for INT3, INTO and
1433 			 * BOUND exceptions and this also implies that
1434 			 * 'inst_length' is non-zero.
1435 			 *
1436 			 * Reset 'inst_length' to zero so the guest %rip at
1437 			 * event injection is identical to what it was when
1438 			 * the exception originally happened.
1439 			 */
1440 			vmexit->inst_length = 0;
1441 			/* fallthru */
1442 		default:
1443 			errcode_valid = false;
1444 			break;
1445 		}
1446 		VERIFY0(vmexit->inst_length);
1447 
1448 		if (reflect) {
1449 			/* Reflect the exception back into the guest */
1450 			VERIFY0(vm_inject_exception(svm_sc->vm, vcpu, idtvec,
1451 			    errcode_valid, errcode, false));
1452 		}
1453 		handled = 1;
1454 		break;
1455 		}
1456 	case VMCB_EXIT_MSR:
1457 		handled = svm_handle_msr(svm_sc, vcpu, vmexit, info1 != 0);
1458 		break;
1459 	case VMCB_EXIT_IO:
1460 		handled = svm_handle_inout(svm_sc, vcpu, vmexit);
1461 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
1462 		break;
1463 	case VMCB_EXIT_SHUTDOWN:
1464 		(void) vm_suspend(svm_sc->vm, VM_SUSPEND_TRIPLEFAULT);
1465 		handled = 1;
1466 		break;
1467 	case VMCB_EXIT_INVLPGA:
1468 		/* privileged invalidation instructions */
1469 		vm_inject_ud(svm_sc->vm, vcpu);
1470 		handled = 1;
1471 		break;
1472 	case VMCB_EXIT_VMRUN:
1473 	case VMCB_EXIT_VMLOAD:
1474 	case VMCB_EXIT_VMSAVE:
1475 	case VMCB_EXIT_STGI:
1476 	case VMCB_EXIT_CLGI:
1477 	case VMCB_EXIT_SKINIT:
1478 		/* privileged vmm instructions */
1479 		vm_inject_ud(svm_sc->vm, vcpu);
1480 		handled = 1;
1481 		break;
1482 	case VMCB_EXIT_INVD:
1483 	case VMCB_EXIT_WBINVD:
1484 		/* ignore exit */
1485 		handled = 1;
1486 		break;
1487 	case VMCB_EXIT_VMMCALL:
1488 		/* No handlers make use of VMMCALL for now */
1489 		vm_inject_ud(svm_sc->vm, vcpu);
1490 		handled = 1;
1491 		break;
1492 	case VMCB_EXIT_CPUID:
1493 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
1494 		vcpu_emulate_cpuid(svm_sc->vm, vcpu, &state->rax,
1495 		    &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx);
1496 		handled = 1;
1497 		break;
1498 	case VMCB_EXIT_HLT:
1499 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
1500 		vmexit->exitcode = VM_EXITCODE_HLT;
1501 		vmexit->u.hlt.rflags = state->rflags;
1502 		break;
1503 	case VMCB_EXIT_PAUSE:
1504 		vmexit->exitcode = VM_EXITCODE_PAUSE;
1505 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
1506 		break;
1507 	case VMCB_EXIT_NPF:
1508 		/* EXITINFO2 contains the faulting guest physical address */
1509 		if (info1 & VMCB_NPF_INFO1_RSV) {
1510 			/* nested fault with reserved bits set */
1511 		} else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
1512 			vmexit->exitcode = VM_EXITCODE_PAGING;
1513 			vmexit->u.paging.gpa = info2;
1514 			vmexit->u.paging.fault_type = npf_fault_type(info1);
1515 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1516 		} else if (svm_npf_emul_fault(info1)) {
1517 			svm_handle_mmio_emul(svm_sc, vcpu, vmexit, info2);
1518 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MMIO_EMUL, 1);
1519 		}
1520 		break;
1521 	case VMCB_EXIT_MONITOR:
1522 		vmexit->exitcode = VM_EXITCODE_MONITOR;
1523 		break;
1524 	case VMCB_EXIT_MWAIT:
1525 		vmexit->exitcode = VM_EXITCODE_MWAIT;
1526 		break;
1527 	default:
1528 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
1529 		break;
1530 	}
1531 
1532 	DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, vmexit->rip, uint32_t,
1533 	    code);
1534 
1535 	if (handled) {
1536 		vmexit->rip += vmexit->inst_length;
1537 		vmexit->inst_length = 0;
1538 		state->rip = vmexit->rip;
1539 	} else {
1540 		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1541 			/*
1542 			 * If this VM exit was not claimed by anybody then
1543 			 * treat it as a generic SVM exit.
1544 			 */
1545 			vm_exit_svm(vmexit, code, info1, info2);
1546 		} else {
1547 			/*
1548 			 * The exitcode and collateral have been populated.
1549 			 * The VM exit will be processed further in userland.
1550 			 */
1551 		}
1552 	}
1553 	return (handled);
1554 }
1555 
1556 /*
1557  * Inject exceptions, NMIs, and ExtINTs.
1558  *
1559  * The logic behind these is complicated and may involve mutex contention, so
1560  * the injection is performed without the protection of host CPU interrupts
1561  * being disabled.  This means a racing notification could be "lost",
1562  * necessitating a later call to svm_inject_recheck() to close that window
1563  * of opportunity.
1564  */
1565 static enum event_inject_state
1566 svm_inject_events(struct svm_softc *sc, int vcpu)
1567 {
1568 	struct vmcb_ctrl *ctrl;
1569 	struct vmcb_state *state;
1570 	struct svm_vcpu *vcpustate;
1571 	uint64_t intinfo;
1572 	enum event_inject_state ev_state;
1573 
1574 	state = svm_get_vmcb_state(sc, vcpu);
1575 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1576 	vcpustate = svm_get_vcpu(sc, vcpu);
1577 	ev_state = EIS_CAN_INJECT;
1578 
1579 	/* Clear any interrupt shadow if guest %rip has changed */
1580 	if (vcpustate->nextrip != state->rip) {
1581 		ctrl->intr_shadow = 0;
1582 	}
1583 
1584 	/*
1585 	 * An event is already pending for injection.  This can occur when the
1586 	 * vCPU exits prior to VM entry (like for an AST).
1587 	 */
1588 	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
1589 		return (EIS_EV_EXISTING | EIS_REQ_EXIT);
1590 	}
1591 
1592 	/*
1593 	 * Inject pending events or exceptions for this vcpu.
1594 	 *
1595 	 * An event might be pending because the previous #VMEXIT happened
1596 	 * during event delivery (i.e. ctrl->exitintinfo).
1597 	 *
1598 	 * An event might also be pending because an exception was injected
1599 	 * by the hypervisor (e.g. #PF during instruction emulation).
1600 	 */
1601 	if (vm_entry_intinfo(sc->vm, vcpu, &intinfo)) {
1602 		svm_inject_event(ctrl, intinfo);
1603 		vmm_stat_incr(sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
1604 		ev_state = EIS_EV_INJECTED;
1605 	}
1606 
1607 	/* NMI event has priority over interrupts. */
1608 	if (vm_nmi_pending(sc->vm, vcpu) && !svm_nmi_blocked(sc, vcpu)) {
1609 		if (ev_state == EIS_CAN_INJECT) {
1610 			/* Can't inject NMI if vcpu is in an intr_shadow. */
1611 			if (ctrl->intr_shadow) {
1612 				return (EIS_GI_BLOCK);
1613 			}
1614 
1615 			svm_inject_nmi(sc, vcpu);
1616 			ev_state = EIS_EV_INJECTED;
1617 		} else {
1618 			return (ev_state | EIS_REQ_EXIT);
1619 		}
1620 	}
1621 
1622 	if (vm_extint_pending(sc->vm, vcpu)) {
1623 		int vector;
1624 
1625 		if (ev_state != EIS_CAN_INJECT) {
1626 			return (ev_state | EIS_REQ_EXIT);
1627 		}
1628 
1629 		/*
1630 		 * If the guest has disabled interrupts or is in an interrupt
1631 		 * shadow then we cannot inject the pending interrupt.
1632 		 */
1633 		if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1634 			return (EIS_GI_BLOCK);
1635 		}
1636 
1637 		/* Ask the legacy pic for a vector to inject */
1638 		vatpic_pending_intr(sc->vm, &vector);
1639 		KASSERT(vector >= 0 && vector <= 255,
1640 		    ("invalid vector %d from INTR", vector));
1641 
1642 		svm_inject_irq(sc, vcpu, vector);
1643 		vm_extint_clear(sc->vm, vcpu);
1644 		vatpic_intr_accepted(sc->vm, vector);
1645 		ev_state = EIS_EV_INJECTED;
1646 	}
1647 
1648 	return (ev_state);
1649 }
1650 
1651 /*
1652  * Synchronize vLAPIC state and inject any interrupts pending on it.
1653  *
1654  * This is done with host CPU interrupts disabled so notification IPIs will be
1655  * queued on the host APIC and recognized when entering SVM guest context.
1656  */
1657 static enum event_inject_state
1658 svm_inject_vlapic(struct svm_softc *sc, int vcpu, struct vlapic *vlapic,
1659     enum event_inject_state ev_state)
1660 {
1661 	struct vmcb_ctrl *ctrl;
1662 	struct vmcb_state *state;
1663 	int vector;
1664 	uint8_t v_tpr;
1665 
1666 	state = svm_get_vmcb_state(sc, vcpu);
1667 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1668 
1669 	/*
1670 	 * The guest can modify the TPR by writing to %cr8. In guest mode the
1671 	 * CPU reflects this write to V_TPR without hypervisor intervention.
1672 	 *
1673 	 * The guest can also modify the TPR by writing to it via the memory
1674 	 * mapped APIC page. In this case, the write will be emulated by the
1675 	 * hypervisor. For this reason V_TPR must be updated before every
1676 	 * VMRUN.
1677 	 */
1678 	v_tpr = vlapic_get_cr8(vlapic);
1679 	KASSERT(v_tpr <= 15, ("invalid v_tpr %x", v_tpr));
1680 	if (ctrl->v_tpr != v_tpr) {
1681 		ctrl->v_tpr = v_tpr;
1682 		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1683 	}
1684 
1685 	/* If an event cannot otherwise be injected, we are done for now */
1686 	if (ev_state != EIS_CAN_INJECT) {
1687 		return (ev_state);
1688 	}
1689 
1690 	if (!vlapic_pending_intr(vlapic, &vector)) {
1691 		return (EIS_CAN_INJECT);
1692 	}
1693 	KASSERT(vector >= 16 && vector <= 255,
1694 	    ("invalid vector %d from local APIC", vector));
1695 
1696 	/*
1697 	 * If the guest has disabled interrupts or is in an interrupt shadow
1698 	 * then we cannot inject the pending interrupt.
1699 	 */
1700 	if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1701 		return (EIS_GI_BLOCK);
1702 	}
1703 
1704 	svm_inject_irq(sc, vcpu, vector);
1705 	vlapic_intr_accepted(vlapic, vector);
1706 	return (EIS_EV_INJECTED);
1707 }
1708 
1709 /*
1710  * Re-check for events to be injected.
1711  *
1712  * Once host CPU interrupts are disabled, check for the presence of any events
1713  * which require injection processing.  If an exit is required upon injection,
1714  * or once the guest becomes interruptable, that will be configured too.
1715  */
1716 static bool
1717 svm_inject_recheck(struct svm_softc *sc, int vcpu,
1718     enum event_inject_state ev_state)
1719 {
1720 	struct vmcb_ctrl *ctrl;
1721 
1722 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1723 
1724 	if (ev_state == EIS_CAN_INJECT) {
1725 		/*
1726 		 * An active interrupt shadow would preclude us from injecting
1727 		 * any events picked up during a re-check.
1728 		 */
1729 		if (ctrl->intr_shadow != 0) {
1730 			return (false);
1731 		}
1732 
1733 		if (vm_nmi_pending(sc->vm, vcpu) &&
1734 		    !svm_nmi_blocked(sc, vcpu)) {
1735 			/* queued NMI not blocked by NMI-window-exiting */
1736 			return (true);
1737 		}
1738 		if (vm_extint_pending(sc->vm, vcpu)) {
1739 			/* queued ExtINT not blocked by existing injection */
1740 			return (true);
1741 		}
1742 	} else {
1743 		if ((ev_state & EIS_REQ_EXIT) != 0) {
1744 			/*
1745 			 * Use a self-IPI to force an immediate exit after
1746 			 * event injection has occurred.
1747 			 */
1748 			poke_cpu(CPU->cpu_id);
1749 		} else {
1750 			/*
1751 			 * If any event is being injected, an exit immediately
1752 			 * upon becoming interruptable again will allow pending
1753 			 * or newly queued events to be injected in a timely
1754 			 * manner.
1755 			 */
1756 			svm_enable_intr_window_exiting(sc, vcpu);
1757 		}
1758 	}
1759 	return (false);
1760 }
1761 
1762 
1763 static void
1764 check_asid(struct svm_softc *sc, int vcpuid, uint_t thiscpu, uint64_t nptgen)
1765 {
1766 	struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1767 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1768 	uint8_t flush;
1769 
1770 	flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(),
1771 	    vcpustate->nptgen != nptgen);
1772 
1773 	if (flush != VMCB_TLB_FLUSH_NOTHING) {
1774 		ctrl->asid = vcpustate->hma_asid.hsa_asid;
1775 		svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1776 	}
1777 	ctrl->tlb_ctrl = flush;
1778 	vcpustate->nptgen = nptgen;
1779 }
1780 
1781 static void
1782 flush_asid(struct svm_softc *sc, int vcpuid)
1783 {
1784 	struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1785 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1786 	uint8_t flush;
1787 
1788 	flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(),
1789 	    true);
1790 
1791 	ASSERT(flush != VMCB_TLB_FLUSH_NOTHING);
1792 	ctrl->asid = vcpustate->hma_asid.hsa_asid;
1793 	ctrl->tlb_ctrl = flush;
1794 	svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1795 	/*
1796 	 * A potential future optimization: We could choose to update the nptgen
1797 	 * associated with the vCPU, since any pending nptgen change requiring a
1798 	 * flush will be satisfied by the one which has just now been queued.
1799 	 */
1800 }
1801 
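/*
 * The CLGI and STGI instructions clear and set the Global Interrupt Flag
 * (GIF), holding off and re-enabling interrupt delivery to the host around
 * guest entry.
 */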
1802 static __inline void
1803 disable_gintr(void)
1804 {
1805 	__asm __volatile("clgi");
1806 }
1807 
1808 static __inline void
1809 enable_gintr(void)
1810 {
1811 	__asm __volatile("stgi");
1812 }
1813 
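/*
 * Save the host debug register and DEBUGCTL state and install the guest's
 * %dr0-%dr3 values prior to entering the guest.
 */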
1814 static __inline void
1815 svm_dr_enter_guest(struct svm_regctx *gctx)
1816 {
1817 
1818 	/* Save host control debug registers. */
1819 	gctx->host_dr7 = rdr7();
1820 	gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
1821 
1822 	/*
1823 	 * Disable debugging in DR7 and DEBUGCTL to avoid triggering
1824 	 * exceptions in the host based on the guest DRx values.  The
1825 	 * guest DR6, DR7, and DEBUGCTL are saved/restored in the
1826 	 * VMCB.
1827 	 */
1828 	load_dr7(0);
1829 	wrmsr(MSR_DEBUGCTLMSR, 0);
1830 
1831 	/* Save host debug registers. */
1832 	gctx->host_dr0 = rdr0();
1833 	gctx->host_dr1 = rdr1();
1834 	gctx->host_dr2 = rdr2();
1835 	gctx->host_dr3 = rdr3();
1836 	gctx->host_dr6 = rdr6();
1837 
1838 	/* Restore guest debug registers. */
1839 	load_dr0(gctx->sctx_dr0);
1840 	load_dr1(gctx->sctx_dr1);
1841 	load_dr2(gctx->sctx_dr2);
1842 	load_dr3(gctx->sctx_dr3);
1843 }
1844 
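/*
 * Save the guest's %dr0-%dr3 values and restore the host debug register and
 * DEBUGCTL state after a #VMEXIT.
 */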
1845 static __inline void
1846 svm_dr_leave_guest(struct svm_regctx *gctx)
1847 {
1848 
1849 	/* Save guest debug registers. */
1850 	gctx->sctx_dr0 = rdr0();
1851 	gctx->sctx_dr1 = rdr1();
1852 	gctx->sctx_dr2 = rdr2();
1853 	gctx->sctx_dr3 = rdr3();
1854 
1855 	/*
1856 	 * Restore host debug registers.  Restore DR7 and DEBUGCTL
1857 	 * last.
1858 	 */
1859 	load_dr0(gctx->host_dr0);
1860 	load_dr1(gctx->host_dr1);
1861 	load_dr2(gctx->host_dr2);
1862 	load_dr3(gctx->host_dr3);
1863 	load_dr6(gctx->host_dr6);
1864 	wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
1865 	load_dr7(gctx->host_dr7);
1866 }
1867 
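/*
 * Propagate the current TSC offset for this vCPU into the VMCB, marking the
 * relevant VMCB cache section dirty if the value has changed.
 */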
1868 static void
1869 svm_apply_tsc_adjust(struct svm_softc *svm_sc, int vcpuid)
1870 {
1871 	const uint64_t offset = vcpu_tsc_offset(svm_sc->vm, vcpuid, true);
1872 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(svm_sc, vcpuid);
1873 
1874 	if (ctrl->tsc_offset != offset) {
1875 		ctrl->tsc_offset = offset;
1876 		svm_set_dirty(svm_sc, vcpuid, VMCB_CACHE_I);
1877 	}
1878 }
1879 
1880 
1881 /*
1882  * Start the vcpu with the specified RIP.
1883  */
1884 static int
1885 svm_vmrun(void *arg, int vcpu, uint64_t rip)
1886 {
1887 	struct svm_regctx *gctx;
1888 	struct svm_softc *svm_sc;
1889 	struct svm_vcpu *vcpustate;
1890 	struct vmcb_state *state;
1891 	struct vmcb_ctrl *ctrl;
1892 	struct vm_exit *vmexit;
1893 	struct vlapic *vlapic;
1894 	vm_client_t *vmc;
1895 	struct vm *vm;
1896 	uint64_t vmcb_pa;
1897 	int handled;
1898 	uint16_t ldt_sel;
1899 
1900 	svm_sc = arg;
1901 	vm = svm_sc->vm;
1902 
1903 	vcpustate = svm_get_vcpu(svm_sc, vcpu);
1904 	state = svm_get_vmcb_state(svm_sc, vcpu);
1905 	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
1906 	vmexit = vm_exitinfo(vm, vcpu);
1907 	vlapic = vm_lapic(vm, vcpu);
1908 	vmc = vm_get_vmclient(vm, vcpu);
1909 
1910 	gctx = svm_get_guest_regctx(svm_sc, vcpu);
1911 	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
1912 
1913 	if (vcpustate->lastcpu != curcpu) {
1914 		/*
1915 		 * Force new ASID allocation by invalidating the generation.
1916 		 */
1917 		vcpustate->hma_asid.hsa_gen = 0;
1918 
1919 		/*
1920 		 * Invalidate the VMCB state cache by marking all fields dirty.
1921 		 */
1922 		svm_set_dirty(svm_sc, vcpu, 0xffffffff);
1923 
1924 		/*
1925 		 * XXX
1926 		 * Setting 'vcpustate->lastcpu' here is a bit premature because
1927 		 * we may return from this function without actually executing
1928 		 * the VMRUN instruction. This could happen if an AST or yield
1929 		 * condition is pending on the first time through the loop.
1930 		 *
1931 		 * This works for now but any new side-effects of vcpu
1932 		 * migration should take this case into account.
1933 		 */
1934 		vcpustate->lastcpu = curcpu;
1935 		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
1936 	}
1937 
1938 	svm_apply_tsc_adjust(svm_sc, vcpu);
1939 
1940 	svm_msr_guest_enter(svm_sc, vcpu);
1941 
1942 	VERIFY(!vcpustate->loaded && curthread->t_preempt != 0);
1943 	vcpustate->loaded = B_TRUE;
1944 
1945 	/* Update Guest RIP */
1946 	state->rip = rip;
1947 
1948 	do {
1949 		enum event_inject_state inject_state;
1950 		uint64_t nptgen;
1951 
1952 		/*
1953 		 * Initial event injection is complex and may involve mutex
1954 		 * contention, so it must be performed with global interrupts
1955 		 * still enabled.
1956 		 */
1957 		inject_state = svm_inject_events(svm_sc, vcpu);
1958 		handled = 0;
1959 
1960 		/*
1961 		 * Disable global interrupts to guarantee atomicity during
1962 		 * loading of guest state. This includes not only the state
1963 		 * loaded by the "vmrun" instruction but also software state
1964 		 * maintained by the hypervisor: suspended and rendezvous
1965 		 * state, NPT generation number, vlapic interrupts etc.
1966 		 */
1967 		disable_gintr();
1968 
1969 		/*
1970 		 * Synchronizing and injecting vlapic state is lock-free and is
1971 		 * safe (and prudent) to perform with interrupts disabled.
1972 		 */
1973 		inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic,
1974 		    inject_state);
1975 
1976 		/*
1977 		 * Check for vCPU bail-out conditions.  This must be done after
1978 		 * svm_inject_events() to detect a triple-fault condition.
1979 		 */
1980 		if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) {
1981 			enable_gintr();
1982 			break;
1983 		}
1984 
1985 		if (vcpu_run_state_pending(vm, vcpu)) {
1986 			enable_gintr();
1987 			vm_exit_run_state(vm, vcpu, state->rip);
1988 			break;
1989 		}
1990 
1991 		/*
1992 		 * If subsequent activity queued events which require injection
1993 		 * handling, take another lap to handle them.
1994 		 */
1995 		if (svm_inject_recheck(svm_sc, vcpu, inject_state)) {
1996 			enable_gintr();
1997 			handled = 1;
1998 			continue;
1999 		}
2000 
2001 		/*
2002 		 * #VMEXIT resumes the host with the guest LDTR, so
2003 		 * save the current LDT selector so it can be restored
2004 		 * after an exit.  The userspace hypervisor probably
2005 		 * doesn't use an LDT, but save and restore it to be
2006 		 * safe.
2007 		 */
2008 		ldt_sel = sldt();
2009 
2010 		/*
2011 		 * Check the vmspace and ASID generations to ensure that the
2012 		 * vcpu does not use stale TLB mappings.
2013 		 */
2014 		nptgen = vmc_table_enter(vmc);
2015 		check_asid(svm_sc, vcpu, curcpu, nptgen);
2016 
2017 		ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
2018 		vcpustate->dirty = 0;
2019 
2020 		/* Launch Virtual Machine. */
2021 		vcpu_ustate_change(vm, vcpu, VU_RUN);
2022 		svm_dr_enter_guest(gctx);
2023 		svm_launch(vmcb_pa, gctx, get_pcpu());
2024 		svm_dr_leave_guest(gctx);
2025 		vcpu_ustate_change(vm, vcpu, VU_EMU_KERN);
2026 
2027 		/* Restore host LDTR. */
2028 		lldt(ldt_sel);
2029 
2030 		/* #VMEXIT disables interrupts so re-enable them here. */
2031 		enable_gintr();
2032 
2033 		vmc_table_exit(vmc);
2034 
2035 		/* Update 'nextrip' */
2036 		vcpustate->nextrip = state->rip;
2037 
2038 		/* Handle #VMEXIT and if required return to user space. */
2039 		handled = svm_vmexit(svm_sc, vcpu, vmexit);
2040 	} while (handled);
2041 
2042 	svm_msr_guest_exit(svm_sc, vcpu);
2043 
2044 	VERIFY(vcpustate->loaded && curthread->t_preempt != 0);
2045 	vcpustate->loaded = B_FALSE;
2046 
2047 	return (0);
2048 }
2049 
2050 static void
2051 svm_vmcleanup(void *arg)
2052 {
2053 	struct svm_softc *sc = arg;
2054 
2055 	vmm_contig_free(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE);
2056 	vmm_contig_free(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE);
2057 	kmem_free(sc, sizeof (*sc));
2058 }
2059 
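/*
 * Translate a VM_REG_GUEST_* identifier into a pointer to the corresponding
 * slot in the software-maintained register context, or return NULL if the
 * register is not kept there (e.g. it lives in the VMCB instead).
 */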
2060 static uint64_t *
2061 swctx_regptr(struct svm_regctx *regctx, int reg)
2062 {
2063 	switch (reg) {
2064 	case VM_REG_GUEST_RBX:
2065 		return (&regctx->sctx_rbx);
2066 	case VM_REG_GUEST_RCX:
2067 		return (&regctx->sctx_rcx);
2068 	case VM_REG_GUEST_RDX:
2069 		return (&regctx->sctx_rdx);
2070 	case VM_REG_GUEST_RDI:
2071 		return (&regctx->sctx_rdi);
2072 	case VM_REG_GUEST_RSI:
2073 		return (&regctx->sctx_rsi);
2074 	case VM_REG_GUEST_RBP:
2075 		return (&regctx->sctx_rbp);
2076 	case VM_REG_GUEST_R8:
2077 		return (&regctx->sctx_r8);
2078 	case VM_REG_GUEST_R9:
2079 		return (&regctx->sctx_r9);
2080 	case VM_REG_GUEST_R10:
2081 		return (&regctx->sctx_r10);
2082 	case VM_REG_GUEST_R11:
2083 		return (&regctx->sctx_r11);
2084 	case VM_REG_GUEST_R12:
2085 		return (&regctx->sctx_r12);
2086 	case VM_REG_GUEST_R13:
2087 		return (&regctx->sctx_r13);
2088 	case VM_REG_GUEST_R14:
2089 		return (&regctx->sctx_r14);
2090 	case VM_REG_GUEST_R15:
2091 		return (&regctx->sctx_r15);
2092 	case VM_REG_GUEST_DR0:
2093 		return (&regctx->sctx_dr0);
2094 	case VM_REG_GUEST_DR1:
2095 		return (&regctx->sctx_dr1);
2096 	case VM_REG_GUEST_DR2:
2097 		return (&regctx->sctx_dr2);
2098 	case VM_REG_GUEST_DR3:
2099 		return (&regctx->sctx_dr3);
2100 	default:
2101 		return (NULL);
2102 	}
2103 }
2104 
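/*
 * Fetch the value of a guest register, sourcing it from either the software
 * register context or the VMCB as appropriate.
 */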
2105 static int
2106 svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
2107 {
2108 	struct svm_softc *sc;
2109 	struct vmcb *vmcb;
2110 	uint64_t *regp;
2111 	uint64_t *fieldp;
2112 	struct vmcb_segment *seg;
2113 
2114 	sc = arg;
2115 	vmcb = svm_get_vmcb(sc, vcpu);
2116 
2117 	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2118 	if (regp != NULL) {
2119 		*val = *regp;
2120 		return (0);
2121 	}
2122 
2123 	switch (ident) {
2124 	case VM_REG_GUEST_INTR_SHADOW:
2125 		*val = (vmcb->ctrl.intr_shadow != 0) ? 1 : 0;
2126 		break;
2127 
2128 	case VM_REG_GUEST_CR0:
2129 		svm_get_cr0(sc, vcpu, val);
2130 		break;
2131 	case VM_REG_GUEST_CR2:
2132 	case VM_REG_GUEST_CR3:
2133 	case VM_REG_GUEST_CR4:
2134 	case VM_REG_GUEST_DR6:
2135 	case VM_REG_GUEST_DR7:
2136 	case VM_REG_GUEST_EFER:
2137 	case VM_REG_GUEST_RAX:
2138 	case VM_REG_GUEST_RFLAGS:
2139 	case VM_REG_GUEST_RIP:
2140 	case VM_REG_GUEST_RSP:
2141 		fieldp = vmcb_regptr(vmcb, ident, NULL);
2142 		*val = *fieldp;
2143 		break;
2144 
2145 	case VM_REG_GUEST_CS:
2146 	case VM_REG_GUEST_DS:
2147 	case VM_REG_GUEST_ES:
2148 	case VM_REG_GUEST_FS:
2149 	case VM_REG_GUEST_GS:
2150 	case VM_REG_GUEST_SS:
2151 	case VM_REG_GUEST_LDTR:
2152 	case VM_REG_GUEST_TR:
2153 		seg = vmcb_segptr(vmcb, ident);
2154 		*val = seg->selector;
2155 		break;
2156 
2157 	case VM_REG_GUEST_GDTR:
2158 	case VM_REG_GUEST_IDTR:
2159 		/* GDTR and IDTR don't have segment selectors */
2160 		return (EINVAL);
2161 
2162 	case VM_REG_GUEST_PDPTE0:
2163 	case VM_REG_GUEST_PDPTE1:
2164 	case VM_REG_GUEST_PDPTE2:
2165 	case VM_REG_GUEST_PDPTE3:
2166 		/*
2167 		 * Unlike VMX, where the PDPTEs are explicitly cached as part of
2168 		 * several well-defined events related to paging (such as
2169 		 * loading %cr3), SVM walks the guest's PDPEs (its name for the
2170 		 * PDPTEs) as part of nested paging lookups.  This makes these
2171 		 * registers effectively irrelevant on SVM.
2172 		 *
2173 		 * Rather than tossing an error, emit zeroed values so casual
2174 		 * consumers do not need to be as careful about that difference.
2175 		 */
2176 		*val = 0;
2177 		break;
2178 
2179 	default:
2180 		return (EINVAL);
2181 	}
2182 
2183 	return (0);
2184 }
2185 
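/*
 * Store a value into a guest register, updating either the software register
 * context or the VMCB and marking any affected VMCB cache sections dirty.
 */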
2186 static int
2187 svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
2188 {
2189 	struct svm_softc *sc;
2190 	struct vmcb *vmcb;
2191 	uint64_t *regp;
2192 	uint64_t *fieldp;
2193 	uint32_t dirty;
2194 	struct vmcb_segment *seg;
2195 
2196 	sc = arg;
2197 	vmcb = svm_get_vmcb(sc, vcpu);
2198 
2199 	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2200 	if (regp != NULL) {
2201 		*regp = val;
2202 		return (0);
2203 	}
2204 
2205 	dirty = VMCB_CACHE_NONE;
2206 	switch (ident) {
2207 	case VM_REG_GUEST_INTR_SHADOW:
2208 		vmcb->ctrl.intr_shadow = (val != 0) ? 1 : 0;
2209 		break;
2210 
2211 	case VM_REG_GUEST_EFER:
2212 		fieldp = vmcb_regptr(vmcb, ident, &dirty);
2213 		/* EFER_SVM must always be set when the guest is executing */
2214 		*fieldp = val | EFER_SVM;
2215 		dirty |= VMCB_CACHE_CR;
2216 		break;
2217 
2218 	case VM_REG_GUEST_CR0:
2219 		svm_set_cr0(sc, vcpu, val, false);
2220 		break;
2221 	case VM_REG_GUEST_CR2:
2222 	case VM_REG_GUEST_CR3:
2223 	case VM_REG_GUEST_CR4:
2224 	case VM_REG_GUEST_DR6:
2225 	case VM_REG_GUEST_DR7:
2226 	case VM_REG_GUEST_RAX:
2227 	case VM_REG_GUEST_RFLAGS:
2228 	case VM_REG_GUEST_RIP:
2229 	case VM_REG_GUEST_RSP:
2230 		fieldp = vmcb_regptr(vmcb, ident, &dirty);
2231 		*fieldp = val;
2232 		break;
2233 
2234 	case VM_REG_GUEST_CS:
2235 	case VM_REG_GUEST_DS:
2236 	case VM_REG_GUEST_ES:
2237 	case VM_REG_GUEST_SS:
2238 	case VM_REG_GUEST_FS:
2239 	case VM_REG_GUEST_GS:
2240 	case VM_REG_GUEST_LDTR:
2241 	case VM_REG_GUEST_TR:
2242 		dirty |= VMCB_CACHE_SEG;
2243 		seg = vmcb_segptr(vmcb, ident);
2244 		seg->selector = (uint16_t)val;
2245 		break;
2246 
2247 	case VM_REG_GUEST_GDTR:
2248 	case VM_REG_GUEST_IDTR:
2249 		/* GDTR and IDTR don't have segment selectors */
2250 		return (EINVAL);
2251 
2252 	case VM_REG_GUEST_PDPTE0:
2253 	case VM_REG_GUEST_PDPTE1:
2254 	case VM_REG_GUEST_PDPTE2:
2255 	case VM_REG_GUEST_PDPTE3:
2256 		/*
2257 		 * PDPEs (AMD's PDPTE) are not cached under SVM, so we can
2258 		 * ignore attempts to set them.  See handler in svm_getreg() for
2259 		 * more details.
2260 		 */
2261 		break;
2262 
2263 	default:
2264 		return (EINVAL);
2265 	}
2266 
2267 	if (dirty != VMCB_CACHE_NONE) {
2268 		svm_set_dirty(sc, vcpu, dirty);
2269 	}
2270 
2271 	/*
2272 	 * XXX deal with CR3 and invalidate TLB entries tagged with the
2273 	 * vcpu's ASID. This needs to be treated differently depending on
2274 	 * whether 'running' is true/false.
2275 	 */
2276 
2277 	return (0);
2278 }
2279 
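/*
 * Update the base, limit, and access attributes of a guest segment (or
 * descriptor table register) in the VMCB.
 */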
2280 static int
2281 svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc)
2282 {
2283 	struct vmcb *vmcb;
2284 	struct svm_softc *sc;
2285 	struct vmcb_segment *seg;
2286 
2287 	sc = arg;
2288 	vmcb = svm_get_vmcb(sc, vcpu);
2289 
2290 	switch (reg) {
2291 	case VM_REG_GUEST_CS:
2292 	case VM_REG_GUEST_DS:
2293 	case VM_REG_GUEST_ES:
2294 	case VM_REG_GUEST_SS:
2295 	case VM_REG_GUEST_FS:
2296 	case VM_REG_GUEST_GS:
2297 	case VM_REG_GUEST_LDTR:
2298 	case VM_REG_GUEST_TR:
2299 		svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
2300 		seg = vmcb_segptr(vmcb, reg);
2301 		/*
2302 		 * Map seg_desc access to VMCB attribute format.
2303 		 *
2304 		 * SVM uses the 'P' bit in the segment attributes to indicate a
2305 		 * NULL segment so clear it if the segment is marked unusable.
2306 		 */
2307 		seg->attrib = VMCB_ACCESS2ATTR(desc->access);
2308 		if (SEG_DESC_UNUSABLE(desc->access)) {
2309 			seg->attrib &= ~0x80;
2310 		}
2311 		/*
2312 		 * Keep CPL synced with the DPL specified for %ss.
2313 		 *
2314 		 * KVM notes that a SYSRET to non-CPL-3 is possible on AMD
2315 		 * (unlike Intel), but accepts that deviation, since such guest
2316 		 * behavior would be unreasonable anyway and KVM performs this
2317 		 * same synchronization.
2318 		 */
2319 		if (reg == VM_REG_GUEST_SS) {
2320 			vmcb->state.cpl = SEG_DESC_DPL(desc->access);
2321 		}
2322 		break;
2323 
2324 	case VM_REG_GUEST_GDTR:
2325 	case VM_REG_GUEST_IDTR:
2326 		svm_set_dirty(sc, vcpu, VMCB_CACHE_DT);
2327 		seg = vmcb_segptr(vmcb, reg);
2328 		break;
2329 
2330 	default:
2331 		return (EINVAL);
2332 	}
2333 
2334 	ASSERT(seg != NULL);
2335 	seg->base = desc->base;
2336 	seg->limit = desc->limit;
2337 
2338 	return (0);
2339 }
2340 
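/*
 * Read the base, limit, and access attributes of a guest segment (or
 * descriptor table register) from the VMCB, converting segment attributes
 * into the VT-x access format expected by the processor-independent code.
 */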
2341 static int
2342 svm_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2343 {
2344 	struct vmcb *vmcb;
2345 	struct svm_softc *sc;
2346 	struct vmcb_segment *seg;
2347 
2348 	sc = arg;
2349 	vmcb = svm_get_vmcb(sc, vcpu);
2350 
2351 	switch (reg) {
2352 	case VM_REG_GUEST_DS:
2353 	case VM_REG_GUEST_ES:
2354 	case VM_REG_GUEST_FS:
2355 	case VM_REG_GUEST_GS:
2356 	case VM_REG_GUEST_SS:
2357 	case VM_REG_GUEST_LDTR:
2358 		seg = vmcb_segptr(vmcb, reg);
2359 		desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2360 		/*
2361 		 * VT-x uses bit 16 to indicate a segment that has been loaded
2362 		 * with a NULL selector (aka unusable). The 'desc->access'
2363 		 * field is interpreted in the VT-x format by the
2364 		 * processor-independent code.
2365 		 *
2366 		 * SVM uses the 'P' bit to convey the same information so
2367 		 * convert it into the VT-x format. For more details refer to
2368 		 * section "Segment State in the VMCB" in APMv2.
2369 		 */
2370 		if ((desc->access & 0x80) == 0) {
2371 			/* Unusable segment */
2372 			desc->access |= 0x10000;
2373 		}
2374 		break;
2375 
2376 	case VM_REG_GUEST_CS:
2377 	case VM_REG_GUEST_TR:
2378 		seg = vmcb_segptr(vmcb, reg);
2379 		desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2380 		break;
2381 
2382 	case VM_REG_GUEST_GDTR:
2383 	case VM_REG_GUEST_IDTR:
2384 		seg = vmcb_segptr(vmcb, reg);
2385 		/*
2386 		 * Since there are no access bits associated with the GDTR or
2387 		 * the IDTR, zero out the field to ensure it does not contain
2388 		 * garbage which might confuse the consumer.
2389 		 */
2390 		desc->access = 0;
2391 		break;
2392 
2393 	default:
2394 		return (EINVAL);
2395 	}
2396 
2397 	ASSERT(seg != NULL);
2398 	desc->base = seg->base;
2399 	desc->limit = seg->limit;
2400 	return (0);
2401 }
2402 
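/*
 * Read an MSR value directly from its backing storage in the VMCB, if the
 * MSR is one which is kept there.
 */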
2403 static int
2404 svm_get_msr(void *arg, int vcpu, uint32_t msr, uint64_t *valp)
2405 {
2406 	struct svm_softc *sc = arg;
2407 	struct vmcb *vmcb = svm_get_vmcb(sc, vcpu);
2408 	const uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, NULL);
2409 
2410 	if (msrp != NULL) {
2411 		*valp = *msrp;
2412 		return (0);
2413 	}
2414 
2415 	return (EINVAL);
2416 }
2417 
2418 static int
2419 svm_set_msr(void *arg, int vcpu, uint32_t msr, uint64_t val)
2420 {
2421 	struct svm_softc *sc = arg;
2422 	struct vmcb *vmcb = svm_get_vmcb(sc, vcpu);
2423 
2424 	uint32_t dirty = 0;
2425 	uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, &dirty);
2426 	if (msrp == NULL) {
2427 		return (EINVAL);
2428 	}
2429 	switch (msr) {
2430 	case MSR_EFER:
2431 		/*
2432 		 * For now, just clone the logic from
2433 		 * svm_setreg():
2434 		 *
2435 		 * EFER_SVM must always be set when the guest is
2436 		 * executing
2437 		 */
2438 		*msrp = val | EFER_SVM;
2439 		break;
2440 	/* TODO: other necessary MSR masking */
2441 	default:
2442 		*msrp = val;
2443 		break;
2444 	}
2445 	if (dirty != 0) {
2446 		svm_set_dirty(sc, vcpu, dirty);
2447 	}
2448 	return (0);
2450 }
2451 
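/*
 * Set or clear an optional VM capability for this vCPU.  Only the HLT-exit
 * and PAUSE-exit capabilities are supported, each implemented by toggling
 * the corresponding intercept in the VMCB.
 */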
2452 static int
2453 svm_setcap(void *arg, int vcpu, int type, int val)
2454 {
2455 	struct svm_softc *sc;
2456 	int error;
2457 
2458 	sc = arg;
2459 	error = 0;
2460 	switch (type) {
2461 	case VM_CAP_HALT_EXIT:
2462 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2463 		    VMCB_INTCPT_HLT, val);
2464 		break;
2465 	case VM_CAP_PAUSE_EXIT:
2466 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2467 		    VMCB_INTCPT_PAUSE, val);
2468 		break;
2469 	default:
2470 		error = ENOENT;
2471 		break;
2472 	}
2473 	return (error);
2474 }
2475 
2476 static int
2477 svm_getcap(void *arg, int vcpu, int type, int *retval)
2478 {
2479 	struct svm_softc *sc;
2480 	int error;
2481 
2482 	sc = arg;
2483 	error = 0;
2484 
2485 	switch (type) {
2486 	case VM_CAP_HALT_EXIT:
2487 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2488 		    VMCB_INTCPT_HLT);
2489 		break;
2490 	case VM_CAP_PAUSE_EXIT:
2491 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2492 		    VMCB_INTCPT_PAUSE);
2493 		break;
2494 	default:
2495 		error = ENOENT;
2496 		break;
2497 	}
2498 	return (error);
2499 }
2500 
2501 static struct vlapic *
2502 svm_vlapic_init(void *arg, int vcpuid)
2503 {
2504 	struct svm_softc *svm_sc;
2505 	struct vlapic *vlapic;
2506 
2507 	svm_sc = arg;
2508 	vlapic = kmem_zalloc(sizeof (struct vlapic), KM_SLEEP);
2509 	vlapic->vm = svm_sc->vm;
2510 	vlapic->vcpuid = vcpuid;
2511 	vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
2512 
2513 	vlapic_init(vlapic);
2514 
2515 	return (vlapic);
2516 }
2517 
2518 static void
2519 svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2520 {
2521 	vlapic_cleanup(vlapic);
2522 	kmem_free(vlapic, sizeof (struct vlapic));
2523 }
2524 
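/*
 * Prepare a vCPU for pause: stash any event which is pending injection into
 * the queued exit_intinfo state and clear the interrupt-window and IRET
 * intercepts associated with it.
 */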
2525 static void
2526 svm_pause(void *arg, int vcpu)
2527 {
2528 	struct svm_softc *sc = arg;
2529 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
2530 
2531 	/*
2532 	 * If an event is pending injection in the VMCB, stash it in
2533 	 * exit_intinfo as if it were deferred by an exit from guest context.
2534 	 */
2535 	const uint64_t intinfo = ctrl->eventinj;
2536 	if ((intinfo & VMCB_EVENTINJ_VALID) != 0) {
2537 		svm_stash_intinfo(sc, vcpu, intinfo);
2538 		ctrl->eventinj = 0;
2539 	}
2540 
2541 	/*
2542 	 * Now that no event is pending injection, interrupt-window exiting and
2543 	 * NMI-blocking can be disabled.  If/when this vCPU is made to run
2544 	 * again, those conditions will be reinstated when the now-queued events
2545 	 * are re-injected.
2546 	 */
2547 	svm_disable_intr_window_exiting(sc, vcpu);
2548 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
2549 }
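/*
 * Context-switch hooks: while a vCPU is loaded on a CPU, save its guest MSR
 * state when the host switches away from the thread and restore it when the
 * thread is switched back in.
 */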
2550 
2551 static void
2552 svm_savectx(void *arg, int vcpu)
2553 {
2554 	struct svm_softc *sc = arg;
2555 
2556 	if (sc->vcpu[vcpu].loaded) {
2557 		svm_msr_guest_exit(sc, vcpu);
2558 	}
2559 }
2560 
2561 static void
2562 svm_restorectx(void *arg, int vcpu)
2563 {
2564 	struct svm_softc *sc = arg;
2565 
2566 	if (sc->vcpu[vcpu].loaded) {
2567 		svm_msr_guest_enter(sc, vcpu);
2568 	}
2569 }
2570 
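/*
 * Table of SVM entry points consumed by the machine-independent VMM code.
 */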
2571 struct vmm_ops vmm_ops_amd = {
2572 	.init		= svm_init,
2573 	.cleanup	= svm_cleanup,
2574 	.resume		= svm_restore,
2575 
2576 	.vminit		= svm_vminit,
2577 	.vmrun		= svm_vmrun,
2578 	.vmcleanup	= svm_vmcleanup,
2579 	.vmgetreg	= svm_getreg,
2580 	.vmsetreg	= svm_setreg,
2581 	.vmgetdesc	= svm_getdesc,
2582 	.vmsetdesc	= svm_setdesc,
2583 	.vmgetcap	= svm_getcap,
2584 	.vmsetcap	= svm_setcap,
2585 	.vlapic_init	= svm_vlapic_init,
2586 	.vlapic_cleanup	= svm_vlapic_cleanup,
2587 	.vmpause	= svm_pause,
2588 
2589 	.vmsavectx	= svm_savectx,
2590 	.vmrestorectx	= svm_restorectx,
2591 
2592 	.vmgetmsr	= svm_get_msr,
2593 	.vmsetmsr	= svm_set_msr,
2594 };
2595