xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm.c (revision 0dd92943)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 /*
31  * This file and its contents are supplied under the terms of the
32  * Common Development and Distribution License ("CDDL"), version 1.0.
33  * You may only use this file in accordance with the terms of version
34  * 1.0 of the CDDL.
35  *
36  * A full copy of the text of the CDDL should have accompanied this
37  * source.  A copy of the CDDL is also available via the Internet at
38  * http://www.illumos.org/license/CDDL.
39  *
40  * Copyright 2015 Pluribus Networks Inc.
41  * Copyright 2018 Joyent, Inc.
42  * Copyright 2022 Oxide Computer Company
43  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
44  */
45 
46 #include <sys/cdefs.h>
47 __FBSDID("$FreeBSD$");
48 
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/kernel.h>
52 #include <sys/module.h>
53 #include <sys/sysctl.h>
54 #include <sys/kmem.h>
55 #include <sys/pcpu.h>
56 #include <sys/mutex.h>
57 #include <sys/proc.h>
58 #include <sys/rwlock.h>
59 #include <sys/sched.h>
60 #include <sys/systm.h>
61 #include <sys/sunddi.h>
62 #include <sys/hma.h>
63 
64 #include <machine/md_var.h>
65 #include <x86/psl.h>
66 #include <x86/apicreg.h>
67 
68 #include <machine/specialreg.h>
69 #include <machine/vmm.h>
70 #include <machine/vmm_dev.h>
71 #include <machine/vmparam.h>
72 #include <sys/vmm_instruction_emul.h>
73 #include <sys/vmm_vm.h>
74 #include <sys/vmm_gpt.h>
75 #include <sys/vmm_data.h>
76 
77 #include "vmm_ioport.h"
78 #include "vmm_host.h"
79 #include "vmm_util.h"
80 #include "vatpic.h"
81 #include "vatpit.h"
82 #include "vhpet.h"
83 #include "vioapic.h"
84 #include "vlapic.h"
85 #include "vpmtmr.h"
86 #include "vrtc.h"
87 #include "vmm_stat.h"
88 #include "vmm_lapic.h"
89 
90 #include "io/ppt.h"
91 #include "io/iommu.h"
92 
93 struct vlapic;
94 
95 /* Flags for vtc_status */
96 #define	VTCS_FPU_RESTORED	1 /* guest FPU restored, host FPU saved */
97 #define	VTCS_FPU_CTX_CRITICAL	2 /* in ctx where FPU restore cannot be lazy */
98 
99 typedef struct vm_thread_ctx {
100 	struct vm	*vtc_vm;
101 	int		vtc_vcpuid;
102 	uint_t		vtc_status;
103 	enum vcpu_ustate vtc_ustate;
104 } vm_thread_ctx_t;
105 
106 #define	VMM_MTRR_VAR_MAX 10
107 #define	VMM_MTRR_DEF_MASK \
108 	(MTRR_DEF_ENABLE | MTRR_DEF_FIXED_ENABLE | MTRR_DEF_TYPE)
109 #define	VMM_MTRR_PHYSBASE_MASK (MTRR_PHYSBASE_PHYSBASE | MTRR_PHYSBASE_TYPE)
110 #define	VMM_MTRR_PHYSMASK_MASK (MTRR_PHYSMASK_PHYSMASK | MTRR_PHYSMASK_VALID)
111 struct vm_mtrr {
112 	uint64_t def_type;
113 	uint64_t fixed4k[8];
114 	uint64_t fixed16k[2];
115 	uint64_t fixed64k;
116 	struct {
117 		uint64_t base;
118 		uint64_t mask;
119 	} var[VMM_MTRR_VAR_MAX];
120 };
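/*
 * Note on the layout above (see vm_rdmtrr()/vm_wrmtrr() below): the
 * variable-range MTRR MSRs are interleaved base/mask pairs, so an MSR
 * offset of (2 * n) from MSR_MTRRVarBase selects var[n].base and an offset
 * of (2 * n + 1) selects var[n].mask.
 */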
121 
122 /*
123  * Initialization:
124  * (a) allocated when vcpu is created
125  * (i) initialized when vcpu is created and when it is reinitialized
126  * (o) initialized the first time the vcpu is created
127  * (x) initialized before use
128  */
129 struct vcpu {
130 	/* (o) protects state, run_state, hostcpu, sipi_vector */
131 	kmutex_t	lock;
132 
133 	enum vcpu_state	state;		/* (o) vcpu state */
134 	enum vcpu_run_state run_state;	/* (i) vcpu init/sipi/run state */
135 	kcondvar_t	vcpu_cv;	/* (o) cpu waiter cv */
136 	kcondvar_t	state_cv;	/* (o) IDLE-transition cv */
137 	int		hostcpu;	/* (o) vcpu's current host cpu */
138 	int		lastloccpu;	/* (o) last host cpu localized to */
139 	int		reqidle;	/* (i) request vcpu to idle */
140 	struct vlapic	*vlapic;	/* (i) APIC device model */
141 	enum x2apic_state x2apic_state;	/* (i) APIC mode */
142 	uint64_t	exit_intinfo;	/* (i) events pending at VM exit */
143 	uint64_t	exc_pending;	/* (i) exception pending */
144 	bool		nmi_pending;	/* (i) NMI pending */
145 	bool		extint_pending;	/* (i) INTR pending */
146 
147 	uint8_t		sipi_vector;	/* (i) SIPI vector */
148 	hma_fpu_t	*guestfpu;	/* (a,i) guest fpu state */
149 	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
150 	void		*stats;		/* (a,i) statistics */
151 	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
152 	uint64_t	nextrip;	/* (x) next instruction to execute */
153 	struct vie	*vie_ctx;	/* (x) instruction emulation context */
154 	vm_client_t	*vmclient;	/* (a) VM-system client */
155 	uint64_t	tsc_offset;	/* (x) offset from host TSC */
156 	struct vm_mtrr	mtrr;		/* (i) vcpu's MTRR */
157 	vcpu_cpuid_config_t cpuid_cfg;	/* (x) cpuid configuration */
158 
159 	enum vcpu_ustate ustate;	/* (i) microstate for the vcpu */
160 	hrtime_t	ustate_when;	/* (i) time of last ustate change */
161 	uint64_t ustate_total[VU_MAX];	/* (o) total time spent in ustates */
162 	vm_thread_ctx_t	vtc;		/* (o) thread state for ctxops */
163 	struct ctxop	*ctxop;		/* (o) ctxop storage for vcpu */
164 };
165 
166 #define	vcpu_lock(v)		mutex_enter(&((v)->lock))
167 #define	vcpu_unlock(v)		mutex_exit(&((v)->lock))
168 #define	vcpu_assert_locked(v)	ASSERT(MUTEX_HELD(&((v)->lock)))
169 
170 struct mem_seg {
171 	size_t	len;
172 	bool	sysmem;
173 	vm_object_t *object;
174 };
175 #define	VM_MAX_MEMSEGS	5
176 
177 struct mem_map {
178 	vm_paddr_t	gpa;
179 	size_t		len;
180 	vm_ooffset_t	segoff;
181 	int		segid;
182 	int		prot;
183 	int		flags;
184 };
185 #define	VM_MAX_MEMMAPS	8
186 
187 /*
188  * Initialization:
189  * (o) initialized the first time the VM is created
190  * (i) initialized when VM is created and when it is reinitialized
191  * (x) initialized before use
192  */
193 struct vm {
194 	void		*cookie;		/* (i) cpu-specific data */
195 	void		*iommu;			/* (x) iommu-specific data */
196 	struct vhpet	*vhpet;			/* (i) virtual HPET */
197 	struct vioapic	*vioapic;		/* (i) virtual ioapic */
198 	struct vatpic	*vatpic;		/* (i) virtual atpic */
199 	struct vatpit	*vatpit;		/* (i) virtual atpit */
200 	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
201 	struct vrtc	*vrtc;			/* (o) virtual RTC */
202 	volatile cpuset_t active_cpus;		/* (i) active vcpus */
203 	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for dbg */
204 	int		suspend;		/* (i) stop VM execution */
205 	volatile cpuset_t suspended_cpus;	/* (i) suspended vcpus */
206 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
207 	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
208 	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
209 	struct vmspace	*vmspace;		/* (o) guest's address space */
210 	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
211 	/* The following describe the vm cpu topology */
212 	uint16_t	sockets;		/* (o) num of sockets */
213 	uint16_t	cores;			/* (o) num of cores/socket */
214 	uint16_t	threads;		/* (o) num of threads/core */
215 	uint16_t	maxcpus;		/* (o) max pluggable cpus */
216 
217 	uint64_t	boot_tsc_offset;	/* (i) TSC offset at VM boot */
218 	hrtime_t	boot_hrtime;		/* (i) hrtime at VM boot */
219 
220 	struct ioport_config ioports;		/* (o) ioport handling */
221 
222 	bool		mem_transient;		/* (o) alloc transient memory */
223 	bool		is_paused;		/* (i) instance is paused */
224 };
225 
226 static int vmm_initialized;
227 
228 
229 static void
230 nullop_panic(void)
231 {
232 	panic("null vmm operation call");
233 }
234 
235 /* Do not allow use of an un-set `ops` to do anything but panic */
236 static struct vmm_ops vmm_ops_null = {
237 	.init		= (vmm_init_func_t)nullop_panic,
238 	.cleanup	= (vmm_cleanup_func_t)nullop_panic,
239 	.resume		= (vmm_resume_func_t)nullop_panic,
240 	.vminit		= (vmi_init_func_t)nullop_panic,
241 	.vmrun		= (vmi_run_func_t)nullop_panic,
242 	.vmcleanup	= (vmi_cleanup_func_t)nullop_panic,
243 	.vmgetreg	= (vmi_get_register_t)nullop_panic,
244 	.vmsetreg	= (vmi_set_register_t)nullop_panic,
245 	.vmgetdesc	= (vmi_get_desc_t)nullop_panic,
246 	.vmsetdesc	= (vmi_set_desc_t)nullop_panic,
247 	.vmgetcap	= (vmi_get_cap_t)nullop_panic,
248 	.vmsetcap	= (vmi_set_cap_t)nullop_panic,
249 	.vlapic_init	= (vmi_vlapic_init)nullop_panic,
250 	.vlapic_cleanup	= (vmi_vlapic_cleanup)nullop_panic,
251 	.vmpause	= (vmi_pause_t)nullop_panic,
252 	.vmsavectx	= (vmi_savectx)nullop_panic,
253 	.vmrestorectx	= (vmi_restorectx)nullop_panic,
254 	.vmgetmsr	= (vmi_get_msr_t)nullop_panic,
255 	.vmsetmsr	= (vmi_set_msr_t)nullop_panic,
256 };
257 
258 static struct vmm_ops *ops = &vmm_ops_null;
259 static vmm_pte_ops_t *pte_ops = NULL;
260 
261 #define	VMM_INIT()			((*ops->init)())
262 #define	VMM_CLEANUP()			((*ops->cleanup)())
263 #define	VMM_RESUME()			((*ops->resume)())
264 
265 #define	VMINIT(vm)		((*ops->vminit)(vm))
266 #define	VMRUN(vmi, vcpu, rip)	((*ops->vmrun)(vmi, vcpu, rip))
267 #define	VMCLEANUP(vmi)			((*ops->vmcleanup)(vmi))
268 
269 #define	VMGETREG(vmi, vcpu, num, rv)	((*ops->vmgetreg)(vmi, vcpu, num, rv))
270 #define	VMSETREG(vmi, vcpu, num, val)	((*ops->vmsetreg)(vmi, vcpu, num, val))
271 #define	VMGETDESC(vmi, vcpu, num, dsc)	((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
272 #define	VMSETDESC(vmi, vcpu, num, dsc)	((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
273 #define	VMGETCAP(vmi, vcpu, num, rv)	((*ops->vmgetcap)(vmi, vcpu, num, rv))
274 #define	VMSETCAP(vmi, vcpu, num, val)	((*ops->vmsetcap)(vmi, vcpu, num, val))
275 #define	VLAPIC_INIT(vmi, vcpu)		((*ops->vlapic_init)(vmi, vcpu))
276 #define	VLAPIC_CLEANUP(vmi, vlapic)	((*ops->vlapic_cleanup)(vmi, vlapic))
277 
278 #define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
279 #define	fpu_stop_emulating()	clts()
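/*
 * "Emulating" here refers to CR0.TS: with TS set, the next FPU instruction
 * executed by the host raises #NM, catching any stray host FPU use while
 * guest FPU state is resident in the hardware (see restore_guest_fpustate()
 * and save_guest_fpustate() below).
 */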
280 
281 SDT_PROVIDER_DEFINE(vmm);
282 
283 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
284     NULL);
285 
286 /*
287  * Halt the guest if all vcpus are executing a HLT instruction with
288  * interrupts disabled.
289  */
290 int halt_detection_enabled = 1;
291 
292 /* Trap into hypervisor on all guest exceptions and reflect them back */
293 int trace_guest_exceptions;
294 
295 /* Trap WBINVD and ignore it */
296 int trap_wbinvd = 1;
297 
298 static void vm_free_memmap(struct vm *vm, int ident);
299 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
300 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
301 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
302 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
303 
304 static void vmm_savectx(void *);
305 static void vmm_restorectx(void *);
306 static const struct ctxop_template vmm_ctxop_tpl = {
307 	.ct_rev		= CTXOP_TPL_REV,
308 	.ct_save	= vmm_savectx,
309 	.ct_restore	= vmm_restorectx,
310 };
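/*
 * The template above hooks thread context switches: vmm_savectx() runs when a
 * vCPU thread is switched off of a host CPU and vmm_restorectx() when it is
 * switched back on, so per-thread guest state (such as the lazily restored
 * FPU) follows the thread across preemption.
 */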
311 
312 #ifdef KTR
313 static const char *
314 vcpu_state2str(enum vcpu_state state)
315 {
316 
317 	switch (state) {
318 	case VCPU_IDLE:
319 		return ("idle");
320 	case VCPU_FROZEN:
321 		return ("frozen");
322 	case VCPU_RUNNING:
323 		return ("running");
324 	case VCPU_SLEEPING:
325 		return ("sleeping");
326 	default:
327 		return ("unknown");
328 	}
329 }
330 #endif
331 
332 static void
333 vcpu_cleanup(struct vm *vm, int i, bool destroy)
334 {
335 	struct vcpu *vcpu = &vm->vcpu[i];
336 
337 	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
338 	if (destroy) {
339 		vmm_stat_free(vcpu->stats);
340 
341 		vcpu_cpuid_cleanup(&vcpu->cpuid_cfg);
342 
343 		hma_fpu_free(vcpu->guestfpu);
344 		vcpu->guestfpu = NULL;
345 
346 		vie_free(vcpu->vie_ctx);
347 		vcpu->vie_ctx = NULL;
348 
349 		vmc_destroy(vcpu->vmclient);
350 		vcpu->vmclient = NULL;
351 
352 		ctxop_free(vcpu->ctxop);
353 		mutex_destroy(&vcpu->lock);
354 	}
355 }
356 
357 static void
358 vcpu_init(struct vm *vm, int vcpu_id, bool create)
359 {
360 	struct vcpu *vcpu;
361 
362 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
363 	    ("vcpu_init: invalid vcpu %d", vcpu_id));
364 
365 	vcpu = &vm->vcpu[vcpu_id];
366 
367 	if (create) {
368 		mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL);
369 
370 		vcpu->state = VCPU_IDLE;
371 		vcpu->hostcpu = NOCPU;
372 		vcpu->lastloccpu = NOCPU;
373 		vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP);
374 		vcpu->stats = vmm_stat_alloc();
375 		vcpu->vie_ctx = vie_alloc();
376 		vcpu_cpuid_init(&vcpu->cpuid_cfg);
377 
378 		vcpu->ustate = VU_INIT;
379 		vcpu->ustate_when = gethrtime();
380 
381 		vcpu->vtc.vtc_vm = vm;
382 		vcpu->vtc.vtc_vcpuid = vcpu_id;
383 		vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc);
384 	} else {
385 		vie_reset(vcpu->vie_ctx);
386 		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
387 		if (vcpu->ustate != VU_INIT) {
388 			vcpu_ustate_change(vm, vcpu_id, VU_INIT);
389 		}
390 		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
391 	}
392 
393 	vcpu->run_state = VRS_HALT;
394 	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
395 	(void) vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
396 	vcpu->reqidle = 0;
397 	vcpu->exit_intinfo = 0;
398 	vcpu->nmi_pending = false;
399 	vcpu->extint_pending = false;
400 	vcpu->exc_pending = 0;
401 	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
402 	(void) hma_fpu_init(vcpu->guestfpu);
403 	vmm_stat_init(vcpu->stats);
404 	vcpu->tsc_offset = 0;
405 }
406 
407 int
408 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
409 {
410 	return (trace_guest_exceptions);
411 }
412 
413 int
414 vcpu_trap_wbinvd(struct vm *vm, int vcpuid)
415 {
416 	return (trap_wbinvd);
417 }
418 
419 struct vm_exit *
420 vm_exitinfo(struct vm *vm, int cpuid)
421 {
422 	struct vcpu *vcpu;
423 
424 	if (cpuid < 0 || cpuid >= vm->maxcpus)
425 		panic("vm_exitinfo: invalid cpuid %d", cpuid);
426 
427 	vcpu = &vm->vcpu[cpuid];
428 
429 	return (&vcpu->exitinfo);
430 }
431 
432 struct vie *
433 vm_vie_ctx(struct vm *vm, int cpuid)
434 {
435 	if (cpuid < 0 || cpuid >= vm->maxcpus)
436 		panic("vm_vie_ctx: invalid cpuid %d", cpuid);
437 
438 	return (vm->vcpu[cpuid].vie_ctx);
439 }
440 
441 static int
442 vmm_init(void)
443 {
444 	vmm_host_state_init();
445 
446 	if (vmm_is_intel()) {
447 		ops = &vmm_ops_intel;
448 		pte_ops = &ept_pte_ops;
449 	} else if (vmm_is_svm()) {
450 		ops = &vmm_ops_amd;
451 		pte_ops = &rvi_pte_ops;
452 	} else {
453 		return (ENXIO);
454 	}
455 
456 	return (VMM_INIT());
457 }
458 
459 int
460 vmm_mod_load()
461 {
462 	int	error;
463 
464 	VERIFY(vmm_initialized == 0);
465 
466 	error = vmm_init();
467 	if (error == 0)
468 		vmm_initialized = 1;
469 
470 	return (error);
471 }
472 
473 int
474 vmm_mod_unload()
475 {
476 	int	error;
477 
478 	VERIFY(vmm_initialized == 1);
479 
480 	error = VMM_CLEANUP();
481 	if (error)
482 		return (error);
483 	vmm_initialized = 0;
484 
485 	return (0);
486 }
487 
488 /*
489  * Create a test IOMMU domain to see if the host system has the necessary
490  * hardware and drivers to do so.
491  */
492 bool
493 vmm_check_iommu(void)
494 {
495 	void *domain;
496 	const size_t arb_test_sz = (1UL << 32);
497 
498 	domain = iommu_create_domain(arb_test_sz);
499 	if (domain == NULL) {
500 		return (false);
501 	}
502 	iommu_destroy_domain(domain);
503 	return (true);
504 }
505 
506 static void
507 vm_init(struct vm *vm, bool create)
508 {
509 	int i;
510 
511 	vm->cookie = VMINIT(vm);
512 	vm->iommu = NULL;
513 	vm->vioapic = vioapic_init(vm);
514 	vm->vhpet = vhpet_init(vm);
515 	vm->vatpic = vatpic_init(vm);
516 	vm->vatpit = vatpit_init(vm);
517 	vm->vpmtmr = vpmtmr_init(vm);
518 	if (create)
519 		vm->vrtc = vrtc_init(vm);
520 
521 	vm_inout_init(vm, &vm->ioports);
522 
523 	CPU_ZERO(&vm->active_cpus);
524 	CPU_ZERO(&vm->debug_cpus);
525 
526 	vm->suspend = 0;
527 	CPU_ZERO(&vm->suspended_cpus);
528 
529 	for (i = 0; i < vm->maxcpus; i++)
530 		vcpu_init(vm, i, create);
531 
532 	/*
533 	 * Configure the VM-wide TSC offset so that the call to vm_init()
534 	 * represents the boot time (when the TSC(s) read 0).  Each vCPU will
535 	 * have its own offset from this, which is altered if/when the guest
536 	 * writes to MSR_TSC.
537 	 *
538 	 * The TSC offsetting math is all unsigned, using overflow for negative
539 	 * offsets.  A reading of the TSC is negated to form the boot offset.
540 	 */
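	/*
	 * For example: a boot-time TSC reading of 5000 yields an offset of
	 * (2^64 - 5000), and adding that offset to a later host TSC reading
	 * of 8000 wraps around to 3000, the elapsed (guest-visible) count.
	 */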
541 	const uint64_t boot_tsc = rdtsc_offset();
542 	vm->boot_tsc_offset = (uint64_t)(-(int64_t)boot_tsc);
543 
544 	/* Convert the boot TSC reading to hrtime */
545 	vm->boot_hrtime = (hrtime_t)boot_tsc;
546 	scalehrtime(&vm->boot_hrtime);
547 }
548 
549 /*
550  * The default CPU topology is a single thread per package.
551  */
552 uint_t cores_per_package = 1;
553 uint_t threads_per_core = 1;
554 
555 int
556 vm_create(uint64_t flags, struct vm **retvm)
557 {
558 	struct vm *vm;
559 	struct vmspace *vmspace;
560 
561 	/*
562 	 * If vmm.ko could not be successfully initialized then don't attempt
563 	 * to create the virtual machine.
564 	 */
565 	if (!vmm_initialized)
566 		return (ENXIO);
567 
568 	bool track_dirty = (flags & VCF_TRACK_DIRTY) != 0;
569 	if (track_dirty && !pte_ops->vpeo_hw_ad_supported())
570 		return (ENOTSUP);
571 
572 	vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, track_dirty);
573 	if (vmspace == NULL)
574 		return (ENOMEM);
575 
576 	vm = kmem_zalloc(sizeof (struct vm), KM_SLEEP);
577 
578 	vm->vmspace = vmspace;
579 	vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0;
580 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
581 		vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace);
582 	}
583 
584 	vm->sockets = 1;
585 	vm->cores = cores_per_package;	/* XXX backwards compatibility */
586 	vm->threads = threads_per_core;	/* XXX backwards compatibility */
587 	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
588 
589 	vm_init(vm, true);
590 
591 	*retvm = vm;
592 	return (0);
593 }
594 
595 void
596 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
597     uint16_t *threads, uint16_t *maxcpus)
598 {
599 	*sockets = vm->sockets;
600 	*cores = vm->cores;
601 	*threads = vm->threads;
602 	*maxcpus = vm->maxcpus;
603 }
604 
605 uint16_t
606 vm_get_maxcpus(struct vm *vm)
607 {
608 	return (vm->maxcpus);
609 }
610 
611 int
612 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
613     uint16_t threads, uint16_t maxcpus)
614 {
615 	if (maxcpus != 0)
616 		return (EINVAL);	/* XXX remove when supported */
617 	if ((sockets * cores * threads) > vm->maxcpus)
618 		return (EINVAL);
619 	/* XXX need to check sockets * cores * threads == vCPU, how? */
620 	vm->sockets = sockets;
621 	vm->cores = cores;
622 	vm->threads = threads;
623 	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
624 	return (0);
625 }
626 
627 static void
628 vm_cleanup(struct vm *vm, bool destroy)
629 {
630 	struct mem_map *mm;
631 	int i;
632 
633 	ppt_unassign_all(vm);
634 
635 	if (vm->iommu != NULL)
636 		iommu_destroy_domain(vm->iommu);
637 
638 	/*
639 	 * Devices which attach their own ioport hooks should be cleaned up
640 	 * first so they can tear down those registrations.
641 	 */
642 	vpmtmr_cleanup(vm->vpmtmr);
643 
644 	vm_inout_cleanup(vm, &vm->ioports);
645 
646 	if (destroy)
647 		vrtc_cleanup(vm->vrtc);
648 	else
649 		vrtc_reset(vm->vrtc);
650 
651 	vatpit_cleanup(vm->vatpit);
652 	vhpet_cleanup(vm->vhpet);
653 	vatpic_cleanup(vm->vatpic);
654 	vioapic_cleanup(vm->vioapic);
655 
656 	for (i = 0; i < vm->maxcpus; i++)
657 		vcpu_cleanup(vm, i, destroy);
658 
659 	VMCLEANUP(vm->cookie);
660 
661 	/*
662 	 * System memory is removed from the guest address space only when
663 	 * the VM is destroyed. This is because the mapping remains the same
664 	 * across VM reset.
665 	 *
666 	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
667 	 * so those mappings are removed on a VM reset.
668 	 */
669 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
670 		mm = &vm->mem_maps[i];
671 		if (destroy || !sysmem_mapping(vm, mm)) {
672 			vm_free_memmap(vm, i);
673 		} else {
674 			/*
675 			 * We need to reset the IOMMU flag so this mapping can
676 			 * be reused when a VM is rebooted. Since the IOMMU
677 			 * domain has already been destroyed we can just reset
678 			 * the flag here.
679 			 */
680 			mm->flags &= ~VM_MEMMAP_F_IOMMU;
681 		}
682 	}
683 
684 	if (destroy) {
685 		for (i = 0; i < VM_MAX_MEMSEGS; i++)
686 			vm_free_memseg(vm, i);
687 
688 		vmspace_destroy(vm->vmspace);
689 		vm->vmspace = NULL;
690 	}
691 }
692 
693 void
694 vm_destroy(struct vm *vm)
695 {
696 	vm_cleanup(vm, true);
697 	kmem_free(vm, sizeof (*vm));
698 }
699 
700 int
701 vm_reinit(struct vm *vm, uint64_t flags)
702 {
703 	/* A virtual machine can be reset only if all vcpus are suspended. */
704 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) {
705 		if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) {
706 			return (EBUSY);
707 		}
708 
709 		/*
710 		 * Force the VM (and all its vCPUs) into a suspended state.
711 		 * This should be quick and easy, since the vm_reinit() call is
712 		 * made while holding the VM write lock, which requires holding
713 		 * all of the vCPUs in the VCPU_FROZEN state.
714 		 */
715 		(void) atomic_cmpset_int((uint_t *)&vm->suspend, 0,
716 		    VM_SUSPEND_RESET);
717 		for (uint_t i = 0; i < vm->maxcpus; i++) {
718 			struct vcpu *vcpu = &vm->vcpu[i];
719 
720 			if (CPU_ISSET(i, &vm->suspended_cpus) ||
721 			    !CPU_ISSET(i, &vm->active_cpus)) {
722 				continue;
723 			}
724 
725 			vcpu_lock(vcpu);
726 			VERIFY3U(vcpu->state, ==, VCPU_FROZEN);
727 			CPU_SET_ATOMIC(i, &vm->suspended_cpus);
728 			vcpu_unlock(vcpu);
729 		}
730 
731 		VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus));
732 	}
733 
734 	vm_cleanup(vm, false);
735 	vm_init(vm, false);
736 	return (0);
737 }
738 
739 bool
740 vm_is_paused(struct vm *vm)
741 {
742 	return (vm->is_paused);
743 }
744 
745 int
746 vm_pause_instance(struct vm *vm)
747 {
748 	if (vm->is_paused) {
749 		return (EALREADY);
750 	}
751 	vm->is_paused = true;
752 
753 	for (uint_t i = 0; i < vm->maxcpus; i++) {
754 		struct vcpu *vcpu = &vm->vcpu[i];
755 
756 		if (!CPU_ISSET(i, &vm->active_cpus)) {
757 			continue;
758 		}
759 		vlapic_pause(vcpu->vlapic);
760 
761 		/*
762 		 * vCPU-specific pause logic includes stashing any
763 		 * to-be-injected events in exit_intinfo where they can be
764 		 * accessed in a manner generic to the backend.
765 		 */
766 		ops->vmpause(vm->cookie, i);
767 	}
768 	vhpet_pause(vm->vhpet);
769 	vatpit_pause(vm->vatpit);
770 	vrtc_pause(vm->vrtc);
771 
772 	return (0);
773 }
774 
775 int
776 vm_resume_instance(struct vm *vm)
777 {
778 	if (!vm->is_paused) {
779 		return (EALREADY);
780 	}
781 	vm->is_paused = false;
782 
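	/*
	 * Resume in the reverse order of vm_pause_instance(): the timer
	 * devices are brought back first, then the per-vCPU local APICs.
	 */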
783 	vrtc_resume(vm->vrtc);
784 	vatpit_resume(vm->vatpit);
785 	vhpet_resume(vm->vhpet);
786 	for (uint_t i = 0; i < vm->maxcpus; i++) {
787 		struct vcpu *vcpu = &vm->vcpu[i];
788 
789 		if (!CPU_ISSET(i, &vm->active_cpus)) {
790 			continue;
791 		}
792 		vlapic_resume(vcpu->vlapic);
793 	}
794 
795 	return (0);
796 }
797 
798 int
799 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
800 {
801 	vm_object_t *obj;
802 
803 	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
804 		return (ENOMEM);
805 	else
806 		return (0);
807 }
808 
809 int
810 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
811 {
812 	return (vmspace_unmap(vm->vmspace, gpa, gpa + len));
813 }
814 
815 /*
816  * Return 'true' if 'gpa' is allocated in the guest address space.
817  *
818  * This function is called in the context of a running vcpu which acts as
819  * an implicit lock on 'vm->mem_maps[]'.
820  */
821 bool
822 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
823 {
824 	struct mem_map *mm;
825 	int i;
826 
827 #ifdef INVARIANTS
828 	int hostcpu, state;
829 	state = vcpu_get_state(vm, vcpuid, &hostcpu);
830 	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
831 	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
832 #endif
833 
834 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
835 		mm = &vm->mem_maps[i];
836 		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
837 			return (true);		/* 'gpa' is sysmem or devmem */
838 	}
839 
840 	if (ppt_is_mmio(vm, gpa))
841 		return (true);			/* 'gpa' is pci passthru mmio */
842 
843 	return (false);
844 }
845 
846 int
847 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
848 {
849 	struct mem_seg *seg;
850 	vm_object_t *obj;
851 
852 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
853 		return (EINVAL);
854 
855 	if (len == 0 || (len & PAGE_MASK))
856 		return (EINVAL);
857 
858 	seg = &vm->mem_segs[ident];
859 	if (seg->object != NULL) {
860 		if (seg->len == len && seg->sysmem == sysmem)
861 			return (EEXIST);
862 		else
863 			return (EINVAL);
864 	}
865 
866 	obj = vm_object_mem_allocate(len, vm->mem_transient);
867 	if (obj == NULL)
868 		return (ENOMEM);
869 
870 	seg->len = len;
871 	seg->object = obj;
872 	seg->sysmem = sysmem;
873 	return (0);
874 }
875 
876 int
877 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
878     vm_object_t **objptr)
879 {
880 	struct mem_seg *seg;
881 
882 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
883 		return (EINVAL);
884 
885 	seg = &vm->mem_segs[ident];
886 	if (len)
887 		*len = seg->len;
888 	if (sysmem)
889 		*sysmem = seg->sysmem;
890 	if (objptr)
891 		*objptr = seg->object;
892 	return (0);
893 }
894 
895 void
896 vm_free_memseg(struct vm *vm, int ident)
897 {
898 	struct mem_seg *seg;
899 
900 	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
901 	    ("%s: invalid memseg ident %d", __func__, ident));
902 
903 	seg = &vm->mem_segs[ident];
904 	if (seg->object != NULL) {
905 		vm_object_release(seg->object);
906 		bzero(seg, sizeof (struct mem_seg));
907 	}
908 }
909 
910 int
911 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
912     size_t len, int prot, int flags)
913 {
914 	struct mem_seg *seg;
915 	struct mem_map *m, *map;
916 	vm_ooffset_t last;
917 	int i, error;
918 
919 	if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
920 		return (EINVAL);
921 
922 	if (flags & ~VM_MEMMAP_F_WIRED)
923 		return (EINVAL);
924 
925 	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
926 		return (EINVAL);
927 
928 	seg = &vm->mem_segs[segid];
929 	if (seg->object == NULL)
930 		return (EINVAL);
931 
932 	last = first + len;
933 	if (first < 0 || first >= last || last > seg->len)
934 		return (EINVAL);
935 
936 	if ((gpa | first | last) & PAGE_MASK)
937 		return (EINVAL);
938 
939 	map = NULL;
940 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
941 		m = &vm->mem_maps[i];
942 		if (m->len == 0) {
943 			map = m;
944 			break;
945 		}
946 	}
947 
948 	if (map == NULL)
949 		return (ENOSPC);
950 
951 	error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot);
952 	if (error != 0)
953 		return (EFAULT);
954 
955 	vm_object_reference(seg->object);
956 
957 	if ((flags & VM_MEMMAP_F_WIRED) != 0) {
958 		error = vmspace_populate(vm->vmspace, gpa, gpa + len);
959 		if (error != 0) {
960 			VERIFY0(vmspace_unmap(vm->vmspace, gpa, gpa + len));
961 			return (EFAULT);
962 		}
963 	}
964 
965 	map->gpa = gpa;
966 	map->len = len;
967 	map->segoff = first;
968 	map->segid = segid;
969 	map->prot = prot;
970 	map->flags = flags;
971 	return (0);
972 }
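/*
 * Illustrative (uncompiled) pairing of the interfaces above: a caller backing
 * guest RAM with segment 0 and mapping it at GPA 0 might do
 *
 *	error = vm_alloc_memseg(vm, 0, len, true);
 *	if (error == 0)
 *		error = vm_mmap_memseg(vm, 0, 0, 0, len, PROT_ALL,
 *		    VM_MEMMAP_F_WIRED);
 *
 * with VM_MEMMAP_F_WIRED requesting that the backing pages be populated up
 * front.
 */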
973 
974 int
975 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
976 {
977 	struct mem_map *m;
978 	int i;
979 
980 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
981 		m = &vm->mem_maps[i];
982 		if (m->gpa == gpa && m->len == len &&
983 		    (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
984 			vm_free_memmap(vm, i);
985 			return (0);
986 		}
987 	}
988 
989 	return (EINVAL);
990 }
991 
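/*
 * Return the lowest mapping whose GPA is at or above *gpa.  Callers can
 * enumerate all mappings by starting with *gpa = 0 and re-invoking with *gpa
 * advanced past each mapping returned (e.g. to gpa + len).
 */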
992 int
993 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
994     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
995 {
996 	struct mem_map *mm, *mmnext;
997 	int i;
998 
999 	mmnext = NULL;
1000 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1001 		mm = &vm->mem_maps[i];
1002 		if (mm->len == 0 || mm->gpa < *gpa)
1003 			continue;
1004 		if (mmnext == NULL || mm->gpa < mmnext->gpa)
1005 			mmnext = mm;
1006 	}
1007 
1008 	if (mmnext != NULL) {
1009 		*gpa = mmnext->gpa;
1010 		if (segid)
1011 			*segid = mmnext->segid;
1012 		if (segoff)
1013 			*segoff = mmnext->segoff;
1014 		if (len)
1015 			*len = mmnext->len;
1016 		if (prot)
1017 			*prot = mmnext->prot;
1018 		if (flags)
1019 			*flags = mmnext->flags;
1020 		return (0);
1021 	} else {
1022 		return (ENOENT);
1023 	}
1024 }
1025 
1026 static void
1027 vm_free_memmap(struct vm *vm, int ident)
1028 {
1029 	struct mem_map *mm;
1030 	int error;
1031 
1032 	mm = &vm->mem_maps[ident];
1033 	if (mm->len) {
1034 		error = vmspace_unmap(vm->vmspace, mm->gpa,
1035 		    mm->gpa + mm->len);
1036 		KASSERT(error == 0, ("%s: vmspace_unmap error %d",
1037 		    __func__, error));
1038 		bzero(mm, sizeof (struct mem_map));
1039 	}
1040 }
1041 
1042 static __inline bool
1043 sysmem_mapping(struct vm *vm, struct mem_map *mm)
1044 {
1045 
1046 	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
1047 		return (true);
1048 	else
1049 		return (false);
1050 }
1051 
1052 vm_paddr_t
1053 vmm_sysmem_maxaddr(struct vm *vm)
1054 {
1055 	struct mem_map *mm;
1056 	vm_paddr_t maxaddr;
1057 	int i;
1058 
1059 	maxaddr = 0;
1060 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1061 		mm = &vm->mem_maps[i];
1062 		if (sysmem_mapping(vm, mm)) {
1063 			if (maxaddr < mm->gpa + mm->len)
1064 				maxaddr = mm->gpa + mm->len;
1065 		}
1066 	}
1067 	return (maxaddr);
1068 }
1069 
1070 static void
1071 vm_iommu_modify(struct vm *vm, bool map)
1072 {
1073 	int i, sz;
1074 	vm_paddr_t gpa, hpa;
1075 	struct mem_map *mm;
1076 	vm_client_t *vmc;
1077 
1078 	sz = PAGE_SIZE;
1079 	vmc = vmspace_client_alloc(vm->vmspace);
1080 
1081 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1082 		mm = &vm->mem_maps[i];
1083 		if (!sysmem_mapping(vm, mm))
1084 			continue;
1085 
1086 		if (map) {
1087 			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
1088 			    ("iommu map found invalid memmap %lx/%lx/%x",
1089 			    mm->gpa, mm->len, mm->flags));
1090 			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
1091 				continue;
1092 			mm->flags |= VM_MEMMAP_F_IOMMU;
1093 		} else {
1094 			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
1095 				continue;
1096 			mm->flags &= ~VM_MEMMAP_F_IOMMU;
1097 			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
1098 			    ("iommu unmap found invalid memmap %lx/%lx/%x",
1099 			    mm->gpa, mm->len, mm->flags));
1100 		}
1101 
1102 		gpa = mm->gpa;
1103 		while (gpa < mm->gpa + mm->len) {
1104 			vm_page_t *vmp;
1105 
1106 			vmp = vmc_hold(vmc, gpa, PROT_WRITE);
1107 			ASSERT(vmp != NULL);
1108 			hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT);
1109 			(void) vmp_release(vmp);
1110 
1111 			/*
1112 			 * When originally ported from FreeBSD, the logic for
1113 			 * adding memory to the guest domain would
1114 			 * simultaneously remove it from the host domain.  The
1115 			 * justification for that is not clear, and FreeBSD has
1116 			 * subsequently changed the behavior to not remove the
1117 			 * memory from the host domain.
1118 			 *
1119 			 * Leaving the guest memory in the host domain for the
1120 			 * life of the VM is necessary to make it available for
1121 			 * DMA, such as through viona in the TX path.
1122 			 */
1123 			if (map) {
1124 				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
1125 			} else {
1126 				iommu_remove_mapping(vm->iommu, gpa, sz);
1127 			}
1128 
1129 			gpa += PAGE_SIZE;
1130 		}
1131 	}
1132 	vmc_destroy(vmc);
1133 
1134 	/*
1135 	 * Invalidate the cached translations associated with the domain
1136 	 * from which pages were removed.
1137 	 */
1138 	iommu_invalidate_tlb(vm->iommu);
1139 }
1140 
1141 int
1142 vm_unassign_pptdev(struct vm *vm, int pptfd)
1143 {
1144 	int error;
1145 
1146 	error = ppt_unassign_device(vm, pptfd);
1147 	if (error)
1148 		return (error);
1149 
1150 	if (ppt_assigned_devices(vm) == 0)
1151 		vm_iommu_modify(vm, false);
1152 
1153 	return (0);
1154 }
1155 
1156 int
1157 vm_assign_pptdev(struct vm *vm, int pptfd)
1158 {
1159 	int error;
1160 	vm_paddr_t maxaddr;
1161 
1162 	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1163 	if (ppt_assigned_devices(vm) == 0) {
1164 		KASSERT(vm->iommu == NULL,
1165 		    ("vm_assign_pptdev: iommu must be NULL"));
1166 		maxaddr = vmm_sysmem_maxaddr(vm);
1167 		vm->iommu = iommu_create_domain(maxaddr);
1168 		if (vm->iommu == NULL)
1169 			return (ENXIO);
1170 		vm_iommu_modify(vm, true);
1171 	}
1172 
1173 	error = ppt_assign_device(vm, pptfd);
1174 	return (error);
1175 }
1176 
1177 int
1178 vm_get_register(struct vm *vm, int vcpuid, int reg, uint64_t *retval)
1179 {
1180 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1181 		return (EINVAL);
1182 
1183 	if (reg >= VM_REG_LAST)
1184 		return (EINVAL);
1185 
1186 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1187 	switch (reg) {
1188 	case VM_REG_GUEST_XCR0:
1189 		*retval = vcpu->guest_xcr0;
1190 		return (0);
1191 	default:
1192 		return (VMGETREG(vm->cookie, vcpuid, reg, retval));
1193 	}
1194 }
1195 
1196 int
1197 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1198 {
1199 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1200 		return (EINVAL);
1201 
1202 	if (reg >= VM_REG_LAST)
1203 		return (EINVAL);
1204 
1205 	int error;
1206 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1207 	switch (reg) {
1208 	case VM_REG_GUEST_RIP:
1209 		error = VMSETREG(vm->cookie, vcpuid, reg, val);
1210 		if (error == 0) {
1211 			vcpu->nextrip = val;
1212 		}
1213 		return (error);
1214 	case VM_REG_GUEST_XCR0:
1215 		if (!validate_guest_xcr0(val, vmm_get_host_xcr0())) {
1216 			return (EINVAL);
1217 		}
1218 		vcpu->guest_xcr0 = val;
1219 		return (0);
1220 	default:
1221 		return (VMSETREG(vm->cookie, vcpuid, reg, val));
1222 	}
1223 }
1224 
1225 static bool
1226 is_descriptor_table(int reg)
1227 {
1228 	switch (reg) {
1229 	case VM_REG_GUEST_IDTR:
1230 	case VM_REG_GUEST_GDTR:
1231 		return (true);
1232 	default:
1233 		return (false);
1234 	}
1235 }
1236 
1237 static bool
1238 is_segment_register(int reg)
1239 {
1240 	switch (reg) {
1241 	case VM_REG_GUEST_ES:
1242 	case VM_REG_GUEST_CS:
1243 	case VM_REG_GUEST_SS:
1244 	case VM_REG_GUEST_DS:
1245 	case VM_REG_GUEST_FS:
1246 	case VM_REG_GUEST_GS:
1247 	case VM_REG_GUEST_TR:
1248 	case VM_REG_GUEST_LDTR:
1249 		return (true);
1250 	default:
1251 		return (false);
1252 	}
1253 }
1254 
1255 int
1256 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1257 {
1258 
1259 	if (vcpu < 0 || vcpu >= vm->maxcpus)
1260 		return (EINVAL);
1261 
1262 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
1263 		return (EINVAL);
1264 
1265 	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1266 }
1267 
1268 int
1269 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1270 {
1271 	if (vcpu < 0 || vcpu >= vm->maxcpus)
1272 		return (EINVAL);
1273 
1274 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
1275 		return (EINVAL);
1276 
1277 	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1278 }
1279 
1280 static int
1281 translate_hma_xsave_result(hma_fpu_xsave_result_t res)
1282 {
1283 	switch (res) {
1284 	case HFXR_OK:
1285 		return (0);
1286 	case HFXR_NO_SPACE:
1287 		return (ENOSPC);
1288 	case HFXR_BAD_ALIGN:
1289 	case HFXR_UNSUP_FMT:
1290 	case HFXR_UNSUP_FEAT:
1291 	case HFXR_INVALID_DATA:
1292 		return (EINVAL);
1293 	default:
1294 		panic("unexpected xsave result");
1295 	}
1296 }
1297 
1298 int
1299 vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
1300 {
1301 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1302 		return (EINVAL);
1303 
1304 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1305 	hma_fpu_xsave_result_t res;
1306 
1307 	res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len);
1308 	return (translate_hma_xsave_result(res));
1309 }
1310 
1311 int
1312 vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
1313 {
1314 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1315 		return (EINVAL);
1316 
1317 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1318 	hma_fpu_xsave_result_t res;
1319 
1320 	res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len);
1321 	return (translate_hma_xsave_result(res));
1322 }
1323 
1324 int
1325 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1326 {
1327 	struct vcpu *vcpu;
1328 
1329 	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1330 		return (EINVAL);
1331 	}
1332 
1333 	vcpu = &vm->vcpu[vcpuid];
1334 
1335 	vcpu_lock(vcpu);
1336 	*state = vcpu->run_state;
1337 	*sipi_vec = vcpu->sipi_vector;
1338 	vcpu_unlock(vcpu);
1339 
1340 	return (0);
1341 }
1342 
1343 int
1344 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1345 {
1346 	struct vcpu *vcpu;
1347 
1348 	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1349 		return (EINVAL);
1350 	}
1351 	if (!VRS_IS_VALID(state)) {
1352 		return (EINVAL);
1353 	}
1354 
1355 	vcpu = &vm->vcpu[vcpuid];
1356 
1357 	vcpu_lock(vcpu);
1358 	vcpu->run_state = state;
1359 	vcpu->sipi_vector = sipi_vec;
1360 	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1361 	vcpu_unlock(vcpu);
1362 
1363 	return (0);
1364 }
1365 
1366 int
1367 vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap)
1368 {
1369 	vmspace_t *vms = vm_get_vmspace(vm);
1370 	return (vmspace_track_dirty(vms, gpa, len, bitmap));
1371 }
1372 
1373 static void
1374 restore_guest_fpustate(struct vcpu *vcpu)
1375 {
1376 	/* Save host FPU and restore guest FPU */
1377 	fpu_stop_emulating();
1378 	hma_fpu_start_guest(vcpu->guestfpu);
1379 
1380 	/* restore guest XCR0 if XSAVE is enabled in the host */
1381 	if (rcr4() & CR4_XSAVE)
1382 		load_xcr(0, vcpu->guest_xcr0);
1383 
1384 	/*
1385 	 * The FPU is now "dirty" with the guest's state so turn on emulation
1386 	 * to trap any access to the FPU by the host.
1387 	 */
1388 	fpu_start_emulating();
1389 }
1390 
1391 static void
1392 save_guest_fpustate(struct vcpu *vcpu)
1393 {
1394 
1395 	if ((rcr0() & CR0_TS) == 0)
1396 		panic("fpu emulation not enabled in host!");
1397 
1398 	/* save guest XCR0 and restore host XCR0 */
1399 	if (rcr4() & CR4_XSAVE) {
1400 		vcpu->guest_xcr0 = rxcr(0);
1401 		load_xcr(0, vmm_get_host_xcr0());
1402 	}
1403 
1404 	/* save guest FPU and restore host FPU */
1405 	fpu_stop_emulating();
1406 	hma_fpu_stop_guest(vcpu->guestfpu);
1407 	/*
1408 	 * When the host state has been restored, we should not re-enable
1409 	 * CR0.TS on illumos for eager FPU.
1410 	 */
1411 }
1412 
1413 static int
1414 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1415     bool from_idle)
1416 {
1417 	struct vcpu *vcpu;
1418 	int error;
1419 
1420 	vcpu = &vm->vcpu[vcpuid];
1421 	vcpu_assert_locked(vcpu);
1422 
1423 	/*
1424 	 * State transitions from the vmmdev_ioctl() must always begin from
1425 	 * the VCPU_IDLE state. This guarantees that there is only a single
1426 	 * ioctl() operating on a vcpu at any point.
1427 	 */
1428 	if (from_idle) {
1429 		while (vcpu->state != VCPU_IDLE) {
1430 			vcpu->reqidle = 1;
1431 			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1432 			cv_wait(&vcpu->state_cv, &vcpu->lock);
1433 		}
1434 	} else {
1435 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1436 		    "vcpu idle state"));
1437 	}
1438 
1439 	if (vcpu->state == VCPU_RUNNING) {
1440 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1441 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1442 	} else {
1443 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1444 		    "vcpu that is not running", vcpu->hostcpu));
1445 	}
1446 
1447 	/*
1448 	 * The following state transitions are allowed:
1449 	 * IDLE -> FROZEN -> IDLE
1450 	 * FROZEN -> RUNNING -> FROZEN
1451 	 * FROZEN -> SLEEPING -> FROZEN
1452 	 */
1453 	switch (vcpu->state) {
1454 	case VCPU_IDLE:
1455 	case VCPU_RUNNING:
1456 	case VCPU_SLEEPING:
1457 		error = (newstate != VCPU_FROZEN);
1458 		break;
1459 	case VCPU_FROZEN:
1460 		error = (newstate == VCPU_FROZEN);
1461 		break;
1462 	default:
1463 		error = 1;
1464 		break;
1465 	}
1466 
1467 	if (error)
1468 		return (EBUSY);
1469 
1470 	vcpu->state = newstate;
1471 	if (newstate == VCPU_RUNNING)
1472 		vcpu->hostcpu = curcpu;
1473 	else
1474 		vcpu->hostcpu = NOCPU;
1475 
1476 	if (newstate == VCPU_IDLE) {
1477 		cv_broadcast(&vcpu->state_cv);
1478 	}
1479 
1480 	return (0);
1481 }
1482 
1483 static void
1484 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1485 {
1486 	int error;
1487 
1488 	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1489 		panic("Error %d setting state to %d\n", error, newstate);
1490 }
1491 
1492 static void
1493 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1494 {
1495 	int error;
1496 
1497 	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1498 		panic("Error %d setting state to %d", error, newstate);
1499 }
1500 
1501 /*
1502  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1503  */
1504 static int
1505 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1506 {
1507 	struct vcpu *vcpu;
1508 	int vcpu_halted, vm_halted;
1509 	bool userspace_exit = false;
1510 
1511 	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1512 
1513 	vcpu = &vm->vcpu[vcpuid];
1514 	vcpu_halted = 0;
1515 	vm_halted = 0;
1516 
1517 	vcpu_lock(vcpu);
1518 	while (1) {
1519 		/*
1520 		 * Do a final check for pending interrupts (including NMI and
1521 		 * INIT) before putting this thread to sleep.
1522 		 */
1523 		if (vm_nmi_pending(vm, vcpuid))
1524 			break;
1525 		if (vcpu_run_state_pending(vm, vcpuid))
1526 			break;
1527 		if (!intr_disabled) {
1528 			if (vm_extint_pending(vm, vcpuid) ||
1529 			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
1530 				break;
1531 			}
1532 		}
1533 
1534 		/*
1535 		 * Also check for software events which would cause a wake-up.
1536 		 * This will set the appropriate exitcode directly, rather than
1537 		 * requiring a trip through VM_RUN().
1538 		 */
1539 		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1540 			userspace_exit = true;
1541 			break;
1542 		}
1543 
1544 		/*
1545 		 * Some Linux guests implement "halt" by having all vcpus
1546 		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1547 		 * track of the vcpus that have entered this state. When all
1548 		 * vcpus enter the halted state the virtual machine is halted.
1549 		 */
1550 		if (intr_disabled) {
1551 			if (!vcpu_halted && halt_detection_enabled) {
1552 				vcpu_halted = 1;
1553 				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1554 			}
1555 			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1556 				vm_halted = 1;
1557 				break;
1558 			}
1559 		}
1560 
1561 		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1562 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1563 		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
1564 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1565 		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1566 	}
1567 
1568 	if (vcpu_halted)
1569 		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1570 
1571 	vcpu_unlock(vcpu);
1572 
1573 	if (vm_halted) {
1574 		(void) vm_suspend(vm, VM_SUSPEND_HALT);
1575 	}
1576 
1577 	return (userspace_exit ? -1 : 0);
1578 }
1579 
1580 static int
1581 vm_handle_paging(struct vm *vm, int vcpuid)
1582 {
1583 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1584 	vm_client_t *vmc = vcpu->vmclient;
1585 	struct vm_exit *vme = &vcpu->exitinfo;
1586 	const int ftype = vme->u.paging.fault_type;
1587 
1588 	ASSERT0(vme->inst_length);
1589 	ASSERT(ftype == PROT_READ || ftype == PROT_WRITE || ftype == PROT_EXEC);
1590 
1591 	if (vmc_fault(vmc, vme->u.paging.gpa, ftype) != 0) {
1592 		/*
1593 		 * If the fault cannot be serviced, kick it out to userspace for
1594 		 * handling (or more likely, halting the instance).
1595 		 */
1596 		return (-1);
1597 	}
1598 
1599 	return (0);
1600 }
1601 
1602 int
1603 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
1604     int rsize)
1605 {
1606 	int err = ESRCH;
1607 
1608 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1609 		struct vlapic *vlapic = vm_lapic(vm, cpuid);
1610 
1611 		err = vlapic_mmio_read(vlapic, gpa, rval, rsize);
1612 	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1613 		err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1614 	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1615 		err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
1616 	}
1617 
1618 	return (err);
1619 }
1620 
1621 int
1622 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
1623     int wsize)
1624 {
1625 	int err = ESRCH;
1626 
1627 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1628 		struct vlapic *vlapic = vm_lapic(vm, cpuid);
1629 
1630 		err = vlapic_mmio_write(vlapic, gpa, wval, wsize);
1631 	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1632 		err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1633 	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1634 		err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
1635 	}
1636 
1637 	return (err);
1638 }
1639 
1640 static int
1641 vm_handle_mmio_emul(struct vm *vm, int vcpuid)
1642 {
1643 	struct vie *vie;
1644 	struct vcpu *vcpu;
1645 	struct vm_exit *vme;
1646 	uint64_t inst_addr;
1647 	int error, fault, cs_d;
1648 
1649 	vcpu = &vm->vcpu[vcpuid];
1650 	vme = &vcpu->exitinfo;
1651 	vie = vcpu->vie_ctx;
1652 
1653 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1654 	    __func__, vme->inst_length));
1655 
1656 	inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
1657 	cs_d = vme->u.mmio_emul.cs_d;
1658 
1659 	/* Fetch the faulting instruction */
1660 	if (vie_needs_fetch(vie)) {
1661 		error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
1662 		    &fault);
1663 		if (error != 0) {
1664 			return (error);
1665 		} else if (fault) {
1666 			/*
1667 			 * If a fault during instruction fetch was encountered,
1668 			 * it will have asserted that the appropriate exception
1669 			 * be injected at next entry.
1670 			 * No further work is required.
1671 			 */
1672 			return (0);
1673 		}
1674 	}
1675 
1676 	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1677 		/* Dump (unrecognized) instruction bytes in userspace */
1678 		vie_fallback_exitinfo(vie, vme);
1679 		return (-1);
1680 	}
1681 	if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1682 	    vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1683 		/* Decoded GLA does not match GLA from VM exit state */
1684 		vie_fallback_exitinfo(vie, vme);
1685 		return (-1);
1686 	}
1687 
1688 repeat:
1689 	error = vie_emulate_mmio(vie, vm, vcpuid);
1690 	if (error < 0) {
1691 		/*
1692 		 * MMIO not handled by any of the in-kernel-emulated devices, so
1693 		 * make a trip out to userspace for it.
1694 		 */
1695 		vie_exitinfo(vie, vme);
1696 	} else if (error == EAGAIN) {
1697 		/*
1698 		 * Continue emulating the rep-prefixed instruction, which has
1699 		 * not completed its iterations.
1700 		 *
1701 		 * If this can be emulated in-kernel and has a high repetition
1702 		 * count (causing a tight spin), it should still defer to any
1703 		 * pending yield conditions.
1704 		 */
1705 		if (!vcpu_should_yield(vm, vcpuid)) {
1706 			goto repeat;
1707 		} else {
1708 			/*
1709 			 * Defer to the contending load by making a trip to
1710 			 * userspace with a no-op (BOGUS) exit reason.
1711 			 */
1712 			vie_reset(vie);
1713 			vme->exitcode = VM_EXITCODE_BOGUS;
1714 			return (-1);
1715 		}
1716 	} else if (error == 0) {
1717 		/* Update %rip now that instruction has been emulated */
1718 		vie_advance_pc(vie, &vcpu->nextrip);
1719 	}
1720 	return (error);
1721 }
1722 
1723 static int
1724 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1725 {
1726 	struct vcpu *vcpu;
1727 	struct vie *vie;
1728 	int err;
1729 
1730 	vcpu = &vm->vcpu[vcpuid];
1731 	vie = vcpu->vie_ctx;
1732 
1733 repeat:
1734 	err = vie_emulate_inout(vie, vm, vcpuid);
1735 
1736 	if (err < 0) {
1737 		/*
1738 		 * In/out not handled by any of the in-kernel-emulated devices,
1739 		 * so make a trip out to userspace for it.
1740 		 */
1741 		vie_exitinfo(vie, vme);
1742 		return (err);
1743 	} else if (err == EAGAIN) {
1744 		/*
1745 		 * Continue emulating the rep-prefixed ins/outs, which have not
1746 		 * completed their iterations.
1747 		 *
1748 		 * If this can be emulated in-kernel and has a high repetition
1749 		 * count (causing a tight spin), it should still defer to any
1750 		 * pending yield conditions.
1751 		 */
1752 		if (!vcpu_should_yield(vm, vcpuid)) {
1753 			goto repeat;
1754 		} else {
1755 			/*
1756 			 * Defer to the contending load by making a trip to
1757 			 * userspace with a no-op (BOGUS) exit reason.
1758 			 */
1759 			vie_reset(vie);
1760 			vme->exitcode = VM_EXITCODE_BOGUS;
1761 			return (-1);
1762 		}
1763 	} else if (err != 0) {
1764 		/* Emulation failure.  Bail all the way out to userspace. */
1765 		vme->exitcode = VM_EXITCODE_INST_EMUL;
1766 		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1767 		return (-1);
1768 	}
1769 
1770 	vie_advance_pc(vie, &vcpu->nextrip);
1771 	return (0);
1772 }
1773 
1774 static int
1775 vm_handle_inst_emul(struct vm *vm, int vcpuid)
1776 {
1777 	struct vie *vie;
1778 	struct vcpu *vcpu;
1779 	struct vm_exit *vme;
1780 	uint64_t cs_base;
1781 	int error, fault, cs_d;
1782 
1783 	vcpu = &vm->vcpu[vcpuid];
1784 	vme = &vcpu->exitinfo;
1785 	vie = vcpu->vie_ctx;
1786 
1787 	vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);
1788 
1789 	/* Fetch the faulting instruction */
1790 	ASSERT(vie_needs_fetch(vie));
1791 	error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
1792 	    &fault);
1793 	if (error != 0) {
1794 		return (error);
1795 	} else if (fault) {
1796 		/*
1797 		 * If a fault during instruction fetch was encountered, it will
1798 		 * have asserted that the appropriate exception be injected at
1799 		 * next entry.  No further work is required.
1800 		 */
1801 		return (0);
1802 	}
1803 
1804 	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1805 		/* Dump (unrecognized) instruction bytes in userspace */
1806 		vie_fallback_exitinfo(vie, vme);
1807 		return (-1);
1808 	}
1809 
1810 	error = vie_emulate_other(vie, vm, vcpuid);
1811 	if (error != 0) {
1812 		/*
1813 		 * Instruction emulation was unable to complete successfully, so
1814 		 * kick it out to userspace for handling.
1815 		 */
1816 		vie_fallback_exitinfo(vie, vme);
1817 	} else {
1818 		/* Update %rip now that instruction has been emulated */
1819 		vie_advance_pc(vie, &vcpu->nextrip);
1820 	}
1821 	return (error);
1822 }
1823 
1824 static int
1825 vm_handle_suspend(struct vm *vm, int vcpuid)
1826 {
1827 	int i;
1828 	struct vcpu *vcpu;
1829 
1830 	vcpu = &vm->vcpu[vcpuid];
1831 
1832 	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1833 
1834 	/*
1835 	 * Wait until all 'active_cpus' have suspended themselves.
1836 	 */
1837 	vcpu_lock(vcpu);
1838 	vcpu_ustate_change(vm, vcpuid, VU_INIT);
1839 	while (1) {
1840 		int rc;
1841 
1842 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1843 			break;
1844 		}
1845 
1846 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1847 		rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz,
1848 		    TR_CLOCK_TICK);
1849 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1850 
1851 		/*
1852 		 * If the userspace process driving the instance is killed, any
1853 		 * vCPUs yet to be marked suspended (because they are not
1854 		 * VM_RUN-ing in the kernel presently) will never reach that
1855 		 * state.
1856 		 *
1857 		 * To avoid vm_handle_suspend() getting stuck in the kernel
1858 		 * waiting for those vCPUs, offer a bail-out even though it
1859 		 * means returning without all vCPUs in a suspended state.
1860 		 */
1861 		if (rc <= 0) {
1862 			if ((curproc->p_flag & SEXITING) != 0) {
1863 				break;
1864 			}
1865 		}
1866 	}
1867 	vcpu_unlock(vcpu);
1868 
1869 	/*
1870 	 * Wakeup the other sleeping vcpus and return to userspace.
1871 	 */
1872 	for (i = 0; i < vm->maxcpus; i++) {
1873 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1874 			vcpu_notify_event(vm, i);
1875 		}
1876 	}
1877 
1878 	return (-1);
1879 }
1880 
1881 static int
1882 vm_handle_reqidle(struct vm *vm, int vcpuid)
1883 {
1884 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1885 
1886 	vcpu_lock(vcpu);
1887 	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1888 	vcpu->reqidle = 0;
1889 	vcpu_unlock(vcpu);
1890 	return (-1);
1891 }
1892 
1893 static int
1894 vm_handle_run_state(struct vm *vm, int vcpuid)
1895 {
1896 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1897 	bool handled = false;
1898 
1899 	vcpu_lock(vcpu);
1900 	while (1) {
1901 		if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1902 			vcpu_unlock(vcpu);
1903 			VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1904 			vcpu_lock(vcpu);
1905 
1906 			vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1907 			vcpu->run_state |= VRS_INIT;
1908 		}
1909 
1910 		if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1911 		    (VRS_INIT | VRS_PEND_SIPI)) {
1912 			const uint8_t vector = vcpu->sipi_vector;
1913 
1914 			vcpu_unlock(vcpu);
1915 			VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1916 			vcpu_lock(vcpu);
1917 
1918 			vcpu->run_state &= ~VRS_PEND_SIPI;
1919 			vcpu->run_state |= VRS_RUN;
1920 		}
1921 
1922 		/*
1923 		 * If the vCPU is now in the running state, there is no need to
1924 		 * wait for anything prior to re-entry.
1925 		 */
1926 		if ((vcpu->run_state & VRS_RUN) != 0) {
1927 			handled = true;
1928 			break;
1929 		}
1930 
1931 		/*
1932 		 * Also check for software events which would cause a wake-up.
1933 		 * This will set the appropriate exitcode directly, rather than
1934 		 * requiring a trip through VM_RUN().
1935 		 */
1936 		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1937 			break;
1938 		}
1939 
1940 		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1941 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1942 		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
1943 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1944 		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1945 	}
1946 	vcpu_unlock(vcpu);
1947 
1948 	return (handled ? 0 : -1);
1949 }
1950 
1951 static int
1952 vm_rdmtrr(const struct vm_mtrr *mtrr, uint32_t num, uint64_t *val)
1953 {
1954 	switch (num) {
1955 	case MSR_MTRRcap:
1956 		*val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX;
1957 		break;
1958 	case MSR_MTRRdefType:
1959 		*val = mtrr->def_type;
1960 		break;
1961 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1962 		*val = mtrr->fixed4k[num - MSR_MTRR4kBase];
1963 		break;
1964 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1965 		*val = mtrr->fixed16k[num - MSR_MTRR16kBase];
1966 		break;
1967 	case MSR_MTRR64kBase:
1968 		*val = mtrr->fixed64k;
1969 		break;
1970 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
1971 		uint_t offset = num - MSR_MTRRVarBase;
1972 		if (offset % 2 == 0) {
1973 			*val = mtrr->var[offset / 2].base;
1974 		} else {
1975 			*val = mtrr->var[offset / 2].mask;
1976 		}
1977 		break;
1978 	}
1979 	default:
1980 		return (-1);
1981 	}
1982 
1983 	return (0);
1984 }
1985 
1986 static int
1987 vm_wrmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t val)
1988 {
1989 	switch (num) {
1990 	case MSR_MTRRcap:
1991 		/* MTRRCAP is read only */
1992 		return (-1);
1993 	case MSR_MTRRdefType:
1994 		if (val & ~VMM_MTRR_DEF_MASK) {
1995 			/* generate #GP on writes to reserved fields */
1996 			return (-1);
1997 		}
1998 		mtrr->def_type = val;
1999 		break;
2000 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
2001 		mtrr->fixed4k[num - MSR_MTRR4kBase] = val;
2002 		break;
2003 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
2004 		mtrr->fixed16k[num - MSR_MTRR16kBase] = val;
2005 		break;
2006 	case MSR_MTRR64kBase:
2007 		mtrr->fixed64k = val;
2008 		break;
2009 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
2010 		uint_t offset = num - MSR_MTRRVarBase;
2011 		if (offset % 2 == 0) {
2012 			if (val & ~VMM_MTRR_PHYSBASE_MASK) {
2013 				/* generate #GP on writes to reserved fields */
2014 				return (-1);
2015 			}
2016 			mtrr->var[offset / 2].base = val;
2017 		} else {
2018 			if (val & ~VMM_MTRR_PHYSMASK_MASK) {
2019 				/* generate #GP on writes to reserved fields */
2020 				return (-1);
2021 			}
2022 			mtrr->var[offset / 2].mask = val;
2023 		}
2024 		break;
2025 	}
2026 	default:
2027 		return (-1);
2028 	}
2029 
2030 	return (0);
2031 }
2032 
2033 static bool
2034 is_mtrr_msr(uint32_t msr)
2035 {
2036 	switch (msr) {
2037 	case MSR_MTRRcap:
2038 	case MSR_MTRRdefType:
2039 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
2040 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
2041 	case MSR_MTRR64kBase:
2042 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
2043 		return (true);
2044 	default:
2045 		return (false);
2046 	}
2047 }
2048 
2049 static int
2050 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
2051 {
2052 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2053 	const uint32_t code = vme->u.msr.code;
2054 	uint64_t val = 0;
2055 
2056 	switch (code) {
2057 	case MSR_MCG_CAP:
2058 	case MSR_MCG_STATUS:
2059 		val = 0;
2060 		break;
2061 
2062 	case MSR_MTRRcap:
2063 	case MSR_MTRRdefType:
2064 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
2065 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
2066 	case MSR_MTRR64kBase:
2067 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
2068 		if (vm_rdmtrr(&vcpu->mtrr, code, &val) != 0)
2069 			vm_inject_gp(vm, vcpuid);
2070 		break;
2071 
2072 	case MSR_TSC:
2073 		/*
2074 		 * In all likelihood, this should always be handled in guest
2075 		 * context by VMX/SVM rather than taking an exit.  (Both VMX and
2076 		 * SVM pass through read-only access to MSR_TSC to the guest.)
2077 		 *
2078 		 * No physical offset is requested of vcpu_tsc_offset() since
2079 		 * rdtsc_offset() takes care of that instead.
2080 		 */
2081 		val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
2082 		break;
2083 
2084 	default:
2085 		/*
2086 		 * Anything not handled at this point will be kicked out to
2087 		 * userspace for attempted processing there.
2088 		 */
2089 		return (-1);
2090 	}
2091 
2092 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
2093 	    val & 0xffffffff));
2094 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
2095 	    val >> 32));
2096 	return (0);
2097 }
2098 
2099 static int
2100 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
2101 {
2102 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2103 	const uint32_t code = vme->u.msr.code;
2104 	const uint64_t val = vme->u.msr.wval;
2105 
2106 	switch (code) {
2107 	case MSR_MCG_CAP:
2108 	case MSR_MCG_STATUS:
2109 		/* Ignore writes */
2110 		break;
2111 
2112 	case MSR_MTRRcap:
2113 	case MSR_MTRRdefType:
2114 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
2115 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
2116 	case MSR_MTRR64kBase:
2117 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
2118 		if (vm_wrmtrr(&vcpu->mtrr, code, val) != 0)
2119 			vm_inject_gp(vm, vcpuid);
2120 		break;
2121 
2122 	case MSR_TSC:
2123 		/*
2124 		 * The effect of writing the TSC MSR is that a subsequent read
2125 		 * of the TSC would report the value written (plus any time
2126 		 * elapsed between the write and the read).  The guest TSC value
2127 		 * is calculated from a global offset for the guest (which
2128 		 * effectively makes its TSC read 0 at guest boot) and a
2129 		 * per-vCPU offset to handle these writes to the MSR.
2130 		 *
2131 		 * To calculate that per-vCPU offset, we can work backwards from
2132 		 * the guest value at the time of write:
2133 		 *
2134 		 * value = host TSC + VM boot offset + vCPU offset
2135 		 *
2136 		 * so therefore:
2137 		 *
2138 		 * value - host TSC - VM boot offset = vCPU offset
2139 		 */
2140 		vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
2141 		break;
2142 
2143 	default:
2144 		/*
2145 		 * Anything not handled at this point will be kicked out to
2146 		 * userspace for attempted processing there.
2147 		 */
2148 		return (-1);
2149 	}
2150 
2151 	return (0);
2152 }
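
/*
 * A worked example of the MSR_TSC write handling above (numbers chosen for
 * illustration only): suppose boot_tsc_offset is -4000 (the host TSC read
 * 4000 when the guest booted), rdtsc_offset() currently reads 5000, and the
 * guest writes 0 to the TSC.  Then:
 *
 *	tsc_offset = 0 - (-4000) - 5000 = -1000
 *
 * An immediate guest read via vm_handle_rdmsr() yields
 *
 *	boot_tsc_offset + tsc_offset + rdtsc_offset() = -4000 - 1000 + 5000 = 0
 *
 * i.e. the value just written, plus whatever host TSC ticks elapse afterward.
 */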
2153 
2154 int
2155 vm_suspend(struct vm *vm, enum vm_suspend_how how)
2156 {
2157 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
2158 		return (EINVAL);
2159 
2160 	if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
2161 		return (EALREADY);
2162 	}
2163 
2164 	/*
2165 	 * Notify all active vcpus that they are now suspended.
2166 	 */
2167 	for (uint_t i = 0; i < vm->maxcpus; i++) {
2168 		struct vcpu *vcpu = &vm->vcpu[i];
2169 
2170 		vcpu_lock(vcpu);
2171 		if (vcpu->state == VCPU_IDLE || vcpu->state == VCPU_FROZEN) {
2172 			/*
2173 			 * Any vCPUs not actively running or in HLT can be
2174 			 * marked as suspended immediately.
2175 			 */
2176 			if (CPU_ISSET(i, &vm->active_cpus)) {
2177 				CPU_SET_ATOMIC(i, &vm->suspended_cpus);
2178 			}
2179 		} else {
2180 			/*
2181 			 * Those which are running or in HLT will pick up the
2182 			 * suspended state after notification.
2183 			 */
2184 			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2185 		}
2186 		vcpu_unlock(vcpu);
2187 	}
2188 	return (0);
2189 }
2190 
2191 void
2192 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
2193 {
2194 	struct vm_exit *vmexit;
2195 
2196 	vmexit = vm_exitinfo(vm, vcpuid);
2197 	vmexit->rip = rip;
2198 	vmexit->inst_length = 0;
2199 	vmexit->exitcode = VM_EXITCODE_RUN_STATE;
2200 	vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
2201 }
2202 
2203 /*
2204  * Some vmm resources, such as the lapic, may have CPU-specific resources
2205  * allocated to them which would benefit from migration onto the host CPU
2206  * that is processing the vcpu state.
2207  */
2208 static void
2209 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
2210 {
2211 	/*
2212 	 * Localizing cyclic resources requires acquisition of cpu_lock, and
2213 	 * doing so with kpreempt disabled is a recipe for deadlock disaster.
2214 	 */
2215 	VERIFY(curthread->t_preempt == 0);
2216 
2217 	/*
2218 	 * Do not bother with localization if this vCPU is about to return to
2219 	 * the host CPU it was last localized to.
2220 	 */
2221 	if (vcpu->lastloccpu == curcpu)
2222 		return;
2223 
2224 	/*
2225 	 * Localize system-wide resources to the primary boot vCPU.  While any
2226 	 * of the other vCPUs may access them, it keeps the potential interrupt
2227 	 * footprint constrained to CPUs involved with this instance.
2228 	 */
2229 	if (vcpu == &vm->vcpu[0]) {
2230 		vhpet_localize_resources(vm->vhpet);
2231 		vrtc_localize_resources(vm->vrtc);
2232 		vatpit_localize_resources(vm->vatpit);
2233 	}
2234 
2235 	vlapic_localize_resources(vcpu->vlapic);
2236 
2237 	vcpu->lastloccpu = curcpu;
2238 }
2239 
2240 static void
2241 vmm_savectx(void *arg)
2242 {
2243 	vm_thread_ctx_t *vtc = arg;
2244 	struct vm *vm = vtc->vtc_vm;
2245 	const int vcpuid = vtc->vtc_vcpuid;
2246 
2247 	if (ops->vmsavectx != NULL) {
2248 		ops->vmsavectx(vm->cookie, vcpuid);
2249 	}
2250 
2251 	/*
2252 	 * Account for going off-cpu, unless the vCPU is idled, in which case
2253 	 * being off-cpu is the explicit point.
2254 	 */
2255 	if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2256 		vtc->vtc_ustate = vm->vcpu[vcpuid].ustate;
2257 		vcpu_ustate_change(vm, vcpuid, VU_SCHED);
2258 	}
2259 
2260 	/*
2261 	 * If the CPU holds the restored guest FPU state, save it and restore
2262 	 * the host FPU state before this thread goes off-cpu.
2263 	 */
2264 	if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2265 		struct vcpu *vcpu = &vm->vcpu[vcpuid];
2266 
2267 		save_guest_fpustate(vcpu);
2268 		vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2269 	}
2270 }
2271 
2272 static void
2273 vmm_restorectx(void *arg)
2274 {
2275 	vm_thread_ctx_t *vtc = arg;
2276 	struct vm *vm = vtc->vtc_vm;
2277 	const int vcpuid = vtc->vtc_vcpuid;
2278 
2279 	/* Complete microstate accounting for vCPU being off-cpu */
2280 	if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2281 		vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate);
2282 	}
2283 
2284 	/*
2285 	 * When coming back on-cpu, only restore the guest FPU status if the
2286 	 * thread is in a context marked as requiring it.  This should be rare,
2287 	 * occurring only when a future logic error results in a voluntary
2288 	 * sleep during the VMRUN critical section.
2289 	 *
2290 	 * The common case will result in elision of the guest FPU state
2291 	 * restoration, deferring that action until it is clearly necessary
2292 	 * during vm_run.
2293 	 */
2294 	VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2295 	if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2296 		struct vcpu *vcpu = &vm->vcpu[vcpuid];
2297 
2298 		restore_guest_fpustate(vcpu);
2299 		vtc->vtc_status |= VTCS_FPU_RESTORED;
2300 	}
2301 
2302 	if (ops->vmrestorectx != NULL) {
2303 		ops->vmrestorectx(vm->cookie, vcpuid);
2304 	}
2305 
2306 }
2307 
2308 static int
2309 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2310     struct vm_exit *vme)
2311 {
2312 	struct vcpu *vcpu;
2313 	struct vie *vie;
2314 	int err;
2315 
2316 	vcpu = &vm->vcpu[vcpuid];
2317 	vie = vcpu->vie_ctx;
2318 	err = 0;
2319 
2320 	switch (entry->cmd) {
2321 	case VEC_DEFAULT:
2322 		return (0);
2323 	case VEC_DISCARD_INSTR:
2324 		vie_reset(vie);
2325 		return (0);
2326 	case VEC_FULFILL_MMIO:
2327 		err = vie_fulfill_mmio(vie, &entry->u.mmio);
2328 		if (err == 0) {
2329 			err = vie_emulate_mmio(vie, vm, vcpuid);
2330 			if (err == 0) {
2331 				vie_advance_pc(vie, &vcpu->nextrip);
2332 			} else if (err < 0) {
2333 				vie_exitinfo(vie, vme);
2334 			} else if (err == EAGAIN) {
2335 				/*
2336 				 * Clear the instruction emulation state in
2337 				 * order to re-enter VM context and continue
2338 				 * this 'rep <instruction>'
2339 				 */
2340 				vie_reset(vie);
2341 				err = 0;
2342 			}
2343 		}
2344 		break;
2345 	case VEC_FULFILL_INOUT:
2346 		err = vie_fulfill_inout(vie, &entry->u.inout);
2347 		if (err == 0) {
2348 			err = vie_emulate_inout(vie, vm, vcpuid);
2349 			if (err == 0) {
2350 				vie_advance_pc(vie, &vcpu->nextrip);
2351 			} else if (err < 0) {
2352 				vie_exitinfo(vie, vme);
2353 			} else if (err == EAGAIN) {
2354 				/*
2355 				 * Clear the instruction emulation state in
2356 				 * order to re-enter VM context and continue
2357 				 * this 'rep ins/outs'
2358 				 */
2359 				vie_reset(vie);
2360 				err = 0;
2361 			}
2362 		}
2363 		break;
2364 	default:
2365 		return (EINVAL);
2366 	}
2367 	return (err);
2368 }
2369 
2370 static int
2371 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2372 {
2373 	struct vie *vie;
2374 
2375 	vie = vm->vcpu[vcpuid].vie_ctx;
2376 
2377 	if (vie_pending(vie)) {
2378 		/*
2379 		 * Userspace has not fulfilled the pending needs of the
2380 		 * instruction emulation, so bail back out.
2381 		 */
2382 		vie_exitinfo(vie, vme);
2383 		return (-1);
2384 	}
2385 
2386 	return (0);
2387 }
2388 
2389 int
2390 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2391 {
2392 	int error;
2393 	struct vcpu *vcpu;
2394 	struct vm_exit *vme;
2395 	bool intr_disabled;
2396 	int affinity_type = CPU_CURRENT;
2397 
2398 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2399 		return (EINVAL);
2400 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2401 		return (EINVAL);
2402 	if (vm->is_paused) {
2403 		return (EBUSY);
2404 	}
2405 
2406 	vcpu = &vm->vcpu[vcpuid];
2407 	vme = &vcpu->exitinfo;
2408 
2409 	vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
2410 
2411 	vcpu->vtc.vtc_status = 0;
2412 	ctxop_attach(curthread, vcpu->ctxop);
2413 
2414 	error = vm_entry_actions(vm, vcpuid, entry, vme);
2415 	if (error != 0) {
2416 		goto exit;
2417 	}
2418 
2419 restart:
2420 	error = vm_loop_checks(vm, vcpuid, vme);
2421 	if (error != 0) {
2422 		goto exit;
2423 	}
2424 
2425 	thread_affinity_set(curthread, affinity_type);
2426 	/*
2427 	 * Resource localization should happen after the CPU affinity for the
2428 	 * thread has been set to ensure that access from restricted contexts,
2429 	 * such as VMX-accelerated APIC operations, can occur without inducing
2430 	 * cyclic cross-calls.
2431 	 *
2432 	 * This must be done prior to disabling kpreempt via critical_enter().
2433 	 */
2434 	vm_localize_resources(vm, vcpu);
2435 	affinity_type = CPU_CURRENT;
2436 	critical_enter();
2437 
2438 	/* Force a trip through update_sregs to reload %fs/%gs and friends */
2439 	PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2440 
2441 	if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2442 		restore_guest_fpustate(vcpu);
2443 		vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED;
2444 	}
2445 	vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2446 
2447 	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2448 	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip);
2449 	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2450 
2451 	/*
2452 	 * Once clear of the delicate contexts comprising the VM_RUN handler,
2453 	 * thread CPU affinity can be loosened while other processing occurs.
2454 	 */
2455 	vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2456 	thread_affinity_clear(curthread);
2457 	critical_exit();
2458 
2459 	if (error != 0) {
2460 		/* Communicate out any error from VMRUN() above */
2461 		goto exit;
2462 	}
2463 
2464 	vcpu->nextrip = vme->rip + vme->inst_length;
2465 	switch (vme->exitcode) {
2466 	case VM_EXITCODE_REQIDLE:
2467 		error = vm_handle_reqidle(vm, vcpuid);
2468 		break;
2469 	case VM_EXITCODE_RUN_STATE:
2470 		error = vm_handle_run_state(vm, vcpuid);
2471 		break;
2472 	case VM_EXITCODE_SUSPENDED:
2473 		error = vm_handle_suspend(vm, vcpuid);
2474 		break;
2475 	case VM_EXITCODE_IOAPIC_EOI:
2476 		vioapic_process_eoi(vm, vcpuid,
2477 		    vme->u.ioapic_eoi.vector);
2478 		break;
2479 	case VM_EXITCODE_HLT:
2480 		intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2481 		error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2482 		break;
2483 	case VM_EXITCODE_PAGING:
2484 		error = vm_handle_paging(vm, vcpuid);
2485 		break;
2486 	case VM_EXITCODE_MMIO_EMUL:
2487 		error = vm_handle_mmio_emul(vm, vcpuid);
2488 		break;
2489 	case VM_EXITCODE_INOUT:
2490 		error = vm_handle_inout(vm, vcpuid, vme);
2491 		break;
2492 	case VM_EXITCODE_INST_EMUL:
2493 		error = vm_handle_inst_emul(vm, vcpuid);
2494 		break;
2495 	case VM_EXITCODE_MONITOR:
2496 	case VM_EXITCODE_MWAIT:
2497 	case VM_EXITCODE_VMINSN:
2498 		vm_inject_ud(vm, vcpuid);
2499 		break;
2500 	case VM_EXITCODE_RDMSR:
2501 		error = vm_handle_rdmsr(vm, vcpuid, vme);
2502 		break;
2503 	case VM_EXITCODE_WRMSR:
2504 		error = vm_handle_wrmsr(vm, vcpuid, vme);
2505 		break;
2506 	case VM_EXITCODE_HT:
2507 		affinity_type = CPU_BEST;
2508 		break;
2509 	case VM_EXITCODE_MTRAP:
2510 		VERIFY0(vm_suspend_cpu(vm, vcpuid));
2511 		error = -1;
2512 		break;
2513 	default:
2514 		/* handled in userland */
2515 		error = -1;
2516 		break;
2517 	}
2518 
2519 	if (error == 0) {
2520 		/* VM exit conditions handled in-kernel, continue running */
2521 		goto restart;
2522 	}
2523 
2524 exit:
2525 	kpreempt_disable();
2526 	ctxop_detach(curthread, vcpu->ctxop);
2527 	/* Make sure all of the needed vCPU context state is saved */
2528 	vmm_savectx(&vcpu->vtc);
2529 	kpreempt_enable();
2530 
2531 	vcpu_ustate_change(vm, vcpuid, VU_EMU_USER);
2532 	return (error);
2533 }
2534 
2535 int
2536 vm_restart_instruction(void *arg, int vcpuid)
2537 {
2538 	struct vm *vm;
2539 	struct vcpu *vcpu;
2540 	enum vcpu_state state;
2541 	uint64_t rip;
2542 	int error;
2543 
2544 	vm = arg;
2545 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2546 		return (EINVAL);
2547 
2548 	vcpu = &vm->vcpu[vcpuid];
2549 	state = vcpu_get_state(vm, vcpuid, NULL);
2550 	if (state == VCPU_RUNNING) {
2551 		/*
2552 		 * When a vcpu is "running" the next instruction is determined
2553 		 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2554 		 * Thus setting 'inst_length' to zero will cause the current
2555 		 * instruction to be restarted.
2556 		 */
2557 		vcpu->exitinfo.inst_length = 0;
2558 	} else if (state == VCPU_FROZEN) {
2559 		/*
2560 		 * When a vcpu is "frozen" it is outside the critical section
2561 		 * around VMRUN() and 'nextrip' points to the next instruction.
2562 		 * Thus instruction restart is achieved by setting 'nextrip'
2563 		 * to the vcpu's %rip.
2564 		 */
2565 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2566 		KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2567 		vcpu->nextrip = rip;
2568 	} else {
2569 		panic("%s: invalid state %d", __func__, state);
2570 	}
2571 	return (0);
2572 }
2573 
2574 int
2575 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2576 {
2577 	struct vcpu *vcpu;
2578 
2579 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2580 		return (EINVAL);
2581 
2582 	vcpu = &vm->vcpu[vcpuid];
2583 
2584 	if (VM_INTINFO_PENDING(info)) {
2585 		const uint32_t type = VM_INTINFO_TYPE(info);
2586 		const uint8_t vector = VM_INTINFO_VECTOR(info);
2587 
2588 		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2589 			return (EINVAL);
2590 		if (type == VM_INTINFO_HWEXCP && vector >= 32)
2591 			return (EINVAL);
2592 		if (info & VM_INTINFO_MASK_RSVD)
2593 			return (EINVAL);
2594 	} else {
2595 		info = 0;
2596 	}
2597 	vcpu->exit_intinfo = info;
2598 	return (0);
2599 }
2600 
2601 enum exc_class {
2602 	EXC_BENIGN,
2603 	EXC_CONTRIBUTORY,
2604 	EXC_PAGEFAULT
2605 };
2606 
2607 #define	IDT_VE	20	/* Virtualization Exception (Intel specific) */
2608 
2609 static enum exc_class
2610 exception_class(uint64_t info)
2611 {
2612 	ASSERT(VM_INTINFO_PENDING(info));
2613 
2614 	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2615 	switch (VM_INTINFO_TYPE(info)) {
2616 	case VM_INTINFO_HWINTR:
2617 	case VM_INTINFO_SWINTR:
2618 	case VM_INTINFO_NMI:
2619 		return (EXC_BENIGN);
2620 	default:
2621 		/*
2622 		 * Hardware exception.
2623 		 *
2624 		 * SVM and VT-x use identical type values to represent NMI,
2625 		 * hardware interrupt and software interrupt.
2626 		 *
2627 		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
2628 		 * for exceptions except #BP and #OF. #BP and #OF use a type
2629 		 * value of '5' or '6'. Therefore we don't check for explicit
2630 		 * values of 'type' to classify 'intinfo' into a hardware
2631 		 * exception.
2632 		 */
2633 		break;
2634 	}
2635 
2636 	switch (VM_INTINFO_VECTOR(info)) {
2637 	case IDT_PF:
2638 	case IDT_VE:
2639 		return (EXC_PAGEFAULT);
2640 	case IDT_DE:
2641 	case IDT_TS:
2642 	case IDT_NP:
2643 	case IDT_SS:
2644 	case IDT_GP:
2645 		return (EXC_CONTRIBUTORY);
2646 	default:
2647 		return (EXC_BENIGN);
2648 	}
2649 }
2650 
2651 /*
2652  * Fetch event pending injection into the guest, if one exists.
2653  *
2654  * Returns true if an event is to be injected (which is placed in `retinfo`).
2655  */
2656 bool
2657 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2658 {
2659 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2660 	const uint64_t info1 = vcpu->exit_intinfo;
2661 	vcpu->exit_intinfo = 0;
2662 	const uint64_t info2 = vcpu->exc_pending;
2663 	vcpu->exc_pending = 0;
2664 
2665 	if (VM_INTINFO_PENDING(info1) && VM_INTINFO_PENDING(info2)) {
2666 		/*
2667 		 * If an exception occurs while attempting to call the
2668 		 * double-fault handler the processor enters shutdown mode
2669 		 * (aka triple fault).
2670 		 */
2671 		if (VM_INTINFO_TYPE(info1) == VM_INTINFO_HWEXCP &&
2672 		    VM_INTINFO_VECTOR(info1) == IDT_DF) {
2673 			(void) vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2674 			*retinfo = 0;
2675 			return (false);
2676 		}
2677 		/*
2678 		 * "Conditions for Generating a Double Fault"
2679 		 *  Intel SDM, Vol3, Table 6-5
2680 		 */
2681 		const enum exc_class exc1 = exception_class(info1);
2682 		const enum exc_class exc2 = exception_class(info2);
2683 		if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2684 		    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2685 			/* Convert nested fault into a double fault. */
2686 			*retinfo =
2687 			    VM_INTINFO_VALID |
2688 			    VM_INTINFO_DEL_ERRCODE |
2689 			    VM_INTINFO_HWEXCP |
2690 			    IDT_DF;
2691 		} else {
2692 			/* Handle exceptions serially */
2693 			vcpu->exit_intinfo = info1;
2694 			*retinfo = info2;
2695 		}
2696 		return (true);
2697 	} else if (VM_INTINFO_PENDING(info1)) {
2698 		*retinfo = info1;
2699 		return (true);
2700 	} else if (VM_INTINFO_PENDING(info2)) {
2701 		*retinfo = info2;
2702 		return (true);
2703 	}
2704 
2705 	return (false);
2706 }
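
/*
 * Some illustrative cases for how vm_entry_intinfo() above combines a faulted
 * delivery (info1) with the new exception (info2):
 *
 *	#GP while delivering #PF	-> #DF (page fault + non-benign)
 *	#TS while delivering #GP	-> #DF (contributory + contributory)
 *	#GP while delivering an external interrupt
 *					-> handled serially (benign first)
 *	any exception while delivering #DF
 *					-> triple fault, VM suspended
 */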
2707 
2708 int
2709 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2710 {
2711 	struct vcpu *vcpu;
2712 
2713 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2714 		return (EINVAL);
2715 
2716 	vcpu = &vm->vcpu[vcpuid];
2717 	*info1 = vcpu->exit_intinfo;
2718 	*info2 = vcpu->exc_pending;
2719 	return (0);
2720 }
2721 
2722 int
2723 vm_inject_exception(struct vm *vm, int vcpuid, uint8_t vector,
2724     bool errcode_valid, uint32_t errcode, bool restart_instruction)
2725 {
2726 	struct vcpu *vcpu;
2727 	uint64_t regval;
2728 	int error;
2729 
2730 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2731 		return (EINVAL);
2732 
2733 	if (vector >= 32)
2734 		return (EINVAL);
2735 
2736 	/*
2737 	 * NMIs are to be injected via their own specialized path using
2738 	 * vm_inject_nmi().
2739 	 */
2740 	if (vector == IDT_NMI) {
2741 		return (EINVAL);
2742 	}
2743 
2744 	/*
2745 	 * A double fault exception should never be injected directly into
2746 	 * the guest. It is a derived exception that results from specific
2747 	 * combinations of nested faults.
2748 	 */
2749 	if (vector == IDT_DF) {
2750 		return (EINVAL);
2751 	}
2752 
2753 	vcpu = &vm->vcpu[vcpuid];
2754 
2755 	if (VM_INTINFO_PENDING(vcpu->exc_pending)) {
2756 		/* Unable to inject exception due to one already pending */
2757 		return (EBUSY);
2758 	}
2759 
2760 	if (errcode_valid) {
2761 		/*
2762 		 * Exceptions don't deliver an error code in real mode.
2763 		 */
2764 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2765 		VERIFY0(error);
2766 		if ((regval & CR0_PE) == 0) {
2767 			errcode_valid = false;
2768 		}
2769 	}
2770 
2771 	/*
2772 	 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2773 	 *
2774 	 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2775 	 * one instruction or incurs an exception.
2776 	 */
2777 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2778 	VERIFY0(error);
2779 
2780 	if (restart_instruction) {
2781 		VERIFY0(vm_restart_instruction(vm, vcpuid));
2782 	}
2783 
2784 	uint64_t val = VM_INTINFO_VALID | VM_INTINFO_HWEXCP | vector;
2785 	if (errcode_valid) {
2786 		val |= VM_INTINFO_DEL_ERRCODE;
2787 		val |= (uint64_t)errcode << VM_INTINFO_SHIFT_ERRCODE;
2788 	}
2789 	vcpu->exc_pending = val;
2790 	return (0);
2791 }
2792 
2793 void
2794 vm_inject_ud(struct vm *vm, int vcpuid)
2795 {
2796 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_UD, false, 0, true));
2797 }
2798 
2799 void
2800 vm_inject_gp(struct vm *vm, int vcpuid)
2801 {
2802 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_GP, true, 0, true));
2803 }
2804 
2805 void
2806 vm_inject_ac(struct vm *vm, int vcpuid, uint32_t errcode)
2807 {
2808 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_AC, true, errcode, true));
2809 }
2810 
2811 void
2812 vm_inject_ss(struct vm *vm, int vcpuid, uint32_t errcode)
2813 {
2814 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_SS, true, errcode, true));
2815 }
2816 
2817 void
2818 vm_inject_pf(struct vm *vm, int vcpuid, uint32_t errcode, uint64_t cr2)
2819 {
2820 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2));
2821 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_PF, true, errcode, true));
2822 }
2823 
2824 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2825 
2826 int
2827 vm_inject_nmi(struct vm *vm, int vcpuid)
2828 {
2829 	struct vcpu *vcpu;
2830 
2831 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2832 		return (EINVAL);
2833 
2834 	vcpu = &vm->vcpu[vcpuid];
2835 
2836 	vcpu->nmi_pending = true;
2837 	vcpu_notify_event(vm, vcpuid);
2838 	return (0);
2839 }
2840 
2841 bool
2842 vm_nmi_pending(struct vm *vm, int vcpuid)
2843 {
2844 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2845 
2846 	return (vcpu->nmi_pending);
2847 }
2848 
2849 void
2850 vm_nmi_clear(struct vm *vm, int vcpuid)
2851 {
2852 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2853 
2854 	ASSERT(vcpu->nmi_pending);
2855 
2856 	vcpu->nmi_pending = false;
2857 	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2858 }
2859 
2860 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2861 
2862 int
2863 vm_inject_extint(struct vm *vm, int vcpuid)
2864 {
2865 	struct vcpu *vcpu;
2866 
2867 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2868 		return (EINVAL);
2869 
2870 	vcpu = &vm->vcpu[vcpuid];
2871 
2872 	vcpu->extint_pending = true;
2873 	vcpu_notify_event(vm, vcpuid);
2874 	return (0);
2875 }
2876 
2877 bool
2878 vm_extint_pending(struct vm *vm, int vcpuid)
2879 {
2880 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2881 
2882 	return (vcpu->extint_pending);
2883 }
2884 
2885 void
2886 vm_extint_clear(struct vm *vm, int vcpuid)
2887 {
2888 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2889 
2890 	ASSERT(vcpu->extint_pending);
2891 
2892 	vcpu->extint_pending = false;
2893 	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2894 }
2895 
2896 int
2897 vm_inject_init(struct vm *vm, int vcpuid)
2898 {
2899 	struct vcpu *vcpu;
2900 
2901 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2902 		return (EINVAL);
2903 
2904 	vcpu = &vm->vcpu[vcpuid];
2905 	vcpu_lock(vcpu);
2906 	vcpu->run_state |= VRS_PEND_INIT;
2907 	/*
2908 	 * As part of queuing the INIT request, clear any pending SIPI.  It
2909 	 * would not otherwise survive across the reset of the vCPU when it
2910 	 * undergoes the requested INIT.  We would not want it to linger when it
2911 	 * could be mistaken for a subsequent (after the INIT) SIPI request.
2912 	 */
2913 	vcpu->run_state &= ~VRS_PEND_SIPI;
2914 	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2915 
2916 	vcpu_unlock(vcpu);
2917 	return (0);
2918 }
2919 
2920 int
2921 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2922 {
2923 	struct vcpu *vcpu;
2924 
2925 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2926 		return (EINVAL);
2927 
2928 	vcpu = &vm->vcpu[vcpuid];
2929 	vcpu_lock(vcpu);
2930 	vcpu->run_state |= VRS_PEND_SIPI;
2931 	vcpu->sipi_vector = vector;
2932 	/* SIPI is only actionable if the CPU is waiting in INIT state */
2933 	if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2934 		vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2935 	}
2936 	vcpu_unlock(vcpu);
2937 	return (0);
2938 }
2939 
2940 bool
2941 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2942 {
2943 	struct vcpu *vcpu;
2944 
2945 	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2946 	vcpu = &vm->vcpu[vcpuid];
2947 
2948 	/* Of interest: vCPU not in running state or with pending INIT */
2949 	return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2950 }
2951 
2952 int
2953 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2954 {
2955 	struct seg_desc desc;
2956 	const enum vm_reg_name clear_regs[] = {
2957 		VM_REG_GUEST_CR2,
2958 		VM_REG_GUEST_CR3,
2959 		VM_REG_GUEST_CR4,
2960 		VM_REG_GUEST_RAX,
2961 		VM_REG_GUEST_RBX,
2962 		VM_REG_GUEST_RCX,
2963 		VM_REG_GUEST_RSI,
2964 		VM_REG_GUEST_RDI,
2965 		VM_REG_GUEST_RBP,
2966 		VM_REG_GUEST_RSP,
2967 		VM_REG_GUEST_R8,
2968 		VM_REG_GUEST_R9,
2969 		VM_REG_GUEST_R10,
2970 		VM_REG_GUEST_R11,
2971 		VM_REG_GUEST_R12,
2972 		VM_REG_GUEST_R13,
2973 		VM_REG_GUEST_R14,
2974 		VM_REG_GUEST_R15,
2975 		VM_REG_GUEST_DR0,
2976 		VM_REG_GUEST_DR1,
2977 		VM_REG_GUEST_DR2,
2978 		VM_REG_GUEST_DR3,
2979 		VM_REG_GUEST_EFER,
2980 	};
2981 	const enum vm_reg_name data_segs[] = {
2982 		VM_REG_GUEST_SS,
2983 		VM_REG_GUEST_DS,
2984 		VM_REG_GUEST_ES,
2985 		VM_REG_GUEST_FS,
2986 		VM_REG_GUEST_GS,
2987 	};
2988 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2989 
2990 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2991 		return (EINVAL);
2992 
2993 	for (uint_t i = 0; i < nitems(clear_regs); i++) {
2994 		VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2995 	}
2996 
2997 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2998 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2999 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
3000 
3001 	/*
3002 	 * The prescribed contents of %rdx differ slightly between the Intel and
3003 	 * AMD architectural definitions.  The former expects the Extended Model
3004 	 * in bits 16-19 where the latter expects all of the Family, Model, and
3005 	 * Stepping to be there.  Common boot ROMs appear to disregard this
3006 	 * anyway, so we stick with a compromise value similar to what is
3007 	 * spelled out in the Intel SDM.
3008 	 */
3009 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
3010 
3011 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
3012 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
3013 
3014 	/* CS: Present, R/W, Accessed */
3015 	desc.access = 0x0093;
3016 	desc.base = 0xffff0000;
3017 	desc.limit = 0xffff;
3018 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
3019 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
3020 
3021 	/* SS, DS, ES, FS, GS: Present, R/W, Accessed */
3022 	desc.access = 0x0093;
3023 	desc.base = 0;
3024 	desc.limit = 0xffff;
3025 	for (uint_t i = 0; i < nitems(data_segs); i++) {
3026 		VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
3027 		VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
3028 	}
3029 
3030 	/* GDTR, IDTR */
3031 	desc.base = 0;
3032 	desc.limit = 0xffff;
3033 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
3034 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
3035 
3036 	/* LDTR: Present, LDT */
3037 	desc.access = 0x0082;
3038 	desc.base = 0;
3039 	desc.limit = 0xffff;
3040 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
3041 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
3042 
3043 	/* TR: Present, 32-bit TSS */
3044 	desc.access = 0x008b;
3045 	desc.base = 0;
3046 	desc.limit = 0xffff;
3047 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
3048 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
3049 
3050 	vlapic_reset(vm_lapic(vm, vcpuid));
3051 
3052 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
3053 
3054 	vcpu->exit_intinfo = 0;
3055 	vcpu->exc_pending = 0;
3056 	vcpu->nmi_pending = false;
3057 	vcpu->extint_pending = 0;
3058 
3059 	/*
3060 	 * A CPU reset caused by power-on or system reset clears more state than
3061 	 * one which is triggered from an INIT IPI.
3062 	 */
3063 	if (!init_only) {
3064 		vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
3065 		(void) hma_fpu_init(vcpu->guestfpu);
3066 
3067 		/* XXX: clear MSRs and other pieces */
3068 		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
3069 	}
3070 
3071 	return (0);
3072 }
3073 
3074 static int
3075 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
3076 {
3077 	struct seg_desc desc;
3078 
3079 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3080 		return (EINVAL);
3081 
3082 	/* CS: Present, R/W, Accessed */
3083 	desc.access = 0x0093;
3084 	desc.base = (uint64_t)vector << 12;
3085 	desc.limit = 0xffff;
3086 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
3087 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
3088 	    (uint64_t)vector << 8));
3089 
3090 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
3091 
3092 	return (0);
3093 }
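
/*
 * For example, a SIPI vector of 0x9f causes vcpu_vector_sipi() above to set
 * the %cs base to 0x9f000, the %cs selector to 0x9f00, and %rip to 0, so the
 * AP begins real-mode execution at physical address 0x9f000 (vector * 4 KiB).
 */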
3094 
3095 int
3096 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
3097 {
3098 	if (vcpu < 0 || vcpu >= vm->maxcpus)
3099 		return (EINVAL);
3100 
3101 	if (type < 0 || type >= VM_CAP_MAX)
3102 		return (EINVAL);
3103 
3104 	return (VMGETCAP(vm->cookie, vcpu, type, retval));
3105 }
3106 
3107 int
3108 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
3109 {
3110 	if (vcpu < 0 || vcpu >= vm->maxcpus)
3111 		return (EINVAL);
3112 
3113 	if (type < 0 || type >= VM_CAP_MAX)
3114 		return (EINVAL);
3115 
3116 	return (VMSETCAP(vm->cookie, vcpu, type, val));
3117 }
3118 
3119 vcpu_cpuid_config_t *
3120 vm_cpuid_config(struct vm *vm, int vcpuid)
3121 {
3122 	ASSERT3S(vcpuid, >=, 0);
3123 	ASSERT3S(vcpuid, <, VM_MAXCPU);
3124 
3125 	return (&vm->vcpu[vcpuid].cpuid_cfg);
3126 }
3127 
3128 struct vlapic *
3129 vm_lapic(struct vm *vm, int cpu)
3130 {
3131 	ASSERT3S(cpu, >=, 0);
3132 	ASSERT3S(cpu, <, VM_MAXCPU);
3133 
3134 	return (vm->vcpu[cpu].vlapic);
3135 }
3136 
3137 struct vioapic *
3138 vm_ioapic(struct vm *vm)
3139 {
3140 
3141 	return (vm->vioapic);
3142 }
3143 
3144 struct vhpet *
3145 vm_hpet(struct vm *vm)
3146 {
3147 
3148 	return (vm->vhpet);
3149 }
3150 
3151 void *
3152 vm_iommu_domain(struct vm *vm)
3153 {
3154 
3155 	return (vm->iommu);
3156 }
3157 
3158 int
3159 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3160     bool from_idle)
3161 {
3162 	int error;
3163 	struct vcpu *vcpu;
3164 
3165 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3166 		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3167 
3168 	vcpu = &vm->vcpu[vcpuid];
3169 
3170 	vcpu_lock(vcpu);
3171 	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3172 	vcpu_unlock(vcpu);
3173 
3174 	return (error);
3175 }
3176 
3177 enum vcpu_state
3178 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3179 {
3180 	struct vcpu *vcpu;
3181 	enum vcpu_state state;
3182 
3183 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3184 		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3185 
3186 	vcpu = &vm->vcpu[vcpuid];
3187 
3188 	vcpu_lock(vcpu);
3189 	state = vcpu->state;
3190 	if (hostcpu != NULL)
3191 		*hostcpu = vcpu->hostcpu;
3192 	vcpu_unlock(vcpu);
3193 
3194 	return (state);
3195 }
3196 
3197 uint64_t
3198 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj)
3199 {
3200 	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3201 
3202 	uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset;
3203 
3204 	if (phys_adj) {
3205 		/* Include any offset for the current physical CPU too */
3206 		extern hrtime_t tsc_gethrtime_tick_delta(void);
3207 		vcpu_off += (uint64_t)tsc_gethrtime_tick_delta();
3208 	}
3209 
3210 	return (vcpu_off);
3211 }
3212 
3213 /* Normalize hrtime against the boot time for a VM */
3214 hrtime_t
3215 vm_normalize_hrtime(struct vm *vm, hrtime_t hrt)
3216 {
3217 	/* To avoid underflow/overflow UB, perform math as unsigned */
3218 	return ((hrtime_t)((uint64_t)hrt - (uint64_t)vm->boot_hrtime));
3219 }
3220 
3221 /* Denormalize hrtime against the boot time for a VM */
3222 hrtime_t
3223 vm_denormalize_hrtime(struct vm *vm, hrtime_t hrt)
3224 {
3225 	/* To avoid underflow/overflow UB, perform math as unsigned */
3226 	return ((hrtime_t)((uint64_t)hrt + (uint64_t)vm->boot_hrtime));
3227 }
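
/*
 * For example, with a boot_hrtime of 1000ns, a host hrtime of 1500ns
 * normalizes to a VM-relative 500ns, and denormalizing that 500ns yields
 * 1500ns again.  Performing the arithmetic on unsigned values keeps the round
 * trip exact even when the intermediate subtraction would wrap.
 */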
3228 
3229 int
3230 vm_activate_cpu(struct vm *vm, int vcpuid)
3231 {
3232 
3233 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3234 		return (EINVAL);
3235 
3236 	if (CPU_ISSET(vcpuid, &vm->active_cpus))
3237 		return (EBUSY);
3238 
3239 	if (vm->suspend != 0) {
3240 		return (EBUSY);
3241 	}
3242 
3243 	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3244 
3245 	/*
3246 	 * It is possible that this vCPU was undergoing activation at the same
3247 	 * time that the VM was being suspended.  If that happens to be the
3248 	 * case, it should reflect the suspended state immediately.
3249 	 */
3250 	if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) {
3251 		CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
3252 	}
3253 
3254 	return (0);
3255 }
3256 
3257 int
3258 vm_suspend_cpu(struct vm *vm, int vcpuid)
3259 {
3260 	int i;
3261 
3262 	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3263 		return (EINVAL);
3264 
3265 	if (vcpuid == -1) {
3266 		vm->debug_cpus = vm->active_cpus;
3267 		for (i = 0; i < vm->maxcpus; i++) {
3268 			if (CPU_ISSET(i, &vm->active_cpus))
3269 				vcpu_notify_event(vm, i);
3270 		}
3271 	} else {
3272 		if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3273 			return (EINVAL);
3274 
3275 		CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3276 		vcpu_notify_event(vm, vcpuid);
3277 	}
3278 	return (0);
3279 }
3280 
3281 int
3282 vm_resume_cpu(struct vm *vm, int vcpuid)
3283 {
3284 
3285 	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3286 		return (EINVAL);
3287 
3288 	if (vcpuid == -1) {
3289 		CPU_ZERO(&vm->debug_cpus);
3290 	} else {
3291 		if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3292 			return (EINVAL);
3293 
3294 		CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3295 	}
3296 	return (0);
3297 }
3298 
3299 static bool
3300 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3301     uint64_t entry_rip)
3302 {
3303 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3304 	struct vm_exit *vme = &vcpu->exitinfo;
3305 	bool bail = false;
3306 
3307 	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3308 
3309 	if (vm->suspend) {
3310 		if (on_entry) {
3311 			VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3312 			    vm->suspend < VM_SUSPEND_LAST);
3313 
3314 			vme->exitcode = VM_EXITCODE_SUSPENDED;
3315 			vme->u.suspended.how = vm->suspend;
3316 		} else {
3317 			/*
3318 			 * Handling VM suspend is complicated, so if that
3319 			 * condition is detected outside of VM-entry itself,
3320 			 * just emit a BOGUS exitcode so we take a lap to pick
3321 			 * up the event during an entry and are directed into
3322 			 * the vm_handle_suspend() logic.
3323 			 */
3324 			vme->exitcode = VM_EXITCODE_BOGUS;
3325 		}
3326 		bail = true;
3327 	}
3328 	if (vcpu->reqidle) {
3329 		vme->exitcode = VM_EXITCODE_REQIDLE;
3330 		vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3331 
3332 		if (!on_entry) {
3333 			/*
3334 			 * A reqidle request detected outside of VM-entry can be
3335 			 * handled directly by clearing the request (and taking
3336 			 * a lap to userspace).
3337 			 */
3338 			vcpu_assert_locked(vcpu);
3339 			vcpu->reqidle = 0;
3340 		}
3341 		bail = true;
3342 	}
3343 	if (vcpu_should_yield(vm, vcpuid)) {
3344 		vme->exitcode = VM_EXITCODE_BOGUS;
3345 		vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3346 		bail = true;
3347 	}
3348 	if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3349 		vme->exitcode = VM_EXITCODE_DEBUG;
3350 		bail = true;
3351 	}
3352 
3353 	if (bail) {
3354 		if (on_entry) {
3355 			/*
3356 			 * If bailing out during VM-entry, the current %rip must
3357 			 * be recorded in the exitinfo.
3358 			 */
3359 			vme->rip = entry_rip;
3360 		}
3361 		vme->inst_length = 0;
3362 	}
3363 	return (bail);
3364 }
3365 
3366 static bool
3367 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3368 {
3369 	/*
3370 	 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
3371 	 * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3372 	 * structure, so only the exitcode is modified here.
3373 	 */
3374 	return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3375 }
3376 
3377 bool
3378 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3379 {
3380 	/*
3381 	 * Bail-out checks done as part of VM entry require an updated %rip to
3382 	 * populate the vm_exit struct if any of the conditions of interest are
3383 	 * matched in the check.
3384 	 */
3385 	return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3386 }
3387 
3388 cpuset_t
3389 vm_active_cpus(struct vm *vm)
3390 {
3391 
3392 	return (vm->active_cpus);
3393 }
3394 
3395 cpuset_t
3396 vm_debug_cpus(struct vm *vm)
3397 {
3398 
3399 	return (vm->debug_cpus);
3400 }
3401 
3402 cpuset_t
3403 vm_suspended_cpus(struct vm *vm)
3404 {
3405 
3406 	return (vm->suspended_cpus);
3407 }
3408 
3409 void *
3410 vcpu_stats(struct vm *vm, int vcpuid)
3411 {
3412 
3413 	return (vm->vcpu[vcpuid].stats);
3414 }
3415 
3416 int
3417 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3418 {
3419 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3420 		return (EINVAL);
3421 
3422 	*state = vm->vcpu[vcpuid].x2apic_state;
3423 
3424 	return (0);
3425 }
3426 
3427 int
3428 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3429 {
3430 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3431 		return (EINVAL);
3432 
3433 	if (state >= X2APIC_STATE_LAST)
3434 		return (EINVAL);
3435 
3436 	vm->vcpu[vcpuid].x2apic_state = state;
3437 
3438 	vlapic_set_x2apic_state(vm, vcpuid, state);
3439 
3440 	return (0);
3441 }
3442 
3443 /*
3444  * This function is called to ensure that a vcpu "sees" a pending event
3445  * as soon as possible:
3446  * - If the vcpu thread is sleeping then it is woken up.
3447  * - If the vcpu is running on a different host_cpu then an IPI will be directed
3448  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
3449  */
3450 static void
3451 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3452 {
3453 	int hostcpu;
3454 
3455 	ASSERT(ntype == VCPU_NOTIFY_APIC || VCPU_NOTIFY_EXIT);
3456 	ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3457 	hostcpu = vcpu->hostcpu;
3458 	if (vcpu->state == VCPU_RUNNING) {
3459 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3460 		if (hostcpu != curcpu) {
3461 			if (ntype == VCPU_NOTIFY_APIC) {
3462 				vlapic_post_intr(vcpu->vlapic, hostcpu);
3463 			} else {
3464 				poke_cpu(hostcpu);
3465 			}
3466 		} else {
3467 			/*
3468 			 * If the 'vcpu' is running on 'curcpu' then it must
3469 			 * be sending a notification to itself (e.g. SELF_IPI).
3470 			 * The pending event will be picked up when the vcpu
3471 			 * transitions back to guest context.
3472 			 */
3473 		}
3474 	} else {
3475 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3476 		    "with hostcpu %d", vcpu->state, hostcpu));
3477 		if (vcpu->state == VCPU_SLEEPING) {
3478 			cv_signal(&vcpu->vcpu_cv);
3479 		}
3480 	}
3481 }
3482 
3483 void
3484 vcpu_notify_event(struct vm *vm, int vcpuid)
3485 {
3486 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3487 
3488 	vcpu_lock(vcpu);
3489 	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3490 	vcpu_unlock(vcpu);
3491 }
3492 
3493 void
3494 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3495 {
3496 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3497 
3498 	if (ntype == VCPU_NOTIFY_NONE) {
3499 		return;
3500 	}
3501 
3502 	vcpu_lock(vcpu);
3503 	vcpu_notify_event_locked(vcpu, ntype);
3504 	vcpu_unlock(vcpu);
3505 }
3506 
3507 void
3508 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate)
3509 {
3510 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3511 	hrtime_t now = gethrtime();
3512 
3513 	ASSERT3U(ustate, !=, vcpu->ustate);
3514 	ASSERT3S(ustate, <, VU_MAX);
3515 	ASSERT3S(ustate, >=, VU_INIT);
3516 
3517 	hrtime_t delta = now - vcpu->ustate_when;
3518 	vcpu->ustate_total[vcpu->ustate] += delta;
3519 
3520 	membar_producer();
3521 
3522 	vcpu->ustate_when = now;
3523 	vcpu->ustate = ustate;
3524 }
3525 
3526 struct vmspace *
3527 vm_get_vmspace(struct vm *vm)
3528 {
3529 
3530 	return (vm->vmspace);
3531 }
3532 
3533 struct vm_client *
3534 vm_get_vmclient(struct vm *vm, int vcpuid)
3535 {
3536 	return (vm->vcpu[vcpuid].vmclient);
3537 }
3538 
3539 int
3540 vm_apicid2vcpuid(struct vm *vm, int apicid)
3541 {
3542 	/*
3543 	 * XXX apic id is assumed to be numerically identical to vcpu id
3544 	 */
3545 	return (apicid);
3546 }
3547 
3548 struct vatpic *
3549 vm_atpic(struct vm *vm)
3550 {
3551 	return (vm->vatpic);
3552 }
3553 
3554 struct vatpit *
3555 vm_atpit(struct vm *vm)
3556 {
3557 	return (vm->vatpit);
3558 }
3559 
3560 struct vpmtmr *
3561 vm_pmtmr(struct vm *vm)
3562 {
3563 
3564 	return (vm->vpmtmr);
3565 }
3566 
3567 struct vrtc *
3568 vm_rtc(struct vm *vm)
3569 {
3570 
3571 	return (vm->vrtc);
3572 }
3573 
3574 enum vm_reg_name
3575 vm_segment_name(int seg)
3576 {
3577 	static enum vm_reg_name seg_names[] = {
3578 		VM_REG_GUEST_ES,
3579 		VM_REG_GUEST_CS,
3580 		VM_REG_GUEST_SS,
3581 		VM_REG_GUEST_DS,
3582 		VM_REG_GUEST_FS,
3583 		VM_REG_GUEST_GS
3584 	};
3585 
3586 	KASSERT(seg >= 0 && seg < nitems(seg_names),
3587 	    ("%s: invalid segment encoding %d", __func__, seg));
3588 	return (seg_names[seg]);
3589 }
3590 
3591 void
3592 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3593     uint_t num_copyinfo)
3594 {
3595 	for (uint_t idx = 0; idx < num_copyinfo; idx++) {
3596 		if (copyinfo[idx].cookie != NULL) {
3597 			(void) vmp_release((vm_page_t *)copyinfo[idx].cookie);
3598 		}
3599 	}
3600 	bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3601 }
3602 
3603 int
3604 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3605     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3606     uint_t num_copyinfo, int *fault)
3607 {
3608 	uint_t idx, nused;
3609 	size_t n, off, remaining;
3610 	vm_client_t *vmc = vm_get_vmclient(vm, vcpuid);
3611 
3612 	bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3613 
3614 	nused = 0;
3615 	remaining = len;
3616 	while (remaining > 0) {
3617 		uint64_t gpa;
3618 		int error;
3619 
3620 		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3621 		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3622 		if (error || *fault)
3623 			return (error);
3624 		off = gpa & PAGEOFFSET;
3625 		n = min(remaining, PAGESIZE - off);
3626 		copyinfo[nused].gpa = gpa;
3627 		copyinfo[nused].len = n;
3628 		remaining -= n;
3629 		gla += n;
3630 		nused++;
3631 	}
3632 
3633 	for (idx = 0; idx < nused; idx++) {
3634 		vm_page_t *vmp;
3635 		caddr_t hva;
3636 
3637 		vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot);
3638 		if (vmp == NULL) {
3639 			break;
3640 		}
3641 		if ((prot & PROT_WRITE) != 0) {
3642 			hva = (caddr_t)vmp_get_writable(vmp);
3643 		} else {
3644 			hva = (caddr_t)vmp_get_readable(vmp);
3645 		}
3646 		copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET);
3647 		copyinfo[idx].cookie = vmp;
3648 		copyinfo[idx].prot = prot;
3649 	}
3650 
3651 	if (idx != nused) {
3652 		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3653 		return (EFAULT);
3654 	} else {
3655 		*fault = 0;
3656 		return (0);
3657 	}
3658 }
3659 
3660 void
3661 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3662     size_t len)
3663 {
3664 	char *dst;
3665 	int idx;
3666 
3667 	dst = kaddr;
3668 	idx = 0;
3669 	while (len > 0) {
3670 		ASSERT(copyinfo[idx].prot & PROT_READ);
3671 
3672 		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3673 		len -= copyinfo[idx].len;
3674 		dst += copyinfo[idx].len;
3675 		idx++;
3676 	}
3677 }
3678 
3679 void
3680 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3681     struct vm_copyinfo *copyinfo, size_t len)
3682 {
3683 	const char *src;
3684 	int idx;
3685 
3686 	src = kaddr;
3687 	idx = 0;
3688 	while (len > 0) {
3689 		ASSERT(copyinfo[idx].prot & PROT_WRITE);
3690 
3691 		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3692 		len -= copyinfo[idx].len;
3693 		src += copyinfo[idx].len;
3694 		idx++;
3695 	}
3696 }
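
/*
 * A minimal sketch of how the copy helpers above are typically chained to
 * read guest memory through a guest-linear address, assuming 'paging', 'gla',
 * and 'buf' are provided by the caller:
 *
 *	struct vm_copyinfo copyinfo[2];
 *	int error, fault;
 *
 *	error = vm_copy_setup(vm, vcpuid, &paging, gla, sizeof (buf),
 *	    PROT_READ, copyinfo, nitems(copyinfo), &fault);
 *	if (error == 0 && fault == 0) {
 *		vm_copyin(vm, vcpuid, copyinfo, &buf, sizeof (buf));
 *		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 *	}
 */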
3697 
3698 /*
3699  * Return the amount of in-use and wired memory for the VM. Since
3700  * these are global stats, only return the values with for vCPU 0
3701  * these are global stats, only return the values for vCPU 0.
3702 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3703 
3704 static void
3705 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3706 {
3707 	if (vcpu == 0) {
3708 		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3709 		    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3710 	}
3711 }
3712 
3713 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3714 
3715 int
3716 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3717     uint8_t bytes, uint32_t *val)
3718 {
3719 	return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3720 }
3721 
3722 /*
3723  * bhyve-internal interfaces to attach or detach IO port handlers.
3724  * Must be called with VM write lock held for safety.
3725  */
3726 int
3727 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3728     void **cookie)
3729 {
3730 	int err;
3731 	err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3732 	if (err == 0) {
3733 		*cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3734 	}
3735 	return (err);
3736 }
3737 int
3738 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3739     void **old_arg)
3740 {
3741 	uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3742 	int err;
3743 
3744 	err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3745 	if (err == 0) {
3746 		*cookie = NULL;
3747 	}
3748 	return (err);
3749 }
3750 
3751 /*
3752  * External driver interfaces to attach or detach IO port handlers.
3753  * Must be called with VM write lock held for safety.
3754  */
3755 int
3756 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3757     void *arg, void **cookie)
3758 {
3759 	int err;
3760 
3761 	if (port == 0) {
3762 		return (EINVAL);
3763 	}
3764 
3765 	err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3766 	if (err == 0) {
3767 		*cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3768 	}
3769 	return (err);
3770 }
3771 void
3772 vm_ioport_unhook(struct vm *vm, void **cookie)
3773 {
3774 	uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3775 	ioport_handler_t old_func;
3776 	void *old_arg;
3777 	int err;
3778 
3779 	err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3780 
3781 	/* ioport-hook-using drivers are expected to be well-behaved */
3782 	VERIFY0(err);
3783 	VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3784 
3785 	*cookie = NULL;
3786 }
3787 
3788 int
3789 vmm_kstat_update_vcpu(struct kstat *ksp, int rw)
3790 {
3791 	struct vm *vm = ksp->ks_private;
3792 	vmm_vcpu_kstats_t *vvk = ksp->ks_data;
3793 	const int vcpuid = vvk->vvk_vcpu.value.ui32;
3794 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3795 
3796 	ASSERT3U(vcpuid, <, VM_MAXCPU);
3797 
3798 	vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT];
3799 	vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN];
3800 	vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE];
3801 	vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN];
3802 	vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER];
3803 	vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED];
3804 
3805 	return (0);
3806 }
3807 
3808 SET_DECLARE(vmm_data_version_entries, const vmm_data_version_entry_t);
3809 
3810 static inline bool
3811 vmm_data_is_cpu_specific(uint16_t data_class)
3812 {
3813 	switch (data_class) {
3814 	case VDC_REGISTER:
3815 	case VDC_MSR:
3816 	case VDC_FPU:
3817 	case VDC_LAPIC:
3818 		return (true);
3819 	default:
3820 		break;
3821 	}
3822 
3823 	return (false);
3824 }
3825 
3826 static int
3827 vmm_data_find(const vmm_data_req_t *req, const vmm_data_version_entry_t **resp)
3828 {
3829 	const vmm_data_version_entry_t **vdpp, *vdp;
3830 
3831 	ASSERT(resp != NULL);
3832 	ASSERT(req->vdr_result_len != NULL);
3833 
3834 	SET_FOREACH(vdpp, vmm_data_version_entries) {
3835 		vdp = *vdpp;
3836 		if (vdp->vdve_class == req->vdr_class &&
3837 		    vdp->vdve_version == req->vdr_version) {
3838 			/*
3839 			 * Enforce any data length expectation expressed by the
3840 			 * provider for this data.
3841 			 */
3842 			if (vdp->vdve_len_expect != 0 &&
3843 			    vdp->vdve_len_expect > req->vdr_len) {
3844 				*req->vdr_result_len = vdp->vdve_len_expect;
3845 				return (ENOSPC);
3846 			}
3847 			*resp = vdp;
3848 			return (0);
3849 		}
3850 	}
3851 	return (EINVAL);
3852 }
3853 
3854 static void *
3855 vmm_data_from_class(const vmm_data_req_t *req, struct vm *vm, int vcpuid)
3856 {
3857 	switch (req->vdr_class) {
3858 		/* per-cpu data/devices */
3859 	case VDC_LAPIC:
3860 		return (vm_lapic(vm, vcpuid));
3861 	case VDC_VMM_ARCH:
3862 		return (vm);
3863 
3864 	case VDC_FPU:
3865 	case VDC_REGISTER:
3866 	case VDC_MSR:
3867 		/*
3868 		 * These have per-CPU handling which is dispatched outside
3869 		 * the vmm_data_version_entries listing.
3870 		 */
3871 		return (NULL);
3872 
3873 		/* system-wide data/devices */
3874 	case VDC_IOAPIC:
3875 		return (vm->vioapic);
3876 	case VDC_ATPIT:
3877 		return (vm->vatpit);
3878 	case VDC_ATPIC:
3879 		return (vm->vatpic);
3880 	case VDC_HPET:
3881 		return (vm->vhpet);
3882 	case VDC_PM_TIMER:
3883 		return (vm->vpmtmr);
3884 	case VDC_RTC:
3885 		return (vm->vrtc);
3886 
3887 	default:
3888 		/* The data class will have been validated by now */
3889 		panic("Unexpected class %u", req->vdr_class);
3890 	}
3891 }
3892 
3893 const uint32_t arch_msr_iter[] = {
3894 	MSR_EFER,
3895 
3896 	/*
3897 	 * While gsbase and fsbase are accessible via the MSR accessors, they
3898 	 * are not included in MSR iteration since they are covered by the
3899 	 * segment descriptor interface too.
3900 	 */
3901 	MSR_KGSBASE,
3902 
3903 	MSR_STAR,
3904 	MSR_LSTAR,
3905 	MSR_CSTAR,
3906 	MSR_SF_MASK,
3907 
3908 	MSR_SYSENTER_CS_MSR,
3909 	MSR_SYSENTER_ESP_MSR,
3910 	MSR_SYSENTER_EIP_MSR,
3911 	MSR_PAT,
3912 };
3913 const uint32_t generic_msr_iter[] = {
3914 	MSR_TSC,
3915 	MSR_MTRRcap,
3916 	MSR_MTRRdefType,
3917 
3918 	MSR_MTRR4kBase, MSR_MTRR4kBase + 1, MSR_MTRR4kBase + 2,
3919 	MSR_MTRR4kBase + 3, MSR_MTRR4kBase + 4, MSR_MTRR4kBase + 5,
3920 	MSR_MTRR4kBase + 6, MSR_MTRR4kBase + 7,
3921 
3922 	MSR_MTRR16kBase, MSR_MTRR16kBase + 1,
3923 
3924 	MSR_MTRR64kBase,
3925 };
3926 
3927 static int
3928 vmm_data_read_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
3929 {
3930 	VERIFY3U(req->vdr_class, ==, VDC_MSR);
3931 	VERIFY3U(req->vdr_version, ==, 1);
3932 
3933 	const uint_t num_msrs = nitems(arch_msr_iter) + nitems(generic_msr_iter)
3934 	    + (VMM_MTRR_VAR_MAX * 2);
3935 	const uint32_t output_len =
3936 	    num_msrs * sizeof (struct vdi_field_entry_v1);
3937 	*req->vdr_result_len = output_len;
3938 
3939 	if (req->vdr_len < output_len) {
3940 		return (ENOSPC);
3941 	}
3942 
3943 	struct vdi_field_entry_v1 *entryp = req->vdr_data;
3944 	for (uint_t i = 0; i < nitems(arch_msr_iter); i++, entryp++) {
3945 		const uint32_t msr = arch_msr_iter[i];
3946 		uint64_t val = 0;
3947 
3948 		int err = ops->vmgetmsr(vm->cookie, vcpuid, msr, &val);
3949 		/* All of these MSRs are expected to work */
3950 		VERIFY0(err);
3951 		entryp->vfe_ident = msr;
3952 		entryp->vfe_value = val;
3953 	}
3954 
3955 	struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr;
3956 	for (uint_t i = 0; i < nitems(generic_msr_iter); i++, entryp++) {
3957 		const uint32_t msr = generic_msr_iter[i];
3958 
3959 		entryp->vfe_ident = msr;
3960 		switch (msr) {
3961 		case MSR_TSC:
3962 			/*
3963 			 * Communicate this as the vCPU's offset relative to the
3964 			 * VM-wide boot-time offset.
3965 			 */
3966 			entryp->vfe_value = vm->vcpu[vcpuid].tsc_offset;
3967 			break;
3968 		case MSR_MTRRcap:
3969 		case MSR_MTRRdefType:
3970 		case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
3971 		case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
3972 		case MSR_MTRR64kBase: {
3973 			int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value);
3974 			VERIFY0(err);
3975 			break;
3976 		}
3977 		default:
3978 			panic("unexpected msr export %x", msr);
3979 		}
3980 	}
3981 	/* Copy the variable MTRRs */
3982 	for (uint_t i = 0; i < (VMM_MTRR_VAR_MAX * 2); i++, entryp++) {
3983 		const uint32_t msr = MSR_MTRRVarBase + i;
3984 
3985 		entryp->vfe_ident = msr;
3986 		int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value);
3987 		VERIFY0(err);
3988 	}
3989 	return (0);
3990 }
3991 
3992 static int
3993 vmm_data_write_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
3994 {
3995 	VERIFY3U(req->vdr_class, ==, VDC_MSR);
3996 	VERIFY3U(req->vdr_version, ==, 1);
3997 
3998 	const struct vdi_field_entry_v1 *entryp = req->vdr_data;
3999 	const uint_t entry_count =
4000 	    req->vdr_len / sizeof (struct vdi_field_entry_v1);
4001 	struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr;
4002 
4003 	/*
4004 	 * First make sure that all of the MSRs can be manipulated.
4005 	 * For now, this check is done by going through the getmsr handler.
4006 	 */
4007 	for (uint_t i = 0; i < entry_count; i++, entryp++) {
4008 		const uint32_t msr = entryp->vfe_ident;
4009 		uint64_t val;
4010 		int err = 0;
4011 
4012 		switch (msr) {
4013 		case MSR_TSC:
4014 			break;
4015 		default:
4016 			if (is_mtrr_msr(msr)) {
4017 				err = vm_rdmtrr(mtrr, msr, &val);
4018 			} else {
4019 				err = ops->vmgetmsr(vm->cookie, vcpuid, msr,
4020 				    &val);
4021 			}
4022 			break;
4023 		}
4024 		if (err != 0) {
4025 			return (err);
4026 		}
4027 	}
4028 
4029 	/*
4030 	 * Now that the 'set' operations all appear to target valid MSRs,
4031 	 * proceed with applying the new values.
4032 	 */
4033 	entryp = req->vdr_data;
4034 	for (uint_t i = 0; i < entry_count; i++, entryp++) {
4035 		const uint32_t msr = entryp->vfe_ident;
4036 		const uint64_t val = entryp->vfe_value;
4037 		int err = 0;
4038 
4039 		switch (msr) {
4040 		case MSR_TSC:
4041 			vm->vcpu[vcpuid].tsc_offset = val;
4042 			break;
4043 		default:
4044 			if (is_mtrr_msr(msr)) {
4045 				if (msr == MSR_MTRRcap) {
4046 					/*
4047 					 * MTRRcap is read-only.  If the current
4048 					 * value matches the incoming one,
4049 					 * consider it a success.
4050 					 */
4051 					uint64_t comp;
4052 					err = vm_rdmtrr(mtrr, msr, &comp);
4053 					if (err != 0 || comp != val) {
4054 						err = EINVAL;
4055 					}
4056 				} else {
4057 					err = vm_wrmtrr(mtrr, msr, val);
4058 				}
4059 			} else {
4060 				err = ops->vmsetmsr(vm->cookie, vcpuid, msr,
4061 				    val);
4062 			}
4063 			break;
4064 		}
4065 		if (err != 0) {
4066 			return (err);
4067 		}
4068 	}
4069 	*req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1);
4070 
4071 	return (0);
4072 }
4073 
4074 static const vmm_data_version_entry_t msr_v1 = {
4075 	.vdve_class = VDC_MSR,
4076 	.vdve_version = 1,
4077 	.vdve_len_per_item = sizeof (struct vdi_field_entry_v1),
4078 	.vdve_vcpu_readf = vmm_data_read_msrs,
4079 	.vdve_vcpu_writef = vmm_data_write_msrs,
4080 };
4081 VMM_DATA_VERSION(msr_v1);
4082 
4083 static const uint32_t vmm_arch_v1_fields[] = {
4084 	VAI_TSC_BOOT_OFFSET,
4085 	VAI_BOOT_HRTIME,
4086 	VAI_TSC_FREQ,
4087 	VAI_VM_IS_PAUSED,
4088 };
4089 
4090 static const uint32_t vmm_arch_v1_vcpu_fields[] = {
4091 	VAI_PEND_NMI,
4092 	VAI_PEND_EXTINT,
4093 	VAI_PEND_EXCP,
4094 	VAI_PEND_INTINFO,
4095 };
4096 
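/*
 * Read a single VMM-arch field.  A vcpuid of -1 selects the VM-wide fields
 * (boot TSC offset, boot hrtime, TSC frequency, pause state); otherwise the
 * pending-event state of the given vCPU is consulted.  Returns true if the
 * identifier was recognized and *valp was populated.
 */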
4097 static bool
4098 vmm_read_arch_field(struct vm *vm, int vcpuid, uint32_t ident, uint64_t *valp)
4099 {
4100 	ASSERT(valp != NULL);
4101 
4102 	if (vcpuid == -1) {
4103 		switch (ident) {
4104 		case VAI_TSC_BOOT_OFFSET:
4105 			*valp = vm->boot_tsc_offset;
4106 			return (true);
4107 		case VAI_BOOT_HRTIME:
4108 			*valp = vm->boot_hrtime;
4109 			return (true);
4110 		case VAI_TSC_FREQ:
4111 			/*
4112 			 * Since the system TSC calibration is not public, just
4113 			 * derive it from the scaling functions available.
4114 			 */
4115 			*valp = unscalehrtime(NANOSEC);
4116 			return (true);
4117 		case VAI_VM_IS_PAUSED:
4118 			*valp = vm->is_paused ? 1 : 0;
4119 			return (true);
4120 		default:
4121 			break;
4122 		}
4123 	} else {
4124 		VERIFY(vcpuid >= 0 && vcpuid < VM_MAXCPU);
4125 
4126 		struct vcpu *vcpu = &vm->vcpu[vcpuid];
4127 		switch (ident) {
4128 		case VAI_PEND_NMI:
4129 			*valp = vcpu->nmi_pending != 0 ? 1 : 0;
4130 			return (true);
4131 		case VAI_PEND_EXTINT:
4132 			*valp = vcpu->extint_pending != 0 ? 1 : 0;
4133 			return (true);
4134 		case VAI_PEND_EXCP:
4135 			*valp = vcpu->exc_pending;
4136 			return (true);
4137 		case VAI_PEND_INTINFO:
4138 			*valp = vcpu->exit_intinfo;
4139 			return (true);
4140 		default:
4141 			break;
4142 		}
4143 	}
4144 	return (false);
4145 }
4146 
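/*
 * Read VDC_VMM_ARCH data.  If the caller copied in specific identifiers
 * (VDX_FLAG_READ_COPYIN), only those fields are returned; otherwise every
 * field appropriate to the vcpuid (-1 for VM-wide) is emitted.
 */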
4147 static int
4148 vmm_data_read_varch(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
4149 {
4150 	VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH);
4151 	VERIFY3U(req->vdr_version, ==, 1);
4152 
4153 	/* vcpuid -1 selects the VM-wide fields; otherwise it must be valid */
4154 	if (vcpuid != -1 && (vcpuid < 0 || vcpuid >= VM_MAXCPU)) {
4155 		return (EINVAL);
4156 	}
4157 
4158 	struct vdi_field_entry_v1 *entryp = req->vdr_data;
4159 
4160 	/* Specific fields requested */
4161 	if ((req->vdr_flags & VDX_FLAG_READ_COPYIN) != 0) {
4162 		const uint_t count =
4163 		    req->vdr_len / sizeof (struct vdi_field_entry_v1);
4164 
4165 		for (uint_t i = 0; i < count; i++, entryp++) {
4166 			if (!vmm_read_arch_field(vm, vcpuid, entryp->vfe_ident,
4167 			    &entryp->vfe_value)) {
4168 				return (EINVAL);
4169 			}
4170 		}
4171 		*req->vdr_result_len =
4172 		    count * sizeof (struct vdi_field_entry_v1);
4173 		return (0);
4174 	}
4175 
4176 	/* Emit all of the possible values */
4177 	const uint32_t *idents;
4178 	uint_t ident_count;
4179 
4180 	if (vcpuid == -1) {
4181 		idents = vmm_arch_v1_fields;
4182 		ident_count = nitems(vmm_arch_v1_fields);
4183 	} else {
4184 		idents = vmm_arch_v1_vcpu_fields;
4185 		ident_count = nitems(vmm_arch_v1_vcpu_fields);
4187 	}
4188 
4189 	const uint32_t total_size =
4190 	    ident_count * sizeof (struct vdi_field_entry_v1);
4191 
4192 	*req->vdr_result_len = total_size;
4193 	if (req->vdr_len < total_size) {
4194 		return (ENOSPC);
4195 	}
4196 	for (uint_t i = 0; i < ident_count; i++, entryp++) {
4197 		entryp->vfe_ident = idents[i];
4198 		VERIFY(vmm_read_arch_field(vm, vcpuid, entryp->vfe_ident,
4199 		    &entryp->vfe_value));
4200 	}
4201 	return (0);
4202 }
4203 
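/*
 * Apply per-vCPU VDC_VMM_ARCH writes: pending NMI, ExtINT, exception, and
 * exit-intinfo state.  Malformed event payloads are rejected with EINVAL.
 */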
4204 static int
4205 vmm_data_write_varch_vcpu(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
4206 {
4207 	VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH);
4208 	VERIFY3U(req->vdr_version, ==, 1);
4209 
4210 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU) {
4211 		return (EINVAL);
4212 	}
4213 
4214 	const struct vdi_field_entry_v1 *entryp = req->vdr_data;
4215 	const uint_t entry_count =
4216 	    req->vdr_len / sizeof (struct vdi_field_entry_v1);
4217 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
4218 
4219 	for (uint_t i = 0; i < entry_count; i++, entryp++) {
4220 		const uint64_t val = entryp->vfe_value;
4221 
4222 		switch (entryp->vfe_ident) {
4223 		case VAI_PEND_NMI:
4224 			vcpu->nmi_pending = (val != 0);
4225 			break;
4226 		case VAI_PEND_EXTINT:
4227 			vcpu->extint_pending = (val != 0);
4228 			break;
4229 		case VAI_PEND_EXCP:
4230 			if (!VM_INTINFO_PENDING(val)) {
4231 				vcpu->exc_pending = 0;
4232 			} else if (VM_INTINFO_TYPE(val) != VM_INTINFO_HWEXCP ||
4233 			    (val & VM_INTINFO_MASK_RSVD) != 0) {
4234 				/* reject improperly-formed hw exception */
4235 				return (EINVAL);
4236 			} else {
4237 				vcpu->exc_pending = val;
4238 			}
4239 			break;
4240 		case VAI_PEND_INTINFO:
4241 			if (vm_exit_intinfo(vm, vcpuid, val) != 0) {
4242 				return (EINVAL);
4243 			}
4244 			break;
4245 		default:
4246 			return (EINVAL);
4247 		}
4248 	}
4249 
4250 	*req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1);
4251 	return (0);
4252 }
4253 
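/*
 * Apply VDC_VMM_ARCH writes.  Per-vCPU requests are delegated to
 * vmm_data_write_varch_vcpu().  VM-wide requests may update the boot TSC
 * offset and boot hrtime; the remaining fields are read-only through this
 * interface and are refused with EPERM.
 */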
4254 static int
4255 vmm_data_write_varch(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
4256 {
4257 	VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH);
4258 	VERIFY3U(req->vdr_version, ==, 1);
4259 
4260 	/* per-vCPU fields are handled separately from VM-wide ones */
4261 	if (vcpuid != -1) {
4262 		return (vmm_data_write_varch_vcpu(vm, vcpuid, req));
4263 	}
4264 
4265 	const struct vdi_field_entry_v1 *entryp = req->vdr_data;
4266 	const uint_t entry_count =
4267 	    req->vdr_len / sizeof (struct vdi_field_entry_v1);
4268 
4269 	for (uint_t i = 0; i < entry_count; i++, entryp++) {
4270 		const uint64_t val = entryp->vfe_value;
4271 
4272 		switch (entryp->vfe_ident) {
4273 		case VAI_TSC_BOOT_OFFSET:
4274 			vm->boot_tsc_offset = val;
4275 			break;
4276 		case VAI_BOOT_HRTIME:
4277 			vm->boot_hrtime = val;
4278 			break;
4279 		case VAI_TSC_FREQ:
4280 			/* Guest TSC frequency not (currently) adjustable */
4281 			return (EPERM);
4282 		case VAI_VM_IS_PAUSED:
4283 			/*
4284 			 * The VM_PAUSE and VM_RESUME ioctls are the officially
4285 			 * sanctioned mechanisms for setting the is-paused state
4286 			 * of the VM.
4287 			 */
4288 			return (EPERM);
4289 		default:
4290 			return (EINVAL);
4291 		}
4292 	}
4293 	*req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1);
4294 	return (0);
4295 }
4296 
4297 static const vmm_data_version_entry_t vmm_arch_v1 = {
4298 	.vdve_class = VDC_VMM_ARCH,
4299 	.vdve_version = 1,
4300 	.vdve_len_per_item = sizeof (struct vdi_field_entry_v1),
4301 	.vdve_vcpu_readf = vmm_data_read_varch,
4302 	.vdve_vcpu_writef = vmm_data_write_varch,
4303 };
4304 VMM_DATA_VERSION(vmm_arch_v1);
4305 
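/*
 * Emit a vdi_version_entry_v1 record for every class/version pair registered
 * via VMM_DATA_VERSION(), describing the payload sizes each one expects.
 */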
4306 static int
4307 vmm_data_read_versions(void *arg, const vmm_data_req_t *req)
4308 {
4309 	VERIFY3U(req->vdr_class, ==, VDC_VERSION);
4310 	VERIFY3U(req->vdr_version, ==, 1);
4311 
4312 	const uint32_t total_size = SET_COUNT(vmm_data_version_entries) *
4313 	    sizeof (struct vdi_version_entry_v1);
4314 
4315 	/* Make sure there is room for all of the entries */
4316 	*req->vdr_result_len = total_size;
4317 	if (req->vdr_len < *req->vdr_result_len) {
4318 		return (ENOSPC);
4319 	}
4320 
4321 	struct vdi_version_entry_v1 *entryp = req->vdr_data;
4322 	const vmm_data_version_entry_t **vdpp;
4323 	SET_FOREACH(vdpp, vmm_data_version_entries) {
4324 		const vmm_data_version_entry_t *vdp = *vdpp;
4325 
4326 		entryp->vve_class = vdp->vdve_class;
4327 		entryp->vve_version = vdp->vdve_version;
4328 		entryp->vve_len_expect = vdp->vdve_len_expect;
4329 		entryp->vve_len_per_item = vdp->vdve_len_per_item;
4330 		entryp++;
4331 	}
4332 	return (0);
4333 }
4334 
4335 static int
4336 vmm_data_write_versions(void *arg, const vmm_data_req_t *req)
4337 {
4338 	/* Writing to the version information makes no sense */
4339 	return (EPERM);
4340 }
4341 
4342 static const vmm_data_version_entry_t versions_v1 = {
4343 	.vdve_class = VDC_VERSION,
4344 	.vdve_version = 1,
4345 	.vdve_len_per_item = sizeof (struct vdi_version_entry_v1),
4346 	.vdve_readf = vmm_data_read_versions,
4347 	.vdve_writef = vmm_data_write_versions,
4348 };
4349 VMM_DATA_VERSION(versions_v1);
4350 
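/*
 * Entry point for vmm-data reads.  The vcpuid is validated for cpu-specific
 * classes, the matching class/version handler is located, and the request is
 * dispatched to either its whole-VM or its vCPU-aware read routine.
 */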
4351 int
4352 vmm_data_read(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
4353 {
4354 	int err = 0;
4355 
4356 	if (vmm_data_is_cpu_specific(req->vdr_class)) {
4357 		if (vcpuid < 0 || vcpuid >= VM_MAXCPU) {
4358 			return (EINVAL);
4359 		}
4360 	}
4361 
4362 	const vmm_data_version_entry_t *entry = NULL;
4363 	err = vmm_data_find(req, &entry);
4364 	if (err != 0) {
4365 		return (err);
4366 	}
4367 	ASSERT(entry != NULL);
4368 
4369 	if (entry->vdve_readf != NULL) {
4370 		void *datap = vmm_data_from_class(req, vm, vcpuid);
4371 
4372 		err = entry->vdve_readf(datap, req);
4373 	} else if (entry->vdve_vcpu_readf != NULL) {
4374 		err = entry->vdve_vcpu_readf(vm, vcpuid, req);
4375 	} else {
4376 		err = EINVAL;
4377 	}
4378 
4379 	/*
4380 	 * Successful reads of fixed-length data should populate the length of
4381 	 * that result.
4382 	 */
4383 	if (err == 0 && entry->vdve_len_expect != 0) {
4384 		*req->vdr_result_len = entry->vdve_len_expect;
4385 	}
4386 
4387 	return (err);
4388 }
4389 
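/*
 * Entry point for vmm-data writes; mirrors vmm_data_read(), dispatching to
 * the write routines of the matching class/version handler.
 */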
4390 int
4391 vmm_data_write(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
4392 {
4393 	int err = 0;
4394 
4395 	if (vmm_data_is_cpu_specific(req->vdr_class)) {
4396 		if (vcpuid < 0 || vcpuid >= VM_MAXCPU) {
4397 			return (EINVAL);
4398 		}
4399 	}
4400 
4401 	const vmm_data_version_entry_t *entry = NULL;
4402 	err = vmm_data_find(req, &entry);
4403 	if (err != 0) {
4404 		return (err);
4405 	}
4406 	ASSERT(entry != NULL);
4407 
4408 	if (entry->vdve_writef != NULL) {
4409 		void *datap = vmm_data_from_class(req, vm, vcpuid);
4410 
4411 		err = entry->vdve_writef(datap, req);
4412 	} else if (entry->vdve_vcpu_writef != NULL) {
4413 		err = entry->vdve_vcpu_writef(vm, vcpuid, req);
4414 	} else {
4415 		err = EINVAL;
4416 	}
4417 
4418 	/*
4419 	 * Successful writes of fixed-length data should populate the length of
4420 	 * that result.
4421 	 */
4422 	if (err == 0 && entry->vdve_len_expect != 0) {
4423 		*req->vdr_result_len = entry->vdve_len_expect;
4424 	}
4425 
4426 	return (err);
4427 }
4428