xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_sol_dev.c (revision 70143b9f)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2015 Pluribus Networks Inc.
15  * Copyright 2019 Joyent, Inc.
16  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17  * Copyright 2022 Oxide Computer Company
18  */
19 
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35 
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40 
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48 #include <sys/vmm_reservoir.h>
49 
50 #include <vm/seg_dev.h>
51 
52 #include "io/ppt.h"
53 #include "io/vatpic.h"
54 #include "io/vioapic.h"
55 #include "io/vrtc.h"
56 #include "io/vhpet.h"
57 #include "io/vpmtmr.h"
58 #include "vmm_lapic.h"
59 #include "vmm_stat.h"
60 #include "vmm_util.h"
61 
62 /*
63  * Locking details:
64  *
65  * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
66  * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
67  * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
68  * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
69  * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
70  */
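
/*
 * To illustrate the ordering rule above (a sketch of existing practice, not
 * an additional requirement): a path which needs both locks, such as the
 * hold acquisition in vmm_drv_hold() below, acquires them as:
 *
 *	mutex_enter(&vmmdev_mtx);
 *	mutex_enter(&vmm_mtx);
 *	...
 *	mutex_exit(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 *
 * Only the acquisition order matters; the locks may be dropped in any order
 * (vmm_drv_hold() drops vmmdev_mtx first).
 */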
71 
72 static kmutex_t		vmmdev_mtx;
73 static dev_info_t	*vmmdev_dip;
74 static hma_reg_t	*vmmdev_hma_reg;
75 static uint_t		vmmdev_hma_ref;
76 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
77 
78 static kmutex_t		vmm_mtx;
79 static list_t		vmm_list;
80 static list_t		vmm_destroy_list;
81 static id_space_t	*vmm_minors;
82 static void		*vmm_statep;
83 
84 /* temporary safety switch */
85 int		vmm_allow_state_writes;
86 
87 static const char *vmmdev_hvm_name = "bhyve";
88 
89 /* For sdev plugin (/dev) */
90 #define	VMM_SDEV_ROOT "/dev/vmm"
91 
92 /* From uts/intel/io/vmm/intel/vmx.c */
93 extern int vmx_x86_supported(const char **);
94 
95 /* Holds and hooks from drivers external to vmm */
96 struct vmm_hold {
97 	list_node_t	vmh_node;
98 	vmm_softc_t	*vmh_sc;
99 	boolean_t	vmh_release_req;
100 	uint_t		vmh_ioport_hook_cnt;
101 };
102 
103 struct vmm_lease {
104 	list_node_t		vml_node;
105 	struct vm		*vml_vm;
106 	vm_client_t		*vml_vmclient;
107 	boolean_t		vml_expired;
108 	boolean_t		vml_break_deferred;
109 	boolean_t		(*vml_expire_func)(void *);
110 	void			*vml_expire_arg;
111 	struct vmm_hold		*vml_hold;
112 };
113 
114 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
115 static void vmm_lease_block(vmm_softc_t *);
116 static void vmm_lease_unblock(vmm_softc_t *);
117 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
118 static void vmm_kstat_init(vmm_softc_t *);
119 static void vmm_kstat_fini(vmm_softc_t *);
120 
121 /*
122  * The 'devmem' hack:
123  *
124  * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
125  * in the vm which appear with their own name related to the vm under /dev.
126  * Since this would be a hassle from an sdev perspective and would require a
127  * new cdev interface (or complicate the existing one), we choose to implement
128  * this in a different manner.  Direct access to the underlying vm memory
129  * segments is exposed by placing them in a range of offsets beyond the normal
130  * guest memory space.  Userspace can query the appropriate offset to mmap()
131  * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
132  */
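
/*
 * For illustration only: a userspace consumer of this scheme would look up
 * the mmap(2) offset for a devmem segment and map it through the VM's open
 * descriptor roughly as follows (hypothetical sketch; 'vmfd', 'segid',
 * 'seg_len', and error handling are assumed):
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		void *base = mmap(NULL, seg_len, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, vmfd, vdo.offset);
 *	}
 */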
133 
134 static vmm_devmem_entry_t *
135 vmmdev_devmem_find(vmm_softc_t *sc, int segid)
136 {
137 	vmm_devmem_entry_t *ent = NULL;
138 	list_t *dl = &sc->vmm_devmem_list;
139 
140 	for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) {
141 		if (ent->vde_segid == segid) {
142 			return (ent);
143 		}
144 	}
145 	return (NULL);
146 }
147 
148 static int
149 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
150 {
151 	int error;
152 	bool sysmem;
153 
154 	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
155 	    NULL);
156 	if (error || mseg->len == 0)
157 		return (error);
158 
159 	if (!sysmem) {
160 		vmm_devmem_entry_t *de;
161 
162 		de = vmmdev_devmem_find(sc, mseg->segid);
163 		if (de != NULL) {
164 			(void) strlcpy(mseg->name, de->vde_name,
165 			    sizeof (mseg->name));
166 		}
167 	} else {
168 		bzero(mseg->name, sizeof (mseg->name));
169 	}
170 
171 	return (error);
172 }
173 
174 static int
175 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
176 {
177 	off_t map_offset;
178 	vmm_devmem_entry_t *entry;
179 
180 	if (list_is_empty(&sc->vmm_devmem_list)) {
181 		map_offset = VM_DEVMEM_START;
182 	} else {
183 		entry = list_tail(&sc->vmm_devmem_list);
184 		map_offset = entry->vde_off + entry->vde_len;
185 		if (map_offset < entry->vde_off) {
186 			/* Do not tolerate overflow */
187 			return (ERANGE);
188 		}
189 		/*
190 		 * XXXJOY: We could choose to search the list for duplicate
191 		 * names and toss an error.  Since we're using the offset
192 		 * method for now, it does not make much of a difference.
193 		 */
194 	}
195 
196 	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
197 	entry->vde_segid = mseg->segid;
198 	entry->vde_len = mseg->len;
199 	entry->vde_off = map_offset;
200 	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
201 	list_insert_tail(&sc->vmm_devmem_list, entry);
202 
203 	return (0);
204 }
205 
206 static boolean_t
207 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
208     off_t *map_offp)
209 {
210 	list_t *dl = &sc->vmm_devmem_list;
211 	vmm_devmem_entry_t *de = NULL;
212 	const off_t map_end = off + len;
213 
214 	VERIFY(off >= VM_DEVMEM_START);
215 
216 	if (map_end < off) {
217 		/* No match on overflow */
218 		return (B_FALSE);
219 	}
220 
221 	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
222 		const off_t item_end = de->vde_off + de->vde_len;
223 
224 		if (de->vde_off <= off && item_end >= map_end) {
225 			*segidp = de->vde_segid;
226 			*map_offp = off - de->vde_off;
227 			return (B_TRUE);
228 		}
229 	}
230 	return (B_FALSE);
231 }
232 
233 static void
234 vmmdev_devmem_purge(vmm_softc_t *sc)
235 {
236 	vmm_devmem_entry_t *entry;
237 
238 	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
239 		kmem_free(entry, sizeof (*entry));
240 	}
241 }
242 
243 static int
244 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
245 {
246 	int error;
247 	bool sysmem = true;
248 
249 	if (VM_MEMSEG_NAME(mseg)) {
250 		sysmem = false;
251 	}
252 	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
253 
254 	if (error == 0) {
255 		/*
256 		 * Rather than create a whole fresh device from which userspace
257 		 * can mmap this segment, instead make it available at an
258 		 * offset above where the main guest memory resides.
259 		 */
260 		error = vmmdev_devmem_create(sc, mseg, mseg->name);
261 		if (error != 0) {
262 			vm_free_memseg(sc->vmm_vm, mseg->segid);
263 		}
264 	}
265 	return (error);
266 }
267 
268 /*
269  * Resource Locking and Exclusion
270  *
271  * Much of bhyve depends on key portions of VM state, such as the guest memory
272  * map, to remain unchanged while the guest is running.  As ported from
273  * FreeBSD, the initial strategy for this resource exclusion hinged on gating
274  * access to the instance vCPUs.  Threads acting on a single vCPU, like those
275  * performing the work of actually running the guest in VMX/SVM, would lock
276  * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
277  * state, all of the vCPUs would be first locked, ensuring that the
278  * operation(s) could complete without any other threads stumbling into
279  * intermediate states.
280  *
281  * This approach is largely effective for bhyve.  Common operations, such as
282  * running the vCPUs, steer clear of lock contention.  The model begins to
283  * break down for operations which do not occur in the context of a specific
284  * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
285  * thread in the bhyve process.  In order to properly protect those vCPU-less
286  * operations from encountering invalid states, additional locking is required.
287  * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
288  * It does mean that class of operations will be serialized on locking the
289  * specific vCPU and that instances sized at VM_MAXCPU will potentially see
290  * undue contention on the VM_MAXCPU-1 vCPU.
291  *
292  * In order to address the shortcomings of this model, the concept of a
293  * read/write lock has been added to bhyve.  Operations which change
294  * fundamental aspects of a VM (such as the memory map) must acquire the write
295  * lock, which also implies locking all of the vCPUs and waiting for all read
296  * lock holders to release.  While it increases the cost and waiting time for
297  * those few operations, it allows most hot-path operations on the VM (which
298  * depend on its configuration remaining stable) to occur with minimal locking.
299  *
300  * Consumers of the Driver API (see below) are a special case when it comes to
301  * this locking, since they may hold a read lock via the drv_lease mechanism
302  * for an extended period of time.  Rather than forcing those consumers to
303  * continuously poll for a write lock attempt, the lease system forces them to
304  * provide a release callback to trigger their clean-up (and potential later
305  * reacquisition) of the read lock.
306  */
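
/*
 * As a simplified sketch of what vmmdev_do_ioctl() below does: an operation
 * which changes VM-wide configuration brackets its work with the write lock,
 * while a hot-path query needs only the read lock:
 *
 *	vmm_write_lock(sc);
 *	error = vm_mmap_memseg(sc->vmm_vm, ...);	(VM_MMAP_MEMSEG)
 *	vmm_write_unlock(sc);
 *
 *	vmm_read_lock(sc);
 *	error = vm_mmap_getnext(sc->vmm_vm, ...);	(VM_MMAP_GETNEXT)
 *	vmm_read_unlock(sc);
 */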
307 
308 static void
309 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
310 {
311 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
312 
313 	/*
314 	 * Since this state transition uses from_idle=true, it should not
315 	 * fail, but rather block until it can succeed.
316 	 */
317 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
318 }
319 
320 static void
321 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
322 {
323 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
324 
325 	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
326 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false));
327 }
328 
329 static void
330 vmm_read_lock(vmm_softc_t *sc)
331 {
332 	rw_enter(&sc->vmm_rwlock, RW_READER);
333 }
334 
335 static void
336 vmm_read_unlock(vmm_softc_t *sc)
337 {
338 	rw_exit(&sc->vmm_rwlock);
339 }
340 
341 static void
342 vmm_write_lock(vmm_softc_t *sc)
343 {
344 	int maxcpus;
345 
346 	/* First lock all the vCPUs */
347 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
348 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
349 		vcpu_lock_one(sc, vcpu);
350 	}
351 
352 	/*
353 	 * Block vmm_drv leases from being acquired or held while the VM write
354 	 * lock is held.
355 	 */
356 	vmm_lease_block(sc);
357 
358 	rw_enter(&sc->vmm_rwlock, RW_WRITER);
359 	/*
360 	 * For now, the 'maxcpus' value for an instance is fixed at the
361 	 * compile-time constant of VM_MAXCPU at creation.  If this changes in
362 	 * the future, allowing for dynamic vCPU resource sizing, acquisition
363 	 * of the write lock will need to be wary of such changes.
364 	 */
365 	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
366 }
367 
368 static void
369 vmm_write_unlock(vmm_softc_t *sc)
370 {
371 	int maxcpus;
372 
373 	/* Allow vmm_drv leases to be acquired once write lock is dropped */
374 	vmm_lease_unblock(sc);
375 
376 	/*
377 	 * The VM write lock _must_ be released from the same thread it was
378 	 * acquired in, unlike the read lock.
379 	 */
380 	VERIFY(rw_write_held(&sc->vmm_rwlock));
381 	rw_exit(&sc->vmm_rwlock);
382 
383 	/* Unlock all the vCPUs */
384 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
385 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
386 		vcpu_unlock_one(sc, vcpu);
387 	}
388 }
389 
390 static int
391 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
392     cred_t *credp, int *rvalp)
393 {
394 	int error = 0, vcpu = -1;
395 	void *datap = (void *)arg;
396 	enum vm_lock_type {
397 		LOCK_NONE = 0,
398 		LOCK_VCPU,
399 		LOCK_READ_HOLD,
400 		LOCK_WRITE_HOLD
401 	} lock_type = LOCK_NONE;
402 
403 	/* Acquire any exclusion resources needed for the operation. */
404 	switch (cmd) {
405 	case VM_RUN:
406 	case VM_GET_REGISTER:
407 	case VM_SET_REGISTER:
408 	case VM_GET_SEGMENT_DESCRIPTOR:
409 	case VM_SET_SEGMENT_DESCRIPTOR:
410 	case VM_GET_REGISTER_SET:
411 	case VM_SET_REGISTER_SET:
412 	case VM_INJECT_EXCEPTION:
413 	case VM_GET_CAPABILITY:
414 	case VM_SET_CAPABILITY:
415 	case VM_PPTDEV_MSI:
416 	case VM_PPTDEV_MSIX:
417 	case VM_SET_X2APIC_STATE:
418 	case VM_GLA2GPA:
419 	case VM_GLA2GPA_NOFAULT:
420 	case VM_ACTIVATE_CPU:
421 	case VM_SET_INTINFO:
422 	case VM_GET_INTINFO:
423 	case VM_RESTART_INSTRUCTION:
424 	case VM_SET_KERNEMU_DEV:
425 	case VM_GET_KERNEMU_DEV:
426 	case VM_RESET_CPU:
427 	case VM_GET_RUN_STATE:
428 	case VM_SET_RUN_STATE:
429 	case VM_GET_FPU:
430 	case VM_SET_FPU:
431 		/*
432 		 * Copy in the ID of the vCPU chosen for this operation.
433 		 * Since a nefarious caller could update their struct between
434 		 * this locking and when the rest of the ioctl data is copied
435 		 * in, it is _critical_ that this local 'vcpu' variable be used
436 		 * rather than the in-struct one when performing the ioctl.
437 		 */
438 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
439 			return (EFAULT);
440 		}
441 		if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
442 			return (EINVAL);
443 		}
444 		vcpu_lock_one(sc, vcpu);
445 		lock_type = LOCK_VCPU;
446 		break;
447 
448 	case VM_REINIT:
449 	case VM_BIND_PPTDEV:
450 	case VM_UNBIND_PPTDEV:
451 	case VM_MAP_PPTDEV_MMIO:
452 	case VM_UNMAP_PPTDEV_MMIO:
453 	case VM_ALLOC_MEMSEG:
454 	case VM_MMAP_MEMSEG:
455 	case VM_MUNMAP_MEMSEG:
456 	case VM_WRLOCK_CYCLE:
457 	case VM_PMTMR_LOCATE:
458 		vmm_write_lock(sc);
459 		lock_type = LOCK_WRITE_HOLD;
460 		break;
461 
462 	case VM_GET_MEMSEG:
463 	case VM_MMAP_GETNEXT:
464 	case VM_LAPIC_IRQ:
465 	case VM_INJECT_NMI:
466 	case VM_IOAPIC_ASSERT_IRQ:
467 	case VM_IOAPIC_DEASSERT_IRQ:
468 	case VM_IOAPIC_PULSE_IRQ:
469 	case VM_LAPIC_MSI:
470 	case VM_LAPIC_LOCAL_IRQ:
471 	case VM_GET_X2APIC_STATE:
472 	case VM_RTC_READ:
473 	case VM_RTC_WRITE:
474 	case VM_RTC_SETTIME:
475 	case VM_RTC_GETTIME:
476 	case VM_PPTDEV_DISABLE_MSIX:
477 	case VM_DEVMEM_GETOFFSET:
478 	case VM_TRACK_DIRTY_PAGES:
479 		vmm_read_lock(sc);
480 		lock_type = LOCK_READ_HOLD;
481 		break;
482 
483 	case VM_DATA_READ:
484 	case VM_DATA_WRITE:
485 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
486 			return (EFAULT);
487 		}
488 		if (vcpu == -1) {
489 			/* Access data for VM-wide devices */
490 			vmm_write_lock(sc);
491 			lock_type = LOCK_WRITE_HOLD;
492 		} else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) {
493 			/* Access data associated with a specific vCPU */
494 			vcpu_lock_one(sc, vcpu);
495 			lock_type = LOCK_VCPU;
496 		} else {
497 			return (EINVAL);
498 		}
499 		break;
500 
501 	case VM_GET_GPA_PMAP:
502 	case VM_IOAPIC_PINCOUNT:
503 	case VM_SUSPEND:
504 	case VM_DESC_FPU_AREA:
505 	default:
506 		break;
507 	}
508 
509 	/* Execute the primary logic for the ioctl. */
510 	switch (cmd) {
511 	case VM_RUN: {
512 		struct vm_entry entry;
513 
514 		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
515 			error = EFAULT;
516 			break;
517 		}
518 
519 		if (!(curthread->t_schedflag & TS_VCPU))
520 			smt_mark_as_vcpu();
521 
522 		error = vm_run(sc->vmm_vm, vcpu, &entry);
523 
524 		/*
525 		 * Unexpected states in vm_run() are expressed through positive
526 		 * errno-oriented return values.  VM states which expect further
527 		 * processing in userspace (necessary context via exitinfo) are
528 		 * expressed through negative return values.  For the time being
529 		 * a return value of 0 is not expected from vm_run().
530 		 */
531 		ASSERT(error != 0);
532 		if (error < 0) {
533 			const struct vm_exit *vme;
534 			void *outp = entry.exit_data;
535 
536 			error = 0;
537 			vme = vm_exitinfo(sc->vmm_vm, vcpu);
538 			if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
539 				error = EFAULT;
540 			}
541 		}
542 		break;
543 	}
544 	case VM_SUSPEND: {
545 		struct vm_suspend vmsuspend;
546 
547 		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
548 			error = EFAULT;
549 			break;
550 		}
551 		error = vm_suspend(sc->vmm_vm, vmsuspend.how);
552 		break;
553 	}
554 	case VM_REINIT: {
555 		struct vm_reinit reinit;
556 
557 		if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) {
558 			error = EFAULT;
559 			break;
560 		}
561 		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
562 			/*
563 			 * The VM instance should be free of driver-attached
564 			 * hooks during the reinitialization process.
565 			 */
566 			break;
567 		}
568 		error = vm_reinit(sc->vmm_vm, reinit.flags);
569 		(void) vmm_drv_block_hook(sc, B_FALSE);
570 		break;
571 	}
572 	case VM_STAT_DESC: {
573 		struct vm_stat_desc statdesc;
574 
575 		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
576 			error = EFAULT;
577 			break;
578 		}
579 		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
580 		    sizeof (statdesc.desc));
581 		if (error == 0 &&
582 		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
583 			error = EFAULT;
584 			break;
585 		}
586 		break;
587 	}
588 	case VM_STATS_IOC: {
589 		struct vm_stats vmstats;
590 
591 		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
592 			error = EFAULT;
593 			break;
594 		}
595 		hrt2tv(gethrtime(), &vmstats.tv);
596 		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index,
597 		    nitems(vmstats.statbuf),
598 		    &vmstats.num_entries, vmstats.statbuf);
599 		if (error == 0 &&
600 		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
601 			error = EFAULT;
602 			break;
603 		}
604 		break;
605 	}
606 
607 	case VM_PPTDEV_MSI: {
608 		struct vm_pptdev_msi pptmsi;
609 
610 		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
611 			error = EFAULT;
612 			break;
613 		}
614 		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
615 		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
616 		break;
617 	}
618 	case VM_PPTDEV_MSIX: {
619 		struct vm_pptdev_msix pptmsix;
620 
621 		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
622 			error = EFAULT;
623 			break;
624 		}
625 		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
626 		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
627 		    pptmsix.vector_control);
628 		break;
629 	}
630 	case VM_PPTDEV_DISABLE_MSIX: {
631 		struct vm_pptdev pptdev;
632 
633 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
634 			error = EFAULT;
635 			break;
636 		}
637 		error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
638 		break;
639 	}
640 	case VM_MAP_PPTDEV_MMIO: {
641 		struct vm_pptdev_mmio pptmmio;
642 
643 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
644 			error = EFAULT;
645 			break;
646 		}
647 		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
648 		    pptmmio.len, pptmmio.hpa);
649 		break;
650 	}
651 	case VM_UNMAP_PPTDEV_MMIO: {
652 		struct vm_pptdev_mmio pptmmio;
653 
654 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
655 			error = EFAULT;
656 			break;
657 		}
658 		error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
659 		    pptmmio.len);
660 		break;
661 	}
662 	case VM_BIND_PPTDEV: {
663 		struct vm_pptdev pptdev;
664 
665 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
666 			error = EFAULT;
667 			break;
668 		}
669 		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
670 		break;
671 	}
672 	case VM_UNBIND_PPTDEV: {
673 		struct vm_pptdev pptdev;
674 
675 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
676 			error = EFAULT;
677 			break;
678 		}
679 		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
680 		break;
681 	}
682 	case VM_GET_PPTDEV_LIMITS: {
683 		struct vm_pptdev_limits pptlimits;
684 
685 		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
686 			error = EFAULT;
687 			break;
688 		}
689 		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
690 		    &pptlimits.msi_limit, &pptlimits.msix_limit);
691 		if (error == 0 &&
692 		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
693 			error = EFAULT;
694 			break;
695 		}
696 		break;
697 	}
698 	case VM_INJECT_EXCEPTION: {
699 		struct vm_exception vmexc;
700 		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
701 			error = EFAULT;
702 			break;
703 		}
704 		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
705 		    vmexc.error_code_valid != 0, vmexc.error_code,
706 		    vmexc.restart_instruction != 0);
707 		break;
708 	}
709 	case VM_INJECT_NMI: {
710 		struct vm_nmi vmnmi;
711 
712 		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
713 			error = EFAULT;
714 			break;
715 		}
716 		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
717 		break;
718 	}
719 	case VM_LAPIC_IRQ: {
720 		struct vm_lapic_irq vmirq;
721 
722 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
723 			error = EFAULT;
724 			break;
725 		}
726 		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
727 		break;
728 	}
729 	case VM_LAPIC_LOCAL_IRQ: {
730 		struct vm_lapic_irq vmirq;
731 
732 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
733 			error = EFAULT;
734 			break;
735 		}
736 		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
737 		    vmirq.vector);
738 		break;
739 	}
740 	case VM_LAPIC_MSI: {
741 		struct vm_lapic_msi vmmsi;
742 
743 		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
744 			error = EFAULT;
745 			break;
746 		}
747 		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
748 		break;
749 	}
750 
751 	case VM_IOAPIC_ASSERT_IRQ: {
752 		struct vm_ioapic_irq ioapic_irq;
753 
754 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
755 			error = EFAULT;
756 			break;
757 		}
758 		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
759 		break;
760 	}
761 	case VM_IOAPIC_DEASSERT_IRQ: {
762 		struct vm_ioapic_irq ioapic_irq;
763 
764 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
765 			error = EFAULT;
766 			break;
767 		}
768 		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
769 		break;
770 	}
771 	case VM_IOAPIC_PULSE_IRQ: {
772 		struct vm_ioapic_irq ioapic_irq;
773 
774 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
775 			error = EFAULT;
776 			break;
777 		}
778 		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
779 		break;
780 	}
781 	case VM_IOAPIC_PINCOUNT: {
782 		int pincount;
783 
784 		pincount = vioapic_pincount(sc->vmm_vm);
785 		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
786 			error = EFAULT;
787 			break;
788 		}
789 		break;
790 	}
791 	case VM_DESC_FPU_AREA: {
792 		struct vm_fpu_desc desc;
793 		void *buf = NULL;
794 
795 		if (ddi_copyin(datap, &desc, sizeof (desc), md)) {
796 			error = EFAULT;
797 			break;
798 		}
799 		if (desc.vfd_num_entries > 64) {
800 			error = EINVAL;
801 			break;
802 		}
803 		const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) *
804 		    desc.vfd_num_entries;
805 		if (buf_sz != 0) {
806 			buf = kmem_zalloc(buf_sz, KM_SLEEP);
807 		}
808 
809 		/*
810 		 * For now, we are depending on vm_fpu_desc_entry and
811 		 * hma_xsave_state_desc_t having the same format.
812 		 */
813 		CTASSERT(sizeof (struct vm_fpu_desc_entry) ==
814 		    sizeof (hma_xsave_state_desc_t));
815 
816 		size_t req_size;
817 		const uint_t max_entries = hma_fpu_describe_xsave_state(
818 		    (hma_xsave_state_desc_t *)buf,
819 		    desc.vfd_num_entries,
820 		    &req_size);
821 
822 		desc.vfd_req_size = req_size;
823 		desc.vfd_num_entries = max_entries;
824 		if (buf_sz != 0) {
825 			if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) {
826 				error = EFAULT;
827 			}
828 			kmem_free(buf, buf_sz);
829 		}
830 
831 		if (error == 0) {
832 			if (ddi_copyout(&desc, datap, sizeof (desc), md)) {
833 				error = EFAULT;
834 			}
835 		}
836 		break;
837 	}
838 
839 	case VM_ISA_ASSERT_IRQ: {
840 		struct vm_isa_irq isa_irq;
841 
842 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
843 			error = EFAULT;
844 			break;
845 		}
846 		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
847 		if (error == 0 && isa_irq.ioapic_irq != -1) {
848 			error = vioapic_assert_irq(sc->vmm_vm,
849 			    isa_irq.ioapic_irq);
850 		}
851 		break;
852 	}
853 	case VM_ISA_DEASSERT_IRQ: {
854 		struct vm_isa_irq isa_irq;
855 
856 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
857 			error = EFAULT;
858 			break;
859 		}
860 		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
861 		if (error == 0 && isa_irq.ioapic_irq != -1) {
862 			error = vioapic_deassert_irq(sc->vmm_vm,
863 			    isa_irq.ioapic_irq);
864 		}
865 		break;
866 	}
867 	case VM_ISA_PULSE_IRQ: {
868 		struct vm_isa_irq isa_irq;
869 
870 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
871 			error = EFAULT;
872 			break;
873 		}
874 		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
875 		if (error == 0 && isa_irq.ioapic_irq != -1) {
876 			error = vioapic_pulse_irq(sc->vmm_vm,
877 			    isa_irq.ioapic_irq);
878 		}
879 		break;
880 	}
881 	case VM_ISA_SET_IRQ_TRIGGER: {
882 		struct vm_isa_irq_trigger isa_irq_trigger;
883 
884 		if (ddi_copyin(datap, &isa_irq_trigger,
885 		    sizeof (isa_irq_trigger), md)) {
886 			error = EFAULT;
887 			break;
888 		}
889 		error = vatpic_set_irq_trigger(sc->vmm_vm,
890 		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
891 		break;
892 	}
893 
894 	case VM_MMAP_GETNEXT: {
895 		struct vm_memmap mm;
896 
897 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
898 			error = EFAULT;
899 			break;
900 		}
901 		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
902 		    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
903 		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
904 			error = EFAULT;
905 			break;
906 		}
907 		break;
908 	}
909 	case VM_MMAP_MEMSEG: {
910 		struct vm_memmap mm;
911 
912 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
913 			error = EFAULT;
914 			break;
915 		}
916 		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
917 		    mm.len, mm.prot, mm.flags);
918 		break;
919 	}
920 	case VM_MUNMAP_MEMSEG: {
921 		struct vm_munmap mu;
922 
923 		if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
924 			error = EFAULT;
925 			break;
926 		}
927 		error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
928 		break;
929 	}
930 	case VM_ALLOC_MEMSEG: {
931 		struct vm_memseg vmseg;
932 
933 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
934 			error = EFAULT;
935 			break;
936 		}
937 		error = vmmdev_alloc_memseg(sc, &vmseg);
938 		break;
939 	}
940 	case VM_GET_MEMSEG: {
941 		struct vm_memseg vmseg;
942 
943 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
944 			error = EFAULT;
945 			break;
946 		}
947 		error = vmmdev_get_memseg(sc, &vmseg);
948 		if (error == 0 &&
949 		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
950 			error = EFAULT;
951 			break;
952 		}
953 		break;
954 	}
955 	case VM_GET_REGISTER: {
956 		struct vm_register vmreg;
957 
958 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
959 			error = EFAULT;
960 			break;
961 		}
962 		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
963 		    &vmreg.regval);
964 		if (error == 0 &&
965 		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
966 			error = EFAULT;
967 			break;
968 		}
969 		break;
970 	}
971 	case VM_SET_REGISTER: {
972 		struct vm_register vmreg;
973 
974 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
975 			error = EFAULT;
976 			break;
977 		}
978 		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
979 		    vmreg.regval);
980 		break;
981 	}
982 	case VM_SET_SEGMENT_DESCRIPTOR: {
983 		struct vm_seg_desc vmsegd;
984 
985 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
986 			error = EFAULT;
987 			break;
988 		}
989 		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
990 		    &vmsegd.desc);
991 		break;
992 	}
993 	case VM_GET_SEGMENT_DESCRIPTOR: {
994 		struct vm_seg_desc vmsegd;
995 
996 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
997 			error = EFAULT;
998 			break;
999 		}
1000 		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1001 		    &vmsegd.desc);
1002 		if (error == 0 &&
1003 		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
1004 			error = EFAULT;
1005 			break;
1006 		}
1007 		break;
1008 	}
1009 	case VM_GET_REGISTER_SET: {
1010 		struct vm_register_set vrs;
1011 		int regnums[VM_REG_LAST];
1012 		uint64_t regvals[VM_REG_LAST];
1013 
1014 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1015 			error = EFAULT;
1016 			break;
1017 		}
1018 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1019 			error = EINVAL;
1020 			break;
1021 		}
1022 		if (ddi_copyin(vrs.regnums, regnums,
1023 		    sizeof (int) * vrs.count, md)) {
1024 			error = EFAULT;
1025 			break;
1026 		}
1027 
1028 		error = 0;
1029 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1030 			if (regnums[i] < 0) {
1031 				error = EINVAL;
1032 				break;
1033 			}
1034 			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
1035 			    &regvals[i]);
1036 		}
1037 		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
1038 		    sizeof (uint64_t) * vrs.count, md)) {
1039 			error = EFAULT;
1040 		}
1041 		break;
1042 	}
1043 	case VM_SET_REGISTER_SET: {
1044 		struct vm_register_set vrs;
1045 		int regnums[VM_REG_LAST];
1046 		uint64_t regvals[VM_REG_LAST];
1047 
1048 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1049 			error = EFAULT;
1050 			break;
1051 		}
1052 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1053 			error = EINVAL;
1054 			break;
1055 		}
1056 		if (ddi_copyin(vrs.regnums, regnums,
1057 		    sizeof (int) * vrs.count, md)) {
1058 			error = EFAULT;
1059 			break;
1060 		}
1061 		if (ddi_copyin(vrs.regvals, regvals,
1062 		    sizeof (uint64_t) * vrs.count, md)) {
1063 			error = EFAULT;
1064 			break;
1065 		}
1066 
1067 		error = 0;
1068 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1069 			/*
1070 			 * Setting registers in a set is not atomic, since a
1071 			 * failure in the middle of the set will cause a
1072 			 * bail-out and inconsistent register state.  Callers
1073 			 * should be wary of this.
1074 			 */
1075 			if (regnums[i] < 0) {
1076 				error = EINVAL;
1077 				break;
1078 			}
1079 			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1080 			    regvals[i]);
1081 		}
1082 		break;
1083 	}
1084 	case VM_RESET_CPU: {
1085 		struct vm_vcpu_reset vvr;
1086 
1087 		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1088 			error = EFAULT;
1089 			break;
1090 		}
1091 		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1092 			error = EINVAL;
			break;
1093 		}
1094 
1095 		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1096 		break;
1097 	}
1098 	case VM_GET_RUN_STATE: {
1099 		struct vm_run_state vrs;
1100 
1101 		bzero(&vrs, sizeof (vrs));
1102 		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1103 		    &vrs.sipi_vector);
1104 		if (error == 0) {
1105 			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1106 				error = EFAULT;
1107 				break;
1108 			}
1109 		}
1110 		break;
1111 	}
1112 	case VM_SET_RUN_STATE: {
1113 		struct vm_run_state vrs;
1114 
1115 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1116 			error = EFAULT;
1117 			break;
1118 		}
1119 		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1120 		    vrs.sipi_vector);
1121 		break;
1122 	}
1123 	case VM_GET_FPU: {
1124 		struct vm_fpu_state req;
1125 		const size_t max_len = (PAGESIZE * 2);
1126 		void *kbuf;
1127 
1128 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1129 			error = EFAULT;
1130 			break;
1131 		}
1132 		if (req.len > max_len || req.len == 0) {
1133 			error = EINVAL;
1134 			break;
1135 		}
1136 		kbuf = kmem_zalloc(req.len, KM_SLEEP);
1137 		error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1138 		if (error == 0) {
1139 			if (ddi_copyout(kbuf, req.buf, req.len, md)) {
1140 				error = EFAULT;
1141 			}
1142 		}
1143 		kmem_free(kbuf, req.len);
1144 		break;
1145 	}
1146 	case VM_SET_FPU: {
1147 		struct vm_fpu_state req;
1148 		const size_t max_len = (PAGESIZE * 2);
1149 		void *kbuf;
1150 
1151 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1152 			error = EFAULT;
1153 			break;
1154 		}
1155 		if (req.len > max_len || req.len == 0) {
1156 			error = EINVAL;
1157 			break;
1158 		}
1159 		kbuf = kmem_alloc(req.len, KM_SLEEP);
1160 		if (ddi_copyin(req.buf, kbuf, req.len, md)) {
1161 			error = EFAULT;
1162 		} else {
1163 			error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1164 		}
1165 		kmem_free(kbuf, req.len);
1166 		break;
1167 	}
1168 
1169 	case VM_SET_KERNEMU_DEV:
1170 	case VM_GET_KERNEMU_DEV: {
1171 		struct vm_readwrite_kernemu_device kemu;
1172 		size_t size = 0;
1173 
1174 		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1175 			error = EFAULT;
1176 			break;
1177 		}
1178 
1179 		if (kemu.access_width > 3) {
1180 			error = EINVAL;
1181 			break;
1182 		}
1183 		size = (1 << kemu.access_width);
1184 		ASSERT(size >= 1 && size <= 8);
1185 
1186 		if (cmd == VM_SET_KERNEMU_DEV) {
1187 			error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1188 			    kemu.gpa, kemu.value, size);
1189 		} else {
1190 			error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1191 			    kemu.gpa, &kemu.value, size);
1192 		}
1193 
1194 		if (error == 0) {
1195 			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1196 				error = EFAULT;
1197 				break;
1198 			}
1199 		}
1200 		break;
1201 	}
1202 
1203 	case VM_GET_CAPABILITY: {
1204 		struct vm_capability vmcap;
1205 
1206 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1207 			error = EFAULT;
1208 			break;
1209 		}
1210 		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1211 		    &vmcap.capval);
1212 		if (error == 0 &&
1213 		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1214 			error = EFAULT;
1215 			break;
1216 		}
1217 		break;
1218 	}
1219 	case VM_SET_CAPABILITY: {
1220 		struct vm_capability vmcap;
1221 
1222 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1223 			error = EFAULT;
1224 			break;
1225 		}
1226 		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1227 		    vmcap.capval);
1228 		break;
1229 	}
1230 	case VM_SET_X2APIC_STATE: {
1231 		struct vm_x2apic x2apic;
1232 
1233 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1234 			error = EFAULT;
1235 			break;
1236 		}
1237 		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1238 		break;
1239 	}
1240 	case VM_GET_X2APIC_STATE: {
1241 		struct vm_x2apic x2apic;
1242 
1243 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1244 			error = EFAULT;
1245 			break;
1246 		}
1247 		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1248 		    &x2apic.state);
1249 		if (error == 0 &&
1250 		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1251 			error = EFAULT;
1252 			break;
1253 		}
1254 		break;
1255 	}
1256 	case VM_GET_GPA_PMAP: {
1257 		/*
1258 		 * Until there is a necessity to leak EPT/RVI PTE values to
1259 		 * userspace, this will remain unimplemented
1260 		 */
1261 		error = EINVAL;
1262 		break;
1263 	}
1264 	case VM_GET_HPET_CAPABILITIES: {
1265 		struct vm_hpet_cap hpetcap;
1266 
1267 		error = vhpet_getcap(&hpetcap);
1268 		if (error == 0 &&
1269 		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1270 			error = EFAULT;
1271 			break;
1272 		}
1273 		break;
1274 	}
1275 	case VM_GLA2GPA: {
1276 		struct vm_gla2gpa gg;
1277 
1278 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1279 			error = EFAULT;
1280 			break;
1281 		}
1282 		gg.vcpuid = vcpu;
1283 		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1284 		    gg.prot, &gg.gpa, &gg.fault);
1285 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1286 			error = EFAULT;
1287 			break;
1288 		}
1289 		break;
1290 	}
1291 	case VM_GLA2GPA_NOFAULT: {
1292 		struct vm_gla2gpa gg;
1293 
1294 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1295 			error = EFAULT;
1296 			break;
1297 		}
1298 		gg.vcpuid = vcpu;
1299 		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1300 		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
1301 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1302 			error = EFAULT;
1303 			break;
1304 		}
1305 		break;
1306 	}
1307 
1308 	case VM_ACTIVATE_CPU:
1309 		error = vm_activate_cpu(sc->vmm_vm, vcpu);
1310 		break;
1311 
1312 	case VM_SUSPEND_CPU:
1313 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1314 			error = EFAULT;
1315 		} else {
1316 			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1317 		}
1318 		break;
1319 
1320 	case VM_RESUME_CPU:
1321 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1322 			error = EFAULT;
1323 		} else {
1324 			error = vm_resume_cpu(sc->vmm_vm, vcpu);
1325 		}
1326 		break;
1327 
1328 	case VM_GET_CPUS: {
1329 		struct vm_cpuset vm_cpuset;
1330 		cpuset_t tempset;
1331 		void *srcp = &tempset;
1332 		int size;
1333 
1334 		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1335 			error = EFAULT;
1336 			break;
1337 		}
1338 
1339 		/* Be more generous about sizing since our cpuset_t is large. */
1340 		size = vm_cpuset.cpusetsize;
1341 		if (size <= 0 || size > sizeof (cpuset_t)) {
1342 			error = ERANGE;
			break;
1343 		}
1344 		/*
1345 		 * If they want a ulong_t or less, make sure they receive the
1346 		 * low bits with all the useful information.
1347 		 */
1348 		if (size <= sizeof (tempset.cpub[0])) {
1349 			srcp = &tempset.cpub[0];
1350 		}
1351 
1352 		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1353 			tempset = vm_active_cpus(sc->vmm_vm);
1354 		} else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1355 			tempset = vm_suspended_cpus(sc->vmm_vm);
1356 		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1357 			tempset = vm_debug_cpus(sc->vmm_vm);
1358 		} else {
1359 			error = EINVAL;
1360 		}
1361 
1362 		ASSERT(size > 0 && size <= sizeof (tempset));
1363 		if (error == 0 &&
1364 		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1365 			error = EFAULT;
1366 			break;
1367 		}
1368 		break;
1369 	}
1370 	case VM_SET_INTINFO: {
1371 		struct vm_intinfo vmii;
1372 
1373 		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1374 			error = EFAULT;
1375 			break;
1376 		}
1377 		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1378 		break;
1379 	}
1380 	case VM_GET_INTINFO: {
1381 		struct vm_intinfo vmii;
1382 
1383 		vmii.vcpuid = vcpu;
1384 		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1385 		    &vmii.info2);
1386 		if (error == 0 &&
1387 		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1388 			error = EFAULT;
1389 			break;
1390 		}
1391 		break;
1392 	}
1393 	case VM_RTC_WRITE: {
1394 		struct vm_rtc_data rtcdata;
1395 
1396 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1397 			error = EFAULT;
1398 			break;
1399 		}
1400 		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1401 		    rtcdata.value);
1402 		break;
1403 	}
1404 	case VM_RTC_READ: {
1405 		struct vm_rtc_data rtcdata;
1406 
1407 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1408 			error = EFAULT;
1409 			break;
1410 		}
1411 		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1412 		    &rtcdata.value);
1413 		if (error == 0 &&
1414 		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1415 			error = EFAULT;
1416 			break;
1417 		}
1418 		break;
1419 	}
1420 	case VM_RTC_SETTIME: {
1421 		struct vm_rtc_time rtctime;
1422 
1423 		if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1424 			error = EFAULT;
1425 			break;
1426 		}
1427 		error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1428 		break;
1429 	}
1430 	case VM_RTC_GETTIME: {
1431 		struct vm_rtc_time rtctime;
1432 
1433 		rtctime.secs = vrtc_get_time(sc->vmm_vm);
1434 		if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1435 			error = EFAULT;
1436 			break;
1437 		}
1438 		break;
1439 	}
1440 
1441 	case VM_PMTMR_LOCATE: {
1442 		uint16_t port = arg;
1443 		error = vpmtmr_set_location(sc->vmm_vm, port);
1444 		break;
1445 	}
1446 
1447 	case VM_RESTART_INSTRUCTION:
1448 		error = vm_restart_instruction(sc->vmm_vm, vcpu);
1449 		break;
1450 
1451 	case VM_SET_TOPOLOGY: {
1452 		struct vm_cpu_topology topo;
1453 
1454 		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1455 			error = EFAULT;
1456 			break;
1457 		}
1458 		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1459 		    topo.threads, topo.maxcpus);
1460 		break;
1461 	}
1462 	case VM_GET_TOPOLOGY: {
1463 		struct vm_cpu_topology topo;
1464 
1465 		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1466 		    &topo.threads, &topo.maxcpus);
1467 		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1468 			error = EFAULT;
1469 			break;
1470 		}
1471 		break;
1472 	}
1473 	case VM_DEVMEM_GETOFFSET: {
1474 		struct vm_devmem_offset vdo;
1475 		vmm_devmem_entry_t *de;
1476 
1477 		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1478 			error = EFAULT;
1479 			break;
1480 		}
1481 
1482 		de = vmmdev_devmem_find(sc, vdo.segid);
1483 		if (de != NULL) {
1484 			vdo.offset = de->vde_off;
1485 			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1486 				error = EFAULT;
1487 			}
1488 		} else {
1489 			error = ENOENT;
1490 		}
1491 		break;
1492 	}
1493 	case VM_TRACK_DIRTY_PAGES: {
1494 		const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE;
1495 		struct vmm_dirty_tracker tracker;
1496 		uint8_t *bitmap;
1497 		size_t len;
1498 
1499 		if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) {
1500 			error = EFAULT;
1501 			break;
1502 		}
1503 		if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) {
1504 			error = EINVAL;
1505 			break;
1506 		}
1507 		if (tracker.vdt_len == 0) {
1508 			break;
1509 		}
1510 		if ((tracker.vdt_len & PAGEOFFSET) != 0) {
1511 			error = EINVAL;
1512 			break;
1513 		}
1514 		if (tracker.vdt_len > max_track_region_len) {
1515 			error = EINVAL;
1516 			break;
1517 		}
1518 		len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8;
1519 		bitmap = kmem_zalloc(len, KM_SLEEP);
1520 		vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa,
1521 		    tracker.vdt_len, bitmap);
1522 		if (ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) {
1523 			error = EFAULT;
1524 		}
1525 		kmem_free(bitmap, len);
1526 
1527 		break;
1528 	}
1529 	case VM_WRLOCK_CYCLE: {
1530 		/*
1531 		 * Present a test mechanism to acquire/release the write lock
1532 		 * on the VM without any other effects.
1533 		 */
1534 		break;
1535 	}
1536 	case VM_DATA_READ: {
1537 		struct vm_data_xfer vdx;
1538 
1539 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1540 			error = EFAULT;
1541 			break;
1542 		}
1543 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1544 			error = EINVAL;
1545 			break;
1546 		}
1547 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1548 			error = EFBIG;
1549 			break;
1550 		}
1551 
1552 		const size_t len = vdx.vdx_len;
1553 		void *buf = NULL;
1554 		if (len != 0) {
1555 			buf = kmem_alloc(len, KM_SLEEP);
1556 			if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) == 0) {
1557 				bzero(buf, len);
1558 			} else if (ddi_copyin(vdx.vdx_data, buf, len,
1559 			    md) != 0) {
1560 				kmem_free(buf, len);
1561 				error = EFAULT;
1562 				break;
1563 			}
1564 		}
1565 
1566 		vdx.vdx_result_len = 0;
1567 		vmm_data_req_t req = {
1568 			.vdr_class = vdx.vdx_class,
1569 			.vdr_version = vdx.vdx_version,
1570 			.vdr_flags = vdx.vdx_flags,
1571 			.vdr_len = len,
1572 			.vdr_data = buf,
1573 			.vdr_result_len = &vdx.vdx_result_len,
1574 		};
1575 		error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req);
1576 
1577 		if (error == 0 && buf != NULL) {
1578 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1579 				error = EFAULT;
1580 			}
1581 		}
1582 
1583 		/*
1584 		 * Copy out the transfer request so that the value of
1585 		 * vdx_result_len can be made available, regardless of any
1586 		 * error(s) which may have occurred.
1587 		 */
1588 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1589 			error = (error != 0) ? error : EFAULT;
1590 		}
1591 
1592 		if (buf != NULL) {
1593 			kmem_free(buf, len);
1594 		}
1595 		break;
1596 	}
1597 	case VM_DATA_WRITE: {
1598 		struct vm_data_xfer vdx;
1599 
1600 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1601 			error = EFAULT;
1602 			break;
1603 		}
1604 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1605 			error = EINVAL;
1606 			break;
1607 		}
1608 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1609 			error = EFBIG;
1610 			break;
1611 		}
1612 
1613 		const size_t len = vdx.vdx_len;
1614 		void *buf = NULL;
1615 		if (len != 0) {
1616 			buf = kmem_alloc(len, KM_SLEEP);
1617 			if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
1618 				kmem_free(buf, len);
1619 				error = EFAULT;
1620 				break;
1621 			}
1622 		}
1623 
1624 		vdx.vdx_result_len = 0;
1625 		vmm_data_req_t req = {
1626 			.vdr_class = vdx.vdx_class,
1627 			.vdr_version = vdx.vdx_version,
1628 			.vdr_flags = vdx.vdx_flags,
1629 			.vdr_len = len,
1630 			.vdr_data = buf,
1631 			.vdr_result_len = &vdx.vdx_result_len,
1632 		};
1633 		if (vmm_allow_state_writes == 0) {
1634 			/* XXX: Play it safe for now */
1635 			error = EPERM;
1636 		} else {
1637 			error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid,
1638 			    &req);
1639 		}
1640 
1641 		if (error == 0 && buf != NULL &&
1642 		    (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) {
1643 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1644 				error = EFAULT;
1645 			}
1646 		}
1647 
1648 		/*
1649 		 * Copy out the transfer request so that the value of
1650 		 * vdx_result_len can be made available, regardless of any
1651 		 * error(s) which may have occurred.
1652 		 */
1653 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1654 			error = (error != 0) ? error : EFAULT;
1655 		}
1656 
1657 		if (buf != NULL) {
1658 			kmem_free(buf, len);
1659 		}
1660 		break;
1661 	}
1662 
1663 	default:
1664 		error = ENOTTY;
1665 		break;
1666 	}
1667 
1668 	/* Release exclusion resources */
1669 	switch (lock_type) {
1670 	case LOCK_NONE:
1671 		break;
1672 	case LOCK_VCPU:
1673 		vcpu_unlock_one(sc, vcpu);
1674 		break;
1675 	case LOCK_READ_HOLD:
1676 		vmm_read_unlock(sc);
1677 		break;
1678 	case LOCK_WRITE_HOLD:
1679 		vmm_write_unlock(sc);
1680 		break;
1681 	default:
1682 		panic("unexpected lock type");
1683 		break;
1684 	}
1685 
1686 	return (error);
1687 }
1688 
1689 static vmm_softc_t *
1690 vmm_lookup(const char *name)
1691 {
1692 	list_t *vml = &vmm_list;
1693 	vmm_softc_t *sc;
1694 
1695 	ASSERT(MUTEX_HELD(&vmm_mtx));
1696 
1697 	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1698 		if (strcmp(sc->vmm_name, name) == 0) {
1699 			break;
1700 		}
1701 	}
1702 
1703 	return (sc);
1704 }
1705 
1706 /*
1707  * Acquire an HMA registration if not already held.
1708  */
1709 static boolean_t
1710 vmm_hma_acquire(void)
1711 {
1712 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1713 
1714 	mutex_enter(&vmmdev_mtx);
1715 
1716 	if (vmmdev_hma_reg == NULL) {
1717 		VERIFY3U(vmmdev_hma_ref, ==, 0);
1718 		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1719 		if (vmmdev_hma_reg == NULL) {
1720 			cmn_err(CE_WARN, "%s HMA registration failed.",
1721 			    vmmdev_hvm_name);
1722 			mutex_exit(&vmmdev_mtx);
1723 			return (B_FALSE);
1724 		}
1725 	}
1726 
1727 	vmmdev_hma_ref++;
1728 
1729 	mutex_exit(&vmmdev_mtx);
1730 
1731 	return (B_TRUE);
1732 }
1733 
1734 /*
1735  * Release the HMA registration if held and there are no remaining VMs.
1736  */
1737 static void
1738 vmm_hma_release(void)
1739 {
1740 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1741 
1742 	mutex_enter(&vmmdev_mtx);
1743 
1744 	VERIFY3U(vmmdev_hma_ref, !=, 0);
1745 
1746 	vmmdev_hma_ref--;
1747 
1748 	if (vmmdev_hma_ref == 0) {
1749 		VERIFY(vmmdev_hma_reg != NULL);
1750 		hma_unregister(vmmdev_hma_reg);
1751 		vmmdev_hma_reg = NULL;
1752 	}
1753 	mutex_exit(&vmmdev_mtx);
1754 }
1755 
1756 static int
1757 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
1758 {
1759 	vmm_softc_t	*sc = NULL;
1760 	minor_t		minor;
1761 	int		error = ENOMEM;
1762 	size_t		len;
1763 	const char	*name = req->name;
1764 
1765 	len = strnlen(name, VM_MAX_NAMELEN);
1766 	if (len == 0) {
1767 		return (EINVAL);
1768 	}
1769 	if (len >= VM_MAX_NAMELEN) {
1770 		return (ENAMETOOLONG);
1771 	}
1772 	if (strchr(name, '/') != NULL) {
1773 		return (EINVAL);
1774 	}
1775 
1776 	if (!vmm_hma_acquire())
1777 		return (ENXIO);
1778 
1779 	mutex_enter(&vmm_mtx);
1780 
1781 	/* Look for duplicate names */
1782 	if (vmm_lookup(name) != NULL) {
1783 		mutex_exit(&vmm_mtx);
1784 		vmm_hma_release();
1785 		return (EEXIST);
1786 	}
1787 
1788 	/* Allow only one instance per non-global zone. */
1789 	if (!INGLOBALZONE(curproc)) {
1790 		for (sc = list_head(&vmm_list); sc != NULL;
1791 		    sc = list_next(&vmm_list, sc)) {
1792 			if (sc->vmm_zone == curzone) {
1793 				mutex_exit(&vmm_mtx);
1794 				vmm_hma_release();
1795 				return (EINVAL);
1796 			}
1797 		}
1798 	}
1799 
1800 	minor = id_alloc(vmm_minors);
1801 	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1802 		goto fail;
1803 	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1804 		ddi_soft_state_free(vmm_statep, minor);
1805 		goto fail;
1806 	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1807 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
1808 		goto fail;
1809 	}
1810 
1811 	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
1812 		goto fail;
1813 	}
1814 
1815 	error = vm_create(req->flags, &sc->vmm_vm);
1816 	if (error == 0) {
1817 		/* Complete VM initialization and report success. */
1818 		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1819 		sc->vmm_minor = minor;
1820 		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1821 		    offsetof(vmm_devmem_entry_t, vde_node));
1822 
1823 		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1824 		    offsetof(vmm_hold_t, vmh_node));
1825 		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1826 
1827 		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1828 		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1829 		    offsetof(vmm_lease_t, vml_node));
1830 		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1831 		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1832 
1833 		sc->vmm_zone = crgetzone(cr);
1834 		zone_hold(sc->vmm_zone);
1835 		vmm_zsd_add_vm(sc);
1836 		vmm_kstat_init(sc);
1837 
1838 		list_insert_tail(&vmm_list, sc);
1839 		mutex_exit(&vmm_mtx);
1840 		return (0);
1841 	}
1842 
1843 	vmm_kstat_fini(sc);
1844 	ddi_remove_minor_node(vmmdev_dip, name);
1845 fail:
1846 	id_free(vmm_minors, minor);
1847 	if (sc != NULL) {
1848 		ddi_soft_state_free(vmm_statep, minor);
1849 	}
1850 	mutex_exit(&vmm_mtx);
1851 	vmm_hma_release();
1852 
1853 	return (error);
1854 }
1855 
1856 /*
1857  * Bhyve 'Driver' Interface
1858  *
1859  * While many devices are emulated in the bhyve userspace process, there are
1860  * others with performance constraints which require that they run mostly or
1861  * entirely in-kernel.  For those not integrated directly into bhyve, an API is
1862  * needed so they can query/manipulate the portions of VM state needed to
1863  * fulfill their purpose.
1864  *
1865  * This includes:
1866  * - Translating guest-physical addresses to host-virtual pointers
1867  * - Injecting MSIs
1868  * - Hooking IO port addresses
1869  *
1870  * The vmm_drv interface exists to provide that functionality to its consumers.
1871  * (At this time, 'viona' is the only user)
1872  */
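
/*
 * A rough sketch of the consumer life cycle (everything other than the
 * vmm_drv_* calls, including 'expire_cb', 'gpa', and the elided NULL/error
 * checks, is an illustrative assumption rather than part of this interface):
 *
 *	vmm_hold_t *hold;
 *	vmm_lease_t *lease;
 *	vmm_page_t *vmp;
 *
 *	if (vmm_drv_hold(fp, cr, &hold) == 0) {
 *		lease = vmm_drv_lease_sign(hold, expire_cb, arg);
 *		vmp = vmm_drv_page_hold(lease, gpa, PROT_READ);
 *		...	(access guest memory via vmm_drv_page_readable())
 *		vmm_drv_page_release(vmp);
 *		vmm_drv_lease_break(hold, lease);
 *		vmm_drv_rele(hold);
 *	}
 *
 * The expire callback allows the lease to be revoked while the write lock is
 * being taken; returning B_TRUE lets the lease be broken immediately, while
 * B_FALSE defers the break until the consumer calls vmm_drv_lease_break().
 */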
1873 int
1874 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
1875 {
1876 	vnode_t *vp = fp->f_vnode;
1877 	const dev_t dev = vp->v_rdev;
1878 	vmm_softc_t *sc;
1879 	vmm_hold_t *hold;
1880 	int err = 0;
1881 
1882 	if (vp->v_type != VCHR) {
1883 		return (ENXIO);
1884 	}
1885 	const major_t major = getmajor(dev);
1886 	const minor_t minor = getminor(dev);
1887 
1888 	mutex_enter(&vmmdev_mtx);
1889 	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
1890 		mutex_exit(&vmmdev_mtx);
1891 		return (ENOENT);
1892 	}
1893 	mutex_enter(&vmm_mtx);
1894 	mutex_exit(&vmmdev_mtx);
1895 
1896 	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1897 		err = ENOENT;
1898 		goto out;
1899 	}
1900 	/* XXXJOY: check cred permissions against instance */
1901 
1902 	if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
1903 		err = EBUSY;
1904 		goto out;
1905 	}
1906 
1907 	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
1908 	hold->vmh_sc = sc;
1909 	hold->vmh_release_req = B_FALSE;
1910 
1911 	list_insert_tail(&sc->vmm_holds, hold);
1912 	sc->vmm_flags |= VMM_HELD;
1913 	*holdp = hold;
1914 
1915 out:
1916 	mutex_exit(&vmm_mtx);
1917 	return (err);
1918 }
1919 
1920 void
1921 vmm_drv_rele(vmm_hold_t *hold)
1922 {
1923 	vmm_softc_t *sc;
1924 
1925 	ASSERT(hold != NULL);
1926 	ASSERT(hold->vmh_sc != NULL);
1927 	VERIFY(hold->vmh_ioport_hook_cnt == 0);
1928 
1929 	mutex_enter(&vmm_mtx);
1930 	sc = hold->vmh_sc;
1931 	list_remove(&sc->vmm_holds, hold);
1932 	if (list_is_empty(&sc->vmm_holds)) {
1933 		sc->vmm_flags &= ~VMM_HELD;
1934 		cv_broadcast(&sc->vmm_cv);
1935 	}
1936 	mutex_exit(&vmm_mtx);
1937 	kmem_free(hold, sizeof (*hold));
1938 }
1939 
1940 boolean_t
1941 vmm_drv_release_reqd(vmm_hold_t *hold)
1942 {
1943 	ASSERT(hold != NULL);
1944 
1945 	return (hold->vmh_release_req);
1946 }
1947 
1948 vmm_lease_t *
1949 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
1950 {
1951 	vmm_softc_t *sc = hold->vmh_sc;
1952 	vmm_lease_t *lease;
1953 
1954 	ASSERT3P(expiref, !=, NULL);
1955 
1956 	if (hold->vmh_release_req) {
1957 		return (NULL);
1958 	}
1959 
1960 	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
1961 	list_link_init(&lease->vml_node);
1962 	lease->vml_expire_func = expiref;
1963 	lease->vml_expire_arg = arg;
1964 	lease->vml_expired = B_FALSE;
1965 	lease->vml_break_deferred = B_FALSE;
1966 	lease->vml_hold = hold;
1967 	/* cache the VM pointer for one less pointer chase */
1968 	lease->vml_vm = sc->vmm_vm;
1969 	lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));
1970 
1971 	mutex_enter(&sc->vmm_lease_lock);
1972 	while (sc->vmm_lease_blocker != 0) {
1973 		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1974 	}
1975 	list_insert_tail(&sc->vmm_lease_list, lease);
1976 	vmm_read_lock(sc);
1977 	mutex_exit(&sc->vmm_lease_lock);
1978 
1979 	return (lease);
1980 }
1981 
1982 static void
1983 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
1984 {
1985 	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
1986 
1987 	list_remove(&sc->vmm_lease_list, lease);
1988 	vmm_read_unlock(sc);
1989 	vmc_destroy(lease->vml_vmclient);
1990 	kmem_free(lease, sizeof (*lease));
1991 }
1992 
1993 static void
1994 vmm_lease_block(vmm_softc_t *sc)
1995 {
1996 	mutex_enter(&sc->vmm_lease_lock);
1997 	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
1998 	sc->vmm_lease_blocker++;
1999 	if (sc->vmm_lease_blocker == 1) {
2000 		list_t *list = &sc->vmm_lease_list;
2001 		vmm_lease_t *lease = list_head(list);
2002 
2003 		while (lease != NULL) {
2004 			void *arg = lease->vml_expire_arg;
2005 			boolean_t (*expiref)(void *) = lease->vml_expire_func;
2006 			boolean_t sync_break = B_FALSE;
2007 
2008 			/*
2009 			 * Since the lease expiration notification may
2010 			 * need to take locks which would deadlock with
2011 			 * vmm_lease_lock, drop it across the call.
2012 			 *
2013 			 * We are the only one allowed to manipulate
2014 			 * vmm_lease_list right now, so it is safe to
2015 			 * continue iterating through it after
2016 			 * reacquiring the lock.
2017 			 */
2018 			lease->vml_expired = B_TRUE;
2019 			mutex_exit(&sc->vmm_lease_lock);
2020 			sync_break = expiref(arg);
2021 			mutex_enter(&sc->vmm_lease_lock);
2022 
2023 			if (sync_break) {
2024 				vmm_lease_t *next;
2025 
2026 				/*
2027 				 * Leases broken synchronously here result in
2028 				 * vmm_read_unlock() calls from a different
2029 				 * thread than the one which performed the
2030 				 * corresponding vmm_read_lock().  This is
2031 				 * acceptable, given that the rwlock behind
2032 				 * the mechanism tolerates such behavior.  The
2033 				 * flexibility is _only_ afforded to VM read
2034 				 * lock (RW_READER) holders.
2035 				 */
2036 				next = list_next(list, lease);
2037 				vmm_lease_break_locked(sc, lease);
2038 				lease = next;
2039 			} else {
2040 				lease = list_next(list, lease);
2041 			}
2042 		}
2043 
2044 		/* Process leases which were not broken synchronously. */
2045 		while (!list_is_empty(list)) {
2046 			/*
2047 			 * Although the nested loops are quadratic, the number
2048 			 * of leases is small.
2049 			 */
2050 			lease = list_head(list);
2051 			while (lease != NULL) {
2052 				vmm_lease_t *next = list_next(list, lease);
2053 				if (lease->vml_break_deferred) {
2054 					vmm_lease_break_locked(sc, lease);
2055 				}
2056 				lease = next;
2057 			}
2058 			if (list_is_empty(list)) {
2059 				break;
2060 			}
2061 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2062 		}
2063 		/* Wake anyone else waiting for the lease list to be empty. */
2064 		cv_broadcast(&sc->vmm_lease_cv);
2065 	} else {
2066 		list_t *list = &sc->vmm_lease_list;
2067 
2068 		/*
2069 		 * Some other thread beat us to the duty of lease cleanup.
2070 		 * Wait until that is complete.
2071 		 */
2072 		while (!list_is_empty(list)) {
2073 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2074 		}
2075 	}
2076 	mutex_exit(&sc->vmm_lease_lock);
2077 }
2078 
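/*
 * Drop one lease-block reference.  When the count reaches zero, threads
 * waiting to sign new leases are allowed to proceed.
 */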
2079 static void
2080 vmm_lease_unblock(vmm_softc_t *sc)
2081 {
2082 	mutex_enter(&sc->vmm_lease_lock);
2083 	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
2084 	sc->vmm_lease_blocker--;
2085 	if (sc->vmm_lease_blocker == 0) {
2086 		cv_broadcast(&sc->vmm_lease_cv);
2087 	}
2088 	mutex_exit(&sc->vmm_lease_lock);
2089 }
2090 
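/*
 * Return a lease previously acquired via vmm_drv_lease_sign().  If lease
 * cleanup is currently underway on the instance, the break is deferred to the
 * thread performing it.
 */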
2091 void
2092 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
2093 {
2094 	vmm_softc_t *sc = hold->vmh_sc;
2095 
2096 	VERIFY3P(hold, ==, lease->vml_hold);
2097 	VERIFY(!lease->vml_break_deferred);
2098 
2099 	mutex_enter(&sc->vmm_lease_lock);
2100 	if (sc->vmm_lease_blocker == 0) {
2101 		vmm_lease_break_locked(sc, lease);
2102 	} else {
2103 		/*
2104 		 * Defer the lease-breaking to whichever thread is currently
2105 		 * cleaning up all leases as part of a vmm_lease_block() call.
2106 		 */
2107 		lease->vml_break_deferred = B_TRUE;
2108 		cv_broadcast(&sc->vmm_lease_cv);
2109 	}
2110 	mutex_exit(&sc->vmm_lease_lock);
2111 }
2112 
2113 boolean_t
2114 vmm_drv_lease_expired(vmm_lease_t *lease)
2115 {
2116 	return (lease->vml_expired);
2117 }
2118 
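/*
 * Guest page accessors for lease holders.  A minimal usage sketch follows;
 * it is illustrative only and not drawn from any in-tree consumer:
 *
 *	vmm_page_t *vmp = vmm_drv_page_hold(lease, gpa, PROT_READ);
 *	if (vmp != NULL) {
 *		const void *datap = vmm_drv_page_readable(vmp);
 *		(consume one page of guest memory at gpa via datap)
 *		vmm_drv_page_release(vmp);
 *	}
 *
 * The gpa passed to vmm_drv_page_hold() must be page-aligned.  Held pages may
 * also be linked with vmm_drv_page_chain() and later released together via
 * vmm_drv_page_release_chain().
 */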
2119 vmm_page_t *
2120 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot)
2121 {
2122 	ASSERT(lease != NULL);
2123 	ASSERT0(gpa & PAGEOFFSET);
2124 
2125 	return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot));
2126 }
2127 
2128 void
2129 vmm_drv_page_release(vmm_page_t *vmmp)
2130 {
2131 	(void) vmp_release((vm_page_t *)vmmp);
2132 }
2133 
2134 void
2135 vmm_drv_page_release_chain(vmm_page_t *vmmp)
2136 {
2137 	(void) vmp_release_chain((vm_page_t *)vmmp);
2138 }
2139 
2140 const void *
2141 vmm_drv_page_readable(const vmm_page_t *vmmp)
2142 {
2143 	return (vmp_get_readable((const vm_page_t *)vmmp));
2144 }
2145 
2146 void *
2147 vmm_drv_page_writable(const vmm_page_t *vmmp)
2148 {
2149 	return (vmp_get_writable((const vm_page_t *)vmmp));
2150 }
2151 
2152 void
2153 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain)
2154 {
2155 	vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain);
2156 }
2157 
2158 vmm_page_t *
2159 vmm_drv_page_next(const vmm_page_t *vmmp)
2160 {
2161 	return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp));
2162 }
2163 
2164 int
2165 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
2166 {
2167 	ASSERT(lease != NULL);
2168 
2169 	return (lapic_intr_msi(lease->vml_vm, addr, msg));
2170 }
2171 
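/*
 * Install an ioport handler on behalf of a hold.  The per-hold hook count is
 * bumped before the VM write lock is taken so that a hook block cannot be
 * asserted while vmm_mtx is dropped; the count is walked back if installation
 * fails.
 */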
2172 int
2173 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
2174     void *arg, void **cookie)
2175 {
2176 	vmm_softc_t *sc;
2177 	int err;
2178 
2179 	ASSERT(hold != NULL);
2180 	ASSERT(cookie != NULL);
2181 
2182 	sc = hold->vmh_sc;
2183 	mutex_enter(&vmm_mtx);
2184 	/* Confirm that hook installation is not blocked */
2185 	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
2186 		mutex_exit(&vmm_mtx);
2187 		return (EBUSY);
2188 	}
2189 	/*
2190 	 * Optimistically record the hook as installed so that a block cannot
2191 	 * be asserted while the mutex is dropped.
2192 	 */
2193 	hold->vmh_ioport_hook_cnt++;
2194 	mutex_exit(&vmm_mtx);
2195 
2196 	vmm_write_lock(sc);
2197 	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
2198 	    arg, cookie);
2199 	vmm_write_unlock(sc);
2200 
2201 	if (err != 0) {
2202 		mutex_enter(&vmm_mtx);
2203 		/* Walk back optimism about the hook installation */
2204 		hold->vmh_ioport_hook_cnt--;
2205 		mutex_exit(&vmm_mtx);
2206 	}
2207 	return (err);
2208 }
2209 
2210 void
2211 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
2212 {
2213 	vmm_softc_t *sc;
2214 
2215 	ASSERT(hold != NULL);
2216 	ASSERT(cookie != NULL);
2217 	ASSERT(hold->vmh_ioport_hook_cnt != 0);
2218 
2219 	sc = hold->vmh_sc;
2220 	vmm_write_lock(sc);
2221 	vm_ioport_unhook(sc->vmm_vm, cookie);
2222 	vmm_write_unlock(sc);
2223 
2224 	mutex_enter(&vmm_mtx);
2225 	hold->vmh_ioport_hook_cnt--;
2226 	mutex_exit(&vmm_mtx);
2227 }
2228 
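/*
 * Force all holds on an instance to be released: flag the instance for
 * cleanup, request release from every hold, break all outstanding leases, and
 * then wait (interruptibly) for the hold owners to call vmm_drv_rele().
 * Returns EINTR if that wait is interrupted by a signal.
 */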
2229 static int
2230 vmm_drv_purge(vmm_softc_t *sc)
2231 {
2232 	ASSERT(MUTEX_HELD(&vmm_mtx));
2233 
2234 	if ((sc->vmm_flags & VMM_HELD) != 0) {
2235 		vmm_hold_t *hold;
2236 
2237 		sc->vmm_flags |= VMM_CLEANUP;
2238 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2239 		    hold = list_next(&sc->vmm_holds, hold)) {
2240 			hold->vmh_release_req = B_TRUE;
2241 		}
2242 
2243 		/*
2244 		 * Require that all leases on the instance be broken, now that
2245 		 * all associated holds have been marked as needing release.
2246 		 *
2247 		 * Dropping vmm_mtx is not strictly necessary, but if any of the
2248 		 * lessees are slow to respond, it would be nice to leave it
2249 		 * available for other parties.
2250 		 */
2251 		mutex_exit(&vmm_mtx);
2252 		vmm_lease_block(sc);
2253 		vmm_lease_unblock(sc);
2254 		mutex_enter(&vmm_mtx);
2255 
2256 		/*
2257 		 * With all of the leases broken, we can now wait for any
2258 		 * lingering holds to be dropped.
2259 		 */
2260 		while ((sc->vmm_flags & VMM_HELD) != 0) {
2261 			if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
2262 				return (EINTR);
2263 			}
2264 		}
2265 		sc->vmm_flags &= ~VMM_CLEANUP;
2266 	}
2267 
2268 	VERIFY(list_is_empty(&sc->vmm_holds));
2269 	sc->vmm_flags |= VMM_PURGED;
2270 	return (0);
2271 }
2272 
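/*
 * Toggle VMM_BLOCK_HOOK on an instance.  Enabling the block fails with EBUSY
 * if any hold currently has ioport hooks installed; disabling it requires
 * that the block already be asserted.
 */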
2273 static int
2274 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
2275 {
2276 	int err = 0;
2277 
2278 	mutex_enter(&vmm_mtx);
2279 	if (!enable_block) {
2280 		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
2281 
2282 		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
2283 		goto done;
2284 	}
2285 
2286 	/* If any holds have hooks installed, the block attempt fails */
2287 	if (!list_is_empty(&sc->vmm_holds)) {
2288 		vmm_hold_t *hold;
2289 
2290 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2291 		    hold = list_next(&sc->vmm_holds, hold)) {
2292 			if (hold->vmh_ioport_hook_cnt != 0) {
2293 				err = EBUSY;
2294 				goto done;
2295 			}
2296 		}
2297 	}
2298 	sc->vmm_flags |= VMM_BLOCK_HOOK;
2299 
2300 done:
2301 	mutex_exit(&vmm_mtx);
2302 	return (err);
2303 }
2304 
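/*
 * Tear down a VM instance with vmm_mtx held.  If the vmm device is still
 * open, the softc is parked on vmm_destroy_list and final cleanup is deferred
 * to vmm_close(); otherwise the VM, its kstats, and its minor number are
 * released here and *hma_release tells the caller to drop the HMA
 * registration.
 */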
2305 static int
2306 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
2307     boolean_t *hma_release)
2308 {
2309 	dev_info_t	*pdip = ddi_get_parent(vmmdev_dip);
2310 	minor_t		minor;
2311 
2312 	ASSERT(MUTEX_HELD(&vmm_mtx));
2313 
2314 	*hma_release = B_FALSE;
2315 
2316 	if (vmm_drv_purge(sc) != 0) {
2317 		return (EINTR);
2318 	}
2319 
2320 	if (clean_zsd) {
2321 		vmm_zsd_rem_vm(sc);
2322 	}
2323 
2324 	/* Clean up devmem entries */
2325 	vmmdev_devmem_purge(sc);
2326 
2327 	list_remove(&vmm_list, sc);
2328 	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
2329 	minor = sc->vmm_minor;
2330 	zone_rele(sc->vmm_zone);
2331 	if (sc->vmm_is_open) {
2332 		list_insert_tail(&vmm_destroy_list, sc);
2333 		sc->vmm_flags |= VMM_DESTROY;
2334 	} else {
2335 		vmm_kstat_fini(sc);
2336 		vm_destroy(sc->vmm_vm);
2337 		ddi_soft_state_free(vmm_statep, minor);
2338 		id_free(vmm_minors, minor);
2339 		*hma_release = B_TRUE;
2340 	}
2341 	(void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
2342 
2343 	return (0);
2344 }
2345 
2346 int
2347 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
2348 {
2349 	boolean_t	hma_release = B_FALSE;
2350 	int		err;
2351 
2352 	mutex_enter(&vmm_mtx);
2353 	err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
2354 	mutex_exit(&vmm_mtx);
2355 
2356 	if (hma_release)
2357 		vmm_hma_release();
2358 
2359 	return (err);
2360 }
2361 
2362 /* ARGSUSED */
2363 static int
2364 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
2365 {
2366 	boolean_t	hma_release = B_FALSE;
2367 	vmm_softc_t	*sc;
2368 	int		err;
2369 
2370 	if (crgetuid(cr) != 0)
2371 		return (EPERM);
2372 
2373 	mutex_enter(&vmm_mtx);
2374 
2375 	if ((sc = vmm_lookup(req->name)) == NULL) {
2376 		mutex_exit(&vmm_mtx);
2377 		return (ENOENT);
2378 	}
2379 	/*
2380 	 * We don't check this in vmm_lookup() since that function is also
2381 	 * used for validation during create, where vmm names must be unique.
2382 	 */
2383 	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
2384 		mutex_exit(&vmm_mtx);
2385 		return (EPERM);
2386 	}
2387 	err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);
2388 
2389 	mutex_exit(&vmm_mtx);
2390 
2391 	if (hma_release)
2392 		vmm_hma_release();
2393 
2394 	return (err);
2395 }
2396 
2397 #define	VCPU_NAME_BUFLEN	32
2398 
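/*
 * Create (but do not yet install) the per-VM and per-vCPU named kstats for an
 * instance, placing them in the zone of the creating credential and, for
 * non-global zones, also making them visible from the global zone.
 */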
2399 static int
2400 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
2401 {
2402 	zoneid_t zid = crgetzoneid(cr);
2403 	int instance = minor;
2404 	kstat_t *ksp;
2405 
2406 	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
2407 
2408 	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
2409 	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2410 	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
2411 
2412 	if (ksp == NULL) {
2413 		return (-1);
2414 	}
2415 	sc->vmm_kstat_vm = ksp;
2416 
2417 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2418 		char namebuf[VCPU_NAME_BUFLEN];
2419 
2420 		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2421 
2422 		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2423 		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2424 		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2425 		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2426 		    0, zid);
2427 		if (ksp == NULL) {
2428 			goto fail;
2429 		}
2430 
2431 		sc->vmm_kstat_vcpu[i] = ksp;
2432 	}
2433 
2434 	/*
2435 	 * If this instance is associated with a non-global zone, make its
2436 	 * kstats visible from the GZ.
2437 	 */
2438 	if (zid != GLOBAL_ZONEID) {
2439 		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2440 		for (uint_t i = 0; i < VM_MAXCPU; i++) {
2441 			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2442 		}
2443 	}
2444 
2445 	return (0);
2446 
2447 fail:
2448 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2449 		if (sc->vmm_kstat_vcpu[i] != NULL) {
2450 			kstat_delete(sc->vmm_kstat_vcpu[i]);
2451 			sc->vmm_kstat_vcpu[i] = NULL;
2452 		} else {
2453 			break;
2454 		}
2455 	}
2456 	kstat_delete(sc->vmm_kstat_vm);
2457 	sc->vmm_kstat_vm = NULL;
2458 	return (-1);
2459 }
2460 
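/*
 * Populate and install the kstats created by vmm_kstat_alloc() once the VM
 * exists, wiring each vCPU kstat to vmm_kstat_update_vcpu().
 */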
2461 static void
2462 vmm_kstat_init(vmm_softc_t *sc)
2463 {
2464 	kstat_t *ksp;
2465 
2466 	ASSERT3P(sc->vmm_vm, !=, NULL);
2467 	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2468 
2469 	ksp = sc->vmm_kstat_vm;
2470 	vmm_kstats_t *vk = ksp->ks_data;
2471 	ksp->ks_private = sc->vmm_vm;
2472 	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2473 	kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2474 
2475 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2476 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2477 
2478 		ksp = sc->vmm_kstat_vcpu[i];
2479 		vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2480 
2481 		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2482 		vvk->vvk_vcpu.value.ui32 = i;
2483 		kstat_named_init(&vvk->vvk_time_init, "time_init",
2484 		    KSTAT_DATA_UINT64);
2485 		kstat_named_init(&vvk->vvk_time_run, "time_run",
2486 		    KSTAT_DATA_UINT64);
2487 		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2488 		    KSTAT_DATA_UINT64);
2489 		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2490 		    KSTAT_DATA_UINT64);
2491 		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2492 		    KSTAT_DATA_UINT64);
2493 		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2494 		    KSTAT_DATA_UINT64);
2495 		ksp->ks_private = sc->vmm_vm;
2496 		ksp->ks_update = vmm_kstat_update_vcpu;
2497 	}
2498 
2499 	kstat_install(sc->vmm_kstat_vm);
2500 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2501 		kstat_install(sc->vmm_kstat_vcpu[i]);
2502 	}
2503 }
2504 
2505 static void
2506 vmm_kstat_fini(vmm_softc_t *sc)
2507 {
2508 	ASSERT(sc->vmm_kstat_vm != NULL);
2509 
2510 	kstat_delete(sc->vmm_kstat_vm);
2511 	sc->vmm_kstat_vm = NULL;
2512 
2513 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2514 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2515 
2516 		kstat_delete(sc->vmm_kstat_vcpu[i]);
2517 		sc->vmm_kstat_vcpu[i] = NULL;
2518 	}
2519 }
2520 
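/*
 * Open entry point for both the control minor and per-VM minors.  The control
 * node must be opened exclusively; opening a per-VM minor marks the softc as
 * open so that its destruction is deferred until vmm_close().
 */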
2521 static int
2522 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2523 {
2524 	minor_t		minor;
2525 	vmm_softc_t	*sc;
2526 
2527 	/*
2528 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2529 	 * verified to be safe.
2530 	 */
2531 	if (curproc->p_model != DATAMODEL_LP64) {
2532 		return (EFBIG);
2533 	}
2534 
2535 	minor = getminor(*devp);
2536 	if (minor == VMM_CTL_MINOR) {
2537 		/*
2538 		 * Master control device must be opened exclusively.
2539 		 */
2540 		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2541 			return (EINVAL);
2542 		}
2543 
2544 		return (0);
2545 	}
2546 
2547 	mutex_enter(&vmm_mtx);
2548 	sc = ddi_get_soft_state(vmm_statep, minor);
2549 	if (sc == NULL) {
2550 		mutex_exit(&vmm_mtx);
2551 		return (ENXIO);
2552 	}
2553 
2554 	sc->vmm_is_open = B_TRUE;
2555 	mutex_exit(&vmm_mtx);
2556 
2557 	return (0);
2558 }
2559 
2560 static int
2561 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2562 {
2563 	minor_t		minor;
2564 	vmm_softc_t	*sc;
2565 	boolean_t	hma_release = B_FALSE;
2566 
2567 	minor = getminor(dev);
2568 	if (minor == VMM_CTL_MINOR)
2569 		return (0);
2570 
2571 	mutex_enter(&vmm_mtx);
2572 	sc = ddi_get_soft_state(vmm_statep, minor);
2573 	if (sc == NULL) {
2574 		mutex_exit(&vmm_mtx);
2575 		return (ENXIO);
2576 	}
2577 
2578 	VERIFY(sc->vmm_is_open);
2579 	sc->vmm_is_open = B_FALSE;
2580 
2581 	/*
2582 	 * If this VM was destroyed while the vmm device was open, then
2583 	 * clean it up now that it is closed.
2584 	 */
2585 	if (sc->vmm_flags & VMM_DESTROY) {
2586 		list_remove(&vmm_destroy_list, sc);
2587 		vmm_kstat_fini(sc);
2588 		vm_destroy(sc->vmm_vm);
2589 		ddi_soft_state_free(vmm_statep, minor);
2590 		id_free(vmm_minors, minor);
2591 		hma_release = B_TRUE;
2592 	}
2593 	mutex_exit(&vmm_mtx);
2594 
2595 	if (hma_release)
2596 		vmm_hma_release();
2597 
2598 	return (0);
2599 }
2600 
2601 static int
2602 vmm_is_supported(intptr_t arg)
2603 {
2604 	int r;
2605 	const char *msg;
2606 
2607 	if (vmm_is_intel()) {
2608 		r = vmx_x86_supported(&msg);
2609 	} else if (vmm_is_svm()) {
2610 		/*
2611 		 * HMA already ensured that the features necessary for SVM
2612 		 * operation were present and online during vmm_attach().
2613 		 */
2614 		r = 0;
2615 	} else {
2616 		r = ENXIO;
2617 		msg = "Unsupported CPU vendor";
2618 	}
2619 
2620 	if (r != 0 && arg != (intptr_t)NULL) {
2621 		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2622 			return (EFAULT);
2623 	}
2624 	return (r);
2625 }
2626 
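/*
 * ioctls accepted on the control minor: VM creation and destruction (both
 * requiring FWRITE), capability and interface-version queries, the IOMMU
 * check, and the reservoir operations passed through to vmmr_ioctl().
 */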
2627 static int
2628 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
2629 {
2630 	void *argp = (void *)arg;
2631 
2632 	switch (cmd) {
2633 	case VMM_CREATE_VM: {
2634 		struct vm_create_req req;
2635 
2636 		if ((md & FWRITE) == 0) {
2637 			return (EPERM);
2638 		}
2639 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2640 			return (EFAULT);
2641 		}
2642 		return (vmmdev_do_vm_create(&req, cr));
2643 	}
2644 	case VMM_DESTROY_VM: {
2645 		struct vm_destroy_req req;
2646 
2647 		if ((md & FWRITE) == 0) {
2648 			return (EPERM);
2649 		}
2650 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2651 			return (EFAULT);
2652 		}
2653 		return (vmmdev_do_vm_destroy(&req, cr));
2654 	}
2655 	case VMM_VM_SUPPORTED:
2656 		return (vmm_is_supported(arg));
2657 	case VMM_INTERFACE_VERSION:
2658 		*rvalp = VMM_CURRENT_INTERFACE_VERSION;
2659 		return (0);
2660 	case VMM_CHECK_IOMMU:
2661 		if (!vmm_check_iommu()) {
2662 			return (ENXIO);
2663 		}
2664 		return (0);
2665 	case VMM_RESV_QUERY:
2666 	case VMM_RESV_ADD:
2667 	case VMM_RESV_REMOVE:
2668 		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
2669 	default:
2670 		break;
2671 	}
2672 	/* No other actions are legal on ctl device */
2673 	return (ENOTTY);
2674 }
2675 
2676 static int
2677 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2678     int *rvalp)
2679 {
2680 	vmm_softc_t	*sc;
2681 	minor_t		minor;
2682 
2683 	/*
2684 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2685 	 * verified to be safe.
2686 	 */
2687 	if (curproc->p_model != DATAMODEL_LP64) {
2688 		return (EFBIG);
2689 	}
2690 
2691 	/* The structs in bhyve ioctls assume a 64-bit datamodel */
2692 	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
2693 		return (ENOTSUP);
2694 	}
2695 
2696 	minor = getminor(dev);
2697 
2698 	if (minor == VMM_CTL_MINOR) {
2699 		return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
2700 	}
2701 
2702 	sc = ddi_get_soft_state(vmm_statep, minor);
2703 	ASSERT(sc);
2704 
2705 	if (sc->vmm_flags & VMM_DESTROY)
2706 		return (ENXIO);
2707 
2708 	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
2709 }
2710 
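/*
 * segmap entry point for per-VM minors.  Offsets at or above VM_DEVMEM_START
 * map a devmem segment; lower offsets map guest-physical address space.  The
 * VM read lock is held across the mapping to keep the memory map stable.
 */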
2711 static int
2712 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
2713     unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
2714 {
2715 	vmm_softc_t *sc;
2716 	const minor_t minor = getminor(dev);
2717 	int err;
2718 
2719 	if (minor == VMM_CTL_MINOR) {
2720 		return (ENODEV);
2721 	}
2722 	if (off < 0 || (off + len) <= 0) {
2723 		return (EINVAL);
2724 	}
2725 	if ((prot & PROT_USER) == 0) {
2726 		return (EACCES);
2727 	}
2728 
2729 	sc = ddi_get_soft_state(vmm_statep, minor);
2730 	ASSERT(sc);
2731 
2732 	if (sc->vmm_flags & VMM_DESTROY)
2733 		return (ENXIO);
2734 
2735 	/* Grab read lock on the VM to prevent any changes to the memory map */
2736 	vmm_read_lock(sc);
2737 
2738 	if (off >= VM_DEVMEM_START) {
2739 		int segid;
2740 		off_t segoff;
2741 
2742 		/* Mapping a devmem "device" */
2743 		if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
2744 			err = ENODEV;
2745 		} else {
2746 			err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
2747 			    addrp, prot, maxprot, flags);
2748 		}
2749 	} else {
2750 		/* Mapping a part of the guest physical space */
2751 		err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
2752 		    maxprot, flags);
2753 	}
2754 
2755 	vmm_read_unlock(sc);
2756 	return (err);
2757 }
2758 
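/*
 * sdev plugin callbacks backing the nodes under /dev/vmm: validate checks
 * that a node still corresponds to a live instance (and the correct minor),
 * while filldir creates nodes for the instances visible to the caller's zone.
 */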
2759 static sdev_plugin_validate_t
2760 vmm_sdev_validate(sdev_ctx_t ctx)
2761 {
2762 	const char *name = sdev_ctx_name(ctx);
2763 	vmm_softc_t *sc;
2764 	sdev_plugin_validate_t ret;
2765 	minor_t minor;
2766 
2767 	if (sdev_ctx_vtype(ctx) != VCHR)
2768 		return (SDEV_VTOR_INVALID);
2769 
2770 	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
2771 
2772 	mutex_enter(&vmm_mtx);
2773 	if ((sc = vmm_lookup(name)) == NULL)
2774 		ret = SDEV_VTOR_INVALID;
2775 	else if (sc->vmm_minor != minor)
2776 		ret = SDEV_VTOR_STALE;
2777 	else
2778 		ret = SDEV_VTOR_VALID;
2779 	mutex_exit(&vmm_mtx);
2780 
2781 	return (ret);
2782 }
2783 
2784 static int
2785 vmm_sdev_filldir(sdev_ctx_t ctx)
2786 {
2787 	vmm_softc_t *sc;
2788 	int ret;
2789 
2790 	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
2791 		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
2792 		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
2793 		return (EINVAL);
2794 	}
2795 
2796 	mutex_enter(&vmm_mtx);
2797 	ASSERT(vmmdev_dip != NULL);
2798 	for (sc = list_head(&vmm_list); sc != NULL;
2799 	    sc = list_next(&vmm_list, sc)) {
2800 		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
2801 			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
2802 			    S_IFCHR | 0600,
2803 			    makedevice(ddi_driver_major(vmmdev_dip),
2804 			    sc->vmm_minor));
2805 		} else {
2806 			continue;
2807 		}
2808 		if (ret != 0 && ret != EEXIST)
2809 			goto out;
2810 	}
2811 
2812 	ret = 0;
2813 
2814 out:
2815 	mutex_exit(&vmm_mtx);
2816 	return (ret);
2817 }
2818 
2819 /* ARGSUSED */
2820 static void
2821 vmm_sdev_inactive(sdev_ctx_t ctx)
2822 {
2823 }
2824 
2825 static sdev_plugin_ops_t vmm_sdev_ops = {
2826 	.spo_version = SDEV_PLUGIN_VERSION,
2827 	.spo_flags = SDEV_PLUGIN_SUBDIR,
2828 	.spo_validate = vmm_sdev_validate,
2829 	.spo_filldir = vmm_sdev_filldir,
2830 	.spo_inactive = vmm_sdev_inactive
2831 };
2832 
2833 /* ARGSUSED */
2834 static int
2835 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
2836 {
2837 	int error;
2838 
2839 	switch (cmd) {
2840 	case DDI_INFO_DEVT2DEVINFO:
2841 		*result = (void *)vmmdev_dip;
2842 		error = DDI_SUCCESS;
2843 		break;
2844 	case DDI_INFO_DEVT2INSTANCE:
2845 		*result = (void *)0;
2846 		error = DDI_SUCCESS;
2847 		break;
2848 	default:
2849 		error = DDI_FAILURE;
2850 		break;
2851 	}
2852 	return (error);
2853 }
2854 
2855 static int
2856 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2857 {
2858 	sdev_plugin_hdl_t sph;
2859 	hma_reg_t *reg = NULL;
2860 	boolean_t vmm_loaded = B_FALSE;
2861 
2862 	if (cmd != DDI_ATTACH) {
2863 		return (DDI_FAILURE);
2864 	}
2865 
2866 	mutex_enter(&vmmdev_mtx);
2867 	/* Ensure we are not already attached. */
2868 	if (vmmdev_dip != NULL) {
2869 		mutex_exit(&vmmdev_mtx);
2870 		return (DDI_FAILURE);
2871 	}
2872 
2873 	vmm_sol_glue_init();
2874 
2875 	/*
2876 	 * Perform a temporary HMA registration to determine whether the
2877 	 * system is capable of hosting bhyve guests.
2878 	 */
2879 	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
2880 		goto fail;
2881 	} else if (vmm_mod_load() != 0) {
2882 		goto fail;
2883 	}
2884 	vmm_loaded = B_TRUE;
2885 	hma_unregister(reg);
2886 	reg = NULL;
2887 
2888 	/* Create control node.  Other nodes will be created on demand. */
2889 	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
2890 	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
2891 		goto fail;
2892 	}
2893 
2894 	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
2895 	if (sph == (sdev_plugin_hdl_t)NULL) {
2896 		ddi_remove_minor_node(dip, NULL);
2897 		goto fail;
2898 	}
2899 
2900 	ddi_report_dev(dip);
2901 	vmmdev_sdev_hdl = sph;
2902 	vmmdev_dip = dip;
2903 	mutex_exit(&vmmdev_mtx);
2904 	return (DDI_SUCCESS);
2905 
2906 fail:
2907 	if (vmm_loaded) {
2908 		VERIFY0(vmm_mod_unload());
2909 	}
2910 	if (reg != NULL) {
2911 		hma_unregister(reg);
2912 	}
2913 	vmm_sol_glue_cleanup();
2914 	mutex_exit(&vmmdev_mtx);
2915 	return (DDI_FAILURE);
2916 }
2917 
2918 static int
2919 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2920 {
2921 	if (cmd != DDI_DETACH) {
2922 		return (DDI_FAILURE);
2923 	}
2924 
2925 	/*
2926 	 * Ensure that all resources have been cleaned up.
2927 	 *
2928 	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
2929 	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
2930 	 * devinfo locked as iommu_cleanup() tries to recursively lock each
2931 	 * devinfo, including our own, while holding vmmdev_mtx.
2932 	 */
2933 	if (mutex_tryenter(&vmmdev_mtx) == 0)
2934 		return (DDI_FAILURE);
2935 
2936 	mutex_enter(&vmm_mtx);
2937 	if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
2938 		mutex_exit(&vmm_mtx);
2939 		mutex_exit(&vmmdev_mtx);
2940 		return (DDI_FAILURE);
2941 	}
2942 	mutex_exit(&vmm_mtx);
2943 
2944 	if (!vmmr_is_empty()) {
2945 		mutex_exit(&vmmdev_mtx);
2946 		return (DDI_FAILURE);
2947 	}
2948 
2949 	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
2950 	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
2951 		mutex_exit(&vmmdev_mtx);
2952 		return (DDI_FAILURE);
2953 	}
2954 	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
2955 
2956 	/* Remove the control node. */
2957 	ddi_remove_minor_node(dip, "ctl");
2958 	vmmdev_dip = NULL;
2959 
2960 	VERIFY0(vmm_mod_unload());
2961 	VERIFY3U(vmmdev_hma_reg, ==, NULL);
2962 	vmm_sol_glue_cleanup();
2963 
2964 	mutex_exit(&vmmdev_mtx);
2965 
2966 	return (DDI_SUCCESS);
2967 }
2968 
2969 static struct cb_ops vmm_cb_ops = {
2970 	vmm_open,
2971 	vmm_close,
2972 	nodev,		/* strategy */
2973 	nodev,		/* print */
2974 	nodev,		/* dump */
2975 	nodev,		/* read */
2976 	nodev,		/* write */
2977 	vmm_ioctl,
2978 	nodev,		/* devmap */
2979 	nodev,		/* mmap */
2980 	vmm_segmap,
2981 	nochpoll,	/* poll */
2982 	ddi_prop_op,
2983 	NULL,
2984 	D_NEW | D_MP | D_DEVMAP
2985 };
2986 
2987 static struct dev_ops vmm_ops = {
2988 	DEVO_REV,
2989 	0,
2990 	vmm_info,
2991 	nulldev,	/* identify */
2992 	nulldev,	/* probe */
2993 	vmm_attach,
2994 	vmm_detach,
2995 	nodev,		/* reset */
2996 	&vmm_cb_ops,
2997 	(struct bus_ops *)NULL
2998 };
2999 
3000 static struct modldrv modldrv = {
3001 	&mod_driverops,
3002 	"bhyve vmm",
3003 	&vmm_ops
3004 };
3005 
3006 static struct modlinkage modlinkage = {
3007 	MODREV_1,
3008 	&modldrv,
3009 	NULL
3010 };
3011 
3012 int
3013 _init(void)
3014 {
3015 	int	error;
3016 
3017 	sysinit();
3018 
3019 	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
3020 	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
3021 	list_create(&vmm_list, sizeof (vmm_softc_t),
3022 	    offsetof(vmm_softc_t, vmm_node));
3023 	list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
3024 	    offsetof(vmm_softc_t, vmm_node));
3025 	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
3026 
3027 	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
3028 	if (error) {
3029 		return (error);
3030 	}
3031 
3032 	vmm_zsd_init();
3033 	vmmr_init();
3034 
3035 	error = mod_install(&modlinkage);
3036 	if (error) {
3037 		ddi_soft_state_fini(&vmm_statep);
3038 		vmm_zsd_fini();
3039 		vmmr_fini();
3040 	}
3041 
3042 	return (error);
3043 }
3044 
3045 int
3046 _fini(void)
3047 {
3048 	int	error;
3049 
3050 	error = mod_remove(&modlinkage);
3051 	if (error) {
3052 		return (error);
3053 	}
3054 
3055 	vmm_zsd_fini();
3056 	vmmr_fini();
3057 
3058 	ddi_soft_state_fini(&vmm_statep);
3059 
3060 	return (0);
3061 }
3062 
3063 int
3064 _info(struct modinfo *modinfop)
3065 {
3066 	return (mod_info(&modlinkage, modinfop));
3067 }
3068