xref: /freebsd/sys/amd64/vmm/vmm_dev.c (revision 61e21613)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include "opt_bhyve_snapshot.h"
31 
32 #include <sys/param.h>
33 #include <sys/kernel.h>
34 #include <sys/jail.h>
35 #include <sys/queue.h>
36 #include <sys/lock.h>
37 #include <sys/mutex.h>
38 #include <sys/malloc.h>
39 #include <sys/conf.h>
40 #include <sys/sysctl.h>
41 #include <sys/libkern.h>
42 #include <sys/ioccom.h>
43 #include <sys/mman.h>
44 #include <sys/uio.h>
45 #include <sys/proc.h>
46 
47 #include <vm/vm.h>
48 #include <vm/pmap.h>
49 #include <vm/vm_map.h>
50 #include <vm/vm_object.h>
51 
52 #include <machine/vmparam.h>
53 #include <machine/vmm.h>
54 #include <machine/vmm_dev.h>
55 #include <machine/vmm_instruction_emul.h>
56 #include <machine/vmm_snapshot.h>
57 #include <x86/apicreg.h>
58 
59 #include "vmm_lapic.h"
60 #include "vmm_stat.h"
61 #include "vmm_mem.h"
62 #include "io/ppt.h"
63 #include "io/vatpic.h"
64 #include "io/vioapic.h"
65 #include "io/vhpet.h"
66 #include "io/vrtc.h"
67 
68 #ifdef COMPAT_FREEBSD13
69 struct vm_stats_old {
70 	int		cpuid;				/* in */
71 	int		num_entries;			/* out */
72 	struct timeval	tv;
73 	uint64_t	statbuf[MAX_VM_STATS];
74 };
75 
76 #define	VM_STATS_OLD \
77 	_IOWR('v', IOCNUM_VM_STATS, struct vm_stats_old)
78 
79 struct vm_snapshot_meta_old {
80 	void *ctx;			/* unused */
81 	void *dev_data;
82 	const char *dev_name;      /* identify userspace devices */
83 	enum snapshot_req dev_req; /* identify kernel structs */
84 
85 	struct vm_snapshot_buffer buffer;
86 
87 	enum vm_snapshot_op op;
88 };
89 
90 #define VM_SNAPSHOT_REQ_OLD \
91 	_IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta_old)
92 
93 struct vm_exit_ipi_13 {
94 	uint32_t	mode;
95 	uint8_t		vector;
96 	__BITSET_DEFINE(, 256) dmask;
97 };
98 
99 struct vm_exit_13 {
100 	uint32_t	exitcode;
101 	int32_t		inst_length;
102 	uint64_t	rip;
103 	uint64_t	u[120 / sizeof(uint64_t)];
104 };
105 
106 struct vm_run_13 {
107 	int		cpuid;
108 	struct vm_exit_13 vm_exit;
109 };
110 
111 #define	VM_RUN_13 \
112 	_IOWR('v', IOCNUM_RUN, struct vm_run_13)
113 
114 #endif /* COMPAT_FREEBSD13 */
115 
116 struct devmem_softc {
117 	int	segid;
118 	char	*name;
119 	struct cdev *cdev;
120 	struct vmmdev_softc *sc;
121 	SLIST_ENTRY(devmem_softc) link;
122 };
123 
124 struct vmmdev_softc {
125 	struct vm	*vm;		/* vm instance cookie */
126 	struct cdev	*cdev;
127 	struct ucred	*ucred;
128 	SLIST_ENTRY(vmmdev_softc) link;
129 	SLIST_HEAD(, devmem_softc) devmem;
130 	int		flags;
131 };
132 #define	VSC_LINKED		0x01
133 
134 static SLIST_HEAD(, vmmdev_softc) head;
135 
136 static unsigned pr_allow_flag;
137 static struct mtx vmmdev_mtx;
138 MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF);
139 
140 static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
141 
142 SYSCTL_DECL(_hw_vmm);
143 
144 static int vmm_priv_check(struct ucred *ucred);
145 static int devmem_create_cdev(const char *vmname, int id, char *devmem);
146 static void devmem_destroy(void *arg);
147 
148 static int
149 vmm_priv_check(struct ucred *ucred)
150 {
151 
152 	if (jailed(ucred) &&
153 	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
154 		return (EPERM);
155 
156 	return (0);
157 }
158 
159 static int
160 vcpu_lock_one(struct vcpu *vcpu)
161 {
162 	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
163 }
164 
165 static void
166 vcpu_unlock_one(struct vmmdev_softc *sc, int vcpuid, struct vcpu *vcpu)
167 {
168 	enum vcpu_state state;
169 
170 	state = vcpu_get_state(vcpu, NULL);
171 	if (state != VCPU_FROZEN) {
172 		panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
173 		    vcpuid, state);
174 	}
175 
176 	vcpu_set_state(vcpu, VCPU_IDLE, false);
177 }
178 
179 static int
180 vcpu_lock_all(struct vmmdev_softc *sc)
181 {
182 	struct vcpu *vcpu;
183 	int error;
184 	uint16_t i, j, maxcpus;
185 
186 	error = 0;
187 	vm_slock_vcpus(sc->vm);
188 	maxcpus = vm_get_maxcpus(sc->vm);
189 	for (i = 0; i < maxcpus; i++) {
190 		vcpu = vm_vcpu(sc->vm, i);
191 		if (vcpu == NULL)
192 			continue;
193 		error = vcpu_lock_one(vcpu);
194 		if (error)
195 			break;
196 	}
197 
198 	if (error) {
199 		for (j = 0; j < i; j++) {
200 			vcpu = vm_vcpu(sc->vm, j);
201 			if (vcpu == NULL)
202 				continue;
203 			vcpu_unlock_one(sc, j, vcpu);
204 		}
205 		vm_unlock_vcpus(sc->vm);
206 	}
207 
208 	return (error);
209 }
210 
211 static void
212 vcpu_unlock_all(struct vmmdev_softc *sc)
213 {
214 	struct vcpu *vcpu;
215 	uint16_t i, maxcpus;
216 
217 	maxcpus = vm_get_maxcpus(sc->vm);
218 	for (i = 0; i < maxcpus; i++) {
219 		vcpu = vm_vcpu(sc->vm, i);
220 		if (vcpu == NULL)
221 			continue;
222 		vcpu_unlock_one(sc, i, vcpu);
223 	}
224 	vm_unlock_vcpus(sc->vm);
225 }
226 
227 static struct vmmdev_softc *
228 vmmdev_lookup(const char *name)
229 {
230 	struct vmmdev_softc *sc;
231 
232 #ifdef notyet	/* XXX kernel is not compiled with invariants */
233 	mtx_assert(&vmmdev_mtx, MA_OWNED);
234 #endif
235 
236 	SLIST_FOREACH(sc, &head, link) {
237 		if (strcmp(name, vm_name(sc->vm)) == 0)
238 			break;
239 	}
240 
241 	if (sc == NULL)
242 		return (NULL);
243 
244 	if (cr_cansee(curthread->td_ucred, sc->ucred))
245 		return (NULL);
246 
247 	return (sc);
248 }
249 
250 static struct vmmdev_softc *
251 vmmdev_lookup2(struct cdev *cdev)
252 {
253 
254 	return (cdev->si_drv1);
255 }
256 
257 static int
258 vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
259 {
260 	int error, off, c, prot;
261 	vm_paddr_t gpa, maxaddr;
262 	void *hpa, *cookie;
263 	struct vmmdev_softc *sc;
264 
265 	error = vmm_priv_check(curthread->td_ucred);
266 	if (error)
267 		return (error);
268 
269 	sc = vmmdev_lookup2(cdev);
270 	if (sc == NULL)
271 		return (ENXIO);
272 
273 	/*
274 	 * Get a read lock on the guest memory map.
275 	 */
276 	vm_slock_memsegs(sc->vm);
277 
278 	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
279 	maxaddr = vmm_sysmem_maxaddr(sc->vm);
280 	while (uio->uio_resid > 0 && error == 0) {
281 		gpa = uio->uio_offset;
282 		off = gpa & PAGE_MASK;
283 		c = min(uio->uio_resid, PAGE_SIZE - off);
284 
285 		/*
286 		 * The VM has a hole in its physical memory map. If we want to
287 		 * use 'dd' to inspect memory beyond the hole we need to
288 		 * provide bogus data for memory that lies in the hole.
289 		 *
290 		 * Since this device does not support lseek(2), dd(1) will
291 		 * read(2) blocks of data to simulate the lseek(2).
292 		 */
293 		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
294 		if (hpa == NULL) {
295 			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
296 				error = uiomove(__DECONST(void *, zero_region),
297 				    c, uio);
298 			else
299 				error = EFAULT;
300 		} else {
301 			error = uiomove(hpa, c, uio);
302 			vm_gpa_release(cookie);
303 		}
304 	}
305 	vm_unlock_memsegs(sc->vm);
306 	return (error);
307 }
308 
309 CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);
310 
311 static int
312 get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
313 {
314 	struct devmem_softc *dsc;
315 	int error;
316 	bool sysmem;
317 
318 	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
319 	if (error || mseg->len == 0)
320 		return (error);
321 
322 	if (!sysmem) {
323 		SLIST_FOREACH(dsc, &sc->devmem, link) {
324 			if (dsc->segid == mseg->segid)
325 				break;
326 		}
327 		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
328 		    __func__, mseg->segid));
329 		error = copystr(dsc->name, mseg->name, len, NULL);
330 	} else {
331 		bzero(mseg->name, len);
332 	}
333 
334 	return (error);
335 }
336 
337 static int
338 alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
339 {
340 	char *name;
341 	int error;
342 	bool sysmem;
343 
344 	error = 0;
345 	name = NULL;
346 	sysmem = true;
347 
348 	/*
349 	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
350 	 * by stripped off when devfs processes the full string.
351 	 */
352 	if (VM_MEMSEG_NAME(mseg)) {
353 		sysmem = false;
354 		name = malloc(len, M_VMMDEV, M_WAITOK);
355 		error = copystr(mseg->name, name, len, NULL);
356 		if (error)
357 			goto done;
358 	}
359 
360 	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
361 	if (error)
362 		goto done;
363 
364 	if (VM_MEMSEG_NAME(mseg)) {
365 		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
366 		if (error)
367 			vm_free_memseg(sc->vm, mseg->segid);
368 		else
369 			name = NULL;	/* freed when 'cdev' is destroyed */
370 	}
371 done:
372 	free(name, M_VMMDEV);
373 	return (error);
374 }
375 
376 static int
377 vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
378     uint64_t *regval)
379 {
380 	int error, i;
381 
382 	error = 0;
383 	for (i = 0; i < count; i++) {
384 		error = vm_get_register(vcpu, regnum[i], &regval[i]);
385 		if (error)
386 			break;
387 	}
388 	return (error);
389 }
390 
391 static int
392 vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
393     uint64_t *regval)
394 {
395 	int error, i;
396 
397 	error = 0;
398 	for (i = 0; i < count; i++) {
399 		error = vm_set_register(vcpu, regnum[i], regval[i]);
400 		if (error)
401 			break;
402 	}
403 	return (error);
404 }
405 
406 static int
407 vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
408 	     struct thread *td)
409 {
410 	int error, vcpuid, size;
411 	cpuset_t *cpuset;
412 	struct vmmdev_softc *sc;
413 	struct vcpu *vcpu;
414 	struct vm_register *vmreg;
415 	struct vm_seg_desc *vmsegdesc;
416 	struct vm_register_set *vmregset;
417 	struct vm_run *vmrun;
418 #ifdef COMPAT_FREEBSD13
419 	struct vm_run_13 *vmrun_13;
420 #endif
421 	struct vm_exception *vmexc;
422 	struct vm_lapic_irq *vmirq;
423 	struct vm_lapic_msi *vmmsi;
424 	struct vm_ioapic_irq *ioapic_irq;
425 	struct vm_isa_irq *isa_irq;
426 	struct vm_isa_irq_trigger *isa_irq_trigger;
427 	struct vm_capability *vmcap;
428 	struct vm_pptdev *pptdev;
429 	struct vm_pptdev_mmio *pptmmio;
430 	struct vm_pptdev_msi *pptmsi;
431 	struct vm_pptdev_msix *pptmsix;
432 #ifdef COMPAT_FREEBSD13
433 	struct vm_stats_old *vmstats_old;
434 #endif
435 	struct vm_stats *vmstats;
436 	struct vm_stat_desc *statdesc;
437 	struct vm_x2apic *x2apic;
438 	struct vm_gpa_pte *gpapte;
439 	struct vm_suspend *vmsuspend;
440 	struct vm_gla2gpa *gg;
441 	struct vm_cpuset *vm_cpuset;
442 	struct vm_intinfo *vmii;
443 	struct vm_rtc_time *rtctime;
444 	struct vm_rtc_data *rtcdata;
445 	struct vm_memmap *mm;
446 	struct vm_munmap *mu;
447 	struct vm_cpu_topology *topology;
448 	struct vm_readwrite_kernemu_device *kernemu;
449 	uint64_t *regvals;
450 	int *regnums;
451 	enum { NONE, SINGLE, ALL } vcpus_locked;
452 	bool memsegs_locked;
453 #ifdef BHYVE_SNAPSHOT
454 	struct vm_snapshot_meta *snapshot_meta;
455 #ifdef COMPAT_FREEBSD13
456 	struct vm_snapshot_meta_old *snapshot_old;
457 #endif
458 #endif
459 
460 	error = vmm_priv_check(curthread->td_ucred);
461 	if (error)
462 		return (error);
463 
464 	sc = vmmdev_lookup2(cdev);
465 	if (sc == NULL)
466 		return (ENXIO);
467 
468 	vcpuid = -1;
469 	vcpu = NULL;
470 	vcpus_locked = NONE;
471 	memsegs_locked = false;
472 
473 	/*
474 	 * For VMM ioctls that operate on a single vCPU, lookup the
475 	 * vcpu.  For VMM ioctls which require one or more vCPUs to
476 	 * not be running, lock necessary vCPUs.
477 	 *
478 	 * XXX fragile, handle with care
479 	 * Most of these assume that the first field of the ioctl data
480 	 * is the vcpuid.
481 	 */
482 	switch (cmd) {
483 	case VM_RUN:
484 #ifdef COMPAT_FREEBSD13
485 	case VM_RUN_13:
486 #endif
487 	case VM_GET_REGISTER:
488 	case VM_SET_REGISTER:
489 	case VM_GET_SEGMENT_DESCRIPTOR:
490 	case VM_SET_SEGMENT_DESCRIPTOR:
491 	case VM_GET_REGISTER_SET:
492 	case VM_SET_REGISTER_SET:
493 	case VM_INJECT_EXCEPTION:
494 	case VM_GET_CAPABILITY:
495 	case VM_SET_CAPABILITY:
496 	case VM_SET_X2APIC_STATE:
497 	case VM_GLA2GPA:
498 	case VM_GLA2GPA_NOFAULT:
499 	case VM_ACTIVATE_CPU:
500 	case VM_SET_INTINFO:
501 	case VM_GET_INTINFO:
502 	case VM_RESTART_INSTRUCTION:
503 	case VM_GET_KERNEMU_DEV:
504 	case VM_SET_KERNEMU_DEV:
505 		/*
506 		 * ioctls that can operate only on vcpus that are not running.
507 		 */
508 		vcpuid = *(int *)data;
509 		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
510 		if (vcpu == NULL) {
511 			error = EINVAL;
512 			goto done;
513 		}
514 		error = vcpu_lock_one(vcpu);
515 		if (error)
516 			goto done;
517 		vcpus_locked = SINGLE;
518 		break;
519 
520 #ifdef COMPAT_FREEBSD12
521 	case VM_ALLOC_MEMSEG_FBSD12:
522 #endif
523 	case VM_ALLOC_MEMSEG:
524 	case VM_BIND_PPTDEV:
525 	case VM_UNBIND_PPTDEV:
526 	case VM_MMAP_MEMSEG:
527 	case VM_MUNMAP_MEMSEG:
528 	case VM_REINIT:
529 		/*
530 		 * ioctls that modify the memory map must lock memory
531 		 * segments exclusively.
532 		 */
533 		vm_xlock_memsegs(sc->vm);
534 		memsegs_locked = true;
535 		/* FALLTHROUGH */
536 	case VM_MAP_PPTDEV_MMIO:
537 	case VM_UNMAP_PPTDEV_MMIO:
538 #ifdef BHYVE_SNAPSHOT
539 	case VM_SNAPSHOT_REQ:
540 #ifdef COMPAT_FREEBSD13
541 	case VM_SNAPSHOT_REQ_OLD:
542 #endif
543 	case VM_RESTORE_TIME:
544 #endif
545 		/*
546 		 * ioctls that operate on the entire virtual machine must
547 		 * prevent all vcpus from running.
548 		 */
549 		error = vcpu_lock_all(sc);
550 		if (error)
551 			goto done;
552 		vcpus_locked = ALL;
553 		break;
554 
555 #ifdef COMPAT_FREEBSD12
556 	case VM_GET_MEMSEG_FBSD12:
557 #endif
558 	case VM_GET_MEMSEG:
559 	case VM_MMAP_GETNEXT:
560 		/*
561 		 * Lock the memory map while it is being inspected.
562 		 */
563 		vm_slock_memsegs(sc->vm);
564 		memsegs_locked = true;
565 		break;
566 
567 #ifdef COMPAT_FREEBSD13
568 	case VM_STATS_OLD:
569 #endif
570 	case VM_STATS:
571 	case VM_INJECT_NMI:
572 	case VM_LAPIC_IRQ:
573 	case VM_GET_X2APIC_STATE:
574 		/*
575 		 * These do not need the vCPU locked but do operate on
576 		 * a specific vCPU.
577 		 */
578 		vcpuid = *(int *)data;
579 		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
580 		if (vcpu == NULL) {
581 			error = EINVAL;
582 			goto done;
583 		}
584 		break;
585 
586 	case VM_LAPIC_LOCAL_IRQ:
587 	case VM_SUSPEND_CPU:
588 	case VM_RESUME_CPU:
589 		/*
590 		 * These can either operate on all CPUs via a vcpuid of
591 		 * -1 or on a specific vCPU.
592 		 */
593 		vcpuid = *(int *)data;
594 		if (vcpuid == -1)
595 			break;
596 		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
597 		if (vcpu == NULL) {
598 			error = EINVAL;
599 			goto done;
600 		}
601 		break;
602 
603 	default:
604 		break;
605 	}
606 
607 	switch (cmd) {
608 	case VM_RUN: {
609 		struct vm_exit *vme;
610 
611 		vmrun = (struct vm_run *)data;
612 		vme = vm_exitinfo(vcpu);
613 
614 		error = vm_run(vcpu);
615 		if (error != 0)
616 			break;
617 
618 		error = copyout(vme, vmrun->vm_exit, sizeof(*vme));
619 		if (error != 0)
620 			break;
621 		if (vme->exitcode == VM_EXITCODE_IPI) {
622 			error = copyout(vm_exitinfo_cpuset(vcpu),
623 			    vmrun->cpuset,
624 			    min(vmrun->cpusetsize, sizeof(cpuset_t)));
625 			if (error != 0)
626 				break;
627 			if (sizeof(cpuset_t) < vmrun->cpusetsize) {
628 				uint8_t *p;
629 
630 				p = (uint8_t *)vmrun->cpuset +
631 				    sizeof(cpuset_t);
632 				while (p < (uint8_t *)vmrun->cpuset +
633 				    vmrun->cpusetsize) {
634 					if (subyte(p++, 0) != 0) {
635 						error = EFAULT;
636 						break;
637 					}
638 				}
639 			}
640 		}
641 		break;
642 	}
643 #ifdef COMPAT_FREEBSD13
644 	case VM_RUN_13: {
645 		struct vm_exit *vme;
646 		struct vm_exit_13 *vme_13;
647 
648 		vmrun_13 = (struct vm_run_13 *)data;
649 		vme_13 = &vmrun_13->vm_exit;
650 		vme = vm_exitinfo(vcpu);
651 
652 		error = vm_run(vcpu);
653 		if (error == 0) {
654 			vme_13->exitcode = vme->exitcode;
655 			vme_13->inst_length = vme->inst_length;
656 			vme_13->rip = vme->rip;
657 			memcpy(vme_13->u, &vme->u, sizeof(vme_13->u));
658 			if (vme->exitcode == VM_EXITCODE_IPI) {
659 				struct vm_exit_ipi_13 *ipi;
660 				cpuset_t *dmask;
661 				int cpu;
662 
663 				dmask = vm_exitinfo_cpuset(vcpu);
664 				ipi = (struct vm_exit_ipi_13 *)&vme_13->u[0];
665 				BIT_ZERO(256, &ipi->dmask);
666 				CPU_FOREACH_ISSET(cpu, dmask) {
667 					if (cpu >= 256)
668 						break;
669 					BIT_SET(256, cpu, &ipi->dmask);
670 				}
671 			}
672 		}
673 		break;
674 	}
675 #endif
676 	case VM_SUSPEND:
677 		vmsuspend = (struct vm_suspend *)data;
678 		error = vm_suspend(sc->vm, vmsuspend->how);
679 		break;
680 	case VM_REINIT:
681 		error = vm_reinit(sc->vm);
682 		break;
683 	case VM_STAT_DESC: {
684 		statdesc = (struct vm_stat_desc *)data;
685 		error = vmm_stat_desc_copy(statdesc->index,
686 					statdesc->desc, sizeof(statdesc->desc));
687 		break;
688 	}
689 #ifdef COMPAT_FREEBSD13
690 	case VM_STATS_OLD:
691 		vmstats_old = (struct vm_stats_old *)data;
692 		getmicrotime(&vmstats_old->tv);
693 		error = vmm_stat_copy(vcpu, 0,
694 				      nitems(vmstats_old->statbuf),
695 				      &vmstats_old->num_entries,
696 				      vmstats_old->statbuf);
697 		break;
698 #endif
699 	case VM_STATS: {
700 		vmstats = (struct vm_stats *)data;
701 		getmicrotime(&vmstats->tv);
702 		error = vmm_stat_copy(vcpu, vmstats->index,
703 				      nitems(vmstats->statbuf),
704 				      &vmstats->num_entries, vmstats->statbuf);
705 		break;
706 	}
707 	case VM_PPTDEV_MSI:
708 		pptmsi = (struct vm_pptdev_msi *)data;
709 		error = ppt_setup_msi(sc->vm,
710 				      pptmsi->bus, pptmsi->slot, pptmsi->func,
711 				      pptmsi->addr, pptmsi->msg,
712 				      pptmsi->numvec);
713 		break;
714 	case VM_PPTDEV_MSIX:
715 		pptmsix = (struct vm_pptdev_msix *)data;
716 		error = ppt_setup_msix(sc->vm,
717 				       pptmsix->bus, pptmsix->slot,
718 				       pptmsix->func, pptmsix->idx,
719 				       pptmsix->addr, pptmsix->msg,
720 				       pptmsix->vector_control);
721 		break;
722 	case VM_PPTDEV_DISABLE_MSIX:
723 		pptdev = (struct vm_pptdev *)data;
724 		error = ppt_disable_msix(sc->vm, pptdev->bus, pptdev->slot,
725 					 pptdev->func);
726 		break;
727 	case VM_MAP_PPTDEV_MMIO:
728 		pptmmio = (struct vm_pptdev_mmio *)data;
729 		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
730 				     pptmmio->func, pptmmio->gpa, pptmmio->len,
731 				     pptmmio->hpa);
732 		break;
733 	case VM_UNMAP_PPTDEV_MMIO:
734 		pptmmio = (struct vm_pptdev_mmio *)data;
735 		error = ppt_unmap_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
736 				       pptmmio->func, pptmmio->gpa, pptmmio->len);
737 		break;
738 	case VM_BIND_PPTDEV:
739 		pptdev = (struct vm_pptdev *)data;
740 		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
741 					 pptdev->func);
742 		break;
743 	case VM_UNBIND_PPTDEV:
744 		pptdev = (struct vm_pptdev *)data;
745 		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
746 					   pptdev->func);
747 		break;
748 	case VM_INJECT_EXCEPTION:
749 		vmexc = (struct vm_exception *)data;
750 		error = vm_inject_exception(vcpu,
751 		    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
752 		    vmexc->restart_instruction);
753 		break;
754 	case VM_INJECT_NMI:
755 		error = vm_inject_nmi(vcpu);
756 		break;
757 	case VM_LAPIC_IRQ:
758 		vmirq = (struct vm_lapic_irq *)data;
759 		error = lapic_intr_edge(vcpu, vmirq->vector);
760 		break;
761 	case VM_LAPIC_LOCAL_IRQ:
762 		vmirq = (struct vm_lapic_irq *)data;
763 		error = lapic_set_local_intr(sc->vm, vcpu, vmirq->vector);
764 		break;
765 	case VM_LAPIC_MSI:
766 		vmmsi = (struct vm_lapic_msi *)data;
767 		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
768 		break;
769 	case VM_IOAPIC_ASSERT_IRQ:
770 		ioapic_irq = (struct vm_ioapic_irq *)data;
771 		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
772 		break;
773 	case VM_IOAPIC_DEASSERT_IRQ:
774 		ioapic_irq = (struct vm_ioapic_irq *)data;
775 		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
776 		break;
777 	case VM_IOAPIC_PULSE_IRQ:
778 		ioapic_irq = (struct vm_ioapic_irq *)data;
779 		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
780 		break;
781 	case VM_IOAPIC_PINCOUNT:
782 		*(int *)data = vioapic_pincount(sc->vm);
783 		break;
784 	case VM_SET_KERNEMU_DEV:
785 	case VM_GET_KERNEMU_DEV: {
786 		mem_region_write_t mwrite;
787 		mem_region_read_t mread;
788 		bool arg;
789 
790 		kernemu = (void *)data;
791 
792 		if (kernemu->access_width > 0)
793 			size = (1u << kernemu->access_width);
794 		else
795 			size = 1;
796 
797 		if (kernemu->gpa >= DEFAULT_APIC_BASE && kernemu->gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
798 			mread = lapic_mmio_read;
799 			mwrite = lapic_mmio_write;
800 		} else if (kernemu->gpa >= VIOAPIC_BASE && kernemu->gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
801 			mread = vioapic_mmio_read;
802 			mwrite = vioapic_mmio_write;
803 		} else if (kernemu->gpa >= VHPET_BASE && kernemu->gpa < VHPET_BASE + VHPET_SIZE) {
804 			mread = vhpet_mmio_read;
805 			mwrite = vhpet_mmio_write;
806 		} else {
807 			error = EINVAL;
808 			break;
809 		}
810 
811 		if (cmd == VM_SET_KERNEMU_DEV)
812 			error = mwrite(vcpu, kernemu->gpa,
813 			    kernemu->value, size, &arg);
814 		else
815 			error = mread(vcpu, kernemu->gpa,
816 			    &kernemu->value, size, &arg);
817 		break;
818 		}
819 	case VM_ISA_ASSERT_IRQ:
820 		isa_irq = (struct vm_isa_irq *)data;
821 		error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
822 		if (error == 0 && isa_irq->ioapic_irq != -1)
823 			error = vioapic_assert_irq(sc->vm,
824 			    isa_irq->ioapic_irq);
825 		break;
826 	case VM_ISA_DEASSERT_IRQ:
827 		isa_irq = (struct vm_isa_irq *)data;
828 		error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
829 		if (error == 0 && isa_irq->ioapic_irq != -1)
830 			error = vioapic_deassert_irq(sc->vm,
831 			    isa_irq->ioapic_irq);
832 		break;
833 	case VM_ISA_PULSE_IRQ:
834 		isa_irq = (struct vm_isa_irq *)data;
835 		error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
836 		if (error == 0 && isa_irq->ioapic_irq != -1)
837 			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
838 		break;
839 	case VM_ISA_SET_IRQ_TRIGGER:
840 		isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
841 		error = vatpic_set_irq_trigger(sc->vm,
842 		    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
843 		break;
844 	case VM_MMAP_GETNEXT:
845 		mm = (struct vm_memmap *)data;
846 		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
847 		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
848 		break;
849 	case VM_MMAP_MEMSEG:
850 		mm = (struct vm_memmap *)data;
851 		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
852 		    mm->len, mm->prot, mm->flags);
853 		break;
854 	case VM_MUNMAP_MEMSEG:
855 		mu = (struct vm_munmap *)data;
856 		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
857 		break;
858 #ifdef COMPAT_FREEBSD12
859 	case VM_ALLOC_MEMSEG_FBSD12:
860 		error = alloc_memseg(sc, (struct vm_memseg *)data,
861 		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
862 		break;
863 #endif
864 	case VM_ALLOC_MEMSEG:
865 		error = alloc_memseg(sc, (struct vm_memseg *)data,
866 		    sizeof(((struct vm_memseg *)0)->name));
867 		break;
868 #ifdef COMPAT_FREEBSD12
869 	case VM_GET_MEMSEG_FBSD12:
870 		error = get_memseg(sc, (struct vm_memseg *)data,
871 		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
872 		break;
873 #endif
874 	case VM_GET_MEMSEG:
875 		error = get_memseg(sc, (struct vm_memseg *)data,
876 		    sizeof(((struct vm_memseg *)0)->name));
877 		break;
878 	case VM_GET_REGISTER:
879 		vmreg = (struct vm_register *)data;
880 		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
881 		break;
882 	case VM_SET_REGISTER:
883 		vmreg = (struct vm_register *)data;
884 		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
885 		break;
886 	case VM_SET_SEGMENT_DESCRIPTOR:
887 		vmsegdesc = (struct vm_seg_desc *)data;
888 		error = vm_set_seg_desc(vcpu,
889 					vmsegdesc->regnum,
890 					&vmsegdesc->desc);
891 		break;
892 	case VM_GET_SEGMENT_DESCRIPTOR:
893 		vmsegdesc = (struct vm_seg_desc *)data;
894 		error = vm_get_seg_desc(vcpu,
895 					vmsegdesc->regnum,
896 					&vmsegdesc->desc);
897 		break;
898 	case VM_GET_REGISTER_SET:
899 		vmregset = (struct vm_register_set *)data;
900 		if (vmregset->count > VM_REG_LAST) {
901 			error = EINVAL;
902 			break;
903 		}
904 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
905 		    M_WAITOK);
906 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
907 		    M_WAITOK);
908 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
909 		    vmregset->count);
910 		if (error == 0)
911 			error = vm_get_register_set(vcpu,
912 			    vmregset->count, regnums, regvals);
913 		if (error == 0)
914 			error = copyout(regvals, vmregset->regvals,
915 			    sizeof(regvals[0]) * vmregset->count);
916 		free(regvals, M_VMMDEV);
917 		free(regnums, M_VMMDEV);
918 		break;
919 	case VM_SET_REGISTER_SET:
920 		vmregset = (struct vm_register_set *)data;
921 		if (vmregset->count > VM_REG_LAST) {
922 			error = EINVAL;
923 			break;
924 		}
925 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
926 		    M_WAITOK);
927 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
928 		    M_WAITOK);
929 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
930 		    vmregset->count);
931 		if (error == 0)
932 			error = copyin(vmregset->regvals, regvals,
933 			    sizeof(regvals[0]) * vmregset->count);
934 		if (error == 0)
935 			error = vm_set_register_set(vcpu,
936 			    vmregset->count, regnums, regvals);
937 		free(regvals, M_VMMDEV);
938 		free(regnums, M_VMMDEV);
939 		break;
940 	case VM_GET_CAPABILITY:
941 		vmcap = (struct vm_capability *)data;
942 		error = vm_get_capability(vcpu,
943 					  vmcap->captype,
944 					  &vmcap->capval);
945 		break;
946 	case VM_SET_CAPABILITY:
947 		vmcap = (struct vm_capability *)data;
948 		error = vm_set_capability(vcpu,
949 					  vmcap->captype,
950 					  vmcap->capval);
951 		break;
952 	case VM_SET_X2APIC_STATE:
953 		x2apic = (struct vm_x2apic *)data;
954 		error = vm_set_x2apic_state(vcpu, x2apic->state);
955 		break;
956 	case VM_GET_X2APIC_STATE:
957 		x2apic = (struct vm_x2apic *)data;
958 		error = vm_get_x2apic_state(vcpu, &x2apic->state);
959 		break;
960 	case VM_GET_GPA_PMAP:
961 		gpapte = (struct vm_gpa_pte *)data;
962 		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
963 				 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
964 		error = 0;
965 		break;
966 	case VM_GET_HPET_CAPABILITIES:
967 		error = vhpet_getcap((struct vm_hpet_cap *)data);
968 		break;
969 	case VM_GLA2GPA: {
970 		CTASSERT(PROT_READ == VM_PROT_READ);
971 		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
972 		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
973 		gg = (struct vm_gla2gpa *)data;
974 		error = vm_gla2gpa(vcpu, &gg->paging, gg->gla,
975 		    gg->prot, &gg->gpa, &gg->fault);
976 		KASSERT(error == 0 || error == EFAULT,
977 		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
978 		break;
979 	}
980 	case VM_GLA2GPA_NOFAULT:
981 		gg = (struct vm_gla2gpa *)data;
982 		error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla,
983 		    gg->prot, &gg->gpa, &gg->fault);
984 		KASSERT(error == 0 || error == EFAULT,
985 		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
986 		break;
987 	case VM_ACTIVATE_CPU:
988 		error = vm_activate_cpu(vcpu);
989 		break;
990 	case VM_GET_CPUS:
991 		error = 0;
992 		vm_cpuset = (struct vm_cpuset *)data;
993 		size = vm_cpuset->cpusetsize;
994 		if (size < 1 || size > CPU_MAXSIZE / NBBY) {
995 			error = ERANGE;
996 			break;
997 		}
998 		cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP,
999 		    M_WAITOK | M_ZERO);
1000 		if (vm_cpuset->which == VM_ACTIVE_CPUS)
1001 			*cpuset = vm_active_cpus(sc->vm);
1002 		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
1003 			*cpuset = vm_suspended_cpus(sc->vm);
1004 		else if (vm_cpuset->which == VM_DEBUG_CPUS)
1005 			*cpuset = vm_debug_cpus(sc->vm);
1006 		else
1007 			error = EINVAL;
1008 		if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY))
1009 			error = ERANGE;
1010 		if (error == 0)
1011 			error = copyout(cpuset, vm_cpuset->cpus, size);
1012 		free(cpuset, M_TEMP);
1013 		break;
1014 	case VM_SUSPEND_CPU:
1015 		error = vm_suspend_cpu(sc->vm, vcpu);
1016 		break;
1017 	case VM_RESUME_CPU:
1018 		error = vm_resume_cpu(sc->vm, vcpu);
1019 		break;
1020 	case VM_SET_INTINFO:
1021 		vmii = (struct vm_intinfo *)data;
1022 		error = vm_exit_intinfo(vcpu, vmii->info1);
1023 		break;
1024 	case VM_GET_INTINFO:
1025 		vmii = (struct vm_intinfo *)data;
1026 		error = vm_get_intinfo(vcpu, &vmii->info1, &vmii->info2);
1027 		break;
1028 	case VM_RTC_WRITE:
1029 		rtcdata = (struct vm_rtc_data *)data;
1030 		error = vrtc_nvram_write(sc->vm, rtcdata->offset,
1031 		    rtcdata->value);
1032 		break;
1033 	case VM_RTC_READ:
1034 		rtcdata = (struct vm_rtc_data *)data;
1035 		error = vrtc_nvram_read(sc->vm, rtcdata->offset,
1036 		    &rtcdata->value);
1037 		break;
1038 	case VM_RTC_SETTIME:
1039 		rtctime = (struct vm_rtc_time *)data;
1040 		error = vrtc_set_time(sc->vm, rtctime->secs);
1041 		break;
1042 	case VM_RTC_GETTIME:
1043 		error = 0;
1044 		rtctime = (struct vm_rtc_time *)data;
1045 		rtctime->secs = vrtc_get_time(sc->vm);
1046 		break;
1047 	case VM_RESTART_INSTRUCTION:
1048 		error = vm_restart_instruction(vcpu);
1049 		break;
1050 	case VM_SET_TOPOLOGY:
1051 		topology = (struct vm_cpu_topology *)data;
1052 		error = vm_set_topology(sc->vm, topology->sockets,
1053 		    topology->cores, topology->threads, topology->maxcpus);
1054 		break;
1055 	case VM_GET_TOPOLOGY:
1056 		topology = (struct vm_cpu_topology *)data;
1057 		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
1058 		    &topology->threads, &topology->maxcpus);
1059 		error = 0;
1060 		break;
1061 #ifdef BHYVE_SNAPSHOT
1062 	case VM_SNAPSHOT_REQ:
1063 		snapshot_meta = (struct vm_snapshot_meta *)data;
1064 		error = vm_snapshot_req(sc->vm, snapshot_meta);
1065 		break;
1066 #ifdef COMPAT_FREEBSD13
1067 	case VM_SNAPSHOT_REQ_OLD:
1068 		/*
1069 		 * The old structure just has an additional pointer at
1070 		 * the start that is ignored.
1071 		 */
1072 		snapshot_old = (struct vm_snapshot_meta_old *)data;
1073 		snapshot_meta =
1074 		    (struct vm_snapshot_meta *)&snapshot_old->dev_data;
1075 		error = vm_snapshot_req(sc->vm, snapshot_meta);
1076 		break;
1077 #endif
1078 	case VM_RESTORE_TIME:
1079 		error = vm_restore_time(sc->vm);
1080 		break;
1081 #endif
1082 	default:
1083 		error = ENOTTY;
1084 		break;
1085 	}
1086 
1087 done:
1088 	if (vcpus_locked == SINGLE)
1089 		vcpu_unlock_one(sc, vcpuid, vcpu);
1090 	else if (vcpus_locked == ALL)
1091 		vcpu_unlock_all(sc);
1092 	if (memsegs_locked)
1093 		vm_unlock_memsegs(sc->vm);
1094 
1095 	/*
1096 	 * Make sure that no handler returns a kernel-internal
1097 	 * error value to userspace.
1098 	 */
1099 	KASSERT(error == ERESTART || error >= 0,
1100 	    ("vmmdev_ioctl: invalid error return %d", error));
1101 	return (error);
1102 }
1103 
1104 static int
1105 vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
1106     struct vm_object **objp, int nprot)
1107 {
1108 	struct vmmdev_softc *sc;
1109 	vm_paddr_t gpa;
1110 	size_t len;
1111 	vm_ooffset_t segoff, first, last;
1112 	int error, found, segid;
1113 	bool sysmem;
1114 
1115 	error = vmm_priv_check(curthread->td_ucred);
1116 	if (error)
1117 		return (error);
1118 
1119 	first = *offset;
1120 	last = first + mapsize;
1121 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
1122 		return (EINVAL);
1123 
1124 	sc = vmmdev_lookup2(cdev);
1125 	if (sc == NULL) {
1126 		/* virtual machine is in the process of being created */
1127 		return (EINVAL);
1128 	}
1129 
1130 	/*
1131 	 * Get a read lock on the guest memory map.
1132 	 */
1133 	vm_slock_memsegs(sc->vm);
1134 
1135 	gpa = 0;
1136 	found = 0;
1137 	while (!found) {
1138 		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
1139 		    NULL, NULL);
1140 		if (error)
1141 			break;
1142 
1143 		if (first >= gpa && last <= gpa + len)
1144 			found = 1;
1145 		else
1146 			gpa += len;
1147 	}
1148 
1149 	if (found) {
1150 		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
1151 		KASSERT(error == 0 && *objp != NULL,
1152 		    ("%s: invalid memory segment %d", __func__, segid));
1153 		if (sysmem) {
1154 			vm_object_reference(*objp);
1155 			*offset = segoff + (first - gpa);
1156 		} else {
1157 			error = EINVAL;
1158 		}
1159 	}
1160 	vm_unlock_memsegs(sc->vm);
1161 	return (error);
1162 }
1163 
1164 static void
1165 vmmdev_destroy(void *arg)
1166 {
1167 	struct vmmdev_softc *sc = arg;
1168 	struct devmem_softc *dsc;
1169 	int error __diagused;
1170 
1171 	vm_disable_vcpu_creation(sc->vm);
1172 	error = vcpu_lock_all(sc);
1173 	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
1174 	vm_unlock_vcpus(sc->vm);
1175 
1176 	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
1177 		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
1178 		SLIST_REMOVE_HEAD(&sc->devmem, link);
1179 		free(dsc->name, M_VMMDEV);
1180 		free(dsc, M_VMMDEV);
1181 	}
1182 
1183 	if (sc->cdev != NULL)
1184 		destroy_dev(sc->cdev);
1185 
1186 	if (sc->vm != NULL)
1187 		vm_destroy(sc->vm);
1188 
1189 	if (sc->ucred != NULL)
1190 		crfree(sc->ucred);
1191 
1192 	if ((sc->flags & VSC_LINKED) != 0) {
1193 		mtx_lock(&vmmdev_mtx);
1194 		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
1195 		mtx_unlock(&vmmdev_mtx);
1196 	}
1197 
1198 	free(sc, M_VMMDEV);
1199 }
1200 
1201 static int
1202 sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
1203 {
1204 	struct devmem_softc *dsc;
1205 	struct vmmdev_softc *sc;
1206 	struct cdev *cdev;
1207 	char *buf;
1208 	int error, buflen;
1209 
1210 	error = vmm_priv_check(req->td->td_ucred);
1211 	if (error)
1212 		return (error);
1213 
1214 	buflen = VM_MAX_NAMELEN + 1;
1215 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
1216 	strlcpy(buf, "beavis", buflen);
1217 	error = sysctl_handle_string(oidp, buf, buflen, req);
1218 	if (error != 0 || req->newptr == NULL)
1219 		goto out;
1220 
1221 	mtx_lock(&vmmdev_mtx);
1222 	sc = vmmdev_lookup(buf);
1223 	if (sc == NULL || sc->cdev == NULL) {
1224 		mtx_unlock(&vmmdev_mtx);
1225 		error = EINVAL;
1226 		goto out;
1227 	}
1228 
1229 	/*
1230 	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
1231 	 * is scheduled for destruction.
1232 	 */
1233 	cdev = sc->cdev;
1234 	sc->cdev = NULL;
1235 	mtx_unlock(&vmmdev_mtx);
1236 
1237 	/*
1238 	 * Destroy all cdevs:
1239 	 *
1240 	 * - any new operations on the 'cdev' will return an error (ENXIO).
1241 	 *
1242 	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
1243 	 */
1244 	SLIST_FOREACH(dsc, &sc->devmem, link) {
1245 		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
1246 		destroy_dev(dsc->cdev);
1247 		devmem_destroy(dsc);
1248 	}
1249 	destroy_dev(cdev);
1250 	vmmdev_destroy(sc);
1251 	error = 0;
1252 
1253 out:
1254 	free(buf, M_VMMDEV);
1255 	return (error);
1256 }
1257 SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
1258     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
1259     NULL, 0, sysctl_vmm_destroy, "A",
1260     NULL);
1261 
1262 static struct cdevsw vmmdevsw = {
1263 	.d_name		= "vmmdev",
1264 	.d_version	= D_VERSION,
1265 	.d_ioctl	= vmmdev_ioctl,
1266 	.d_mmap_single	= vmmdev_mmap_single,
1267 	.d_read		= vmmdev_rw,
1268 	.d_write	= vmmdev_rw,
1269 };
1270 
1271 static int
1272 sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
1273 {
1274 	struct vm *vm;
1275 	struct cdev *cdev;
1276 	struct vmmdev_softc *sc, *sc2;
1277 	char *buf;
1278 	int error, buflen;
1279 
1280 	error = vmm_priv_check(req->td->td_ucred);
1281 	if (error)
1282 		return (error);
1283 
1284 	buflen = VM_MAX_NAMELEN + 1;
1285 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
1286 	strlcpy(buf, "beavis", buflen);
1287 	error = sysctl_handle_string(oidp, buf, buflen, req);
1288 	if (error != 0 || req->newptr == NULL)
1289 		goto out;
1290 
1291 	mtx_lock(&vmmdev_mtx);
1292 	sc = vmmdev_lookup(buf);
1293 	mtx_unlock(&vmmdev_mtx);
1294 	if (sc != NULL) {
1295 		error = EEXIST;
1296 		goto out;
1297 	}
1298 
1299 	error = vm_create(buf, &vm);
1300 	if (error != 0)
1301 		goto out;
1302 
1303 	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
1304 	sc->ucred = crhold(curthread->td_ucred);
1305 	sc->vm = vm;
1306 	SLIST_INIT(&sc->devmem);
1307 
1308 	/*
1309 	 * Lookup the name again just in case somebody sneaked in when we
1310 	 * dropped the lock.
1311 	 */
1312 	mtx_lock(&vmmdev_mtx);
1313 	sc2 = vmmdev_lookup(buf);
1314 	if (sc2 == NULL) {
1315 		SLIST_INSERT_HEAD(&head, sc, link);
1316 		sc->flags |= VSC_LINKED;
1317 	}
1318 	mtx_unlock(&vmmdev_mtx);
1319 
1320 	if (sc2 != NULL) {
1321 		vmmdev_destroy(sc);
1322 		error = EEXIST;
1323 		goto out;
1324 	}
1325 
1326 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred,
1327 	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
1328 	if (error != 0) {
1329 		vmmdev_destroy(sc);
1330 		goto out;
1331 	}
1332 
1333 	mtx_lock(&vmmdev_mtx);
1334 	sc->cdev = cdev;
1335 	sc->cdev->si_drv1 = sc;
1336 	mtx_unlock(&vmmdev_mtx);
1337 
1338 out:
1339 	free(buf, M_VMMDEV);
1340 	return (error);
1341 }
1342 SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
1343     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
1344     NULL, 0, sysctl_vmm_create, "A",
1345     NULL);
1346 
1347 void
1348 vmmdev_init(void)
1349 {
1350 	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
1351 	    "Allow use of vmm in a jail.");
1352 }
1353 
1354 int
1355 vmmdev_cleanup(void)
1356 {
1357 	int error;
1358 
1359 	if (SLIST_EMPTY(&head))
1360 		error = 0;
1361 	else
1362 		error = EBUSY;
1363 
1364 	return (error);
1365 }
1366 
1367 static int
1368 devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
1369     struct vm_object **objp, int nprot)
1370 {
1371 	struct devmem_softc *dsc;
1372 	vm_ooffset_t first, last;
1373 	size_t seglen;
1374 	int error;
1375 	bool sysmem;
1376 
1377 	dsc = cdev->si_drv1;
1378 	if (dsc == NULL) {
1379 		/* 'cdev' has been created but is not ready for use */
1380 		return (ENXIO);
1381 	}
1382 
1383 	first = *offset;
1384 	last = *offset + len;
1385 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
1386 		return (EINVAL);
1387 
1388 	vm_slock_memsegs(dsc->sc->vm);
1389 
1390 	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
1391 	KASSERT(error == 0 && !sysmem && *objp != NULL,
1392 	    ("%s: invalid devmem segment %d", __func__, dsc->segid));
1393 
1394 	if (seglen >= last)
1395 		vm_object_reference(*objp);
1396 	else
1397 		error = EINVAL;
1398 
1399 	vm_unlock_memsegs(dsc->sc->vm);
1400 	return (error);
1401 }
1402 
1403 static struct cdevsw devmemsw = {
1404 	.d_name		= "devmem",
1405 	.d_version	= D_VERSION,
1406 	.d_mmap_single	= devmem_mmap_single,
1407 };
1408 
1409 static int
1410 devmem_create_cdev(const char *vmname, int segid, char *devname)
1411 {
1412 	struct devmem_softc *dsc;
1413 	struct vmmdev_softc *sc;
1414 	struct cdev *cdev;
1415 	int error;
1416 
1417 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
1418 	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
1419 	if (error)
1420 		return (error);
1421 
1422 	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
1423 
1424 	mtx_lock(&vmmdev_mtx);
1425 	sc = vmmdev_lookup(vmname);
1426 	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
1427 	if (sc->cdev == NULL) {
1428 		/* virtual machine is being created or destroyed */
1429 		mtx_unlock(&vmmdev_mtx);
1430 		free(dsc, M_VMMDEV);
1431 		destroy_dev_sched_cb(cdev, NULL, 0);
1432 		return (ENODEV);
1433 	}
1434 
1435 	dsc->segid = segid;
1436 	dsc->name = devname;
1437 	dsc->cdev = cdev;
1438 	dsc->sc = sc;
1439 	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
1440 	mtx_unlock(&vmmdev_mtx);
1441 
1442 	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
1443 	cdev->si_drv1 = dsc;
1444 	return (0);
1445 }
1446 
1447 static void
1448 devmem_destroy(void *arg)
1449 {
1450 	struct devmem_softc *dsc = arg;
1451 
1452 	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
1453 	dsc->cdev = NULL;
1454 	dsc->sc = NULL;
1455 }
1456