/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_bhyve_snapshot.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/jail.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_snapshot.h>
#include <x86/apicreg.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

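/*
 * Compatibility definitions for ioctl structures whose layout changed
 * after FreeBSD 13, so that binaries built against the older ABI keep
 * working.
 */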
#ifdef COMPAT_FREEBSD13
struct vm_stats_old {
	int		cpuid;				/* in */
	int		num_entries;			/* out */
	struct timeval	tv;
	uint64_t	statbuf[MAX_VM_STATS];
};

#define	VM_STATS_OLD \
	_IOWR('v', IOCNUM_VM_STATS, struct vm_stats_old)

struct vm_snapshot_meta_old {
	void *ctx;			/* unused */
	void *dev_data;
	const char *dev_name;      /* identify userspace devices */
	enum snapshot_req dev_req; /* identify kernel structs */

	struct vm_snapshot_buffer buffer;

	enum vm_snapshot_op op;
};

#define VM_SNAPSHOT_REQ_OLD \
	_IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta_old)

struct vm_exit_ipi_13 {
	uint32_t	mode;
	uint8_t		vector;
	__BITSET_DEFINE(, 256) dmask;
};

struct vm_exit_13 {
	uint32_t	exitcode;
	int32_t		inst_length;
	uint64_t	rip;
	uint64_t	u[120 / sizeof(uint64_t)];
};

struct vm_run_13 {
	int		cpuid;
	struct vm_exit_13 vm_exit;
};

#define	VM_RUN_13 \
	_IOWR('v', IOCNUM_RUN, struct vm_run_13)

#endif /* COMPAT_FREEBSD13 */

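/*
 * Per-instance state: a devmem_softc for each device memory segment
 * exposed via /dev/vmm.io/<vm>.<segname>, and a vmmdev_softc for each
 * virtual machine backing /dev/vmm/<vm>.
 */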
struct devmem_softc {
	int	segid;
	char	*name;
	struct cdev *cdev;
	struct vmmdev_softc *sc;
	SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	struct ucred	*ucred;
	SLIST_ENTRY(vmmdev_softc) link;
	SLIST_HEAD(, devmem_softc) devmem;
	int		flags;
};
#define	VSC_LINKED		0x01

static SLIST_HEAD(, vmmdev_softc) head;

static unsigned pr_allow_flag;
static struct mtx vmmdev_mtx;
MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF);

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int vmm_priv_check(struct ucred *ucred);
static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

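/*
 * Deny access from a jail unless the prison carries the vmm allow flag.
 */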
static int
vmm_priv_check(struct ucred *ucred)
{

	if (jailed(ucred) &&
	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
		return (EPERM);

	return (0);
}

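/*
 * A vCPU is "locked" by moving it to the FROZEN state, which guarantees
 * that it is not running while an ioctl manipulates its state.
 */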
static int
vcpu_lock_one(struct vcpu *vcpu)
{
	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
}

static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpuid, struct vcpu *vcpu)
{
	enum vcpu_state state;

	state = vcpu_get_state(vcpu, NULL);
	if (state != VCPU_FROZEN) {
		panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
		    vcpuid, state);
	}

	vcpu_set_state(vcpu, VCPU_IDLE, false);
}

static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
	struct vcpu *vcpu;
	int error;
	uint16_t i, j, maxcpus;

	error = 0;
	vm_slock_vcpus(sc->vm);
	maxcpus = vm_get_maxcpus(sc->vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = vm_vcpu(sc->vm, i);
		if (vcpu == NULL)
			continue;
		error = vcpu_lock_one(vcpu);
		if (error)
			break;
	}

	if (error) {
		for (j = 0; j < i; j++) {
			vcpu = vm_vcpu(sc->vm, j);
			if (vcpu == NULL)
				continue;
			vcpu_unlock_one(sc, j, vcpu);
		}
		vm_unlock_vcpus(sc->vm);
	}

	return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
	struct vcpu *vcpu;
	uint16_t i, maxcpus;

	maxcpus = vm_get_maxcpus(sc->vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = vm_vcpu(sc->vm, i);
		if (vcpu == NULL)
			continue;
		vcpu_unlock_one(sc, i, vcpu);
	}
	vm_unlock_vcpus(sc->vm);
}

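/*
 * Look up a VM by name; the result is withheld if the caller is not
 * allowed to see the owner's credentials.
 */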
static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	if (sc == NULL)
		return (NULL);

	if (cr_cansee(curthread->td_ucred, sc->ucred))
		return (NULL);

	return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

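/*
 * read(2)/write(2) on /dev/vmm/<vm>: copy data between the uio and guest
 * physical memory, substituting zeroes for reads from holes below the top
 * of guest system memory.
 */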
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa, maxaddr;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	/*
	 * Get a read lock on the guest memory map.
	 */
	vm_slock_memsegs(sc->vm);

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	maxaddr = vmm_sysmem_maxaddr(sc->vm);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
				error = uiomove(__DECONST(void *, zero_region),
				    c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	vm_unlock_memsegs(sc->vm);
	return (error);
}

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);

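/*
 * Copy out a memory segment's length and, for device memory, its name.
 */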
static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	struct devmem_softc *dsc;
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		SLIST_FOREACH(dsc, &sc->devmem, link) {
			if (dsc->segid == mseg->segid)
				break;
		}
		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
		    __func__, mseg->segid));
		error = copystr(dsc->name, mseg->name, len, NULL);
	} else {
		bzero(mseg->name, len);
	}

	return (error);
}

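/*
 * Create a memory segment; a named segment is device memory and gets a
 * devmem cdev of its own.
 */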
static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	char *name;
	int error;
	bool sysmem;

	error = 0;
	name = NULL;
	sysmem = true;

	/*
	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
	 * be stripped off when devfs processes the full string.
	 */
	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
		name = malloc(len, M_VMMDEV, M_WAITOK);
		error = copystr(mseg->name, name, len, NULL);
		if (error)
			goto done;
	}

	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
	if (error)
		goto done;

	if (VM_MEMSEG_NAME(mseg)) {
		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
		if (error)
			vm_free_memseg(sc->vm, mseg->segid);
		else
			name = NULL;	/* freed when 'cdev' is destroyed */
	}
done:
	free(name, M_VMMDEV);
	return (error);
}

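/*
 * Fetch or update a batch of registers, stopping at the first failure.
 */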
static int
vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_get_register(vcpu, regnum[i], &regval[i]);
		if (error)
			break;
	}
	return (error);
}

static int
vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_set_register(vcpu, regnum[i], regval[i]);
		if (error)
			break;
	}
	return (error);
}

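/*
 * Main ioctl handler for /dev/vmm/<vm>.  The first switch establishes the
 * locking each command requires (a single frozen vCPU, all vCPUs, or the
 * memory segment lock); the second switch performs the operation.
 */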
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
	     struct thread *td)
{
	int error, vcpuid, size;
	cpuset_t *cpuset;
	struct vmmdev_softc *sc;
	struct vcpu *vcpu;
	struct vm_register *vmreg;
	struct vm_seg_desc *vmsegdesc;
	struct vm_register_set *vmregset;
	struct vm_run *vmrun;
#ifdef COMPAT_FREEBSD13
	struct vm_run_13 *vmrun_13;
#endif
	struct vm_exception *vmexc;
	struct vm_lapic_irq *vmirq;
	struct vm_lapic_msi *vmmsi;
	struct vm_ioapic_irq *ioapic_irq;
	struct vm_isa_irq *isa_irq;
	struct vm_isa_irq_trigger *isa_irq_trigger;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_pptdev_msix *pptmsix;
#ifdef COMPAT_FREEBSD13
	struct vm_stats_old *vmstats_old;
#endif
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_x2apic *x2apic;
	struct vm_gpa_pte *gpapte;
	struct vm_suspend *vmsuspend;
	struct vm_gla2gpa *gg;
	struct vm_cpuset *vm_cpuset;
	struct vm_intinfo *vmii;
	struct vm_rtc_time *rtctime;
	struct vm_rtc_data *rtcdata;
	struct vm_memmap *mm;
	struct vm_munmap *mu;
	struct vm_cpu_topology *topology;
	struct vm_readwrite_kernemu_device *kernemu;
	uint64_t *regvals;
	int *regnums;
	enum { NONE, SINGLE, ALL } vcpus_locked;
	bool memsegs_locked;
#ifdef BHYVE_SNAPSHOT
	struct vm_snapshot_meta *snapshot_meta;
#ifdef COMPAT_FREEBSD13
	struct vm_snapshot_meta_old *snapshot_old;
#endif
#endif

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	vcpuid = -1;
	vcpu = NULL;
	vcpus_locked = NONE;
	memsegs_locked = false;

	/*
	 * For VMM ioctls that operate on a single vCPU, lookup the
	 * vcpu.  For VMM ioctls which require one or more vCPUs to
	 * not be running, lock necessary vCPUs.
	 *
	 * XXX fragile, handle with care
	 * Most of these assume that the first field of the ioctl data
	 * is the vcpuid.
	 */
	switch (cmd) {
	case VM_RUN:
#ifdef COMPAT_FREEBSD13
	case VM_RUN_13:
#endif
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_GET_REGISTER_SET:
	case VM_SET_REGISTER_SET:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_SET_X2APIC_STATE:
	case VM_GLA2GPA:
	case VM_GLA2GPA_NOFAULT:
	case VM_ACTIVATE_CPU:
	case VM_SET_INTINFO:
	case VM_GET_INTINFO:
	case VM_RESTART_INSTRUCTION:
	case VM_GET_KERNEMU_DEV:
	case VM_SET_KERNEMU_DEV:
		/*
		 * ioctls that can operate only on vcpus that are not running.
		 */
		vcpuid = *(int *)data;
		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
		if (vcpu == NULL) {
			error = EINVAL;
			goto done;
		}
		error = vcpu_lock_one(vcpu);
		if (error)
			goto done;
		vcpus_locked = SINGLE;
		break;

#ifdef COMPAT_FREEBSD12
	case VM_ALLOC_MEMSEG_FBSD12:
#endif
	case VM_ALLOC_MEMSEG:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_MMAP_MEMSEG:
	case VM_MUNMAP_MEMSEG:
	case VM_REINIT:
		/*
		 * ioctls that modify the memory map must lock memory
		 * segments exclusively.
		 */
		vm_xlock_memsegs(sc->vm);
		memsegs_locked = true;
		/* FALLTHROUGH */
	case VM_MAP_PPTDEV_MMIO:
	case VM_UNMAP_PPTDEV_MMIO:
#ifdef BHYVE_SNAPSHOT
	case VM_SNAPSHOT_REQ:
#ifdef COMPAT_FREEBSD13
	case VM_SNAPSHOT_REQ_OLD:
#endif
	case VM_RESTORE_TIME:
#endif
		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = vcpu_lock_all(sc);
		if (error)
			goto done;
		vcpus_locked = ALL;
		break;

#ifdef COMPAT_FREEBSD12
	case VM_GET_MEMSEG_FBSD12:
#endif
	case VM_GET_MEMSEG:
	case VM_MMAP_GETNEXT:
		/*
		 * Lock the memory map while it is being inspected.
		 */
		vm_slock_memsegs(sc->vm);
		memsegs_locked = true;
		break;

#ifdef COMPAT_FREEBSD13
	case VM_STATS_OLD:
#endif
	case VM_STATS:
	case VM_INJECT_NMI:
	case VM_LAPIC_IRQ:
	case VM_GET_X2APIC_STATE:
		/*
		 * These do not need the vCPU locked but do operate on
		 * a specific vCPU.
		 */
		vcpuid = *(int *)data;
		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
		if (vcpu == NULL) {
			error = EINVAL;
			goto done;
		}
		break;

	case VM_LAPIC_LOCAL_IRQ:
	case VM_SUSPEND_CPU:
	case VM_RESUME_CPU:
		/*
		 * These can either operate on all CPUs via a vcpuid of
		 * -1 or on a specific vCPU.
		 */
		vcpuid = *(int *)data;
		if (vcpuid == -1)
			break;
		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
		if (vcpu == NULL) {
			error = EINVAL;
			goto done;
		}
		break;

	default:
		break;
	}

	switch (cmd) {
	case VM_RUN: {
		struct vm_exit *vme;

		vmrun = (struct vm_run *)data;
		vme = vm_exitinfo(vcpu);

		error = vm_run(vcpu);
		if (error != 0)
			break;

		error = copyout(vme, vmrun->vm_exit, sizeof(*vme));
		if (error != 0)
			break;
		if (vme->exitcode == VM_EXITCODE_IPI) {
			error = copyout(vm_exitinfo_cpuset(vcpu),
			    vmrun->cpuset,
			    min(vmrun->cpusetsize, sizeof(cpuset_t)));
			if (error != 0)
				break;
			if (sizeof(cpuset_t) < vmrun->cpusetsize) {
				uint8_t *p;

				p = (uint8_t *)vmrun->cpuset +
				    sizeof(cpuset_t);
				while (error == 0 &&
				    p < (uint8_t *)vmrun->cpuset +
				    vmrun->cpusetsize) {
					error = subyte(p++, 0);
				}
			}
		}
		break;
	}
#ifdef COMPAT_FREEBSD13
	case VM_RUN_13: {
		struct vm_exit *vme;
		struct vm_exit_13 *vme_13;

		vmrun_13 = (struct vm_run_13 *)data;
		vme_13 = &vmrun_13->vm_exit;
		vme = vm_exitinfo(vcpu);

		error = vm_run(vcpu);
		if (error == 0) {
			vme_13->exitcode = vme->exitcode;
			vme_13->inst_length = vme->inst_length;
			vme_13->rip = vme->rip;
			memcpy(vme_13->u, &vme->u, sizeof(vme_13->u));
			if (vme->exitcode == VM_EXITCODE_IPI) {
				struct vm_exit_ipi_13 *ipi;
				cpuset_t *dmask;
				int cpu;

				dmask = vm_exitinfo_cpuset(vcpu);
				ipi = (struct vm_exit_ipi_13 *)&vme_13->u[0];
				BIT_ZERO(256, &ipi->dmask);
				CPU_FOREACH_ISSET(cpu, dmask) {
					if (cpu >= 256)
						break;
					BIT_SET(256, cpu, &ipi->dmask);
				}
			}
		}
		break;
	}
#endif
	case VM_SUSPEND:
		vmsuspend = (struct vm_suspend *)data;
		error = vm_suspend(sc->vm, vmsuspend->how);
		break;
	case VM_REINIT:
		error = vm_reinit(sc->vm);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
					statdesc->desc, sizeof(statdesc->desc));
		break;
	}
#ifdef COMPAT_FREEBSD13
	case VM_STATS_OLD:
		vmstats_old = (struct vm_stats_old *)data;
		getmicrotime(&vmstats_old->tv);
		error = vmm_stat_copy(vcpu, 0,
				      nitems(vmstats_old->statbuf),
				      &vmstats_old->num_entries,
				      vmstats_old->statbuf);
		break;
#endif
	case VM_STATS: {
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(vcpu, vmstats->index,
				      nitems(vmstats->statbuf),
				      &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm,
				      pptmsi->bus, pptmsi->slot, pptmsi->func,
				      pptmsi->addr, pptmsi->msg,
				      pptmsi->numvec);
		break;
	case VM_PPTDEV_MSIX:
		pptmsix = (struct vm_pptdev_msix *)data;
		error = ppt_setup_msix(sc->vm,
				       pptmsix->bus, pptmsix->slot,
				       pptmsix->func, pptmsix->idx,
				       pptmsix->addr, pptmsix->msg,
				       pptmsix->vector_control);
		break;
	case VM_PPTDEV_DISABLE_MSIX:
		pptdev = (struct vm_pptdev *)data;
		error = ppt_disable_msix(sc->vm, pptdev->bus, pptdev->slot,
					 pptdev->func);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
				     pptmmio->func, pptmmio->gpa, pptmmio->len,
				     pptmmio->hpa);
		break;
	case VM_UNMAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_unmap_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
				       pptmmio->func, pptmmio->gpa, pptmmio->len);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					 pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					   pptdev->func);
		break;
	case VM_INJECT_EXCEPTION:
		vmexc = (struct vm_exception *)data;
		error = vm_inject_exception(vcpu,
		    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
		    vmexc->restart_instruction);
		break;
	case VM_INJECT_NMI:
		error = vm_inject_nmi(vcpu);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_intr_edge(vcpu, vmirq->vector);
		break;
	case VM_LAPIC_LOCAL_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_local_intr(sc->vm, vcpu, vmirq->vector);
		break;
	case VM_LAPIC_MSI:
		vmmsi = (struct vm_lapic_msi *)data;
		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
		break;
	case VM_IOAPIC_ASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_DEASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PULSE_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PINCOUNT:
		*(int *)data = vioapic_pincount(sc->vm);
		break;
	case VM_SET_KERNEMU_DEV:
	case VM_GET_KERNEMU_DEV: {
		mem_region_write_t mwrite;
		mem_region_read_t mread;
		bool arg;

		kernemu = (void *)data;

		if (kernemu->access_width > 0)
			size = (1u << kernemu->access_width);
		else
			size = 1;

		if (kernemu->gpa >= DEFAULT_APIC_BASE && kernemu->gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
			mread = lapic_mmio_read;
			mwrite = lapic_mmio_write;
		} else if (kernemu->gpa >= VIOAPIC_BASE && kernemu->gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
			mread = vioapic_mmio_read;
			mwrite = vioapic_mmio_write;
		} else if (kernemu->gpa >= VHPET_BASE && kernemu->gpa < VHPET_BASE + VHPET_SIZE) {
			mread = vhpet_mmio_read;
			mwrite = vhpet_mmio_write;
		} else {
			error = EINVAL;
			break;
		}

		if (cmd == VM_SET_KERNEMU_DEV)
			error = mwrite(vcpu, kernemu->gpa,
			    kernemu->value, size, &arg);
		else
			error = mread(vcpu, kernemu->gpa,
			    &kernemu->value, size, &arg);
		break;
		}
	case VM_ISA_ASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_assert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_DEASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_deassert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_PULSE_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
		break;
	case VM_ISA_SET_IRQ_TRIGGER:
		isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
		error = vatpic_set_irq_trigger(sc->vm,
		    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
		break;
	case VM_MMAP_GETNEXT:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
		break;
	case VM_MMAP_MEMSEG:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
		    mm->len, mm->prot, mm->flags);
		break;
	case VM_MUNMAP_MEMSEG:
		mu = (struct vm_munmap *)data;
		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
		break;
#ifdef COMPAT_FREEBSD12
	case VM_ALLOC_MEMSEG_FBSD12:
		error = alloc_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
		break;
#endif
	case VM_ALLOC_MEMSEG:
		error = alloc_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
#ifdef COMPAT_FREEBSD12
	case VM_GET_MEMSEG_FBSD12:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
		break;
#endif
	case VM_GET_MEMSEG:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(vcpu,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(vcpu,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = vm_get_register_set(vcpu,
			    vmregset->count, regnums, regvals);
		if (error == 0)
			error = copyout(regvals, vmregset->regvals,
			    sizeof(regvals[0]) * vmregset->count);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_SET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = copyin(vmregset->regvals, regvals,
			    sizeof(regvals[0]) * vmregset->count);
		if (error == 0)
			error = vm_set_register_set(vcpu,
			    vmregset->count, regnums, regvals);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(vcpu,
					  vmcap->captype,
					  &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(vcpu,
					  vmcap->captype,
					  vmcap->capval);
		break;
	case VM_SET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_set_x2apic_state(vcpu, x2apic->state);
		break;
	case VM_GET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_get_x2apic_state(vcpu, &x2apic->state);
		break;
	case VM_GET_GPA_PMAP:
		gpapte = (struct vm_gpa_pte *)data;
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
				 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
		error = 0;
		break;
	case VM_GET_HPET_CAPABILITIES:
		error = vhpet_getcap((struct vm_hpet_cap *)data);
		break;
	case VM_GLA2GPA: {
		CTASSERT(PROT_READ == VM_PROT_READ);
		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa(vcpu, &gg->paging, gg->gla,
		    gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	}
	case VM_GLA2GPA_NOFAULT:
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla,
		    gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	case VM_ACTIVATE_CPU:
		error = vm_activate_cpu(vcpu);
		break;
	case VM_GET_CPUS:
		error = 0;
		vm_cpuset = (struct vm_cpuset *)data;
		size = vm_cpuset->cpusetsize;
		if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
			error = ERANGE;
			break;
		}
		cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
		if (vm_cpuset->which == VM_ACTIVE_CPUS)
			*cpuset = vm_active_cpus(sc->vm);
		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
			*cpuset = vm_suspended_cpus(sc->vm);
		else if (vm_cpuset->which == VM_DEBUG_CPUS)
			*cpuset = vm_debug_cpus(sc->vm);
		else
			error = EINVAL;
		if (error == 0)
			error = copyout(cpuset, vm_cpuset->cpus, size);
		free(cpuset, M_TEMP);
		break;
	case VM_SUSPEND_CPU:
		error = vm_suspend_cpu(sc->vm, vcpu);
		break;
	case VM_RESUME_CPU:
		error = vm_resume_cpu(sc->vm, vcpu);
		break;
	case VM_SET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_exit_intinfo(vcpu, vmii->info1);
		break;
	case VM_GET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_get_intinfo(vcpu, &vmii->info1, &vmii->info2);
		break;
	case VM_RTC_WRITE:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_write(sc->vm, rtcdata->offset,
		    rtcdata->value);
		break;
	case VM_RTC_READ:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_read(sc->vm, rtcdata->offset,
		    &rtcdata->value);
		break;
	case VM_RTC_SETTIME:
		rtctime = (struct vm_rtc_time *)data;
		error = vrtc_set_time(sc->vm, rtctime->secs);
		break;
	case VM_RTC_GETTIME:
		error = 0;
		rtctime = (struct vm_rtc_time *)data;
		rtctime->secs = vrtc_get_time(sc->vm);
		break;
	case VM_RESTART_INSTRUCTION:
		error = vm_restart_instruction(vcpu);
		break;
	case VM_SET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		error = vm_set_topology(sc->vm, topology->sockets,
		    topology->cores, topology->threads, topology->maxcpus);
		break;
	case VM_GET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
		    &topology->threads, &topology->maxcpus);
		error = 0;
		break;
#ifdef BHYVE_SNAPSHOT
	case VM_SNAPSHOT_REQ:
		snapshot_meta = (struct vm_snapshot_meta *)data;
		error = vm_snapshot_req(sc->vm, snapshot_meta);
		break;
#ifdef COMPAT_FREEBSD13
	case VM_SNAPSHOT_REQ_OLD:
		/*
		 * The old structure just has an additional pointer at
		 * the start that is ignored.
		 */
		snapshot_old = (struct vm_snapshot_meta_old *)data;
		snapshot_meta =
		    (struct vm_snapshot_meta *)&snapshot_old->dev_data;
		error = vm_snapshot_req(sc->vm, snapshot_meta);
		break;
#endif
	case VM_RESTORE_TIME:
		error = vm_restore_time(sc->vm);
		break;
#endif
	default:
		error = ENOTTY;
		break;
	}

done:
	if (vcpus_locked == SINGLE)
		vcpu_unlock_one(sc, vcpuid, vcpu);
	else if (vcpus_locked == ALL)
		vcpu_unlock_all(sc);
	if (memsegs_locked)
		vm_unlock_memsegs(sc->vm);

	/*
	 * Make sure that no handler returns a kernel-internal
	 * error value to userspace.
	 */
	KASSERT(error == ERESTART || error >= 0,
	    ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}

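/*
 * mmap(2) on /dev/vmm/<vm>: translate the requested offset (a guest
 * physical address) into the backing system memory segment's VM object.
 */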
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
	struct vmmdev_softc *sc;
	vm_paddr_t gpa;
	size_t len;
	vm_ooffset_t segoff, first, last;
	int error, found, segid;
	bool sysmem;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	first = *offset;
	last = first + mapsize;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL) {
		/* virtual machine is in the process of being created */
		return (EINVAL);
	}

	/*
	 * Get a read lock on the guest memory map.
	 */
	vm_slock_memsegs(sc->vm);

	gpa = 0;
	found = 0;
	while (!found) {
		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
		    NULL, NULL);
		if (error)
			break;

		if (first >= gpa && last <= gpa + len)
			found = 1;
		else
			gpa += len;
	}

	if (found) {
		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
		KASSERT(error == 0 && *objp != NULL,
		    ("%s: invalid memory segment %d", __func__, segid));
		if (sysmem) {
			vm_object_reference(*objp);
			*offset = segoff + (first - gpa);
		} else {
			error = EINVAL;
		}
	}
	vm_unlock_memsegs(sc->vm);
	return (error);
}

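/*
 * Tear down a VM instance: freeze the vCPUs, release devmem bookkeeping,
 * destroy the cdev and the VM itself, and unlink the softc.
 */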
static void
vmmdev_destroy(void *arg)
{
	struct vmmdev_softc *sc = arg;
	struct devmem_softc *dsc;
	int error __diagused;

	vm_disable_vcpu_creation(sc->vm);
	error = vcpu_lock_all(sc);
	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
	vm_unlock_vcpus(sc->vm);

	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
		SLIST_REMOVE_HEAD(&sc->devmem, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if (sc->ucred != NULL)
		crfree(sc->ucred);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

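/*
 * hw.vmm.destroy: destroy the named VM along with all of its cdevs.
 */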
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	strlcpy(buf, "beavis", buflen);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error != 0 || req->newptr == NULL)
		goto out;

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		error = EINVAL;
		goto out;
	}

	/*
	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
	 * is scheduled for destruction.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Destroy all cdevs:
	 *
	 * - any new operations on the 'cdev' will return an error (ENXIO).
	 *
	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
	 */
	SLIST_FOREACH(dsc, &sc->devmem, link) {
		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
		destroy_dev(dsc->cdev);
		devmem_destroy(dsc);
	}
	destroy_dev(cdev);
	vmmdev_destroy(sc);
	error = 0;

out:
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_destroy, "A",
    NULL);

static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

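/*
 * hw.vmm.create: create a new VM and its /dev/vmm/<name> device node.
 */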
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	strlcpy(buf, "beavis", buflen);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error != 0 || req->newptr == NULL)
		goto out;

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL) {
		error = EEXIST;
		goto out;
	}

	error = vm_create(buf, &vm);
	if (error != 0)
		goto out;

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->ucred = crhold(curthread->td_ucred);
	sc->vm = vm;
	SLIST_INIT(&sc->devmem);

	/*
	 * Lookup the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		vmmdev_destroy(sc);
		error = EEXIST;
		goto out;
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred,
	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		goto out;
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

out:
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_create, "A",
    NULL);

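/*
 * Register the jail permission bit consulted by vmm_priv_check().
 */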
void
vmmdev_init(void)
{
	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
	    "Allow use of vmm in a jail.");
}

int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}

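/*
 * mmap(2) on a devmem cdev: hand back the device memory segment's VM
 * object as long as the request fits within the segment.
 */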
static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
	struct devmem_softc *dsc;
	vm_ooffset_t first, last;
	size_t seglen;
	int error;
	bool sysmem;

	dsc = cdev->si_drv1;
	if (dsc == NULL) {
		/* 'cdev' has been created but is not ready for use */
		return (ENXIO);
	}

	first = *offset;
	last = *offset + len;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	vm_slock_memsegs(dsc->sc->vm);

	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
	KASSERT(error == 0 && !sysmem && *objp != NULL,
	    ("%s: invalid devmem segment %d", __func__, dsc->segid));

	if (seglen >= last)
		vm_object_reference(*objp);
	else
		error = EINVAL;

	vm_unlock_memsegs(dsc->sc->vm);
	return (error);
}

static struct cdevsw devmemsw = {
	.d_name		= "devmem",
	.d_version	= D_VERSION,
	.d_mmap_single	= devmem_mmap_single,
};

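/*
 * Create the /dev/vmm.io/<vm>.<segname> node for a device memory segment.
 */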
static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	int error;

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
	if (error)
		return (error);

	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(vmname);
	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
	if (sc->cdev == NULL) {
		/* virtual machine is being created or destroyed */
		mtx_unlock(&vmmdev_mtx);
		free(dsc, M_VMMDEV);
		destroy_dev_sched_cb(cdev, NULL, 0);
		return (ENODEV);
	}

	dsc->segid = segid;
	dsc->name = devname;
	dsc->cdev = cdev;
	dsc->sc = sc;
	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
	mtx_unlock(&vmmdev_mtx);

	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
	cdev->si_drv1 = dsc;
	return (0);
}

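/*
 * Called when a devmem cdev is destroyed; the softc itself is freed later
 * by vmmdev_destroy().
 */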
static void
devmem_destroy(void *arg)
{
	struct devmem_softc *dsc = arg;

	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
	dsc->cdev = NULL;
	dsc->sc = NULL;
}