xref: /freebsd/sys/amd64/vmm/vmm_dev.c (revision 15f0b8c3)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_bhyve_snapshot.h"
35 
36 #include <sys/param.h>
37 #include <sys/kernel.h>
38 #include <sys/jail.h>
39 #include <sys/queue.h>
40 #include <sys/lock.h>
41 #include <sys/mutex.h>
42 #include <sys/malloc.h>
43 #include <sys/conf.h>
44 #include <sys/sysctl.h>
45 #include <sys/libkern.h>
46 #include <sys/ioccom.h>
47 #include <sys/mman.h>
48 #include <sys/uio.h>
49 #include <sys/proc.h>
50 
51 #include <vm/vm.h>
52 #include <vm/pmap.h>
53 #include <vm/vm_map.h>
54 #include <vm/vm_object.h>
55 
56 #include <machine/vmparam.h>
57 #include <machine/vmm.h>
58 #include <machine/vmm_dev.h>
59 #include <machine/vmm_instruction_emul.h>
60 #include <machine/vmm_snapshot.h>
61 #include <x86/apicreg.h>
62 
63 #include "vmm_lapic.h"
64 #include "vmm_stat.h"
65 #include "vmm_mem.h"
66 #include "io/ppt.h"
67 #include "io/vatpic.h"
68 #include "io/vioapic.h"
69 #include "io/vhpet.h"
70 #include "io/vrtc.h"
71 
72 #ifdef COMPAT_FREEBSD13
73 struct vm_stats_old {
74 	int		cpuid;				/* in */
75 	int		num_entries;			/* out */
76 	struct timeval	tv;
77 	uint64_t	statbuf[MAX_VM_STATS];
78 };
79 
80 #define	VM_STATS_OLD \
81 	_IOWR('v', IOCNUM_VM_STATS, struct vm_stats_old)
82 #endif
83 
84 struct devmem_softc {
85 	int	segid;
86 	char	*name;
87 	struct cdev *cdev;
88 	struct vmmdev_softc *sc;
89 	SLIST_ENTRY(devmem_softc) link;
90 };
91 
92 struct vmmdev_softc {
93 	struct vm	*vm;		/* vm instance cookie */
94 	struct cdev	*cdev;
95 	struct ucred	*ucred;
96 	SLIST_ENTRY(vmmdev_softc) link;
97 	SLIST_HEAD(, devmem_softc) devmem;
98 	int		flags;
99 };
100 #define	VSC_LINKED		0x01
101 
102 static SLIST_HEAD(, vmmdev_softc) head;
103 
104 static unsigned pr_allow_flag;
105 static struct mtx vmmdev_mtx;
106 MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF);
107 
108 static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
109 
110 SYSCTL_DECL(_hw_vmm);
111 
112 static int vmm_priv_check(struct ucred *ucred);
113 static int devmem_create_cdev(const char *vmname, int id, char *devmem);
114 static void devmem_destroy(void *arg);
115 
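/*
 * Jailed processes may use vmm only when their prison has the allow.vmm
 * permission (registered in vmmdev_init() below).
 */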
116 static int
117 vmm_priv_check(struct ucred *ucred)
118 {
119 
120 	if (jailed(ucred) &&
121 	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
122 		return (EPERM);
123 
124 	return (0);
125 }
126 
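/*
 * A vCPU is "locked" by moving it to the VCPU_FROZEN state, which keeps it
 * from running while an ioctl inspects or modifies its state.
 */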
127 static int
128 vcpu_lock_one(struct vcpu *vcpu)
129 {
130 	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
131 }
132 
133 static void
134 vcpu_unlock_one(struct vmmdev_softc *sc, int vcpuid, struct vcpu *vcpu)
135 {
136 	enum vcpu_state state;
137 
138 	state = vcpu_get_state(vcpu, NULL);
139 	if (state != VCPU_FROZEN) {
140 		panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
141 		    vcpuid, state);
142 	}
143 
144 	vcpu_set_state(vcpu, VCPU_IDLE, false);
145 }
146 
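/*
 * Freeze every allocated vCPU in the VM.  If any vCPU cannot be frozen, the
 * ones frozen so far are released and the vCPU list lock is dropped.
 */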
147 static int
148 vcpu_lock_all(struct vmmdev_softc *sc)
149 {
150 	struct vcpu *vcpu;
151 	int error;
152 	uint16_t i, j, maxcpus;
153 
	error = 0;
154 	vm_slock_vcpus(sc->vm);
155 	maxcpus = vm_get_maxcpus(sc->vm);
156 	for (i = 0; i < maxcpus; i++) {
157 		vcpu = vm_vcpu(sc->vm, i);
158 		if (vcpu == NULL)
159 			continue;
160 		error = vcpu_lock_one(vcpu);
161 		if (error)
162 			break;
163 	}
164 
165 	if (error) {
166 		for (j = 0; j < i; j++) {
167 			vcpu = vm_vcpu(sc->vm, j);
168 			if (vcpu == NULL)
169 				continue;
170 			vcpu_unlock_one(sc, j, vcpu);
171 		}
172 		vm_unlock_vcpus(sc->vm);
173 	}
174 
175 	return (error);
176 }
177 
178 static void
179 vcpu_unlock_all(struct vmmdev_softc *sc)
180 {
181 	struct vcpu *vcpu;
182 	uint16_t i, maxcpus;
183 
184 	maxcpus = vm_get_maxcpus(sc->vm);
185 	for (i = 0; i < maxcpus; i++) {
186 		vcpu = vm_vcpu(sc->vm, i);
187 		if (vcpu == NULL)
188 			continue;
189 		vcpu_unlock_one(sc, i, vcpu);
190 	}
191 	vm_unlock_vcpus(sc->vm);
192 }
193 
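/*
 * Find the softc for a VM by name.  The lookup fails if the caller's
 * credentials cannot see the VM owner's credentials.
 */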
194 static struct vmmdev_softc *
195 vmmdev_lookup(const char *name)
196 {
197 	struct vmmdev_softc *sc;
198 
199 #ifdef notyet	/* XXX kernel is not compiled with invariants */
200 	mtx_assert(&vmmdev_mtx, MA_OWNED);
201 #endif
202 
203 	SLIST_FOREACH(sc, &head, link) {
204 		if (strcmp(name, vm_name(sc->vm)) == 0)
205 			break;
206 	}
207 
208 	if (sc == NULL)
209 		return (NULL);
210 
211 	if (cr_cansee(curthread->td_ucred, sc->ucred))
212 		return (NULL);
213 
214 	return (sc);
215 }
216 
217 static struct vmmdev_softc *
218 vmmdev_lookup2(struct cdev *cdev)
219 {
220 
221 	return (cdev->si_drv1);
222 }
223 
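/*
 * read(2)/write(2) handler for a VM's cdev: copies guest physical memory to
 * or from userspace a page at a time, returning zeroes for reads from holes
 * below the top of guest system memory.
 */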
224 static int
225 vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
226 {
227 	int error, off, c, prot;
228 	vm_paddr_t gpa, maxaddr;
229 	void *hpa, *cookie;
230 	struct vmmdev_softc *sc;
231 
232 	error = vmm_priv_check(curthread->td_ucred);
233 	if (error)
234 		return (error);
235 
236 	sc = vmmdev_lookup2(cdev);
237 	if (sc == NULL)
238 		return (ENXIO);
239 
240 	/*
241 	 * Get a read lock on the guest memory map.
242 	 */
243 	vm_slock_memsegs(sc->vm);
244 
245 	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
246 	maxaddr = vmm_sysmem_maxaddr(sc->vm);
247 	while (uio->uio_resid > 0 && error == 0) {
248 		gpa = uio->uio_offset;
249 		off = gpa & PAGE_MASK;
250 		c = min(uio->uio_resid, PAGE_SIZE - off);
251 
252 		/*
253 		 * The VM has a hole in its physical memory map. If we want to
254 		 * use 'dd' to inspect memory beyond the hole we need to
255 		 * provide bogus data for memory that lies in the hole.
256 		 *
257 		 * Since this device does not support lseek(2), dd(1) will
258 		 * read(2) blocks of data to simulate the lseek(2).
259 		 */
260 		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
261 		if (hpa == NULL) {
262 			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
263 				error = uiomove(__DECONST(void *, zero_region),
264 				    c, uio);
265 			else
266 				error = EFAULT;
267 		} else {
268 			error = uiomove(hpa, c, uio);
269 			vm_gpa_release(cookie);
270 		}
271 	}
272 	vm_unlock_memsegs(sc->vm);
273 	return (error);
274 }
275 
276 CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);
277 
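/*
 * Copy out the attributes of an existing memory segment.  For a device
 * memory segment the devmem name is returned as well.
 */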
278 static int
279 get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
280 {
281 	struct devmem_softc *dsc;
282 	int error;
283 	bool sysmem;
284 
285 	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
286 	if (error || mseg->len == 0)
287 		return (error);
288 
289 	if (!sysmem) {
290 		SLIST_FOREACH(dsc, &sc->devmem, link) {
291 			if (dsc->segid == mseg->segid)
292 				break;
293 		}
294 		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
295 		    __func__, mseg->segid));
296 		error = copystr(dsc->name, mseg->name, len, NULL);
297 	} else {
298 		bzero(mseg->name, len);
299 	}
300 
301 	return (error);
302 }
303 
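/*
 * Allocate a memory segment.  A named segment is device memory and gets a
 * devmem cdev of its own; an unnamed segment is guest system memory.
 */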
304 static int
305 alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
306 {
307 	char *name;
308 	int error;
309 	bool sysmem;
310 
311 	error = 0;
312 	name = NULL;
313 	sysmem = true;
314 
315 	/*
316 	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
317 	 * be stripped off when devfs processes the full string.
318 	 */
319 	if (VM_MEMSEG_NAME(mseg)) {
320 		sysmem = false;
321 		name = malloc(len, M_VMMDEV, M_WAITOK);
322 		error = copystr(mseg->name, name, len, NULL);
323 		if (error)
324 			goto done;
325 	}
326 
327 	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
328 	if (error)
329 		goto done;
330 
331 	if (VM_MEMSEG_NAME(mseg)) {
332 		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
333 		if (error)
334 			vm_free_memseg(sc->vm, mseg->segid);
335 		else
336 			name = NULL;	/* freed when 'cdev' is destroyed */
337 	}
338 done:
339 	free(name, M_VMMDEV);
340 	return (error);
341 }
342 
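/*
 * Helpers for the VM_GET_REGISTER_SET and VM_SET_REGISTER_SET ioctls: read
 * or write a batch of registers, stopping at the first error.
 */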
343 static int
344 vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
345     uint64_t *regval)
346 {
347 	int error, i;
348 
349 	error = 0;
350 	for (i = 0; i < count; i++) {
351 		error = vm_get_register(vcpu, regnum[i], &regval[i]);
352 		if (error)
353 			break;
354 	}
355 	return (error);
356 }
357 
358 static int
359 vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
360     uint64_t *regval)
361 {
362 	int error, i;
363 
364 	error = 0;
365 	for (i = 0; i < count; i++) {
366 		error = vm_set_register(vcpu, regnum[i], regval[i]);
367 		if (error)
368 			break;
369 	}
370 	return (error);
371 }
372 
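/*
 * ioctl handler for a VM's cdev.  The first switch acquires whatever vCPU
 * and memory segment locks the command needs; the second switch dispatches
 * to the corresponding vmm(4) interface.
 */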
373 static int
374 vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
375 	     struct thread *td)
376 {
377 	int error, vcpuid, size;
378 	cpuset_t *cpuset;
379 	struct vmmdev_softc *sc;
380 	struct vcpu *vcpu;
381 	struct vm_register *vmreg;
382 	struct vm_seg_desc *vmsegdesc;
383 	struct vm_register_set *vmregset;
384 	struct vm_run *vmrun;
385 	struct vm_exception *vmexc;
386 	struct vm_lapic_irq *vmirq;
387 	struct vm_lapic_msi *vmmsi;
388 	struct vm_ioapic_irq *ioapic_irq;
389 	struct vm_isa_irq *isa_irq;
390 	struct vm_isa_irq_trigger *isa_irq_trigger;
391 	struct vm_capability *vmcap;
392 	struct vm_pptdev *pptdev;
393 	struct vm_pptdev_mmio *pptmmio;
394 	struct vm_pptdev_msi *pptmsi;
395 	struct vm_pptdev_msix *pptmsix;
396 #ifdef COMPAT_FREEBSD13
397 	struct vm_stats_old *vmstats_old;
398 #endif
399 	struct vm_stats *vmstats;
400 	struct vm_stat_desc *statdesc;
401 	struct vm_x2apic *x2apic;
402 	struct vm_gpa_pte *gpapte;
403 	struct vm_suspend *vmsuspend;
404 	struct vm_gla2gpa *gg;
405 	struct vm_cpuset *vm_cpuset;
406 	struct vm_intinfo *vmii;
407 	struct vm_rtc_time *rtctime;
408 	struct vm_rtc_data *rtcdata;
409 	struct vm_memmap *mm;
410 	struct vm_munmap *mu;
411 	struct vm_cpu_topology *topology;
412 	struct vm_readwrite_kernemu_device *kernemu;
413 	uint64_t *regvals;
414 	int *regnums;
415 	enum { NONE, SINGLE, ALL } vcpus_locked;
416 	bool memsegs_locked;
417 #ifdef BHYVE_SNAPSHOT
418 	struct vm_snapshot_meta *snapshot_meta;
419 #endif
420 
421 	error = vmm_priv_check(curthread->td_ucred);
422 	if (error)
423 		return (error);
424 
425 	sc = vmmdev_lookup2(cdev);
426 	if (sc == NULL)
427 		return (ENXIO);
428 
429 	vcpuid = -1;
430 	vcpu = NULL;
431 	vcpus_locked = NONE;
432 	memsegs_locked = false;
433 
434 	/*
435 	 * For VMM ioctls that operate on a single vCPU, look up the
436 	 * vCPU.  For VMM ioctls that require one or more vCPUs to
437 	 * not be running, lock the necessary vCPUs.
438 	 *
439 	 * XXX fragile, handle with care
440 	 * Most of these assume that the first field of the ioctl data
441 	 * is the vcpuid.
442 	 */
443 	switch (cmd) {
444 	case VM_RUN:
445 	case VM_GET_REGISTER:
446 	case VM_SET_REGISTER:
447 	case VM_GET_SEGMENT_DESCRIPTOR:
448 	case VM_SET_SEGMENT_DESCRIPTOR:
449 	case VM_GET_REGISTER_SET:
450 	case VM_SET_REGISTER_SET:
451 	case VM_INJECT_EXCEPTION:
452 	case VM_GET_CAPABILITY:
453 	case VM_SET_CAPABILITY:
454 	case VM_SET_X2APIC_STATE:
455 	case VM_GLA2GPA:
456 	case VM_GLA2GPA_NOFAULT:
457 	case VM_ACTIVATE_CPU:
458 	case VM_SET_INTINFO:
459 	case VM_GET_INTINFO:
460 	case VM_RESTART_INSTRUCTION:
461 	case VM_GET_KERNEMU_DEV:
462 	case VM_SET_KERNEMU_DEV:
463 		/*
464 		 * ioctls that can operate only on vcpus that are not running.
465 		 */
466 		vcpuid = *(int *)data;
467 		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
468 		if (vcpu == NULL) {
469 			error = EINVAL;
470 			goto done;
471 		}
472 		error = vcpu_lock_one(vcpu);
473 		if (error)
474 			goto done;
475 		vcpus_locked = SINGLE;
476 		break;
477 
478 #ifdef COMPAT_FREEBSD12
479 	case VM_ALLOC_MEMSEG_FBSD12:
480 #endif
481 	case VM_ALLOC_MEMSEG:
482 	case VM_BIND_PPTDEV:
483 	case VM_UNBIND_PPTDEV:
484 	case VM_MMAP_MEMSEG:
485 	case VM_MUNMAP_MEMSEG:
486 	case VM_REINIT:
487 		/*
488 		 * ioctls that modify the memory map must lock memory
489 		 * segments exclusively.
490 		 */
491 		vm_xlock_memsegs(sc->vm);
492 		memsegs_locked = true;
493 		/* FALLTHROUGH */
494 	case VM_MAP_PPTDEV_MMIO:
495 	case VM_UNMAP_PPTDEV_MMIO:
496 #ifdef BHYVE_SNAPSHOT
497 	case VM_SNAPSHOT_REQ:
498 	case VM_RESTORE_TIME:
499 #endif
500 		/*
501 		 * ioctls that operate on the entire virtual machine must
502 		 * prevent all vcpus from running.
503 		 */
504 		error = vcpu_lock_all(sc);
505 		if (error)
506 			goto done;
507 		vcpus_locked = ALL;
508 		break;
509 
510 #ifdef COMPAT_FREEBSD12
511 	case VM_GET_MEMSEG_FBSD12:
512 #endif
513 	case VM_GET_MEMSEG:
514 	case VM_MMAP_GETNEXT:
515 		/*
516 		 * Lock the memory map while it is being inspected.
517 		 */
518 		vm_slock_memsegs(sc->vm);
519 		memsegs_locked = true;
520 		break;
521 
522 #ifdef COMPAT_FREEBSD13
523 	case VM_STATS_OLD:
524 #endif
525 	case VM_STATS:
526 	case VM_INJECT_NMI:
527 	case VM_LAPIC_IRQ:
528 	case VM_GET_X2APIC_STATE:
529 		/*
530 		 * These do not need the vCPU locked but do operate on
531 		 * a specific vCPU.
532 		 */
533 		vcpuid = *(int *)data;
534 		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
535 		if (vcpu == NULL) {
536 			error = EINVAL;
537 			goto done;
538 		}
539 		break;
540 
541 	case VM_LAPIC_LOCAL_IRQ:
542 	case VM_SUSPEND_CPU:
543 	case VM_RESUME_CPU:
544 		/*
545 		 * These can either operate on all CPUs via a vcpuid of
546 		 * -1 or on a specific vCPU.
547 		 */
548 		vcpuid = *(int *)data;
549 		if (vcpuid == -1)
550 			break;
551 		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
552 		if (vcpu == NULL) {
553 			error = EINVAL;
554 			goto done;
555 		}
556 		break;
557 
558 	default:
559 		break;
560 	}
561 
562 	switch (cmd) {
563 	case VM_RUN:
564 		vmrun = (struct vm_run *)data;
565 		error = vm_run(vcpu, &vmrun->vm_exit);
566 		break;
567 	case VM_SUSPEND:
568 		vmsuspend = (struct vm_suspend *)data;
569 		error = vm_suspend(sc->vm, vmsuspend->how);
570 		break;
571 	case VM_REINIT:
572 		error = vm_reinit(sc->vm);
573 		break;
574 	case VM_STAT_DESC: {
575 		statdesc = (struct vm_stat_desc *)data;
576 		error = vmm_stat_desc_copy(statdesc->index,
577 					statdesc->desc, sizeof(statdesc->desc));
578 		break;
579 	}
580 #ifdef COMPAT_FREEBSD13
581 	case VM_STATS_OLD:
582 		vmstats_old = (struct vm_stats_old *)data;
583 		getmicrotime(&vmstats_old->tv);
584 		error = vmm_stat_copy(vcpu, 0,
585 				      nitems(vmstats_old->statbuf),
586 				      &vmstats_old->num_entries,
587 				      vmstats_old->statbuf);
588 		break;
589 #endif
590 	case VM_STATS: {
591 		vmstats = (struct vm_stats *)data;
592 		getmicrotime(&vmstats->tv);
593 		error = vmm_stat_copy(vcpu, vmstats->index,
594 				      nitems(vmstats->statbuf),
595 				      &vmstats->num_entries, vmstats->statbuf);
596 		break;
597 	}
598 	case VM_PPTDEV_MSI:
599 		pptmsi = (struct vm_pptdev_msi *)data;
600 		error = ppt_setup_msi(sc->vm,
601 				      pptmsi->bus, pptmsi->slot, pptmsi->func,
602 				      pptmsi->addr, pptmsi->msg,
603 				      pptmsi->numvec);
604 		break;
605 	case VM_PPTDEV_MSIX:
606 		pptmsix = (struct vm_pptdev_msix *)data;
607 		error = ppt_setup_msix(sc->vm,
608 				       pptmsix->bus, pptmsix->slot,
609 				       pptmsix->func, pptmsix->idx,
610 				       pptmsix->addr, pptmsix->msg,
611 				       pptmsix->vector_control);
612 		break;
613 	case VM_PPTDEV_DISABLE_MSIX:
614 		pptdev = (struct vm_pptdev *)data;
615 		error = ppt_disable_msix(sc->vm, pptdev->bus, pptdev->slot,
616 					 pptdev->func);
617 		break;
618 	case VM_MAP_PPTDEV_MMIO:
619 		pptmmio = (struct vm_pptdev_mmio *)data;
620 		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
621 				     pptmmio->func, pptmmio->gpa, pptmmio->len,
622 				     pptmmio->hpa);
623 		break;
624 	case VM_UNMAP_PPTDEV_MMIO:
625 		pptmmio = (struct vm_pptdev_mmio *)data;
626 		error = ppt_unmap_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
627 				       pptmmio->func, pptmmio->gpa, pptmmio->len);
628 		break;
629 	case VM_BIND_PPTDEV:
630 		pptdev = (struct vm_pptdev *)data;
631 		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
632 					 pptdev->func);
633 		break;
634 	case VM_UNBIND_PPTDEV:
635 		pptdev = (struct vm_pptdev *)data;
636 		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
637 					   pptdev->func);
638 		break;
639 	case VM_INJECT_EXCEPTION:
640 		vmexc = (struct vm_exception *)data;
641 		error = vm_inject_exception(vcpu,
642 		    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
643 		    vmexc->restart_instruction);
644 		break;
645 	case VM_INJECT_NMI:
646 		error = vm_inject_nmi(vcpu);
647 		break;
648 	case VM_LAPIC_IRQ:
649 		vmirq = (struct vm_lapic_irq *)data;
650 		error = lapic_intr_edge(vcpu, vmirq->vector);
651 		break;
652 	case VM_LAPIC_LOCAL_IRQ:
653 		vmirq = (struct vm_lapic_irq *)data;
654 		error = lapic_set_local_intr(sc->vm, vcpu, vmirq->vector);
655 		break;
656 	case VM_LAPIC_MSI:
657 		vmmsi = (struct vm_lapic_msi *)data;
658 		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
659 		break;
660 	case VM_IOAPIC_ASSERT_IRQ:
661 		ioapic_irq = (struct vm_ioapic_irq *)data;
662 		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
663 		break;
664 	case VM_IOAPIC_DEASSERT_IRQ:
665 		ioapic_irq = (struct vm_ioapic_irq *)data;
666 		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
667 		break;
668 	case VM_IOAPIC_PULSE_IRQ:
669 		ioapic_irq = (struct vm_ioapic_irq *)data;
670 		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
671 		break;
672 	case VM_IOAPIC_PINCOUNT:
673 		*(int *)data = vioapic_pincount(sc->vm);
674 		break;
675 	case VM_SET_KERNEMU_DEV:
676 	case VM_GET_KERNEMU_DEV: {
677 		mem_region_write_t mwrite;
678 		mem_region_read_t mread;
679 		bool arg;
680 
681 		kernemu = (void *)data;
682 
683 		if (kernemu->access_width > 0)
684 			size = (1u << kernemu->access_width);
685 		else
686 			size = 1;
687 
688 		if (kernemu->gpa >= DEFAULT_APIC_BASE && kernemu->gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
689 			mread = lapic_mmio_read;
690 			mwrite = lapic_mmio_write;
691 		} else if (kernemu->gpa >= VIOAPIC_BASE && kernemu->gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
692 			mread = vioapic_mmio_read;
693 			mwrite = vioapic_mmio_write;
694 		} else if (kernemu->gpa >= VHPET_BASE && kernemu->gpa < VHPET_BASE + VHPET_SIZE) {
695 			mread = vhpet_mmio_read;
696 			mwrite = vhpet_mmio_write;
697 		} else {
698 			error = EINVAL;
699 			break;
700 		}
701 
702 		if (cmd == VM_SET_KERNEMU_DEV)
703 			error = mwrite(vcpu, kernemu->gpa,
704 			    kernemu->value, size, &arg);
705 		else
706 			error = mread(vcpu, kernemu->gpa,
707 			    &kernemu->value, size, &arg);
708 		break;
709 		}
710 	case VM_ISA_ASSERT_IRQ:
711 		isa_irq = (struct vm_isa_irq *)data;
712 		error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
713 		if (error == 0 && isa_irq->ioapic_irq != -1)
714 			error = vioapic_assert_irq(sc->vm,
715 			    isa_irq->ioapic_irq);
716 		break;
717 	case VM_ISA_DEASSERT_IRQ:
718 		isa_irq = (struct vm_isa_irq *)data;
719 		error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
720 		if (error == 0 && isa_irq->ioapic_irq != -1)
721 			error = vioapic_deassert_irq(sc->vm,
722 			    isa_irq->ioapic_irq);
723 		break;
724 	case VM_ISA_PULSE_IRQ:
725 		isa_irq = (struct vm_isa_irq *)data;
726 		error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
727 		if (error == 0 && isa_irq->ioapic_irq != -1)
728 			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
729 		break;
730 	case VM_ISA_SET_IRQ_TRIGGER:
731 		isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
732 		error = vatpic_set_irq_trigger(sc->vm,
733 		    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
734 		break;
735 	case VM_MMAP_GETNEXT:
736 		mm = (struct vm_memmap *)data;
737 		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
738 		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
739 		break;
740 	case VM_MMAP_MEMSEG:
741 		mm = (struct vm_memmap *)data;
742 		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
743 		    mm->len, mm->prot, mm->flags);
744 		break;
745 	case VM_MUNMAP_MEMSEG:
746 		mu = (struct vm_munmap *)data;
747 		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
748 		break;
749 #ifdef COMPAT_FREEBSD12
750 	case VM_ALLOC_MEMSEG_FBSD12:
751 		error = alloc_memseg(sc, (struct vm_memseg *)data,
752 		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
753 		break;
754 #endif
755 	case VM_ALLOC_MEMSEG:
756 		error = alloc_memseg(sc, (struct vm_memseg *)data,
757 		    sizeof(((struct vm_memseg *)0)->name));
758 		break;
759 #ifdef COMPAT_FREEBSD12
760 	case VM_GET_MEMSEG_FBSD12:
761 		error = get_memseg(sc, (struct vm_memseg *)data,
762 		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
763 		break;
764 #endif
765 	case VM_GET_MEMSEG:
766 		error = get_memseg(sc, (struct vm_memseg *)data,
767 		    sizeof(((struct vm_memseg *)0)->name));
768 		break;
769 	case VM_GET_REGISTER:
770 		vmreg = (struct vm_register *)data;
771 		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
772 		break;
773 	case VM_SET_REGISTER:
774 		vmreg = (struct vm_register *)data;
775 		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
776 		break;
777 	case VM_SET_SEGMENT_DESCRIPTOR:
778 		vmsegdesc = (struct vm_seg_desc *)data;
779 		error = vm_set_seg_desc(vcpu,
780 					vmsegdesc->regnum,
781 					&vmsegdesc->desc);
782 		break;
783 	case VM_GET_SEGMENT_DESCRIPTOR:
784 		vmsegdesc = (struct vm_seg_desc *)data;
785 		error = vm_get_seg_desc(vcpu,
786 					vmsegdesc->regnum,
787 					&vmsegdesc->desc);
788 		break;
789 	case VM_GET_REGISTER_SET:
790 		vmregset = (struct vm_register_set *)data;
791 		if (vmregset->count > VM_REG_LAST) {
792 			error = EINVAL;
793 			break;
794 		}
795 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
796 		    M_WAITOK);
797 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
798 		    M_WAITOK);
799 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
800 		    vmregset->count);
801 		if (error == 0)
802 			error = vm_get_register_set(vcpu,
803 			    vmregset->count, regnums, regvals);
804 		if (error == 0)
805 			error = copyout(regvals, vmregset->regvals,
806 			    sizeof(regvals[0]) * vmregset->count);
807 		free(regvals, M_VMMDEV);
808 		free(regnums, M_VMMDEV);
809 		break;
810 	case VM_SET_REGISTER_SET:
811 		vmregset = (struct vm_register_set *)data;
812 		if (vmregset->count > VM_REG_LAST) {
813 			error = EINVAL;
814 			break;
815 		}
816 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
817 		    M_WAITOK);
818 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
819 		    M_WAITOK);
820 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
821 		    vmregset->count);
822 		if (error == 0)
823 			error = copyin(vmregset->regvals, regvals,
824 			    sizeof(regvals[0]) * vmregset->count);
825 		if (error == 0)
826 			error = vm_set_register_set(vcpu,
827 			    vmregset->count, regnums, regvals);
828 		free(regvals, M_VMMDEV);
829 		free(regnums, M_VMMDEV);
830 		break;
831 	case VM_GET_CAPABILITY:
832 		vmcap = (struct vm_capability *)data;
833 		error = vm_get_capability(vcpu,
834 					  vmcap->captype,
835 					  &vmcap->capval);
836 		break;
837 	case VM_SET_CAPABILITY:
838 		vmcap = (struct vm_capability *)data;
839 		error = vm_set_capability(vcpu,
840 					  vmcap->captype,
841 					  vmcap->capval);
842 		break;
843 	case VM_SET_X2APIC_STATE:
844 		x2apic = (struct vm_x2apic *)data;
845 		error = vm_set_x2apic_state(vcpu, x2apic->state);
846 		break;
847 	case VM_GET_X2APIC_STATE:
848 		x2apic = (struct vm_x2apic *)data;
849 		error = vm_get_x2apic_state(vcpu, &x2apic->state);
850 		break;
851 	case VM_GET_GPA_PMAP:
852 		gpapte = (struct vm_gpa_pte *)data;
853 		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
854 				 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
855 		error = 0;
856 		break;
857 	case VM_GET_HPET_CAPABILITIES:
858 		error = vhpet_getcap((struct vm_hpet_cap *)data);
859 		break;
860 	case VM_GLA2GPA: {
861 		CTASSERT(PROT_READ == VM_PROT_READ);
862 		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
863 		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
864 		gg = (struct vm_gla2gpa *)data;
865 		error = vm_gla2gpa(vcpu, &gg->paging, gg->gla,
866 		    gg->prot, &gg->gpa, &gg->fault);
867 		KASSERT(error == 0 || error == EFAULT,
868 		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
869 		break;
870 	}
871 	case VM_GLA2GPA_NOFAULT:
872 		gg = (struct vm_gla2gpa *)data;
873 		error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla,
874 		    gg->prot, &gg->gpa, &gg->fault);
875 		KASSERT(error == 0 || error == EFAULT,
876 		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
877 		break;
878 	case VM_ACTIVATE_CPU:
879 		error = vm_activate_cpu(vcpu);
880 		break;
881 	case VM_GET_CPUS:
882 		error = 0;
883 		vm_cpuset = (struct vm_cpuset *)data;
884 		size = vm_cpuset->cpusetsize;
885 		if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
886 			error = ERANGE;
887 			break;
888 		}
889 		cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
890 		if (vm_cpuset->which == VM_ACTIVE_CPUS)
891 			*cpuset = vm_active_cpus(sc->vm);
892 		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
893 			*cpuset = vm_suspended_cpus(sc->vm);
894 		else if (vm_cpuset->which == VM_DEBUG_CPUS)
895 			*cpuset = vm_debug_cpus(sc->vm);
896 		else
897 			error = EINVAL;
898 		if (error == 0)
899 			error = copyout(cpuset, vm_cpuset->cpus, size);
900 		free(cpuset, M_TEMP);
901 		break;
902 	case VM_SUSPEND_CPU:
903 		error = vm_suspend_cpu(sc->vm, vcpu);
904 		break;
905 	case VM_RESUME_CPU:
906 		error = vm_resume_cpu(sc->vm, vcpu);
907 		break;
908 	case VM_SET_INTINFO:
909 		vmii = (struct vm_intinfo *)data;
910 		error = vm_exit_intinfo(vcpu, vmii->info1);
911 		break;
912 	case VM_GET_INTINFO:
913 		vmii = (struct vm_intinfo *)data;
914 		error = vm_get_intinfo(vcpu, &vmii->info1, &vmii->info2);
915 		break;
916 	case VM_RTC_WRITE:
917 		rtcdata = (struct vm_rtc_data *)data;
918 		error = vrtc_nvram_write(sc->vm, rtcdata->offset,
919 		    rtcdata->value);
920 		break;
921 	case VM_RTC_READ:
922 		rtcdata = (struct vm_rtc_data *)data;
923 		error = vrtc_nvram_read(sc->vm, rtcdata->offset,
924 		    &rtcdata->value);
925 		break;
926 	case VM_RTC_SETTIME:
927 		rtctime = (struct vm_rtc_time *)data;
928 		error = vrtc_set_time(sc->vm, rtctime->secs);
929 		break;
930 	case VM_RTC_GETTIME:
931 		error = 0;
932 		rtctime = (struct vm_rtc_time *)data;
933 		rtctime->secs = vrtc_get_time(sc->vm);
934 		break;
935 	case VM_RESTART_INSTRUCTION:
936 		error = vm_restart_instruction(vcpu);
937 		break;
938 	case VM_SET_TOPOLOGY:
939 		topology = (struct vm_cpu_topology *)data;
940 		error = vm_set_topology(sc->vm, topology->sockets,
941 		    topology->cores, topology->threads, topology->maxcpus);
942 		break;
943 	case VM_GET_TOPOLOGY:
944 		topology = (struct vm_cpu_topology *)data;
945 		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
946 		    &topology->threads, &topology->maxcpus);
947 		error = 0;
948 		break;
949 #ifdef BHYVE_SNAPSHOT
950 	case VM_SNAPSHOT_REQ:
951 		snapshot_meta = (struct vm_snapshot_meta *)data;
952 		error = vm_snapshot_req(sc->vm, snapshot_meta);
953 		break;
954 	case VM_RESTORE_TIME:
955 		error = vm_restore_time(sc->vm);
956 		break;
957 #endif
958 	default:
959 		error = ENOTTY;
960 		break;
961 	}
962 
963 	if (vcpus_locked == SINGLE)
964 		vcpu_unlock_one(sc, vcpuid, vcpu);
965 	else if (vcpus_locked == ALL)
966 		vcpu_unlock_all(sc);
967 	if (memsegs_locked)
968 		vm_unlock_memsegs(sc->vm);
969 
970 done:
971 	/*
972 	 * Make sure that no handler returns a kernel-internal
973 	 * error value to userspace.
974 	 */
975 	KASSERT(error == ERESTART || error >= 0,
976 	    ("vmmdev_ioctl: invalid error return %d", error));
977 	return (error);
978 }
979 
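/*
 * mmap(2) handler for a VM's cdev: translates the requested range into the
 * VM object backing the system memory segment that maps it.
 */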
980 static int
981 vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
982     struct vm_object **objp, int nprot)
983 {
984 	struct vmmdev_softc *sc;
985 	vm_paddr_t gpa;
986 	size_t len;
987 	vm_ooffset_t segoff, first, last;
988 	int error, found, segid;
989 	bool sysmem;
990 
991 	error = vmm_priv_check(curthread->td_ucred);
992 	if (error)
993 		return (error);
994 
995 	first = *offset;
996 	last = first + mapsize;
997 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
998 		return (EINVAL);
999 
1000 	sc = vmmdev_lookup2(cdev);
1001 	if (sc == NULL) {
1002 		/* virtual machine is in the process of being created */
1003 		return (EINVAL);
1004 	}
1005 
1006 	/*
1007 	 * Get a read lock on the guest memory map.
1008 	 */
1009 	vm_slock_memsegs(sc->vm);
1010 
1011 	gpa = 0;
1012 	found = 0;
1013 	while (!found) {
1014 		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
1015 		    NULL, NULL);
1016 		if (error)
1017 			break;
1018 
1019 		if (first >= gpa && last <= gpa + len)
1020 			found = 1;
1021 		else
1022 			gpa += len;
1023 	}
1024 
1025 	if (found) {
1026 		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
1027 		KASSERT(error == 0 && *objp != NULL,
1028 		    ("%s: invalid memory segment %d", __func__, segid));
1029 		if (sysmem) {
1030 			vm_object_reference(*objp);
1031 			*offset = segoff + (first - gpa);
1032 		} else {
1033 			error = EINVAL;
1034 		}
1035 	}
1036 	vm_unlock_memsegs(sc->vm);
1037 	return (error);
1038 }
1039 
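/*
 * Tear down a VM: block further vCPU creation, release the devmem softcs,
 * destroy the cdev if it still exists and finally destroy the VM itself.
 */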
1040 static void
1041 vmmdev_destroy(void *arg)
1042 {
1043 	struct vmmdev_softc *sc = arg;
1044 	struct devmem_softc *dsc;
1045 	int error __diagused;
1046 
1047 	vm_disable_vcpu_creation(sc->vm);
1048 	error = vcpu_lock_all(sc);
1049 	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
1050 	vm_unlock_vcpus(sc->vm);
1051 
1052 	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
1053 		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
1054 		SLIST_REMOVE_HEAD(&sc->devmem, link);
1055 		free(dsc->name, M_VMMDEV);
1056 		free(dsc, M_VMMDEV);
1057 	}
1058 
1059 	if (sc->cdev != NULL)
1060 		destroy_dev(sc->cdev);
1061 
1062 	if (sc->vm != NULL)
1063 		vm_destroy(sc->vm);
1064 
1065 	if (sc->ucred != NULL)
1066 		crfree(sc->ucred);
1067 
1068 	if ((sc->flags & VSC_LINKED) != 0) {
1069 		mtx_lock(&vmmdev_mtx);
1070 		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
1071 		mtx_unlock(&vmmdev_mtx);
1072 	}
1073 
1074 	free(sc, M_VMMDEV);
1075 }
1076 
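/*
 * hw.vmm.destroy: look up the named VM, detach its devmem and VM cdevs and
 * destroy it.
 */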
1077 static int
1078 sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
1079 {
1080 	struct devmem_softc *dsc;
1081 	struct vmmdev_softc *sc;
1082 	struct cdev *cdev;
1083 	char *buf;
1084 	int error, buflen;
1085 
1086 	error = vmm_priv_check(req->td->td_ucred);
1087 	if (error)
1088 		return (error);
1089 
1090 	buflen = VM_MAX_NAMELEN + 1;
1091 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
1092 	strlcpy(buf, "beavis", buflen);
1093 	error = sysctl_handle_string(oidp, buf, buflen, req);
1094 	if (error != 0 || req->newptr == NULL)
1095 		goto out;
1096 
1097 	mtx_lock(&vmmdev_mtx);
1098 	sc = vmmdev_lookup(buf);
1099 	if (sc == NULL || sc->cdev == NULL) {
1100 		mtx_unlock(&vmmdev_mtx);
1101 		error = EINVAL;
1102 		goto out;
1103 	}
1104 
1105 	/*
1106 	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
1107 	 * is scheduled for destruction.
1108 	 */
1109 	cdev = sc->cdev;
1110 	sc->cdev = NULL;
1111 	mtx_unlock(&vmmdev_mtx);
1112 
1113 	/*
1114 	 * Destroy all cdevs:
1115 	 *
1116 	 * - any new operations on the 'cdev' will return an error (ENXIO).
1117 	 *
1118 	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
1119 	 */
1120 	SLIST_FOREACH(dsc, &sc->devmem, link) {
1121 		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
1122 		destroy_dev(dsc->cdev);
1123 		devmem_destroy(dsc);
1124 	}
1125 	destroy_dev(cdev);
1126 	vmmdev_destroy(sc);
1127 	error = 0;
1128 
1129 out:
1130 	free(buf, M_VMMDEV);
1131 	return (error);
1132 }
1133 SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
1134     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
1135     NULL, 0, sysctl_vmm_destroy, "A",
1136     NULL);
1137 
1138 static struct cdevsw vmmdevsw = {
1139 	.d_name		= "vmmdev",
1140 	.d_version	= D_VERSION,
1141 	.d_ioctl	= vmmdev_ioctl,
1142 	.d_mmap_single	= vmmdev_mmap_single,
1143 	.d_read		= vmmdev_rw,
1144 	.d_write	= vmmdev_rw,
1145 };
1146 
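/*
 * hw.vmm.create: create a new VM and its /dev/vmm/<name> cdev.
 */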
1147 static int
1148 sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
1149 {
1150 	struct vm *vm;
1151 	struct cdev *cdev;
1152 	struct vmmdev_softc *sc, *sc2;
1153 	char *buf;
1154 	int error, buflen;
1155 
1156 	error = vmm_priv_check(req->td->td_ucred);
1157 	if (error)
1158 		return (error);
1159 
1160 	buflen = VM_MAX_NAMELEN + 1;
1161 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
1162 	strlcpy(buf, "beavis", buflen);
1163 	error = sysctl_handle_string(oidp, buf, buflen, req);
1164 	if (error != 0 || req->newptr == NULL)
1165 		goto out;
1166 
1167 	mtx_lock(&vmmdev_mtx);
1168 	sc = vmmdev_lookup(buf);
1169 	mtx_unlock(&vmmdev_mtx);
1170 	if (sc != NULL) {
1171 		error = EEXIST;
1172 		goto out;
1173 	}
1174 
1175 	error = vm_create(buf, &vm);
1176 	if (error != 0)
1177 		goto out;
1178 
1179 	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
1180 	sc->ucred = crhold(curthread->td_ucred);
1181 	sc->vm = vm;
1182 	SLIST_INIT(&sc->devmem);
1183 
1184 	/*
1185 	 * Look up the name again just in case somebody sneaked in when we
1186 	 * dropped the lock.
1187 	 */
1188 	mtx_lock(&vmmdev_mtx);
1189 	sc2 = vmmdev_lookup(buf);
1190 	if (sc2 == NULL) {
1191 		SLIST_INSERT_HEAD(&head, sc, link);
1192 		sc->flags |= VSC_LINKED;
1193 	}
1194 	mtx_unlock(&vmmdev_mtx);
1195 
1196 	if (sc2 != NULL) {
1197 		vmmdev_destroy(sc);
1198 		error = EEXIST;
1199 		goto out;
1200 	}
1201 
1202 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred,
1203 	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
1204 	if (error != 0) {
1205 		vmmdev_destroy(sc);
1206 		goto out;
1207 	}
1208 
1209 	mtx_lock(&vmmdev_mtx);
1210 	sc->cdev = cdev;
1211 	sc->cdev->si_drv1 = sc;
1212 	mtx_unlock(&vmmdev_mtx);
1213 
1214 out:
1215 	free(buf, M_VMMDEV);
1216 	return (error);
1217 }
1218 SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
1219     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
1220     NULL, 0, sysctl_vmm_create, "A",
1221     NULL);
1222 
1223 void
1224 vmmdev_init(void)
1225 {
1226 	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
1227 	    "Allow use of vmm in a jail.");
1228 }
1229 
1230 int
1231 vmmdev_cleanup(void)
1232 {
1233 	int error;
1234 
1235 	if (SLIST_EMPTY(&head))
1236 		error = 0;
1237 	else
1238 		error = EBUSY;
1239 
1240 	return (error);
1241 }
1242 
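/*
 * mmap(2) handler for a devmem cdev: returns the VM object backing the
 * device memory segment, provided the request fits within it.
 */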
1243 static int
1244 devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
1245     struct vm_object **objp, int nprot)
1246 {
1247 	struct devmem_softc *dsc;
1248 	vm_ooffset_t first, last;
1249 	size_t seglen;
1250 	int error;
1251 	bool sysmem;
1252 
1253 	dsc = cdev->si_drv1;
1254 	if (dsc == NULL) {
1255 		/* 'cdev' has been created but is not ready for use */
1256 		return (ENXIO);
1257 	}
1258 
1259 	first = *offset;
1260 	last = *offset + len;
1261 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
1262 		return (EINVAL);
1263 
1264 	vm_slock_memsegs(dsc->sc->vm);
1265 
1266 	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
1267 	KASSERT(error == 0 && !sysmem && *objp != NULL,
1268 	    ("%s: invalid devmem segment %d", __func__, dsc->segid));
1269 
1270 	if (seglen >= last)
1271 		vm_object_reference(*objp);
1272 	else
1273 		error = EINVAL;
1274 
1275 	vm_unlock_memsegs(dsc->sc->vm);
1276 	return (error);
1277 }
1278 
1279 static struct cdevsw devmemsw = {
1280 	.d_name		= "devmem",
1281 	.d_version	= D_VERSION,
1282 	.d_mmap_single	= devmem_mmap_single,
1283 };
1284 
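/*
 * Create a /dev/vmm.io/<vm>.<name> cdev for a device memory segment and
 * link it into the owning VM's devmem list.
 */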
1285 static int
1286 devmem_create_cdev(const char *vmname, int segid, char *devname)
1287 {
1288 	struct devmem_softc *dsc;
1289 	struct vmmdev_softc *sc;
1290 	struct cdev *cdev;
1291 	int error;
1292 
1293 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
1294 	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
1295 	if (error)
1296 		return (error);
1297 
1298 	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
1299 
1300 	mtx_lock(&vmmdev_mtx);
1301 	sc = vmmdev_lookup(vmname);
1302 	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
1303 	if (sc->cdev == NULL) {
1304 		/* virtual machine is being created or destroyed */
1305 		mtx_unlock(&vmmdev_mtx);
1306 		free(dsc, M_VMMDEV);
1307 		destroy_dev_sched_cb(cdev, NULL, 0);
1308 		return (ENODEV);
1309 	}
1310 
1311 	dsc->segid = segid;
1312 	dsc->name = devname;
1313 	dsc->cdev = cdev;
1314 	dsc->sc = sc;
1315 	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
1316 	mtx_unlock(&vmmdev_mtx);
1317 
1318 	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
1319 	cdev->si_drv1 = dsc;
1320 	return (0);
1321 }
1322 
1323 static void
1324 devmem_destroy(void *arg)
1325 {
1326 	struct devmem_softc *dsc = arg;
1327 
1328 	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
1329 	dsc->cdev = NULL;
1330 	dsc->sc = NULL;
1331 }
1332