/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_bhyve_snapshot.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/jail.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_snapshot.h>
#include <x86/apicreg.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

#ifdef COMPAT_FREEBSD13
struct vm_stats_old {
	int		cpuid;				/* in */
	int		num_entries;			/* out */
	struct timeval	tv;
	uint64_t	statbuf[MAX_VM_STATS];
};

#define	VM_STATS_OLD \
	_IOWR('v', IOCNUM_VM_STATS, struct vm_stats_old)

struct vm_snapshot_meta_old {
	void *ctx;			/* unused */
	void *dev_data;
	const char *dev_name;      /* identify userspace devices */
	enum snapshot_req dev_req; /* identify kernel structs */

	struct vm_snapshot_buffer buffer;

	enum vm_snapshot_op op;
};

#define VM_SNAPSHOT_REQ_OLD \
	_IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta_old)
#endif

struct devmem_softc {
	int	segid;
	char	*name;
	struct cdev *cdev;
	struct vmmdev_softc *sc;
	SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	struct ucred	*ucred;
	SLIST_ENTRY(vmmdev_softc) link;
	SLIST_HEAD(, devmem_softc) devmem;
	int		flags;
};
#define	VSC_LINKED		0x01

static SLIST_HEAD(, vmmdev_softc) head;

static unsigned pr_allow_flag;
static struct mtx vmmdev_mtx;
MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF);

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int vmm_priv_check(struct ucred *ucred);
static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

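/*
 * A jailed process may use vmm(4) only if the jail grants the "allow.vmm"
 * permission bit (registered in vmmdev_init() below).
 */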
static int
vmm_priv_check(struct ucred *ucred)
{

	if (jailed(ucred) &&
	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
		return (EPERM);

	return (0);
}

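/*
 * A vCPU is "locked" by freezing it: moving it to VCPU_FROZEN guarantees
 * that it is not running guest code while an ioctl handler manipulates
 * its state.  Unlocking returns the vCPU to VCPU_IDLE.
 */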
static int
vcpu_lock_one(struct vcpu *vcpu)
{
	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
}

static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpuid, struct vcpu *vcpu)
{
	enum vcpu_state state;

	state = vcpu_get_state(vcpu, NULL);
	if (state != VCPU_FROZEN) {
		panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
		    vcpuid, state);
	}

	vcpu_set_state(vcpu, VCPU_IDLE, false);
}

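/*
 * Freeze every created vCPU, unwinding the vCPUs already frozen if a
 * later one cannot be locked.
 */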
static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
	struct vcpu *vcpu;
	int error;
	uint16_t i, j, maxcpus;

	error = 0;
	vm_slock_vcpus(sc->vm);
	maxcpus = vm_get_maxcpus(sc->vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = vm_vcpu(sc->vm, i);
		if (vcpu == NULL)
			continue;
		error = vcpu_lock_one(vcpu);
		if (error)
			break;
	}

	if (error) {
		for (j = 0; j < i; j++) {
			vcpu = vm_vcpu(sc->vm, j);
			if (vcpu == NULL)
				continue;
			vcpu_unlock_one(sc, j, vcpu);
		}
		vm_unlock_vcpus(sc->vm);
	}

	return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
	struct vcpu *vcpu;
	uint16_t i, maxcpus;

	maxcpus = vm_get_maxcpus(sc->vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = vm_vcpu(sc->vm, i);
		if (vcpu == NULL)
			continue;
		vcpu_unlock_one(sc, i, vcpu);
	}
	vm_unlock_vcpus(sc->vm);
}

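/*
 * Find the softc for the named VM.  The caller's credentials must be able
 * to "see" the credentials of the VM's creator for the lookup to succeed.
 */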
static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	if (sc == NULL)
		return (NULL);

	if (cr_cansee(curthread->td_ucred, sc->ucred))
		return (NULL);

	return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

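/*
 * read(2)/write(2) handler for /dev/vmm/<name>: the file offset is
 * interpreted as a guest physical address, so guest memory can be
 * inspected or patched (e.g. with dd(1)) one page at a time.
 */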
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa, maxaddr;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	/*
	 * Get a read lock on the guest memory map.
	 */
	vm_slock_memsegs(sc->vm);

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	maxaddr = vmm_sysmem_maxaddr(sc->vm);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
				error = uiomove(__DECONST(void *, zero_region),
				    c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	vm_unlock_memsegs(sc->vm);
	return (error);
}

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);

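/*
 * Copy the attributes of memory segment 'mseg->segid' out to userspace.
 * Named (devmem) segments also report the name under which they appear
 * in /dev/vmm.io.
 */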
static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	struct devmem_softc *dsc;
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		SLIST_FOREACH(dsc, &sc->devmem, link) {
			if (dsc->segid == mseg->segid)
				break;
		}
		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
		    __func__, mseg->segid));
		error = copystr(dsc->name, mseg->name, len, NULL);
	} else {
		bzero(mseg->name, len);
	}

	return (error);
}

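/*
 * Allocate a memory segment.  A named (non-sysmem) segment also gets a
 * devmem cdev so that userspace can mmap(2) it directly.
 */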
static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	char *name;
	int error;
	bool sysmem;

	error = 0;
	name = NULL;
	sysmem = true;

	/*
	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
	 * be stripped off when devfs processes the full string.
	 */
	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
		name = malloc(len, M_VMMDEV, M_WAITOK);
		error = copystr(mseg->name, name, len, NULL);
		if (error)
			goto done;
	}

	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
	if (error)
		goto done;

	if (VM_MEMSEG_NAME(mseg)) {
		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
		if (error)
			vm_free_memseg(sc->vm, mseg->segid);
		else
			name = NULL;	/* freed when 'cdev' is destroyed */
	}
done:
	free(name, M_VMMDEV);
	return (error);
}

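/*
 * Batched register accessors backing VM_GET_REGISTER_SET and
 * VM_SET_REGISTER_SET; both stop at the first register that fails.
 */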
static int
vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_get_register(vcpu, regnum[i], &regval[i]);
		if (error)
			break;
	}
	return (error);
}

static int
vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_set_register(vcpu, regnum[i], regval[i]);
		if (error)
			break;
	}
	return (error);
}

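/*
 * Main ioctl dispatcher for /dev/vmm/<name>.  The first switch decides
 * which locks the command requires (a single frozen vCPU, all vCPUs, or
 * the memory segment lock, shared or exclusive); the second switch
 * performs the actual operation.
 */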
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
	     struct thread *td)
{
	int error, vcpuid, size;
	cpuset_t *cpuset;
	struct vmmdev_softc *sc;
	struct vcpu *vcpu;
	struct vm_register *vmreg;
	struct vm_seg_desc *vmsegdesc;
	struct vm_register_set *vmregset;
	struct vm_run *vmrun;
	struct vm_exception *vmexc;
	struct vm_lapic_irq *vmirq;
	struct vm_lapic_msi *vmmsi;
	struct vm_ioapic_irq *ioapic_irq;
	struct vm_isa_irq *isa_irq;
	struct vm_isa_irq_trigger *isa_irq_trigger;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_pptdev_msix *pptmsix;
#ifdef COMPAT_FREEBSD13
	struct vm_stats_old *vmstats_old;
#endif
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_x2apic *x2apic;
	struct vm_gpa_pte *gpapte;
	struct vm_suspend *vmsuspend;
	struct vm_gla2gpa *gg;
	struct vm_cpuset *vm_cpuset;
	struct vm_intinfo *vmii;
	struct vm_rtc_time *rtctime;
	struct vm_rtc_data *rtcdata;
	struct vm_memmap *mm;
	struct vm_munmap *mu;
	struct vm_cpu_topology *topology;
	struct vm_readwrite_kernemu_device *kernemu;
	uint64_t *regvals;
	int *regnums;
	enum { NONE, SINGLE, ALL } vcpus_locked;
	bool memsegs_locked;
#ifdef BHYVE_SNAPSHOT
	struct vm_snapshot_meta *snapshot_meta;
#ifdef COMPAT_FREEBSD13
	struct vm_snapshot_meta_old *snapshot_old;
#endif
#endif

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	vcpuid = -1;
	vcpu = NULL;
	vcpus_locked = NONE;
	memsegs_locked = false;

	/*
	 * For VMM ioctls that operate on a single vCPU, look up the
	 * vCPU.  For VMM ioctls that require one or more vCPUs to
	 * not be running, lock the necessary vCPUs.
	 *
	 * XXX fragile, handle with care
	 * Most of these assume that the first field of the ioctl data
	 * is the vcpuid.
	 */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_GET_REGISTER_SET:
	case VM_SET_REGISTER_SET:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_SET_X2APIC_STATE:
	case VM_GLA2GPA:
	case VM_GLA2GPA_NOFAULT:
	case VM_ACTIVATE_CPU:
	case VM_SET_INTINFO:
	case VM_GET_INTINFO:
	case VM_RESTART_INSTRUCTION:
	case VM_GET_KERNEMU_DEV:
	case VM_SET_KERNEMU_DEV:
		/*
		 * ioctls that can operate only on vcpus that are not running.
		 */
		vcpuid = *(int *)data;
		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
		if (vcpu == NULL) {
			error = EINVAL;
			goto done;
		}
		error = vcpu_lock_one(vcpu);
		if (error)
			goto done;
		vcpus_locked = SINGLE;
		break;

#ifdef COMPAT_FREEBSD12
	case VM_ALLOC_MEMSEG_FBSD12:
#endif
	case VM_ALLOC_MEMSEG:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_MMAP_MEMSEG:
	case VM_MUNMAP_MEMSEG:
	case VM_REINIT:
		/*
		 * ioctls that modify the memory map must lock memory
		 * segments exclusively.
		 */
		vm_xlock_memsegs(sc->vm);
		memsegs_locked = true;
		/* FALLTHROUGH */
	case VM_MAP_PPTDEV_MMIO:
	case VM_UNMAP_PPTDEV_MMIO:
#ifdef BHYVE_SNAPSHOT
	case VM_SNAPSHOT_REQ:
#ifdef COMPAT_FREEBSD13
	case VM_SNAPSHOT_REQ_OLD:
#endif
	case VM_RESTORE_TIME:
#endif
		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = vcpu_lock_all(sc);
		if (error)
			goto done;
		vcpus_locked = ALL;
		break;

#ifdef COMPAT_FREEBSD12
	case VM_GET_MEMSEG_FBSD12:
#endif
	case VM_GET_MEMSEG:
	case VM_MMAP_GETNEXT:
		/*
		 * Lock the memory map while it is being inspected.
		 */
		vm_slock_memsegs(sc->vm);
		memsegs_locked = true;
		break;

#ifdef COMPAT_FREEBSD13
	case VM_STATS_OLD:
#endif
	case VM_STATS:
	case VM_INJECT_NMI:
	case VM_LAPIC_IRQ:
	case VM_GET_X2APIC_STATE:
		/*
		 * These do not need the vCPU locked but do operate on
		 * a specific vCPU.
		 */
		vcpuid = *(int *)data;
		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
		if (vcpu == NULL) {
			error = EINVAL;
			goto done;
		}
		break;

	case VM_LAPIC_LOCAL_IRQ:
	case VM_SUSPEND_CPU:
	case VM_RESUME_CPU:
		/*
		 * These can either operate on all CPUs via a vcpuid of
		 * -1 or on a specific vCPU.
		 */
		vcpuid = *(int *)data;
		if (vcpuid == -1)
			break;
		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
		if (vcpu == NULL) {
			error = EINVAL;
			goto done;
		}
		break;

	default:
		break;
	}

	switch (cmd) {
	case VM_RUN:
		vmrun = (struct vm_run *)data;
		error = vm_run(vcpu, &vmrun->vm_exit);
		break;
	case VM_SUSPEND:
		vmsuspend = (struct vm_suspend *)data;
		error = vm_suspend(sc->vm, vmsuspend->how);
		break;
	case VM_REINIT:
		error = vm_reinit(sc->vm);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
		    statdesc->desc, sizeof(statdesc->desc));
		break;
	}
#ifdef COMPAT_FREEBSD13
	case VM_STATS_OLD:
		vmstats_old = (struct vm_stats_old *)data;
		getmicrotime(&vmstats_old->tv);
		error = vmm_stat_copy(vcpu, 0,
		    nitems(vmstats_old->statbuf),
		    &vmstats_old->num_entries,
		    vmstats_old->statbuf);
		break;
#endif
	case VM_STATS: {
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(vcpu, vmstats->index,
		    nitems(vmstats->statbuf),
		    &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm,
		    pptmsi->bus, pptmsi->slot, pptmsi->func,
		    pptmsi->addr, pptmsi->msg,
		    pptmsi->numvec);
		break;
	case VM_PPTDEV_MSIX:
		pptmsix = (struct vm_pptdev_msix *)data;
		error = ppt_setup_msix(sc->vm,
		    pptmsix->bus, pptmsix->slot,
		    pptmsix->func, pptmsix->idx,
		    pptmsix->addr, pptmsix->msg,
		    pptmsix->vector_control);
		break;
	case VM_PPTDEV_DISABLE_MSIX:
		pptdev = (struct vm_pptdev *)data;
		error = ppt_disable_msix(sc->vm, pptdev->bus, pptdev->slot,
		    pptdev->func);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
		    pptmmio->func, pptmmio->gpa, pptmmio->len,
		    pptmmio->hpa);
		break;
	case VM_UNMAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_unmap_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
		    pptmmio->func, pptmmio->gpa, pptmmio->len);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
		    pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
		    pptdev->func);
		break;
	case VM_INJECT_EXCEPTION:
		vmexc = (struct vm_exception *)data;
		error = vm_inject_exception(vcpu,
		    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
		    vmexc->restart_instruction);
		break;
	case VM_INJECT_NMI:
		error = vm_inject_nmi(vcpu);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_intr_edge(vcpu, vmirq->vector);
		break;
	case VM_LAPIC_LOCAL_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_local_intr(sc->vm, vcpu, vmirq->vector);
		break;
	case VM_LAPIC_MSI:
		vmmsi = (struct vm_lapic_msi *)data;
		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
		break;
	case VM_IOAPIC_ASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_DEASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PULSE_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PINCOUNT:
		*(int *)data = vioapic_pincount(sc->vm);
		break;
	case VM_SET_KERNEMU_DEV:
	case VM_GET_KERNEMU_DEV: {
		mem_region_write_t mwrite;
		mem_region_read_t mread;
		bool arg;

		kernemu = (void *)data;

		if (kernemu->access_width > 0)
			size = (1u << kernemu->access_width);
		else
			size = 1;

		if (kernemu->gpa >= DEFAULT_APIC_BASE &&
		    kernemu->gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
			mread = lapic_mmio_read;
			mwrite = lapic_mmio_write;
		} else if (kernemu->gpa >= VIOAPIC_BASE &&
		    kernemu->gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
			mread = vioapic_mmio_read;
			mwrite = vioapic_mmio_write;
		} else if (kernemu->gpa >= VHPET_BASE &&
		    kernemu->gpa < VHPET_BASE + VHPET_SIZE) {
			mread = vhpet_mmio_read;
			mwrite = vhpet_mmio_write;
		} else {
			error = EINVAL;
			break;
		}

		if (cmd == VM_SET_KERNEMU_DEV)
			error = mwrite(vcpu, kernemu->gpa,
			    kernemu->value, size, &arg);
		else
			error = mread(vcpu, kernemu->gpa,
			    &kernemu->value, size, &arg);
		break;
		}
	case VM_ISA_ASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_assert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_DEASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_deassert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_PULSE_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
		break;
	case VM_ISA_SET_IRQ_TRIGGER:
		isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
		error = vatpic_set_irq_trigger(sc->vm,
		    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
		break;
	case VM_MMAP_GETNEXT:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
		break;
	case VM_MMAP_MEMSEG:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
		    mm->len, mm->prot, mm->flags);
		break;
	case VM_MUNMAP_MEMSEG:
		mu = (struct vm_munmap *)data;
		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
		break;
#ifdef COMPAT_FREEBSD12
	case VM_ALLOC_MEMSEG_FBSD12:
		error = alloc_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
		break;
#endif
	case VM_ALLOC_MEMSEG:
		error = alloc_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
#ifdef COMPAT_FREEBSD12
	case VM_GET_MEMSEG_FBSD12:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
		break;
#endif
	case VM_GET_MEMSEG:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(vcpu, vmsegdesc->regnum,
		    &vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(vcpu, vmsegdesc->regnum,
		    &vmsegdesc->desc);
		break;
	case VM_GET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = vm_get_register_set(vcpu,
			    vmregset->count, regnums, regvals);
		if (error == 0)
			error = copyout(regvals, vmregset->regvals,
			    sizeof(regvals[0]) * vmregset->count);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_SET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = copyin(vmregset->regvals, regvals,
			    sizeof(regvals[0]) * vmregset->count);
		if (error == 0)
			error = vm_set_register_set(vcpu,
			    vmregset->count, regnums, regvals);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(vcpu, vmcap->captype,
		    &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(vcpu, vmcap->captype,
		    vmcap->capval);
		break;
	case VM_SET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_set_x2apic_state(vcpu, x2apic->state);
		break;
	case VM_GET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_get_x2apic_state(vcpu, &x2apic->state);
		break;
	case VM_GET_GPA_PMAP:
		gpapte = (struct vm_gpa_pte *)data;
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
		    gpapte->gpa, gpapte->pte, &gpapte->ptenum);
		error = 0;
		break;
	case VM_GET_HPET_CAPABILITIES:
		error = vhpet_getcap((struct vm_hpet_cap *)data);
		break;
	case VM_GLA2GPA: {
		CTASSERT(PROT_READ == VM_PROT_READ);
		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa(vcpu, &gg->paging, gg->gla,
		    gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	}
	case VM_GLA2GPA_NOFAULT:
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla,
		    gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	case VM_ACTIVATE_CPU:
		error = vm_activate_cpu(vcpu);
		break;
	case VM_GET_CPUS:
		error = 0;
		vm_cpuset = (struct vm_cpuset *)data;
		size = vm_cpuset->cpusetsize;
		if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
			error = ERANGE;
			break;
		}
		cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
		if (vm_cpuset->which == VM_ACTIVE_CPUS)
			*cpuset = vm_active_cpus(sc->vm);
		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
			*cpuset = vm_suspended_cpus(sc->vm);
		else if (vm_cpuset->which == VM_DEBUG_CPUS)
			*cpuset = vm_debug_cpus(sc->vm);
		else
			error = EINVAL;
		if (error == 0)
			error = copyout(cpuset, vm_cpuset->cpus, size);
		free(cpuset, M_TEMP);
		break;
	case VM_SUSPEND_CPU:
		error = vm_suspend_cpu(sc->vm, vcpu);
		break;
	case VM_RESUME_CPU:
		error = vm_resume_cpu(sc->vm, vcpu);
		break;
	case VM_SET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_exit_intinfo(vcpu, vmii->info1);
		break;
	case VM_GET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_get_intinfo(vcpu, &vmii->info1, &vmii->info2);
		break;
	case VM_RTC_WRITE:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_write(sc->vm, rtcdata->offset,
		    rtcdata->value);
		break;
	case VM_RTC_READ:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_read(sc->vm, rtcdata->offset,
		    &rtcdata->value);
		break;
	case VM_RTC_SETTIME:
		rtctime = (struct vm_rtc_time *)data;
		error = vrtc_set_time(sc->vm, rtctime->secs);
		break;
	case VM_RTC_GETTIME:
		error = 0;
		rtctime = (struct vm_rtc_time *)data;
		rtctime->secs = vrtc_get_time(sc->vm);
		break;
	case VM_RESTART_INSTRUCTION:
		error = vm_restart_instruction(vcpu);
		break;
	case VM_SET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		error = vm_set_topology(sc->vm, topology->sockets,
		    topology->cores, topology->threads, topology->maxcpus);
		break;
	case VM_GET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
		    &topology->threads, &topology->maxcpus);
		error = 0;
		break;
#ifdef BHYVE_SNAPSHOT
	case VM_SNAPSHOT_REQ:
		snapshot_meta = (struct vm_snapshot_meta *)data;
		error = vm_snapshot_req(sc->vm, snapshot_meta);
		break;
#ifdef COMPAT_FREEBSD13
	case VM_SNAPSHOT_REQ_OLD:
		/*
		 * The old structure just has an additional pointer at
		 * the start that is ignored.
		 */
		snapshot_old = (struct vm_snapshot_meta_old *)data;
		snapshot_meta =
		    (struct vm_snapshot_meta *)&snapshot_old->dev_data;
		error = vm_snapshot_req(sc->vm, snapshot_meta);
		break;
#endif
	case VM_RESTORE_TIME:
		error = vm_restore_time(sc->vm);
		break;
#endif
	default:
		error = ENOTTY;
		break;
	}

	if (vcpus_locked == SINGLE)
		vcpu_unlock_one(sc, vcpuid, vcpu);
	else if (vcpus_locked == ALL)
		vcpu_unlock_all(sc);
	if (memsegs_locked)
		vm_unlock_memsegs(sc->vm);

done:
	/*
	 * Make sure that no handler returns a kernel-internal
	 * error value to userspace.
	 */
	KASSERT(error == ERESTART || error >= 0,
	    ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}

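/*
 * mmap(2) handler for /dev/vmm/<name>: translates the requested offset to
 * a guest physical address, finds the system memory segment backing that
 * range, and returns the backing VM object.  Devmem segments must be
 * mapped through their own cdev instead.
 */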
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
	struct vmmdev_softc *sc;
	vm_paddr_t gpa;
	size_t len;
	vm_ooffset_t segoff, first, last;
	int error, found, segid;
	bool sysmem;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	first = *offset;
	last = first + mapsize;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL) {
		/* virtual machine is in the process of being created */
		return (EINVAL);
	}

	/*
	 * Get a read lock on the guest memory map.
	 */
	vm_slock_memsegs(sc->vm);

	gpa = 0;
	found = 0;
	while (!found) {
		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
		    NULL, NULL);
		if (error)
			break;

		if (first >= gpa && last <= gpa + len)
			found = 1;
		else
			gpa += len;
	}

	if (found) {
		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
		KASSERT(error == 0 && *objp != NULL,
		    ("%s: invalid memory segment %d", __func__, segid));
		if (sysmem) {
			vm_object_reference(*objp);
			*offset = segoff + (first - gpa);
		} else {
			error = EINVAL;
		}
	}
	vm_unlock_memsegs(sc->vm);
	return (error);
}

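/*
 * Tear down a VM: vCPU creation is disabled, all existing vCPUs are
 * frozen, and the devmem softc entries (whose cdevs must already be
 * destroyed) are freed before the cdev, vmspace, and softc themselves.
 */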
static void
vmmdev_destroy(void *arg)
{
	struct vmmdev_softc *sc = arg;
	struct devmem_softc *dsc;
	int error __diagused;

	vm_disable_vcpu_creation(sc->vm);
	error = vcpu_lock_all(sc);
	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
	vm_unlock_vcpus(sc->vm);

	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
		SLIST_REMOVE_HEAD(&sc->devmem, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if (sc->ucred != NULL)
		crfree(sc->ucred);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

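/*
 * Handler for the hw.vmm.destroy sysctl; the string written to the sysctl
 * names the VM to destroy.
 */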
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	strlcpy(buf, "beavis", buflen);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error != 0 || req->newptr == NULL)
		goto out;

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		error = EINVAL;
		goto out;
	}

	/*
	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
	 * is scheduled for destruction.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Destroy all cdevs:
	 *
	 * - any new operations on the 'cdev' will return an error (ENXIO).
	 *
	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
	 */
	SLIST_FOREACH(dsc, &sc->devmem, link) {
		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
		destroy_dev(dsc->cdev);
		devmem_destroy(dsc);
	}
	destroy_dev(cdev);
	vmmdev_destroy(sc);
	error = 0;

out:
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_destroy, "A",
    NULL);

static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

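/*
 * Handler for the hw.vmm.create sysctl.  Writing a name creates the VM and
 * its /dev/vmm/<name> cdev, e.g.:
 *
 *	sysctl hw.vmm.create=testvm
 */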
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	strlcpy(buf, "beavis", buflen);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error != 0 || req->newptr == NULL)
		goto out;

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL) {
		error = EEXIST;
		goto out;
	}

	error = vm_create(buf, &vm);
	if (error != 0)
		goto out;

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->ucred = crhold(curthread->td_ucred);
	sc->vm = vm;
	SLIST_INIT(&sc->devmem);

	/*
	 * Look up the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		vmmdev_destroy(sc);
		error = EEXIST;
		goto out;
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred,
	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		goto out;
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

out:
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_create, "A",
    NULL);

void
vmmdev_init(void)
{
	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
	    "Allow use of vmm in a jail.");
}

int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}

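/*
 * mmap(2) handler for a devmem cdev (/dev/vmm.io/<vm>.<name>); the entire
 * request must fit within the backing memory segment.
 */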
static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
	struct devmem_softc *dsc;
	vm_ooffset_t first, last;
	size_t seglen;
	int error;
	bool sysmem;

	dsc = cdev->si_drv1;
	if (dsc == NULL) {
		/* 'cdev' has been created but is not ready for use */
		return (ENXIO);
	}

	first = *offset;
	last = *offset + len;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	vm_slock_memsegs(dsc->sc->vm);

	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
	KASSERT(error == 0 && !sysmem && *objp != NULL,
	    ("%s: invalid devmem segment %d", __func__, dsc->segid));

	if (seglen >= last)
		vm_object_reference(*objp);
	else
		error = EINVAL;

	vm_unlock_memsegs(dsc->sc->vm);
	return (error);
}

static struct cdevsw devmemsw = {
	.d_name		= "devmem",
	.d_version	= D_VERSION,
	.d_mmap_single	= devmem_mmap_single,
};

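/*
 * Create the /dev/vmm.io/<vmname>.<devname> cdev for a devmem segment.
 * Ownership of 'devname' passes to the devmem softc on success; it is
 * freed when the cdev is destroyed.
 */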
static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	int error;

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
	if (error)
		return (error);

	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(vmname);
	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
	if (sc->cdev == NULL) {
		/* virtual machine is being created or destroyed */
		mtx_unlock(&vmmdev_mtx);
		free(dsc, M_VMMDEV);
		destroy_dev_sched_cb(cdev, NULL, 0);
		return (ENODEV);
	}

	dsc->segid = segid;
	dsc->name = devname;
	dsc->cdev = cdev;
	dsc->sc = sc;
	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
	mtx_unlock(&vmmdev_mtx);

	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
	cdev->si_drv1 = dsc;
	return (0);
}

static void
devmem_destroy(void *arg)
{
	struct devmem_softc *dsc = arg;

	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
	dsc->cdev = NULL;
	dsc->sc = NULL;
}