/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_bhyve_snapshot.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/jail.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_snapshot.h>
#include <x86/apicreg.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

#ifdef COMPAT_FREEBSD13
struct vm_stats_old {
	int		cpuid;				/* in */
	int		num_entries;			/* out */
	struct timeval	tv;
	uint64_t	statbuf[MAX_VM_STATS];
};

#define	VM_STATS_OLD \
	_IOWR('v', IOCNUM_VM_STATS, struct vm_stats_old)
#endif

struct devmem_softc {
	int	segid;
	char	*name;
	struct cdev *cdev;
	struct vmmdev_softc *sc;
	SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	struct ucred	*ucred;
	SLIST_ENTRY(vmmdev_softc) link;
	SLIST_HEAD(, devmem_softc) devmem;
	int		flags;
};
#define	VSC_LINKED		0x01

static SLIST_HEAD(, vmmdev_softc) head;

static unsigned pr_allow_flag;
static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int vmm_priv_check(struct ucred *ucred);
static int devmem_create_cdev(const char *vmname, int segid, char *devname);
static void devmem_destroy(void *arg);

static int
vmm_priv_check(struct ucred *ucred)
{

	if (jailed(ucred) &&
	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
		return (EPERM);

	return (0);
}

static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
	int error;

	if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm))
		return (EINVAL);

	error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
	return (error);
}

static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
	enum vcpu_state state;

	state = vcpu_get_state(sc->vm, vcpu, NULL);
	if (state != VCPU_FROZEN) {
		panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
		    vcpu, state);
	}

	vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}
static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
	int error, vcpu;
	uint16_t maxcpus;

	/* Keep 'error' defined even if the loop body never runs. */
	error = 0;
	maxcpus = vm_get_maxcpus(sc->vm);
	for (vcpu = 0; vcpu < maxcpus; vcpu++) {
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			break;
	}

	if (error) {
		while (--vcpu >= 0)
			vcpu_unlock_one(sc, vcpu);
	}

	return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
	int vcpu;
	uint16_t maxcpus;

	maxcpus = vm_get_maxcpus(sc->vm);
	for (vcpu = 0; vcpu < maxcpus; vcpu++)
		vcpu_unlock_one(sc, vcpu);
}
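
/*
 * Illustrative sketch of how the ioctl paths below use these helpers
 * (commentary only, not additional driver code).  Single-vcpu operations
 * freeze one vcpu, VM-wide operations freeze them all, and the matching
 * unlock must run on the same set:
 *
 *	error = vcpu_lock_one(sc, vcpu);	(or vcpu_lock_all(sc))
 *	if (error)
 *		return (error);
 *	... operate while the vcpu(s) cannot run ...
 *	vcpu_unlock_one(sc, vcpu);		(or vcpu_unlock_all(sc))
 */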

static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	if (sc == NULL)
		return (NULL);

	if (cr_cansee(curthread->td_ucred, sc->ucred))
		return (NULL);

	return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa, maxaddr;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;
	uint16_t lastcpu;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	/*
	 * Get a read lock on the guest memory map by freezing any vcpu.
	 */
	lastcpu = vm_get_maxcpus(sc->vm) - 1;
	error = vcpu_lock_one(sc, lastcpu);
	if (error)
		return (error);

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	maxaddr = vmm_sysmem_maxaddr(sc->vm);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold(sc->vm, lastcpu, gpa, c,
		    prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
				error = uiomove(__DECONST(void *, zero_region),
				    c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	vcpu_unlock_one(sc, lastcpu);
	return (error);
}
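
/*
 * Usage sketch for the read/write entry point above (hypothetical userspace
 * consumer, not part of this module; the VM name "myvm" is illustrative).
 * The file offset is interpreted as a guest physical address:
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	char buf[4096];
 *	int fd = open("/dev/vmm/myvm", O_RDONLY);
 *	if (fd >= 0)
 *		pread(fd, buf, sizeof(buf), 0x100000);	(4KB at GPA 1MB)
 */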

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);

static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	struct devmem_softc *dsc;
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		SLIST_FOREACH(dsc, &sc->devmem, link) {
			if (dsc->segid == mseg->segid)
				break;
		}
		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
		    __func__, mseg->segid));
		error = copystr(dsc->name, mseg->name, len, NULL);
	} else {
		bzero(mseg->name, len);
	}

	return (error);
}

static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	char *name;
	int error;
	bool sysmem;

	error = 0;
	name = NULL;
	sysmem = true;

	/*
	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
	 * be stripped off when devfs processes the full string.
	 */
	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
		name = malloc(len, M_VMMDEV, M_WAITOK);
		error = copystr(mseg->name, name, len, NULL);
		if (error)
			goto done;
	}

	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
	if (error)
		goto done;

	if (VM_MEMSEG_NAME(mseg)) {
		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
		if (error)
			vm_free_memseg(sc->vm, mseg->segid);
		else
			name = NULL;	/* freed when 'cdev' is destroyed */
	}
done:
	free(name, M_VMMDEV);
	return (error);
}
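
/*
 * Sketch of the corresponding userspace call (assumed usage, shown for
 * orientation only): a system memory segment is allocated by leaving
 * 'name' empty, a devmem segment by naming it:
 *
 *	struct vm_memseg memseg = {
 *		.segid = 0,
 *		.len = 128 * 1024 * 1024,	(128MB, illustrative)
 *	};
 *	ioctl(fd, VM_ALLOC_MEMSEG, &memseg);
 */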

static int
vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_get_register(vm, vcpu, regnum[i], &regval[i]);
		if (error)
			break;
	}
	return (error);
}

static int
vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_set_register(vm, vcpu, regnum[i], regval[i]);
		if (error)
			break;
	}
	return (error);
}
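
/*
 * Sketch of a batched register read via VM_GET_REGISTER_SET (hypothetical
 * userspace usage; the register choice is illustrative):
 *
 *	int regnums[2] = { VM_REG_GUEST_RIP, VM_REG_GUEST_RSP };
 *	uint64_t regvals[2];
 *	struct vm_register_set regset = {
 *		.cpuid = 0,
 *		.count = 2,
 *		.regnums = regnums,
 *		.regvals = regvals,
 *	};
 *	ioctl(fd, VM_GET_REGISTER_SET, &regset);
 */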

static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
	     struct thread *td)
{
	int error, vcpu, state_changed, size;
	cpuset_t *cpuset;
	struct vmmdev_softc *sc;
	struct vm_register *vmreg;
	struct vm_seg_desc *vmsegdesc;
	struct vm_register_set *vmregset;
	struct vm_run *vmrun;
	struct vm_exception *vmexc;
	struct vm_lapic_irq *vmirq;
	struct vm_lapic_msi *vmmsi;
	struct vm_ioapic_irq *ioapic_irq;
	struct vm_isa_irq *isa_irq;
	struct vm_isa_irq_trigger *isa_irq_trigger;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_pptdev_msix *pptmsix;
	struct vm_nmi *vmnmi;
#ifdef COMPAT_FREEBSD13
	struct vm_stats_old *vmstats_old;
#endif
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_x2apic *x2apic;
	struct vm_gpa_pte *gpapte;
	struct vm_suspend *vmsuspend;
	struct vm_gla2gpa *gg;
	struct vm_activate_cpu *vac;
	struct vm_cpuset *vm_cpuset;
	struct vm_intinfo *vmii;
	struct vm_rtc_time *rtctime;
	struct vm_rtc_data *rtcdata;
	struct vm_memmap *mm;
	struct vm_munmap *mu;
	struct vm_cpu_topology *topology;
	struct vm_readwrite_kernemu_device *kernemu;
	uint64_t *regvals;
	int *regnums;
#ifdef BHYVE_SNAPSHOT
	struct vm_snapshot_meta *snapshot_meta;
#endif

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	vcpu = -1;
	state_changed = 0;

	/*
	 * Some VMM ioctls can operate only on vcpus that are not running.
	 */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_GET_REGISTER_SET:
	case VM_SET_REGISTER_SET:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
	case VM_GLA2GPA:
	case VM_GLA2GPA_NOFAULT:
	case VM_ACTIVATE_CPU:
	case VM_SET_INTINFO:
	case VM_GET_INTINFO:
	case VM_RESTART_INSTRUCTION:
		/*
		 * XXX fragile, handle with care
		 * Assumes that the first field of the ioctl data is the vcpu.
		 */
		vcpu = *(int *)data;
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			goto done;
		state_changed = 1;
		break;

	case VM_MAP_PPTDEV_MMIO:
	case VM_UNMAP_PPTDEV_MMIO:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
#ifdef COMPAT_FREEBSD12
	case VM_ALLOC_MEMSEG_FBSD12:
#endif
	case VM_ALLOC_MEMSEG:
	case VM_MMAP_MEMSEG:
	case VM_MUNMAP_MEMSEG:
	case VM_REINIT:
		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = vcpu_lock_all(sc);
		if (error)
			goto done;
		state_changed = 2;
		break;

#ifdef COMPAT_FREEBSD12
	case VM_GET_MEMSEG_FBSD12:
#endif
	case VM_GET_MEMSEG:
	case VM_MMAP_GETNEXT:
		/*
		 * Lock a vcpu to make sure that the memory map cannot be
		 * modified while it is being inspected.
		 */
		vcpu = vm_get_maxcpus(sc->vm) - 1;
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			goto done;
		state_changed = 1;
		break;

	default:
		break;
	}

	switch (cmd) {
	case VM_RUN:
		vmrun = (struct vm_run *)data;
		error = vm_run(sc->vm, vmrun);
		break;
	case VM_SUSPEND:
		vmsuspend = (struct vm_suspend *)data;
		error = vm_suspend(sc->vm, vmsuspend->how);
		break;
	case VM_REINIT:
		error = vm_reinit(sc->vm);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
					statdesc->desc, sizeof(statdesc->desc));
		break;
	}
#ifdef COMPAT_FREEBSD13
	case VM_STATS_OLD:
		vmstats_old = (struct vm_stats_old *)data;
		getmicrotime(&vmstats_old->tv);
		error = vmm_stat_copy(sc->vm, vmstats_old->cpuid, 0,
				      nitems(vmstats_old->statbuf),
				      &vmstats_old->num_entries,
				      vmstats_old->statbuf);
		break;
#endif
	case VM_STATS: {
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(sc->vm, vmstats->cpuid, vmstats->index,
				      nitems(vmstats->statbuf),
				      &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
				      pptmsi->bus, pptmsi->slot, pptmsi->func,
				      pptmsi->addr, pptmsi->msg,
				      pptmsi->numvec);
		break;
	case VM_PPTDEV_MSIX:
		pptmsix = (struct vm_pptdev_msix *)data;
		error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
				       pptmsix->bus, pptmsix->slot,
				       pptmsix->func, pptmsix->idx,
				       pptmsix->addr, pptmsix->msg,
				       pptmsix->vector_control);
		break;
	case VM_PPTDEV_DISABLE_MSIX:
		pptdev = (struct vm_pptdev *)data;
		error = ppt_disable_msix(sc->vm, pptdev->bus, pptdev->slot,
					 pptdev->func);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
				     pptmmio->func, pptmmio->gpa, pptmmio->len,
				     pptmmio->hpa);
		break;
	case VM_UNMAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_unmap_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
				       pptmmio->func, pptmmio->gpa, pptmmio->len);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					 pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					   pptdev->func);
		break;
	case VM_INJECT_EXCEPTION:
		vmexc = (struct vm_exception *)data;
		error = vm_inject_exception(sc->vm, vmexc->cpuid,
		    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
		    vmexc->restart_instruction);
		break;
	case VM_INJECT_NMI:
		vmnmi = (struct vm_nmi *)data;
		error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
		break;
	case VM_LAPIC_LOCAL_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
		    vmirq->vector);
		break;
	case VM_LAPIC_MSI:
		vmmsi = (struct vm_lapic_msi *)data;
		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
		break;
	case VM_IOAPIC_ASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_DEASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PULSE_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PINCOUNT:
		*(int *)data = vioapic_pincount(sc->vm);
		break;
	case VM_SET_KERNEMU_DEV:
	case VM_GET_KERNEMU_DEV: {
		mem_region_write_t mwrite;
		mem_region_read_t mread;
		bool arg;

		kernemu = (void *)data;

		if (kernemu->access_width > 0)
			size = (1u << kernemu->access_width);
		else
			size = 1;

		if (kernemu->gpa >= DEFAULT_APIC_BASE &&
		    kernemu->gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
			mread = lapic_mmio_read;
			mwrite = lapic_mmio_write;
		} else if (kernemu->gpa >= VIOAPIC_BASE &&
		    kernemu->gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
			mread = vioapic_mmio_read;
			mwrite = vioapic_mmio_write;
		} else if (kernemu->gpa >= VHPET_BASE &&
		    kernemu->gpa < VHPET_BASE + VHPET_SIZE) {
			mread = vhpet_mmio_read;
			mwrite = vhpet_mmio_write;
		} else {
			error = EINVAL;
			break;
		}

		if (cmd == VM_SET_KERNEMU_DEV)
			error = mwrite(sc->vm, kernemu->vcpuid, kernemu->gpa,
			    kernemu->value, size, &arg);
		else
			error = mread(sc->vm, kernemu->vcpuid, kernemu->gpa,
			    &kernemu->value, size, &arg);
		break;
		}
	case VM_ISA_ASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_assert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_DEASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_deassert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_PULSE_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
		break;
	case VM_ISA_SET_IRQ_TRIGGER:
		isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
		error = vatpic_set_irq_trigger(sc->vm,
		    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
		break;
	case VM_MMAP_GETNEXT:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
		break;
	case VM_MMAP_MEMSEG:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
		    mm->len, mm->prot, mm->flags);
		break;
	case VM_MUNMAP_MEMSEG:
		mu = (struct vm_munmap *)data;
		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
		break;
#ifdef COMPAT_FREEBSD12
	case VM_ALLOC_MEMSEG_FBSD12:
		error = alloc_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
		break;
#endif
	case VM_ALLOC_MEMSEG:
		error = alloc_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
#ifdef COMPAT_FREEBSD12
	case VM_GET_MEMSEG_FBSD12:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
		break;
#endif
	case VM_GET_MEMSEG:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					&vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = vm_get_register_set(sc->vm, vmregset->cpuid,
			    vmregset->count, regnums, regvals);
		if (error == 0)
			error = copyout(regvals, vmregset->regvals,
			    sizeof(regvals[0]) * vmregset->count);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_SET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = copyin(vmregset->regvals, regvals,
			    sizeof(regvals[0]) * vmregset->count);
		if (error == 0)
			error = vm_set_register_set(sc->vm, vmregset->cpuid,
			    vmregset->count, regnums, regvals);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  vmcap->capval);
		break;
	case VM_SET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_set_x2apic_state(sc->vm,
					    x2apic->cpuid, x2apic->state);
		break;
	case VM_GET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_get_x2apic_state(sc->vm,
					    x2apic->cpuid, &x2apic->state);
		break;
	case VM_GET_GPA_PMAP:
		gpapte = (struct vm_gpa_pte *)data;
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
				 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
		error = 0;
		break;
	case VM_GET_HPET_CAPABILITIES:
		error = vhpet_getcap((struct vm_hpet_cap *)data);
		break;
	case VM_GLA2GPA: {
		CTASSERT(PROT_READ == VM_PROT_READ);
		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
		    gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	}
	case VM_GLA2GPA_NOFAULT:
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa_nofault(sc->vm, gg->vcpuid, &gg->paging,
		    gg->gla, gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	case VM_ACTIVATE_CPU:
		vac = (struct vm_activate_cpu *)data;
		error = vm_activate_cpu(sc->vm, vac->vcpuid);
		break;
	case VM_GET_CPUS:
		error = 0;
		vm_cpuset = (struct vm_cpuset *)data;
		size = vm_cpuset->cpusetsize;
		if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
			error = ERANGE;
			break;
		}
		cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
		if (vm_cpuset->which == VM_ACTIVE_CPUS)
			*cpuset = vm_active_cpus(sc->vm);
		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
			*cpuset = vm_suspended_cpus(sc->vm);
		else if (vm_cpuset->which == VM_DEBUG_CPUS)
			*cpuset = vm_debug_cpus(sc->vm);
		else
			error = EINVAL;
		if (error == 0)
			error = copyout(cpuset, vm_cpuset->cpus, size);
		free(cpuset, M_TEMP);
		break;
	case VM_SUSPEND_CPU:
		vac = (struct vm_activate_cpu *)data;
		error = vm_suspend_cpu(sc->vm, vac->vcpuid);
		break;
	case VM_RESUME_CPU:
		vac = (struct vm_activate_cpu *)data;
		error = vm_resume_cpu(sc->vm, vac->vcpuid);
		break;
	case VM_SET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
		break;
	case VM_GET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
		    &vmii->info2);
		break;
	case VM_RTC_WRITE:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_write(sc->vm, rtcdata->offset,
		    rtcdata->value);
		break;
	case VM_RTC_READ:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_read(sc->vm, rtcdata->offset,
		    &rtcdata->value);
		break;
	case VM_RTC_SETTIME:
		rtctime = (struct vm_rtc_time *)data;
		error = vrtc_set_time(sc->vm, rtctime->secs);
		break;
	case VM_RTC_GETTIME:
		error = 0;
		rtctime = (struct vm_rtc_time *)data;
		rtctime->secs = vrtc_get_time(sc->vm);
		break;
	case VM_RESTART_INSTRUCTION:
		error = vm_restart_instruction(sc->vm, vcpu);
		break;
	case VM_SET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		error = vm_set_topology(sc->vm, topology->sockets,
		    topology->cores, topology->threads, topology->maxcpus);
		break;
	case VM_GET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
		    &topology->threads, &topology->maxcpus);
		error = 0;
		break;
#ifdef BHYVE_SNAPSHOT
	case VM_SNAPSHOT_REQ:
		snapshot_meta = (struct vm_snapshot_meta *)data;
		error = vm_snapshot_req(sc->vm, snapshot_meta);
		break;
	case VM_RESTORE_TIME:
		error = vm_restore_time(sc->vm);
		break;
#endif
	default:
		error = ENOTTY;
		break;
	}

	if (state_changed == 1)
		vcpu_unlock_one(sc, vcpu);
	else if (state_changed == 2)
		vcpu_unlock_all(sc);

done:
	/*
	 * Make sure that no handler returns a kernel-internal
	 * error value to userspace.
	 */
	KASSERT(error == ERESTART || error >= 0,
	    ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}
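
/*
 * Sketch of driving a vcpu from userspace (assumed consumer code, not part
 * of this module).  Note that the handler above relies on the vcpu id being
 * the first field of the ioctl argument:
 *
 *	struct vm_run vmrun = { .cpuid = 0 };
 *	while (ioctl(fd, VM_RUN, &vmrun) == 0) {
 *		... inspect the exit information and emulate the exit ...
 *	}
 */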

static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
	struct vmmdev_softc *sc;
	vm_paddr_t gpa;
	size_t len;
	vm_ooffset_t segoff, first, last;
	int error, found, segid;
	uint16_t lastcpu;
	bool sysmem;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	first = *offset;
	last = first + mapsize;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL) {
		/* virtual machine is in the process of being created */
		return (EINVAL);
	}

	/*
	 * Get a read lock on the guest memory map by freezing any vcpu.
	 */
	lastcpu = vm_get_maxcpus(sc->vm) - 1;
	error = vcpu_lock_one(sc, lastcpu);
	if (error)
		return (error);

	gpa = 0;
	found = 0;
	while (!found) {
		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
		    NULL, NULL);
		if (error)
			break;

		if (first >= gpa && last <= gpa + len)
			found = 1;
		else
			gpa += len;
	}

	if (found) {
		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
		KASSERT(error == 0 && *objp != NULL,
		    ("%s: invalid memory segment %d", __func__, segid));
		if (sysmem) {
			vm_object_reference(*objp);
			*offset = segoff + (first - gpa);
		} else {
			error = EINVAL;
		}
	}
	vcpu_unlock_one(sc, lastcpu);
	return (error);
}
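
/*
 * Usage sketch (hypothetical): because d_mmap_single above treats the file
 * offset as a guest physical address, userspace can map guest RAM directly:
 *
 *	#include <sys/mman.h>
 *
 *	void *p = mmap(NULL, mapsize, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    fd, gpa);
 *
 * The request fails with EINVAL unless [gpa, gpa + mapsize) lies entirely
 * within a single sysmem mapping.
 */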

static void
vmmdev_destroy(void *arg)
{
	struct vmmdev_softc *sc = arg;
	struct devmem_softc *dsc;
	int error __diagused;

	error = vcpu_lock_all(sc);
	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
		SLIST_REMOVE_HEAD(&sc->devmem, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if (sc->ucred != NULL)
		crfree(sc->ucred);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	strlcpy(buf, "beavis", buflen);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error != 0 || req->newptr == NULL)
		goto out;

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		error = EINVAL;
		goto out;
	}

	/*
	 * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
	 * goes down to 0 so we should not do it again in the callback.
	 *
	 * Setting 'sc->cdev' to NULL is also used to indicate that the VM
	 * is scheduled for destruction.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Schedule all cdevs to be destroyed:
	 *
	 * - any new operations on the 'cdev' will return an error (ENXIO).
	 *
	 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
	 *   be destroyed and the callback will be invoked in a taskqueue
	 *   context.
	 *
	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
	 */
	SLIST_FOREACH(dsc, &sc->devmem, link) {
		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
		destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
	}
	destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
	error = 0;

out:
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_destroy, "A",
    NULL);
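
/*
 * A VM is torn down by writing its name to the sysctl above; a minimal
 * sketch of the userspace side, assuming the standard sysctl(3) interface
 * (the VM name is illustrative):
 *
 *	#include <sys/sysctl.h>
 *	#include <string.h>
 *
 *	const char *name = "myvm";
 *	sysctlbyname("hw.vmm.destroy", NULL, NULL, name, strlen(name));
 */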

static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	strlcpy(buf, "beavis", buflen);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error != 0 || req->newptr == NULL)
		goto out;

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL) {
		error = EEXIST;
		goto out;
	}

	error = vm_create(buf, &vm);
	if (error != 0)
		goto out;

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->ucred = crhold(curthread->td_ucred);
	sc->vm = vm;
	SLIST_INIT(&sc->devmem);

	/*
	 * Look up the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		vmmdev_destroy(sc);
		error = EEXIST;
		goto out;
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred,
	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		goto out;
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

out:
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_create, "A",
    NULL);
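
/*
 * VM creation is likewise driven by name through the sysctl above; a
 * sketch of the userspace side, assuming the standard sysctl(3) interface
 * (the VM name is illustrative):
 *
 *	const char *name = "myvm";
 *	if (sysctlbyname("hw.vmm.create", NULL, NULL, name,
 *	    strlen(name)) == 0) {
 *		int fd = open("/dev/vmm/myvm", O_RDWR);
 *		...
 *	}
 */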

void
vmmdev_init(void)
{
	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
	    "Allow use of vmm in a jail.");
}
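
/*
 * The prison_add_allow() call above registers an "allow.vmm" jail
 * permission, which is what vmm_priv_check() tests for jailed credentials.
 * A sketch of a jail.conf(5) entry that grants it (names illustrative):
 *
 *	bhyvejail {
 *		allow.vmm;
 *		...
 *	}
 */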

int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}

static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
	struct devmem_softc *dsc;
	vm_ooffset_t first, last;
	size_t seglen;
	int error;
	uint16_t lastcpu;
	bool sysmem;

	dsc = cdev->si_drv1;
	if (dsc == NULL) {
		/* 'cdev' has been created but is not ready for use */
		return (ENXIO);
	}

	first = *offset;
	last = *offset + len;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1;
	error = vcpu_lock_one(dsc->sc, lastcpu);
	if (error)
		return (error);

	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
	KASSERT(error == 0 && !sysmem && *objp != NULL,
	    ("%s: invalid devmem segment %d", __func__, dsc->segid));

	vcpu_unlock_one(dsc->sc, lastcpu);

	if (seglen >= last) {
		vm_object_reference(*objp);
		return (0);
	} else {
		return (EINVAL);
	}
}

static struct cdevsw devmemsw = {
	.d_name		= "devmem",
	.d_version	= D_VERSION,
	.d_mmap_single	= devmem_mmap_single,
};

static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	int error;

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
	if (error)
		return (error);

	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(vmname);
	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
	if (sc->cdev == NULL) {
		/* virtual machine is being created or destroyed */
		mtx_unlock(&vmmdev_mtx);
		free(dsc, M_VMMDEV);
		destroy_dev_sched_cb(cdev, NULL, NULL);
		return (ENODEV);
	}

	dsc->segid = segid;
	dsc->name = devname;
	dsc->cdev = cdev;
	dsc->sc = sc;
	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
	mtx_unlock(&vmmdev_mtx);

	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
	cdev->si_drv1 = dsc;
	return (0);
}
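
/*
 * Usage sketch (hypothetical): a devmem segment named "bootrom" belonging
 * to VM "myvm" appears as /dev/vmm.io/myvm.bootrom and is mapped much like
 * the main VM device, except that offsets are relative to the segment:
 *
 *	int fd = open("/dev/vmm.io/myvm.bootrom", O_RDWR);
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */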

static void
devmem_destroy(void *arg)
{
	struct devmem_softc *dsc = arg;

	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
	dsc->cdev = NULL;
	dsc->sc = NULL;
}
1292