xref: /freebsd/sys/arm64/vmm/vmm_dev.c (revision 5f757f3f)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/param.h>
31 #include <sys/kernel.h>
32 #include <sys/jail.h>
33 #include <sys/queue.h>
34 #include <sys/lock.h>
35 #include <sys/mutex.h>
36 #include <sys/malloc.h>
37 #include <sys/conf.h>
38 #include <sys/sysctl.h>
39 #include <sys/libkern.h>
40 #include <sys/ioccom.h>
41 #include <sys/mman.h>
42 #include <sys/uio.h>
43 #include <sys/proc.h>
44 
45 #include <vm/vm.h>
46 #include <vm/pmap.h>
47 #include <vm/vm_map.h>
48 #include <vm/vm_object.h>
49 
50 #include <machine/machdep.h>
51 #include <machine/vmparam.h>
52 #include <machine/vmm.h>
53 #include <machine/vmm_dev.h>
54 
55 #include "vmm_stat.h"
56 
57 #include "io/vgic.h"
58 
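/*
 * Per-segment state for a device memory ("devmem") segment.  Each named,
 * non-system memory segment is exposed as its own character device under
 * /dev/vmm.io/<vmname>.<segname> (see devmem_create_cdev()) so that
 * userspace can mmap(2) it separately from guest system memory.
 */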
59 struct devmem_softc {
60 	int	segid;
61 	char	*name;
62 	struct cdev *cdev;
63 	struct vmmdev_softc *sc;
64 	SLIST_ENTRY(devmem_softc) link;
65 };
66 
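/*
 * Per-VM state, reachable from the /dev/vmm/<vmname> cdev via si_drv1.
 * VSC_LINKED records that the softc is on the global 'head' list so that
 * vmmdev_destroy() knows whether it has to unlink it again.
 */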
67 struct vmmdev_softc {
68 	struct vm	*vm;		/* vm instance cookie */
69 	struct cdev	*cdev;
70 	struct ucred	*ucred;
71 	SLIST_ENTRY(vmmdev_softc) link;
72 	SLIST_HEAD(, devmem_softc) devmem;
73 	int		flags;
74 };
75 #define	VSC_LINKED		0x01
76 
77 static SLIST_HEAD(, vmmdev_softc) head;
78 
79 static unsigned pr_allow_flag;
80 static struct mtx vmmdev_mtx;
81 MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF);
82 
83 static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
84 
85 SYSCTL_DECL(_hw_vmm);
86 
87 static int vmm_priv_check(struct ucred *ucred);
88 static int devmem_create_cdev(const char *vmname, int id, char *devmem);
89 static void devmem_destroy(void *arg);
90 
91 static int
92 vmm_priv_check(struct ucred *ucred)
93 {
94 
95 	if (jailed(ucred) &&
96 	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
97 		return (EPERM);
98 
99 	return (0);
100 }
101 
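/*
 * "Locking" a vcpu freezes it (VCPU_FROZEN) so that it cannot run while an
 * ioctl examines or modifies its state; unlocking returns it to VCPU_IDLE.
 */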
102 static int
103 vcpu_lock_one(struct vcpu *vcpu)
104 {
105 	int error;
106 
107 	error = vcpu_set_state(vcpu, VCPU_FROZEN, true);
108 	return (error);
109 }
110 
111 static void
112 vcpu_unlock_one(struct vcpu *vcpu)
113 {
114 	enum vcpu_state state;
115 
116 	state = vcpu_get_state(vcpu, NULL);
117 	if (state != VCPU_FROZEN) {
118 		panic("vcpu %s(%d) has invalid state %d",
119 		    vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state);
120 	}
121 
122 	vcpu_set_state(vcpu, VCPU_IDLE, false);
123 }
124 
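/*
 * Freeze every vcpu in the VM.  On failure the vcpus frozen so far are
 * thawed again and the shared vcpu lock is dropped, so the caller ends up
 * holding either all of the vcpus or none of them.
 */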
125 static int
126 vcpu_lock_all(struct vmmdev_softc *sc)
127 {
128 	struct vcpu *vcpu;
129 	int error;
130 	uint16_t i, j, maxcpus;
131 
132 	error = 0;
133 	vm_slock_vcpus(sc->vm);
134 	maxcpus = vm_get_maxcpus(sc->vm);
135 	for (i = 0; i < maxcpus; i++) {
136 		vcpu = vm_vcpu(sc->vm, i);
137 		if (vcpu == NULL)
138 			continue;
139 		error = vcpu_lock_one(vcpu);
140 		if (error)
141 			break;
142 	}
143 
144 	if (error) {
145 		for (j = 0; j < i; j++) {
146 			vcpu = vm_vcpu(sc->vm, j);
147 			if (vcpu == NULL)
148 				continue;
149 			vcpu_unlock_one(vcpu);
150 		}
151 		vm_unlock_vcpus(sc->vm);
152 	}
153 
154 	return (error);
155 }
156 
157 static void
158 vcpu_unlock_all(struct vmmdev_softc *sc)
159 {
160 	struct vcpu *vcpu;
161 	uint16_t i, maxcpus;
162 
163 	maxcpus = vm_get_maxcpus(sc->vm);
164 	for (i = 0; i < maxcpus; i++) {
165 		vcpu = vm_vcpu(sc->vm, i);
166 		if (vcpu == NULL)
167 			continue;
168 		vcpu_unlock_one(vcpu);
169 	}
170 	vm_unlock_vcpus(sc->vm);
171 }
172 
173 static struct vmmdev_softc *
174 vmmdev_lookup(const char *name)
175 {
176 	struct vmmdev_softc *sc;
177 
178 #ifdef notyet	/* XXX kernel is not compiled with invariants */
179 	mtx_assert(&vmmdev_mtx, MA_OWNED);
180 #endif
181 
182 	SLIST_FOREACH(sc, &head, link) {
183 		if (strcmp(name, vm_name(sc->vm)) == 0)
184 			break;
185 	}
186 
187 	if (sc == NULL)
188 		return (NULL);
189 
190 	if (cr_cansee(curthread->td_ucred, sc->ucred))
191 		return (NULL);
192 
193 	return (sc);
194 }
195 
196 static struct vmmdev_softc *
197 vmmdev_lookup2(struct cdev *cdev)
198 {
199 
200 	return (cdev->si_drv1);
201 }
202 
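/*
 * read(2) and write(2) on /dev/vmm/<vmname> access guest physical memory;
 * the file offset is interpreted as a guest physical address.  One
 * illustrative use is inspecting guest memory with dd(1), e.g.:
 *
 *	dd if=/dev/vmm/<vmname> bs=4k skip=<gpa in 4k blocks> count=1 | hexdump -C
 *
 * Reads from holes below the top of guest system memory return zeroes;
 * accesses beyond that fail with EFAULT.
 */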
203 static int
204 vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
205 {
206 	int error, off, c, prot;
207 	vm_paddr_t gpa, maxaddr;
208 	void *hpa, *cookie;
209 	struct vmmdev_softc *sc;
210 
211 	error = vmm_priv_check(curthread->td_ucred);
212 	if (error)
213 		return (error);
214 
215 	sc = vmmdev_lookup2(cdev);
216 	if (sc == NULL)
217 		return (ENXIO);
218 
219 	/*
220 	 * Get a read lock on the guest memory map.
221 	 */
222 	vm_slock_memsegs(sc->vm);
223 
224 	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
225 	maxaddr = vmm_sysmem_maxaddr(sc->vm);
226 	while (uio->uio_resid > 0 && error == 0) {
227 		gpa = uio->uio_offset;
228 		off = gpa & PAGE_MASK;
229 		c = min(uio->uio_resid, PAGE_SIZE - off);
230 
231 		/*
232 		 * The VM has a hole in its physical memory map. If we want to
233 		 * use 'dd' to inspect memory beyond the hole we need to
234 		 * provide bogus data for memory that lies in the hole.
235 		 *
236 		 * Since this device does not support lseek(2), dd(1) will
237 		 * read(2) blocks of data to simulate the lseek(2).
238 		 */
239 		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
240 		if (hpa == NULL) {
241 			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
242 				error = uiomove(__DECONST(void *, zero_region),
243 				    c, uio);
244 			else
245 				error = EFAULT;
246 		} else {
247 			error = uiomove(hpa, c, uio);
248 			vm_gpa_release(cookie);
249 		}
250 	}
251 	vm_unlock_memsegs(sc->vm);
252 	return (error);
253 }
254 
255 static int
256 get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
257 {
258 	struct devmem_softc *dsc;
259 	int error;
260 	bool sysmem;
261 
262 	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
263 	if (error || mseg->len == 0)
264 		return (error);
265 
266 	if (!sysmem) {
267 		SLIST_FOREACH(dsc, &sc->devmem, link) {
268 			if (dsc->segid == mseg->segid)
269 				break;
270 		}
271 		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
272 		    __func__, mseg->segid));
273 		error = copystr(dsc->name, mseg->name, sizeof(mseg->name),
274 		    NULL);
275 	} else {
276 		bzero(mseg->name, sizeof(mseg->name));
277 	}
278 
279 	return (error);
280 }
281 
282 static int
283 alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
284 {
285 	char *name;
286 	int error;
287 	bool sysmem;
288 
289 	error = 0;
290 	name = NULL;
291 	sysmem = true;
292 
293 	/*
294 	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
295 	 * be stripped off when devfs processes the full string.

296 	 */
297 	if (VM_MEMSEG_NAME(mseg)) {
298 		sysmem = false;
299 		name = malloc(sizeof(mseg->name), M_VMMDEV, M_WAITOK);
300 		error = copystr(mseg->name, name, sizeof(mseg->name), NULL);
301 		if (error)
302 			goto done;
303 	}
304 
305 	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
306 	if (error)
307 		goto done;
308 
309 	if (VM_MEMSEG_NAME(mseg)) {
310 		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
311 		if (error)
312 			vm_free_memseg(sc->vm, mseg->segid);
313 		else
314 			name = NULL;	/* freed when 'cdev' is destroyed */
315 	}
316 done:
317 	free(name, M_VMMDEV);
318 	return (error);
319 }
320 
321 static int
322 vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
323     uint64_t *regval)
324 {
325 	int error, i;
326 
327 	error = 0;
328 	for (i = 0; i < count; i++) {
329 		error = vm_get_register(vcpu, regnum[i], &regval[i]);
330 		if (error)
331 			break;
332 	}
333 	return (error);
334 }
335 
336 static int
337 vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
338     uint64_t *regval)
339 {
340 	int error, i;
341 
342 	error = 0;
343 	for (i = 0; i < count; i++) {
344 		error = vm_set_register(vcpu, regnum[i], regval[i]);
345 		if (error)
346 			break;
347 	}
348 	return (error);
349 }
350 
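/*
 * Main ioctl handler.  The first switch classifies the command and takes
 * whatever locks it needs before any work is done:
 *
 * - per-vcpu commands freeze the target vcpu;
 * - commands that change the memory map or reinitialize the VM take the
 *   memory segment lock exclusively and freeze every vcpu;
 * - commands that only inspect the memory map take the segment lock shared;
 * - the rest run without vcpu or memory segment locks.
 *
 * All locks are dropped at the 'done' label on the way out.
 */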
351 static int
352 vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
353 	     struct thread *td)
354 {
355 	int error, vcpuid, size;
356 	cpuset_t *cpuset;
357 	struct vmmdev_softc *sc;
358 	struct vcpu *vcpu;
359 	struct vm_register *vmreg;
360 	struct vm_register_set *vmregset;
361 	struct vm_run *vmrun;
362 	struct vm_vgic_version *vgv;
363 	struct vm_vgic_descr *vgic;
364 	struct vm_cpuset *vm_cpuset;
365 	struct vm_irq *vi;
366 	struct vm_capability *vmcap;
367 	struct vm_stats *vmstats;
368 	struct vm_stat_desc *statdesc;
369 	struct vm_suspend *vmsuspend;
370 	struct vm_exception *vmexc;
371 	struct vm_gla2gpa *gg;
372 	struct vm_memmap *mm;
373 	struct vm_munmap *mu;
374 	struct vm_msi *vmsi;
375 	struct vm_cpu_topology *topology;
376 	uint64_t *regvals;
377 	int *regnums;
378 	enum { NONE, SINGLE, ALL } vcpus_locked;
379 	bool memsegs_locked;
380 
381 	error = vmm_priv_check(curthread->td_ucred);
382 	if (error)
383 		return (error);
384 
385 	sc = vmmdev_lookup2(cdev);
386 	if (sc == NULL)
387 		return (ENXIO);
388 
389 	error = 0;
390 	vcpuid = -1;
391 	vcpu = NULL;
392 	vcpus_locked = NONE;
393 	memsegs_locked = false;
394 
395 	/*
396 	 * Some VMM ioctls can operate only on vcpus that are not running.
397 	 */
398 	switch (cmd) {
399 	case VM_RUN:
400 	case VM_GET_REGISTER:
401 	case VM_SET_REGISTER:
402 	case VM_GET_REGISTER_SET:
403 	case VM_SET_REGISTER_SET:
404 	case VM_INJECT_EXCEPTION:
405 	case VM_GET_CAPABILITY:
406 	case VM_SET_CAPABILITY:
407 	case VM_GLA2GPA_NOFAULT:
408 	case VM_ACTIVATE_CPU:
409 		/*
410 		 * ioctls that can operate only on vcpus that are not running.
411 		 */
412 		vcpuid = *(int *)data;
413 		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
414 		if (vcpu == NULL) {
415 			error = EINVAL;
416 			goto done;
417 		}
418 		error = vcpu_lock_one(vcpu);
419 		if (error)
420 			goto done;
421 		vcpus_locked = SINGLE;
422 		break;
423 
424 	case VM_ALLOC_MEMSEG:
425 	case VM_MMAP_MEMSEG:
426 	case VM_MUNMAP_MEMSEG:
427 	case VM_REINIT:
428 	case VM_ATTACH_VGIC:
429 		/*
430 		 * ioctls that modify the memory map must lock memory
431 		 * segments exclusively.
432 		 */
433 		vm_xlock_memsegs(sc->vm);
434 		memsegs_locked = true;
435 
436 		/*
437 		 * ioctls that operate on the entire virtual machine must
438 		 * prevent all vcpus from running.
439 		 */
440 		error = vcpu_lock_all(sc);
441 		if (error)
442 			goto done;
443 		vcpus_locked = ALL;
444 		break;
445 	case VM_GET_MEMSEG:
446 	case VM_MMAP_GETNEXT:
447 		/*
448 		 * Lock the memory map while it is being inspected.
449 		 */
450 		vm_slock_memsegs(sc->vm);
451 		memsegs_locked = true;
452 		break;
453 
454 	case VM_STATS:
455 		/*
456 		 * These do not need the vCPU locked but do operate on
457 		 * a specific vCPU.
458 		 */
459 		vcpuid = *(int *)data;
460 		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
461 		if (vcpu == NULL) {
462 			error = EINVAL;
463 			goto done;
464 		}
465 		break;
466 
467 	case VM_SUSPEND_CPU:
468 	case VM_RESUME_CPU:
469 		/*
470 		 * These can either operate on all CPUs via a vcpuid of
471 		 * -1 or on a specific vCPU.
472 		 */
473 		vcpuid = *(int *)data;
474 		if (vcpuid == -1)
475 			break;
476 		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
477 		if (vcpu == NULL) {
478 			error = EINVAL;
479 			goto done;
480 		}
481 		break;
482 
483 	case VM_ASSERT_IRQ:
484 		vi = (struct vm_irq *)data;
485 		error = vm_assert_irq(sc->vm, vi->irq);
486 		break;
487 	case VM_DEASSERT_IRQ:
488 		vi = (struct vm_irq *)data;
489 		error = vm_deassert_irq(sc->vm, vi->irq);
490 		break;
491 	default:
492 		break;
493 	}
494 
495 	switch (cmd) {
496 	case VM_RUN: {
497 		struct vm_exit *vme;
498 
499 		vmrun = (struct vm_run *)data;
500 		vme = vm_exitinfo(vcpu);
501 
502 		error = vm_run(vcpu);
503 		if (error != 0)
504 			break;
505 
506 		error = copyout(vme, vmrun->vm_exit, sizeof(*vme));
507 		if (error != 0)
508 			break;
509 		break;
510 	}
511 	case VM_SUSPEND:
512 		vmsuspend = (struct vm_suspend *)data;
513 		error = vm_suspend(sc->vm, vmsuspend->how);
514 		break;
515 	case VM_REINIT:
516 		error = vm_reinit(sc->vm);
517 		break;
518 	case VM_STAT_DESC: {
519 		statdesc = (struct vm_stat_desc *)data;
520 		error = vmm_stat_desc_copy(statdesc->index,
521 					statdesc->desc, sizeof(statdesc->desc));
522 		break;
523 	}
524 	case VM_STATS: {
525 		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
526 		vmstats = (struct vm_stats *)data;
527 		getmicrotime(&vmstats->tv);
528 		error = vmm_stat_copy(vcpu, vmstats->index,
529 				      nitems(vmstats->statbuf),
530 				      &vmstats->num_entries, vmstats->statbuf);
531 		break;
532 	}
533 	case VM_MMAP_GETNEXT:
534 		mm = (struct vm_memmap *)data;
535 		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
536 		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
537 		break;
538 	case VM_MMAP_MEMSEG:
539 		mm = (struct vm_memmap *)data;
540 		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
541 		    mm->len, mm->prot, mm->flags);
542 		break;
543 	case VM_MUNMAP_MEMSEG:
544 		mu = (struct vm_munmap *)data;
545 		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
546 		break;
547 	case VM_ALLOC_MEMSEG:
548 		error = alloc_memseg(sc, (struct vm_memseg *)data);
549 		break;
550 	case VM_GET_MEMSEG:
551 		error = get_memseg(sc, (struct vm_memseg *)data);
552 		break;
553 	case VM_GET_REGISTER:
554 		vmreg = (struct vm_register *)data;
555 		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
556 		break;
557 	case VM_SET_REGISTER:
558 		vmreg = (struct vm_register *)data;
559 		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
560 		break;
561 	case VM_GET_REGISTER_SET:
562 		vmregset = (struct vm_register_set *)data;
563 		if (vmregset->count > VM_REG_LAST) {
564 			error = EINVAL;
565 			break;
566 		}
567 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
568 		    M_WAITOK);
569 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
570 		    M_WAITOK);
571 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
572 		    vmregset->count);
573 		if (error == 0)
574 			error = vm_get_register_set(vcpu, vmregset->count,
575 			    regnums, regvals);
576 		if (error == 0)
577 			error = copyout(regvals, vmregset->regvals,
578 			    sizeof(regvals[0]) * vmregset->count);
579 		free(regvals, M_VMMDEV);
580 		free(regnums, M_VMMDEV);
581 		break;
582 	case VM_SET_REGISTER_SET:
583 		vmregset = (struct vm_register_set *)data;
584 		if (vmregset->count > VM_REG_LAST) {
585 			error = EINVAL;
586 			break;
587 		}
588 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
589 		    M_WAITOK);
590 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
591 		    M_WAITOK);
592 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
593 		    vmregset->count);
594 		if (error == 0)
595 			error = copyin(vmregset->regvals, regvals,
596 			    sizeof(regvals[0]) * vmregset->count);
597 		if (error == 0)
598 			error = vm_set_register_set(vcpu, vmregset->count,
599 			    regnums, regvals);
600 		free(regvals, M_VMMDEV);
601 		free(regnums, M_VMMDEV);
602 		break;
603 	case VM_GET_CAPABILITY:
604 		vmcap = (struct vm_capability *)data;
605 		error = vm_get_capability(vcpu,
606 					  vmcap->captype,
607 					  &vmcap->capval);
608 		break;
609 	case VM_SET_CAPABILITY:
610 		vmcap = (struct vm_capability *)data;
611 		error = vm_set_capability(vcpu,
612 					  vmcap->captype,
613 					  vmcap->capval);
614 		break;
615 	case VM_INJECT_EXCEPTION:
616 		vmexc = (struct vm_exception *)data;
617 		error = vm_inject_exception(vcpu, vmexc->esr, vmexc->far);
618 		break;
619 	case VM_GLA2GPA_NOFAULT:
620 		gg = (struct vm_gla2gpa *)data;
621 		error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla,
622 		    gg->prot, &gg->gpa, &gg->fault);
623 		KASSERT(error == 0 || error == EFAULT,
624 		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
625 		break;
626 	case VM_ACTIVATE_CPU:
627 		error = vm_activate_cpu(vcpu);
628 		break;
629 	case VM_GET_CPUS:
630 		error = 0;
631 		vm_cpuset = (struct vm_cpuset *)data;
632 		size = vm_cpuset->cpusetsize;
633 		if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
634 			error = ERANGE;
635 			break;
636 		}
637 		cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
638 		if (vm_cpuset->which == VM_ACTIVE_CPUS)
639 			*cpuset = vm_active_cpus(sc->vm);
640 		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
641 			*cpuset = vm_suspended_cpus(sc->vm);
642 		else if (vm_cpuset->which == VM_DEBUG_CPUS)
643 			*cpuset = vm_debug_cpus(sc->vm);
644 		else
645 			error = EINVAL;
646 		if (error == 0)
647 			error = copyout(cpuset, vm_cpuset->cpus, size);
648 		free(cpuset, M_TEMP);
649 		break;
650 	case VM_SUSPEND_CPU:
651 		error = vm_suspend_cpu(sc->vm, vcpu);
652 		break;
653 	case VM_RESUME_CPU:
654 		error = vm_resume_cpu(sc->vm, vcpu);
655 		break;
656 	case VM_GET_VGIC_VERSION:
657 		vgv = (struct vm_vgic_version *)data;
658 		/* TODO: Query the vgic driver for this */
659 		vgv->version = 3;
660 		vgv->flags = 0;
661 		error = 0;
662 		break;
663 	case VM_ATTACH_VGIC:
664 		vgic = (struct vm_vgic_descr *)data;
665 		error = vm_attach_vgic(sc->vm, vgic);
666 		break;
667 	case VM_RAISE_MSI:
668 		vmsi = (struct vm_msi *)data;
669 		error = vm_raise_msi(sc->vm, vmsi->msg, vmsi->addr, vmsi->bus,
670 		    vmsi->slot, vmsi->func);
671 		break;
672 	case VM_SET_TOPOLOGY:
673 		topology = (struct vm_cpu_topology *)data;
674 		error = vm_set_topology(sc->vm, topology->sockets,
675 		    topology->cores, topology->threads, topology->maxcpus);
676 		break;
677 	case VM_GET_TOPOLOGY:
678 		topology = (struct vm_cpu_topology *)data;
679 		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
680 		    &topology->threads, &topology->maxcpus);
681 		error = 0;
682 		break;
683 	default:
684 		error = ENOTTY;
685 		break;
686 	}
687 
688 done:
689 	if (vcpus_locked == SINGLE)
690 		vcpu_unlock_one(vcpu);
691 	else if (vcpus_locked == ALL)
692 		vcpu_unlock_all(sc);
693 	if (memsegs_locked)
694 		vm_unlock_memsegs(sc->vm);
695 
696 	/*
697 	 * Make sure that no handler returns a kernel-internal
698 	 * error value to userspace.
699 	 */
700 	KASSERT(error == ERESTART || error >= 0,
701 	    ("vmmdev_ioctl: invalid error return %d", error));
702 	return (error);
703 }
704 
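/*
 * mmap(2) on /dev/vmm/<vmname> maps guest system memory into the caller;
 * the mapping offset is a guest physical address and the requested range
 * must fall entirely within a single system-memory mapping.  Illustrative
 * call, assuming 'vmfd' is an open descriptor for the VM and 'gpa'/'len'
 * describe such a range:
 *
 *	mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, vmfd, gpa);
 *
 * PROT_EXEC mappings are rejected.
 */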
705 static int
706 vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
707     struct vm_object **objp, int nprot)
708 {
709 	struct vmmdev_softc *sc;
710 	vm_paddr_t gpa;
711 	size_t len;
712 	vm_ooffset_t segoff, first, last;
713 	int error, found, segid;
714 	bool sysmem;
715 
716 	error = vmm_priv_check(curthread->td_ucred);
717 	if (error)
718 		return (error);
719 
720 	first = *offset;
721 	last = first + mapsize;
722 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
723 		return (EINVAL);
724 
725 	sc = vmmdev_lookup2(cdev);
726 	if (sc == NULL) {
727 		/* virtual machine is in the process of being created */
728 		return (EINVAL);
729 	}
730 
731 	/*
732 	 * Get a read lock on the guest memory map.
733 	 */
734 	vm_slock_memsegs(sc->vm);
735 
736 	gpa = 0;
737 	found = 0;
738 	while (!found) {
739 		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
740 		    NULL, NULL);
741 		if (error)
742 			break;
743 
744 		if (first >= gpa && last <= gpa + len)
745 			found = 1;
746 		else
747 			gpa += len;
748 	}
749 
750 	if (found) {
751 		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
752 		KASSERT(error == 0 && *objp != NULL,
753 		    ("%s: invalid memory segment %d", __func__, segid));
754 		if (sysmem) {
755 			vm_object_reference(*objp);
756 			*offset = segoff + (first - gpa);
757 		} else {
758 			error = EINVAL;
759 		}
760 	}
761 	vm_unlock_memsegs(sc->vm);
762 	return (error);
763 }
764 
765 static void
766 vmmdev_destroy(void *arg)
767 {
768 	struct vmmdev_softc *sc = arg;
769 	struct devmem_softc *dsc;
770 	int error __diagused;
771 
772 	error = vcpu_lock_all(sc);
773 	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
774 	vm_unlock_vcpus(sc->vm);
775 
776 	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
777 		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
778 		SLIST_REMOVE_HEAD(&sc->devmem, link);
779 		free(dsc->name, M_VMMDEV);
780 		free(dsc, M_VMMDEV);
781 	}
782 
783 	if (sc->cdev != NULL)
784 		destroy_dev(sc->cdev);
785 
786 	if (sc->vm != NULL)
787 		vm_destroy(sc->vm);
788 
789 	if (sc->ucred != NULL)
790 		crfree(sc->ucred);
791 
792 	if ((sc->flags & VSC_LINKED) != 0) {
793 		mtx_lock(&vmmdev_mtx);
794 		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
795 		mtx_unlock(&vmmdev_mtx);
796 	}
797 
798 	free(sc, M_VMMDEV);
799 }
800 
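/*
 * hw.vmm.destroy: writing a VM name to this sysctl destroys that VM, e.g.
 * (illustrative):
 *
 *	sysctl hw.vmm.destroy=<vmname>
 *
 * The devmem cdevs are torn down before the main cdev, and sc->cdev is
 * cleared first so that concurrent lookups see the VM as going away.
 */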
801 static int
802 sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
803 {
804 	struct devmem_softc *dsc;
805 	struct vmmdev_softc *sc;
806 	struct cdev *cdev;
807 	char *buf;
808 	int error, buflen;
809 
810 	error = vmm_priv_check(req->td->td_ucred);
811 	if (error)
812 		return (error);
813 
814 	buflen = VM_MAX_NAMELEN + 1;
815 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
816 	strlcpy(buf, "beavis", buflen);
817 	error = sysctl_handle_string(oidp, buf, buflen, req);
818 	if (error != 0 || req->newptr == NULL)
819 		goto out;
820 
821 	mtx_lock(&vmmdev_mtx);
822 	sc = vmmdev_lookup(buf);
823 	if (sc == NULL || sc->cdev == NULL) {
824 		mtx_unlock(&vmmdev_mtx);
825 		error = EINVAL;
826 		goto out;
827 	}
828 
829 	/*
830 	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
831 	 * is scheduled for destruction.
832 	 */
833 	cdev = sc->cdev;
834 	sc->cdev = NULL;
835 	mtx_unlock(&vmmdev_mtx);
836 
837 	/*
838 	 * Destroy all cdevs:
839 	 *
840 	 * - any new operations on the 'cdev' will return an error (ENXIO).
841 	 *
842 	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
843 	 */
844 	SLIST_FOREACH(dsc, &sc->devmem, link) {
845 		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
846 		destroy_dev(dsc->cdev);
847 		devmem_destroy(dsc);
848 	}
849 	destroy_dev(cdev);
850 	vmmdev_destroy(sc);
851 	error = 0;
852 
853 out:
854 	free(buf, M_VMMDEV);
855 	return (error);
856 }
857 SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
858     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
859     NULL, 0, sysctl_vmm_destroy, "A",
860     NULL);
861 
862 static struct cdevsw vmmdevsw = {
863 	.d_name		= "vmmdev",
864 	.d_version	= D_VERSION,
865 	.d_ioctl	= vmmdev_ioctl,
866 	.d_mmap_single	= vmmdev_mmap_single,
867 	.d_read		= vmmdev_rw,
868 	.d_write	= vmmdev_rw,
869 };
870 
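/*
 * hw.vmm.create: writing a VM name to this sysctl creates the VM and its
 * /dev/vmm/<vmname> device node, e.g. (illustrative):
 *
 *	sysctl hw.vmm.create=<vmname>
 */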
871 static int
872 sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
873 {
874 	struct vm *vm;
875 	struct cdev *cdev;
876 	struct vmmdev_softc *sc, *sc2;
877 	char *buf;
878 	int error, buflen;
879 
880 	error = vmm_priv_check(req->td->td_ucred);
881 	if (error)
882 		return (error);
883 
884 	buflen = VM_MAX_NAMELEN + 1;
885 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
886 	strlcpy(buf, "beavis", buflen);
887 	error = sysctl_handle_string(oidp, buf, buflen, req);
888 	if (error != 0 || req->newptr == NULL)
889 		goto out;
890 
891 	mtx_lock(&vmmdev_mtx);
892 	sc = vmmdev_lookup(buf);
893 	mtx_unlock(&vmmdev_mtx);
894 	if (sc != NULL) {
895 		error = EEXIST;
896 		goto out;
897 	}
898 
899 	error = vm_create(buf, &vm);
900 	if (error != 0)
901 		goto out;
902 
903 	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
904 	sc->ucred = crhold(curthread->td_ucred);
905 	sc->vm = vm;
906 	SLIST_INIT(&sc->devmem);
907 
908 	/*
909 	 * Lookup the name again just in case somebody sneaked in when we
910 	 * dropped the lock.
911 	 */
912 	mtx_lock(&vmmdev_mtx);
913 	sc2 = vmmdev_lookup(buf);
914 	if (sc2 == NULL) {
915 		SLIST_INSERT_HEAD(&head, sc, link);
916 		sc->flags |= VSC_LINKED;
917 	}
918 	mtx_unlock(&vmmdev_mtx);
919 
920 	if (sc2 != NULL) {
921 		vmmdev_destroy(sc);
922 		error = EEXIST;
923 		goto out;
924 	}
925 
926 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred,
927 	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
928 	if (error != 0) {
929 		vmmdev_destroy(sc);
930 		goto out;
931 	}
932 
933 	mtx_lock(&vmmdev_mtx);
934 	sc->cdev = cdev;
935 	sc->cdev->si_drv1 = sc;
936 	mtx_unlock(&vmmdev_mtx);
937 
938 out:
939 	free(buf, M_VMMDEV);
940 	return (error);
941 }
942 SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
943     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
944     NULL, 0, sysctl_vmm_create, "A",
945     NULL);
946 
947 void
948 vmmdev_init(void)
949 {
950 	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
951 	    "Allow use of vmm in a jail.");
952 }
953 
954 int
955 vmmdev_cleanup(void)
956 {
957 	int error;
958 
959 	if (SLIST_EMPTY(&head))
960 		error = 0;
961 	else
962 		error = EBUSY;
963 
964 	return (error);
965 }
966 
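/*
 * mmap(2) handler for a devmem cdev.  The offset is relative to the start
 * of the segment and the requested range must lie entirely within it.
 */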
967 static int
968 devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
969     struct vm_object **objp, int nprot)
970 {
971 	struct devmem_softc *dsc;
972 	vm_ooffset_t first, last;
973 	size_t seglen;
974 	int error;
975 	bool sysmem;
976 
977 	dsc = cdev->si_drv1;
978 	if (dsc == NULL) {
979 		/* 'cdev' has been created but is not ready for use */
980 		return (ENXIO);
981 	}
982 
983 	first = *offset;
984 	last = *offset + len;
985 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
986 		return (EINVAL);
987 
988 	vm_slock_memsegs(dsc->sc->vm);
989 
990 	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
991 	KASSERT(error == 0 && !sysmem && *objp != NULL,
992 	    ("%s: invalid devmem segment %d", __func__, dsc->segid));
993 
994 	if (seglen >= last)
995 		vm_object_reference(*objp);
996 	else
997 		error = EINVAL;
998 	vm_unlock_memsegs(dsc->sc->vm);
999 	return (error);
1000 }
1001 
1002 static struct cdevsw devmemsw = {
1003 	.d_name		= "devmem",
1004 	.d_version	= D_VERSION,
1005 	.d_mmap_single	= devmem_mmap_single,
1006 };
1007 
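/*
 * Create the /dev/vmm.io/<vmname>.<devname> node for a devmem segment.  The
 * cdev is not usable until si_drv1 is set at the end of this function;
 * devmem_mmap_single() checks for that.
 */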
1008 static int
1009 devmem_create_cdev(const char *vmname, int segid, char *devname)
1010 {
1011 	struct devmem_softc *dsc;
1012 	struct vmmdev_softc *sc;
1013 	struct cdev *cdev;
1014 	int error;
1015 
1016 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
1017 	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
1018 	if (error)
1019 		return (error);
1020 
1021 	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
1022 
1023 	mtx_lock(&vmmdev_mtx);
1024 	sc = vmmdev_lookup(vmname);
1025 	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
1026 	if (sc->cdev == NULL) {
1027 		/* virtual machine is being created or destroyed */
1028 		mtx_unlock(&vmmdev_mtx);
1029 		free(dsc, M_VMMDEV);
1030 		destroy_dev_sched_cb(cdev, NULL, 0);
1031 		return (ENODEV);
1032 	}
1033 
1034 	dsc->segid = segid;
1035 	dsc->name = devname;
1036 	dsc->cdev = cdev;
1037 	dsc->sc = sc;
1038 	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
1039 	mtx_unlock(&vmmdev_mtx);
1040 
1041 	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
1042 	cdev->si_drv1 = dsc;
1043 	return (0);
1044 }
1045 
1046 static void
1047 devmem_destroy(void *arg)
1048 {
1049 	struct devmem_softc *dsc = arg;
1050 
1051 	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
1052 	dsc->cdev = NULL;
1053 	dsc->sc = NULL;
1054 }
1055