/*
 * Copyright (c) 2018-2021 Maxime Villard, m00nbsd.net
 * All rights reserved.
 *
 * This code is part of the NVMM hypervisor.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>

#include <sys/kernel.h>
#include <sys/mman.h>

#include "nvmm.h"
#include "nvmm_internal.h"
#include "nvmm_ioctl.h"

static struct nvmm_machine machines[NVMM_MAX_MACHINES];
volatile unsigned int nmachines __cacheline_aligned;

static const struct nvmm_impl *nvmm_impl_list[] = {
#if defined(__x86_64__)
	&nvmm_x86_svm,	/* x86 AMD SVM */
	&nvmm_x86_vmx	/* x86 Intel VMX */
#endif
};

const struct nvmm_impl *nvmm_impl __read_mostly = NULL;

struct nvmm_owner nvmm_root_owner;

/* -------------------------------------------------------------------------- */

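/*
 * Machines live in the static 'machines' array, NVMM_MAX_MACHINES slots,
 * each protected by its own rwlock. Allocation scans for a slot whose
 * 'present' flag is clear, claims it under the write lock, and returns
 * with that lock still held; the caller releases it via
 * nvmm_machine_put() once the machine is initialized.
 */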
static int
nvmm_machine_alloc(struct nvmm_machine **ret)
{
	struct nvmm_machine *mach;
	size_t i;

	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
		mach = &machines[i];

		os_rwl_wlock(&mach->lock);
		if (mach->present) {
			os_rwl_unlock(&mach->lock);
			continue;
		}

		mach->present = true;
		mach->time = time_second;
		*ret = mach;
		os_atomic_inc_uint(&nmachines);
		return 0;
	}

	return ENOBUFS;
}

static void
nvmm_machine_free(struct nvmm_machine *mach)
{
	OS_ASSERT(os_rwl_wheld(&mach->lock));
	OS_ASSERT(mach->present);
	mach->present = false;
	os_atomic_dec_uint(&nmachines);
}

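/*
 * Look up a machine by ID and return it locked, shared or exclusive
 * depending on 'writer'. Access is restricted to the owner that created
 * the machine, with nvmm_root_owner allowed through as a bypass.
 */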
static int
nvmm_machine_get(struct nvmm_owner *owner, nvmm_machid_t machid,
    struct nvmm_machine **ret, bool writer)
{
	struct nvmm_machine *mach;

	if (__predict_false(machid >= NVMM_MAX_MACHINES)) {
		return EINVAL;
	}
	mach = &machines[machid];

	if (__predict_false(writer)) {
		os_rwl_wlock(&mach->lock);
	} else {
		os_rwl_rlock(&mach->lock);
	}
	if (__predict_false(!mach->present)) {
		os_rwl_unlock(&mach->lock);
		return ENOENT;
	}
	if (__predict_false(mach->owner != owner &&
			    owner != &nvmm_root_owner)) {
		os_rwl_unlock(&mach->lock);
		return EPERM;
	}
	*ret = mach;

	return 0;
}

static void
nvmm_machine_put(struct nvmm_machine *mach)
{
	os_rwl_unlock(&mach->lock);
}

/* -------------------------------------------------------------------------- */

static int
nvmm_vcpu_alloc(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_cpu **ret)
{
	struct nvmm_cpu *vcpu;

	if (cpuid >= NVMM_MAX_VCPUS) {
		return EINVAL;
	}
	vcpu = &mach->cpus[cpuid];

	os_mtx_lock(&vcpu->lock);
	if (vcpu->present) {
		os_mtx_unlock(&vcpu->lock);
		return EBUSY;
	}

	vcpu->present = true;
	vcpu->comm = NULL;
	vcpu->hcpu_last = -1;
	*ret = vcpu;
	return 0;
}

static void
nvmm_vcpu_free(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
{
	OS_ASSERT(os_mtx_owned(&vcpu->lock));
	vcpu->present = false;
	if (vcpu->comm != NULL) {
		os_vmobj_unmap(os_kernel_map, (vaddr_t)vcpu->comm,
		    (vaddr_t)vcpu->comm + NVMM_COMM_PAGE_SIZE, true);
		/*
		 * Require userland to unmap the comm page from its address
		 * space, because os_curproc_map at this point (fd close)
		 * is not guaranteed to be the correct address space.
		 */
	}
}

static int
nvmm_vcpu_get(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_cpu **ret)
{
	struct nvmm_cpu *vcpu;

	if (__predict_false(cpuid >= NVMM_MAX_VCPUS)) {
		return EINVAL;
	}
	vcpu = &mach->cpus[cpuid];

	os_mtx_lock(&vcpu->lock);
	if (__predict_false(!vcpu->present)) {
		os_mtx_unlock(&vcpu->lock);
		return ENOENT;
	}
	*ret = vcpu;

	return 0;
}

static void
nvmm_vcpu_put(struct nvmm_cpu *vcpu)
{
	os_mtx_unlock(&vcpu->lock);
}

/* -------------------------------------------------------------------------- */

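/*
 * Tear down every machine belonging to 'owner'. This is the cleanup path
 * taken when an owner disappears (its nvmm descriptor is closed): destroy
 * the VCPUs and the backend state, release the guest vmspace, and drop
 * the kernel references on the host-mapping vmobjs.
 */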
void
nvmm_kill_machines(struct nvmm_owner *owner)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	size_t i, j;
	int error;

	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
		mach = &machines[i];

		os_rwl_wlock(&mach->lock);
		if (!mach->present || mach->owner != owner) {
			os_rwl_unlock(&mach->lock);
			continue;
		}

		/* Kill it. */
		for (j = 0; j < NVMM_MAX_VCPUS; j++) {
			error = nvmm_vcpu_get(mach, j, &vcpu);
			if (error)
				continue;
			(*nvmm_impl->vcpu_destroy)(mach, vcpu);
			nvmm_vcpu_free(mach, vcpu);
			nvmm_vcpu_put(vcpu);
			os_atomic_dec_uint(&mach->ncpus);
		}
		(*nvmm_impl->machine_destroy)(mach);
		os_vmspace_destroy(mach->vm);

		/* Drop the kernel vmobj refs. */
		for (j = 0; j < NVMM_MAX_HMAPPINGS; j++) {
			if (!mach->hmap[j].present)
				continue;
			os_vmobj_rel(mach->hmap[j].vmobj);
		}

		nvmm_machine_free(mach);

		os_rwl_unlock(&mach->lock);
	}
}

/* -------------------------------------------------------------------------- */

static int
nvmm_capability(struct nvmm_owner *owner, struct nvmm_ioc_capability *args)
{
	args->cap.version = NVMM_KERN_VERSION;
	args->cap.state_size = nvmm_impl->state_size;
	args->cap.comm_size = NVMM_COMM_PAGE_SIZE;
	args->cap.max_machines = NVMM_MAX_MACHINES;
	args->cap.max_vcpus = NVMM_MAX_VCPUS;
	args->cap.max_ram = NVMM_MAX_RAM;

	(*nvmm_impl->capability)(&args->cap);

	return 0;
}
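
/*
 * Illustrative userland sketch (not part of the driver): querying these
 * capabilities through the ioctl interface, assuming 'fd' is an open
 * descriptor on the nvmm device node:
 *
 *	struct nvmm_ioc_capability args;
 *
 *	if (ioctl(fd, NVMM_IOC_CAPABILITY, &args) == -1)
 *		err(EXIT_FAILURE, "NVMM_IOC_CAPABILITY");
 *	printf("nvmm: version %u, up to %u machines of %u vcpus\n",
 *	    (unsigned)args.cap.version, (unsigned)args.cap.max_machines,
 *	    (unsigned)args.cap.max_vcpus);
 */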

static int
nvmm_machine_create(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_create *args)
{
	struct nvmm_machine *mach;
	int error;

	error = nvmm_machine_alloc(&mach);
	if (error)
		return error;

	/* Curproc owns the machine. */
	mach->owner = owner;

	/* Zero out the host mappings. */
	memset(&mach->hmap, 0, sizeof(mach->hmap));

	/* Create the machine vmspace. */
	mach->gpa_begin = 0;
	mach->gpa_end = NVMM_MAX_RAM;
	mach->vm = os_vmspace_create(mach->gpa_begin, mach->gpa_end);

	/* Create the comm vmobj. */
	mach->commvmobj = os_vmobj_create(
	    NVMM_MAX_VCPUS * NVMM_COMM_PAGE_SIZE);

	(*nvmm_impl->machine_create)(mach);

	args->machid = mach->machid;
	nvmm_machine_put(mach);

	return 0;
}

static int
nvmm_machine_destroy(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_destroy *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;
	size_t i;

	error = nvmm_machine_get(owner, args->machid, &mach, true);
	if (error)
		return error;

	for (i = 0; i < NVMM_MAX_VCPUS; i++) {
		error = nvmm_vcpu_get(mach, i, &vcpu);
		if (error)
			continue;

		(*nvmm_impl->vcpu_destroy)(mach, vcpu);
		nvmm_vcpu_free(mach, vcpu);
		nvmm_vcpu_put(vcpu);
		os_atomic_dec_uint(&mach->ncpus);
	}

	(*nvmm_impl->machine_destroy)(mach);

	/* Free the machine vmspace. */
	os_vmspace_destroy(mach->vm);

	/* Drop the kernel vmobj refs. */
	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		if (!mach->hmap[i].present)
			continue;
		os_vmobj_rel(mach->hmap[i].vmobj);
	}

	nvmm_machine_free(mach);
	nvmm_machine_put(mach);

	return 0;
}

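/*
 * Machine configuration is machine-dependent: the op is translated into
 * a backend index via NVMM_MACH_CONF_MD(), the size of the conf blob is
 * taken from the backend's mach_conf_sizes[] table, and the blob is
 * copied in from userland before being handed to the backend.
 */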
static int
nvmm_machine_configure(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_configure *args)
{
	struct nvmm_machine *mach;
	size_t allocsz;
	uint64_t op;
	void *data;
	int error;

	op = NVMM_MACH_CONF_MD(args->op);
	if (__predict_false(op >= nvmm_impl->mach_conf_max)) {
		return EINVAL;
	}

	allocsz = nvmm_impl->mach_conf_sizes[op];
	data = os_mem_alloc(allocsz);

	error = nvmm_machine_get(owner, args->machid, &mach, true);
	if (error) {
		os_mem_free(data, allocsz);
		return error;
	}

	error = copyin(args->conf, data, allocsz);
	if (error) {
		goto out;
	}

	error = (*nvmm_impl->machine_configure)(mach, op, data);

out:
	nvmm_machine_put(mach);
	os_mem_free(data, allocsz);
	return error;
}

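/*
 * Create a VCPU, together with its comm page. The page is one slice of
 * the machine-wide comm vmobj and is mapped twice: wired into the kernel
 * map, so the backend can access it at any time, and pageable into the
 * calling process, giving userland a shared view of the VCPU state.
 */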
static int
nvmm_vcpu_create(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_create *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_alloc(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	/* Map the comm page on the kernel side, as wired. */
	error = os_vmobj_map(os_kernel_map, (vaddr_t *)&vcpu->comm,
	    NVMM_COMM_PAGE_SIZE, mach->commvmobj,
	    args->cpuid * NVMM_COMM_PAGE_SIZE, true /* wired */,
	    false /* !fixed */, true /* shared */, PROT_READ | PROT_WRITE,
	    PROT_READ | PROT_WRITE);
	if (error) {
		nvmm_vcpu_free(mach, vcpu);
		nvmm_vcpu_put(vcpu);
		goto out;
	}

	memset(vcpu->comm, 0, NVMM_COMM_PAGE_SIZE);

	/* Map the comm page on the user side, as pageable. */
	error = os_vmobj_map(os_curproc_map, (vaddr_t *)&args->comm,
	    NVMM_COMM_PAGE_SIZE, mach->commvmobj,
	    args->cpuid * NVMM_COMM_PAGE_SIZE, false /* !wired */,
	    false /* !fixed */, true /* shared */, PROT_READ | PROT_WRITE,
	    PROT_READ | PROT_WRITE);
	if (error) {
		nvmm_vcpu_free(mach, vcpu);
		nvmm_vcpu_put(vcpu);
		goto out;
	}

	error = (*nvmm_impl->vcpu_create)(mach, vcpu);
	if (error) {
		nvmm_vcpu_free(mach, vcpu);
		nvmm_vcpu_put(vcpu);
		goto out;
	}

	nvmm_vcpu_put(vcpu);
	os_atomic_inc_uint(&mach->ncpus);

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_vcpu_destroy(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_destroy *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	(*nvmm_impl->vcpu_destroy)(mach, vcpu);
	nvmm_vcpu_free(mach, vcpu);
	nvmm_vcpu_put(vcpu);
	os_atomic_dec_uint(&mach->ncpus);

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_vcpu_configure(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_configure *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	size_t allocsz;
	uint64_t op;
	void *data;
	int error;

	op = NVMM_VCPU_CONF_MD(args->op);
	if (__predict_false(op >= nvmm_impl->vcpu_conf_max))
		return EINVAL;

	allocsz = nvmm_impl->vcpu_conf_sizes[op];
	data = os_mem_alloc(allocsz);

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error) {
		os_mem_free(data, allocsz);
		return error;
	}

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error) {
		nvmm_machine_put(mach);
		os_mem_free(data, allocsz);
		return error;
	}

	error = copyin(args->conf, data, allocsz);
	if (error) {
		goto out;
	}

	error = (*nvmm_impl->vcpu_configure)(vcpu, op, data);

out:
	nvmm_vcpu_put(vcpu);
	nvmm_machine_put(mach);
	os_mem_free(data, allocsz);
	return error;
}

static int
nvmm_vcpu_setstate(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_setstate *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	(*nvmm_impl->vcpu_setstate)(vcpu);
	nvmm_vcpu_put(vcpu);

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_vcpu_getstate(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_getstate *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	(*nvmm_impl->vcpu_getstate)(vcpu);
	nvmm_vcpu_put(vcpu);

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_vcpu_inject(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_inject *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	error = (*nvmm_impl->vcpu_inject)(vcpu);
	nvmm_vcpu_put(vcpu);

out:
	nvmm_machine_put(mach);
	return error;
}

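/*
 * The run loop. Re-enter the guest until the host needs the CPU back
 * (pending signal or reschedule), the backend fails, or an exit reason
 * must be handled by userland. Faults on guest memory within the GPA
 * range are resolved here by faulting the machine vmspace, and the VCPU
 * is resumed without bouncing the exit to userland.
 */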
static int
nvmm_do_vcpu_run(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
    struct nvmm_vcpu_exit *exit)
{
	struct vmspace *vm = mach->vm;
	int ret;

	while (1) {
		/* Got a signal? Or pending resched? Leave. */
		if (__predict_false(os_return_needed())) {
			exit->reason = NVMM_VCPU_EXIT_NONE;
			return 0;
		}

		/* Run the VCPU. */
		ret = (*nvmm_impl->vcpu_run)(mach, vcpu, exit);
		if (__predict_false(ret != 0)) {
			return ret;
		}

		/* Process nested page faults. */
		if (__predict_true(exit->reason != NVMM_VCPU_EXIT_MEMORY)) {
			break;
		}
		if (exit->u.mem.gpa >= mach->gpa_end) {
			break;
		}
		if (os_vmspace_fault(vm, exit->u.mem.gpa, exit->u.mem.prot)) {
			break;
		}
	}

	return 0;
}

static int
nvmm_vcpu_run(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_run *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	error = nvmm_do_vcpu_run(mach, vcpu, &args->exit);
	nvmm_vcpu_put(vcpu);

out:
	nvmm_machine_put(mach);
	return error;
}

/* -------------------------------------------------------------------------- */

static os_vmobj_t *
nvmm_hmapping_getvmobj(struct nvmm_machine *mach, uintptr_t hva, size_t size,
    size_t *off)
{
	struct nvmm_hmapping *hmapping;
	size_t i;

	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		hmapping = &mach->hmap[i];
		if (!hmapping->present) {
			continue;
		}
		if (hva >= hmapping->hva &&
		    hva + size <= hmapping->hva + hmapping->size) {
			*off = hva - hmapping->hva;
			return hmapping->vmobj;
		}
	}

	return NULL;
}

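/*
 * Validate a host mapping request: the range must be page-aligned,
 * non-NULL, must not overflow, and must not partially overlap an
 * existing mapping. A range falling entirely inside an existing mapping
 * passes the check.
 */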
static int
nvmm_hmapping_validate(struct nvmm_machine *mach, uintptr_t hva, size_t size)
{
	struct nvmm_hmapping *hmapping;
	size_t i;
	uintptr_t hva_end;
	uintptr_t hmap_end;

	if ((hva % PAGE_SIZE) != 0 || (size % PAGE_SIZE) != 0) {
		return EINVAL;
	}
	if (hva == 0) {
		return EINVAL;
	}

	/*
	 * Overflow tests MUST be done very carefully to avoid compiler
	 * optimizations from effectively deleting the test.
	 */
	hva_end = hva + size;
	if (hva_end <= hva)
		return EINVAL;

	/*
	 * Overlap tests
	 */
	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		hmapping = &mach->hmap[i];

		if (!hmapping->present) {
			continue;
		}
		hmap_end = hmapping->hva + hmapping->size;

		if (hva >= hmapping->hva && hva_end <= hmap_end)
			break;
		if (hva >= hmapping->hva && hva < hmap_end)
			return EEXIST;
		if (hva_end > hmapping->hva && hva_end <= hmap_end)
			return EEXIST;
		if (hva <= hmapping->hva && hva_end >= hmap_end)
			return EEXIST;
	}

	return 0;
}

static struct nvmm_hmapping *
nvmm_hmapping_alloc(struct nvmm_machine *mach)
{
	struct nvmm_hmapping *hmapping;
	size_t i;

	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		hmapping = &mach->hmap[i];
		if (!hmapping->present) {
			hmapping->present = true;
			return hmapping;
		}
	}

	return NULL;
}

static int
nvmm_hmapping_free(struct nvmm_machine *mach, uintptr_t hva, size_t size)
{
	struct nvmm_hmapping *hmapping;
	size_t i;

	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		hmapping = &mach->hmap[i];
		if (!hmapping->present || hmapping->hva != hva ||
		    hmapping->size != size) {
			continue;
		}

		os_vmobj_unmap(os_curproc_map, hmapping->hva,
		    hmapping->hva + hmapping->size, false);
		os_vmobj_rel(hmapping->vmobj);

		hmapping->vmobj = NULL;
		hmapping->present = false;

		return 0;
	}

	return ENOENT;
}

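/*
 * NVMM_IOC_HVA_MAP: reserve a host mapping slot, back it with a fresh
 * anonymous vmobj, and map that vmobj at the fixed address args->hva in
 * the calling process, shared and pageable. This vmobj is what later
 * gets installed into the guest by nvmm_gpa_map().
 */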
static int
nvmm_hva_map(struct nvmm_owner *owner, struct nvmm_ioc_hva_map *args)
{
	struct nvmm_machine *mach;
	struct nvmm_hmapping *hmapping;
	vaddr_t uva;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, true);
	if (error)
		return error;

	error = nvmm_hmapping_validate(mach, args->hva, args->size);
	if (error)
		goto out;

	hmapping = nvmm_hmapping_alloc(mach);
	if (hmapping == NULL) {
		error = ENOBUFS;
		goto out;
	}

	hmapping->hva = args->hva;
	hmapping->size = args->size;
	hmapping->vmobj = os_vmobj_create(hmapping->size);
	uva = hmapping->hva;

	/* Map the vmobj into the user address space, as pageable. */
	error = os_vmobj_map(os_curproc_map, &uva, hmapping->size,
	    hmapping->vmobj, 0, false /* !wired */, true /* fixed */,
	    true /* shared */, PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE);

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_hva_unmap(struct nvmm_owner *owner, struct nvmm_ioc_hva_unmap *args)
{
	struct nvmm_machine *mach;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, true);
	if (error)
		return error;

	error = nvmm_hmapping_free(mach, args->hva, args->size);

	nvmm_machine_put(mach);
	return error;
}

/* -------------------------------------------------------------------------- */

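/*
 * NVMM_IOC_GPA_MAP: install a previously established host mapping into
 * the guest physical address space. The HVA range must fall inside a
 * region created by nvmm_hva_map(); its backing vmobj is then mapped at
 * the fixed guest address 'gpa' with the requested protection.
 */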
static int
nvmm_gpa_map(struct nvmm_owner *owner, struct nvmm_ioc_gpa_map *args)
{
	struct nvmm_machine *mach;
	os_vmobj_t *vmobj;
	gpaddr_t gpa;
	gpaddr_t gpa_end;
	size_t off;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	if ((args->prot & ~(PROT_READ|PROT_WRITE|PROT_EXEC)) != 0) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Overflow tests MUST be done very carefully to avoid compiler
	 * optimizations from effectively deleting the test.
	 */
	gpa = args->gpa;
	gpa_end = gpa + args->size;
	if (gpa_end <= gpa) {
		error = EINVAL;
		goto out;
	}

	if ((gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0 ||
	    (args->hva % PAGE_SIZE) != 0) {
		error = EINVAL;
		goto out;
	}
	if (args->hva == 0) {
		error = EINVAL;
		goto out;
	}

	if (gpa < mach->gpa_begin || gpa >= mach->gpa_end) {
		error = EINVAL;
		goto out;
	}
	if (gpa_end > mach->gpa_end) {
		error = EINVAL;
		goto out;
	}

	vmobj = nvmm_hmapping_getvmobj(mach, args->hva, args->size, &off);
	if (vmobj == NULL) {
		error = EINVAL;
		goto out;
	}

	/* Map the vmobj into the machine address space, as pageable. */
	error = os_vmobj_map(&mach->vm->vm_map, &gpa, args->size, vmobj, off,
	    false /* !wired */, true /* fixed */, false /* !shared */,
	    args->prot, PROT_READ | PROT_WRITE | PROT_EXEC);

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_gpa_unmap(struct nvmm_owner *owner, struct nvmm_ioc_gpa_unmap *args)
{
	struct nvmm_machine *mach;
	gpaddr_t gpa;
	gpaddr_t gpa_end;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	/*
	 * Overflow tests MUST be done very carefully to avoid compiler
	 * optimizations from effectively deleting the test.
	 */
	gpa = args->gpa;
	gpa_end = gpa + args->size;
	if (gpa_end <= gpa) {
		error = EINVAL;
		goto out;
	}

	if ((gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0) {
		error = EINVAL;
		goto out;
	}
	if (gpa < mach->gpa_begin || gpa >= mach->gpa_end) {
		error = EINVAL;
		goto out;
	}
	if (gpa_end > mach->gpa_end) {
		error = EINVAL;
		goto out;
	}

	/* Unmap the memory from the machine. */
	os_vmobj_unmap(&mach->vm->vm_map, gpa, gpa + args->size, false);

out:
	nvmm_machine_put(mach);
	return error;
}

/* -------------------------------------------------------------------------- */

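/*
 * NVMM_CTL_MACH_INFO: copy out basic accounting for one machine, i.e.
 * the number of VCPUs, the total RAM backed by host mappings, the owner
 * pid and the creation time.
 */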
static int
nvmm_ctl_mach_info(struct nvmm_owner *owner, struct nvmm_ioc_ctl *args)
{
	struct nvmm_ctl_mach_info ctl;
	struct nvmm_machine *mach;
	int error;
	size_t i;

	if (args->size != sizeof(ctl))
		return EINVAL;
	error = copyin(args->data, &ctl, sizeof(ctl));
	if (error)
		return error;

	error = nvmm_machine_get(owner, ctl.machid, &mach, true);
	if (error)
		return error;

	ctl.nvcpus = mach->ncpus;

	ctl.nram = 0;
	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		if (!mach->hmap[i].present)
			continue;
		ctl.nram += mach->hmap[i].size;
	}

	ctl.pid = mach->owner->pid;
	ctl.time = mach->time;

	nvmm_machine_put(mach);

	error = copyout(&ctl, args->data, sizeof(ctl));
	if (error)
		return error;

	return 0;
}

static int
nvmm_ctl(struct nvmm_owner *owner, struct nvmm_ioc_ctl *args)
{
	switch (args->op) {
	case NVMM_CTL_MACH_INFO:
		return nvmm_ctl_mach_info(owner, args);
	default:
		return EINVAL;
	}
}

/* -------------------------------------------------------------------------- */

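/*
 * Pick the backend. The first entry of nvmm_impl_list whose ident()
 * callback accepts the host CPU wins; on x86_64 this means AMD SVM is
 * probed before Intel VMX.
 */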
const struct nvmm_impl *
nvmm_ident(void)
{
	size_t i;

	for (i = 0; i < __arraycount(nvmm_impl_list); i++) {
		if ((*nvmm_impl_list[i]->ident)())
			return nvmm_impl_list[i];
	}

	return NULL;
}

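/*
 * Module initialization: select a backend, number the machine and VCPU
 * slots, initialize their locks, and let the backend set itself up.
 * Fails with ENOTSUP when no backend recognizes the host CPU.
 */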
int
nvmm_init(void)
{
	size_t i, n;

	nvmm_impl = nvmm_ident();
	if (nvmm_impl == NULL)
		return ENOTSUP;

	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
		machines[i].machid = i;
		os_rwl_init(&machines[i].lock);
		for (n = 0; n < NVMM_MAX_VCPUS; n++) {
			machines[i].cpus[n].present = false;
			machines[i].cpus[n].cpuid = n;
			os_mtx_init(&machines[i].cpus[n].lock);
		}
	}

	(*nvmm_impl->init)();

	return 0;
}

void
nvmm_fini(void)
{
	size_t i, n;

	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
		os_rwl_destroy(&machines[i].lock);
		for (n = 0; n < NVMM_MAX_VCPUS; n++) {
			os_mtx_destroy(&machines[i].cpus[n].lock);
		}
	}

	(*nvmm_impl->fini)();
	nvmm_impl = NULL;
}

/* -------------------------------------------------------------------------- */

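/*
 * Ioctl dispatcher, the single entry point for userland. A typical
 * lifecycle, as suggested by the operations below: CAPABILITY, then
 * MACHINE_CREATE, HVA_MAP and GPA_MAP to give the guest RAM, then
 * VCPU_CREATE followed by a VCPU_RUN loop, with VCPU_SETSTATE/GETSTATE
 * and VCPU_INJECT used along the way.
 */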
int
nvmm_ioctl(struct nvmm_owner *owner, unsigned long cmd, void *data)
{
	switch (cmd) {
	case NVMM_IOC_CAPABILITY:
		return nvmm_capability(owner, data);
	case NVMM_IOC_MACHINE_CREATE:
		return nvmm_machine_create(owner, data);
	case NVMM_IOC_MACHINE_DESTROY:
		return nvmm_machine_destroy(owner, data);
	case NVMM_IOC_MACHINE_CONFIGURE:
		return nvmm_machine_configure(owner, data);
	case NVMM_IOC_VCPU_CREATE:
		return nvmm_vcpu_create(owner, data);
	case NVMM_IOC_VCPU_DESTROY:
		return nvmm_vcpu_destroy(owner, data);
	case NVMM_IOC_VCPU_CONFIGURE:
		return nvmm_vcpu_configure(owner, data);
	case NVMM_IOC_VCPU_SETSTATE:
		return nvmm_vcpu_setstate(owner, data);
	case NVMM_IOC_VCPU_GETSTATE:
		return nvmm_vcpu_getstate(owner, data);
	case NVMM_IOC_VCPU_INJECT:
		return nvmm_vcpu_inject(owner, data);
	case NVMM_IOC_VCPU_RUN:
		return nvmm_vcpu_run(owner, data);
	case NVMM_IOC_GPA_MAP:
		return nvmm_gpa_map(owner, data);
	case NVMM_IOC_GPA_UNMAP:
		return nvmm_gpa_unmap(owner, data);
	case NVMM_IOC_HVA_MAP:
		return nvmm_hva_map(owner, data);
	case NVMM_IOC_HVA_UNMAP:
		return nvmm_hva_unmap(owner, data);
	case NVMM_IOC_CTL:
		return nvmm_ctl(owner, data);
	default:
		return EINVAL;
	}
}