1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/cpuset.h>
32 #include <sys/kernel.h>
33 #include <sys/linker.h>
34 #include <sys/lock.h>
35 #include <sys/malloc.h>
36 #include <sys/module.h>
37 #include <sys/mutex.h>
38 #include <sys/pcpu.h>
39 #include <sys/proc.h>
40 #include <sys/queue.h>
41 #include <sys/rwlock.h>
42 #include <sys/sched.h>
43 #include <sys/smp.h>
44 #include <sys/sysctl.h>
45
46 #include <vm/vm.h>
47 #include <vm/vm_object.h>
48 #include <vm/vm_page.h>
49 #include <vm/pmap.h>
50 #include <vm/vm_map.h>
51 #include <vm/vm_extern.h>
52 #include <vm/vm_param.h>
53
54 #include <machine/armreg.h>
55 #include <machine/cpu.h>
56 #include <machine/fpu.h>
57 #include <machine/machdep.h>
58 #include <machine/pcb.h>
59 #include <machine/smp.h>
60 #include <machine/vm.h>
61 #include <machine/vmparam.h>
62 #include <machine/vmm.h>
63 #include <machine/vmm_dev.h>
64 #include <machine/vmm_instruction_emul.h>
65
66 #include <dev/pci/pcireg.h>
67
68 #include "vmm_ktr.h"
69 #include "vmm_stat.h"
70 #include "arm64.h"
71 #include "mmu.h"
72
73 #include "io/vgic.h"
74 #include "io/vtimer.h"
75
76 struct vcpu {
77 int flags;
78 enum vcpu_state state;
79 struct mtx mtx;
80 int hostcpu; /* host cpuid this vcpu last ran on */
81 int vcpuid;
82 void *stats;
83 struct vm_exit exitinfo;
84 uint64_t nextpc; /* (x) next instruction to execute */
85 struct vm *vm; /* (o) */
86 void *cookie; /* (i) cpu-specific data */
87 struct vfpstate *guestfpu; /* (a,i) guest fpu state */
88 };
89
90 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
91 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
92 #define vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx))
93 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
94 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
95 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
96
97 struct mem_seg {
98 uint64_t gpa;
99 size_t len;
100 bool wired;
101 bool sysmem;
102 vm_object_t object;
103 };
104 #define VM_MAX_MEMSEGS 3
105
106 struct mem_map {
107 vm_paddr_t gpa;
108 size_t len;
109 vm_ooffset_t segoff;
110 int segid;
111 int prot;
112 int flags;
113 };
114 #define VM_MAX_MEMMAPS 4
115
116 struct vmm_mmio_region {
117 uint64_t start;
118 uint64_t end;
119 mem_region_read_t read;
120 mem_region_write_t write;
121 };
122 #define VM_MAX_MMIO_REGIONS 4
123
124 struct vmm_special_reg {
125 uint32_t esr_iss;
126 uint32_t esr_mask;
127 reg_read_t reg_read;
128 reg_write_t reg_write;
129 void *arg;
130 };
131 #define VM_MAX_SPECIAL_REGS 16
132
133 /*
134 * Initialization:
135 * (o) initialized the first time the VM is created
136 * (i) initialized when VM is created and when it is reinitialized
137 * (x) initialized before use
138 */
139 struct vm {
140 void *cookie; /* (i) cpu-specific data */
141 volatile cpuset_t active_cpus; /* (i) active vcpus */
142 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */
143 int suspend; /* (i) stop VM execution */
144 bool dying; /* (o) is dying */
145 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
146 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
147 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
148 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
149 struct vmspace *vmspace; /* (o) guest's address space */
150 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
151 struct vcpu **vcpu; /* (i) guest vcpus */
152 struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS];
153 /* (o) guest MMIO regions */
154 struct vmm_special_reg special_reg[VM_MAX_SPECIAL_REGS];
155 /* The following describe the vm cpu topology */
156 uint16_t sockets; /* (o) num of sockets */
157 uint16_t cores; /* (o) num of cores/socket */
158 uint16_t threads; /* (o) num of threads/core */
159 uint16_t maxcpus; /* (o) max pluggable cpus */
160 struct sx mem_segs_lock; /* (o) */
161 struct sx vcpus_init_lock; /* (o) */
162 };
163
164 static bool vmm_initialized = false;
165
166 static int vm_handle_wfi(struct vcpu *vcpu,
167 struct vm_exit *vme, bool *retu);
168
169 static MALLOC_DEFINE(M_VMM, "vmm", "vmm");
170
171 /* statistics */
172 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
173
174 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
175
176 static int vmm_ipinum;
177 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
178 "IPI vector used for vcpu notifications");
179
180 struct vmm_regs {
181 uint64_t id_aa64afr0;
182 uint64_t id_aa64afr1;
183 uint64_t id_aa64dfr0;
184 uint64_t id_aa64dfr1;
185 uint64_t id_aa64isar0;
186 uint64_t id_aa64isar1;
187 uint64_t id_aa64isar2;
188 uint64_t id_aa64mmfr0;
189 uint64_t id_aa64mmfr1;
190 uint64_t id_aa64mmfr2;
191 uint64_t id_aa64pfr0;
192 uint64_t id_aa64pfr1;
193 };
194
195 static const struct vmm_regs vmm_arch_regs_masks = {
196 .id_aa64dfr0 =
197 ID_AA64DFR0_CTX_CMPs_MASK |
198 ID_AA64DFR0_WRPs_MASK |
199 ID_AA64DFR0_BRPs_MASK |
200 ID_AA64DFR0_PMUVer_3 |
201 ID_AA64DFR0_DebugVer_8,
202 .id_aa64isar0 =
203 ID_AA64ISAR0_TLB_TLBIOSR |
204 ID_AA64ISAR0_SHA3_IMPL |
205 ID_AA64ISAR0_RDM_IMPL |
206 ID_AA64ISAR0_Atomic_IMPL |
207 ID_AA64ISAR0_CRC32_BASE |
208 ID_AA64ISAR0_SHA2_512 |
209 ID_AA64ISAR0_SHA1_BASE |
210 ID_AA64ISAR0_AES_PMULL,
211 .id_aa64mmfr0 =
212 ID_AA64MMFR0_TGran4_IMPL |
213 ID_AA64MMFR0_TGran64_IMPL |
214 ID_AA64MMFR0_TGran16_IMPL |
215 ID_AA64MMFR0_ASIDBits_16 |
216 ID_AA64MMFR0_PARange_4P,
217 .id_aa64mmfr1 =
218 ID_AA64MMFR1_SpecSEI_IMPL |
219 ID_AA64MMFR1_PAN_ATS1E1 |
220 ID_AA64MMFR1_HAFDBS_AF,
221 .id_aa64pfr0 =
222 ID_AA64PFR0_GIC_CPUIF_NONE |
223 ID_AA64PFR0_AdvSIMD_HP |
224 ID_AA64PFR0_FP_HP |
225 ID_AA64PFR0_EL3_64 |
226 ID_AA64PFR0_EL2_64 |
227 ID_AA64PFR0_EL1_64 |
228 ID_AA64PFR0_EL0_64,
229 };
230
231 /* Host registers masked by vmm_arch_regs_masks. */
232 static struct vmm_regs vmm_arch_regs;
233
234 u_int vm_maxcpu;
235 SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
236 &vm_maxcpu, 0, "Maximum number of vCPUs");
237
238 static void vm_free_memmap(struct vm *vm, int ident);
239 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
240 static void vcpu_notify_event_locked(struct vcpu *vcpu);
241
242 /*
243 * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this
244 * is a safe value for now.
245 */
246 #define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE)
247
248 static int
249 vmm_regs_init(struct vmm_regs *regs, const struct vmm_regs *masks)
250 {
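/*
 * Read each host ID register and keep only the fields selected by the
 * corresponding mask; a register that cannot be read is exposed to
 * guests as all zeros (i.e. no optional features advertised).
 */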
251 #define _FETCH_KERN_REG(reg, field) do { \
252 regs->field = vmm_arch_regs_masks.field; \
253 if (!get_kernel_reg_masked(reg, &regs->field, masks->field)) \
254 regs->field = 0; \
255 } while (0)
256 _FETCH_KERN_REG(ID_AA64AFR0_EL1, id_aa64afr0);
257 _FETCH_KERN_REG(ID_AA64AFR1_EL1, id_aa64afr1);
258 _FETCH_KERN_REG(ID_AA64DFR0_EL1, id_aa64dfr0);
259 _FETCH_KERN_REG(ID_AA64DFR1_EL1, id_aa64dfr1);
260 _FETCH_KERN_REG(ID_AA64ISAR0_EL1, id_aa64isar0);
261 _FETCH_KERN_REG(ID_AA64ISAR1_EL1, id_aa64isar1);
262 _FETCH_KERN_REG(ID_AA64ISAR2_EL1, id_aa64isar2);
263 _FETCH_KERN_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0);
264 _FETCH_KERN_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1);
265 _FETCH_KERN_REG(ID_AA64MMFR2_EL1, id_aa64mmfr2);
266 _FETCH_KERN_REG(ID_AA64PFR0_EL1, id_aa64pfr0);
267 _FETCH_KERN_REG(ID_AA64PFR1_EL1, id_aa64pfr1);
268 #undef _FETCH_KERN_REG
269 return (0);
270 }
271
272 static void
273 vcpu_cleanup(struct vcpu *vcpu, bool destroy)
274 {
275 vmmops_vcpu_cleanup(vcpu->cookie);
276 vcpu->cookie = NULL;
277 if (destroy) {
278 vmm_stat_free(vcpu->stats);
279 fpu_save_area_free(vcpu->guestfpu);
280 vcpu_lock_destroy(vcpu);
281 }
282 }
283
284 static struct vcpu *
285 vcpu_alloc(struct vm *vm, int vcpu_id)
286 {
287 struct vcpu *vcpu;
288
289 KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
290 ("vcpu_alloc: invalid vcpu %d", vcpu_id));
291
292 vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
293 vcpu_lock_init(vcpu);
294 vcpu->state = VCPU_IDLE;
295 vcpu->hostcpu = NOCPU;
296 vcpu->vcpuid = vcpu_id;
297 vcpu->vm = vm;
298 vcpu->guestfpu = fpu_save_area_alloc();
299 vcpu->stats = vmm_stat_alloc();
300 return (vcpu);
301 }
302
303 static void
304 vcpu_init(struct vcpu *vcpu)
305 {
306 vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
307 MPASS(vcpu->cookie != NULL);
308 fpu_save_area_reset(vcpu->guestfpu);
309 vmm_stat_init(vcpu->stats);
310 }
311
312 struct vm_exit *
313 vm_exitinfo(struct vcpu *vcpu)
314 {
315 return (&vcpu->exitinfo);
316 }
317
318 static int
319 vmm_init(void)
320 {
321 int error;
322
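/*
 * Default to one vcpu per host CPU; the hw.vmm.maxcpu tunable may
 * override this, subject to the VM_MAXCPU clamp below.
 */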
323 vm_maxcpu = mp_ncpus;
324 TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
325
326 if (vm_maxcpu > VM_MAXCPU) {
327 printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
328 vm_maxcpu = VM_MAXCPU;
329 }
330 if (vm_maxcpu == 0)
331 vm_maxcpu = 1;
332
333 error = vmm_regs_init(&vmm_arch_regs, &vmm_arch_regs_masks);
334 if (error != 0)
335 return (error);
336
337 return (vmmops_modinit(0));
338 }
339
340 static int
341 vmm_handler(module_t mod, int what, void *arg)
342 {
343 int error;
344
345 switch (what) {
346 case MOD_LOAD:
347 /* TODO: if (vmm_is_hw_supported()) { */
348 vmmdev_init();
349 error = vmm_init();
350 if (error == 0)
351 vmm_initialized = true;
352 break;
353 case MOD_UNLOAD:
354 /* TODO: if (vmm_is_hw_supported()) { */
355 error = vmmdev_cleanup();
356 if (error == 0 && vmm_initialized) {
357 error = vmmops_modcleanup();
358 if (error)
359 vmm_initialized = false;
360 }
361 break;
362 default:
363 error = 0;
364 break;
365 }
366 return (error);
367 }
368
369 static moduledata_t vmm_kmod = {
370 "vmm",
371 vmm_handler,
372 NULL
373 };
374
375 /*
376 * vmm initialization has the following dependencies:
377 *
378 * - HYP initialization requires smp_rendezvous() and therefore must happen
379 * after SMP is fully functional (after SI_SUB_SMP).
380 */
381 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
382 MODULE_VERSION(vmm, 1);
383
384 static void
385 vm_init(struct vm *vm, bool create)
386 {
387 int i;
388
389 vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
390 MPASS(vm->cookie != NULL);
391
392 CPU_ZERO(&vm->active_cpus);
393 CPU_ZERO(&vm->debug_cpus);
394
395 vm->suspend = 0;
396 CPU_ZERO(&vm->suspended_cpus);
397
398 memset(vm->mmio_region, 0, sizeof(vm->mmio_region));
399 memset(vm->special_reg, 0, sizeof(vm->special_reg));
400
401 if (!create) {
402 for (i = 0; i < vm->maxcpus; i++) {
403 if (vm->vcpu[i] != NULL)
404 vcpu_init(vm->vcpu[i]);
405 }
406 }
407 }
408
409 void
410 vm_disable_vcpu_creation(struct vm *vm)
411 {
412 sx_xlock(&vm->vcpus_init_lock);
413 vm->dying = true;
414 sx_xunlock(&vm->vcpus_init_lock);
415 }
416
417 struct vcpu *
418 vm_alloc_vcpu(struct vm *vm, int vcpuid)
419 {
420 struct vcpu *vcpu;
421
422 if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
423 return (NULL);
424
425 /* Some interrupt controllers may have a CPU limit */
426 if (vcpuid >= vgic_max_cpu_count(vm->cookie))
427 return (NULL);
428
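/*
 * Lockless fast path: each vm->vcpu[] slot is written at most once,
 * with a release store, so a non-NULL pointer read here can be used
 * without taking vcpus_init_lock.
 */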
429 vcpu = atomic_load_ptr(&vm->vcpu[vcpuid]);
430 if (__predict_true(vcpu != NULL))
431 return (vcpu);
432
433 sx_xlock(&vm->vcpus_init_lock);
434 vcpu = vm->vcpu[vcpuid];
435 if (vcpu == NULL && !vm->dying) {
436 vcpu = vcpu_alloc(vm, vcpuid);
437 vcpu_init(vcpu);
438
439 /*
440 * Ensure vCPU is fully created before updating pointer
441 * to permit unlocked reads above.
442 */
443 atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
444 (uintptr_t)vcpu);
445 }
446 sx_xunlock(&vm->vcpus_init_lock);
447 return (vcpu);
448 }
449
450 void
451 vm_slock_vcpus(struct vm *vm)
452 {
453 sx_slock(&vm->vcpus_init_lock);
454 }
455
456 void
457 vm_unlock_vcpus(struct vm *vm)
458 {
459 sx_unlock(&vm->vcpus_init_lock);
460 }
461
462 int
463 vm_create(const char *name, struct vm **retvm)
464 {
465 struct vm *vm;
466 struct vmspace *vmspace;
467
468 /*
469 * If vmm.ko could not be successfully initialized then don't attempt
470 * to create the virtual machine.
471 */
472 if (!vmm_initialized)
473 return (ENXIO);
474
475 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
476 return (EINVAL);
477
478 vmspace = vmmops_vmspace_alloc(0, 1ul << 39);
479 if (vmspace == NULL)
480 return (ENOMEM);
481
482 vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
483 strcpy(vm->name, name);
484 vm->vmspace = vmspace;
485 sx_init(&vm->mem_segs_lock, "vm mem_segs");
486 sx_init(&vm->vcpus_init_lock, "vm vcpus");
487
488 vm->sockets = 1;
489 vm->cores = 1; /* XXX backwards compatibility */
490 vm->threads = 1; /* XXX backwards compatibility */
491 vm->maxcpus = vm_maxcpu;
492
493 vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
494 M_WAITOK | M_ZERO);
495
496 vm_init(vm, true);
497
498 *retvm = vm;
499 return (0);
500 }
501
502 void
503 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
504 uint16_t *threads, uint16_t *maxcpus)
505 {
506 *sockets = vm->sockets;
507 *cores = vm->cores;
508 *threads = vm->threads;
509 *maxcpus = vm->maxcpus;
510 }
511
512 uint16_t
513 vm_get_maxcpus(struct vm *vm)
514 {
515 return (vm->maxcpus);
516 }
517
518 int
519 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
520 uint16_t threads, uint16_t maxcpus)
521 {
522 /* Ignore maxcpus. */
523 if ((sockets * cores * threads) > vm->maxcpus)
524 return (EINVAL);
525 vm->sockets = sockets;
526 vm->cores = cores;
527 vm->threads = threads;
528 return(0);
529 }
530
531 static void
532 vm_cleanup(struct vm *vm, bool destroy)
533 {
534 struct mem_map *mm;
535 pmap_t pmap __diagused;
536 int i;
537
538 if (destroy) {
539 pmap = vmspace_pmap(vm->vmspace);
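/*
 * Clear any cached reference to this guest's pmap on the current CPU
 * and assert that no other CPU still holds one.
 */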
540 sched_pin();
541 PCPU_SET(curvmpmap, NULL);
542 sched_unpin();
543 CPU_FOREACH(i) {
544 MPASS(cpuid_to_pcpu[i]->pc_curvmpmap != pmap);
545 }
546 }
547
548 vgic_detach_from_vm(vm->cookie);
549
550 for (i = 0; i < vm->maxcpus; i++) {
551 if (vm->vcpu[i] != NULL)
552 vcpu_cleanup(vm->vcpu[i], destroy);
553 }
554
555 vmmops_cleanup(vm->cookie);
556
557 /*
558 * System memory is removed from the guest address space only when
559 * the VM is destroyed. This is because the mapping remains the same
560 * across VM reset.
561 *
562 * Device memory can be relocated by the guest (e.g. using PCI BARs)
563 * so those mappings are removed on a VM reset.
564 */
565 if (!destroy) {
566 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
567 mm = &vm->mem_maps[i];
568 if (destroy || !sysmem_mapping(vm, mm))
569 vm_free_memmap(vm, i);
570 }
571 }
572
573 if (destroy) {
574 for (i = 0; i < VM_MAX_MEMSEGS; i++)
575 vm_free_memseg(vm, i);
576
577 vmmops_vmspace_free(vm->vmspace);
578 vm->vmspace = NULL;
579
580 for (i = 0; i < vm->maxcpus; i++)
581 free(vm->vcpu[i], M_VMM);
582 free(vm->vcpu, M_VMM);
583 sx_destroy(&vm->vcpus_init_lock);
584 sx_destroy(&vm->mem_segs_lock);
585 }
586 }
587
588 void
589 vm_destroy(struct vm *vm)
590 {
591 vm_cleanup(vm, true);
592 free(vm, M_VMM);
593 }
594
595 int
596 vm_reinit(struct vm *vm)
597 {
598 int error;
599
600 /*
601 * A virtual machine can be reset only if all vcpus are suspended.
602 */
603 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
604 vm_cleanup(vm, false);
605 vm_init(vm, false);
606 error = 0;
607 } else {
608 error = EBUSY;
609 }
610
611 return (error);
612 }
613
614 const char *
615 vm_name(struct vm *vm)
616 {
617 return (vm->name);
618 }
619
620 void
621 vm_slock_memsegs(struct vm *vm)
622 {
623 sx_slock(&vm->mem_segs_lock);
624 }
625
626 void
627 vm_xlock_memsegs(struct vm *vm)
628 {
629 sx_xlock(&vm->mem_segs_lock);
630 }
631
632 void
633 vm_unlock_memsegs(struct vm *vm)
634 {
635 sx_unlock(&vm->mem_segs_lock);
636 }
637
638 /*
639 * Return 'true' if 'gpa' is allocated in the guest address space.
640 *
641 * This function is called in the context of a running vcpu which acts as
642 * an implicit lock on 'vm->mem_maps[]'.
643 */
644 bool
645 vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa)
646 {
647 struct vm *vm = vcpu->vm;
648 struct mem_map *mm;
649 int i;
650
651 #ifdef INVARIANTS
652 int hostcpu, state;
653 state = vcpu_get_state(vcpu, &hostcpu);
654 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
655 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
656 #endif
657
658 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
659 mm = &vm->mem_maps[i];
660 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
661 return (true); /* 'gpa' is sysmem or devmem */
662 }
663
664 return (false);
665 }
666
667 int
668 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
669 {
670 struct mem_seg *seg;
671 vm_object_t obj;
672
673 sx_assert(&vm->mem_segs_lock, SX_XLOCKED);
674
675 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
676 return (EINVAL);
677
678 if (len == 0 || (len & PAGE_MASK))
679 return (EINVAL);
680
681 seg = &vm->mem_segs[ident];
682 if (seg->object != NULL) {
683 if (seg->len == len && seg->sysmem == sysmem)
684 return (EEXIST);
685 else
686 return (EINVAL);
687 }
688
689 obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
690 if (obj == NULL)
691 return (ENOMEM);
692
693 seg->len = len;
694 seg->object = obj;
695 seg->sysmem = sysmem;
696 return (0);
697 }
698
699 int
700 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
701 vm_object_t *objptr)
702 {
703 struct mem_seg *seg;
704
705 sx_assert(&vm->mem_segs_lock, SX_LOCKED);
706
707 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
708 return (EINVAL);
709
710 seg = &vm->mem_segs[ident];
711 if (len)
712 *len = seg->len;
713 if (sysmem)
714 *sysmem = seg->sysmem;
715 if (objptr)
716 *objptr = seg->object;
717 return (0);
718 }
719
720 void
721 vm_free_memseg(struct vm *vm, int ident)
722 {
723 struct mem_seg *seg;
724
725 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
726 ("%s: invalid memseg ident %d", __func__, ident));
727
728 seg = &vm->mem_segs[ident];
729 if (seg->object != NULL) {
730 vm_object_deallocate(seg->object);
731 bzero(seg, sizeof(struct mem_seg));
732 }
733 }
734
735 int
736 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
737 size_t len, int prot, int flags)
738 {
739 struct mem_seg *seg;
740 struct mem_map *m, *map;
741 vm_ooffset_t last;
742 int i, error;
743
744 if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
745 return (EINVAL);
746
747 if (flags & ~VM_MEMMAP_F_WIRED)
748 return (EINVAL);
749
750 if (segid < 0 || segid >= VM_MAX_MEMSEGS)
751 return (EINVAL);
752
753 seg = &vm->mem_segs[segid];
754 if (seg->object == NULL)
755 return (EINVAL);
756
757 last = first + len;
758 if (first < 0 || first >= last || last > seg->len)
759 return (EINVAL);
760
761 if ((gpa | first | last) & PAGE_MASK)
762 return (EINVAL);
763
764 map = NULL;
765 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
766 m = &vm->mem_maps[i];
767 if (m->len == 0) {
768 map = m;
769 break;
770 }
771 }
772
773 if (map == NULL)
774 return (ENOSPC);
775
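/*
 * Map the requested range of the segment's object at the fixed guest
 * physical address ('gpa' is used as given, VMFS_NO_SPACE), then
 * optionally wire it.
 */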
776 error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
777 len, 0, VMFS_NO_SPACE, prot, prot, 0);
778 if (error != KERN_SUCCESS)
779 return (EFAULT);
780
781 vm_object_reference(seg->object);
782
783 if (flags & VM_MEMMAP_F_WIRED) {
784 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
785 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
786 if (error != KERN_SUCCESS) {
787 vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
788 return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM :
789 EFAULT);
790 }
791 }
792
793 map->gpa = gpa;
794 map->len = len;
795 map->segoff = first;
796 map->segid = segid;
797 map->prot = prot;
798 map->flags = flags;
799 return (0);
800 }
801
802 int
803 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
804 {
805 struct mem_map *m;
806 int i;
807
808 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
809 m = &vm->mem_maps[i];
810 if (m->gpa == gpa && m->len == len) {
811 vm_free_memmap(vm, i);
812 return (0);
813 }
814 }
815
816 return (EINVAL);
817 }
818
819 int
820 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
821 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
822 {
823 struct mem_map *mm, *mmnext;
824 int i;
825
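/* Find the populated mapping with the lowest gpa that is >= *gpa. */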
826 mmnext = NULL;
827 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
828 mm = &vm->mem_maps[i];
829 if (mm->len == 0 || mm->gpa < *gpa)
830 continue;
831 if (mmnext == NULL || mm->gpa < mmnext->gpa)
832 mmnext = mm;
833 }
834
835 if (mmnext != NULL) {
836 *gpa = mmnext->gpa;
837 if (segid)
838 *segid = mmnext->segid;
839 if (segoff)
840 *segoff = mmnext->segoff;
841 if (len)
842 *len = mmnext->len;
843 if (prot)
844 *prot = mmnext->prot;
845 if (flags)
846 *flags = mmnext->flags;
847 return (0);
848 } else {
849 return (ENOENT);
850 }
851 }
852
853 static void
854 vm_free_memmap(struct vm *vm, int ident)
855 {
856 struct mem_map *mm;
857 int error __diagused;
858
859 mm = &vm->mem_maps[ident];
860 if (mm->len) {
861 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
862 mm->gpa + mm->len);
863 KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
864 __func__, error));
865 bzero(mm, sizeof(struct mem_map));
866 }
867 }
868
869 static __inline bool
870 sysmem_mapping(struct vm *vm, struct mem_map *mm)
871 {
872
873 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
874 return (true);
875 else
876 return (false);
877 }
878
879 vm_paddr_t
880 vmm_sysmem_maxaddr(struct vm *vm)
881 {
882 struct mem_map *mm;
883 vm_paddr_t maxaddr;
884 int i;
885
886 maxaddr = 0;
887 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
888 mm = &vm->mem_maps[i];
889 if (sysmem_mapping(vm, mm)) {
890 if (maxaddr < mm->gpa + mm->len)
891 maxaddr = mm->gpa + mm->len;
892 }
893 }
894 return (maxaddr);
895 }
896
897 int
898 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
899 uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
900 {
901
902 vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault);
903 return (0);
904 }
905
906 static int
907 vmm_reg_raz(struct vcpu *vcpu, uint64_t *rval, void *arg)
908 {
909 *rval = 0;
910 return (0);
911 }
912
913 static int
914 vmm_reg_read_arg(struct vcpu *vcpu, uint64_t *rval, void *arg)
915 {
916 *rval = *(uint64_t *)arg;
917 return (0);
918 }
919
920 static int
921 vmm_reg_wi(struct vcpu *vcpu, uint64_t wval, void *arg)
922 {
923 return (0);
924 }
925
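/*
 * Default handlers for system register accesses that trap to the
 * hypervisor. Each entry is matched against the ISS field of the ESR
 * for the trapped MSR/MRS instruction, under the entry's mask.
 */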
926 static const struct vmm_special_reg vmm_special_regs[] = {
927 #define SPECIAL_REG(_reg, _read, _write) \
928 { \
929 .esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) | \
930 ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) | \
931 ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) | \
932 ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) | \
933 ((_reg ## _op2) << ISS_MSR_OP2_SHIFT), \
934 .esr_mask = ISS_MSR_REG_MASK, \
935 .reg_read = (_read), \
936 .reg_write = (_write), \
937 .arg = NULL, \
938 }
939 #define ID_SPECIAL_REG(_reg, _name) \
940 { \
941 .esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) | \
942 ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) | \
943 ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) | \
944 ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) | \
945 ((_reg ## _op2) << ISS_MSR_OP2_SHIFT), \
946 .esr_mask = ISS_MSR_REG_MASK, \
947 .reg_read = vmm_reg_read_arg, \
948 .reg_write = vmm_reg_wi, \
949 .arg = &(vmm_arch_regs._name), \
950 }
951
952 /* ID registers */
953 ID_SPECIAL_REG(ID_AA64PFR0_EL1, id_aa64pfr0),
954 ID_SPECIAL_REG(ID_AA64DFR0_EL1, id_aa64dfr0),
955 ID_SPECIAL_REG(ID_AA64ISAR0_EL1, id_aa64isar0),
956 ID_SPECIAL_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0),
957 ID_SPECIAL_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1),
958
959 /*
960 * All other ID registers are read as zero.
961 * They are all in the op0=3, op1=0, CRn=0, CRm={0..7} space.
962 */
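/*
 * Matching on op0/op1/CRn and only the top bit of CRm (0x8) covers
 * CRm values 0 through 7 with a single table entry.
 */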
963 {
964 .esr_iss = (3 << ISS_MSR_OP0_SHIFT) |
965 (0 << ISS_MSR_OP1_SHIFT) |
966 (0 << ISS_MSR_CRn_SHIFT) |
967 (0 << ISS_MSR_CRm_SHIFT),
968 .esr_mask = ISS_MSR_OP0_MASK | ISS_MSR_OP1_MASK |
969 ISS_MSR_CRn_MASK | (0x8 << ISS_MSR_CRm_SHIFT),
970 .reg_read = vmm_reg_raz,
971 .reg_write = vmm_reg_wi,
972 .arg = NULL,
973 },
974
975 /* Counter physical registers */
976 SPECIAL_REG(CNTP_CTL_EL0, vtimer_phys_ctl_read, vtimer_phys_ctl_write),
977 SPECIAL_REG(CNTP_CVAL_EL0, vtimer_phys_cval_read,
978 vtimer_phys_cval_write),
979 SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read,
980 vtimer_phys_tval_write),
981 SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write),
982 #undef SPECIAL_REG
983 };
984
985 void
986 vm_register_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask,
987 reg_read_t reg_read, reg_write_t reg_write, void *arg)
988 {
989 int i;
990
991 for (i = 0; i < nitems(vm->special_reg); i++) {
992 if (vm->special_reg[i].esr_iss == 0 &&
993 vm->special_reg[i].esr_mask == 0) {
994 vm->special_reg[i].esr_iss = iss;
995 vm->special_reg[i].esr_mask = mask;
996 vm->special_reg[i].reg_read = reg_read;
997 vm->special_reg[i].reg_write = reg_write;
998 vm->special_reg[i].arg = arg;
999 return;
1000 }
1001 }
1002
1003 panic("%s: No free special register slot", __func__);
1004 }
1005
1006 void
1007 vm_deregister_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask)
1008 {
1009 int i;
1010
1011 for (i = 0; i < nitems(vm->special_reg); i++) {
1012 if (vm->special_reg[i].esr_iss == iss &&
1013 vm->special_reg[i].esr_mask == mask) {
1014 memset(&vm->special_reg[i], 0,
1015 sizeof(vm->special_reg[i]));
1016 return;
1017 }
1018 }
1019
1020 panic("%s: Invalid special register: iss %lx mask %lx", __func__, iss,
1021 mask);
1022 }
1023
1024 static int
1025 vm_handle_reg_emul(struct vcpu *vcpu, bool *retu)
1026 {
1027 struct vm *vm;
1028 struct vm_exit *vme;
1029 struct vre *vre;
1030 int i, rv;
1031
1032 vm = vcpu->vm;
1033 vme = &vcpu->exitinfo;
1034 vre = &vme->u.reg_emul.vre;
1035
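/*
 * Handlers registered for this particular VM (e.g. by emulated
 * devices) take precedence over the global vmm_special_regs table.
 */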
1036 for (i = 0; i < nitems(vm->special_reg); i++) {
1037 if (vm->special_reg[i].esr_iss == 0 &&
1038 vm->special_reg[i].esr_mask == 0)
1039 continue;
1040
1041 if ((vre->inst_syndrome & vm->special_reg[i].esr_mask) ==
1042 vm->special_reg[i].esr_iss) {
1043 rv = vmm_emulate_register(vcpu, vre,
1044 vm->special_reg[i].reg_read,
1045 vm->special_reg[i].reg_write,
1046 vm->special_reg[i].arg);
1047 if (rv == 0) {
1048 *retu = false;
1049 }
1050 return (rv);
1051 }
1052 }
1053 for (i = 0; i < nitems(vmm_special_regs); i++) {
1054 if ((vre->inst_syndrome & vmm_special_regs[i].esr_mask) ==
1055 vmm_special_regs[i].esr_iss) {
1056 rv = vmm_emulate_register(vcpu, vre,
1057 vmm_special_regs[i].reg_read,
1058 vmm_special_regs[i].reg_write,
1059 vmm_special_regs[i].arg);
1060 if (rv == 0) {
1061 *retu = false;
1062 }
1063 return (rv);
1064 }
1065 }
1066
1067
1068 *retu = true;
1069 return (0);
1070 }
1071
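/*
 * Device models register MMIO handlers for the guest physical ranges
 * they emulate. For example, a hypothetical device occupying one page
 * at IPA 0x10000000 would register itself with (illustrative names):
 *
 *	vm_register_inst_handler(vm, 0x10000000, PAGE_SIZE,
 *	    mydev_mmio_read, mydev_mmio_write);
 */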
1072 void
1073 vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
1074 mem_region_read_t mmio_read, mem_region_write_t mmio_write)
1075 {
1076 int i;
1077
1078 for (i = 0; i < nitems(vm->mmio_region); i++) {
1079 if (vm->mmio_region[i].start == 0 &&
1080 vm->mmio_region[i].end == 0) {
1081 vm->mmio_region[i].start = start;
1082 vm->mmio_region[i].end = start + size;
1083 vm->mmio_region[i].read = mmio_read;
1084 vm->mmio_region[i].write = mmio_write;
1085 return;
1086 }
1087 }
1088
1089 panic("%s: No free MMIO region", __func__);
1090 }
1091
1092 void
1093 vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
1094 {
1095 int i;
1096
1097 for (i = 0; i < nitems(vm->mmio_region); i++) {
1098 if (vm->mmio_region[i].start == start &&
1099 vm->mmio_region[i].end == start + size) {
1100 memset(&vm->mmio_region[i], 0,
1101 sizeof(vm->mmio_region[i]));
1102 return;
1103 }
1104 }
1105
1106 panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
1107 start + size);
1108 }
1109
1110 static int
1111 vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
1112 {
1113 struct vm *vm;
1114 struct vm_exit *vme;
1115 struct vie *vie;
1116 struct hyp *hyp;
1117 uint64_t fault_ipa;
1118 struct vm_guest_paging *paging;
1119 struct vmm_mmio_region *vmr;
1120 int error, i;
1121
1122 vm = vcpu->vm;
1123 hyp = vm->cookie;
1124 if (!hyp->vgic_attached)
1125 goto out_user;
1126
1127 vme = &vcpu->exitinfo;
1128 vie = &vme->u.inst_emul.vie;
1129 paging = &vme->u.inst_emul.paging;
1130
1131 fault_ipa = vme->u.inst_emul.gpa;
1132
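/*
 * Look for a registered MMIO handler covering the faulting IPA; if
 * none is found the access is forwarded to userspace for emulation.
 */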
1133 vmr = NULL;
1134 for (i = 0; i < nitems(vm->mmio_region); i++) {
1135 if (vm->mmio_region[i].start <= fault_ipa &&
1136 vm->mmio_region[i].end > fault_ipa) {
1137 vmr = &vm->mmio_region[i];
1138 break;
1139 }
1140 }
1141 if (vmr == NULL)
1142 goto out_user;
1143
1144 error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
1145 vmr->read, vmr->write, retu);
1146 return (error);
1147
1148 out_user:
1149 *retu = true;
1150 return (0);
1151 }
1152
1153 int
1154 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1155 {
1156 int i;
1157
1158 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1159 return (EINVAL);
1160
1161 if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
1162 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1163 vm->suspend, how);
1164 return (EALREADY);
1165 }
1166
1167 VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1168
1169 /*
1170 * Notify all active vcpus that they are now suspended.
1171 */
1172 for (i = 0; i < vm->maxcpus; i++) {
1173 if (CPU_ISSET(i, &vm->active_cpus))
1174 vcpu_notify_event(vm_vcpu(vm, i));
1175 }
1176
1177 return (0);
1178 }
1179
1180 void
1181 vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
1182 {
1183 struct vm *vm = vcpu->vm;
1184 struct vm_exit *vmexit;
1185
1186 KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1187 ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1188
1189 vmexit = vm_exitinfo(vcpu);
1190 vmexit->pc = pc;
1191 vmexit->inst_length = 4;
1192 vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1193 vmexit->u.suspended.how = vm->suspend;
1194 }
1195
1196 void
1197 vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
1198 {
1199 struct vm_exit *vmexit;
1200
1201 vmexit = vm_exitinfo(vcpu);
1202 vmexit->pc = pc;
1203 vmexit->inst_length = 4;
1204 vmexit->exitcode = VM_EXITCODE_DEBUG;
1205 }
1206
1207 int
1208 vm_activate_cpu(struct vcpu *vcpu)
1209 {
1210 struct vm *vm = vcpu->vm;
1211
1212 if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
1213 return (EBUSY);
1214
1215 CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
1216 return (0);
1217
1218 }
1219
1220 int
1221 vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
1222 {
1223 if (vcpu == NULL) {
1224 vm->debug_cpus = vm->active_cpus;
1225 for (int i = 0; i < vm->maxcpus; i++) {
1226 if (CPU_ISSET(i, &vm->active_cpus))
1227 vcpu_notify_event(vm_vcpu(vm, i));
1228 }
1229 } else {
1230 if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
1231 return (EINVAL);
1232
1233 CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
1234 vcpu_notify_event(vcpu);
1235 }
1236 return (0);
1237 }
1238
1239 int
1240 vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
1241 {
1242
1243 if (vcpu == NULL) {
1244 CPU_ZERO(&vm->debug_cpus);
1245 } else {
1246 if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
1247 return (EINVAL);
1248
1249 CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
1250 }
1251 return (0);
1252 }
1253
1254 int
1255 vcpu_debugged(struct vcpu *vcpu)
1256 {
1257
1258 return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
1259 }
1260
1261 cpuset_t
1262 vm_active_cpus(struct vm *vm)
1263 {
1264
1265 return (vm->active_cpus);
1266 }
1267
1268 cpuset_t
1269 vm_debug_cpus(struct vm *vm)
1270 {
1271
1272 return (vm->debug_cpus);
1273 }
1274
1275 cpuset_t
1276 vm_suspended_cpus(struct vm *vm)
1277 {
1278
1279 return (vm->suspended_cpus);
1280 }
1281
1282
1283 void *
1284 vcpu_stats(struct vcpu *vcpu)
1285 {
1286
1287 return (vcpu->stats);
1288 }
1289
1290 /*
1291 * This function is called to ensure that a vcpu "sees" a pending event
1292 * as soon as possible:
1293 * - If the vcpu thread is sleeping then it is woken up.
1294 * - If the vcpu is running on a different host_cpu then an IPI will be directed
1295 * to the host_cpu to cause the vcpu to trap into the hypervisor.
1296 */
1297 static void
1298 vcpu_notify_event_locked(struct vcpu *vcpu)
1299 {
1300 int hostcpu;
1301
1302 hostcpu = vcpu->hostcpu;
1303 if (vcpu->state == VCPU_RUNNING) {
1304 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1305 if (hostcpu != curcpu) {
1306 ipi_cpu(hostcpu, vmm_ipinum);
1307 } else {
1308 /*
1309 * If the 'vcpu' is running on 'curcpu' then it must
1310 * be sending a notification to itself (e.g. SELF_IPI).
1311 * The pending event will be picked up when the vcpu
1312 * transitions back to guest context.
1313 */
1314 }
1315 } else {
1316 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1317 "with hostcpu %d", vcpu->state, hostcpu));
1318 if (vcpu->state == VCPU_SLEEPING)
1319 wakeup_one(vcpu);
1320 }
1321 }
1322
1323 void
1324 vcpu_notify_event(struct vcpu *vcpu)
1325 {
1326 vcpu_lock(vcpu);
1327 vcpu_notify_event_locked(vcpu);
1328 vcpu_unlock(vcpu);
1329 }
1330
1331 static void
1332 restore_guest_fpustate(struct vcpu *vcpu)
1333 {
1334
1335 /* flush host state to the pcb */
1336 vfp_save_state(curthread, curthread->td_pcb);
1337 /* Ensure the VFP state will be re-loaded when exiting the guest */
1338 PCPU_SET(fpcurthread, NULL);
1339
1340 /* restore guest FPU state */
1341 vfp_enable();
1342 vfp_restore(vcpu->guestfpu);
1343
1344 /*
1345 * The FPU is now "dirty" with the guest's state so turn on emulation
1346 * to trap any access to the FPU by the host.
1347 */
1348 vfp_disable();
1349 }
1350
1351 static void
1352 save_guest_fpustate(struct vcpu *vcpu)
1353 {
1354 if ((READ_SPECIALREG(cpacr_el1) & CPACR_FPEN_MASK) !=
1355 CPACR_FPEN_TRAP_ALL1)
1356 panic("VFP not enabled in host!");
1357
1358 /* save guest FPU state */
1359 vfp_enable();
1360 vfp_store(vcpu->guestfpu);
1361 vfp_disable();
1362
1363 KASSERT(PCPU_GET(fpcurthread) == NULL,
1364 ("%s: fpcurthread set with guest registers", __func__));
1365 }
1366 static int
1367 vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
1368 bool from_idle)
1369 {
1370 int error;
1371
1372 vcpu_assert_locked(vcpu);
1373
1374 /*
1375 * State transitions from the vmmdev_ioctl() must always begin from
1376 * the VCPU_IDLE state. This guarantees that there is only a single
1377 * ioctl() operating on a vcpu at any point.
1378 */
1379 if (from_idle) {
1380 while (vcpu->state != VCPU_IDLE) {
1381 vcpu_notify_event_locked(vcpu);
1382 msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
1383 }
1384 } else {
1385 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1386 "vcpu idle state"));
1387 }
1388
1389 if (vcpu->state == VCPU_RUNNING) {
1390 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1391 "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1392 } else {
1393 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1394 "vcpu that is not running", vcpu->hostcpu));
1395 }
1396
1397 /*
1398 * The following state transitions are allowed:
1399 * IDLE -> FROZEN -> IDLE
1400 * FROZEN -> RUNNING -> FROZEN
1401 * FROZEN -> SLEEPING -> FROZEN
1402 */
1403 switch (vcpu->state) {
1404 case VCPU_IDLE:
1405 case VCPU_RUNNING:
1406 case VCPU_SLEEPING:
1407 error = (newstate != VCPU_FROZEN);
1408 break;
1409 case VCPU_FROZEN:
1410 error = (newstate == VCPU_FROZEN);
1411 break;
1412 default:
1413 error = 1;
1414 break;
1415 }
1416
1417 if (error)
1418 return (EBUSY);
1419
1420 vcpu->state = newstate;
1421 if (newstate == VCPU_RUNNING)
1422 vcpu->hostcpu = curcpu;
1423 else
1424 vcpu->hostcpu = NOCPU;
1425
1426 if (newstate == VCPU_IDLE)
1427 wakeup(&vcpu->state);
1428
1429 return (0);
1430 }
1431
1432 static void
1433 vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
1434 {
1435 int error;
1436
1437 if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
1438 panic("Error %d setting state to %d\n", error, newstate);
1439 }
1440
1441 static void
1442 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
1443 {
1444 int error;
1445
1446 if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
1447 panic("Error %d setting state to %d", error, newstate);
1448 }
1449
1450 int
1451 vm_get_capability(struct vcpu *vcpu, int type, int *retval)
1452 {
1453 if (type < 0 || type >= VM_CAP_MAX)
1454 return (EINVAL);
1455
1456 return (vmmops_getcap(vcpu->cookie, type, retval));
1457 }
1458
1459 int
1460 vm_set_capability(struct vcpu *vcpu, int type, int val)
1461 {
1462 if (type < 0 || type >= VM_CAP_MAX)
1463 return (EINVAL);
1464
1465 return (vmmops_setcap(vcpu->cookie, type, val));
1466 }
1467
1468 struct vm *
1469 vcpu_vm(struct vcpu *vcpu)
1470 {
1471 return (vcpu->vm);
1472 }
1473
1474 int
1475 vcpu_vcpuid(struct vcpu *vcpu)
1476 {
1477 return (vcpu->vcpuid);
1478 }
1479
1480 void *
1481 vcpu_get_cookie(struct vcpu *vcpu)
1482 {
1483 return (vcpu->cookie);
1484 }
1485
1486 struct vcpu *
1487 vm_vcpu(struct vm *vm, int vcpuid)
1488 {
1489 return (vm->vcpu[vcpuid]);
1490 }
1491
1492 int
1493 vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
1494 {
1495 int error;
1496
1497 vcpu_lock(vcpu);
1498 error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1499 vcpu_unlock(vcpu);
1500
1501 return (error);
1502 }
1503
1504 enum vcpu_state
1505 vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
1506 {
1507 enum vcpu_state state;
1508
1509 vcpu_lock(vcpu);
1510 state = vcpu->state;
1511 if (hostcpu != NULL)
1512 *hostcpu = vcpu->hostcpu;
1513 vcpu_unlock(vcpu);
1514
1515 return (state);
1516 }
1517
1518 static void *
1519 _vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1520 void **cookie)
1521 {
1522 int i, count, pageoff;
1523 struct mem_map *mm;
1524 vm_page_t m;
1525
1526 pageoff = gpa & PAGE_MASK;
1527 if (len > PAGE_SIZE - pageoff)
1528 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1529
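/*
 * Only system memory mappings can be held. Wire the single page
 * backing 'gpa' so the caller can access it through the direct map.
 */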
1530 count = 0;
1531 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1532 mm = &vm->mem_maps[i];
1533 if (sysmem_mapping(vm, mm) && gpa >= mm->gpa &&
1534 gpa < mm->gpa + mm->len) {
1535 count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1536 trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1537 break;
1538 }
1539 }
1540
1541 if (count == 1) {
1542 *cookie = m;
1543 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1544 } else {
1545 *cookie = NULL;
1546 return (NULL);
1547 }
1548 }
1549
1550 void *
1551 vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int reqprot,
1552 void **cookie)
1553 {
1554 #ifdef INVARIANTS
1555 /*
1556 * The current vcpu should be frozen to ensure 'vm_memmap[]'
1557 * stability.
1558 */
1559 int state = vcpu_get_state(vcpu, NULL);
1560 KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1561 __func__, state));
1562 #endif
1563 return (_vm_gpa_hold(vcpu->vm, gpa, len, reqprot, cookie));
1564 }
1565
1566 void *
1567 vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1568 void **cookie)
1569 {
1570 sx_assert(&vm->mem_segs_lock, SX_LOCKED);
1571 return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie));
1572 }
1573
1574 void
1575 vm_gpa_release(void *cookie)
1576 {
1577 vm_page_t m = cookie;
1578
1579 vm_page_unwire(m, PQ_ACTIVE);
1580 }
1581
1582 int
1583 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
1584 {
1585
1586 if (reg >= VM_REG_LAST)
1587 return (EINVAL);
1588
1589 return (vmmops_getreg(vcpu->cookie, reg, retval));
1590 }
1591
1592 int
1593 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
1594 {
1595 int error;
1596
1597 if (reg >= VM_REG_LAST)
1598 return (EINVAL);
1599 error = vmmops_setreg(vcpu->cookie, reg, val);
1600 if (error || reg != VM_REG_GUEST_PC)
1601 return (error);
1602
1603 vcpu->nextpc = val;
1604
1605 return (0);
1606 }
1607
1608 void *
1609 vm_get_cookie(struct vm *vm)
1610 {
1611 return (vm->cookie);
1612 }
1613
1614 int
1615 vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far)
1616 {
1617 return (vmmops_exception(vcpu->cookie, esr, far));
1618 }
1619
1620 int
1621 vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr)
1622 {
1623 return (vgic_attach_to_vm(vm->cookie, descr));
1624 }
1625
1626 int
1627 vm_assert_irq(struct vm *vm, uint32_t irq)
1628 {
1629 return (vgic_inject_irq(vm->cookie, -1, irq, true));
1630 }
1631
1632 int
1633 vm_deassert_irq(struct vm *vm, uint32_t irq)
1634 {
1635 return (vgic_inject_irq(vm->cookie, -1, irq, false));
1636 }
1637
1638 int
1639 vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
1640 int func)
1641 {
1642 /* TODO: Should we raise an SError? */
1643 return (vgic_inject_msi(vm->cookie, msg, addr));
1644 }
1645
1646 static int
1647 vm_handle_smccc_call(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
1648 {
1649 struct hypctx *hypctx;
1650 int i;
1651
1652 hypctx = vcpu_get_cookie(vcpu);
1653
1654 if ((hypctx->tf.tf_esr & ESR_ELx_ISS_MASK) != 0)
1655 return (1);
1656
1657 vme->exitcode = VM_EXITCODE_SMCCC;
1658 vme->u.smccc_call.func_id = hypctx->tf.tf_x[0];
1659 for (i = 0; i < nitems(vme->u.smccc_call.args); i++)
1660 vme->u.smccc_call.args[i] = hypctx->tf.tf_x[i + 1];
1661
1662 *retu = true;
1663 return (0);
1664 }
1665
1666 static int
1667 vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
1668 {
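/*
 * Idle the vcpu until the vgic has a pending interrupt for it or
 * vcpu_should_yield() asks us to break out of the wait.
 */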
1669 vcpu_lock(vcpu);
1670 while (1) {
1671 if (vgic_has_pending_irq(vcpu->cookie))
1672 break;
1673
1674 if (vcpu_should_yield(vcpu))
1675 break;
1676
1677 vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1678 /*
1679 * XXX msleep_spin() cannot be interrupted by signals so
1680 * wake up periodically to check pending signals.
1681 */
1682 msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz);
1683 vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1684 }
1685 vcpu_unlock(vcpu);
1686
1687 *retu = false;
1688 return (0);
1689 }
1690
1691 static int
1692 vm_handle_paging(struct vcpu *vcpu, bool *retu)
1693 {
1694 struct vm *vm = vcpu->vm;
1695 struct vm_exit *vme;
1696 struct vm_map *map;
1697 uint64_t addr, esr;
1698 pmap_t pmap;
1699 int ftype, rv;
1700
1701 vme = &vcpu->exitinfo;
1702
1703 pmap = vmspace_pmap(vcpu->vm->vmspace);
1704 addr = vme->u.paging.gpa;
1705 esr = vme->u.paging.esr;
1706
1707 /* The page exists, but the page table needs to be updated. */
1708 if (pmap_fault(pmap, esr, addr) == KERN_SUCCESS)
1709 return (0);
1710
1711 switch (ESR_ELx_EXCEPTION(esr)) {
1712 case EXCP_INSN_ABORT_L:
1713 case EXCP_DATA_ABORT_L:
1714 ftype = VM_PROT_EXECUTE | VM_PROT_READ | VM_PROT_WRITE;
1715 break;
1716 default:
1717 panic("%s: Invalid exception (esr = %lx)", __func__, esr);
1718 }
1719
1720 map = &vm->vmspace->vm_map;
1721 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL);
1722 if (rv != KERN_SUCCESS)
1723 return (EFAULT);
1724
1725 return (0);
1726 }
1727
1728 static int
1729 vm_handle_suspend(struct vcpu *vcpu, bool *retu)
1730 {
1731 struct vm *vm = vcpu->vm;
1732 int error, i;
1733 struct thread *td;
1734
1735 error = 0;
1736 td = curthread;
1737
1738 CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);
1739
1740 /*
1741 * Wait until all 'active_cpus' have suspended themselves.
1742 *
1743 * Since a VM may be suspended at any time, including when one or
1744 * more vcpus are doing a rendezvous, we need to call the rendezvous
1745 * handler while we are waiting to prevent a deadlock.
1746 */
1747 vcpu_lock(vcpu);
1748 while (error == 0) {
1749 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
1750 break;
1751
1752 vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1753 msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1754 vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1755 if (td_ast_pending(td, TDA_SUSPEND)) {
1756 vcpu_unlock(vcpu);
1757 error = thread_check_susp(td, false);
1758 vcpu_lock(vcpu);
1759 }
1760 }
1761 vcpu_unlock(vcpu);
1762
1763 /*
1764 * Wakeup the other sleeping vcpus and return to userspace.
1765 */
1766 for (i = 0; i < vm->maxcpus; i++) {
1767 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1768 vcpu_notify_event(vm_vcpu(vm, i));
1769 }
1770 }
1771
1772 *retu = true;
1773 return (error);
1774 }
1775
1776 int
1777 vm_run(struct vcpu *vcpu)
1778 {
1779 struct vm *vm = vcpu->vm;
1780 struct vm_eventinfo evinfo;
1781 int error, vcpuid;
1782 struct vm_exit *vme;
1783 bool retu;
1784 pmap_t pmap;
1785
1786 vcpuid = vcpu->vcpuid;
1787
1788 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1789 return (EINVAL);
1790
1791 if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1792 return (EINVAL);
1793
1794 pmap = vmspace_pmap(vm->vmspace);
1795 vme = &vcpu->exitinfo;
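/*
 * Only the suspend flag is wired up in the event info handed to the
 * backend run loop; the rptr and iptr hooks are unused on arm64.
 */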
1796 evinfo.rptr = NULL;
1797 evinfo.sptr = &vm->suspend;
1798 evinfo.iptr = NULL;
1799 restart:
1800 critical_enter();
1801
1802 restore_guest_fpustate(vcpu);
1803
1804 vcpu_require_state(vcpu, VCPU_RUNNING);
1805 error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
1806 vcpu_require_state(vcpu, VCPU_FROZEN);
1807
1808 save_guest_fpustate(vcpu);
1809
1810 critical_exit();
1811
1812 if (error == 0) {
1813 retu = false;
1814 switch (vme->exitcode) {
1815 case VM_EXITCODE_INST_EMUL:
1816 vcpu->nextpc = vme->pc + vme->inst_length;
1817 error = vm_handle_inst_emul(vcpu, &retu);
1818 break;
1819
1820 case VM_EXITCODE_REG_EMUL:
1821 vcpu->nextpc = vme->pc + vme->inst_length;
1822 error = vm_handle_reg_emul(vcpu, &retu);
1823 break;
1824
1825 case VM_EXITCODE_HVC:
1826 /*
1827 * The HVC instruction saves the address for the
1828 * next instruction as the return address.
1829 */
1830 vcpu->nextpc = vme->pc;
1831 /*
1832 * The PSCI call can change the exit information in the
1833 * case of suspend/reset/poweroff/cpu off/cpu on.
1834 */
1835 error = vm_handle_smccc_call(vcpu, vme, &retu);
1836 break;
1837
1838 case VM_EXITCODE_WFI:
1839 vcpu->nextpc = vme->pc + vme->inst_length;
1840 error = vm_handle_wfi(vcpu, vme, &retu);
1841 break;
1842
1843 case VM_EXITCODE_PAGING:
1844 vcpu->nextpc = vme->pc;
1845 error = vm_handle_paging(vcpu, &retu);
1846 break;
1847
1848 case VM_EXITCODE_SUSPENDED:
1849 vcpu->nextpc = vme->pc;
1850 error = vm_handle_suspend(vcpu, &retu);
1851 break;
1852
1853 default:
1854 /* Handle in userland */
1855 vcpu->nextpc = vme->pc;
1856 retu = true;
1857 break;
1858 }
1859 }
1860
1861 if (error == 0 && retu == false)
1862 goto restart;
1863
1864 return (error);
1865 }
1866