1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_bhyve_snapshot.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/jail.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_snapshot.h>
#include <x86/apicreg.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

/*
 * Per-device-memory-segment state.  One of these is created for each
 * named (non-system) memory segment and backs a /dev/vmm.io/<vm>.<name>
 * cdev that userspace can mmap.
 */
struct devmem_softc {
	int	segid;			/* memory segment id within the vm */
	char	*name;			/* segment name; freed on destroy */
	struct cdev *cdev;		/* devmem cdev; NULL once destroyed */
	struct vmmdev_softc *sc;	/* backpointer to owning vm softc */
	SLIST_ENTRY(devmem_softc) link;
};

/*
 * Per-virtual-machine device state backing /dev/vmm/<name>.
 */
struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;		/* NULL while being created/destroyed */
	SLIST_ENTRY(vmmdev_softc) link;
	SLIST_HEAD(, devmem_softc) devmem;
	int		flags;
};
#define	VSC_LINKED		0x01	/* softc is on the global 'head' list */

/* Global list of all vm softcs, protected by vmmdev_mtx. */
static SLIST_HEAD(, vmmdev_softc) head;

static unsigned pr_allow_flag;		/* prison "allow.vmm" permission bit */
static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int vmm_priv_check(struct ucred *ucred);
static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

/*
 * Check whether the calling credential may use vmm: always allowed
 * outside a jail; inside a jail only when the prison has the
 * pr_allow_flag ("allow.vmm") bit set.  Returns 0 or EPERM.
 */
static int
vmm_priv_check(struct ucred *ucred)
{

	if (jailed(ucred) &&
	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
		return (EPERM);

	return (0);
}

/*
 * Freeze a single vcpu (transition it to VCPU_FROZEN) so that per-vcpu
 * or memory-map state can be inspected/modified safely.  Returns EINVAL
 * for an out-of-range vcpu id, otherwise the vcpu_set_state() error.
 */
static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
	int error;

	if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm))
		return (EINVAL);

	error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
	return (error);
}

/*
 * Release a vcpu previously frozen by vcpu_lock_one().  Panics if the
 * vcpu is not in the VCPU_FROZEN state, since that indicates a
 * lock/unlock pairing bug in this file.
 */
static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
	enum vcpu_state state;

	state = vcpu_get_state(sc->vm, vcpu, NULL);
	if (state != VCPU_FROZEN) {
		panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
		    vcpu, state);
	}

	vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}

/*
 * Freeze all vcpus of the vm.  On failure, any vcpus already frozen are
 * unwound before returning the error.
 */
static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
	int error, vcpu;
	uint16_t maxcpus;

	/*
	 * NOTE(review): 'error' is only assigned inside the loop; this
	 * relies on vm_get_maxcpus() being >= 1 -- TODO confirm.
	 */
	maxcpus = vm_get_maxcpus(sc->vm);
	for (vcpu = 0; vcpu < maxcpus; vcpu++) {
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			break;
	}

	if (error) {
		/* Unwind the vcpus frozen before the failure. */
		while (--vcpu >= 0)
			vcpu_unlock_one(sc, vcpu);
	}

	return (error);
}

/*
 * Unfreeze all vcpus previously frozen by vcpu_lock_all().
 */
static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
	int vcpu;
	uint16_t maxcpus;

	maxcpus = vm_get_maxcpus(sc->vm);
	for (vcpu = 0; vcpu < maxcpus; vcpu++)
		vcpu_unlock_one(sc, vcpu);
}

/*
 * Look up a vm softc by name on the global list.  Returns NULL if not
 * found.  Caller should hold vmmdev_mtx (assertion disabled, see XXX).
 */
static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	return (sc);
}

/*
 * Look up the vm softc hanging off a cdev.  May return NULL while the
 * vm is still being created (si_drv1 not yet initialized).
 */
static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

/*
 * read(2)/write(2) handler for /dev/vmm/<name>: copies data between the
 * caller's buffer and guest physical memory at uio_offset.
 */
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa, maxaddr;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;
	uint16_t lastcpu;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	/*
	 * Get a read lock on the guest memory map by freezing any vcpu.
	 */
	lastcpu = vm_get_maxcpus(sc->vm) - 1;
	error = vcpu_lock_one(sc, lastcpu);
	if (error)
		return (error);

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	maxaddr = vmm_sysmem_maxaddr(sc->vm);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		/* Transfer at most up to the next page boundary. */
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold(sc->vm, lastcpu, gpa, c,
		    prot, &cookie);
		if (hpa == NULL) {
			/* Reads from holes below maxaddr return zeros. */
			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
				error = uiomove(__DECONST(void *, zero_region),
				    c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	vcpu_unlock_one(sc, lastcpu);
	return (error);
}

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);

/*
 * Fill in 'mseg' for the VM_GET_MEMSEG ioctl.  For a devmem segment the
 * devmem name is copied out (up to 'len' bytes); for a system-memory
 * segment the name is zeroed.
 */
static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	struct devmem_softc *dsc;
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		SLIST_FOREACH(dsc, &sc->devmem, link) {
			if (dsc->segid == mseg->segid)
				break;
		}
		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
		    __func__, mseg->segid));
		error = copystr(dsc->name, mseg->name, len, NULL);
	} else {
		bzero(mseg->name, len);
	}

	return (error);
}

/*
 * Handle the VM_ALLOC_MEMSEG ioctl: allocate a memory segment and, for
 * a named (devmem) segment, create the backing /dev/vmm.io cdev.  On
 * success the name buffer's ownership passes to the devmem cdev.
 */
static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	char *name;
	int error;
	bool sysmem;

	error = 0;
	name = NULL;
	sysmem = true;

	/*
	 * The allocation is lengthened by 1 to hold a terminating NUL. It'll
	 * be stripped off when devfs processes the full string.
	 */
	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
		name = malloc(len, M_VMMDEV, M_WAITOK);
		error = copystr(mseg->name, name, len, NULL);
		if (error)
			goto done;
	}

	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
	if (error)
		goto done;

	if (VM_MEMSEG_NAME(mseg)) {
		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
		if (error)
			vm_free_memseg(sc->vm, mseg->segid);
		else
			name = NULL;	/* freed when 'cdev' is destroyed */
	}
done:
	free(name, M_VMMDEV);
	return (error);
}

/*
 * Read 'count' registers identified by 'regnum' into 'regval'.  Stops
 * at the first failure and returns that error.
 */
static int
vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_get_register(vm, vcpu, regnum[i], &regval[i]);
		if (error)
			break;
	}
	return (error);
}

/*
 * Write 'count' registers identified by 'regnum' from 'regval'.  Stops
 * at the first failure and returns that error.
 */
static int
vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_set_register(vm, vcpu, regnum[i], regval[i]);
		if (error)
			break;
	}
	return (error);
}

/*
 * ioctl(2) handler for /dev/vmm/<name>.  Depending on the command this
 * first freezes either the target vcpu (state_changed == 1) or all
 * vcpus (state_changed == 2), dispatches to the appropriate vmm/ppt/
 * virtual-device backend, then unfreezes before returning.
 */
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
    struct thread *td)
{
	int error, vcpu, state_changed, size;
	cpuset_t *cpuset;
	struct vmmdev_softc *sc;
	struct vm_register *vmreg;
	struct vm_seg_desc *vmsegdesc;
	struct vm_register_set *vmregset;
	struct vm_run *vmrun;
	struct vm_exception *vmexc;
	struct vm_lapic_irq *vmirq;
	struct vm_lapic_msi *vmmsi;
	struct vm_ioapic_irq *ioapic_irq;
	struct vm_isa_irq *isa_irq;
	struct vm_isa_irq_trigger *isa_irq_trigger;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_pptdev_msix *pptmsix;
	struct vm_nmi *vmnmi;
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_x2apic *x2apic;
	struct vm_gpa_pte *gpapte;
	struct vm_suspend *vmsuspend;
	struct vm_gla2gpa *gg;
	struct vm_activate_cpu *vac;
	struct vm_cpuset *vm_cpuset;
	struct vm_intinfo *vmii;
	struct vm_rtc_time *rtctime;
	struct vm_rtc_data *rtcdata;
	struct vm_memmap *mm;
	struct vm_cpu_topology *topology;
	struct vm_readwrite_kernemu_device *kernemu;
	uint64_t *regvals;
	int *regnums;
#ifdef BHYVE_SNAPSHOT
	struct vm_snapshot_meta *snapshot_meta;
#endif

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	vcpu = -1;
	state_changed = 0;

	/*
	 * Some VMM ioctls can operate only on vcpus that are not running.
	 */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_GET_REGISTER_SET:
	case VM_SET_REGISTER_SET:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
	case VM_GLA2GPA:
	case VM_GLA2GPA_NOFAULT:
	case VM_ACTIVATE_CPU:
	case VM_SET_INTINFO:
	case VM_GET_INTINFO:
	case VM_RESTART_INSTRUCTION:
		/*
		 * XXX fragile, handle with care
		 * Assumes that the first field of the ioctl data is the vcpu.
		 */
		vcpu = *(int *)data;
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			goto done;
		state_changed = 1;
		break;

	case VM_MAP_PPTDEV_MMIO:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
#ifdef COMPAT_FREEBSD12
	case VM_ALLOC_MEMSEG_FBSD12:
#endif
	case VM_ALLOC_MEMSEG:
	case VM_MMAP_MEMSEG:
	case VM_REINIT:
		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = vcpu_lock_all(sc);
		if (error)
			goto done;
		state_changed = 2;
		break;

#ifdef COMPAT_FREEBSD12
	case VM_GET_MEMSEG_FBSD12:
#endif
	case VM_GET_MEMSEG:
	case VM_MMAP_GETNEXT:
		/*
		 * Lock a vcpu to make sure that the memory map cannot be
		 * modified while it is being inspected.
		 */
		vcpu = vm_get_maxcpus(sc->vm) - 1;
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			goto done;
		state_changed = 1;
		break;

	default:
		break;
	}

	switch(cmd) {
	case VM_RUN:
		vmrun = (struct vm_run *)data;
		error = vm_run(sc->vm, vmrun);
		break;
	case VM_SUSPEND:
		vmsuspend = (struct vm_suspend *)data;
		error = vm_suspend(sc->vm, vmsuspend->how);
		break;
	case VM_REINIT:
		error = vm_reinit(sc->vm);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
		    statdesc->desc, sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(sc->vm, vmstats->cpuid,
		    &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
		    pptmsi->bus, pptmsi->slot, pptmsi->func,
		    pptmsi->addr, pptmsi->msg,
		    pptmsi->numvec);
		break;
	case VM_PPTDEV_MSIX:
		pptmsix = (struct vm_pptdev_msix *)data;
		error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
		    pptmsix->bus, pptmsix->slot,
		    pptmsix->func, pptmsix->idx,
		    pptmsix->addr, pptmsix->msg,
		    pptmsix->vector_control);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
		    pptmmio->func, pptmmio->gpa, pptmmio->len,
		    pptmmio->hpa);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
		    pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
		    pptdev->func);
		break;
	case VM_INJECT_EXCEPTION:
		vmexc = (struct vm_exception *)data;
		error = vm_inject_exception(sc->vm, vmexc->cpuid,
		    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
		    vmexc->restart_instruction);
		break;
	case VM_INJECT_NMI:
		vmnmi = (struct vm_nmi *)data;
		error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
		break;
	case VM_LAPIC_LOCAL_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
		    vmirq->vector);
		break;
	case VM_LAPIC_MSI:
		vmmsi = (struct vm_lapic_msi *)data;
		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
		break;
	case VM_IOAPIC_ASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_DEASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PULSE_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PINCOUNT:
		*(int *)data = vioapic_pincount(sc->vm);
		break;
	case VM_SET_KERNEMU_DEV:
	case VM_GET_KERNEMU_DEV: {
		mem_region_write_t mwrite;
		mem_region_read_t mread;
		bool arg;

		kernemu = (void *)data;

		/* access_width encodes size as a power of two. */
		if (kernemu->access_width > 0)
			size = (1u << kernemu->access_width);
		else
			size = 1;

		/* Route the access to the emulated device owning the gpa. */
		if (kernemu->gpa >= DEFAULT_APIC_BASE && kernemu->gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
			mread = lapic_mmio_read;
			mwrite = lapic_mmio_write;
		} else if (kernemu->gpa >= VIOAPIC_BASE && kernemu->gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
			mread = vioapic_mmio_read;
			mwrite = vioapic_mmio_write;
		} else if (kernemu->gpa >= VHPET_BASE && kernemu->gpa < VHPET_BASE + VHPET_SIZE) {
			mread = vhpet_mmio_read;
			mwrite = vhpet_mmio_write;
		} else {
			error = EINVAL;
			break;
		}

		if (cmd == VM_SET_KERNEMU_DEV)
			error = mwrite(sc->vm, kernemu->vcpuid, kernemu->gpa,
			    kernemu->value, size, &arg);
		else
			error = mread(sc->vm, kernemu->vcpuid, kernemu->gpa,
			    &kernemu->value, size, &arg);
		break;
	}
	case VM_ISA_ASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_assert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_DEASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_deassert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_PULSE_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
		break;
	case VM_ISA_SET_IRQ_TRIGGER:
		isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
		error = vatpic_set_irq_trigger(sc->vm,
		    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
		break;
	case VM_MMAP_GETNEXT:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
		break;
	case VM_MMAP_MEMSEG:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
		    mm->len, mm->prot, mm->flags);
		break;
#ifdef COMPAT_FREEBSD12
	case VM_ALLOC_MEMSEG_FBSD12:
		error = alloc_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
		break;
#endif
	case VM_ALLOC_MEMSEG:
		error = alloc_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
#ifdef COMPAT_FREEBSD12
	case VM_GET_MEMSEG_FBSD12:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
		break;
#endif
	case VM_GET_MEMSEG:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
		    &vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
		    vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
		    vmsegdesc->regnum,
		    &vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
		    vmsegdesc->regnum,
		    &vmsegdesc->desc);
		break;
	case VM_GET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		/* Bound the allocation below by the number of registers. */
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = vm_get_register_set(sc->vm, vmregset->cpuid,
			    vmregset->count, regnums, regvals);
		if (error == 0)
			error = copyout(regvals, vmregset->regvals,
			    sizeof(regvals[0]) * vmregset->count);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_SET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		/* Bound the allocation below by the number of registers. */
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = copyin(vmregset->regvals, regvals,
			    sizeof(regvals[0]) * vmregset->count);
		if (error == 0)
			error = vm_set_register_set(sc->vm, vmregset->cpuid,
			    vmregset->count, regnums, regvals);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(sc->vm, vmcap->cpuid,
		    vmcap->captype,
		    &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(sc->vm, vmcap->cpuid,
		    vmcap->captype,
		    vmcap->capval);
		break;
	case VM_SET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_set_x2apic_state(sc->vm,
		    x2apic->cpuid, x2apic->state);
		break;
	case VM_GET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_get_x2apic_state(sc->vm,
		    x2apic->cpuid, &x2apic->state);
		break;
	case VM_GET_GPA_PMAP:
		gpapte = (struct vm_gpa_pte *)data;
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
		    gpapte->gpa, gpapte->pte, &gpapte->ptenum);
		error = 0;
		break;
	case VM_GET_HPET_CAPABILITIES:
		error = vhpet_getcap((struct vm_hpet_cap *)data);
		break;
	case VM_GLA2GPA: {
		/* Userspace PROT_* bits must match the kernel VM_PROT_*. */
		CTASSERT(PROT_READ == VM_PROT_READ);
		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
		    gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	}
	case VM_GLA2GPA_NOFAULT:
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa_nofault(sc->vm, gg->vcpuid, &gg->paging,
		    gg->gla, gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	case VM_ACTIVATE_CPU:
		vac = (struct vm_activate_cpu *)data;
		error = vm_activate_cpu(sc->vm, vac->vcpuid);
		break;
	case VM_GET_CPUS:
		error = 0;
		vm_cpuset = (struct vm_cpuset *)data;
		size = vm_cpuset->cpusetsize;
		if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
			error = ERANGE;
			break;
		}
		cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
		if (vm_cpuset->which == VM_ACTIVE_CPUS)
			*cpuset = vm_active_cpus(sc->vm);
		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
			*cpuset = vm_suspended_cpus(sc->vm);
		else if (vm_cpuset->which == VM_DEBUG_CPUS)
			*cpuset = vm_debug_cpus(sc->vm);
		else
			error = EINVAL;
		if (error == 0)
			error = copyout(cpuset, vm_cpuset->cpus, size);
		free(cpuset, M_TEMP);
		break;
	case VM_SUSPEND_CPU:
		vac = (struct vm_activate_cpu *)data;
		error = vm_suspend_cpu(sc->vm, vac->vcpuid);
		break;
	case VM_RESUME_CPU:
		vac = (struct vm_activate_cpu *)data;
		error = vm_resume_cpu(sc->vm, vac->vcpuid);
		break;
	case VM_SET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
		break;
	case VM_GET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
		    &vmii->info2);
		break;
	case VM_RTC_WRITE:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_write(sc->vm, rtcdata->offset,
		    rtcdata->value);
		break;
	case VM_RTC_READ:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_read(sc->vm, rtcdata->offset,
		    &rtcdata->value);
		break;
	case VM_RTC_SETTIME:
		rtctime = (struct vm_rtc_time *)data;
		error = vrtc_set_time(sc->vm, rtctime->secs);
		break;
	case VM_RTC_GETTIME:
		error = 0;
		rtctime = (struct vm_rtc_time *)data;
		rtctime->secs = vrtc_get_time(sc->vm);
		break;
	case VM_RESTART_INSTRUCTION:
		error = vm_restart_instruction(sc->vm, vcpu);
		break;
	case VM_SET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		error = vm_set_topology(sc->vm, topology->sockets,
		    topology->cores, topology->threads, topology->maxcpus);
		break;
	case VM_GET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
		    &topology->threads, &topology->maxcpus);
		error = 0;
		break;
#ifdef BHYVE_SNAPSHOT
	case VM_SNAPSHOT_REQ:
		snapshot_meta = (struct vm_snapshot_meta *)data;
		error = vm_snapshot_req(sc->vm, snapshot_meta);
		break;
	case VM_RESTORE_TIME:
		error = vm_restore_time(sc->vm);
		break;
#endif
	default:
		error = ENOTTY;
		break;
	}

	/* Undo whatever vcpu freezing was done before the dispatch. */
	if (state_changed == 1)
		vcpu_unlock_one(sc, vcpu);
	else if (state_changed == 2)
		vcpu_unlock_all(sc);

done:
	/*
	 * Make sure that no handler returns a kernel-internal
	 * error value to userspace.
	 */
	KASSERT(error == ERESTART || error >= 0,
	    ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}

/*
 * mmap_single handler for /dev/vmm/<name>: translate a device offset
 * (a guest physical address range) into the VM object backing the
 * system-memory segment that contains it.  Only sysmem segments may be
 * mapped through this device; devmem segments use their own cdev.
 */
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
	struct vmmdev_softc *sc;
	vm_paddr_t gpa;
	size_t len;
	vm_ooffset_t segoff, first, last;
	int error, found, segid;
	uint16_t lastcpu;
	bool sysmem;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	first = *offset;
	last = first + mapsize;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL) {
		/* virtual machine is in the process of being created */
		return (EINVAL);
	}

	/*
	 * Get a read lock on the guest memory map by freezing any vcpu.
	 */
	lastcpu = vm_get_maxcpus(sc->vm) - 1;
	error = vcpu_lock_one(sc, lastcpu);
	if (error)
		return (error);

	/* Walk the memory map looking for a mapping containing [first, last). */
	gpa = 0;
	found = 0;
	while (!found) {
		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
		    NULL, NULL);
		if (error)
			break;

		if (first >= gpa && last <= gpa + len)
			found = 1;
		else
			gpa += len;
	}

	if (found) {
		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
		KASSERT(error == 0 && *objp != NULL,
		    ("%s: invalid memory segment %d", __func__, segid));
		if (sysmem) {
			vm_object_reference(*objp);
			*offset = segoff + (first - gpa);
		} else {
			error = EINVAL;
		}
	}
	vcpu_unlock_one(sc, lastcpu);
	return (error);
}

/*
 * Final teardown of a vm softc, called directly on a failed create or
 * as the destroy_dev_sched_cb() callback.  By the time this runs all
 * devmem cdevs must already have been destroyed (dsc->cdev == NULL).
 */
static void
vmmdev_destroy(void *arg)
{
	struct vmmdev_softc *sc = arg;
	struct devmem_softc *dsc;
	int error;

	error = vcpu_lock_all(sc);
	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
		SLIST_REMOVE_HEAD(&sc->devmem, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

/*
 * sysctl hw.vmm.destroy handler: look up the named vm and schedule its
 * cdevs (devmem first, then the vm cdev) for asynchronous destruction.
 */
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	strlcpy(buf, "beavis", buflen);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error != 0 || req->newptr == NULL)
		goto out;

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		error = EINVAL;
		goto out;
	}

	/*
	 * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
	 * goes down to 0 so we should not do it again in the callback.
	 *
	 * Setting 'sc->cdev' to NULL is also used to indicate that the VM
	 * is scheduled for destruction.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Schedule all cdevs to be destroyed:
	 *
	 * - any new operations on the 'cdev' will return an error (ENXIO).
	 *
	 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
	 *   be destroyed and the callback will be invoked in a taskqueue
	 *   context.
	 *
	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
	 */
	SLIST_FOREACH(dsc, &sc->devmem, link) {
		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
		destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
	}
	destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
	error = 0;

out:
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_destroy, "A",
    NULL);

static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

/*
 * sysctl hw.vmm.create handler: create a vm with the given name, link
 * its softc onto the global list, and create the /dev/vmm/<name> cdev.
 */
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	strlcpy(buf, "beavis", buflen);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error != 0 || req->newptr == NULL)
		goto out;

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL) {
		error = EEXIST;
		goto out;
	}

	error = vm_create(buf, &vm);
	if (error != 0)
		goto out;

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->vm = vm;
	SLIST_INIT(&sc->devmem);

	/*
	 * Lookup the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		/* Lost the race: tear down what we built. */
		vmmdev_destroy(sc);
		error = EEXIST;
		goto out;
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		goto out;
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

out:
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_create, "A",
    NULL);

/*
 * One-time module initialization: set up the global mutex and register
 * the per-prison "allow.vmm" permission flag.
 */
void
vmmdev_init(void)
{
	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
	    "Allow use of vmm in a jail.");
}

/*
 * Module unload check: refuse (EBUSY) while any vm still exists.
 */
int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}

/*
 * mmap_single handler for a devmem cdev (/dev/vmm.io/<vm>.<name>):
 * return the VM object backing the devmem segment, provided the request
 * fits entirely within the segment.
 */
static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
	struct devmem_softc *dsc;
	vm_ooffset_t first, last;
	size_t seglen;
	int error;
	uint16_t lastcpu;
	bool sysmem;

	dsc = cdev->si_drv1;
	if (dsc == NULL) {
		/* 'cdev' has been created but is not ready for use */
		return (ENXIO);
	}

	first = *offset;
	last = *offset + len;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	/* Freeze a vcpu to get a read lock on the guest memory map. */
	lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1;
	error = vcpu_lock_one(dsc->sc, lastcpu);
	if (error)
		return (error);

	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
	KASSERT(error == 0 && !sysmem && *objp != NULL,
	    ("%s: invalid devmem segment %d", __func__, dsc->segid));

	vcpu_unlock_one(dsc->sc, lastcpu);

	if (seglen >= last) {
		vm_object_reference(*objp);
		return (0);
	} else {
		return (EINVAL);
	}
}

static struct cdevsw devmemsw = {
	.d_name		= "devmem",
	.d_version	= D_VERSION,
	.d_mmap_single	= devmem_mmap_single,
};

/*
 * Create the /dev/vmm.io/<vmname>.<devname> cdev for a devmem segment
 * and link its softc onto the owning vm's devmem list.  On success the
 * 'devname' buffer's ownership transfers to the devmem softc.
 */
static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	int error;

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
	if (error)
		return (error);

	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(vmname);
	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
	if (sc->cdev == NULL) {
		/* virtual machine is being created or destroyed */
		mtx_unlock(&vmmdev_mtx);
		free(dsc, M_VMMDEV);
		destroy_dev_sched_cb(cdev, NULL, 0);
		return (ENODEV);
	}

	dsc->segid = segid;
	dsc->name = devname;
	dsc->cdev = cdev;
	dsc->sc = sc;
	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
	mtx_unlock(&vmmdev_mtx);

	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
	cdev->si_drv1 = dsc;
	return (0);
}

/*
 * destroy_dev_sched_cb() callback for a devmem cdev: mark the softc's
 * cdev gone.  The softc itself is freed later by vmmdev_destroy().
 */
static void
devmem_destroy(void *arg)
{
	struct devmem_softc *dsc = arg;

	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
	dsc->cdev = NULL;
	dsc->sc = NULL;
}