1 /* $OpenBSD: vm.c,v 1.100 2024/04/29 14:47:06 dv Exp $ */ 2 3 /* 4 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/param.h> /* PAGE_SIZE, MAXCOMLEN */ 20 #include <sys/types.h> 21 #include <sys/ioctl.h> 22 #include <sys/queue.h> 23 #include <sys/wait.h> 24 #include <sys/uio.h> 25 #include <sys/stat.h> 26 #include <sys/socket.h> 27 #include <sys/time.h> 28 #include <sys/mman.h> 29 #include <sys/resource.h> 30 31 #include <dev/ic/i8253reg.h> 32 #include <dev/isa/isareg.h> 33 #include <dev/pci/pcireg.h> 34 35 #include <machine/psl.h> 36 #include <machine/pte.h> 37 #include <machine/specialreg.h> 38 #include <machine/vmmvar.h> 39 40 #include <net/if.h> 41 42 #include <errno.h> 43 #include <event.h> 44 #include <fcntl.h> 45 #include <imsg.h> 46 #include <limits.h> 47 #include <poll.h> 48 #include <pthread.h> 49 #include <pthread_np.h> 50 #include <stddef.h> 51 #include <stdio.h> 52 #include <stdlib.h> 53 #include <string.h> 54 #include <unistd.h> 55 #include <util.h> 56 57 #include "atomicio.h" 58 #include "fw_cfg.h" 59 #include "i8253.h" 60 #include "i8259.h" 61 #include "loadfile.h" 62 #include "mc146818.h" 63 #include "mmio.h" 64 #include "ns8250.h" 65 #include "pci.h" 66 #include "virtio.h" 67 #include "vmd.h" 68 #include "vmm.h" 69 70 #define MB(x) (x * 1024UL * 1024UL) 71 #define GB(x) (x * 1024UL * 1024UL * 1024UL) 72 73 #define MMIO_NOTYET 0 74 75 io_fn_t ioports_map[MAX_PORTS]; 76 77 static int run_vm(struct vmop_create_params *, struct vcpu_reg_state *); 78 void vm_dispatch_vmm(int, short, void *); 79 void *event_thread(void *); 80 void *vcpu_run_loop(void *); 81 int vcpu_exit(struct vm_run_params *); 82 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *); 83 void create_memory_map(struct vm_create_params *); 84 static int vmm_create_vm(struct vmd_vm *); 85 int alloc_guest_mem(struct vmd_vm *); 86 void init_emulated_hw(struct vmop_create_params *, int, 87 int[][VM_MAX_BASE_PER_DISK], int *); 88 void restore_emulated_hw(struct vm_create_params *, int, int *, 89 int[][VM_MAX_BASE_PER_DISK],int); 90 void vcpu_exit_inout(struct vm_run_params *); 91 int vcpu_exit_eptviolation(struct vm_run_params *); 92 uint8_t vcpu_exit_pci(struct vm_run_params *); 93 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t); 94 int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *); 95 static int send_vm(int, struct vmd_vm *); 96 int dump_send_header(int); 97 static int dump_vmr(int , struct vm_mem_range *); 98 static int dump_mem(int, struct vmd_vm *); 99 void restore_vmr(int, struct vm_mem_range *); 100 void restore_mem(int, struct vm_create_params *); 101 int restore_vm_params(int, struct vm_create_params *); 102 static void pause_vm(struct vmd_vm *); 103 static void unpause_vm(struct vmd_vm *); 104 105 int translate_gva(struct 
vm_exit*, uint64_t, uint64_t *, int); 106 107 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t, 108 size_t); 109 110 int con_fd; 111 struct vmd_vm *current_vm; 112 113 extern struct vmd *env; 114 115 extern char *__progname; 116 117 pthread_mutex_t threadmutex; 118 pthread_cond_t threadcond; 119 120 pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM]; 121 pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM]; 122 pthread_barrier_t vm_pause_barrier; 123 pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM]; 124 pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM]; 125 uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM]; 126 uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM]; 127 128 /* 129 * Represents a standard register set for an OS to be booted 130 * as a flat 64 bit address space. 131 * 132 * NOT set here are: 133 * RIP 134 * RSP 135 * GDTR BASE 136 * 137 * Specific bootloaders should clone this structure and override 138 * those fields as needed. 139 * 140 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on 141 * features of the CPU in use. 142 */ 143 static const struct vcpu_reg_state vcpu_init_flat64 = { 144 .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, 145 .vrs_gprs[VCPU_REGS_RIP] = 0x0, 146 .vrs_gprs[VCPU_REGS_RSP] = 0x0, 147 .vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG, 148 .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE, 149 .vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE, 150 .vrs_crs[VCPU_REGS_PDPTE0] = 0ULL, 151 .vrs_crs[VCPU_REGS_PDPTE1] = 0ULL, 152 .vrs_crs[VCPU_REGS_PDPTE2] = 0ULL, 153 .vrs_crs[VCPU_REGS_PDPTE3] = 0ULL, 154 .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0}, 155 .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 156 .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 157 .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 158 .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 159 .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 160 .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, 161 .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, 162 .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, 163 .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, 164 .vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA, 165 .vrs_drs[VCPU_REGS_DR0] = 0x0, 166 .vrs_drs[VCPU_REGS_DR1] = 0x0, 167 .vrs_drs[VCPU_REGS_DR2] = 0x0, 168 .vrs_drs[VCPU_REGS_DR3] = 0x0, 169 .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0, 170 .vrs_drs[VCPU_REGS_DR7] = 0x400, 171 .vrs_msrs[VCPU_REGS_STAR] = 0ULL, 172 .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL, 173 .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL, 174 .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL, 175 .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL, 176 .vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL, 177 .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87 178 }; 179 180 /* 181 * Represents a standard register set for an BIOS to be booted 182 * as a flat 16 bit address space. 
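 * With the CS descriptor base set to 0xF0000 and RIP to 0xFFF0, the first
 * instruction fetch lands on the traditional reset vector at physical address
 * 0xFFFF0, sixteen bytes below the top of the BIOS copy that ends at 1MB.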
183 */ 184 static const struct vcpu_reg_state vcpu_init_flat16 = { 185 .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, 186 .vrs_gprs[VCPU_REGS_RIP] = 0xFFF0, 187 .vrs_gprs[VCPU_REGS_RSP] = 0x0, 188 .vrs_crs[VCPU_REGS_CR0] = 0x60000010, 189 .vrs_crs[VCPU_REGS_CR3] = 0, 190 .vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000}, 191 .vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 192 .vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0}, 193 .vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 194 .vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 195 .vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 196 .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, 197 .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, 198 .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, 199 .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, 200 .vrs_msrs[VCPU_REGS_EFER] = 0ULL, 201 .vrs_drs[VCPU_REGS_DR0] = 0x0, 202 .vrs_drs[VCPU_REGS_DR1] = 0x0, 203 .vrs_drs[VCPU_REGS_DR2] = 0x0, 204 .vrs_drs[VCPU_REGS_DR3] = 0x0, 205 .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0, 206 .vrs_drs[VCPU_REGS_DR7] = 0x400, 207 .vrs_msrs[VCPU_REGS_STAR] = 0ULL, 208 .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL, 209 .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL, 210 .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL, 211 .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL, 212 .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87 213 }; 214 215 /* 216 * vm_main 217 * 218 * Primary entrypoint for launching a vm. Does not return. 219 * 220 * fd: file descriptor for communicating with vmm process. 221 * fd_vmm: file descriptor for communicating with vmm(4) device 222 */ 223 void 224 vm_main(int fd, int fd_vmm) 225 { 226 struct vm_create_params *vcp = NULL; 227 struct vmd_vm vm; 228 size_t sz = 0; 229 int ret = 0; 230 231 /* 232 * The vm process relies on global state. Set the fd for /dev/vmm. 233 */ 234 env->vmd_fd = fd_vmm; 235 236 /* 237 * We aren't root, so we can't chroot(2). Use unveil(2) instead. 238 */ 239 if (unveil(env->argv0, "x") == -1) 240 fatal("unveil %s", env->argv0); 241 if (unveil(NULL, NULL) == -1) 242 fatal("unveil lock"); 243 244 /* 245 * pledge in the vm processes: 246 * stdio - for malloc and basic I/O including events. 247 * vmm - for the vmm ioctls and operations. 248 * proc exec - fork/exec for launching devices. 249 * recvfd - for vm send/recv and sending fd to devices. 250 */ 251 if (pledge("stdio vmm proc exec recvfd", NULL) == -1) 252 fatal("pledge"); 253 254 /* Receive our vm configuration. */ 255 memset(&vm, 0, sizeof(vm)); 256 sz = atomicio(read, fd, &vm, sizeof(vm)); 257 if (sz != sizeof(vm)) { 258 log_warnx("failed to receive start message"); 259 _exit(EIO); 260 } 261 262 /* Update process with the vm name. */ 263 vcp = &vm.vm_params.vmc_params; 264 setproctitle("%s", vcp->vcp_name); 265 log_procinit("vm/%s", vcp->vcp_name); 266 267 /* Receive the local prefix settings. */ 268 sz = atomicio(read, fd, &env->vmd_cfg.cfg_localprefix, 269 sizeof(env->vmd_cfg.cfg_localprefix)); 270 if (sz != sizeof(env->vmd_cfg.cfg_localprefix)) { 271 log_warnx("failed to receive local prefix"); 272 _exit(EIO); 273 } 274 275 /* 276 * We need, at minimum, a vm_kernel fd to boot a vm. This is either a 277 * kernel or a BIOS image. 278 */ 279 if (!(vm.vm_state & VM_STATE_RECEIVED)) { 280 if (vm.vm_kernel == -1) { 281 log_warnx("%s: failed to receive boot fd", 282 vcp->vcp_name); 283 _exit(EINVAL); 284 } 285 } 286 287 ret = start_vm(&vm, fd); 288 _exit(ret); 289 } 290 291 /* 292 * loadfile_bios 293 * 294 * Alternatively to loadfile_elf, this function loads a non-ELF BIOS image 295 * directly into memory. 
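 * The image is loaded twice: once ending at the 1MB boundary and once ending
 * at 4GB, mirroring the way a BIOS ROM on real hardware is decoded below both
 * limits.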
296 * 297 * Parameters: 298 * fp: file of a kernel file to load 299 * size: uncompressed size of the image 300 * (out) vrs: register state to set on init for this kernel 301 * 302 * Return values: 303 * 0 if successful 304 * various error codes returned from read(2) or loadelf functions 305 */ 306 int 307 loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs) 308 { 309 off_t off; 310 311 /* Set up a "flat 16 bit" register state for BIOS */ 312 memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs)); 313 314 /* Seek to the beginning of the BIOS image */ 315 if (gzseek(fp, 0, SEEK_SET) == -1) 316 return (-1); 317 318 /* The BIOS image must end at 1MB */ 319 if ((off = MB(1) - size) < 0) 320 return (-1); 321 322 /* Read BIOS image into memory */ 323 if (mread(fp, off, size) != (size_t)size) { 324 errno = EIO; 325 return (-1); 326 } 327 328 if (gzseek(fp, 0, SEEK_SET) == -1) 329 return (-1); 330 331 /* Read a second BIOS copy into memory ending at 4GB */ 332 off = GB(4) - size; 333 if (mread(fp, off, size) != (size_t)size) { 334 errno = EIO; 335 return (-1); 336 } 337 338 log_debug("%s: loaded BIOS image", __func__); 339 340 return (0); 341 } 342 343 /* 344 * start_vm 345 * 346 * After forking a new VM process, starts the new VM with the creation 347 * parameters supplied (in the incoming vm->vm_params field). This 348 * function performs a basic sanity check on the incoming parameters 349 * and then performs the following steps to complete the creation of the VM: 350 * 351 * 1. validates and create the new VM 352 * 2. opens the imsg control channel to the parent and drops more privilege 353 * 3. drops additional privileges by calling pledge(2) 354 * 4. loads the kernel from the disk image or file descriptor 355 * 5. runs the VM's VCPU loops. 356 * 357 * Parameters: 358 * vm: The VM data structure that is including the VM create parameters. 359 * fd: The imsg socket that is connected to the parent process. 360 * 361 * Return values: 362 * 0: success 363 * !0 : failure - typically an errno indicating the source of the failure 364 */ 365 int 366 start_vm(struct vmd_vm *vm, int fd) 367 { 368 struct vmop_create_params *vmc = &vm->vm_params; 369 struct vm_create_params *vcp = &vmc->vmc_params; 370 struct vcpu_reg_state vrs; 371 int nicfds[VM_MAX_NICS_PER_VM]; 372 int ret; 373 gzFile fp; 374 size_t i; 375 struct vm_rwregs_params vrp; 376 struct stat sb; 377 378 /* 379 * We first try to initialize and allocate memory before bothering 380 * vmm(4) with a request to create a new vm. 381 */ 382 if (!(vm->vm_state & VM_STATE_RECEIVED)) 383 create_memory_map(vcp); 384 385 ret = alloc_guest_mem(vm); 386 if (ret) { 387 struct rlimit lim; 388 char buf[FMT_SCALED_STRSIZE]; 389 if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) { 390 if (fmt_scaled(lim.rlim_cur, buf) == 0) 391 fatalx("could not allocate guest memory (data " 392 "limit is %s)", buf); 393 } 394 errno = ret; 395 log_warn("could not allocate guest memory"); 396 return (ret); 397 } 398 399 /* We've allocated guest memory, so now create the vm in vmm(4). */ 400 ret = vmm_create_vm(vm); 401 if (ret) { 402 /* Let the vmm process know we failed by sending a 0 vm id. */ 403 vcp->vcp_id = 0; 404 atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)); 405 return (ret); 406 } 407 408 /* 409 * Some of vmd currently relies on global state (current_vm, con_fd). 
410 */ 411 current_vm = vm; 412 con_fd = vm->vm_tty; 413 if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) { 414 log_warn("failed to set nonblocking mode on console"); 415 return (1); 416 } 417 418 /* 419 * We now let the vmm process know we were successful by sending it our 420 * vmm(4) assigned vm id. 421 */ 422 if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) != 423 sizeof(vcp->vcp_id)) { 424 log_warn("failed to send created vm id to vmm process"); 425 return (1); 426 } 427 428 /* Prepare either our boot image or receive an existing vm to launch. */ 429 if (vm->vm_state & VM_STATE_RECEIVED) { 430 ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp)); 431 if (ret != sizeof(vrp)) 432 fatal("received incomplete vrp - exiting"); 433 vrs = vrp.vrwp_regs; 434 } else { 435 /* 436 * Set up default "flat 64 bit" register state - RIP, 437 * RSP, and GDT info will be set in bootloader 438 */ 439 memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs)); 440 441 /* Find and open kernel image */ 442 if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL) 443 fatalx("failed to open kernel - exiting"); 444 445 /* Load kernel image */ 446 ret = loadfile_elf(fp, vm, &vrs, vmc->vmc_bootdevice); 447 448 /* 449 * Try BIOS as a fallback (only if it was provided as an image 450 * with vm->vm_kernel and the file is not compressed) 451 */ 452 if (ret && errno == ENOEXEC && vm->vm_kernel != -1 && 453 gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0) 454 ret = loadfile_bios(fp, sb.st_size, &vrs); 455 456 if (ret) 457 fatal("failed to load kernel or BIOS - exiting"); 458 459 gzclose(fp); 460 } 461 462 if (vm->vm_kernel != -1) 463 close_fd(vm->vm_kernel); 464 465 /* Initialize our mutexes. */ 466 ret = pthread_mutex_init(&threadmutex, NULL); 467 if (ret) { 468 log_warn("%s: could not initialize thread state mutex", 469 __func__); 470 return (ret); 471 } 472 ret = pthread_cond_init(&threadcond, NULL); 473 if (ret) { 474 log_warn("%s: could not initialize thread state " 475 "condition variable", __func__); 476 return (ret); 477 } 478 mutex_lock(&threadmutex); 479 480 481 /* 482 * Finalize our communication socket with the vmm process. From here 483 * onwards, communication with the vmm process is event-based. 484 */ 485 event_init(); 486 if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1) 487 fatal("setup vm pipe"); 488 489 /* 490 * Initialize or restore our emulated hardware. 491 */ 492 for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) 493 nicfds[i] = vm->vm_ifs[i].vif_fd; 494 495 if (vm->vm_state & VM_STATE_RECEIVED) { 496 restore_mem(vm->vm_receive_fd, vcp); 497 restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds, 498 vm->vm_disks, vm->vm_cdrom); 499 if (restore_vm_params(vm->vm_receive_fd, vcp)) 500 fatal("restore vm params failed"); 501 unpause_vm(vm); 502 } else 503 init_emulated_hw(vmc, vm->vm_cdrom, vm->vm_disks, nicfds); 504 505 /* Drop privleges further before starting the vcpu run loop(s). */ 506 if (pledge("stdio vmm recvfd", NULL) == -1) 507 fatal("pledge"); 508 509 /* 510 * Execute the vcpu run loop(s) for this VM. 511 */ 512 ret = run_vm(&vm->vm_params, &vrs); 513 514 /* Ensure that any in-flight data is written back */ 515 virtio_shutdown(vm); 516 517 return (ret); 518 } 519 520 /* 521 * vm_dispatch_vmm 522 * 523 * imsg callback for messages that are received from the vmm parent process. 
524 */
525 void
526 vm_dispatch_vmm(int fd, short event, void *arg)
527 {
528 struct vmd_vm *vm = arg;
529 struct vmop_result vmr;
530 struct vmop_addr_result var;
531 struct imsgev *iev = &vm->vm_iev;
532 struct imsgbuf *ibuf = &iev->ibuf;
533 struct imsg imsg;
534 ssize_t n;
535 int verbose;
536
537 if (event & EV_READ) {
538 if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
539 fatal("%s: imsg_read", __func__);
540 if (n == 0)
541 _exit(0);
542 }
543
544 if (event & EV_WRITE) {
545 if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
546 fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
547 if (n == 0)
548 _exit(0);
549 }
550
551 for (;;) {
552 if ((n = imsg_get(ibuf, &imsg)) == -1)
553 fatal("%s: imsg_get", __func__);
554 if (n == 0)
555 break;
556
557 #if DEBUG > 1
558 log_debug("%s: got imsg %d from %s",
559 __func__, imsg.hdr.type,
560 vm->vm_params.vmc_params.vcp_name);
561 #endif
562
563 switch (imsg.hdr.type) {
564 case IMSG_CTL_VERBOSE:
565 IMSG_SIZE_CHECK(&imsg, &verbose);
566 memcpy(&verbose, imsg.data, sizeof(verbose));
567 log_setverbose(verbose);
568 virtio_broadcast_imsg(vm, IMSG_CTL_VERBOSE, &verbose,
569 sizeof(verbose));
570 break;
571 case IMSG_VMDOP_VM_SHUTDOWN:
572 if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
573 _exit(0);
574 break;
575 case IMSG_VMDOP_VM_REBOOT:
576 if (vmmci_ctl(VMMCI_REBOOT) == -1)
577 _exit(0);
578 break;
579 case IMSG_VMDOP_PAUSE_VM:
580 vmr.vmr_result = 0;
581 vmr.vmr_id = vm->vm_vmid;
582 pause_vm(vm);
583 imsg_compose_event(&vm->vm_iev,
584 IMSG_VMDOP_PAUSE_VM_RESPONSE,
585 imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
586 sizeof(vmr));
587 break;
588 case IMSG_VMDOP_UNPAUSE_VM:
589 vmr.vmr_result = 0;
590 vmr.vmr_id = vm->vm_vmid;
591 unpause_vm(vm);
592 imsg_compose_event(&vm->vm_iev,
593 IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
594 imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
595 sizeof(vmr));
596 break;
597 case IMSG_VMDOP_SEND_VM_REQUEST:
598 vmr.vmr_id = vm->vm_vmid;
599 vmr.vmr_result = send_vm(imsg_get_fd(&imsg), vm);
600 imsg_compose_event(&vm->vm_iev,
601 IMSG_VMDOP_SEND_VM_RESPONSE,
602 imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
603 sizeof(vmr));
604 if (!vmr.vmr_result) {
605 imsg_flush(&current_vm->vm_iev.ibuf);
606 _exit(0);
607 }
608 break;
609 case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
610 IMSG_SIZE_CHECK(&imsg, &var);
611 memcpy(&var, imsg.data, sizeof(var));
612
613 log_debug("%s: received tap addr %s for nic %d",
614 vm->vm_params.vmc_params.vcp_name,
615 ether_ntoa((void *)var.var_addr), var.var_nic_idx);
616
617 vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
618 break;
619 default:
620 fatalx("%s: got invalid imsg %d from %s",
621 __func__, imsg.hdr.type,
622 vm->vm_params.vmc_params.vcp_name);
623 }
624 imsg_free(&imsg);
625 }
626 imsg_event_add(iev);
627 }
628
629 /*
630 * vm_shutdown
631 *
632 * Tell the vmm parent process to shutdown or reboot the VM and exit.
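 * The request is only forwarded over the imsg channel; the parent does the
 * actual work, so this function flushes the queue and then _exit()s without
 * returning.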
633 */
634 __dead void
635 vm_shutdown(unsigned int cmd)
636 {
637 switch (cmd) {
638 case VMMCI_NONE:
639 case VMMCI_SHUTDOWN:
640 (void)imsg_compose_event(&current_vm->vm_iev,
641 IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
642 break;
643 case VMMCI_REBOOT:
644 (void)imsg_compose_event(&current_vm->vm_iev,
645 IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
646 break;
647 default:
648 fatalx("invalid vm ctl command: %d", cmd);
649 }
650 imsg_flush(&current_vm->vm_iev.ibuf);
651
652 _exit(0);
653 }
654
655 int
656 send_vm(int fd, struct vmd_vm *vm)
657 {
658 struct vm_rwregs_params vrp;
659 struct vm_rwvmparams_params vpp;
660 struct vmop_create_params *vmc;
661 struct vm_terminate_params vtp;
662 unsigned int flags = 0;
663 unsigned int i;
664 int ret = 0;
665 size_t sz;
666
667 if (dump_send_header(fd)) {
668 log_warnx("%s: failed to send vm dump header", __func__);
669 goto err;
670 }
671
672 pause_vm(vm);
673
674 vmc = calloc(1, sizeof(struct vmop_create_params));
675 if (vmc == NULL) {
676 log_warn("%s: calloc error getting vmc", __func__);
677 ret = -1;
678 goto err;
679 }
680
681 flags |= VMOP_CREATE_MEMORY;
682 memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
683 vmop_create_params));
684 vmc->vmc_flags = flags;
685 vrp.vrwp_vm_id = vm->vm_params.vmc_params.vcp_id;
686 vrp.vrwp_mask = VM_RWREGS_ALL;
687 vpp.vpp_mask = VM_RWVMPARAMS_ALL;
688 vpp.vpp_vm_id = vm->vm_params.vmc_params.vcp_id;
689
690 sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
691 if (sz != sizeof(struct vmop_create_params)) {
692 ret = -1;
693 goto err;
694 }
695
696 for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
697 vrp.vrwp_vcpu_id = i;
698 if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
699 log_warn("%s: readregs failed", __func__);
700 goto err;
701 }
702
703 sz = atomicio(vwrite, fd, &vrp,
704 sizeof(struct vm_rwregs_params));
705 if (sz != sizeof(struct vm_rwregs_params)) {
706 log_warn("%s: dumping registers failed", __func__);
707 ret = -1;
708 goto err;
709 }
710 }
711
712 /* Dump memory before devices to aid in restoration.
*/ 713 if ((ret = dump_mem(fd, vm))) 714 goto err; 715 if ((ret = i8253_dump(fd))) 716 goto err; 717 if ((ret = i8259_dump(fd))) 718 goto err; 719 if ((ret = ns8250_dump(fd))) 720 goto err; 721 if ((ret = mc146818_dump(fd))) 722 goto err; 723 if ((ret = fw_cfg_dump(fd))) 724 goto err; 725 if ((ret = pci_dump(fd))) 726 goto err; 727 if ((ret = virtio_dump(fd))) 728 goto err; 729 730 for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) { 731 vpp.vpp_vcpu_id = i; 732 if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) { 733 log_warn("%s: readvmparams failed", __func__); 734 goto err; 735 } 736 737 sz = atomicio(vwrite, fd, &vpp, 738 sizeof(struct vm_rwvmparams_params)); 739 if (sz != sizeof(struct vm_rwvmparams_params)) { 740 log_warn("%s: dumping vm params failed", __func__); 741 ret = -1; 742 goto err; 743 } 744 } 745 746 vtp.vtp_vm_id = vm->vm_params.vmc_params.vcp_id; 747 if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) { 748 log_warnx("%s: term IOC error: %d, %d", __func__, 749 errno, ENOENT); 750 } 751 err: 752 close(fd); 753 if (ret) 754 unpause_vm(vm); 755 return ret; 756 } 757 758 int 759 dump_send_header(int fd) { 760 struct vm_dump_header vmh; 761 int i; 762 763 memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE, 764 sizeof(vmh.vmh_signature)); 765 766 vmh.vmh_cpuids[0].code = 0x00; 767 vmh.vmh_cpuids[0].leaf = 0x00; 768 769 vmh.vmh_cpuids[1].code = 0x01; 770 vmh.vmh_cpuids[1].leaf = 0x00; 771 772 vmh.vmh_cpuids[2].code = 0x07; 773 vmh.vmh_cpuids[2].leaf = 0x00; 774 775 vmh.vmh_cpuids[3].code = 0x0d; 776 vmh.vmh_cpuids[3].leaf = 0x00; 777 778 vmh.vmh_cpuids[4].code = 0x80000001; 779 vmh.vmh_cpuids[4].leaf = 0x00; 780 781 vmh.vmh_version = VM_DUMP_VERSION; 782 783 for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) { 784 CPUID_LEAF(vmh.vmh_cpuids[i].code, 785 vmh.vmh_cpuids[i].leaf, 786 vmh.vmh_cpuids[i].a, 787 vmh.vmh_cpuids[i].b, 788 vmh.vmh_cpuids[i].c, 789 vmh.vmh_cpuids[i].d); 790 } 791 792 if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh)) 793 return (-1); 794 795 return (0); 796 } 797 798 int 799 dump_mem(int fd, struct vmd_vm *vm) 800 { 801 unsigned int i; 802 int ret; 803 struct vm_mem_range *vmr; 804 805 for (i = 0; i < vm->vm_params.vmc_params.vcp_nmemranges; i++) { 806 vmr = &vm->vm_params.vmc_params.vcp_memranges[i]; 807 ret = dump_vmr(fd, vmr); 808 if (ret) 809 return ret; 810 } 811 return (0); 812 } 813 814 int 815 restore_vm_params(int fd, struct vm_create_params *vcp) { 816 unsigned int i; 817 struct vm_rwvmparams_params vpp; 818 819 for (i = 0; i < vcp->vcp_ncpus; i++) { 820 if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) { 821 log_warn("%s: error restoring vm params", __func__); 822 return (-1); 823 } 824 vpp.vpp_vm_id = vcp->vcp_id; 825 vpp.vpp_vcpu_id = i; 826 if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) { 827 log_debug("%s: writing vm params failed", __func__); 828 return (-1); 829 } 830 } 831 return (0); 832 } 833 834 void 835 restore_mem(int fd, struct vm_create_params *vcp) 836 { 837 unsigned int i; 838 struct vm_mem_range *vmr; 839 840 for (i = 0; i < vcp->vcp_nmemranges; i++) { 841 vmr = &vcp->vcp_memranges[i]; 842 restore_vmr(fd, vmr); 843 } 844 } 845 846 int 847 dump_vmr(int fd, struct vm_mem_range *vmr) 848 { 849 size_t rem = vmr->vmr_size, read=0; 850 char buf[PAGE_SIZE]; 851 852 while (rem > 0) { 853 if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) { 854 log_warn("failed to read vmr"); 855 return (-1); 856 } 857 if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) { 858 log_warn("failed to dump vmr"); 859 
return (-1); 860 } 861 rem = rem - PAGE_SIZE; 862 read = read + PAGE_SIZE; 863 } 864 return (0); 865 } 866 867 void 868 restore_vmr(int fd, struct vm_mem_range *vmr) 869 { 870 size_t rem = vmr->vmr_size, wrote=0; 871 char buf[PAGE_SIZE]; 872 873 while (rem > 0) { 874 if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf)) 875 fatal("failed to restore vmr"); 876 if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE)) 877 fatal("failed to write vmr"); 878 rem = rem - PAGE_SIZE; 879 wrote = wrote + PAGE_SIZE; 880 } 881 } 882 883 static void 884 pause_vm(struct vmd_vm *vm) 885 { 886 unsigned int n; 887 int ret; 888 if (vm->vm_state & VM_STATE_PAUSED) 889 return; 890 891 current_vm->vm_state |= VM_STATE_PAUSED; 892 893 ret = pthread_barrier_init(&vm_pause_barrier, NULL, 894 vm->vm_params.vmc_params.vcp_ncpus + 1); 895 if (ret) { 896 log_warnx("%s: cannot initialize pause barrier (%d)", 897 __progname, ret); 898 return; 899 } 900 901 for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) { 902 ret = pthread_cond_broadcast(&vcpu_run_cond[n]); 903 if (ret) { 904 log_warnx("%s: can't broadcast vcpu run cond (%d)", 905 __func__, (int)ret); 906 return; 907 } 908 } 909 ret = pthread_barrier_wait(&vm_pause_barrier); 910 if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) { 911 log_warnx("%s: could not wait on pause barrier (%d)", 912 __func__, (int)ret); 913 return; 914 } 915 916 ret = pthread_barrier_destroy(&vm_pause_barrier); 917 if (ret) { 918 log_warnx("%s: could not destroy pause barrier (%d)", 919 __progname, ret); 920 return; 921 } 922 923 i8253_stop(); 924 mc146818_stop(); 925 ns8250_stop(); 926 virtio_stop(vm); 927 } 928 929 static void 930 unpause_vm(struct vmd_vm *vm) 931 { 932 unsigned int n; 933 int ret; 934 if (!(vm->vm_state & VM_STATE_PAUSED)) 935 return; 936 937 current_vm->vm_state &= ~VM_STATE_PAUSED; 938 for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) { 939 ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]); 940 if (ret) { 941 log_warnx("%s: can't broadcast vcpu unpause cond (%d)", 942 __func__, (int)ret); 943 return; 944 } 945 } 946 947 i8253_start(); 948 mc146818_start(); 949 ns8250_start(); 950 virtio_start(vm); 951 } 952 953 /* 954 * vcpu_reset 955 * 956 * Requests vmm(4) to reset the VCPUs in the indicated VM to 957 * the register state provided 958 * 959 * Parameters 960 * vmid: VM ID to reset 961 * vcpu_id: VCPU ID to reset 962 * vrs: the register state to initialize 963 * 964 * Return values: 965 * 0: success 966 * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not 967 * valid) 968 */ 969 int 970 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs) 971 { 972 struct vm_resetcpu_params vrp; 973 974 memset(&vrp, 0, sizeof(vrp)); 975 vrp.vrp_vm_id = vmid; 976 vrp.vrp_vcpu_id = vcpu_id; 977 memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state)); 978 979 log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid); 980 981 if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1) 982 return (errno); 983 984 return (0); 985 } 986 987 /* 988 * create_memory_map 989 * 990 * Sets up the guest physical memory ranges that the VM can access. 
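 * For all but the smallest guests the resulting layout is: low RAM up to
 * LOWMEM_KB, a reserved hole up to 1MB for the legacy VGA/ROM area, RAM from
 * 1MB up to the start of the PCI MMIO window, the MMIO window itself, a
 * reserved 2MB BIOS alias ending at 4GB, and any remaining RAM above 4GB.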
991 * 992 * Parameters: 993 * vcp: VM create parameters describing the VM whose memory map 994 * is being created 995 * 996 * Return values: 997 * nothing 998 */ 999 void 1000 create_memory_map(struct vm_create_params *vcp) 1001 { 1002 size_t len, mem_bytes; 1003 size_t above_1m = 0, above_4g = 0; 1004 1005 mem_bytes = vcp->vcp_memranges[0].vmr_size; 1006 vcp->vcp_nmemranges = 0; 1007 if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE) 1008 return; 1009 1010 /* First memory region: 0 - LOWMEM_KB (DOS low mem) */ 1011 len = LOWMEM_KB * 1024; 1012 vcp->vcp_memranges[0].vmr_gpa = 0x0; 1013 vcp->vcp_memranges[0].vmr_size = len; 1014 vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM; 1015 mem_bytes -= len; 1016 1017 /* 1018 * Second memory region: LOWMEM_KB - 1MB. 1019 * 1020 * N.B. - Normally ROMs or parts of video RAM are mapped here. 1021 * We have to add this region, because some systems 1022 * unconditionally write to 0xb8000 (VGA RAM), and 1023 * we need to make sure that vmm(4) permits accesses 1024 * to it. So allocate guest memory for it. 1025 */ 1026 len = MB(1) - (LOWMEM_KB * 1024); 1027 vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024; 1028 vcp->vcp_memranges[1].vmr_size = len; 1029 vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED; 1030 mem_bytes -= len; 1031 1032 /* If we have less than 2MB remaining, still create a 2nd BIOS area. */ 1033 if (mem_bytes <= MB(2)) { 1034 vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END; 1035 vcp->vcp_memranges[2].vmr_size = MB(2); 1036 vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED; 1037 vcp->vcp_nmemranges = 3; 1038 return; 1039 } 1040 1041 /* 1042 * Calculate the how to split any remaining memory across the 4GB 1043 * boundary while making sure we do not place physical memory into 1044 * MMIO ranges. 1045 */ 1046 if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) { 1047 above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1); 1048 above_4g = mem_bytes - above_1m; 1049 } else { 1050 above_1m = mem_bytes; 1051 above_4g = 0; 1052 } 1053 1054 /* Third memory region: area above 1MB to MMIO region */ 1055 vcp->vcp_memranges[2].vmr_gpa = MB(1); 1056 vcp->vcp_memranges[2].vmr_size = above_1m; 1057 vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM; 1058 1059 /* Fourth region: PCI MMIO range */ 1060 vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_BASE; 1061 vcp->vcp_memranges[3].vmr_size = VMM_PCI_MMIO_BAR_END - 1062 VMM_PCI_MMIO_BAR_BASE + 1; 1063 vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO; 1064 1065 /* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */ 1066 vcp->vcp_memranges[4].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1; 1067 vcp->vcp_memranges[4].vmr_size = MB(2); 1068 vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED; 1069 1070 /* Sixth region: any remainder above 4GB */ 1071 if (above_4g > 0) { 1072 vcp->vcp_memranges[5].vmr_gpa = GB(4); 1073 vcp->vcp_memranges[5].vmr_size = above_4g; 1074 vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM; 1075 vcp->vcp_nmemranges = 6; 1076 } else 1077 vcp->vcp_nmemranges = 5; 1078 } 1079 1080 /* 1081 * alloc_guest_mem 1082 * 1083 * Allocates memory for the guest. 1084 * Instead of doing a single allocation with one mmap(), we allocate memory 1085 * separately for every range for the following reasons: 1086 * - ASLR for the individual ranges 1087 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to 1088 * map the single mmap'd userspace memory to the individual guest physical 1089 * memory ranges, the underlying amap of the single mmap'd range would have 1090 * to allocate per-page reference counters. 
The reason is that the 1091 * individual guest physical ranges would reference the single mmap'd region 1092 * only partially. However, if every guest physical range has its own 1093 * corresponding mmap'd userspace allocation, there are no partial 1094 * references: every guest physical range fully references an mmap'd 1095 * range => no per-page reference counters have to be allocated. 1096 * 1097 * Return values: 1098 * 0: success 1099 * !0: failure - errno indicating the source of the failure 1100 */ 1101 int 1102 alloc_guest_mem(struct vmd_vm *vm) 1103 { 1104 void *p; 1105 int ret = 0; 1106 size_t i, j; 1107 struct vm_create_params *vcp = &vm->vm_params.vmc_params; 1108 struct vm_mem_range *vmr; 1109 1110 for (i = 0; i < vcp->vcp_nmemranges; i++) { 1111 vmr = &vcp->vcp_memranges[i]; 1112 1113 /* 1114 * We only need R/W as userland. vmm(4) will use R/W/X in its 1115 * mapping. 1116 * 1117 * We must use MAP_SHARED so emulated devices will be able 1118 * to generate shared mappings. 1119 */ 1120 p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, 1121 MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0); 1122 if (p == MAP_FAILED) { 1123 ret = errno; 1124 for (j = 0; j < i; j++) { 1125 vmr = &vcp->vcp_memranges[j]; 1126 munmap((void *)vmr->vmr_va, vmr->vmr_size); 1127 } 1128 return (ret); 1129 } 1130 vmr->vmr_va = (vaddr_t)p; 1131 } 1132 1133 return (ret); 1134 } 1135 1136 /* 1137 * vmm_create_vm 1138 * 1139 * Requests vmm(4) to create a new VM using the supplied creation 1140 * parameters. This operation results in the creation of the in-kernel 1141 * structures for the VM, but does not start the VM's vcpu(s). 1142 * 1143 * Parameters: 1144 * vm: pointer to the vm object 1145 * 1146 * Return values: 1147 * 0: success 1148 * !0 : ioctl to vmm(4) failed 1149 */ 1150 static int 1151 vmm_create_vm(struct vmd_vm *vm) 1152 { 1153 struct vm_create_params *vcp = &vm->vm_params.vmc_params; 1154 1155 /* Sanity check arguments */ 1156 if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) 1157 return (EINVAL); 1158 1159 if (vcp->vcp_nmemranges == 0 || 1160 vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) 1161 return (EINVAL); 1162 1163 if (vm->vm_params.vmc_ndisks > VM_MAX_DISKS_PER_VM) 1164 return (EINVAL); 1165 1166 if (vm->vm_params.vmc_nnics > VM_MAX_NICS_PER_VM) 1167 return (EINVAL); 1168 1169 if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1) 1170 return (errno); 1171 1172 return (0); 1173 } 1174 1175 /* 1176 * init_emulated_hw 1177 * 1178 * Initializes the userspace hardware emulation 1179 */ 1180 void 1181 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom, 1182 int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) 1183 { 1184 struct vm_create_params *vcp = &vmc->vmc_params; 1185 size_t i; 1186 uint64_t memlo, memhi; 1187 1188 /* Calculate memory size for NVRAM registers */ 1189 memlo = memhi = 0; 1190 for (i = 0; i < vcp->vcp_nmemranges; i++) { 1191 if (vcp->vcp_memranges[i].vmr_gpa == MB(1) && 1192 vcp->vcp_memranges[i].vmr_size > (15 * MB(1))) 1193 memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1)); 1194 else if (vcp->vcp_memranges[i].vmr_gpa == GB(4)) 1195 memhi = vcp->vcp_memranges[i].vmr_size; 1196 } 1197 1198 /* Reset the IO port map */ 1199 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); 1200 1201 /* Init i8253 PIT */ 1202 i8253_init(vcp->vcp_id); 1203 ioports_map[TIMER_CTRL] = vcpu_exit_i8253; 1204 ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; 1205 ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; 1206 ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; 1207 
ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc; 1208 1209 /* Init mc146818 RTC */ 1210 mc146818_init(vcp->vcp_id, memlo, memhi); 1211 ioports_map[IO_RTC] = vcpu_exit_mc146818; 1212 ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; 1213 1214 /* Init master and slave PICs */ 1215 i8259_init(); 1216 ioports_map[IO_ICU1] = vcpu_exit_i8259; 1217 ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; 1218 ioports_map[IO_ICU2] = vcpu_exit_i8259; 1219 ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; 1220 ioports_map[ELCR0] = vcpu_exit_elcr; 1221 ioports_map[ELCR1] = vcpu_exit_elcr; 1222 1223 /* Init ns8250 UART */ 1224 ns8250_init(con_fd, vcp->vcp_id); 1225 for (i = COM1_DATA; i <= COM1_SCR; i++) 1226 ioports_map[i] = vcpu_exit_com; 1227 1228 /* Initialize PCI */ 1229 for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++) 1230 ioports_map[i] = vcpu_exit_pci; 1231 1232 ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; 1233 ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; 1234 ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; 1235 ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci; 1236 ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci; 1237 pci_init(); 1238 1239 /* Initialize virtio devices */ 1240 virtio_init(current_vm, child_cdrom, child_disks, child_taps); 1241 1242 /* 1243 * Init QEMU fw_cfg interface. Must be done last for pci hardware 1244 * detection. 1245 */ 1246 fw_cfg_init(vmc); 1247 ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; 1248 ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; 1249 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; 1250 ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; 1251 } 1252 1253 /* 1254 * restore_emulated_hw 1255 * 1256 * Restores the userspace hardware emulation from fd 1257 */ 1258 void 1259 restore_emulated_hw(struct vm_create_params *vcp, int fd, 1260 int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom) 1261 { 1262 /* struct vm_create_params *vcp = &vmc->vmc_params; */ 1263 int i; 1264 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); 1265 1266 /* Init i8253 PIT */ 1267 i8253_restore(fd, vcp->vcp_id); 1268 ioports_map[TIMER_CTRL] = vcpu_exit_i8253; 1269 ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; 1270 ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; 1271 ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; 1272 1273 /* Init master and slave PICs */ 1274 i8259_restore(fd); 1275 ioports_map[IO_ICU1] = vcpu_exit_i8259; 1276 ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; 1277 ioports_map[IO_ICU2] = vcpu_exit_i8259; 1278 ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; 1279 1280 /* Init ns8250 UART */ 1281 ns8250_restore(fd, con_fd, vcp->vcp_id); 1282 for (i = COM1_DATA; i <= COM1_SCR; i++) 1283 ioports_map[i] = vcpu_exit_com; 1284 1285 /* Init mc146818 RTC */ 1286 mc146818_restore(fd, vcp->vcp_id); 1287 ioports_map[IO_RTC] = vcpu_exit_mc146818; 1288 ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; 1289 1290 /* Init QEMU fw_cfg interface */ 1291 fw_cfg_restore(fd); 1292 ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; 1293 ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; 1294 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; 1295 ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; 1296 1297 /* Initialize PCI */ 1298 for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++) 1299 ioports_map[i] = vcpu_exit_pci; 1300 1301 ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; 1302 ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; 1303 ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; 1304 
ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1305 ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1306 pci_restore(fd);
1307 virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
1308 }
1309
1310 /*
1311 * run_vm
1312 *
1313 * Runs the VM whose creation parameters are specified in vmc
1314 *
1315 * Parameters:
1316 * child_cdrom: previously-opened child ISO disk file descriptor
1317 * child_disks: previously-opened child VM disk file descriptors
1318 * child_taps: previously-opened child tap file descriptors
1319 * vmc: vmop_create_params struct containing the VM's desired creation
1320 * configuration
1321 * vrs: VCPU register state to initialize
1322 *
1323 * Return values:
1324 * 0: the VM exited normally
1325 * !0 : the VM exited abnormally or failed to start
1326 */
1327 static int
1328 run_vm(struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
1329 {
1330 struct vm_create_params *vcp = &vmc->vmc_params;
1331 struct vm_rwregs_params vregsp;
1332 uint8_t evdone = 0;
1333 size_t i;
1334 int ret;
1335 pthread_t *tid, evtid;
1336 char tname[MAXCOMLEN + 1];
1337 struct vm_run_params **vrp;
1338 void *exit_status;
1339
1340 if (vcp == NULL)
1341 return (EINVAL);
1342
1343 if (vcp->vcp_nmemranges == 0 ||
1344 vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1345 return (EINVAL);
1346
1347 tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
1348 vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
1349 if (tid == NULL || vrp == NULL) {
1350 log_warn("%s: memory allocation error - exiting.",
1351 __progname);
1352 return (ENOMEM);
1353 }
1354
1355 log_debug("%s: starting %zu vcpu thread(s) for vm %s", __func__,
1356 vcp->vcp_ncpus, vcp->vcp_name);
1357
1358 /*
1359 * Create and launch one thread for each VCPU. These threads may
1360 * migrate between PCPUs over time; the need to reload CPU state
1361 * in such situations is detected and performed by vmm(4) in the
1362 * kernel.
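 * A separate event thread runs the libevent loop for imsg and device events;
 * the main thread then blocks on threadcond until the vcpu threads (or the
 * event thread) report completion.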
1363 */ 1364 for (i = 0 ; i < vcp->vcp_ncpus; i++) { 1365 vrp[i] = malloc(sizeof(struct vm_run_params)); 1366 if (vrp[i] == NULL) { 1367 log_warn("%s: memory allocation error - " 1368 "exiting.", __progname); 1369 /* caller will exit, so skip freeing */ 1370 return (ENOMEM); 1371 } 1372 vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit)); 1373 if (vrp[i]->vrp_exit == NULL) { 1374 log_warn("%s: memory allocation error - " 1375 "exiting.", __progname); 1376 /* caller will exit, so skip freeing */ 1377 return (ENOMEM); 1378 } 1379 vrp[i]->vrp_vm_id = vcp->vcp_id; 1380 vrp[i]->vrp_vcpu_id = i; 1381 1382 if (vcpu_reset(vcp->vcp_id, i, vrs)) { 1383 log_warnx("%s: cannot reset VCPU %zu - exiting.", 1384 __progname, i); 1385 return (EIO); 1386 } 1387 1388 /* once more because reset_cpu changes regs */ 1389 if (current_vm->vm_state & VM_STATE_RECEIVED) { 1390 vregsp.vrwp_vm_id = vcp->vcp_id; 1391 vregsp.vrwp_vcpu_id = i; 1392 vregsp.vrwp_regs = *vrs; 1393 vregsp.vrwp_mask = VM_RWREGS_ALL; 1394 if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS, 1395 &vregsp)) == -1) { 1396 log_warn("%s: writeregs failed", __func__); 1397 return (ret); 1398 } 1399 } 1400 1401 ret = pthread_cond_init(&vcpu_run_cond[i], NULL); 1402 if (ret) { 1403 log_warnx("%s: cannot initialize cond var (%d)", 1404 __progname, ret); 1405 return (ret); 1406 } 1407 1408 ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL); 1409 if (ret) { 1410 log_warnx("%s: cannot initialize mtx (%d)", 1411 __progname, ret); 1412 return (ret); 1413 } 1414 1415 ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL); 1416 if (ret) { 1417 log_warnx("%s: cannot initialize unpause var (%d)", 1418 __progname, ret); 1419 return (ret); 1420 } 1421 1422 ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL); 1423 if (ret) { 1424 log_warnx("%s: cannot initialize unpause mtx (%d)", 1425 __progname, ret); 1426 return (ret); 1427 } 1428 1429 vcpu_hlt[i] = 0; 1430 1431 /* Start each VCPU run thread at vcpu_run_loop */ 1432 ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]); 1433 if (ret) { 1434 /* caller will _exit after this return */ 1435 ret = errno; 1436 log_warn("%s: could not create vcpu thread %zu", 1437 __func__, i); 1438 return (ret); 1439 } 1440 1441 snprintf(tname, sizeof(tname), "vcpu-%zu", i); 1442 pthread_set_name_np(tid[i], tname); 1443 } 1444 1445 log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name); 1446 ret = pthread_create(&evtid, NULL, event_thread, &evdone); 1447 if (ret) { 1448 errno = ret; 1449 log_warn("%s: could not create event thread", __func__); 1450 return (ret); 1451 } 1452 pthread_set_name_np(evtid, "event"); 1453 1454 for (;;) { 1455 ret = pthread_cond_wait(&threadcond, &threadmutex); 1456 if (ret) { 1457 log_warn("%s: waiting on thread state condition " 1458 "variable failed", __func__); 1459 return (ret); 1460 } 1461 1462 /* 1463 * Did a VCPU thread exit with an error? => return the first one 1464 */ 1465 for (i = 0; i < vcp->vcp_ncpus; i++) { 1466 if (vcpu_done[i] == 0) 1467 continue; 1468 1469 if (pthread_join(tid[i], &exit_status)) { 1470 log_warn("%s: failed to join thread %zd - " 1471 "exiting", __progname, i); 1472 return (EIO); 1473 } 1474 1475 ret = (intptr_t)exit_status; 1476 } 1477 1478 /* Did the event thread exit? 
=> return with an error */ 1479 if (evdone) { 1480 if (pthread_join(evtid, &exit_status)) { 1481 log_warn("%s: failed to join event thread - " 1482 "exiting", __progname); 1483 return (EIO); 1484 } 1485 1486 log_warnx("%s: vm %d event thread exited " 1487 "unexpectedly", __progname, vcp->vcp_id); 1488 return (EIO); 1489 } 1490 1491 /* Did all VCPU threads exit successfully? => return */ 1492 for (i = 0; i < vcp->vcp_ncpus; i++) { 1493 if (vcpu_done[i] == 0) 1494 break; 1495 } 1496 if (i == vcp->vcp_ncpus) 1497 return (ret); 1498 1499 /* Some more threads to wait for, start over */ 1500 } 1501 1502 return (ret); 1503 } 1504 1505 void * 1506 event_thread(void *arg) 1507 { 1508 uint8_t *donep = arg; 1509 intptr_t ret; 1510 1511 ret = event_dispatch(); 1512 1513 mutex_lock(&threadmutex); 1514 *donep = 1; 1515 pthread_cond_signal(&threadcond); 1516 mutex_unlock(&threadmutex); 1517 1518 return (void *)ret; 1519 } 1520 1521 /* 1522 * vcpu_run_loop 1523 * 1524 * Runs a single VCPU until vmm(4) requires help handling an exit, 1525 * or the VM terminates. 1526 * 1527 * Parameters: 1528 * arg: vcpu_run_params for the VCPU being run by this thread 1529 * 1530 * Return values: 1531 * NULL: the VCPU shutdown properly 1532 * !NULL: error processing VCPU run, or the VCPU shutdown abnormally 1533 */ 1534 void * 1535 vcpu_run_loop(void *arg) 1536 { 1537 struct vm_run_params *vrp = (struct vm_run_params *)arg; 1538 intptr_t ret = 0; 1539 uint32_t n; 1540 1541 n = vrp->vrp_vcpu_id; 1542 1543 for (;;) { 1544 ret = pthread_mutex_lock(&vcpu_run_mtx[n]); 1545 1546 if (ret) { 1547 log_warnx("%s: can't lock vcpu run mtx (%d)", 1548 __func__, (int)ret); 1549 return ((void *)ret); 1550 } 1551 1552 /* If we are halted and need to pause, pause */ 1553 if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) { 1554 ret = pthread_barrier_wait(&vm_pause_barrier); 1555 if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) { 1556 log_warnx("%s: could not wait on pause barrier (%d)", 1557 __func__, (int)ret); 1558 return ((void *)ret); 1559 } 1560 1561 ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]); 1562 if (ret) { 1563 log_warnx("%s: can't lock vcpu unpause mtx (%d)", 1564 __func__, (int)ret); 1565 return ((void *)ret); 1566 } 1567 1568 /* i8259 may be firing as we pause, release run mtx. */ 1569 mutex_unlock(&vcpu_run_mtx[n]); 1570 ret = pthread_cond_wait(&vcpu_unpause_cond[n], 1571 &vcpu_unpause_mtx[n]); 1572 if (ret) { 1573 log_warnx( 1574 "%s: can't wait on unpause cond (%d)", 1575 __func__, (int)ret); 1576 break; 1577 } 1578 mutex_lock(&vcpu_run_mtx[n]); 1579 1580 ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]); 1581 if (ret) { 1582 log_warnx("%s: can't unlock unpause mtx (%d)", 1583 __func__, (int)ret); 1584 break; 1585 } 1586 } 1587 1588 /* If we are halted and not paused, wait */ 1589 if (vcpu_hlt[n]) { 1590 ret = pthread_cond_wait(&vcpu_run_cond[n], 1591 &vcpu_run_mtx[n]); 1592 1593 if (ret) { 1594 log_warnx( 1595 "%s: can't wait on cond (%d)", 1596 __func__, (int)ret); 1597 (void)pthread_mutex_unlock( 1598 &vcpu_run_mtx[n]); 1599 break; 1600 } 1601 } 1602 1603 ret = pthread_mutex_unlock(&vcpu_run_mtx[n]); 1604 1605 if (ret) { 1606 log_warnx("%s: can't unlock mutex on cond (%d)", 1607 __func__, (int)ret); 1608 break; 1609 } 1610 1611 if (vrp->vrp_irqready && i8259_is_pending()) { 1612 vrp->vrp_inject.vie_vector = i8259_ack(); 1613 vrp->vrp_inject.vie_type = VCPU_INJECT_INTR; 1614 } else 1615 vrp->vrp_inject.vie_type = VCPU_INJECT_NONE; 1616 1617 /* Still more interrupts pending? 
*/ 1618 vrp->vrp_intr_pending = i8259_is_pending(); 1619 1620 if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) { 1621 /* If run ioctl failed, exit */ 1622 ret = errno; 1623 log_warn("%s: vm %d / vcpu %d run ioctl failed", 1624 __func__, current_vm->vm_vmid, n); 1625 break; 1626 } 1627 1628 /* If the VM is terminating, exit normally */ 1629 if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) { 1630 ret = (intptr_t)NULL; 1631 break; 1632 } 1633 1634 if (vrp->vrp_exit_reason != VM_EXIT_NONE) { 1635 /* 1636 * vmm(4) needs help handling an exit, handle in 1637 * vcpu_exit. 1638 */ 1639 ret = vcpu_exit(vrp); 1640 if (ret) 1641 break; 1642 } 1643 } 1644 1645 mutex_lock(&threadmutex); 1646 vcpu_done[n] = 1; 1647 pthread_cond_signal(&threadcond); 1648 mutex_unlock(&threadmutex); 1649 1650 return ((void *)ret); 1651 } 1652 1653 int 1654 vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr) 1655 { 1656 struct vm_intr_params vip; 1657 1658 memset(&vip, 0, sizeof(vip)); 1659 1660 vip.vip_vm_id = vm_id; 1661 vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */ 1662 vip.vip_intr = intr; 1663 1664 if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1) 1665 return (errno); 1666 1667 return (0); 1668 } 1669 1670 /* 1671 * vcpu_exit_pci 1672 * 1673 * Handle all I/O to the emulated PCI subsystem. 1674 * 1675 * Parameters: 1676 * vrp: vcpu run parameters containing guest state for this exit 1677 * 1678 * Return value: 1679 * Interrupt to inject to the guest VM, or 0xFF if no interrupt should 1680 * be injected. 1681 */ 1682 uint8_t 1683 vcpu_exit_pci(struct vm_run_params *vrp) 1684 { 1685 struct vm_exit *vei = vrp->vrp_exit; 1686 uint8_t intr; 1687 1688 intr = 0xFF; 1689 1690 switch (vei->vei.vei_port) { 1691 case PCI_MODE1_ADDRESS_REG: 1692 pci_handle_address_reg(vrp); 1693 break; 1694 case PCI_MODE1_DATA_REG: 1695 case PCI_MODE1_DATA_REG + 1: 1696 case PCI_MODE1_DATA_REG + 2: 1697 case PCI_MODE1_DATA_REG + 3: 1698 pci_handle_data_reg(vrp); 1699 break; 1700 case VM_PCI_IO_BAR_BASE ... VM_PCI_IO_BAR_END: 1701 intr = pci_handle_io(vrp); 1702 break; 1703 default: 1704 log_warnx("%s: unknown PCI register 0x%llx", 1705 __progname, (uint64_t)vei->vei.vei_port); 1706 break; 1707 } 1708 1709 return (intr); 1710 } 1711 1712 /* 1713 * vcpu_exit_inout 1714 * 1715 * Handle all I/O exits that need to be emulated in vmd. This includes the 1716 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device. 1717 * 1718 * Parameters: 1719 * vrp: vcpu run parameters containing guest state for this exit 1720 */ 1721 void 1722 vcpu_exit_inout(struct vm_run_params *vrp) 1723 { 1724 struct vm_exit *vei = vrp->vrp_exit; 1725 uint8_t intr = 0xFF; 1726 1727 if (vei->vei.vei_rep || vei->vei.vei_string) { 1728 #ifdef MMIO_DEBUG 1729 log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x", 1730 __func__, 1731 vei->vei.vei_rep == 0 ? "" : "REP ", 1732 vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT", 1733 vei->vei.vei_string == 0 ? 
"" : "S", 1734 vei->vei.vei_size, vei->vei.vei_encoding, 1735 vei->vei.vei_data, vei->vei.vei_port); 1736 log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx", 1737 __func__, 1738 vei->vrs.vrs_gprs[VCPU_REGS_RCX], 1739 vei->vrs.vrs_gprs[VCPU_REGS_RDX], 1740 vei->vrs.vrs_gprs[VCPU_REGS_RSI]); 1741 #endif /* MMIO_DEBUG */ 1742 fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)", 1743 __func__); 1744 } 1745 1746 if (ioports_map[vei->vei.vei_port] != NULL) 1747 intr = ioports_map[vei->vei.vei_port](vrp); 1748 else if (vei->vei.vei_dir == VEI_DIR_IN) 1749 set_return_data(vei, 0xFFFFFFFF); 1750 1751 vei->vrs.vrs_gprs[VCPU_REGS_RIP] += vei->vei.vei_insn_len; 1752 1753 if (intr != 0xFF) 1754 vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr); 1755 } 1756 1757 /* 1758 * vcpu_exit_eptviolation 1759 * 1760 * handle an EPT Violation 1761 * 1762 * Parameters: 1763 * vrp: vcpu run parameters containing guest state for this exit 1764 * 1765 * Return values: 1766 * 0: no action required 1767 * EFAULT: a protection fault occured, kill the vm. 1768 */ 1769 int 1770 vcpu_exit_eptviolation(struct vm_run_params *vrp) 1771 { 1772 struct vm_exit *ve = vrp->vrp_exit; 1773 int ret = 0; 1774 #if MMIO_NOTYET 1775 struct x86_insn insn; 1776 uint64_t va, pa; 1777 size_t len = 15; /* Max instruction length in x86. */ 1778 #endif /* MMIO_NOTYET */ 1779 switch (ve->vee.vee_fault_type) { 1780 case VEE_FAULT_HANDLED: 1781 log_debug("%s: fault already handled", __func__); 1782 break; 1783 1784 #if MMIO_NOTYET 1785 case VEE_FAULT_MMIO_ASSIST: 1786 /* Intel VMX might give us the length of the instruction. */ 1787 if (ve->vee.vee_insn_info & VEE_LEN_VALID) 1788 len = ve->vee.vee_insn_len; 1789 1790 if (len > 15) 1791 fatalx("%s: invalid instruction length %lu", __func__, 1792 len); 1793 1794 /* If we weren't given instruction bytes, we need to fetch. */ 1795 if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) { 1796 memset(ve->vee.vee_insn_bytes, 0, 1797 sizeof(ve->vee.vee_insn_bytes)); 1798 va = ve->vrs.vrs_gprs[VCPU_REGS_RIP]; 1799 1800 /* XXX Only support instructions that fit on 1 page. */ 1801 if ((va & PAGE_MASK) + len > PAGE_SIZE) { 1802 log_warnx("%s: instruction might cross page " 1803 "boundary", __func__); 1804 ret = EINVAL; 1805 break; 1806 } 1807 1808 ret = translate_gva(ve, va, &pa, PROT_EXEC); 1809 if (ret != 0) { 1810 log_warnx("%s: failed gva translation", 1811 __func__); 1812 break; 1813 } 1814 1815 ret = read_mem(pa, ve->vee.vee_insn_bytes, len); 1816 if (ret != 0) { 1817 log_warnx("%s: failed to fetch instruction " 1818 "bytes from 0x%llx", __func__, pa); 1819 break; 1820 } 1821 } 1822 1823 ret = insn_decode(ve, &insn); 1824 if (ret == 0) 1825 ret = insn_emulate(ve, &insn); 1826 break; 1827 #endif /* MMIO_NOTYET */ 1828 1829 case VEE_FAULT_PROTECT: 1830 log_debug("%s: EPT Violation: rip=0x%llx", __progname, 1831 ve->vrs.vrs_gprs[VCPU_REGS_RIP]); 1832 ret = EFAULT; 1833 break; 1834 1835 default: 1836 fatalx("%s: invalid fault_type %d", __progname, 1837 ve->vee.vee_fault_type); 1838 /* UNREACHED */ 1839 } 1840 1841 return (ret); 1842 } 1843 1844 /* 1845 * vcpu_exit 1846 * 1847 * Handle a vcpu exit. This function is called when it is determined that 1848 * vmm(4) requires the assistance of vmd to support a particular guest 1849 * exit type (eg, accessing an I/O port or device). Guest state is contained 1850 * in 'vrp', and will be resent to vmm(4) on exit completion. 
1851 * 1852 * Upon conclusion of handling the exit, the function determines if any 1853 * interrupts should be injected into the guest, and asserts the proper 1854 * IRQ line whose interrupt should be vectored. 1855 * 1856 * Parameters: 1857 * vrp: vcpu run parameters containing guest state for this exit 1858 * 1859 * Return values: 1860 * 0: the exit was handled successfully 1861 * 1: an error occurred (eg, unknown exit reason passed in 'vrp') 1862 */ 1863 int 1864 vcpu_exit(struct vm_run_params *vrp) 1865 { 1866 int ret; 1867 1868 switch (vrp->vrp_exit_reason) { 1869 case VMX_EXIT_INT_WINDOW: 1870 case SVM_VMEXIT_VINTR: 1871 case VMX_EXIT_CPUID: 1872 case VMX_EXIT_EXTINT: 1873 case SVM_VMEXIT_INTR: 1874 case SVM_VMEXIT_MSR: 1875 case SVM_VMEXIT_CPUID: 1876 /* 1877 * We may be exiting to vmd to handle a pending interrupt but 1878 * at the same time the last exit type may have been one of 1879 * these. In this case, there's nothing extra to be done 1880 * here (and falling through to the default case below results 1881 * in more vmd log spam). 1882 */ 1883 break; 1884 case SVM_VMEXIT_NPF: 1885 case VMX_EXIT_EPT_VIOLATION: 1886 ret = vcpu_exit_eptviolation(vrp); 1887 if (ret) 1888 return (ret); 1889 break; 1890 case VMX_EXIT_IO: 1891 case SVM_VMEXIT_IOIO: 1892 vcpu_exit_inout(vrp); 1893 break; 1894 case VMX_EXIT_HLT: 1895 case SVM_VMEXIT_HLT: 1896 ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); 1897 if (ret) { 1898 log_warnx("%s: can't lock vcpu mutex (%d)", 1899 __func__, ret); 1900 return (ret); 1901 } 1902 vcpu_hlt[vrp->vrp_vcpu_id] = 1; 1903 ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); 1904 if (ret) { 1905 log_warnx("%s: can't unlock vcpu mutex (%d)", 1906 __func__, ret); 1907 return (ret); 1908 } 1909 break; 1910 case VMX_EXIT_TRIPLE_FAULT: 1911 case SVM_VMEXIT_SHUTDOWN: 1912 /* reset VM */ 1913 return (EAGAIN); 1914 default: 1915 log_debug("%s: unknown exit reason 0x%x", 1916 __progname, vrp->vrp_exit_reason); 1917 } 1918 1919 return (0); 1920 } 1921 1922 /* 1923 * find_gpa_range 1924 * 1925 * Search for a contiguous guest physical mem range. 1926 * 1927 * Parameters: 1928 * vcp: VM create parameters that contain the memory map to search in 1929 * gpa: the starting guest physical address 1930 * len: the length of the memory range 1931 * 1932 * Return values: 1933 * NULL: on failure if there is no memory range as described by the parameters 1934 * Pointer to vm_mem_range that contains the start of the range otherwise. 1935 */ 1936 static struct vm_mem_range * 1937 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len) 1938 { 1939 size_t i, n; 1940 struct vm_mem_range *vmr; 1941 1942 /* Find the first vm_mem_range that contains gpa */ 1943 for (i = 0; i < vcp->vcp_nmemranges; i++) { 1944 vmr = &vcp->vcp_memranges[i]; 1945 if (gpa < vmr->vmr_gpa + vmr->vmr_size) 1946 break; 1947 } 1948 1949 /* No range found. */ 1950 if (i == vcp->vcp_nmemranges) 1951 return (NULL); 1952 1953 /* 1954 * vmr may cover the range [gpa, gpa + len) only partly. Make 1955 * sure that the following vm_mem_ranges are contiguous and 1956 * cover the rest. 
1957 */ 1958 n = vmr->vmr_size - (gpa - vmr->vmr_gpa); 1959 if (len < n) 1960 len = 0; 1961 else 1962 len -= n; 1963 gpa = vmr->vmr_gpa + vmr->vmr_size; 1964 for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) { 1965 vmr = &vcp->vcp_memranges[i]; 1966 if (gpa != vmr->vmr_gpa) 1967 return (NULL); 1968 if (len <= vmr->vmr_size) 1969 len = 0; 1970 else 1971 len -= vmr->vmr_size; 1972 1973 gpa = vmr->vmr_gpa + vmr->vmr_size; 1974 } 1975 1976 if (len != 0) 1977 return (NULL); 1978 1979 return (vmr); 1980 } 1981 1982 /* 1983 * write_mem 1984 * 1985 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'. 1986 * 1987 * Parameters: 1988 * dst: the destination paddr_t in the guest VM 1989 * buf: data to copy (or NULL to zero the data) 1990 * len: number of bytes to copy 1991 * 1992 * Return values: 1993 * 0: success 1994 * EINVAL: if the guest physical memory range [dst, dst + len) does not 1995 * exist in the guest. 1996 */ 1997 int 1998 write_mem(paddr_t dst, const void *buf, size_t len) 1999 { 2000 const char *from = buf; 2001 char *to; 2002 size_t n, off; 2003 struct vm_mem_range *vmr; 2004 2005 vmr = find_gpa_range(¤t_vm->vm_params.vmc_params, dst, len); 2006 if (vmr == NULL) { 2007 errno = EINVAL; 2008 log_warn("%s: failed - invalid memory range dst = 0x%lx, " 2009 "len = 0x%zx", __func__, dst, len); 2010 return (EINVAL); 2011 } 2012 2013 off = dst - vmr->vmr_gpa; 2014 while (len != 0) { 2015 n = vmr->vmr_size - off; 2016 if (len < n) 2017 n = len; 2018 2019 to = (char *)vmr->vmr_va + off; 2020 if (buf == NULL) 2021 memset(to, 0, n); 2022 else { 2023 memcpy(to, from, n); 2024 from += n; 2025 } 2026 len -= n; 2027 off = 0; 2028 vmr++; 2029 } 2030 2031 return (0); 2032 } 2033 2034 /* 2035 * read_mem 2036 * 2037 * Reads memory at guest paddr 'src' into 'buf'. 2038 * 2039 * Parameters: 2040 * src: the source paddr_t in the guest VM to read from. 2041 * buf: destination (local) buffer 2042 * len: number of bytes to read 2043 * 2044 * Return values: 2045 * 0: success 2046 * EINVAL: if the guest physical memory range [dst, dst + len) does not 2047 * exist in the guest. 2048 */ 2049 int 2050 read_mem(paddr_t src, void *buf, size_t len) 2051 { 2052 char *from, *to = buf; 2053 size_t n, off; 2054 struct vm_mem_range *vmr; 2055 2056 vmr = find_gpa_range(¤t_vm->vm_params.vmc_params, src, len); 2057 if (vmr == NULL) { 2058 errno = EINVAL; 2059 log_warn("%s: failed - invalid memory range src = 0x%lx, " 2060 "len = 0x%zx", __func__, src, len); 2061 return (EINVAL); 2062 } 2063 2064 off = src - vmr->vmr_gpa; 2065 while (len != 0) { 2066 n = vmr->vmr_size - off; 2067 if (len < n) 2068 n = len; 2069 2070 from = (char *)vmr->vmr_va + off; 2071 memcpy(to, from, n); 2072 2073 to += n; 2074 len -= n; 2075 off = 0; 2076 vmr++; 2077 } 2078 2079 return (0); 2080 } 2081 2082 /* 2083 * hvaddr_mem 2084 * 2085 * Translate a guest physical address to a host virtual address, checking the 2086 * provided memory range length to confirm it's contiguous within the same 2087 * guest memory range (vm_mem_range). 
/*
 * hvaddr_mem
 *
 * Translate a guest physical address to a host virtual address, checking the
 * provided memory range length to confirm it's contiguous within the same
 * guest memory range (vm_mem_range).
 *
 * Parameters:
 *  gpa: guest physical address to translate
 *  len: number of bytes in the intended range
 *
 * Return values:
 *  void* to host virtual memory on success
 *  NULL on error, setting errno to:
 *    EFAULT: gpa falls outside guest memory ranges
 *    EINVAL: requested len extends beyond memory range
 */
void *
hvaddr_mem(paddr_t gpa, size_t len)
{
	struct vm_mem_range *vmr;
	size_t off;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len);
	if (vmr == NULL) {
		log_warnx("%s: failed - invalid gpa: 0x%lx\n", __func__, gpa);
		errno = EFAULT;
		return (NULL);
	}

	off = gpa - vmr->vmr_gpa;
	if (len > (vmr->vmr_size - off)) {
		log_warnx("%s: failed - invalid memory range: gpa=0x%lx, "
		    "len=%zu", __func__, gpa, len);
		errno = EINVAL;
		return (NULL);
	}

	return ((char *)vmr->vmr_va + off);
}

/*
 * vcpu_assert_pic_irq
 *
 * Injects the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to inject to
 *  vcpu_id: VCPU ID to inject to
 *  irq: IRQ to inject
 */
void
vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	int ret;

	i8259_assert_irq(irq);

	if (i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
			fatalx("%s: can't assert INTR", __func__);
		mutex_lock(&vcpu_run_mtx[vcpu_id]);
		vcpu_hlt[vcpu_id] = 0;
		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
		if (ret)
			fatalx("%s: can't signal (%d)", __func__, ret);
		mutex_unlock(&vcpu_run_mtx[vcpu_id]);
	}
}

/*
 * vcpu_deassert_pic_irq
 *
 * Clears the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to clear in
 *  vcpu_id: VCPU ID to clear in
 *  irq: IRQ to clear
 */
void
vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	i8259_deassert_irq(irq);

	if (!i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
			fatalx("%s: can't deassert INTR for vm_id %d, "
			    "vcpu_id %d", __func__, vm_id, vcpu_id);
	}
}
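
/*
 * Illustrative sketch (not built): the usual pattern for an emulated,
 * level-triggered interrupt source built on the two helpers above.  The
 * device raises its IRQ while it has work for the guest and clears it
 * once the condition goes away; the i8259 code decides whether the INTR
 * line to the vcpu actually changes.  The helper and the IRQ number (9)
 * are hypothetical.
 */
#if 0
static void
example_device_irq_update(uint32_t vm_id, uint32_t vcpu_id, int have_work)
{
	if (have_work)
		vcpu_assert_pic_irq(vm_id, vcpu_id, 9);
	else
		vcpu_deassert_pic_irq(vm_id, vcpu_id, 9);
}
#endif
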
2184 */ 2185 int 2186 fd_hasdata(int fd) 2187 { 2188 struct pollfd pfd[1]; 2189 int nready, hasdata = 0; 2190 2191 pfd[0].fd = fd; 2192 pfd[0].events = POLLIN; 2193 nready = poll(pfd, 1, 0); 2194 if (nready == -1) 2195 log_warn("checking file descriptor for data failed"); 2196 else if (nready == 1 && pfd[0].revents & POLLIN) 2197 hasdata = 1; 2198 return (hasdata); 2199 } 2200 2201 /* 2202 * mutex_lock 2203 * 2204 * Wrapper function for pthread_mutex_lock that does error checking and that 2205 * exits on failure 2206 */ 2207 void 2208 mutex_lock(pthread_mutex_t *m) 2209 { 2210 int ret; 2211 2212 ret = pthread_mutex_lock(m); 2213 if (ret) { 2214 errno = ret; 2215 fatal("could not acquire mutex"); 2216 } 2217 } 2218 2219 /* 2220 * mutex_unlock 2221 * 2222 * Wrapper function for pthread_mutex_unlock that does error checking and that 2223 * exits on failure 2224 */ 2225 void 2226 mutex_unlock(pthread_mutex_t *m) 2227 { 2228 int ret; 2229 2230 ret = pthread_mutex_unlock(m); 2231 if (ret) { 2232 errno = ret; 2233 fatal("could not release mutex"); 2234 } 2235 } 2236 2237 /* 2238 * set_return_data 2239 * 2240 * Utility function for manipulating register data in vm exit info structs. This 2241 * function ensures that the data is copied to the vei->vei.vei_data field with 2242 * the proper size for the operation being performed. 2243 * 2244 * Parameters: 2245 * vei: exit information 2246 * data: return data 2247 */ 2248 void 2249 set_return_data(struct vm_exit *vei, uint32_t data) 2250 { 2251 switch (vei->vei.vei_size) { 2252 case 1: 2253 vei->vei.vei_data &= ~0xFF; 2254 vei->vei.vei_data |= (uint8_t)data; 2255 break; 2256 case 2: 2257 vei->vei.vei_data &= ~0xFFFF; 2258 vei->vei.vei_data |= (uint16_t)data; 2259 break; 2260 case 4: 2261 vei->vei.vei_data = data; 2262 break; 2263 } 2264 } 2265 2266 /* 2267 * get_input_data 2268 * 2269 * Utility function for manipulating register data in vm exit info 2270 * structs. This function ensures that the data is copied from the 2271 * vei->vei.vei_data field with the proper size for the operation being 2272 * performed. 2273 * 2274 * Parameters: 2275 * vei: exit information 2276 * data: location to store the result 2277 */ 2278 void 2279 get_input_data(struct vm_exit *vei, uint32_t *data) 2280 { 2281 switch (vei->vei.vei_size) { 2282 case 1: 2283 *data &= 0xFFFFFF00; 2284 *data |= (uint8_t)vei->vei.vei_data; 2285 break; 2286 case 2: 2287 *data &= 0xFFFF0000; 2288 *data |= (uint16_t)vei->vei.vei_data; 2289 break; 2290 case 4: 2291 *data = vei->vei.vei_data; 2292 break; 2293 default: 2294 log_warnx("%s: invalid i/o size %d", __func__, 2295 vei->vei.vei_size); 2296 } 2297 2298 } 2299 2300 /* 2301 * translate_gva 2302 * 2303 * Translates a guest virtual address to a guest physical address by walking 2304 * the currently active page table (if needed). 2305 * 2306 * XXX ensure translate_gva updates the A bit in the PTE 2307 * XXX ensure translate_gva respects segment base and limits in i386 mode 2308 * XXX ensure translate_gva respects segment wraparound in i8086 mode 2309 * XXX ensure translate_gva updates the A bit in the segment selector 2310 * XXX ensure translate_gva respects CR4.LMSLE if available 2311 * 2312 * Parameters: 2313 * exit: The VCPU this translation should be performed for (guest MMU settings 2314 * are gathered from this VCPU) 2315 * va: virtual address to translate 2316 * pa: pointer to paddr_t variable that will receive the translated physical 2317 * address. 'pa' is unchanged on error. 
/*
 * translate_gva
 *
 * Translates a guest virtual address to a guest physical address by walking
 * the currently active page table (if needed).
 *
 * XXX ensure translate_gva updates the A bit in the PTE
 * XXX ensure translate_gva respects segment base and limits in i386 mode
 * XXX ensure translate_gva respects segment wraparound in i8086 mode
 * XXX ensure translate_gva updates the A bit in the segment selector
 * XXX ensure translate_gva respects CR4.LMSLE if available
 *
 * Parameters:
 *  exit: The VCPU this translation should be performed for (guest MMU settings
 *   are gathered from this VCPU)
 *  va: virtual address to translate
 *  pa: pointer to paddr_t variable that will receive the translated physical
 *   address. 'pa' is unchanged on error.
 *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
 *   the address should be translated
 *
 * Return values:
 *  0: the address was successfully translated - 'pa' contains the physical
 *     address currently mapped by 'va'.
 *  EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
 *     and %cr2 set in the vcpu structure.
 *  EINVAL: an error occurred reading paging table structures
 */
int
translate_gva(struct vm_exit *exit, uint64_t va, uint64_t *pa, int mode)
{
	int level, shift, pdidx;
	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
	uint64_t shift_width, pte_size;
	struct vcpu_reg_state *vrs;

	vrs = &exit->vrs;

	if (!pa)
		return (EINVAL);

	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
		*pa = va;
		return (0);
	}

	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];

	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);

	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
			pte_size = sizeof(uint64_t);
			shift_width = 9;

			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
				/* 4 level paging */
				level = 4;
				mask = L4_MASK;
				shift = L4_SHIFT;
			} else {
				/* 32 bit with PAE paging */
				level = 3;
				mask = L3_MASK;
				shift = L3_SHIFT;
			}
		} else {
			/* 32 bit paging */
			level = 2;
			shift_width = 10;
			mask = 0xFFC00000;
			shift = 22;
			pte_size = sizeof(uint32_t);
		}
	} else
		return (EINVAL);

	/* XXX: Check for R bit in segment selector and set A bit */

	for (; level > 0; level--) {
		pdidx = (va & mask) >> shift;
		pte_paddr = (pt_paddr) + (pdidx * pte_size);

		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
		    level, pte_paddr);
		if (read_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to read pte", __func__);
			return (EFAULT);
		}

		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
		    pte);

		/* XXX: Set CR2 */
		if (!(pte & PG_V))
			return (EFAULT);

		/* XXX: Check for SMAP */
		if ((mode == PROT_WRITE) && !(pte & PG_RW))
			return (EPERM);

		if ((exit->cpl > 0) && !(pte & PG_u))
			return (EPERM);

		pte = pte | PG_U;
		if (mode == PROT_WRITE)
			pte = pte | PG_M;
		if (write_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to write back flags to pte",
			    __func__);
			return (EIO);
		}

		/* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
		if (pte & PG_PS)
			break;

		if (level > 1) {
			pt_paddr = pte & PG_FRAME;
			shift -= shift_width;
			mask = mask >> shift_width;
		}
	}

	low_mask = (1 << shift) - 1;
	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
	*pa = (pte & high_mask) | (va & low_mask);

	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);

	return (0);
}
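
/*
 * Worked example of the final mask arithmetic above: in 4-level long
 * mode, shift starts at L4_SHIFT (39) and drops by shift_width (9) per
 * level, ending at 12 for a 4 KiB leaf, so low_mask == 0xfff and
 * high_mask keeps PTE bits 12..62 (bit 63, the NX bit, is masked off).
 * If a 2 MiB superpage is hit at level 2, the walk breaks with
 * shift == 21 and the low 21 bits of the GVA carry through to the GPA.
 */
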
2446 * 2447 * Parameters: 2448 * p: pointer to vm_dev_pipe struct to initizlize 2449 * cb: callback to use for READ events on the read end of the pipe 2450 * arg: pointer to pass to the callback on event trigger 2451 */ 2452 void 2453 vm_pipe_init2(struct vm_dev_pipe *p, void (*cb)(int, short, void *), void *arg) 2454 { 2455 int ret; 2456 int fds[2]; 2457 2458 memset(p, 0, sizeof(struct vm_dev_pipe)); 2459 2460 ret = pipe2(fds, O_CLOEXEC); 2461 if (ret) 2462 fatal("failed to create vm_dev_pipe pipe"); 2463 2464 p->read = fds[0]; 2465 p->write = fds[1]; 2466 2467 event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, arg); 2468 } 2469 2470 /* 2471 * vm_pipe_send 2472 * 2473 * Send a message to an emulated device vie the provided vm_dev_pipe. This 2474 * relies on the fact sizeof(msg) < PIPE_BUF to ensure atomic writes. 2475 * 2476 * Parameters: 2477 * p: pointer to initialized vm_dev_pipe 2478 * msg: message to send in the channel 2479 */ 2480 void 2481 vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg) 2482 { 2483 size_t n; 2484 n = write(p->write, &msg, sizeof(msg)); 2485 if (n != sizeof(msg)) 2486 fatal("failed to write to device pipe"); 2487 } 2488 2489 /* 2490 * vm_pipe_recv 2491 * 2492 * Receive a message for an emulated device via the provided vm_dev_pipe. 2493 * Returns the message value, otherwise will exit on failure. This relies on 2494 * the fact sizeof(enum pipe_msg_type) < PIPE_BUF for atomic reads. 2495 * 2496 * Parameters: 2497 * p: pointer to initialized vm_dev_pipe 2498 * 2499 * Return values: 2500 * a value of enum pipe_msg_type or fatal exit on read(2) error 2501 */ 2502 enum pipe_msg_type 2503 vm_pipe_recv(struct vm_dev_pipe *p) 2504 { 2505 size_t n; 2506 enum pipe_msg_type msg; 2507 n = read(p->read, &msg, sizeof(msg)); 2508 if (n != sizeof(msg)) 2509 fatal("failed to read from device pipe"); 2510 2511 return msg; 2512 } 2513 2514 /* 2515 * Re-map the guest address space using vmm(4)'s VMM_IOC_SHARE 2516 * 2517 * Returns 0 on success, non-zero in event of failure. 2518 */ 2519 int 2520 remap_guest_mem(struct vmd_vm *vm, int vmm_fd) 2521 { 2522 struct vm_create_params *vcp; 2523 struct vm_mem_range *vmr; 2524 struct vm_sharemem_params vsp; 2525 size_t i, j; 2526 void *p = NULL; 2527 int ret; 2528 2529 if (vm == NULL) 2530 return (1); 2531 2532 vcp = &vm->vm_params.vmc_params; 2533 2534 /* 2535 * Initialize our VM shared memory request using our original 2536 * creation parameters. We'll overwrite the va's after mmap(2). 2537 */ 2538 memset(&vsp, 0, sizeof(vsp)); 2539 vsp.vsp_nmemranges = vcp->vcp_nmemranges; 2540 vsp.vsp_vm_id = vcp->vcp_id; 2541 memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges, 2542 sizeof(vsp.vsp_memranges)); 2543 2544 /* 2545 * Use mmap(2) to identify virtual address space for our mappings. 2546 */ 2547 for (i = 0; i < VMM_MAX_MEM_RANGES; i++) { 2548 if (i < vsp.vsp_nmemranges) { 2549 vmr = &vsp.vsp_memranges[i]; 2550 2551 /* Ignore any MMIO ranges. */ 2552 if (vmr->vmr_type == VM_MEM_MMIO) { 2553 vmr->vmr_va = 0; 2554 vcp->vcp_memranges[i].vmr_va = 0; 2555 continue; 2556 } 2557 2558 /* Make initial mappings for the memrange. 
/*
 * Re-map the guest address space using vmm(4)'s VMM_IOC_SHAREMEM
 *
 * Returns 0 on success, non-zero in event of failure.
 */
int
remap_guest_mem(struct vmd_vm *vm, int vmm_fd)
{
	struct vm_create_params *vcp;
	struct vm_mem_range *vmr;
	struct vm_sharemem_params vsp;
	size_t i, j;
	void *p = NULL;
	int ret;

	if (vm == NULL)
		return (1);

	vcp = &vm->vm_params.vmc_params;

	/*
	 * Initialize our VM shared memory request using our original
	 * creation parameters. We'll overwrite the va's after mmap(2).
	 */
	memset(&vsp, 0, sizeof(vsp));
	vsp.vsp_nmemranges = vcp->vcp_nmemranges;
	vsp.vsp_vm_id = vcp->vcp_id;
	memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges,
	    sizeof(vsp.vsp_memranges));

	/*
	 * Use mmap(2) to identify virtual address space for our mappings.
	 */
	for (i = 0; i < VMM_MAX_MEM_RANGES; i++) {
		if (i < vsp.vsp_nmemranges) {
			vmr = &vsp.vsp_memranges[i];

			/* Ignore any MMIO ranges. */
			if (vmr->vmr_type == VM_MEM_MMIO) {
				vmr->vmr_va = 0;
				vcp->vcp_memranges[i].vmr_va = 0;
				continue;
			}

			/* Make initial mappings for the memrange. */
			p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1,
			    0);
			if (p == MAP_FAILED) {
				ret = errno;
				log_warn("%s: mmap", __func__);
				for (j = 0; j < i; j++) {
					vmr = &vcp->vcp_memranges[j];
					munmap((void *)vmr->vmr_va,
					    vmr->vmr_size);
				}
				return (ret);
			}
			vmr->vmr_va = (vaddr_t)p;
			vcp->vcp_memranges[i].vmr_va = vmr->vmr_va;
		}
	}

	/*
	 * munmap(2) now that we have va's and ranges that don't overlap. vmm
	 * will use the va's and sizes to recreate the mappings for us.
	 */
	for (i = 0; i < vsp.vsp_nmemranges; i++) {
		vmr = &vsp.vsp_memranges[i];
		if (vmr->vmr_type == VM_MEM_MMIO)
			continue;
		if (munmap((void *)vmr->vmr_va, vmr->vmr_size) == -1)
			fatal("%s: munmap", __func__);
	}

	/*
	 * Ask vmm to enter the shared mappings for us. They'll point
	 * to the same host physical memory, but will have a randomized
	 * virtual address for the calling process.
	 */
	if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1)
		return (errno);

	return (0);
}
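
/*
 * Illustrative sketch (not built) of the address-space reservation trick
 * used by remap_guest_mem() above: a throwaway anonymous mmap(2) is used
 * only to find a suitably sized, non-overlapping virtual address range,
 * which is then released so the kernel can place the real shared mapping
 * at that same address.  The helper name is hypothetical and the
 * protection mirrors the code above.
 */
#if 0
static void *
reserve_va(size_t len)
{
	void *p;

	/* Reserve: any readable anonymous mapping of the right size. */
	p = mmap(NULL, len, PROT_READ, MAP_ANON, -1, 0);
	if (p == MAP_FAILED)
		return (NULL);

	/* Release it; the caller hands 'p' to vmm(4) to map for real. */
	if (munmap(p, len) == -1)
		return (NULL);

	return (p);
}
#endif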