1 /* $OpenBSD: vm.c,v 1.96 2024/01/18 14:49:59 claudio Exp $ */ 2 3 /* 4 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/param.h> /* PAGE_SIZE, MAXCOMLEN */ 20 #include <sys/types.h> 21 #include <sys/ioctl.h> 22 #include <sys/queue.h> 23 #include <sys/wait.h> 24 #include <sys/uio.h> 25 #include <sys/stat.h> 26 #include <sys/socket.h> 27 #include <sys/time.h> 28 #include <sys/mman.h> 29 #include <sys/resource.h> 30 31 #include <dev/ic/i8253reg.h> 32 #include <dev/isa/isareg.h> 33 #include <dev/pci/pcireg.h> 34 35 #include <machine/psl.h> 36 #include <machine/pte.h> 37 #include <machine/specialreg.h> 38 #include <machine/vmmvar.h> 39 40 #include <net/if.h> 41 42 #include <errno.h> 43 #include <event.h> 44 #include <fcntl.h> 45 #include <imsg.h> 46 #include <limits.h> 47 #include <poll.h> 48 #include <pthread.h> 49 #include <pthread_np.h> 50 #include <stddef.h> 51 #include <stdio.h> 52 #include <stdlib.h> 53 #include <string.h> 54 #include <unistd.h> 55 #include <util.h> 56 57 #include "atomicio.h" 58 #include "fw_cfg.h" 59 #include "i8253.h" 60 #include "i8259.h" 61 #include "loadfile.h" 62 #include "mc146818.h" 63 #include "mmio.h" 64 #include "ns8250.h" 65 #include "pci.h" 66 #include "virtio.h" 67 #include "vmd.h" 68 #include "vmm.h" 69 70 #define MB(x) (x * 1024UL * 1024UL) 71 #define GB(x) (x * 1024UL * 1024UL * 1024UL) 72 73 #define MMIO_NOTYET 0 74 75 io_fn_t ioports_map[MAX_PORTS]; 76 77 static int run_vm(struct vmop_create_params *, struct vcpu_reg_state *); 78 void vm_dispatch_vmm(int, short, void *); 79 void *event_thread(void *); 80 void *vcpu_run_loop(void *); 81 int vcpu_exit(struct vm_run_params *); 82 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *); 83 void create_memory_map(struct vm_create_params *); 84 static int vmm_create_vm(struct vmd_vm *); 85 int alloc_guest_mem(struct vmd_vm *); 86 void init_emulated_hw(struct vmop_create_params *, int, 87 int[][VM_MAX_BASE_PER_DISK], int *); 88 void restore_emulated_hw(struct vm_create_params *, int, int *, 89 int[][VM_MAX_BASE_PER_DISK],int); 90 void vcpu_exit_inout(struct vm_run_params *); 91 int vcpu_exit_eptviolation(struct vm_run_params *); 92 uint8_t vcpu_exit_pci(struct vm_run_params *); 93 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t); 94 int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *); 95 static int send_vm(int, struct vmd_vm *); 96 int dump_send_header(int); 97 static int dump_vmr(int , struct vm_mem_range *); 98 static int dump_mem(int, struct vmd_vm *); 99 void restore_vmr(int, struct vm_mem_range *); 100 void restore_mem(int, struct vm_create_params *); 101 int restore_vm_params(int, struct vm_create_params *); 102 static void pause_vm(struct vmd_vm *); 103 static void unpause_vm(struct vmd_vm *); 104 105 int translate_gva(struct 
vm_exit*, uint64_t, uint64_t *, int); 106 107 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t, 108 size_t); 109 110 int con_fd; 111 struct vmd_vm *current_vm; 112 113 extern struct vmd *env; 114 115 extern char *__progname; 116 117 pthread_mutex_t threadmutex; 118 pthread_cond_t threadcond; 119 120 pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM]; 121 pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM]; 122 pthread_barrier_t vm_pause_barrier; 123 pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM]; 124 pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM]; 125 uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM]; 126 uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM]; 127 128 /* 129 * Represents a standard register set for an OS to be booted 130 * as a flat 64 bit address space. 131 * 132 * NOT set here are: 133 * RIP 134 * RSP 135 * GDTR BASE 136 * 137 * Specific bootloaders should clone this structure and override 138 * those fields as needed. 139 * 140 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on 141 * features of the CPU in use. 142 */ 143 static const struct vcpu_reg_state vcpu_init_flat64 = { 144 .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, 145 .vrs_gprs[VCPU_REGS_RIP] = 0x0, 146 .vrs_gprs[VCPU_REGS_RSP] = 0x0, 147 .vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG, 148 .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE, 149 .vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE, 150 .vrs_crs[VCPU_REGS_PDPTE0] = 0ULL, 151 .vrs_crs[VCPU_REGS_PDPTE1] = 0ULL, 152 .vrs_crs[VCPU_REGS_PDPTE2] = 0ULL, 153 .vrs_crs[VCPU_REGS_PDPTE3] = 0ULL, 154 .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0}, 155 .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 156 .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 157 .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 158 .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 159 .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 160 .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, 161 .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, 162 .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, 163 .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, 164 .vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA, 165 .vrs_drs[VCPU_REGS_DR0] = 0x0, 166 .vrs_drs[VCPU_REGS_DR1] = 0x0, 167 .vrs_drs[VCPU_REGS_DR2] = 0x0, 168 .vrs_drs[VCPU_REGS_DR3] = 0x0, 169 .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0, 170 .vrs_drs[VCPU_REGS_DR7] = 0x400, 171 .vrs_msrs[VCPU_REGS_STAR] = 0ULL, 172 .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL, 173 .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL, 174 .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL, 175 .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL, 176 .vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL, 177 .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87 178 }; 179 180 /* 181 * Represents a standard register set for an BIOS to be booted 182 * as a flat 16 bit address space. 
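 * Note: with the CS base of 0xF0000 and RIP of 0xFFF0 set below, the first
 * instruction fetch lands at 0xFFFF0, 16 bytes below 1MB, which is why
 * loadfile_bios() places a copy of the BIOS image ending at exactly 1MB.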
183 */ 184 static const struct vcpu_reg_state vcpu_init_flat16 = { 185 .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, 186 .vrs_gprs[VCPU_REGS_RIP] = 0xFFF0, 187 .vrs_gprs[VCPU_REGS_RSP] = 0x0, 188 .vrs_crs[VCPU_REGS_CR0] = 0x60000010, 189 .vrs_crs[VCPU_REGS_CR3] = 0, 190 .vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000}, 191 .vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 192 .vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0}, 193 .vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 194 .vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 195 .vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 196 .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, 197 .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, 198 .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, 199 .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, 200 .vrs_msrs[VCPU_REGS_EFER] = 0ULL, 201 .vrs_drs[VCPU_REGS_DR0] = 0x0, 202 .vrs_drs[VCPU_REGS_DR1] = 0x0, 203 .vrs_drs[VCPU_REGS_DR2] = 0x0, 204 .vrs_drs[VCPU_REGS_DR3] = 0x0, 205 .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0, 206 .vrs_drs[VCPU_REGS_DR7] = 0x400, 207 .vrs_msrs[VCPU_REGS_STAR] = 0ULL, 208 .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL, 209 .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL, 210 .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL, 211 .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL, 212 .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87 213 }; 214 215 /* 216 * vm_main 217 * 218 * Primary entrypoint for launching a vm. Does not return. 219 * 220 * fd: file descriptor for communicating with vmm process. 221 * fd_vmm: file descriptor for communicating with vmm(4) device 222 */ 223 void 224 vm_main(int fd, int vmm_fd) 225 { 226 struct vm_create_params *vcp = NULL; 227 struct vmd_vm vm; 228 size_t sz = 0; 229 int ret = 0; 230 231 /* 232 * We aren't root, so we can't chroot(2). Use unveil(2) instead. 233 */ 234 if (unveil(env->argv0, "x") == -1) 235 fatal("unveil %s", env->argv0); 236 if (unveil(NULL, NULL) == -1) 237 fatal("unveil lock"); 238 239 /* 240 * pledge in the vm processes: 241 * stdio - for malloc and basic I/O including events. 242 * vmm - for the vmm ioctls and operations. 243 * proc exec - fork/exec for launching devices. 244 * recvfd - for vm send/recv and sending fd to devices. 245 */ 246 if (pledge("stdio vmm proc exec recvfd", NULL) == -1) 247 fatal("pledge"); 248 249 /* Receive our vm configuration. */ 250 memset(&vm, 0, sizeof(vm)); 251 sz = atomicio(read, fd, &vm, sizeof(vm)); 252 if (sz != sizeof(vm)) { 253 log_warnx("failed to receive start message"); 254 _exit(EIO); 255 } 256 257 /* Update process with the vm name. */ 258 vcp = &vm.vm_params.vmc_params; 259 setproctitle("%s", vcp->vcp_name); 260 log_procinit("vm/%s", vcp->vcp_name); 261 262 /* Receive the local prefix settings. */ 263 sz = atomicio(read, fd, &env->vmd_cfg.cfg_localprefix, 264 sizeof(env->vmd_cfg.cfg_localprefix)); 265 if (sz != sizeof(env->vmd_cfg.cfg_localprefix)) { 266 log_warnx("failed to receive local prefix"); 267 _exit(EIO); 268 } 269 270 /* 271 * We need, at minimum, a vm_kernel fd to boot a vm. This is either a 272 * kernel or a BIOS image. 
273 */ 274 if (!(vm.vm_state & VM_STATE_RECEIVED)) { 275 if (vm.vm_kernel == -1) { 276 log_warnx("%s: failed to receive boot fd", 277 vcp->vcp_name); 278 _exit(EINVAL); 279 } 280 if (fcntl(vm.vm_kernel, F_SETFL, O_NONBLOCK) == -1) { 281 ret = errno; 282 log_warn("failed to set nonblocking mode on boot fd"); 283 _exit(ret); 284 } 285 } 286 287 ret = start_vm(&vm, fd); 288 _exit(ret); 289 } 290 291 /* 292 * loadfile_bios 293 * 294 * Alternatively to loadfile_elf, this function loads a non-ELF BIOS image 295 * directly into memory. 296 * 297 * Parameters: 298 * fp: file of a kernel file to load 299 * size: uncompressed size of the image 300 * (out) vrs: register state to set on init for this kernel 301 * 302 * Return values: 303 * 0 if successful 304 * various error codes returned from read(2) or loadelf functions 305 */ 306 int 307 loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs) 308 { 309 off_t off; 310 311 /* Set up a "flat 16 bit" register state for BIOS */ 312 memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs)); 313 314 /* Seek to the beginning of the BIOS image */ 315 if (gzseek(fp, 0, SEEK_SET) == -1) 316 return (-1); 317 318 /* The BIOS image must end at 1MB */ 319 if ((off = MB(1) - size) < 0) 320 return (-1); 321 322 /* Read BIOS image into memory */ 323 if (mread(fp, off, size) != (size_t)size) { 324 errno = EIO; 325 return (-1); 326 } 327 328 if (gzseek(fp, 0, SEEK_SET) == -1) 329 return (-1); 330 331 /* Read a second BIOS copy into memory ending at 4GB */ 332 off = GB(4) - size; 333 if (mread(fp, off, size) != (size_t)size) { 334 errno = EIO; 335 return (-1); 336 } 337 338 log_debug("%s: loaded BIOS image", __func__); 339 340 return (0); 341 } 342 343 /* 344 * start_vm 345 * 346 * After forking a new VM process, starts the new VM with the creation 347 * parameters supplied (in the incoming vm->vm_params field). This 348 * function performs a basic sanity check on the incoming parameters 349 * and then performs the following steps to complete the creation of the VM: 350 * 351 * 1. validates and create the new VM 352 * 2. opens the imsg control channel to the parent and drops more privilege 353 * 3. drops additional privileges by calling pledge(2) 354 * 4. loads the kernel from the disk image or file descriptor 355 * 5. runs the VM's VCPU loops. 356 * 357 * Parameters: 358 * vm: The VM data structure that is including the VM create parameters. 359 * fd: The imsg socket that is connected to the parent process. 360 * 361 * Return values: 362 * 0: success 363 * !0 : failure - typically an errno indicating the source of the failure 364 */ 365 int 366 start_vm(struct vmd_vm *vm, int fd) 367 { 368 struct vmop_create_params *vmc = &vm->vm_params; 369 struct vm_create_params *vcp = &vmc->vmc_params; 370 struct vcpu_reg_state vrs; 371 int nicfds[VM_MAX_NICS_PER_VM]; 372 int ret; 373 gzFile fp; 374 size_t i; 375 struct vm_rwregs_params vrp; 376 struct stat sb; 377 378 /* 379 * We first try to initialize and allocate memory before bothering 380 * vmm(4) with a request to create a new vm. 
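 * That way an allocation failure is reported before any kernel-side VM
 * state exists, so there is nothing to tear down in vmm(4) on this error
 * path.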
381 */ 382 if (!(vm->vm_state & VM_STATE_RECEIVED)) 383 create_memory_map(vcp); 384 385 ret = alloc_guest_mem(vm); 386 if (ret) { 387 struct rlimit lim; 388 char buf[FMT_SCALED_STRSIZE]; 389 if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) { 390 if (fmt_scaled(lim.rlim_cur, buf) == 0) 391 fatalx("could not allocate guest memory (data " 392 "limit is %s)", buf); 393 } 394 errno = ret; 395 log_warn("could not allocate guest memory"); 396 return (ret); 397 } 398 399 /* We've allocated guest memory, so now create the vm in vmm(4). */ 400 ret = vmm_create_vm(vm); 401 if (ret) { 402 /* Let the vmm process know we failed by sending a 0 vm id. */ 403 vcp->vcp_id = 0; 404 atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)); 405 return (ret); 406 } 407 408 /* 409 * Some of vmd currently relies on global state (current_vm, con_fd). 410 */ 411 current_vm = vm; 412 con_fd = vm->vm_tty; 413 if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) { 414 log_warn("failed to set nonblocking mode on console"); 415 return (1); 416 } 417 418 /* 419 * We now let the vmm process know we were successful by sending it our 420 * vmm(4) assigned vm id. 421 */ 422 if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) != 423 sizeof(vcp->vcp_id)) { 424 log_warn("failed to send created vm id to vmm process"); 425 return (1); 426 } 427 428 /* Prepare either our boot image or receive an existing vm to launch. */ 429 if (vm->vm_state & VM_STATE_RECEIVED) { 430 ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp)); 431 if (ret != sizeof(vrp)) 432 fatal("received incomplete vrp - exiting"); 433 vrs = vrp.vrwp_regs; 434 } else { 435 /* 436 * Set up default "flat 64 bit" register state - RIP, 437 * RSP, and GDT info will be set in bootloader 438 */ 439 memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs)); 440 441 /* Find and open kernel image */ 442 if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL) 443 fatalx("failed to open kernel - exiting"); 444 445 /* Load kernel image */ 446 ret = loadfile_elf(fp, vm, &vrs, vmc->vmc_bootdevice); 447 448 /* 449 * Try BIOS as a fallback (only if it was provided as an image 450 * with vm->vm_kernel and the file is not compressed) 451 */ 452 if (ret && errno == ENOEXEC && vm->vm_kernel != -1 && 453 gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0) 454 ret = loadfile_bios(fp, sb.st_size, &vrs); 455 456 if (ret) 457 fatal("failed to load kernel or BIOS - exiting"); 458 459 gzclose(fp); 460 } 461 462 if (vm->vm_kernel != -1) 463 close_fd(vm->vm_kernel); 464 465 /* Initialize our mutexes. */ 466 ret = pthread_mutex_init(&threadmutex, NULL); 467 if (ret) { 468 log_warn("%s: could not initialize thread state mutex", 469 __func__); 470 return (ret); 471 } 472 ret = pthread_cond_init(&threadcond, NULL); 473 if (ret) { 474 log_warn("%s: could not initialize thread state " 475 "condition variable", __func__); 476 return (ret); 477 } 478 mutex_lock(&threadmutex); 479 480 481 /* 482 * Finalize our communication socket with the vmm process. From here 483 * onwards, communication with the vmm process is event-based. 484 */ 485 event_init(); 486 if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1) 487 fatal("setup vm pipe"); 488 489 /* 490 * Initialize or restore our emulated hardware. 
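 * (The emulated device set is the i8253 PIT, the i8259 PICs, the mc146818
 * RTC/NVRAM, the ns8250 UART, fw_cfg, PCI and the virtio devices; see
 * init_emulated_hw() and restore_emulated_hw().)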
491 */ 492 for (i = 0; i < VM_MAX_NICS_PER_VM; i++) 493 nicfds[i] = vm->vm_ifs[i].vif_fd; 494 495 if (vm->vm_state & VM_STATE_RECEIVED) { 496 restore_mem(vm->vm_receive_fd, vcp); 497 restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds, 498 vm->vm_disks, vm->vm_cdrom); 499 if (restore_vm_params(vm->vm_receive_fd, vcp)) 500 fatal("restore vm params failed"); 501 unpause_vm(vm); 502 } else 503 init_emulated_hw(vmc, vm->vm_cdrom, vm->vm_disks, nicfds); 504 505 /* Drop privileges further before starting the vcpu run loop(s). */ 506 if (pledge("stdio vmm recvfd", NULL) == -1) 507 fatal("pledge"); 508 509 /* 510 * Execute the vcpu run loop(s) for this VM. 511 */ 512 ret = run_vm(&vm->vm_params, &vrs); 513 514 /* Ensure that any in-flight data is written back */ 515 virtio_shutdown(vm); 516 517 return (ret); 518 } 519 520 /* 521 * vm_dispatch_vmm 522 * 523 * imsg callback for messages that are received from the vmm parent process. 524 */ 525 void 526 vm_dispatch_vmm(int fd, short event, void *arg) 527 { 528 struct vmd_vm *vm = arg; 529 struct vmop_result vmr; 530 struct vmop_addr_result var; 531 struct imsgev *iev = &vm->vm_iev; 532 struct imsgbuf *ibuf = &iev->ibuf; 533 struct imsg imsg; 534 ssize_t n; 535 int verbose; 536 537 if (event & EV_READ) { 538 if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN) 539 fatal("%s: imsg_read", __func__); 540 if (n == 0) 541 _exit(0); 542 } 543 544 if (event & EV_WRITE) { 545 if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN) 546 fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd); 547 if (n == 0) 548 _exit(0); 549 } 550 551 for (;;) { 552 if ((n = imsg_get(ibuf, &imsg)) == -1) 553 fatal("%s: imsg_get", __func__); 554 if (n == 0) 555 break; 556 557 #if DEBUG > 1 558 log_debug("%s: got imsg %d from %s", 559 __func__, imsg.hdr.type, 560 vm->vm_params.vmc_params.vcp_name); 561 #endif 562 563 switch (imsg.hdr.type) { 564 case IMSG_CTL_VERBOSE: 565 IMSG_SIZE_CHECK(&imsg, &verbose); 566 memcpy(&verbose, imsg.data, sizeof(verbose)); 567 log_setverbose(verbose); 568 virtio_broadcast_imsg(vm, IMSG_CTL_VERBOSE, &verbose, 569 sizeof(verbose)); 570 break; 571 case IMSG_VMDOP_VM_SHUTDOWN: 572 if (vmmci_ctl(VMMCI_SHUTDOWN) == -1) 573 _exit(0); 574 break; 575 case IMSG_VMDOP_VM_REBOOT: 576 if (vmmci_ctl(VMMCI_REBOOT) == -1) 577 _exit(0); 578 break; 579 case IMSG_VMDOP_PAUSE_VM: 580 vmr.vmr_result = 0; 581 vmr.vmr_id = vm->vm_vmid; 582 pause_vm(vm); 583 imsg_compose_event(&vm->vm_iev, 584 IMSG_VMDOP_PAUSE_VM_RESPONSE, 585 imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr, 586 sizeof(vmr)); 587 break; 588 case IMSG_VMDOP_UNPAUSE_VM: 589 vmr.vmr_result = 0; 590 vmr.vmr_id = vm->vm_vmid; 591 unpause_vm(vm); 592 imsg_compose_event(&vm->vm_iev, 593 IMSG_VMDOP_UNPAUSE_VM_RESPONSE, 594 imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr, 595 sizeof(vmr)); 596 break; 597 case IMSG_VMDOP_SEND_VM_REQUEST: 598 vmr.vmr_id = vm->vm_vmid; 599 vmr.vmr_result = send_vm(imsg_get_fd(&imsg), vm); 600 imsg_compose_event(&vm->vm_iev, 601 IMSG_VMDOP_SEND_VM_RESPONSE, 602 imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr, 603 sizeof(vmr)); 604 if (!vmr.vmr_result) { 605 imsg_flush(&current_vm->vm_iev.ibuf); 606 _exit(0); 607 } 608 break; 609 case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE: 610 IMSG_SIZE_CHECK(&imsg, &var); 611 memcpy(&var, imsg.data, sizeof(var)); 612 613 log_debug("%s: received tap addr %s for nic %d", 614 vm->vm_params.vmc_params.vcp_name, 615 ether_ntoa((void *)var.var_addr), var.var_nic_idx); 616 617 vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr); 618 break; 619 default: 620 fatalx("%s: got invalid imsg %d 
from %s", 621 __func__, imsg.hdr.type, 622 vm->vm_params.vmc_params.vcp_name); 623 } 624 imsg_free(&imsg); 625 } 626 imsg_event_add(iev); 627 } 628 629 /* 630 * vm_shutdown 631 * 632 * Tell the vmm parent process to shutdown or reboot the VM and exit. 633 */ 634 __dead void 635 vm_shutdown(unsigned int cmd) 636 { 637 switch (cmd) { 638 case VMMCI_NONE: 639 case VMMCI_SHUTDOWN: 640 (void)imsg_compose_event(&current_vm->vm_iev, 641 IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0); 642 break; 643 case VMMCI_REBOOT: 644 (void)imsg_compose_event(&current_vm->vm_iev, 645 IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0); 646 break; 647 default: 648 fatalx("invalid vm ctl command: %d", cmd); 649 } 650 imsg_flush(&current_vm->vm_iev.ibuf); 651 652 _exit(0); 653 } 654 655 int 656 send_vm(int fd, struct vmd_vm *vm) 657 { 658 struct vm_rwregs_params vrp; 659 struct vm_rwvmparams_params vpp; 660 struct vmop_create_params *vmc; 661 struct vm_terminate_params vtp; 662 unsigned int flags = 0; 663 unsigned int i; 664 int ret = 0; 665 size_t sz; 666 667 if (dump_send_header(fd)) { 668 log_warnx("%s: failed to send vm dump header", __func__); 669 goto err; 670 } 671 672 pause_vm(vm); 673 674 vmc = calloc(1, sizeof(struct vmop_create_params)); 675 if (vmc == NULL) { 676 log_warn("%s: calloc error getting vmc", __func__); 677 ret = -1; 678 goto err; 679 } 680 681 flags |= VMOP_CREATE_MEMORY; 682 memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct 683 vmop_create_params)); 684 vmc->vmc_flags = flags; 685 vrp.vrwp_vm_id = vm->vm_params.vmc_params.vcp_id; 686 vrp.vrwp_mask = VM_RWREGS_ALL; 687 vpp.vpp_mask = VM_RWVMPARAMS_ALL; 688 vpp.vpp_vm_id = vm->vm_params.vmc_params.vcp_id; 689 690 sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params)); 691 if (sz != sizeof(struct vmop_create_params)) { 692 ret = -1; 693 goto err; 694 } 695 696 for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) { 697 vrp.vrwp_vcpu_id = i; 698 if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) { 699 log_warn("%s: readregs failed", __func__); 700 goto err; 701 } 702 703 sz = atomicio(vwrite, fd, &vrp, 704 sizeof(struct vm_rwregs_params)); 705 if (sz != sizeof(struct vm_rwregs_params)) { 706 log_warn("%s: dumping registers failed", __func__); 707 ret = -1; 708 goto err; 709 } 710 } 711 712 /* Dump memory before devices to aid in restoration. 
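 * The stream written here, in order: dump header, vmop_create_params,
 * per-vcpu register state, guest memory, device state (i8253, i8259,
 * ns8250, mc146818, fw_cfg, pci, virtio), then per-vcpu vm params.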
*/ 713 if ((ret = dump_mem(fd, vm))) 714 goto err; 715 if ((ret = i8253_dump(fd))) 716 goto err; 717 if ((ret = i8259_dump(fd))) 718 goto err; 719 if ((ret = ns8250_dump(fd))) 720 goto err; 721 if ((ret = mc146818_dump(fd))) 722 goto err; 723 if ((ret = fw_cfg_dump(fd))) 724 goto err; 725 if ((ret = pci_dump(fd))) 726 goto err; 727 if ((ret = virtio_dump(fd))) 728 goto err; 729 730 for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) { 731 vpp.vpp_vcpu_id = i; 732 if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) { 733 log_warn("%s: readvmparams failed", __func__); 734 goto err; 735 } 736 737 sz = atomicio(vwrite, fd, &vpp, 738 sizeof(struct vm_rwvmparams_params)); 739 if (sz != sizeof(struct vm_rwvmparams_params)) { 740 log_warn("%s: dumping vm params failed", __func__); 741 ret = -1; 742 goto err; 743 } 744 } 745 746 vtp.vtp_vm_id = vm->vm_params.vmc_params.vcp_id; 747 if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) { 748 log_warnx("%s: term IOC error: %d, %d", __func__, 749 errno, ENOENT); 750 } 751 err: 752 close(fd); 753 if (ret) 754 unpause_vm(vm); 755 return ret; 756 } 757 758 int 759 dump_send_header(int fd) { 760 struct vm_dump_header vmh; 761 int i; 762 763 memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE, 764 sizeof(vmh.vmh_signature)); 765 766 vmh.vmh_cpuids[0].code = 0x00; 767 vmh.vmh_cpuids[0].leaf = 0x00; 768 769 vmh.vmh_cpuids[1].code = 0x01; 770 vmh.vmh_cpuids[1].leaf = 0x00; 771 772 vmh.vmh_cpuids[2].code = 0x07; 773 vmh.vmh_cpuids[2].leaf = 0x00; 774 775 vmh.vmh_cpuids[3].code = 0x0d; 776 vmh.vmh_cpuids[3].leaf = 0x00; 777 778 vmh.vmh_cpuids[4].code = 0x80000001; 779 vmh.vmh_cpuids[4].leaf = 0x00; 780 781 vmh.vmh_version = VM_DUMP_VERSION; 782 783 for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) { 784 CPUID_LEAF(vmh.vmh_cpuids[i].code, 785 vmh.vmh_cpuids[i].leaf, 786 vmh.vmh_cpuids[i].a, 787 vmh.vmh_cpuids[i].b, 788 vmh.vmh_cpuids[i].c, 789 vmh.vmh_cpuids[i].d); 790 } 791 792 if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh)) 793 return (-1); 794 795 return (0); 796 } 797 798 int 799 dump_mem(int fd, struct vmd_vm *vm) 800 { 801 unsigned int i; 802 int ret; 803 struct vm_mem_range *vmr; 804 805 for (i = 0; i < vm->vm_params.vmc_params.vcp_nmemranges; i++) { 806 vmr = &vm->vm_params.vmc_params.vcp_memranges[i]; 807 ret = dump_vmr(fd, vmr); 808 if (ret) 809 return ret; 810 } 811 return (0); 812 } 813 814 int 815 restore_vm_params(int fd, struct vm_create_params *vcp) { 816 unsigned int i; 817 struct vm_rwvmparams_params vpp; 818 819 for (i = 0; i < vcp->vcp_ncpus; i++) { 820 if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) { 821 log_warn("%s: error restoring vm params", __func__); 822 return (-1); 823 } 824 vpp.vpp_vm_id = vcp->vcp_id; 825 vpp.vpp_vcpu_id = i; 826 if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) { 827 log_debug("%s: writing vm params failed", __func__); 828 return (-1); 829 } 830 } 831 return (0); 832 } 833 834 void 835 restore_mem(int fd, struct vm_create_params *vcp) 836 { 837 unsigned int i; 838 struct vm_mem_range *vmr; 839 840 for (i = 0; i < vcp->vcp_nmemranges; i++) { 841 vmr = &vcp->vcp_memranges[i]; 842 restore_vmr(fd, vmr); 843 } 844 } 845 846 int 847 dump_vmr(int fd, struct vm_mem_range *vmr) 848 { 849 size_t rem = vmr->vmr_size, read=0; 850 char buf[PAGE_SIZE]; 851 852 while (rem > 0) { 853 if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) { 854 log_warn("failed to read vmr"); 855 return (-1); 856 } 857 if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) { 858 log_warn("failed to dump vmr"); 859 
return (-1); 860 } 861 rem = rem - PAGE_SIZE; 862 read = read + PAGE_SIZE; 863 } 864 return (0); 865 } 866 867 void 868 restore_vmr(int fd, struct vm_mem_range *vmr) 869 { 870 size_t rem = vmr->vmr_size, wrote=0; 871 char buf[PAGE_SIZE]; 872 873 while (rem > 0) { 874 if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf)) 875 fatal("failed to restore vmr"); 876 if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE)) 877 fatal("failed to write vmr"); 878 rem = rem - PAGE_SIZE; 879 wrote = wrote + PAGE_SIZE; 880 } 881 } 882 883 static void 884 pause_vm(struct vmd_vm *vm) 885 { 886 unsigned int n; 887 int ret; 888 if (vm->vm_state & VM_STATE_PAUSED) 889 return; 890 891 current_vm->vm_state |= VM_STATE_PAUSED; 892 893 ret = pthread_barrier_init(&vm_pause_barrier, NULL, 894 vm->vm_params.vmc_params.vcp_ncpus + 1); 895 if (ret) { 896 log_warnx("%s: cannot initialize pause barrier (%d)", 897 __progname, ret); 898 return; 899 } 900 901 for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) { 902 ret = pthread_cond_broadcast(&vcpu_run_cond[n]); 903 if (ret) { 904 log_warnx("%s: can't broadcast vcpu run cond (%d)", 905 __func__, (int)ret); 906 return; 907 } 908 } 909 ret = pthread_barrier_wait(&vm_pause_barrier); 910 if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) { 911 log_warnx("%s: could not wait on pause barrier (%d)", 912 __func__, (int)ret); 913 return; 914 } 915 916 ret = pthread_barrier_destroy(&vm_pause_barrier); 917 if (ret) { 918 log_warnx("%s: could not destroy pause barrier (%d)", 919 __progname, ret); 920 return; 921 } 922 923 i8253_stop(); 924 mc146818_stop(); 925 ns8250_stop(); 926 virtio_stop(vm); 927 } 928 929 static void 930 unpause_vm(struct vmd_vm *vm) 931 { 932 unsigned int n; 933 int ret; 934 if (!(vm->vm_state & VM_STATE_PAUSED)) 935 return; 936 937 current_vm->vm_state &= ~VM_STATE_PAUSED; 938 for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) { 939 ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]); 940 if (ret) { 941 log_warnx("%s: can't broadcast vcpu unpause cond (%d)", 942 __func__, (int)ret); 943 return; 944 } 945 } 946 947 i8253_start(); 948 mc146818_start(); 949 ns8250_start(); 950 virtio_start(vm); 951 } 952 953 /* 954 * vcpu_reset 955 * 956 * Requests vmm(4) to reset the VCPUs in the indicated VM to 957 * the register state provided 958 * 959 * Parameters 960 * vmid: VM ID to reset 961 * vcpu_id: VCPU ID to reset 962 * vrs: the register state to initialize 963 * 964 * Return values: 965 * 0: success 966 * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not 967 * valid) 968 */ 969 int 970 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs) 971 { 972 struct vm_resetcpu_params vrp; 973 974 memset(&vrp, 0, sizeof(vrp)); 975 vrp.vrp_vm_id = vmid; 976 vrp.vrp_vcpu_id = vcpu_id; 977 memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state)); 978 979 log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid); 980 981 if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1) 982 return (errno); 983 984 return (0); 985 } 986 987 /* 988 * create_memory_map 989 * 990 * Sets up the guest physical memory ranges that the VM can access. 
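 * The layout built below is, at most, six ranges: low RAM up to LOWMEM_KB,
 * a reserved VGA/ROM hole up to 1MB, RAM from 1MB up to the PCI MMIO
 * window, the MMIO window itself, a reserved 2MB BIOS copy ending at 4GB,
 * and any remaining RAM above 4GB.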
991 * 992 * Parameters: 993 * vcp: VM create parameters describing the VM whose memory map 994 * is being created 995 * 996 * Return values: 997 * nothing 998 */ 999 void 1000 create_memory_map(struct vm_create_params *vcp) 1001 { 1002 size_t len, mem_bytes; 1003 size_t above_1m = 0, above_4g = 0; 1004 1005 mem_bytes = vcp->vcp_memranges[0].vmr_size; 1006 vcp->vcp_nmemranges = 0; 1007 if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE) 1008 return; 1009 1010 /* First memory region: 0 - LOWMEM_KB (DOS low mem) */ 1011 len = LOWMEM_KB * 1024; 1012 vcp->vcp_memranges[0].vmr_gpa = 0x0; 1013 vcp->vcp_memranges[0].vmr_size = len; 1014 vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM; 1015 mem_bytes -= len; 1016 1017 /* 1018 * Second memory region: LOWMEM_KB - 1MB. 1019 * 1020 * N.B. - Normally ROMs or parts of video RAM are mapped here. 1021 * We have to add this region, because some systems 1022 * unconditionally write to 0xb8000 (VGA RAM), and 1023 * we need to make sure that vmm(4) permits accesses 1024 * to it. So allocate guest memory for it. 1025 */ 1026 len = MB(1) - (LOWMEM_KB * 1024); 1027 vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024; 1028 vcp->vcp_memranges[1].vmr_size = len; 1029 vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED; 1030 mem_bytes -= len; 1031 1032 /* If we have less than 2MB remaining, still create a 2nd BIOS area. */ 1033 if (mem_bytes <= MB(2)) { 1034 vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END; 1035 vcp->vcp_memranges[2].vmr_size = MB(2); 1036 vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED; 1037 vcp->vcp_nmemranges = 3; 1038 return; 1039 } 1040 1041 /* 1042 * Calculate the how to split any remaining memory across the 4GB 1043 * boundary while making sure we do not place physical memory into 1044 * MMIO ranges. 1045 */ 1046 if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) { 1047 above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1); 1048 above_4g = mem_bytes - above_1m; 1049 } else { 1050 above_1m = mem_bytes; 1051 above_4g = 0; 1052 } 1053 1054 /* Third memory region: area above 1MB to MMIO region */ 1055 vcp->vcp_memranges[2].vmr_gpa = MB(1); 1056 vcp->vcp_memranges[2].vmr_size = above_1m; 1057 vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM; 1058 1059 /* Fourth region: PCI MMIO range */ 1060 vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_BASE; 1061 vcp->vcp_memranges[3].vmr_size = VMM_PCI_MMIO_BAR_END - 1062 VMM_PCI_MMIO_BAR_BASE + 1; 1063 vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO; 1064 1065 /* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */ 1066 vcp->vcp_memranges[4].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1; 1067 vcp->vcp_memranges[4].vmr_size = MB(2); 1068 vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED; 1069 1070 /* Sixth region: any remainder above 4GB */ 1071 if (above_4g > 0) { 1072 vcp->vcp_memranges[5].vmr_gpa = GB(4); 1073 vcp->vcp_memranges[5].vmr_size = above_4g; 1074 vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM; 1075 vcp->vcp_nmemranges = 6; 1076 } else 1077 vcp->vcp_nmemranges = 5; 1078 } 1079 1080 /* 1081 * alloc_guest_mem 1082 * 1083 * Allocates memory for the guest. 1084 * Instead of doing a single allocation with one mmap(), we allocate memory 1085 * separately for every range for the following reasons: 1086 * - ASLR for the individual ranges 1087 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to 1088 * map the single mmap'd userspace memory to the individual guest physical 1089 * memory ranges, the underlying amap of the single mmap'd range would have 1090 * to allocate per-page reference counters. 
The reason is that the 1091 * individual guest physical ranges would reference the single mmap'd region 1092 * only partially. However, if every guest physical range has its own 1093 * corresponding mmap'd userspace allocation, there are no partial 1094 * references: every guest physical range fully references an mmap'd 1095 * range => no per-page reference counters have to be allocated. 1096 * 1097 * Return values: 1098 * 0: success 1099 * !0: failure - errno indicating the source of the failure 1100 */ 1101 int 1102 alloc_guest_mem(struct vmd_vm *vm) 1103 { 1104 void *p; 1105 int ret = 0; 1106 size_t i, j; 1107 struct vm_create_params *vcp = &vm->vm_params.vmc_params; 1108 struct vm_mem_range *vmr; 1109 1110 for (i = 0; i < vcp->vcp_nmemranges; i++) { 1111 vmr = &vcp->vcp_memranges[i]; 1112 1113 /* 1114 * We only need R/W as userland. vmm(4) will use R/W/X in its 1115 * mapping. 1116 * 1117 * We must use MAP_SHARED so emulated devices will be able 1118 * to generate shared mappings. 1119 */ 1120 p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, 1121 MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0); 1122 if (p == MAP_FAILED) { 1123 ret = errno; 1124 for (j = 0; j < i; j++) { 1125 vmr = &vcp->vcp_memranges[j]; 1126 munmap((void *)vmr->vmr_va, vmr->vmr_size); 1127 } 1128 return (ret); 1129 } 1130 vmr->vmr_va = (vaddr_t)p; 1131 } 1132 1133 return (ret); 1134 } 1135 1136 /* 1137 * vmm_create_vm 1138 * 1139 * Requests vmm(4) to create a new VM using the supplied creation 1140 * parameters. This operation results in the creation of the in-kernel 1141 * structures for the VM, but does not start the VM's vcpu(s). 1142 * 1143 * Parameters: 1144 * vm: pointer to the vm object 1145 * 1146 * Return values: 1147 * 0: success 1148 * !0 : ioctl to vmm(4) failed 1149 */ 1150 static int 1151 vmm_create_vm(struct vmd_vm *vm) 1152 { 1153 struct vm_create_params *vcp = &vm->vm_params.vmc_params; 1154 1155 /* Sanity check arguments */ 1156 if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) 1157 return (EINVAL); 1158 1159 if (vcp->vcp_nmemranges == 0 || 1160 vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) 1161 return (EINVAL); 1162 1163 if (vm->vm_params.vmc_ndisks > VM_MAX_DISKS_PER_VM) 1164 return (EINVAL); 1165 1166 if (vm->vm_params.vmc_nnics > VM_MAX_NICS_PER_VM) 1167 return (EINVAL); 1168 1169 if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1) 1170 return (errno); 1171 1172 return (0); 1173 } 1174 1175 /* 1176 * init_emulated_hw 1177 * 1178 * Initializes the userspace hardware emulation 1179 */ 1180 void 1181 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom, 1182 int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) 1183 { 1184 struct vm_create_params *vcp = &vmc->vmc_params; 1185 size_t i; 1186 uint64_t memlo, memhi; 1187 1188 /* Calculate memory size for NVRAM registers */ 1189 memlo = memhi = 0; 1190 for (i = 0; i < vcp->vcp_nmemranges; i++) { 1191 if (vcp->vcp_memranges[i].vmr_gpa == MB(1) && 1192 vcp->vcp_memranges[i].vmr_size > (15 * MB(1))) 1193 memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1)); 1194 else if (vcp->vcp_memranges[i].vmr_gpa == GB(4)) 1195 memhi = vcp->vcp_memranges[i].vmr_size; 1196 } 1197 1198 /* Reset the IO port map */ 1199 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); 1200 1201 /* Init i8253 PIT */ 1202 i8253_init(vcp->vcp_id); 1203 ioports_map[TIMER_CTRL] = vcpu_exit_i8253; 1204 ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; 1205 ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; 1206 ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; 1207 
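/*
 * Guests also poll PIT counter 2 state (e.g. for delay loops) through a
 * separate port, so route it to the PIT's misc handler as well.
 */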
ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc; 1208 1209 /* Init mc146818 RTC */ 1210 mc146818_init(vcp->vcp_id, memlo, memhi); 1211 ioports_map[IO_RTC] = vcpu_exit_mc146818; 1212 ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; 1213 1214 /* Init master and slave PICs */ 1215 i8259_init(); 1216 ioports_map[IO_ICU1] = vcpu_exit_i8259; 1217 ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; 1218 ioports_map[IO_ICU2] = vcpu_exit_i8259; 1219 ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; 1220 ioports_map[ELCR0] = vcpu_exit_elcr; 1221 ioports_map[ELCR1] = vcpu_exit_elcr; 1222 1223 /* Init ns8250 UART */ 1224 ns8250_init(con_fd, vcp->vcp_id); 1225 for (i = COM1_DATA; i <= COM1_SCR; i++) 1226 ioports_map[i] = vcpu_exit_com; 1227 1228 /* Initialize PCI */ 1229 for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++) 1230 ioports_map[i] = vcpu_exit_pci; 1231 1232 ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; 1233 ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; 1234 ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; 1235 ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci; 1236 ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci; 1237 pci_init(); 1238 1239 /* Initialize virtio devices */ 1240 virtio_init(current_vm, child_cdrom, child_disks, child_taps); 1241 1242 /* 1243 * Init QEMU fw_cfg interface. Must be done last for pci hardware 1244 * detection. 1245 */ 1246 fw_cfg_init(vmc); 1247 ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; 1248 ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; 1249 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; 1250 ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; 1251 } 1252 1253 /* 1254 * restore_emulated_hw 1255 * 1256 * Restores the userspace hardware emulation from fd 1257 */ 1258 void 1259 restore_emulated_hw(struct vm_create_params *vcp, int fd, 1260 int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom) 1261 { 1262 /* struct vm_create_params *vcp = &vmc->vmc_params; */ 1263 int i; 1264 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); 1265 1266 /* Init i8253 PIT */ 1267 i8253_restore(fd, vcp->vcp_id); 1268 ioports_map[TIMER_CTRL] = vcpu_exit_i8253; 1269 ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; 1270 ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; 1271 ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; 1272 1273 /* Init master and slave PICs */ 1274 i8259_restore(fd); 1275 ioports_map[IO_ICU1] = vcpu_exit_i8259; 1276 ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; 1277 ioports_map[IO_ICU2] = vcpu_exit_i8259; 1278 ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; 1279 1280 /* Init ns8250 UART */ 1281 ns8250_restore(fd, con_fd, vcp->vcp_id); 1282 for (i = COM1_DATA; i <= COM1_SCR; i++) 1283 ioports_map[i] = vcpu_exit_com; 1284 1285 /* Init mc146818 RTC */ 1286 mc146818_restore(fd, vcp->vcp_id); 1287 ioports_map[IO_RTC] = vcpu_exit_mc146818; 1288 ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; 1289 1290 /* Init QEMU fw_cfg interface */ 1291 fw_cfg_restore(fd); 1292 ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; 1293 ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; 1294 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; 1295 ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; 1296 1297 /* Initialize PCI */ 1298 for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++) 1299 ioports_map[i] = vcpu_exit_pci; 1300 1301 ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; 1302 ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; 1303 ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; 1304 
ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci; 1305 ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci; 1306 pci_restore(fd); 1307 virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps); 1308 } 1309 1310 /* 1311 * run_vm 1312 * 1313 * Runs the VM whose creation parameters are specified in vcp 1314 * 1315 * Parameters: 1316 * child_cdrom: previously-opened child ISO disk file descriptor 1317 * child_disks: previously-opened child VM disk file file descriptors 1318 * child_taps: previously-opened child tap file descriptors 1319 * vmc: vmop_create_params struct containing the VM's desired creation 1320 * configuration 1321 * vrs: VCPU register state to initialize 1322 * 1323 * Return values: 1324 * 0: the VM exited normally 1325 * !0 : the VM exited abnormally or failed to start 1326 */ 1327 static int 1328 run_vm(struct vmop_create_params *vmc, struct vcpu_reg_state *vrs) 1329 { 1330 struct vm_create_params *vcp = &vmc->vmc_params; 1331 struct vm_rwregs_params vregsp; 1332 uint8_t evdone = 0; 1333 size_t i; 1334 int ret; 1335 pthread_t *tid, evtid; 1336 char tname[MAXCOMLEN + 1]; 1337 struct vm_run_params **vrp; 1338 void *exit_status; 1339 1340 if (vcp == NULL) 1341 return (EINVAL); 1342 1343 if (vcp->vcp_nmemranges == 0 || 1344 vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) 1345 return (EINVAL); 1346 1347 tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t)); 1348 vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *)); 1349 if (tid == NULL || vrp == NULL) { 1350 log_warn("%s: memory allocation error - exiting.", 1351 __progname); 1352 return (ENOMEM); 1353 } 1354 1355 log_debug("%s: starting %zu vcpu thread(s) for vm %s", __func__, 1356 vcp->vcp_ncpus, vcp->vcp_name); 1357 1358 /* 1359 * Create and launch one thread for each VCPU. These threads may 1360 * migrate between PCPUs over time; the need to reload CPU state 1361 * in such situations is detected and performed by vmm(4) in the 1362 * kernel. 
1363 */ 1364 for (i = 0 ; i < vcp->vcp_ncpus; i++) { 1365 vrp[i] = malloc(sizeof(struct vm_run_params)); 1366 if (vrp[i] == NULL) { 1367 log_warn("%s: memory allocation error - " 1368 "exiting.", __progname); 1369 /* caller will exit, so skip freeing */ 1370 return (ENOMEM); 1371 } 1372 vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit)); 1373 if (vrp[i]->vrp_exit == NULL) { 1374 log_warn("%s: memory allocation error - " 1375 "exiting.", __progname); 1376 /* caller will exit, so skip freeing */ 1377 return (ENOMEM); 1378 } 1379 vrp[i]->vrp_vm_id = vcp->vcp_id; 1380 vrp[i]->vrp_vcpu_id = i; 1381 1382 if (vcpu_reset(vcp->vcp_id, i, vrs)) { 1383 log_warnx("%s: cannot reset VCPU %zu - exiting.", 1384 __progname, i); 1385 return (EIO); 1386 } 1387 1388 /* once more because reset_cpu changes regs */ 1389 if (current_vm->vm_state & VM_STATE_RECEIVED) { 1390 vregsp.vrwp_vm_id = vcp->vcp_id; 1391 vregsp.vrwp_vcpu_id = i; 1392 vregsp.vrwp_regs = *vrs; 1393 vregsp.vrwp_mask = VM_RWREGS_ALL; 1394 if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS, 1395 &vregsp)) == -1) { 1396 log_warn("%s: writeregs failed", __func__); 1397 return (ret); 1398 } 1399 } 1400 1401 ret = pthread_cond_init(&vcpu_run_cond[i], NULL); 1402 if (ret) { 1403 log_warnx("%s: cannot initialize cond var (%d)", 1404 __progname, ret); 1405 return (ret); 1406 } 1407 1408 ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL); 1409 if (ret) { 1410 log_warnx("%s: cannot initialize mtx (%d)", 1411 __progname, ret); 1412 return (ret); 1413 } 1414 1415 ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL); 1416 if (ret) { 1417 log_warnx("%s: cannot initialize unpause var (%d)", 1418 __progname, ret); 1419 return (ret); 1420 } 1421 1422 ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL); 1423 if (ret) { 1424 log_warnx("%s: cannot initialize unpause mtx (%d)", 1425 __progname, ret); 1426 return (ret); 1427 } 1428 1429 vcpu_hlt[i] = 0; 1430 1431 /* Start each VCPU run thread at vcpu_run_loop */ 1432 ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]); 1433 if (ret) { 1434 /* caller will _exit after this return */ 1435 ret = errno; 1436 log_warn("%s: could not create vcpu thread %zu", 1437 __func__, i); 1438 return (ret); 1439 } 1440 1441 snprintf(tname, sizeof(tname), "vcpu-%zu", i); 1442 pthread_set_name_np(tid[i], tname); 1443 } 1444 1445 log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name); 1446 ret = pthread_create(&evtid, NULL, event_thread, &evdone); 1447 if (ret) { 1448 errno = ret; 1449 log_warn("%s: could not create event thread", __func__); 1450 return (ret); 1451 } 1452 pthread_set_name_np(evtid, "event"); 1453 1454 for (;;) { 1455 ret = pthread_cond_wait(&threadcond, &threadmutex); 1456 if (ret) { 1457 log_warn("%s: waiting on thread state condition " 1458 "variable failed", __func__); 1459 return (ret); 1460 } 1461 1462 /* 1463 * Did a VCPU thread exit with an error? => return the first one 1464 */ 1465 for (i = 0; i < vcp->vcp_ncpus; i++) { 1466 if (vcpu_done[i] == 0) 1467 continue; 1468 1469 if (pthread_join(tid[i], &exit_status)) { 1470 log_warn("%s: failed to join thread %zd - " 1471 "exiting", __progname, i); 1472 return (EIO); 1473 } 1474 1475 ret = (intptr_t)exit_status; 1476 } 1477 1478 /* Did the event thread exit? 
=> return with an error */ 1479 if (evdone) { 1480 if (pthread_join(evtid, &exit_status)) { 1481 log_warn("%s: failed to join event thread - " 1482 "exiting", __progname); 1483 return (EIO); 1484 } 1485 1486 log_warnx("%s: vm %d event thread exited " 1487 "unexpectedly", __progname, vcp->vcp_id); 1488 return (EIO); 1489 } 1490 1491 /* Did all VCPU threads exit successfully? => return */ 1492 for (i = 0; i < vcp->vcp_ncpus; i++) { 1493 if (vcpu_done[i] == 0) 1494 break; 1495 } 1496 if (i == vcp->vcp_ncpus) 1497 return (ret); 1498 1499 /* Some more threads to wait for, start over */ 1500 } 1501 1502 return (ret); 1503 } 1504 1505 void * 1506 event_thread(void *arg) 1507 { 1508 uint8_t *donep = arg; 1509 intptr_t ret; 1510 1511 ret = event_dispatch(); 1512 1513 mutex_lock(&threadmutex); 1514 *donep = 1; 1515 pthread_cond_signal(&threadcond); 1516 mutex_unlock(&threadmutex); 1517 1518 return (void *)ret; 1519 } 1520 1521 /* 1522 * vcpu_run_loop 1523 * 1524 * Runs a single VCPU until vmm(4) requires help handling an exit, 1525 * or the VM terminates. 1526 * 1527 * Parameters: 1528 * arg: vcpu_run_params for the VCPU being run by this thread 1529 * 1530 * Return values: 1531 * NULL: the VCPU shutdown properly 1532 * !NULL: error processing VCPU run, or the VCPU shutdown abnormally 1533 */ 1534 void * 1535 vcpu_run_loop(void *arg) 1536 { 1537 struct vm_run_params *vrp = (struct vm_run_params *)arg; 1538 intptr_t ret = 0; 1539 int irq; 1540 uint32_t n; 1541 1542 vrp->vrp_continue = 0; 1543 n = vrp->vrp_vcpu_id; 1544 1545 for (;;) { 1546 ret = pthread_mutex_lock(&vcpu_run_mtx[n]); 1547 1548 if (ret) { 1549 log_warnx("%s: can't lock vcpu run mtx (%d)", 1550 __func__, (int)ret); 1551 return ((void *)ret); 1552 } 1553 1554 /* If we are halted and need to pause, pause */ 1555 if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) { 1556 ret = pthread_barrier_wait(&vm_pause_barrier); 1557 if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) { 1558 log_warnx("%s: could not wait on pause barrier (%d)", 1559 __func__, (int)ret); 1560 return ((void *)ret); 1561 } 1562 1563 ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]); 1564 if (ret) { 1565 log_warnx("%s: can't lock vcpu unpause mtx (%d)", 1566 __func__, (int)ret); 1567 return ((void *)ret); 1568 } 1569 1570 /* i8259 may be firing as we pause, release run mtx. */ 1571 mutex_unlock(&vcpu_run_mtx[n]); 1572 ret = pthread_cond_wait(&vcpu_unpause_cond[n], 1573 &vcpu_unpause_mtx[n]); 1574 if (ret) { 1575 log_warnx( 1576 "%s: can't wait on unpause cond (%d)", 1577 __func__, (int)ret); 1578 break; 1579 } 1580 mutex_lock(&vcpu_run_mtx[n]); 1581 1582 ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]); 1583 if (ret) { 1584 log_warnx("%s: can't unlock unpause mtx (%d)", 1585 __func__, (int)ret); 1586 break; 1587 } 1588 } 1589 1590 /* If we are halted and not paused, wait */ 1591 if (vcpu_hlt[n]) { 1592 ret = pthread_cond_wait(&vcpu_run_cond[n], 1593 &vcpu_run_mtx[n]); 1594 1595 if (ret) { 1596 log_warnx( 1597 "%s: can't wait on cond (%d)", 1598 __func__, (int)ret); 1599 (void)pthread_mutex_unlock( 1600 &vcpu_run_mtx[n]); 1601 break; 1602 } 1603 } 1604 1605 ret = pthread_mutex_unlock(&vcpu_run_mtx[n]); 1606 1607 if (ret) { 1608 log_warnx("%s: can't unlock mutex on cond (%d)", 1609 __func__, (int)ret); 1610 break; 1611 } 1612 1613 if (vrp->vrp_irqready && i8259_is_pending()) { 1614 irq = i8259_ack(); 1615 vrp->vrp_irq = irq; 1616 } else 1617 vrp->vrp_irq = 0xFFFF; 1618 1619 /* Still more interrupts pending? 
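If so, vmm(4) can use this hint to request an interrupt window, giving
us another chance to inject once the guest becomes interruptible.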
*/ 1620 vrp->vrp_intr_pending = i8259_is_pending(); 1621 1622 if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) { 1623 /* If run ioctl failed, exit */ 1624 ret = errno; 1625 log_warn("%s: vm %d / vcpu %d run ioctl failed", 1626 __func__, current_vm->vm_vmid, n); 1627 break; 1628 } 1629 1630 /* If the VM is terminating, exit normally */ 1631 if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) { 1632 ret = (intptr_t)NULL; 1633 break; 1634 } 1635 1636 if (vrp->vrp_exit_reason != VM_EXIT_NONE) { 1637 /* 1638 * vmm(4) needs help handling an exit, handle in 1639 * vcpu_exit. 1640 */ 1641 ret = vcpu_exit(vrp); 1642 if (ret) 1643 break; 1644 } 1645 } 1646 1647 mutex_lock(&threadmutex); 1648 vcpu_done[n] = 1; 1649 pthread_cond_signal(&threadcond); 1650 mutex_unlock(&threadmutex); 1651 1652 return ((void *)ret); 1653 } 1654 1655 int 1656 vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr) 1657 { 1658 struct vm_intr_params vip; 1659 1660 memset(&vip, 0, sizeof(vip)); 1661 1662 vip.vip_vm_id = vm_id; 1663 vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */ 1664 vip.vip_intr = intr; 1665 1666 if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1) 1667 return (errno); 1668 1669 return (0); 1670 } 1671 1672 /* 1673 * vcpu_exit_pci 1674 * 1675 * Handle all I/O to the emulated PCI subsystem. 1676 * 1677 * Parameters: 1678 * vrp: vcpu run parameters containing guest state for this exit 1679 * 1680 * Return value: 1681 * Interrupt to inject to the guest VM, or 0xFF if no interrupt should 1682 * be injected. 1683 */ 1684 uint8_t 1685 vcpu_exit_pci(struct vm_run_params *vrp) 1686 { 1687 struct vm_exit *vei = vrp->vrp_exit; 1688 uint8_t intr; 1689 1690 intr = 0xFF; 1691 1692 switch (vei->vei.vei_port) { 1693 case PCI_MODE1_ADDRESS_REG: 1694 pci_handle_address_reg(vrp); 1695 break; 1696 case PCI_MODE1_DATA_REG: 1697 case PCI_MODE1_DATA_REG + 1: 1698 case PCI_MODE1_DATA_REG + 2: 1699 case PCI_MODE1_DATA_REG + 3: 1700 pci_handle_data_reg(vrp); 1701 break; 1702 case VM_PCI_IO_BAR_BASE ... VM_PCI_IO_BAR_END: 1703 intr = pci_handle_io(vrp); 1704 break; 1705 default: 1706 log_warnx("%s: unknown PCI register 0x%llx", 1707 __progname, (uint64_t)vei->vei.vei_port); 1708 break; 1709 } 1710 1711 return (intr); 1712 } 1713 1714 /* 1715 * vcpu_exit_inout 1716 * 1717 * Handle all I/O exits that need to be emulated in vmd. This includes the 1718 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device. 1719 * 1720 * Parameters: 1721 * vrp: vcpu run parameters containing guest state for this exit 1722 */ 1723 void 1724 vcpu_exit_inout(struct vm_run_params *vrp) 1725 { 1726 struct vm_exit *vei = vrp->vrp_exit; 1727 uint8_t intr = 0xFF; 1728 1729 if (vei->vei.vei_rep || vei->vei.vei_string) { 1730 #ifdef MMIO_DEBUG 1731 log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x", 1732 __func__, 1733 vei->vei.vei_rep == 0 ? "" : "REP ", 1734 vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT", 1735 vei->vei.vei_string == 0 ? 
"" : "S", 1736 vei->vei.vei_size, vei->vei.vei_encoding, 1737 vei->vei.vei_data, vei->vei.vei_port); 1738 log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx", 1739 __func__, 1740 vei->vrs.vrs_gprs[VCPU_REGS_RCX], 1741 vei->vrs.vrs_gprs[VCPU_REGS_RDX], 1742 vei->vrs.vrs_gprs[VCPU_REGS_RSI]); 1743 #endif /* MMIO_DEBUG */ 1744 fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)", 1745 __func__); 1746 } 1747 1748 if (ioports_map[vei->vei.vei_port] != NULL) 1749 intr = ioports_map[vei->vei.vei_port](vrp); 1750 else if (vei->vei.vei_dir == VEI_DIR_IN) 1751 set_return_data(vei, 0xFFFFFFFF); 1752 1753 vei->vrs.vrs_gprs[VCPU_REGS_RIP] += vei->vei.vei_insn_len; 1754 1755 if (intr != 0xFF) 1756 vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr); 1757 } 1758 1759 /* 1760 * vcpu_exit_eptviolation 1761 * 1762 * handle an EPT Violation 1763 * 1764 * Parameters: 1765 * vrp: vcpu run parameters containing guest state for this exit 1766 * 1767 * Return values: 1768 * 0: no action required 1769 * EFAULT: a protection fault occured, kill the vm. 1770 */ 1771 int 1772 vcpu_exit_eptviolation(struct vm_run_params *vrp) 1773 { 1774 struct vm_exit *ve = vrp->vrp_exit; 1775 int ret = 0; 1776 #if MMIO_NOTYET 1777 struct x86_insn insn; 1778 uint64_t va, pa; 1779 size_t len = 15; /* Max instruction length in x86. */ 1780 #endif /* MMIO_NOTYET */ 1781 switch (ve->vee.vee_fault_type) { 1782 case VEE_FAULT_HANDLED: 1783 log_debug("%s: fault already handled", __func__); 1784 break; 1785 1786 #if MMIO_NOTYET 1787 case VEE_FAULT_MMIO_ASSIST: 1788 /* Intel VMX might give us the length of the instruction. */ 1789 if (ve->vee.vee_insn_info & VEE_LEN_VALID) 1790 len = ve->vee.vee_insn_len; 1791 1792 if (len > 15) 1793 fatalx("%s: invalid instruction length %lu", __func__, 1794 len); 1795 1796 /* If we weren't given instruction bytes, we need to fetch. */ 1797 if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) { 1798 memset(ve->vee.vee_insn_bytes, 0, 1799 sizeof(ve->vee.vee_insn_bytes)); 1800 va = ve->vrs.vrs_gprs[VCPU_REGS_RIP]; 1801 1802 /* XXX Only support instructions that fit on 1 page. */ 1803 if ((va & PAGE_MASK) + len > PAGE_SIZE) { 1804 log_warnx("%s: instruction might cross page " 1805 "boundary", __func__); 1806 ret = EINVAL; 1807 break; 1808 } 1809 1810 ret = translate_gva(ve, va, &pa, PROT_EXEC); 1811 if (ret != 0) { 1812 log_warnx("%s: failed gva translation", 1813 __func__); 1814 break; 1815 } 1816 1817 ret = read_mem(pa, ve->vee.vee_insn_bytes, len); 1818 if (ret != 0) { 1819 log_warnx("%s: failed to fetch instruction " 1820 "bytes from 0x%llx", __func__, pa); 1821 break; 1822 } 1823 } 1824 1825 ret = insn_decode(ve, &insn); 1826 if (ret == 0) 1827 ret = insn_emulate(ve, &insn); 1828 break; 1829 #endif /* MMIO_NOTYET */ 1830 1831 case VEE_FAULT_PROTECT: 1832 log_debug("%s: EPT Violation: rip=0x%llx", __progname, 1833 ve->vrs.vrs_gprs[VCPU_REGS_RIP]); 1834 ret = EFAULT; 1835 break; 1836 1837 default: 1838 fatalx("%s: invalid fault_type %d", __progname, 1839 ve->vee.vee_fault_type); 1840 /* UNREACHED */ 1841 } 1842 1843 return (ret); 1844 } 1845 1846 /* 1847 * vcpu_exit 1848 * 1849 * Handle a vcpu exit. This function is called when it is determined that 1850 * vmm(4) requires the assistance of vmd to support a particular guest 1851 * exit type (eg, accessing an I/O port or device). Guest state is contained 1852 * in 'vrp', and will be resent to vmm(4) on exit completion. 
1853 * 1854 * Upon conclusion of handling the exit, the function determines if any 1855 * interrupts should be injected into the guest, and asserts the proper 1856 * IRQ line whose interrupt should be vectored. 1857 * 1858 * Parameters: 1859 * vrp: vcpu run parameters containing guest state for this exit 1860 * 1861 * Return values: 1862 * 0: the exit was handled successfully 1863 * 1: an error occurred (eg, unknown exit reason passed in 'vrp') 1864 */ 1865 int 1866 vcpu_exit(struct vm_run_params *vrp) 1867 { 1868 int ret; 1869 1870 switch (vrp->vrp_exit_reason) { 1871 case VMX_EXIT_INT_WINDOW: 1872 case SVM_VMEXIT_VINTR: 1873 case VMX_EXIT_CPUID: 1874 case VMX_EXIT_EXTINT: 1875 case SVM_VMEXIT_INTR: 1876 case SVM_VMEXIT_MSR: 1877 case SVM_VMEXIT_CPUID: 1878 /* 1879 * We may be exiting to vmd to handle a pending interrupt but 1880 * at the same time the last exit type may have been one of 1881 * these. In this case, there's nothing extra to be done 1882 * here (and falling through to the default case below results 1883 * in more vmd log spam). 1884 */ 1885 break; 1886 case SVM_VMEXIT_NPF: 1887 case VMX_EXIT_EPT_VIOLATION: 1888 ret = vcpu_exit_eptviolation(vrp); 1889 if (ret) 1890 return (ret); 1891 break; 1892 case VMX_EXIT_IO: 1893 case SVM_VMEXIT_IOIO: 1894 vcpu_exit_inout(vrp); 1895 break; 1896 case VMX_EXIT_HLT: 1897 case SVM_VMEXIT_HLT: 1898 ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); 1899 if (ret) { 1900 log_warnx("%s: can't lock vcpu mutex (%d)", 1901 __func__, ret); 1902 return (ret); 1903 } 1904 vcpu_hlt[vrp->vrp_vcpu_id] = 1; 1905 ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); 1906 if (ret) { 1907 log_warnx("%s: can't unlock vcpu mutex (%d)", 1908 __func__, ret); 1909 return (ret); 1910 } 1911 break; 1912 case VMX_EXIT_TRIPLE_FAULT: 1913 case SVM_VMEXIT_SHUTDOWN: 1914 /* reset VM */ 1915 return (EAGAIN); 1916 default: 1917 log_debug("%s: unknown exit reason 0x%x", 1918 __progname, vrp->vrp_exit_reason); 1919 } 1920 1921 vrp->vrp_continue = 1; 1922 1923 return (0); 1924 } 1925 1926 /* 1927 * find_gpa_range 1928 * 1929 * Search for a contiguous guest physical mem range. 1930 * 1931 * Parameters: 1932 * vcp: VM create parameters that contain the memory map to search in 1933 * gpa: the starting guest physical address 1934 * len: the length of the memory range 1935 * 1936 * Return values: 1937 * NULL: on failure if there is no memory range as described by the parameters 1938 * Pointer to vm_mem_range that contains the start of the range otherwise. 1939 */ 1940 static struct vm_mem_range * 1941 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len) 1942 { 1943 size_t i, n; 1944 struct vm_mem_range *vmr; 1945 1946 /* Find the first vm_mem_range that contains gpa */ 1947 for (i = 0; i < vcp->vcp_nmemranges; i++) { 1948 vmr = &vcp->vcp_memranges[i]; 1949 if (gpa < vmr->vmr_gpa + vmr->vmr_size) 1950 break; 1951 } 1952 1953 /* No range found. */ 1954 if (i == vcp->vcp_nmemranges) 1955 return (NULL); 1956 1957 /* 1958 * vmr may cover the range [gpa, gpa + len) only partly. Make 1959 * sure that the following vm_mem_ranges are contiguous and 1960 * cover the rest. 
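 * For example, a request starting near the end of one range succeeds only
 * if the next range begins at exactly vmr_gpa + vmr_size of the previous
 * one, and so on until the whole length is covered.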
1961 */ 1962 n = vmr->vmr_size - (gpa - vmr->vmr_gpa); 1963 if (len < n) 1964 len = 0; 1965 else 1966 len -= n; 1967 gpa = vmr->vmr_gpa + vmr->vmr_size; 1968 for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) { 1969 vmr = &vcp->vcp_memranges[i]; 1970 if (gpa != vmr->vmr_gpa) 1971 return (NULL); 1972 if (len <= vmr->vmr_size) 1973 len = 0; 1974 else 1975 len -= vmr->vmr_size; 1976 1977 gpa = vmr->vmr_gpa + vmr->vmr_size; 1978 } 1979 1980 if (len != 0) 1981 return (NULL); 1982 1983 return (vmr); 1984 } 1985 1986 /* 1987 * write_mem 1988 * 1989 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'. 1990 * 1991 * Parameters: 1992 * dst: the destination paddr_t in the guest VM 1993 * buf: data to copy (or NULL to zero the data) 1994 * len: number of bytes to copy 1995 * 1996 * Return values: 1997 * 0: success 1998 * EINVAL: if the guest physical memory range [dst, dst + len) does not 1999 * exist in the guest. 2000 */ 2001 int 2002 write_mem(paddr_t dst, const void *buf, size_t len) 2003 { 2004 const char *from = buf; 2005 char *to; 2006 size_t n, off; 2007 struct vm_mem_range *vmr; 2008 2009 vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len); 2010 if (vmr == NULL) { 2011 errno = EINVAL; 2012 log_warn("%s: failed - invalid memory range dst = 0x%lx, " 2013 "len = 0x%zx", __func__, dst, len); 2014 return (EINVAL); 2015 } 2016 2017 off = dst - vmr->vmr_gpa; 2018 while (len != 0) { 2019 n = vmr->vmr_size - off; 2020 if (len < n) 2021 n = len; 2022 2023 to = (char *)vmr->vmr_va + off; 2024 if (buf == NULL) 2025 memset(to, 0, n); 2026 else { 2027 memcpy(to, from, n); 2028 from += n; 2029 } 2030 len -= n; 2031 off = 0; 2032 vmr++; 2033 } 2034 2035 return (0); 2036 } 2037 2038 /* 2039 * read_mem 2040 * 2041 * Reads memory at guest paddr 'src' into 'buf'. 2042 * 2043 * Parameters: 2044 * src: the source paddr_t in the guest VM to read from. 2045 * buf: destination (local) buffer 2046 * len: number of bytes to read 2047 * 2048 * Return values: 2049 * 0: success 2050 * EINVAL: if the guest physical memory range [src, src + len) does not 2051 * exist in the guest. 2052 */ 2053 int 2054 read_mem(paddr_t src, void *buf, size_t len) 2055 { 2056 char *from, *to = buf; 2057 size_t n, off; 2058 struct vm_mem_range *vmr; 2059 2060 vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len); 2061 if (vmr == NULL) { 2062 errno = EINVAL; 2063 log_warn("%s: failed - invalid memory range src = 0x%lx, " 2064 "len = 0x%zx", __func__, src, len); 2065 return (EINVAL); 2066 } 2067 2068 off = src - vmr->vmr_gpa; 2069 while (len != 0) { 2070 n = vmr->vmr_size - off; 2071 if (len < n) 2072 n = len; 2073 2074 from = (char *)vmr->vmr_va + off; 2075 memcpy(to, from, n); 2076 2077 to += n; 2078 len -= n; 2079 off = 0; 2080 vmr++; 2081 } 2082 2083 return (0); 2084 } 2085 2086 /* 2087 * hvaddr_mem 2088 * 2089 * Translate a guest physical address to a host virtual address, checking the 2090 * provided memory range length to confirm it's contiguous within the same 2091 * guest memory range (vm_mem_range).
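 *
 * Illustrative sketch (hypothetical caller; 'gpa' and 'sz' are assumed to
 * describe a buffer the guest handed to an emulated device):
 *
 *	char *p = hvaddr_mem(gpa, sz);
 *	if (p == NULL)
 *		return (errno);
 *	memcpy(p, reply, sz);
 *
 * On failure errno is set to EFAULT or EINVAL as described below.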
2092 * 2093 * Parameters: 2094 * gpa: guest physical address to translate 2095 * len: number of bytes in the intended range 2096 * 2097 * Return values: 2098 * void* to host virtual memory on success 2099 * NULL on error, setting errno to: 2100 * EFAULT: gpa falls outside guest memory ranges 2101 * EINVAL: requested len extends beyond memory range 2102 */ 2103 void * 2104 hvaddr_mem(paddr_t gpa, size_t len) 2105 { 2106 struct vm_mem_range *vmr; 2107 size_t off; 2108 2109 vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len); 2110 if (vmr == NULL) { 2111 log_warnx("%s: failed - invalid gpa: 0x%lx\n", __func__, gpa); 2112 errno = EFAULT; 2113 return (NULL); 2114 } 2115 2116 off = gpa - vmr->vmr_gpa; 2117 if (len > (vmr->vmr_size - off)) { 2118 log_warnx("%s: failed - invalid memory range: gpa=0x%lx, " 2119 "len=%zu", __func__, gpa, len); 2120 errno = EINVAL; 2121 return (NULL); 2122 } 2123 2124 return ((char *)vmr->vmr_va + off); 2125 } 2126 2127 /* 2128 * vcpu_assert_pic_irq 2129 * 2130 * Injects the specified IRQ on the supplied vcpu/vm 2131 * 2132 * Parameters: 2133 * vm_id: VM ID to inject to 2134 * vcpu_id: VCPU ID to inject to 2135 * irq: IRQ to inject 2136 */ 2137 void 2138 vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq) 2139 { 2140 int ret; 2141 2142 i8259_assert_irq(irq); 2143 2144 if (i8259_is_pending()) { 2145 if (vcpu_pic_intr(vm_id, vcpu_id, 1)) 2146 fatalx("%s: can't assert INTR", __func__); 2147 mutex_lock(&vcpu_run_mtx[vcpu_id]); 2148 vcpu_hlt[vcpu_id] = 0; 2149 ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]); 2150 if (ret) 2151 fatalx("%s: can't signal (%d)", __func__, ret); 2152 mutex_unlock(&vcpu_run_mtx[vcpu_id]); 2153 } 2154 } 2155 2156 /* 2157 * vcpu_deassert_pic_irq 2158 * 2159 * Clears the specified IRQ on the supplied vcpu/vm 2160 * 2161 * Parameters: 2162 * vm_id: VM ID to clear in 2163 * vcpu_id: VCPU ID to clear in 2164 * irq: IRQ to clear 2165 */ 2166 void 2167 vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq) 2168 { 2169 i8259_deassert_irq(irq); 2170 2171 if (!i8259_is_pending()) { 2172 if (vcpu_pic_intr(vm_id, vcpu_id, 0)) 2173 fatalx("%s: can't deassert INTR for vm_id %d, " 2174 "vcpu_id %d", __func__, vm_id, vcpu_id); 2175 } 2176 } 2177 2178 /* 2179 * fd_hasdata 2180 * 2181 * Determines if data can be read from a file descriptor. 2182 * 2183 * Parameters: 2184 * fd: the fd to check 2185 * 2186 * Return values: 2187 * 1 if data can be read from an fd, or 0 otherwise.
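 *
 * Note that poll(2) is called with a zero timeout, so the check never
 * blocks; a poll error is logged and treated as "no data".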
2188 */ 2189 int 2190 fd_hasdata(int fd) 2191 { 2192 struct pollfd pfd[1]; 2193 int nready, hasdata = 0; 2194 2195 pfd[0].fd = fd; 2196 pfd[0].events = POLLIN; 2197 nready = poll(pfd, 1, 0); 2198 if (nready == -1) 2199 log_warn("checking file descriptor for data failed"); 2200 else if (nready == 1 && pfd[0].revents & POLLIN) 2201 hasdata = 1; 2202 return (hasdata); 2203 } 2204 2205 /* 2206 * mutex_lock 2207 * 2208 * Wrapper function for pthread_mutex_lock that does error checking and that 2209 * exits on failure 2210 */ 2211 void 2212 mutex_lock(pthread_mutex_t *m) 2213 { 2214 int ret; 2215 2216 ret = pthread_mutex_lock(m); 2217 if (ret) { 2218 errno = ret; 2219 fatal("could not acquire mutex"); 2220 } 2221 } 2222 2223 /* 2224 * mutex_unlock 2225 * 2226 * Wrapper function for pthread_mutex_unlock that does error checking and that 2227 * exits on failure 2228 */ 2229 void 2230 mutex_unlock(pthread_mutex_t *m) 2231 { 2232 int ret; 2233 2234 ret = pthread_mutex_unlock(m); 2235 if (ret) { 2236 errno = ret; 2237 fatal("could not release mutex"); 2238 } 2239 } 2240 2241 /* 2242 * set_return_data 2243 * 2244 * Utility function for manipulating register data in vm exit info structs. This 2245 * function ensures that the data is copied to the vei->vei.vei_data field with 2246 * the proper size for the operation being performed. 2247 * 2248 * Parameters: 2249 * vei: exit information 2250 * data: return data 2251 */ 2252 void 2253 set_return_data(struct vm_exit *vei, uint32_t data) 2254 { 2255 switch (vei->vei.vei_size) { 2256 case 1: 2257 vei->vei.vei_data &= ~0xFF; 2258 vei->vei.vei_data |= (uint8_t)data; 2259 break; 2260 case 2: 2261 vei->vei.vei_data &= ~0xFFFF; 2262 vei->vei.vei_data |= (uint16_t)data; 2263 break; 2264 case 4: 2265 vei->vei.vei_data = data; 2266 break; 2267 } 2268 } 2269 2270 /* 2271 * get_input_data 2272 * 2273 * Utility function for manipulating register data in vm exit info 2274 * structs. This function ensures that the data is copied from the 2275 * vei->vei.vei_data field with the proper size for the operation being 2276 * performed. 2277 * 2278 * Parameters: 2279 * vei: exit information 2280 * data: location to store the result 2281 */ 2282 void 2283 get_input_data(struct vm_exit *vei, uint32_t *data) 2284 { 2285 switch (vei->vei.vei_size) { 2286 case 1: 2287 *data &= 0xFFFFFF00; 2288 *data |= (uint8_t)vei->vei.vei_data; 2289 break; 2290 case 2: 2291 *data &= 0xFFFF0000; 2292 *data |= (uint16_t)vei->vei.vei_data; 2293 break; 2294 case 4: 2295 *data = vei->vei.vei_data; 2296 break; 2297 default: 2298 log_warnx("%s: invalid i/o size %d", __func__, 2299 vei->vei.vei_size); 2300 } 2301 2302 } 2303 2304 /* 2305 * translate_gva 2306 * 2307 * Translates a guest virtual address to a guest physical address by walking 2308 * the currently active page table (if needed). 2309 * 2310 * XXX ensure translate_gva updates the A bit in the PTE 2311 * XXX ensure translate_gva respects segment base and limits in i386 mode 2312 * XXX ensure translate_gva respects segment wraparound in i8086 mode 2313 * XXX ensure translate_gva updates the A bit in the segment selector 2314 * XXX ensure translate_gva respects CR4.LMSLE if available 2315 * 2316 * Parameters: 2317 * exit: The VCPU this translation should be performed for (guest MMU settings 2318 * are gathered from this VCPU) 2319 * va: virtual address to translate 2320 * pa: pointer to paddr_t variable that will receive the translated physical 2321 * address. 'pa' is unchanged on error. 
2322 * mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which 2323 * the address should be translated 2324 * 2325 * Return values: 2326 * 0: the address was successfully translated - 'pa' contains the physical 2327 * address currently mapped by 'va'. 2328 * EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case 2329 * and %cr2 set in the vcpu structure. 2330 * EINVAL: an error occurred reading paging table structures 2331 */ 2332 int 2333 translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode) 2334 { 2335 int level, shift, pdidx; 2336 uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask; 2337 uint64_t shift_width, pte_size; 2338 struct vcpu_reg_state *vrs; 2339 2340 vrs = &exit->vrs; 2341 2342 if (!pa) 2343 return (EINVAL); 2344 2345 if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) { 2346 log_debug("%s: unpaged, va=pa=0x%llx", __func__, va); 2347 *pa = va; 2348 return (0); 2349 } 2350 2351 pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3]; 2352 2353 log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__, 2354 vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]); 2355 2356 if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) { 2357 if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) { 2358 pte_size = sizeof(uint64_t); 2359 shift_width = 9; 2360 2361 if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) { 2362 /* 4 level paging */ 2363 level = 4; 2364 mask = L4_MASK; 2365 shift = L4_SHIFT; 2366 } else { 2367 /* 32 bit with PAE paging */ 2368 level = 3; 2369 mask = L3_MASK; 2370 shift = L3_SHIFT; 2371 } 2372 } else { 2373 /* 32 bit paging */ 2374 level = 2; 2375 shift_width = 10; 2376 mask = 0xFFC00000; 2377 shift = 22; 2378 pte_size = sizeof(uint32_t); 2379 } 2380 } else 2381 return (EINVAL); 2382 2383 /* XXX: Check for R bit in segment selector and set A bit */ 2384 2385 for (;level > 0; level--) { 2386 pdidx = (va & mask) >> shift; 2387 pte_paddr = (pt_paddr) + (pdidx * pte_size); 2388 2389 log_debug("%s: read pte level %d @ GPA 0x%llx", __func__, 2390 level, pte_paddr); 2391 if (read_mem(pte_paddr, &pte, pte_size)) { 2392 log_warn("%s: failed to read pte", __func__); 2393 return (EFAULT); 2394 } 2395 2396 log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr, 2397 pte); 2398 2399 /* XXX: Set CR2 */ 2400 if (!(pte & PG_V)) 2401 return (EFAULT); 2402 2403 /* XXX: Check for SMAP */ 2404 if ((mode == PROT_WRITE) && !(pte & PG_RW)) 2405 return (EPERM); 2406 2407 if ((exit->cpl > 0) && !(pte & PG_u)) 2408 return (EPERM); 2409 2410 pte = pte | PG_U; 2411 if (mode == PROT_WRITE) 2412 pte = pte | PG_M; 2413 if (write_mem(pte_paddr, &pte, pte_size)) { 2414 log_warn("%s: failed to write back flags to pte", 2415 __func__); 2416 return (EIO); 2417 } 2418 2419 /* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */ 2420 if (pte & PG_PS) 2421 break; 2422 2423 if (level > 1) { 2424 pt_paddr = pte & PG_FRAME; 2425 shift -= shift_width; 2426 mask = mask >> shift_width; 2427 } 2428 } 2429 2430 low_mask = (1 << shift) - 1; 2431 high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask; 2432 *pa = (pte & high_mask) | (va & low_mask); 2433 2434 log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa); 2435 2436 return (0); 2437 } 2438 2439 /* 2440 * vm_pipe_init 2441 * 2442 * Initialize a vm_dev_pipe, setting up its file descriptors and its 2443 * event structure with the given callback. 
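 *
 * Typical use is a sketch like the following (the callback name and
 * message value are hypothetical; vmd's devices define their own):
 *
 *	struct vm_dev_pipe p;
 *
 *	vm_pipe_init(&p, mydev_dispatch);
 *	event_add(&p.read_ev, NULL);
 *	...
 *	vm_pipe_send(&p, MYDEV_DO_WORK);
 *
 * The read end's callback then calls vm_pipe_recv() to consume the message.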
2444 * 2445 * Parameters: 2446 * p: pointer to vm_dev_pipe struct to initialize 2447 * cb: callback to use for READ events on the read end of the pipe 2448 */ 2449 void 2450 vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *)) 2451 { 2452 int ret; 2453 int fds[2]; 2454 2455 memset(p, 0, sizeof(struct vm_dev_pipe)); 2456 2457 ret = pipe(fds); 2458 if (ret) 2459 fatal("failed to create vm_dev_pipe pipe"); 2460 2461 p->read = fds[0]; 2462 p->write = fds[1]; 2463 2464 event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL); 2465 } 2466 2467 /* 2468 * vm_pipe_send 2469 * 2470 * Send a message to an emulated device via the provided vm_dev_pipe. 2471 * 2472 * Parameters: 2473 * p: pointer to initialized vm_dev_pipe 2474 * msg: message to send in the channel 2475 */ 2476 void 2477 vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg) 2478 { 2479 size_t n; 2480 n = write(p->write, &msg, sizeof(msg)); 2481 if (n != sizeof(msg)) 2482 fatal("failed to write to device pipe"); 2483 } 2484 2485 /* 2486 * vm_pipe_recv 2487 * 2488 * Receive a message for an emulated device via the provided vm_dev_pipe. 2489 * Returns the message value, otherwise will exit on failure. 2490 * 2491 * Parameters: 2492 * p: pointer to initialized vm_dev_pipe 2493 * 2494 * Return values: 2495 * a value of enum pipe_msg_type or fatal exit on read(2) error 2496 */ 2497 enum pipe_msg_type 2498 vm_pipe_recv(struct vm_dev_pipe *p) 2499 { 2500 size_t n; 2501 enum pipe_msg_type msg; 2502 n = read(p->read, &msg, sizeof(msg)); 2503 if (n != sizeof(msg)) 2504 fatal("failed to read from device pipe"); 2505 2506 return msg; 2507 } 2508 2509 /* 2510 * Re-map the guest address space using the shared memory file descriptor. 2511 * 2512 * Returns 0 on success, non-zero in the event of failure. 2513 */ 2514 int 2515 remap_guest_mem(struct vmd_vm *vm, int vmm_fd) 2516 { 2517 struct vm_create_params *vcp; 2518 struct vm_mem_range *vmr; 2519 struct vm_sharemem_params vsp; 2520 size_t i, j; 2521 void *p = NULL; 2522 int ret; 2523 2524 if (vm == NULL) 2525 return (1); 2526 2527 vcp = &vm->vm_params.vmc_params; 2528 2529 /* 2530 * Initialize our VM shared memory request using our original 2531 * creation parameters. We'll overwrite the va's after mmap(2). 2532 */ 2533 memset(&vsp, 0, sizeof(vsp)); 2534 vsp.vsp_nmemranges = vcp->vcp_nmemranges; 2535 vsp.vsp_vm_id = vcp->vcp_id; 2536 memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges, 2537 sizeof(vsp.vsp_memranges)); 2538 2539 /* 2540 * Use mmap(2) to identify virtual address space for our mappings. 2541 */ 2542 for (i = 0; i < VMM_MAX_MEM_RANGES; i++) { 2543 if (i < vsp.vsp_nmemranges) { 2544 vmr = &vsp.vsp_memranges[i]; 2545 2546 /* Ignore any MMIO ranges. */ 2547 if (vmr->vmr_type == VM_MEM_MMIO) { 2548 vmr->vmr_va = 0; 2549 vcp->vcp_memranges[i].vmr_va = 0; 2550 continue; 2551 } 2552 2553 /* Make initial mappings for the memrange. */ 2554 p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1, 2555 0); 2556 if (p == MAP_FAILED) { 2557 ret = errno; 2558 log_warn("%s: mmap", __func__); 2559 for (j = 0; j < i; j++) { 2560 vmr = &vcp->vcp_memranges[j]; 2561 munmap((void *)vmr->vmr_va, 2562 vmr->vmr_size); 2563 } 2564 return (ret); 2565 } 2566 vmr->vmr_va = (vaddr_t)p; 2567 vcp->vcp_memranges[i].vmr_va = vmr->vmr_va; 2568 } 2569 } 2570 2571 /* 2572 * munmap(2) now that we have va's and ranges that don't overlap. vmm 2573 * will use the va's and sizes to recreate the mappings for us.
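 * (The PROT_READ anonymous mappings made above were only placeholders used
 * to reserve non-overlapping address ranges; no guest data lives in them.)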
2574 */ 2575 for (i = 0; i < vsp.vsp_nmemranges; i++) { 2576 vmr = &vsp.vsp_memranges[i]; 2577 if (vmr->vmr_type == VM_MEM_MMIO) 2578 continue; 2579 if (munmap((void*)vmr->vmr_va, vmr->vmr_size) == -1) 2580 fatal("%s: munmap", __func__); 2581 } 2582 2583 /* 2584 * Ask vmm to enter the shared mappings for us. They'll point 2585 * to the same host physical memory, but will have a randomized 2586 * virtual address for the calling process. 2587 */ 2588 if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1) 2589 return (errno); 2590 2591 return (0); 2592 } 2593
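
/*
 * Illustrative sketch only, not part of vmd: a minimal I/O port handler of
 * the kind installed in ioports_map[], showing how get_input_data(),
 * set_return_data() and the handler's return value (an IRQ number, or 0xFF
 * for none) fit together. The port number, register and names below are
 * hypothetical; real devices live in i8253.c, ns8250.c, virtio.c, etc.
 */
#if 0
#define EXAMPLE_PORT	0x310

static uint8_t example_reg;

static uint8_t
example_io(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint32_t data = 0;

	if (vei->vei.vei_dir == VEI_DIR_OUT) {
		/* Guest OUT: latch the low byte of the written value. */
		get_input_data(vei, &data);
		example_reg = data & 0xFF;
	} else {
		/* Guest IN: return the latched value to the guest. */
		set_return_data(vei, example_reg);
	}

	/* 0xFF means "no interrupt"; any other value asserts that IRQ. */
	return (0xFF);
}

/* Registered once during device setup: */
static void
example_register(void)
{
	ioports_map[EXAMPLE_PORT] = example_io;
}
#endif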