/*	$OpenBSD: virtio.c,v 1.110 2023/11/03 11:16:43 dv Exp $	*/

/*
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>	/* PAGE_SIZE */
#include <sys/socket.h>
#include <sys/wait.h>

#include <machine/vmmvar.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcidevs.h>
#include <dev/pv/virtioreg.h>
#include <dev/pci/virtio_pcireg.h>
#include <dev/pv/vioblkreg.h>
#include <dev/pv/vioscsireg.h>

#include <net/if.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <poll.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "atomicio.h"
#include "pci.h"
#include "vioscsi.h"
#include "virtio.h"
#include "vmd.h"
#include "vmm.h"

extern struct vmd *env;
extern char *__progname;

struct viornd_dev viornd;
struct vioscsi_dev *vioscsi;
struct vmmci_dev vmmci;

/*
 * Devices emulated in subprocesses are inserted into this list.
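 * Each entry is forked and exec'd as its own vmd device process by
 * virtio_dev_launch() and is driven from here over a pair of imsg
 * channels: a synchronous one for register I/O and an asynchronous
 * one for events and interrupts.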
 */
SLIST_HEAD(virtio_dev_head, virtio_dev) virtio_devs;

#define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */

#define VIRTIO_NET_F_MAC	(1 << 5)

#define VMMCI_F_TIMESYNC	(1 << 0)
#define VMMCI_F_ACK		(1 << 1)
#define VMMCI_F_SYNCRTC		(1 << 2)

#define RXQ	0
#define TXQ	1

static int virtio_dev_launch(struct vmd_vm *, struct virtio_dev *);
static void virtio_dispatch_dev(int, short, void *);
static int handle_dev_msg(struct viodev_msg *, struct virtio_dev *);

const char *
virtio_reg_name(uint8_t reg)
{
	switch (reg) {
	case VIRTIO_CONFIG_DEVICE_FEATURES: return "device feature";
	case VIRTIO_CONFIG_GUEST_FEATURES: return "guest feature";
	case VIRTIO_CONFIG_QUEUE_PFN: return "queue address";
	case VIRTIO_CONFIG_QUEUE_SIZE: return "queue size";
	case VIRTIO_CONFIG_QUEUE_SELECT: return "queue select";
	case VIRTIO_CONFIG_QUEUE_NOTIFY: return "queue notify";
	case VIRTIO_CONFIG_DEVICE_STATUS: return "device status";
	case VIRTIO_CONFIG_ISR_STATUS: return "isr status";
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI ...
	    VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
		return "device config 0";
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
		return "device config 1";
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: return "device config 2";
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12: return "device config 3";
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16: return "device config 4";
	default: return "unknown";
	}
}

uint32_t
vring_size(uint32_t vq_size)
{
	uint32_t allocsize1, allocsize2;

	/* allocsize1: descriptor table + avail ring + pad */
	allocsize1 = VIRTQUEUE_ALIGN(sizeof(struct vring_desc) * vq_size
	    + sizeof(uint16_t) * (2 + vq_size));
	/* allocsize2: used ring + pad */
	allocsize2 = VIRTQUEUE_ALIGN(sizeof(uint16_t) * 2
	    + sizeof(struct vring_used_elem) * vq_size);

	return allocsize1 + allocsize2;
}

/* Update queue select */
void
viornd_update_qs(void)
{
	struct virtio_vq_info *vq_info;

	/* Invalid queue? */
	if (viornd.cfg.queue_select > 0) {
		viornd.cfg.queue_size = 0;
		return;
	}

	vq_info = &viornd.vq[viornd.cfg.queue_select];

	/* Update queue pfn/size based on queue select */
	viornd.cfg.queue_pfn = vq_info->q_gpa >> 12;
	viornd.cfg.queue_size = vq_info->qs;
}

/* Update queue address */
void
viornd_update_qa(void)
{
	struct virtio_vq_info *vq_info;
	void *hva = NULL;

	/* Invalid queue? */
	if (viornd.cfg.queue_select > 0)
		return;

	vq_info = &viornd.vq[viornd.cfg.queue_select];
	vq_info->q_gpa = (uint64_t)viornd.cfg.queue_pfn * VIRTIO_PAGE_SIZE;

	hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIORND_QUEUE_SIZE));
	if (hva == NULL)
		fatalx("viornd_update_qa");
	vq_info->q_hva = hva;
}

int
viornd_notifyq(void)
{
	size_t sz;
	int dxx, ret;
	uint16_t aidx, uidx;
	char *vr, *rnd_data;
	struct vring_desc *desc;
	struct vring_avail *avail;
	struct vring_used *used;
	struct virtio_vq_info *vq_info;

	ret = 0;

	/* Invalid queue? */
	if (viornd.cfg.queue_notify > 0)
		return (0);

	vq_info = &viornd.vq[viornd.cfg.queue_notify];
	vr = vq_info->q_hva;
	if (vr == NULL)
		fatalx("%s: null vring", __func__);

	desc = (struct vring_desc *)(vr);
	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);

	aidx = avail->idx & VIORND_QUEUE_MASK;
	uidx = used->idx & VIORND_QUEUE_MASK;

	dxx = avail->ring[aidx] & VIORND_QUEUE_MASK;

	sz = desc[dxx].len;
	if (sz > MAXPHYS)
		fatalx("viornd descriptor size too large (%zu)", sz);

	rnd_data = malloc(sz);

	if (rnd_data != NULL) {
		arc4random_buf(rnd_data, sz);
		if (write_mem(desc[dxx].addr, rnd_data, sz)) {
			log_warnx("viornd: can't write random data @ "
			    "0x%llx", desc[dxx].addr);
		} else {
			/* ret == 1 -> interrupt needed */
			/* XXX check VIRTIO_F_NO_INTR */
			ret = 1;
			viornd.cfg.isr_status = 1;
			used->ring[uidx].id = dxx;
			used->ring[uidx].len = sz;
			__sync_synchronize();
			used->idx++;
		}
		free(rnd_data);
	} else
		fatal("memory allocation error for viornd data");

	return (ret);
}

int
virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
    void *unused, uint8_t sz)
{
	*intr = 0xFF;

	if (dir == 0) {
		switch (reg) {
		case VIRTIO_CONFIG_DEVICE_FEATURES:
		case VIRTIO_CONFIG_QUEUE_SIZE:
		case VIRTIO_CONFIG_ISR_STATUS:
			log_warnx("%s: illegal write %x to %s",
			    __progname, *data, virtio_reg_name(reg));
			break;
		case VIRTIO_CONFIG_GUEST_FEATURES:
			viornd.cfg.guest_feature = *data;
			break;
		case VIRTIO_CONFIG_QUEUE_PFN:
			viornd.cfg.queue_pfn = *data;
			viornd_update_qa();
			break;
		case VIRTIO_CONFIG_QUEUE_SELECT:
			viornd.cfg.queue_select = *data;
			viornd_update_qs();
			break;
		case VIRTIO_CONFIG_QUEUE_NOTIFY:
			viornd.cfg.queue_notify = *data;
			if (viornd_notifyq())
				*intr = 1;
			break;
		case VIRTIO_CONFIG_DEVICE_STATUS:
			viornd.cfg.device_status = *data;
			break;
		}
	} else {
		switch (reg) {
		case VIRTIO_CONFIG_DEVICE_FEATURES:
			*data = viornd.cfg.device_feature;
			break;
		case VIRTIO_CONFIG_GUEST_FEATURES:
			*data = viornd.cfg.guest_feature;
			break;
		case VIRTIO_CONFIG_QUEUE_PFN:
			*data = viornd.cfg.queue_pfn;
			break;
		case VIRTIO_CONFIG_QUEUE_SIZE:
			*data = viornd.cfg.queue_size;
			break;
		case VIRTIO_CONFIG_QUEUE_SELECT:
			*data = viornd.cfg.queue_select;
			break;
		case VIRTIO_CONFIG_QUEUE_NOTIFY:
			*data = viornd.cfg.queue_notify;
			break;
		case VIRTIO_CONFIG_DEVICE_STATUS:
			*data = viornd.cfg.device_status;
			break;
		case VIRTIO_CONFIG_ISR_STATUS:
			*data = viornd.cfg.isr_status;
			viornd.cfg.isr_status = 0;
			vcpu_deassert_pic_irq(viornd.vm_id, 0, viornd.irq);
			break;
		}
	}
	return (0);
}

int
vmmci_ctl(unsigned int cmd)
{
	struct timeval tv = { 0, 0 };

	if ((vmmci.cfg.device_status &
	    VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0)
		return (-1);

	if (cmd == vmmci.cmd)
		return (0);

	switch (cmd) {
	case VMMCI_NONE:
		break;
	case VMMCI_SHUTDOWN:
	case VMMCI_REBOOT:
		/* Update command */
		vmmci.cmd = cmd;

		/*
		 * vmm VMs do not support powerdown, send a reboot request
		 * instead and turn it off after the triple fault.
		 */
		if (cmd == VMMCI_SHUTDOWN)
			cmd = VMMCI_REBOOT;

		/* Trigger interrupt */
		vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
		vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);

		/* Add ACK timeout */
		tv.tv_sec = VMMCI_TIMEOUT;
		evtimer_add(&vmmci.timeout, &tv);
		break;
	case VMMCI_SYNCRTC:
		if (vmmci.cfg.guest_feature & VMMCI_F_SYNCRTC) {
			/* RTC updated, request guest VM resync of its RTC */
			vmmci.cmd = cmd;

			vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
			vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
		} else {
			log_debug("%s: RTC sync skipped (guest does not "
			    "support RTC sync)\n", __func__);
		}
		break;
	default:
		fatalx("invalid vmmci command: %d", cmd);
	}

	return (0);
}

void
vmmci_ack(unsigned int cmd)
{
	struct timeval tv = { 0, 0 };

	switch (cmd) {
	case VMMCI_NONE:
		break;
	case VMMCI_SHUTDOWN:
		/*
		 * The shutdown was requested by the VM if we don't have
		 * a pending shutdown request. In this case add a short
		 * timeout to give the VM a chance to reboot before the
		 * timer is expired.
		 */
		if (vmmci.cmd == 0) {
			log_debug("%s: vm %u requested shutdown", __func__,
			    vmmci.vm_id);
			tv.tv_sec = VMMCI_TIMEOUT;
			evtimer_add(&vmmci.timeout, &tv);
			return;
		}
		/* FALLTHROUGH */
	case VMMCI_REBOOT:
		/*
		 * If the VM acknowledged our shutdown request, give it
		 * enough time to shutdown or reboot gracefully. This
		 * might take a considerable amount of time (running
		 * rc.shutdown on the VM), so increase the timeout before
		 * killing it forcefully.
		 */
		if (cmd == vmmci.cmd &&
		    evtimer_pending(&vmmci.timeout, NULL)) {
			log_debug("%s: vm %u acknowledged shutdown request",
			    __func__, vmmci.vm_id);
			tv.tv_sec = VMMCI_SHUTDOWN_TIMEOUT;
			evtimer_add(&vmmci.timeout, &tv);
		}
		break;
	case VMMCI_SYNCRTC:
		log_debug("%s: vm %u acknowledged RTC sync request",
		    __func__, vmmci.vm_id);
		vmmci.cmd = VMMCI_NONE;
		break;
	default:
		log_warnx("%s: illegal request %u", __func__, cmd);
		break;
	}
}

void
vmmci_timeout(int fd, short type, void *arg)
{
	log_debug("%s: vm %u shutdown", __progname, vmmci.vm_id);
	vm_shutdown(vmmci.cmd == VMMCI_REBOOT ?
	    VMMCI_REBOOT : VMMCI_SHUTDOWN);
}
int
vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
    void *unused, uint8_t sz)
{
	*intr = 0xFF;

	if (dir == 0) {
		switch (reg) {
		case VIRTIO_CONFIG_DEVICE_FEATURES:
		case VIRTIO_CONFIG_QUEUE_SIZE:
		case VIRTIO_CONFIG_ISR_STATUS:
			log_warnx("%s: illegal write %x to %s",
			    __progname, *data, virtio_reg_name(reg));
			break;
		case VIRTIO_CONFIG_GUEST_FEATURES:
			vmmci.cfg.guest_feature = *data;
			break;
		case VIRTIO_CONFIG_QUEUE_PFN:
			vmmci.cfg.queue_pfn = *data;
			break;
		case VIRTIO_CONFIG_QUEUE_SELECT:
			vmmci.cfg.queue_select = *data;
			break;
		case VIRTIO_CONFIG_QUEUE_NOTIFY:
			vmmci.cfg.queue_notify = *data;
			break;
		case VIRTIO_CONFIG_DEVICE_STATUS:
			vmmci.cfg.device_status = *data;
			break;
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
			vmmci_ack(*data);
			break;
		}
	} else {
		switch (reg) {
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
			*data = vmmci.cmd;
			break;
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
			/* Update time once when reading the first register */
			gettimeofday(&vmmci.time, NULL);
			*data = (uint64_t)vmmci.time.tv_sec;
			break;
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
			/* High 32 bits of tv_sec */
			*data = (uint64_t)vmmci.time.tv_sec >> 32;
			break;
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12:
			*data = (uint64_t)vmmci.time.tv_usec;
			break;
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16:
			/* High 32 bits of tv_usec */
			*data = (uint64_t)vmmci.time.tv_usec >> 32;
			break;
		case VIRTIO_CONFIG_DEVICE_FEATURES:
			*data = vmmci.cfg.device_feature;
			break;
		case VIRTIO_CONFIG_GUEST_FEATURES:
			*data = vmmci.cfg.guest_feature;
			break;
		case VIRTIO_CONFIG_QUEUE_PFN:
			*data = vmmci.cfg.queue_pfn;
			break;
		case VIRTIO_CONFIG_QUEUE_SIZE:
			*data = vmmci.cfg.queue_size;
			break;
		case VIRTIO_CONFIG_QUEUE_SELECT:
			*data = vmmci.cfg.queue_select;
			break;
		case VIRTIO_CONFIG_QUEUE_NOTIFY:
			*data = vmmci.cfg.queue_notify;
			break;
		case VIRTIO_CONFIG_DEVICE_STATUS:
			*data = vmmci.cfg.device_status;
			break;
		case VIRTIO_CONFIG_ISR_STATUS:
			*data = vmmci.cfg.isr_status;
			vmmci.cfg.isr_status = 0;
			vcpu_deassert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
			break;
		}
	}
	return (0);
}

int
virtio_get_base(int fd, char *path, size_t npath, int type, const char *dpath)
{
	switch (type) {
	case VMDF_RAW:
		return 0;
	case VMDF_QCOW2:
		return virtio_qcow2_get_base(fd, path, npath, dpath);
	}
	log_warnx("%s: invalid disk format", __func__);
	return -1;
}

void
virtio_init(struct vmd_vm *vm, int child_cdrom,
    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct virtio_dev *dev;
	uint8_t id;
	uint8_t i, j;

	/* Virtio entropy device */
	if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
	    PCI_PRODUCT_QUMRANET_VIO_RNG, PCI_CLASS_SYSTEM,
	    PCI_SUBCLASS_SYSTEM_MISC,
	    PCI_VENDOR_OPENBSD,
	    PCI_PRODUCT_VIRTIO_ENTROPY, 1, NULL)) {
		log_warnx("%s: can't add PCI virtio rng device",
		    __progname);
		return;
	}

	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_rnd_io, NULL)) {
		log_warnx("%s: can't add bar for virtio rng device",
		    __progname);
		return;
	}

	memset(&viornd, 0, sizeof(viornd));
	viornd.vq[0].qs = VIORND_QUEUE_SIZE;
	viornd.vq[0].vq_availoffset =
	    sizeof(struct vring_desc) * VIORND_QUEUE_SIZE;
	viornd.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
	    sizeof(struct vring_desc) * VIORND_QUEUE_SIZE
	    + sizeof(uint16_t) * (2 + VIORND_QUEUE_SIZE));
	viornd.pci_id = id;
	viornd.irq = pci_get_dev_irq(id);
	viornd.vm_id = vcp->vcp_id;

	SLIST_INIT(&virtio_devs);

	if (vmc->vmc_nnics > 0) {
		for (i = 0; i < vmc->vmc_nnics; i++) {
			dev = calloc(1, sizeof(struct virtio_dev));
			if (dev == NULL) {
				log_warn("%s: calloc failure allocating vionet",
				    __progname);
				return;
			}
			/* Virtio network */
			dev->dev_type = VMD_DEVTYPE_NET;

			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
			    PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM,
			    PCI_SUBCLASS_SYSTEM_MISC, PCI_VENDOR_OPENBSD,
			    PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL)) {
				log_warnx("%s: can't add PCI virtio net device",
				    __progname);
				return;
			}
			dev->pci_id = id;
			dev->sync_fd = -1;
			dev->async_fd = -1;
			dev->vm_id = vcp->vcp_id;
			dev->vm_vmid = vm->vm_vmid;
			dev->irq = pci_get_dev_irq(id);

			/* The vionet pci bar function is called by the vcpu. */
			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
			    dev)) {
				log_warnx("%s: can't add bar for virtio net "
				    "device", __progname);
				return;
			}

			dev->vionet.vq[RXQ].qs = VIONET_QUEUE_SIZE;
			dev->vionet.vq[RXQ].vq_availoffset =
			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
			dev->vionet.vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
			    + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
			dev->vionet.vq[RXQ].last_avail = 0;
			dev->vionet.vq[RXQ].notified_avail = 0;

			dev->vionet.vq[TXQ].qs = VIONET_QUEUE_SIZE;
			dev->vionet.vq[TXQ].vq_availoffset =
			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
			dev->vionet.vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
			    + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
			dev->vionet.vq[TXQ].last_avail = 0;
			dev->vionet.vq[TXQ].notified_avail = 0;

			dev->vionet.data_fd = child_taps[i];

			/* MAC address has been assigned by the parent */
			memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
			dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC;

			dev->vionet.lockedmac =
			    vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
			dev->vionet.local =
			    vmc->vmc_ifflags[i] & VMIFF_LOCAL ? 1 : 0;
			if (i == 0 && vmc->vmc_bootdevice & VMBOOTDEV_NET)
				dev->vionet.pxeboot = 1;
			memcpy(&dev->vionet.local_prefix,
			    &env->vmd_cfg.cfg_localprefix,
			    sizeof(dev->vionet.local_prefix));
			log_debug("%s: vm \"%s\" vio%u lladdr %s%s%s%s",
			    __func__, vcp->vcp_name, i,
			    ether_ntoa((void *)dev->vionet.mac),
			    dev->vionet.lockedmac ? ", locked" : "",
			    dev->vionet.local ? ", local" : "",
			    dev->vionet.pxeboot ? ", pxeboot" : "");

			/*
			 * Add the vionet to our device list.
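			 * The actual device process is forked later, by the
			 * virtio_dev_launch() loop below, once every device
			 * has been configured.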
			 */
			dev->vionet.idx = i;
			SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
		}
	}

	if (vmc->vmc_ndisks > 0) {
		for (i = 0; i < vmc->vmc_ndisks; i++) {
			dev = calloc(1, sizeof(struct virtio_dev));
			if (dev == NULL) {
				log_warn("%s: calloc failure allocating vioblk",
				    __progname);
				return;
			}

			/* One vioblk device for each disk defined in vcp */
			dev->dev_type = VMD_DEVTYPE_DISK;

			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
			    PCI_PRODUCT_QUMRANET_VIO_BLOCK,
			    PCI_CLASS_MASS_STORAGE,
			    PCI_SUBCLASS_MASS_STORAGE_SCSI,
			    PCI_VENDOR_OPENBSD,
			    PCI_PRODUCT_VIRTIO_BLOCK, 1, NULL)) {
				log_warnx("%s: can't add PCI virtio block "
				    "device", __progname);
				return;
			}
			dev->pci_id = id;
			dev->sync_fd = -1;
			dev->async_fd = -1;
			dev->vm_id = vcp->vcp_id;
			dev->vm_vmid = vm->vm_vmid;
			dev->irq = pci_get_dev_irq(id);

			/* virtio_pci_io expects the virtio_dev as cookie. */
			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
			    dev)) {
				log_warnx("%s: can't add bar for virtio block "
				    "device", __progname);
				return;
			}
			dev->vioblk.vq[0].qs = VIOBLK_QUEUE_SIZE;
			dev->vioblk.vq[0].vq_availoffset =
			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE;
			dev->vioblk.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE
			    + sizeof(uint16_t) * (2 + VIOBLK_QUEUE_SIZE));
			dev->vioblk.vq[0].last_avail = 0;
			dev->vioblk.cfg.device_feature =
			    VIRTIO_BLK_F_SEG_MAX;
			dev->vioblk.seg_max = VIOBLK_SEG_MAX;

			/*
			 * Initialize disk fds to an invalid fd (-1), then
			 * set any child disk fds.
			 */
			memset(&dev->vioblk.disk_fd, -1,
			    sizeof(dev->vioblk.disk_fd));
			dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
			for (j = 0; j < dev->vioblk.ndisk_fd; j++)
				dev->vioblk.disk_fd[j] = child_disks[i][j];

			dev->vioblk.idx = i;
			SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
		}
	}

	/*
	 * Launch virtio devices that support subprocess execution.
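	 * A launch failure is fatal: the vm cannot run with a missing
	 * device.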
	 */
	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		if (virtio_dev_launch(vm, dev) != 0)
			fatalx("failed to launch virtio device");
	}

	/* vioscsi cdrom */
	if (strlen(vmc->vmc_cdrom)) {
		vioscsi = calloc(1, sizeof(struct vioscsi_dev));
		if (vioscsi == NULL) {
			log_warn("%s: calloc failure allocating vioscsi",
			    __progname);
			return;
		}

		if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
		    PCI_PRODUCT_QUMRANET_VIO_SCSI,
		    PCI_CLASS_MASS_STORAGE,
		    PCI_SUBCLASS_MASS_STORAGE_SCSI,
		    PCI_VENDOR_OPENBSD,
		    PCI_PRODUCT_VIRTIO_SCSI, 1, NULL)) {
			log_warnx("%s: can't add PCI vioscsi device",
			    __progname);
			return;
		}

		if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vioscsi_io, vioscsi)) {
			log_warnx("%s: can't add bar for vioscsi device",
			    __progname);
			return;
		}

		for (i = 0; i < VIRTIO_MAX_QUEUES; i++) {
			vioscsi->vq[i].qs = VIOSCSI_QUEUE_SIZE;
			vioscsi->vq[i].vq_availoffset =
			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE;
			vioscsi->vq[i].vq_usedoffset = VIRTQUEUE_ALIGN(
			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE
			    + sizeof(uint16_t) * (2 + VIOSCSI_QUEUE_SIZE));
			vioscsi->vq[i].last_avail = 0;
		}
		if (virtio_raw_init(&vioscsi->file, &vioscsi->sz, &child_cdrom,
		    1) == -1) {
			log_warnx("%s: unable to determine iso format",
			    __func__);
			return;
		}
		vioscsi->locked = 0;
		vioscsi->lba = 0;
		vioscsi->n_blocks = vioscsi->sz / VIOSCSI_BLOCK_SIZE_CDROM;
		vioscsi->max_xfer = VIOSCSI_BLOCK_SIZE_CDROM;
		vioscsi->pci_id = id;
		vioscsi->vm_id = vcp->vcp_id;
		vioscsi->irq = pci_get_dev_irq(id);
	}

	/* virtio control device */
	if (pci_add_device(&id, PCI_VENDOR_OPENBSD,
	    PCI_PRODUCT_OPENBSD_CONTROL,
	    PCI_CLASS_COMMUNICATIONS,
	    PCI_SUBCLASS_COMMUNICATIONS_MISC,
	    PCI_VENDOR_OPENBSD,
	    PCI_PRODUCT_VIRTIO_VMMCI, 1, NULL)) {
		log_warnx("%s: can't add PCI vmm control device",
		    __progname);
		return;
	}

	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vmmci_io, NULL)) {
		log_warnx("%s: can't add bar for vmm control device",
		    __progname);
		return;
	}

	memset(&vmmci, 0, sizeof(vmmci));
	vmmci.cfg.device_feature = VMMCI_F_TIMESYNC | VMMCI_F_ACK |
	    VMMCI_F_SYNCRTC;
	vmmci.vm_id = vcp->vcp_id;
	vmmci.irq = pci_get_dev_irq(id);
	vmmci.pci_id = id;

	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
}

/*
 * vionet_set_hostmac
 *
 * Sets the hardware address for the host-side tap(4) on a vionet_dev.
 *
 * This should only be called from the event-loop thread.
 *
 * vm: pointer to the current vmd_vm instance
 * idx: index into the array of vionet_dev's for the target vionet_dev
 * addr: ethernet address to set
 */
void
vionet_set_hostmac(struct vmd_vm *vm, unsigned int idx, uint8_t *addr)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct virtio_dev *dev;
	struct vionet_dev *vionet = NULL;
	int ret;

	if (idx > vmc->vmc_nnics)
		fatalx("%s: invalid vionet index: %u", __func__, idx);

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		if (dev->dev_type == VMD_DEVTYPE_NET
		    && dev->vionet.idx == idx) {
			vionet = &dev->vionet;
			break;
		}
	}
	if (vionet == NULL)
		fatalx("%s: dev == NULL, idx = %u", __func__, idx);

	/* Set the local vm process copy. */
	memcpy(vionet->hostmac, addr, sizeof(vionet->hostmac));

	/*
	 * Send the information to the device process.
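	 * The update is queued on the device's asynchronous imsg channel
	 * as an IMSG_DEVOP_HOSTMAC message.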
	 */
	ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_HOSTMAC, 0, 0, -1,
	    vionet->hostmac, sizeof(vionet->hostmac));
	if (ret == -1) {
		log_warnx("%s: failed to queue hostmac to vionet dev %u",
		    __func__, idx);
		return;
	}
}

void
virtio_shutdown(struct vmd_vm *vm)
{
	int ret, status;
	pid_t pid = 0;
	struct virtio_dev *dev, *tmp;
	struct viodev_msg msg;
	struct imsgbuf *ibuf;

	/* Ensure that our disks are synced. */
	if (vioscsi != NULL)
		vioscsi->file.close(vioscsi->file.p, 0);

	/*
	 * Broadcast shutdown to child devices. We need to do this
	 * synchronously as we have already stopped the async event thread.
	 */
	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		memset(&msg, 0, sizeof(msg));
		msg.type = VIODEV_MSG_SHUTDOWN;
		ibuf = &dev->sync_iev.ibuf;
		ret = imsg_compose(ibuf, VIODEV_MSG_SHUTDOWN, 0, 0, -1,
		    &msg, sizeof(msg));
		if (ret == -1)
			fatalx("%s: failed to send shutdown to device",
			    __func__);
		if (imsg_flush(ibuf) == -1)
			fatalx("%s: imsg_flush", __func__);
	}

	/*
	 * Wait for all children to shutdown using a simple approach of
	 * iterating over known child devices and waiting for them to die.
	 */
	SLIST_FOREACH_SAFE(dev, &virtio_devs, dev_next, tmp) {
		log_debug("%s: waiting on device pid %d", __func__,
		    dev->dev_pid);
		do {
			pid = waitpid(dev->dev_pid, &status, WNOHANG);
		} while (pid == 0 || (pid == -1 && errno == EINTR));
		if (pid == dev->dev_pid)
			log_debug("%s: device for pid %d is stopped",
			    __func__, pid);
		else
			log_warnx("%s: unexpected pid %d", __func__, pid);
		free(dev);
	}
}

int
vmmci_restore(int fd, uint32_t vm_id)
{
	log_debug("%s: receiving vmmci", __func__);
	if (atomicio(read, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
		log_warnx("%s: error reading vmmci from fd", __func__);
		return (-1);
	}

	if (pci_set_bar_fn(vmmci.pci_id, 0, vmmci_io, NULL)) {
		log_warnx("%s: can't set bar fn for vmm control device",
		    __progname);
		return (-1);
	}
	vmmci.vm_id = vm_id;
	vmmci.irq = pci_get_dev_irq(vmmci.pci_id);
	memset(&vmmci.timeout, 0, sizeof(struct event));
	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
	return (0);
}

int
viornd_restore(int fd, struct vmd_vm *vm)
{
	void *hva = NULL;

	log_debug("%s: receiving viornd", __func__);
	if (atomicio(read, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
		log_warnx("%s: error reading viornd from fd", __func__);
		return (-1);
	}
	if (pci_set_bar_fn(viornd.pci_id, 0, virtio_rnd_io, NULL)) {
		log_warnx("%s: can't set bar fn for virtio rng device",
		    __progname);
		return (-1);
	}
	viornd.vm_id = vm->vm_params.vmc_params.vcp_id;
	viornd.irq = pci_get_dev_irq(viornd.pci_id);

	hva = hvaddr_mem(viornd.vq[0].q_gpa, vring_size(VIORND_QUEUE_SIZE));
	if (hva == NULL)
		fatal("failed to restore viornd virtqueue");
	viornd.vq[0].q_hva = hva;

	return (0);
}

int
vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct virtio_dev *dev;
	uint8_t i;

	if (vmc->vmc_nnics == 0)
		return (0);

	for (i = 0; i < vmc->vmc_nnics; i++) {
		dev = calloc(1, sizeof(struct virtio_dev));
		if (dev == NULL) {
			log_warn("%s: calloc failure allocating vionet",
			    __progname);
			return (-1);
		}

		log_debug("%s: receiving virtio network device", __func__);
		if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
		    != sizeof(struct virtio_dev)) {
			log_warnx("%s: error reading vionet from fd",
			    __func__);
			return (-1);
		}

		/* Virtio network */
		if (dev->dev_type != VMD_DEVTYPE_NET) {
			log_warnx("%s: invalid device type", __func__);
			return (-1);
		}

		dev->sync_fd = -1;
		dev->async_fd = -1;
		dev->vm_id = vcp->vcp_id;
		dev->vm_vmid = vm->vm_vmid;
		dev->irq = pci_get_dev_irq(dev->pci_id);

		if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
			log_warnx("%s: can't set bar fn for virtio net "
			    "device", __progname);
			return (-1);
		}

		dev->vionet.data_fd = child_taps[i];
		dev->vionet.idx = i;

		SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
	}

	return (0);
}

int
vioblk_restore(int fd, struct vmd_vm *vm,
    int child_disks[][VM_MAX_BASE_PER_DISK])
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct virtio_dev *dev;
	uint8_t i, j;

	if (vmc->vmc_ndisks == 0)
		return (0);

	for (i = 0; i < vmc->vmc_ndisks; i++) {
		dev = calloc(1, sizeof(struct virtio_dev));
		if (dev == NULL) {
			log_warn("%s: calloc failure allocating vioblks",
			    __progname);
			return (-1);
		}

		log_debug("%s: receiving vioblk", __func__);
		if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
		    != sizeof(struct virtio_dev)) {
			log_warnx("%s: error reading vioblk from fd", __func__);
			return (-1);
		}
		if (dev->dev_type != VMD_DEVTYPE_DISK) {
			log_warnx("%s: invalid device type", __func__);
			return (-1);
		}

		dev->sync_fd = -1;
		dev->async_fd = -1;

		if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
			log_warnx("%s: can't set bar fn for virtio block "
			    "device", __progname);
			return (-1);
		}
		dev->vm_id = vmc->vmc_params.vcp_id;
		dev->irq = pci_get_dev_irq(dev->pci_id);

		memset(&dev->vioblk.disk_fd, -1, sizeof(dev->vioblk.disk_fd));
		dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
		for (j = 0; j < dev->vioblk.ndisk_fd; j++)
			dev->vioblk.disk_fd[j] = child_disks[i][j];

		dev->vioblk.idx = i;
		SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
	}
	return (0);
}

int
vioscsi_restore(int fd, struct vmd_vm *vm, int child_cdrom)
{
	void *hva = NULL;
	unsigned int i;

	if (!strlen(vm->vm_params.vmc_cdrom))
		return (0);

	vioscsi = calloc(1, sizeof(struct vioscsi_dev));
	if (vioscsi == NULL) {
		log_warn("%s: calloc failure allocating vioscsi", __progname);
		return (-1);
	}

	log_debug("%s: receiving vioscsi", __func__);

	if (atomicio(read, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
	    sizeof(struct vioscsi_dev)) {
		log_warnx("%s: error reading vioscsi from fd", __func__);
		return (-1);
	}

	if (pci_set_bar_fn(vioscsi->pci_id, 0, vioscsi_io, vioscsi)) {
		log_warnx("%s: can't set bar fn for vioscsi device",
		    __progname);
		return (-1);
	}

	vioscsi->vm_id = vm->vm_params.vmc_params.vcp_id;
	vioscsi->irq = pci_get_dev_irq(vioscsi->pci_id);

	/*
	 * vioscsi uses 3 virtqueues.
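	 * (the virtio-scsi spec defines a control, an event and a request
	 * queue), so re-resolve the host virtual address of each ring.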
	 */
	for (i = 0; i < 3; i++) {
		hva = hvaddr_mem(vioscsi->vq[i].q_gpa,
		    vring_size(VIOSCSI_QUEUE_SIZE));
		if (hva == NULL)
			fatal("failed to restore vioscsi virtqueue");
		vioscsi->vq[i].q_hva = hva;
	}

	return (0);
}

int
virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
	struct virtio_dev *dev;
	int ret;

	SLIST_INIT(&virtio_devs);

	if ((ret = viornd_restore(fd, vm)) == -1)
		return (ret);

	if ((ret = vioblk_restore(fd, vm, child_disks)) == -1)
		return (ret);

	if ((ret = vioscsi_restore(fd, vm, child_cdrom)) == -1)
		return (ret);

	if ((ret = vionet_restore(fd, vm, child_taps)) == -1)
		return (ret);

	if ((ret = vmmci_restore(fd, vm->vm_params.vmc_params.vcp_id)) == -1)
		return (ret);

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		if (virtio_dev_launch(vm, dev) != 0)
			fatalx("%s: failed to restore virtio dev", __func__);
	}

	return (0);
}

int
viornd_dump(int fd)
{
	log_debug("%s: sending viornd", __func__);

	viornd.vq[0].q_hva = NULL;

	if (atomicio(vwrite, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
		log_warnx("%s: error writing viornd to fd", __func__);
		return (-1);
	}
	return (0);
}

int
vmmci_dump(int fd)
{
	log_debug("%s: sending vmmci", __func__);

	if (atomicio(vwrite, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
		log_warnx("%s: error writing vmmci to fd", __func__);
		return (-1);
	}
	return (0);
}

int
vionet_dump(int fd)
{
	struct virtio_dev *dev, temp;
	struct viodev_msg msg;
	struct imsg imsg;
	struct imsgbuf *ibuf = NULL;
	size_t sz;
	int ret;

	log_debug("%s: dumping vionet", __func__);

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		if (dev->dev_type != VMD_DEVTYPE_NET)
			continue;

		memset(&msg, 0, sizeof(msg));
		memset(&imsg, 0, sizeof(imsg));

		ibuf = &dev->sync_iev.ibuf;
		msg.type = VIODEV_MSG_DUMP;

		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
		    sizeof(msg));
		if (ret == -1) {
			log_warnx("%s: failed requesting dump of vionet[%d]",
			    __func__, dev->vionet.idx);
			return (-1);
		}
		if (imsg_flush(ibuf) == -1) {
			log_warnx("%s: imsg_flush", __func__);
			return (-1);
		}

		sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
		if (sz != sizeof(temp)) {
			log_warnx("%s: failed to dump vionet[%d]", __func__,
			    dev->vionet.idx);
			return (-1);
		}

		/* Clear volatile state. Will reinitialize on restore. */
		temp.vionet.vq[RXQ].q_hva = NULL;
		temp.vionet.vq[TXQ].q_hva = NULL;
		temp.async_fd = -1;
		temp.sync_fd = -1;
		memset(&temp.async_iev, 0, sizeof(temp.async_iev));
		memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));

		if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
			log_warnx("%s: error writing vionet to fd", __func__);
			return (-1);
		}
	}

	return (0);
}

int
vioblk_dump(int fd)
{
	struct virtio_dev *dev, temp;
	struct viodev_msg msg;
	struct imsg imsg;
	struct imsgbuf *ibuf = NULL;
	size_t sz;
	int ret;

	log_debug("%s: dumping vioblk", __func__);

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		if (dev->dev_type != VMD_DEVTYPE_DISK)
			continue;

		memset(&msg, 0, sizeof(msg));
		memset(&imsg, 0, sizeof(imsg));

		ibuf = &dev->sync_iev.ibuf;
		msg.type = VIODEV_MSG_DUMP;

		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
		    sizeof(msg));
		if (ret == -1) {
			log_warnx("%s: failed requesting dump of vioblk[%d]",
			    __func__, dev->vioblk.idx);
			return (-1);
		}
		if (imsg_flush(ibuf) == -1) {
			log_warnx("%s: imsg_flush", __func__);
			return (-1);
		}

		sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
		if (sz != sizeof(temp)) {
			log_warnx("%s: failed to dump vioblk[%d]", __func__,
			    dev->vioblk.idx);
			return (-1);
		}

		/* Clear volatile state. Will reinitialize on restore. */
		temp.vioblk.vq[0].q_hva = NULL;
		temp.async_fd = -1;
		temp.sync_fd = -1;
		memset(&temp.async_iev, 0, sizeof(temp.async_iev));
		memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));

		if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
			log_warnx("%s: error writing vioblk to fd", __func__);
			return (-1);
		}
	}

	return (0);
}

int
vioscsi_dump(int fd)
{
	unsigned int i;

	if (vioscsi == NULL)
		return (0);

	log_debug("%s: sending vioscsi", __func__);

	for (i = 0; i < 3; i++)
		vioscsi->vq[i].q_hva = NULL;

	if (atomicio(vwrite, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
	    sizeof(struct vioscsi_dev)) {
		log_warnx("%s: error writing vioscsi to fd", __func__);
		return (-1);
	}
	return (0);
}

int
virtio_dump(int fd)
{
	int ret;

	if ((ret = viornd_dump(fd)) == -1)
		return ret;

	if ((ret = vioblk_dump(fd)) == -1)
		return ret;

	if ((ret = vioscsi_dump(fd)) == -1)
		return ret;

	if ((ret = vionet_dump(fd)) == -1)
		return ret;

	if ((ret = vmmci_dump(fd)) == -1)
		return ret;

	return (0);
}

void
virtio_broadcast_imsg(struct vmd_vm *vm, uint16_t type, void *data,
    uint16_t datalen)
{
	struct virtio_dev *dev;
	int ret;

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		ret = imsg_compose_event(&dev->async_iev, type, 0, 0, -1, data,
		    datalen);
		if (ret == -1) {
			log_warnx("%s: failed to broadcast imsg type %u",
			    __func__, type);
		}
	}
}

void
virtio_stop(struct vmd_vm *vm)
{
	return virtio_broadcast_imsg(vm, IMSG_VMDOP_PAUSE_VM, NULL, 0);
}

void
virtio_start(struct vmd_vm *vm)
{
	return virtio_broadcast_imsg(vm, IMSG_VMDOP_UNPAUSE_VM, NULL, 0);
}

/*
 * Fork+exec a child virtio device.
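 * The parent hands the configured struct virtio_dev and the vmd_vm over
 * the synchronous channel, then waits for the child to re-exec itself
 * (via the -X flag handled in vmd.c) and reply with VIODEV_MSG_READY.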
 * Returns 0 on success.
 */
static int
virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev)
{
	char *nargv[12], num[32], vmm_fd[32], vm_name[VM_NAME_MAX], t[2];
	pid_t dev_pid;
	int data_fds[VM_MAX_BASE_PER_DISK], sync_fds[2], async_fds[2], ret = 0;
	size_t i, data_fds_sz, sz = 0;
	struct viodev_msg msg;
	struct imsg imsg;
	struct imsgev *iev = &dev->sync_iev;

	switch (dev->dev_type) {
	case VMD_DEVTYPE_NET:
		data_fds[0] = dev->vionet.data_fd;
		data_fds_sz = 1;
		log_debug("%s: launching vionet%d",
		    vm->vm_params.vmc_params.vcp_name, dev->vionet.idx);
		break;
	case VMD_DEVTYPE_DISK:
		memcpy(&data_fds, dev->vioblk.disk_fd, sizeof(data_fds));
		data_fds_sz = dev->vioblk.ndisk_fd;
		log_debug("%s: launching vioblk%d",
		    vm->vm_params.vmc_params.vcp_name, dev->vioblk.idx);
		break;
		/* NOTREACHED */
	default:
		log_warn("%s: invalid device type", __func__);
		return (EINVAL);
	}

	/* We need two channels: one synchronous (IO reads) and one async. */
	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, sync_fds) == -1) {
		log_warn("failed to create socketpair");
		return (errno);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, async_fds) == -1) {
		log_warn("failed to create async socketpair");
		return (errno);
	}

	/* Keep communication channels open after exec. */
	if (fcntl(sync_fds[1], F_SETFD, 0)) {
		ret = errno;
		log_warn("%s: fcntl", __func__);
		goto err;
	}
	if (fcntl(async_fds[1], F_SETFD, 0)) {
		ret = errno;
		log_warn("%s: fcntl", __func__);
		goto err;
	}

	/* Fork... */
	dev_pid = fork();
	if (dev_pid == -1) {
		ret = errno;
		log_warn("%s: fork failed", __func__);
		goto err;
	}

	if (dev_pid > 0) {
		/* Parent */
		close_fd(sync_fds[1]);
		close_fd(async_fds[1]);

		/* Save the child's pid to help with cleanup. */
		dev->dev_pid = dev_pid;

		/* Set the channel fds to the child's before sending. */
		dev->sync_fd = sync_fds[1];
		dev->async_fd = async_fds[1];

		/* Close data fds. Only the child device needs them now. */
		for (i = 0; i < data_fds_sz; i++)
			close_fd(data_fds[i]);

		/* Set our synchronous channel to non-blocking. */
		if (fcntl(sync_fds[0], F_SETFL, O_NONBLOCK) == -1) {
			ret = errno;
			log_warn("%s: fcntl", __func__);
			goto err;
		}

		/* 1. Send over our configured device. */
		log_debug("%s: sending '%c' type device struct", __func__,
		    dev->dev_type);
		sz = atomicio(vwrite, sync_fds[0], dev, sizeof(*dev));
		if (sz != sizeof(*dev)) {
			log_warnx("%s: failed to send device", __func__);
			ret = EIO;
			goto err;
		}

		/* 2. Send over details on the VM (including memory fds). */
		log_debug("%s: sending vm message for '%s'", __func__,
		    vm->vm_params.vmc_params.vcp_name);
		sz = atomicio(vwrite, sync_fds[0], vm, sizeof(*vm));
		if (sz != sizeof(*vm)) {
			log_warnx("%s: failed to send vm details", __func__);
			ret = EIO;
			goto err;
		}

		/*
		 * Initialize our imsg channel to the child device. The initial
		 * communication will be synchronous. We expect the child to
		 * report itself "ready" to confirm the launch was a success.
		 */
		imsg_init(&iev->ibuf, sync_fds[0]);
		do
			ret = imsg_read(&iev->ibuf);
		while (ret == -1 && errno == EAGAIN);
		if (ret == 0 || ret == -1) {
			log_warnx("%s: failed to receive ready message from "
			    "'%c' type device", __func__, dev->dev_type);
			ret = EIO;
			goto err;
		}
		ret = 0;

		log_debug("%s: receiving reply", __func__);
		if (imsg_get(&iev->ibuf, &imsg) < 1) {
			log_warnx("%s: imsg_get", __func__);
			ret = EIO;
			goto err;
		}
		IMSG_SIZE_CHECK(&imsg, &msg);
		memcpy(&msg, imsg.data, sizeof(msg));
		imsg_free(&imsg);

		if (msg.type != VIODEV_MSG_READY) {
			log_warnx("%s: expected ready message, got type %d",
			    __func__, msg.type);
			ret = EINVAL;
			goto err;
		}
		log_debug("%s: device reports ready via sync channel",
		    __func__);

		/*
		 * Wire in the async event handling, but after reverting back
		 * to the parent's fd's.
		 */
		dev->sync_fd = sync_fds[0];
		dev->async_fd = async_fds[0];
		vm_device_pipe(dev, virtio_dispatch_dev);
	} else {
		/* Child */
		close_fd(async_fds[0]);
		close_fd(sync_fds[0]);

		/* Keep data file descriptors open after exec. */
		for (i = 0; i < data_fds_sz; i++) {
			log_debug("%s: marking fd %d !close-on-exec", __func__,
			    data_fds[i]);
			if (fcntl(data_fds[i], F_SETFD, 0)) {
				ret = errno;
				log_warn("%s: fcntl", __func__);
				goto err;
			}
		}

		memset(&nargv, 0, sizeof(nargv));
		memset(num, 0, sizeof(num));
		snprintf(num, sizeof(num), "%d", sync_fds[1]);
		memset(vmm_fd, 0, sizeof(vmm_fd));
		snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);
		memset(vm_name, 0, sizeof(vm_name));
		snprintf(vm_name, sizeof(vm_name), "%s",
		    vm->vm_params.vmc_params.vcp_name);

		t[0] = dev->dev_type;
		t[1] = '\0';

		nargv[0] = env->argv0;
		nargv[1] = "-X";
		nargv[2] = num;
		nargv[3] = "-t";
		nargv[4] = t;
		nargv[5] = "-i";
		nargv[6] = vmm_fd;
		nargv[7] = "-p";
		nargv[8] = vm_name;
		nargv[9] = "-n";
		nargv[10] = NULL;

		if (env->vmd_verbose == 1) {
			nargv[10] = VMD_VERBOSE_1;
			nargv[11] = NULL;
		} else if (env->vmd_verbose > 1) {
			nargv[10] = VMD_VERBOSE_2;
			nargv[11] = NULL;
		}

		/* Control resumes in vmd.c:main(). */
		execvp(nargv[0], nargv);

		ret = errno;
		log_warn("%s: failed to exec device", __func__);
		_exit(ret);
		/* NOTREACHED */
	}

	return (ret);

err:
	close_fd(sync_fds[0]);
	close_fd(sync_fds[1]);
	close_fd(async_fds[0]);
	close_fd(async_fds[1]);
	return (ret);
}

/*
 * Initialize an async imsg channel for a virtio device.
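 * The pipe is switched to non-blocking mode and the given callback is
 * registered with the event loop. Returns 0 on success, -1 on failure.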
1513 */ 1514 int 1515 vm_device_pipe(struct virtio_dev *dev, void (*cb)(int, short, void *)) 1516 { 1517 struct imsgev *iev = &dev->async_iev; 1518 int fd = dev->async_fd; 1519 1520 log_debug("%s: initializing '%c' device pipe (fd=%d)", __func__, 1521 dev->dev_type, fd); 1522 1523 if (fcntl(fd, F_SETFL, O_NONBLOCK) == -1) { 1524 log_warn("failed to set nonblocking mode on vm device pipe"); 1525 return (-1); 1526 } 1527 1528 imsg_init(&iev->ibuf, fd); 1529 iev->handler = cb; 1530 iev->data = dev; 1531 iev->events = EV_READ; 1532 imsg_event_add(iev); 1533 1534 return (0); 1535 } 1536 1537 void 1538 virtio_dispatch_dev(int fd, short event, void *arg) 1539 { 1540 struct virtio_dev *dev = (struct virtio_dev*)arg; 1541 struct imsgev *iev = &dev->async_iev; 1542 struct imsgbuf *ibuf = &iev->ibuf; 1543 struct imsg imsg; 1544 struct viodev_msg msg; 1545 ssize_t n = 0; 1546 1547 if (event & EV_READ) { 1548 if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN) 1549 fatal("%s: imsg_read", __func__); 1550 if (n == 0) { 1551 /* this pipe is dead, so remove the event handler */ 1552 log_debug("%s: pipe dead (EV_READ)", __func__); 1553 event_del(&iev->ev); 1554 event_loopexit(NULL); 1555 return; 1556 } 1557 } 1558 1559 if (event & EV_WRITE) { 1560 if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN) 1561 fatal("%s: msgbuf_write", __func__); 1562 if (n == 0) { 1563 /* this pipe is dead, so remove the event handler */ 1564 log_debug("%s: pipe dead (EV_WRITE)", __func__); 1565 event_del(&iev->ev); 1566 event_loopexit(NULL); 1567 return; 1568 } 1569 } 1570 1571 for (;;) { 1572 if ((n = imsg_get(ibuf, &imsg)) == -1) 1573 fatal("%s: imsg_get", __func__); 1574 if (n == 0) 1575 break; 1576 1577 switch (imsg.hdr.type) { 1578 case IMSG_DEVOP_MSG: 1579 IMSG_SIZE_CHECK(&imsg, &msg); 1580 memcpy(&msg, imsg.data, sizeof(msg)); 1581 handle_dev_msg(&msg, dev); 1582 break; 1583 default: 1584 log_warnx("%s: got non devop imsg %d", __func__, 1585 imsg.hdr.type); 1586 break; 1587 } 1588 imsg_free(&imsg); 1589 } 1590 imsg_event_add(iev); 1591 } 1592 1593 1594 static int 1595 handle_dev_msg(struct viodev_msg *msg, struct virtio_dev *gdev) 1596 { 1597 uint32_t vm_id = gdev->vm_id; 1598 int irq = gdev->irq; 1599 1600 switch (msg->type) { 1601 case VIODEV_MSG_KICK: 1602 if (msg->state == INTR_STATE_ASSERT) 1603 vcpu_assert_pic_irq(vm_id, msg->vcpu, irq); 1604 else if (msg->state == INTR_STATE_DEASSERT) 1605 vcpu_deassert_pic_irq(vm_id, msg->vcpu, irq); 1606 break; 1607 case VIODEV_MSG_READY: 1608 log_debug("%s: device reports ready", __func__); 1609 break; 1610 case VIODEV_MSG_ERROR: 1611 log_warnx("%s: device reported error", __func__); 1612 break; 1613 case VIODEV_MSG_INVALID: 1614 case VIODEV_MSG_IO_READ: 1615 case VIODEV_MSG_IO_WRITE: 1616 /* FALLTHROUGH */ 1617 default: 1618 log_warnx("%s: unsupported device message type %d", __func__, 1619 msg->type); 1620 return (1); 1621 } 1622 1623 return (0); 1624 }; 1625 1626 /* 1627 * Called by the VM process while processing IO from the VCPU thread. 1628 * 1629 * N.b. Since the VCPU thread calls this function, we cannot mutate the event 1630 * system. All ipc messages must be sent manually and cannot be queued for 1631 * the event loop to push them. (We need to perform a synchronous read, so 1632 * this isn't really a big deal.) 
1633 */ 1634 int 1635 virtio_pci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr, 1636 void *cookie, uint8_t sz) 1637 { 1638 struct virtio_dev *dev = (struct virtio_dev *)cookie; 1639 struct imsgbuf *ibuf = &dev->sync_iev.ibuf; 1640 struct imsg imsg; 1641 struct viodev_msg msg; 1642 ssize_t n; 1643 int ret = 0; 1644 1645 memset(&msg, 0, sizeof(msg)); 1646 msg.reg = reg; 1647 msg.io_sz = sz; 1648 1649 if (dir == 0) { 1650 msg.type = VIODEV_MSG_IO_WRITE; 1651 msg.data = *data; 1652 msg.data_valid = 1; 1653 } else 1654 msg.type = VIODEV_MSG_IO_READ; 1655 1656 if (msg.type == VIODEV_MSG_IO_WRITE) { 1657 /* 1658 * Write request. No reply expected. 1659 */ 1660 ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg, 1661 sizeof(msg)); 1662 if (ret == -1) { 1663 log_warn("%s: failed to send async io event to virtio" 1664 " device", __func__); 1665 return (ret); 1666 } 1667 if (imsg_flush(ibuf) == -1) { 1668 log_warnx("%s: imsg_flush (write)", __func__); 1669 return (-1); 1670 } 1671 } else { 1672 /* 1673 * Read request. Requires waiting for a reply. 1674 */ 1675 ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg, 1676 sizeof(msg)); 1677 if (ret == -1) { 1678 log_warnx("%s: failed to send sync io event to virtio" 1679 " device", __func__); 1680 return (ret); 1681 } 1682 if (imsg_flush(ibuf) == -1) { 1683 log_warnx("%s: imsg_flush (read)", __func__); 1684 return (-1); 1685 } 1686 1687 /* Read our reply. */ 1688 do 1689 n = imsg_read(ibuf); 1690 while (n == -1 && errno == EAGAIN); 1691 if (n == 0 || n == -1) { 1692 log_warn("%s: imsg_read (n=%ld)", __func__, n); 1693 return (-1); 1694 } 1695 if ((n = imsg_get(ibuf, &imsg)) == -1) { 1696 log_warn("%s: imsg_get (n=%ld)", __func__, n); 1697 return (-1); 1698 } 1699 if (n == 0) { 1700 log_warnx("%s: invalid imsg", __func__); 1701 return (-1); 1702 } 1703 1704 IMSG_SIZE_CHECK(&imsg, &msg); 1705 memcpy(&msg, imsg.data, sizeof(msg)); 1706 imsg_free(&imsg); 1707 1708 if (msg.type == VIODEV_MSG_IO_READ && msg.data_valid) { 1709 #if DEBUG 1710 log_debug("%s: got sync read response (reg=%s)", 1711 __func__, virtio_reg_name(msg.reg)); 1712 #endif /* DEBUG */ 1713 *data = msg.data; 1714 /* 1715 * It's possible we're asked to {de,}assert after the 1716 * device performs a register read. 
			 */
			if (msg.state == INTR_STATE_ASSERT)
				vcpu_assert_pic_irq(dev->vm_id, msg.vcpu, msg.irq);
			else if (msg.state == INTR_STATE_DEASSERT)
				vcpu_deassert_pic_irq(dev->vm_id, msg.vcpu, msg.irq);
		} else {
			log_warnx("%s: expected IO_READ, got %d", __func__,
			    msg.type);
			return (-1);
		}
	}

	return (0);
}

void
virtio_assert_pic_irq(struct virtio_dev *dev, int vcpu)
{
	struct viodev_msg msg;
	int ret;

	memset(&msg, 0, sizeof(msg));
	msg.irq = dev->irq;
	msg.vcpu = vcpu;
	msg.type = VIODEV_MSG_KICK;
	msg.state = INTR_STATE_ASSERT;

	ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
	    &msg, sizeof(msg));
	if (ret == -1)
		log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
}

void
virtio_deassert_pic_irq(struct virtio_dev *dev, int vcpu)
{
	struct viodev_msg msg;
	int ret;

	memset(&msg, 0, sizeof(msg));
	msg.irq = dev->irq;
	msg.vcpu = vcpu;
	msg.type = VIODEV_MSG_KICK;
	msg.state = INTR_STATE_DEASSERT;

	ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
	    &msg, sizeof(msg));
	if (ret == -1)
		log_warnx("%s: failed to deassert irq %d", __func__, dev->irq);
}