1 /* 2 * Vhost User library 3 * 4 * Copyright IBM, Corp. 2007 5 * Copyright (c) 2016 Red Hat, Inc. 6 * 7 * Authors: 8 * Anthony Liguori <aliguori@us.ibm.com> 9 * Marc-André Lureau <mlureau@redhat.com> 10 * Victor Kaplansky <victork@redhat.com> 11 * 12 * This work is licensed under the terms of the GNU GPL, version 2 or 13 * later. See the COPYING file in the top-level directory. 14 */ 15 16 #ifndef _GNU_SOURCE 17 #define _GNU_SOURCE 18 #endif 19 20 /* this code avoids GLib dependency */ 21 #include <stdlib.h> 22 #include <stdio.h> 23 #include <unistd.h> 24 #include <stdarg.h> 25 #include <errno.h> 26 #include <string.h> 27 #include <assert.h> 28 #include <inttypes.h> 29 #include <sys/types.h> 30 #include <sys/socket.h> 31 #include <sys/eventfd.h> 32 #include <sys/mman.h> 33 #include <endian.h> 34 35 /* Necessary to provide VIRTIO_F_VERSION_1 on system 36 * with older linux headers. Must appear before 37 * <linux/vhost.h> below. 38 */ 39 #include "standard-headers/linux/virtio_config.h" 40 41 #if defined(__linux__) 42 #include <sys/syscall.h> 43 #include <fcntl.h> 44 #include <sys/ioctl.h> 45 #include <linux/vhost.h> 46 47 #ifdef __NR_userfaultfd 48 #include <linux/userfaultfd.h> 49 #endif 50 51 #endif 52 53 #include "include/atomic.h" 54 55 #include "libvhost-user.h" 56 57 /* usually provided by GLib */ 58 #if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4) 59 #if !defined(__clang__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 4) 60 #define G_GNUC_PRINTF(format_idx, arg_idx) \ 61 __attribute__((__format__(gnu_printf, format_idx, arg_idx))) 62 #else 63 #define G_GNUC_PRINTF(format_idx, arg_idx) \ 64 __attribute__((__format__(__printf__, format_idx, arg_idx))) 65 #endif 66 #else /* !__GNUC__ */ 67 #define G_GNUC_PRINTF(format_idx, arg_idx) 68 #endif /* !__GNUC__ */ 69 #ifndef MIN 70 #define MIN(x, y) ({ \ 71 __typeof__(x) _min1 = (x); \ 72 __typeof__(y) _min2 = (y); \ 73 (void) (&_min1 == &_min2); \ 74 _min1 < _min2 ? _min1 : _min2; }) 75 #endif 76 77 /* Round number down to multiple */ 78 #define ALIGN_DOWN(n, m) ((n) / (m) * (m)) 79 80 /* Round number up to multiple */ 81 #define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) 82 83 #ifndef unlikely 84 #define unlikely(x) __builtin_expect(!!(x), 0) 85 #endif 86 87 /* Align each region to cache line size in inflight buffer */ 88 #define INFLIGHT_ALIGNMENT 64 89 90 /* The version of inflight buffer */ 91 #define INFLIGHT_VERSION 1 92 93 /* The version of the protocol we support */ 94 #define VHOST_USER_VERSION 1 95 #define LIBVHOST_USER_DEBUG 0 96 97 #define DPRINT(...) 
\ 98 do { \ 99 if (LIBVHOST_USER_DEBUG) { \ 100 fprintf(stderr, __VA_ARGS__); \ 101 } \ 102 } while (0) 103 104 static inline 105 bool has_feature(uint64_t features, unsigned int fbit) 106 { 107 assert(fbit < 64); 108 return !!(features & (1ULL << fbit)); 109 } 110 111 static inline 112 bool vu_has_feature(VuDev *dev, 113 unsigned int fbit) 114 { 115 return has_feature(dev->features, fbit); 116 } 117 118 static inline bool vu_has_protocol_feature(VuDev *dev, unsigned int fbit) 119 { 120 return has_feature(dev->protocol_features, fbit); 121 } 122 123 const char * 124 vu_request_to_string(unsigned int req) 125 { 126 #define REQ(req) [req] = #req 127 static const char *vu_request_str[] = { 128 REQ(VHOST_USER_NONE), 129 REQ(VHOST_USER_GET_FEATURES), 130 REQ(VHOST_USER_SET_FEATURES), 131 REQ(VHOST_USER_SET_OWNER), 132 REQ(VHOST_USER_RESET_OWNER), 133 REQ(VHOST_USER_SET_MEM_TABLE), 134 REQ(VHOST_USER_SET_LOG_BASE), 135 REQ(VHOST_USER_SET_LOG_FD), 136 REQ(VHOST_USER_SET_VRING_NUM), 137 REQ(VHOST_USER_SET_VRING_ADDR), 138 REQ(VHOST_USER_SET_VRING_BASE), 139 REQ(VHOST_USER_GET_VRING_BASE), 140 REQ(VHOST_USER_SET_VRING_KICK), 141 REQ(VHOST_USER_SET_VRING_CALL), 142 REQ(VHOST_USER_SET_VRING_ERR), 143 REQ(VHOST_USER_GET_PROTOCOL_FEATURES), 144 REQ(VHOST_USER_SET_PROTOCOL_FEATURES), 145 REQ(VHOST_USER_GET_QUEUE_NUM), 146 REQ(VHOST_USER_SET_VRING_ENABLE), 147 REQ(VHOST_USER_SEND_RARP), 148 REQ(VHOST_USER_NET_SET_MTU), 149 REQ(VHOST_USER_SET_BACKEND_REQ_FD), 150 REQ(VHOST_USER_IOTLB_MSG), 151 REQ(VHOST_USER_SET_VRING_ENDIAN), 152 REQ(VHOST_USER_GET_CONFIG), 153 REQ(VHOST_USER_SET_CONFIG), 154 REQ(VHOST_USER_POSTCOPY_ADVISE), 155 REQ(VHOST_USER_POSTCOPY_LISTEN), 156 REQ(VHOST_USER_POSTCOPY_END), 157 REQ(VHOST_USER_GET_INFLIGHT_FD), 158 REQ(VHOST_USER_SET_INFLIGHT_FD), 159 REQ(VHOST_USER_GPU_SET_SOCKET), 160 REQ(VHOST_USER_VRING_KICK), 161 REQ(VHOST_USER_GET_MAX_MEM_SLOTS), 162 REQ(VHOST_USER_ADD_MEM_REG), 163 REQ(VHOST_USER_REM_MEM_REG), 164 REQ(VHOST_USER_MAX), 165 }; 166 #undef REQ 167 168 if (req < VHOST_USER_MAX) { 169 return vu_request_str[req]; 170 } else { 171 return "unknown"; 172 } 173 } 174 175 static void G_GNUC_PRINTF(2, 3) 176 vu_panic(VuDev *dev, const char *msg, ...) 177 { 178 char *buf = NULL; 179 va_list ap; 180 181 va_start(ap, msg); 182 if (vasprintf(&buf, msg, ap) < 0) { 183 buf = NULL; 184 } 185 va_end(ap); 186 187 dev->broken = true; 188 dev->panic(dev, buf); 189 free(buf); 190 191 /* 192 * FIXME: 193 * find a way to call virtio_error, or perhaps close the connection? 194 */ 195 } 196 197 /* Translate guest physical address to our virtual address. */ 198 void * 199 vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr) 200 { 201 unsigned int i; 202 203 if (*plen == 0) { 204 return NULL; 205 } 206 207 /* Find matching memory region. */ 208 for (i = 0; i < dev->nregions; i++) { 209 VuDevRegion *r = &dev->regions[i]; 210 211 if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { 212 if ((guest_addr + *plen) > (r->gpa + r->size)) { 213 *plen = r->gpa + r->size - guest_addr; 214 } 215 return (void *)(uintptr_t) 216 guest_addr - r->gpa + r->mmap_addr + r->mmap_offset; 217 } 218 } 219 220 return NULL; 221 } 222 223 /* Translate qemu virtual address to our virtual address. */ 224 static void * 225 qva_to_va(VuDev *dev, uint64_t qemu_addr) 226 { 227 unsigned int i; 228 229 /* Find matching memory region. 
 */
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];

        if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
            return (void *)(uintptr_t)
                qemu_addr - r->qva + r->mmap_addr + r->mmap_offset;
        }
    }

    return NULL;
}

static void
vmsg_close_fds(VhostUserMsg *vmsg)
{
    int i;

    for (i = 0; i < vmsg->fd_num; i++) {
        close(vmsg->fds[i]);
    }
}

/* Set reply payload.u64 and clear request flags and fd_num */
static void vmsg_set_reply_u64(VhostUserMsg *vmsg, uint64_t val)
{
    vmsg->flags = 0; /* defaults will be set by vu_send_reply() */
    vmsg->size = sizeof(vmsg->payload.u64);
    vmsg->payload.u64 = val;
    vmsg->fd_num = 0;
}

/* A test to see if we have userfault available */
static bool
have_userfault(void)
{
#if defined(__linux__) && defined(__NR_userfaultfd) &&\
        defined(UFFD_FEATURE_MISSING_SHMEM) &&\
        defined(UFFD_FEATURE_MISSING_HUGETLBFS)
    /* Now test the kernel we're running on really has the features */
    int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    struct uffdio_api api_struct;
    if (ufd < 0) {
        return false;
    }

    api_struct.api = UFFD_API;
    api_struct.features = UFFD_FEATURE_MISSING_SHMEM |
                          UFFD_FEATURE_MISSING_HUGETLBFS;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        close(ufd);
        return false;
    }
    close(ufd);
    return true;

#else
    return false;
#endif
}

static bool
vu_message_read_default(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {};
    struct iovec iov = {
        .iov_base = (char *)vmsg,
        .iov_len = VHOST_USER_HDR_SIZE,
    };
    struct msghdr msg = {
        .msg_iov = &iov,
        .msg_iovlen = 1,
        .msg_control = control,
        .msg_controllen = sizeof(control),
    };
    size_t fd_size;
    struct cmsghdr *cmsg;
    int rc;

    do {
        rc = recvmsg(conn_fd, &msg, 0);
    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

    if (rc < 0) {
        vu_panic(dev, "Error while recvmsg: %s", strerror(errno));
        return false;
    }

    vmsg->fd_num = 0;
    for (cmsg = CMSG_FIRSTHDR(&msg);
         cmsg != NULL;
         cmsg = CMSG_NXTHDR(&msg, cmsg))
    {
        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
            vmsg->fd_num = fd_size / sizeof(int);
            /* the fds array holds VHOST_MEMORY_BASELINE_NREGIONS ints */
            assert(fd_size <= VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int));
            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
            break;
        }
    }

    if (vmsg->size > sizeof(vmsg->payload)) {
        vu_panic(dev,
                 "Error: too big message request: %d, size: vmsg->size: %u, "
                 "while sizeof(vmsg->payload) = %zu\n",
                 vmsg->request, vmsg->size, sizeof(vmsg->payload));
        goto fail;
    }

    if (vmsg->size) {
        do {
            rc = read(conn_fd, &vmsg->payload, vmsg->size);
        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

        if (rc <= 0) {
            vu_panic(dev, "Error while reading: %s", strerror(errno));
            goto fail;
        }

        assert((uint32_t)rc == vmsg->size);
    }

    return true;

fail:
    vmsg_close_fds(vmsg);

    return false;
}

static bool
vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    int rc;
    uint8_t *p = (uint8_t *)vmsg;
    char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {};
    struct iovec iov = {
        .iov_base = (char *)vmsg,
.iov_len = VHOST_USER_HDR_SIZE, 369 }; 370 struct msghdr msg = { 371 .msg_iov = &iov, 372 .msg_iovlen = 1, 373 .msg_control = control, 374 }; 375 struct cmsghdr *cmsg; 376 377 memset(control, 0, sizeof(control)); 378 assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS); 379 if (vmsg->fd_num > 0) { 380 size_t fdsize = vmsg->fd_num * sizeof(int); 381 msg.msg_controllen = CMSG_SPACE(fdsize); 382 cmsg = CMSG_FIRSTHDR(&msg); 383 cmsg->cmsg_len = CMSG_LEN(fdsize); 384 cmsg->cmsg_level = SOL_SOCKET; 385 cmsg->cmsg_type = SCM_RIGHTS; 386 memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); 387 } else { 388 msg.msg_controllen = 0; 389 } 390 391 do { 392 rc = sendmsg(conn_fd, &msg, 0); 393 } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); 394 395 if (vmsg->size) { 396 do { 397 if (vmsg->data) { 398 rc = write(conn_fd, vmsg->data, vmsg->size); 399 } else { 400 rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size); 401 } 402 } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); 403 } 404 405 if (rc <= 0) { 406 vu_panic(dev, "Error while writing: %s", strerror(errno)); 407 return false; 408 } 409 410 return true; 411 } 412 413 static bool 414 vu_send_reply(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) 415 { 416 /* Set the version in the flags when sending the reply */ 417 vmsg->flags &= ~VHOST_USER_VERSION_MASK; 418 vmsg->flags |= VHOST_USER_VERSION; 419 vmsg->flags |= VHOST_USER_REPLY_MASK; 420 421 return vu_message_write(dev, conn_fd, vmsg); 422 } 423 424 /* 425 * Processes a reply on the backend channel. 426 * Entered with backend_mutex held and releases it before exit. 427 * Returns true on success. 428 */ 429 static bool 430 vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg) 431 { 432 VhostUserMsg msg_reply; 433 bool result = false; 434 435 if ((vmsg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) { 436 result = true; 437 goto out; 438 } 439 440 if (!vu_message_read_default(dev, dev->backend_fd, &msg_reply)) { 441 goto out; 442 } 443 444 if (msg_reply.request != vmsg->request) { 445 DPRINT("Received unexpected msg type. Expected %d received %d", 446 vmsg->request, msg_reply.request); 447 goto out; 448 } 449 450 result = msg_reply.payload.u64 == 0; 451 452 out: 453 pthread_mutex_unlock(&dev->backend_mutex); 454 return result; 455 } 456 457 /* Kick the log_call_fd if required. 
 */
static void
vu_log_kick(VuDev *dev)
{
    if (dev->log_call_fd != -1) {
        DPRINT("Kicking the QEMU's log...\n");
        if (eventfd_write(dev->log_call_fd, 1) < 0) {
            vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
        }
    }
}

static void
vu_log_page(uint8_t *log_table, uint64_t page)
{
    DPRINT("Logged dirty guest page: %"PRId64"\n", page);
    qatomic_or(&log_table[page / 8], 1 << (page % 8));
}

static void
vu_log_write(VuDev *dev, uint64_t address, uint64_t length)
{
    uint64_t page;

    if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
        !dev->log_table || !length) {
        return;
    }

    assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));

    page = address / VHOST_LOG_PAGE;
    while (page * VHOST_LOG_PAGE < address + length) {
        vu_log_page(dev->log_table, page);
        page += 1;
    }

    vu_log_kick(dev);
}

static void
vu_kick_cb(VuDev *dev, int condition, void *data)
{
    int index = (intptr_t)data;
    VuVirtq *vq = &dev->vq[index];
    int sock = vq->kick_fd;
    eventfd_t kick_data;
    ssize_t rc;

    rc = eventfd_read(sock, &kick_data);
    if (rc == -1) {
        vu_panic(dev, "kick eventfd_read(): %s", strerror(errno));
        dev->remove_watch(dev, dev->vq[index].kick_fd);
    } else {
        DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n",
               kick_data, vq->handler, index);
        if (vq->handler) {
            vq->handler(dev, index);
        }
    }
}

static bool
vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    vmsg->payload.u64 =
        /*
         * The following VIRTIO feature bits are supported by our virtqueue
         * implementation:
         */
        1ULL << VIRTIO_F_NOTIFY_ON_EMPTY |
        1ULL << VIRTIO_RING_F_INDIRECT_DESC |
        1ULL << VIRTIO_RING_F_EVENT_IDX |
        1ULL << VIRTIO_F_VERSION_1 |

        /* vhost-user feature bits */
        1ULL << VHOST_F_LOG_ALL |
        1ULL << VHOST_USER_F_PROTOCOL_FEATURES;

    if (dev->iface->get_features) {
        vmsg->payload.u64 |= dev->iface->get_features(dev);
    }

    vmsg->size = sizeof(vmsg->payload.u64);
    vmsg->fd_num = 0;

    DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    return true;
}

static void
vu_set_enable_all_rings(VuDev *dev, bool enabled)
{
    uint16_t i;

    for (i = 0; i < dev->max_queues; i++) {
        dev->vq[i].enable = enabled;
    }
}

static bool
vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    dev->features = vmsg->payload.u64;
    if (!vu_has_feature(dev, VIRTIO_F_VERSION_1)) {
        /*
         * We only support devices conforming to VIRTIO 1.0 or
         * later
         */
        vu_panic(dev, "virtio legacy devices aren't supported by libvhost-user");
        return false;
    }

    if (!vu_has_feature(dev, VHOST_USER_F_PROTOCOL_FEATURES)) {
        vu_set_enable_all_rings(dev, true);
    }

    if (dev->iface->set_features) {
        dev->iface->set_features(dev, dev->features);
    }

    return false;
}

static bool
vu_set_owner_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    return false;
}

static void
vu_close_log(VuDev *dev)
{
    if (dev->log_table) {
        if (munmap(dev->log_table, dev->log_size) != 0) {
            perror("close log munmap() error");
        }

        dev->log_table = NULL;
    }
    if (dev->log_call_fd != -1) {
        close(dev->log_call_fd);
        dev->log_call_fd = -1;
    }
}

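/*
 * The vu_*_exec message handlers return a bool that vu_dispatch() interprets
 * as "send the (possibly modified) vmsg back to the frontend as a reply".
 * Handlers that return false while the frontend requested a reply via
 * VHOST_USER_NEED_REPLY_MASK still get a generic u64 = 0 acknowledgement
 * generated by vu_dispatch() itself.
 */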
static bool
vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    vu_set_enable_all_rings(dev, false);

    return false;
}

static bool
map_ring(VuDev *dev, VuVirtq *vq)
{
    vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr);
    vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr);
    vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr);

    DPRINT("Setting virtq addresses:\n");
    DPRINT(" vring_desc at %p\n", vq->vring.desc);
    DPRINT(" vring_used at %p\n", vq->vring.used);
    DPRINT(" vring_avail at %p\n", vq->vring.avail);

    return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
}

static bool
generate_faults(VuDev *dev) {
    unsigned int i;
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *dev_region = &dev->regions[i];
        int ret;
#ifdef UFFDIO_REGISTER
        struct uffdio_register reg_struct;

        /*
         * We should already have an open ufd. Mark each memory
         * range as ufd.
         * Discard any mapping we have here; note I can't use MADV_REMOVE
         * or fallocate to make the hole since I don't want to lose
         * data that's already arrived in the shared process.
         * TODO: How to do hugepage
         */
        ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
                      dev_region->size + dev_region->mmap_offset,
                      MADV_DONTNEED);
        if (ret) {
            fprintf(stderr,
                    "%s: Failed to madvise(DONTNEED) region %d: %s\n",
                    __func__, i, strerror(errno));
        }
        /*
         * Turn off transparent hugepages so we don't lose wakeups
         * in neighbouring pages.
         * TODO: Turn this back on later.
         */
        ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
                      dev_region->size + dev_region->mmap_offset,
                      MADV_NOHUGEPAGE);
        if (ret) {
            /*
             * Note: This can happen legally on kernels that are configured
             * without madvise'able hugepages
             */
            fprintf(stderr,
                    "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n",
                    __func__, i, strerror(errno));
        }

        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
        reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

        if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
            vu_panic(dev, "%s: Failed to userfault region %d "
                          "@%" PRIx64 " + size:%" PRIx64 " offset: %" PRIx64
                          ": (ufd=%d)%s\n",
                     __func__, i,
                     dev_region->mmap_addr,
                     dev_region->size, dev_region->mmap_offset,
                     dev->postcopy_ufd, strerror(errno));
            return false;
        }
        if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
            vu_panic(dev, "%s Region (%d) doesn't support COPY",
                     __func__, i);
            return false;
        }
        DPRINT("%s: region %d: Registered userfault for %"
               PRIx64 " + %" PRIx64 "\n", __func__, i,
               (uint64_t)reg_struct.range.start,
               (uint64_t)reg_struct.range.len);
        /* Now it's registered we can let the client at it */
        if (mprotect((void *)(uintptr_t)dev_region->mmap_addr,
                     dev_region->size + dev_region->mmap_offset,
                     PROT_READ | PROT_WRITE)) {
            vu_panic(dev, "failed to mprotect region %d for postcopy (%s)",
                     i, strerror(errno));
            return false;
        }
        /* TODO: Stash 'zero' support flags somewhere */
#endif
    }

    return true;
}

static bool
vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
    int i;
    bool track_ramblocks = dev->postcopy_listening;
    VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
    VuDevRegion *dev_region =
&dev->regions[dev->nregions]; 716 void *mmap_addr; 717 718 if (vmsg->fd_num != 1) { 719 vmsg_close_fds(vmsg); 720 vu_panic(dev, "VHOST_USER_ADD_MEM_REG received %d fds - only 1 fd " 721 "should be sent for this message type", vmsg->fd_num); 722 return false; 723 } 724 725 if (vmsg->size < VHOST_USER_MEM_REG_SIZE) { 726 close(vmsg->fds[0]); 727 vu_panic(dev, "VHOST_USER_ADD_MEM_REG requires a message size of at " 728 "least %zu bytes and only %d bytes were received", 729 VHOST_USER_MEM_REG_SIZE, vmsg->size); 730 return false; 731 } 732 733 if (dev->nregions == VHOST_USER_MAX_RAM_SLOTS) { 734 close(vmsg->fds[0]); 735 vu_panic(dev, "failing attempt to hot add memory via " 736 "VHOST_USER_ADD_MEM_REG message because the backend has " 737 "no free ram slots available"); 738 return false; 739 } 740 741 /* 742 * If we are in postcopy mode and we receive a u64 payload with a 0 value 743 * we know all the postcopy client bases have been received, and we 744 * should start generating faults. 745 */ 746 if (track_ramblocks && 747 vmsg->size == sizeof(vmsg->payload.u64) && 748 vmsg->payload.u64 == 0) { 749 (void)generate_faults(dev); 750 return false; 751 } 752 753 DPRINT("Adding region: %u\n", dev->nregions); 754 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", 755 msg_region->guest_phys_addr); 756 DPRINT(" memory_size: 0x%016"PRIx64"\n", 757 msg_region->memory_size); 758 DPRINT(" userspace_addr 0x%016"PRIx64"\n", 759 msg_region->userspace_addr); 760 DPRINT(" mmap_offset 0x%016"PRIx64"\n", 761 msg_region->mmap_offset); 762 763 dev_region->gpa = msg_region->guest_phys_addr; 764 dev_region->size = msg_region->memory_size; 765 dev_region->qva = msg_region->userspace_addr; 766 dev_region->mmap_offset = msg_region->mmap_offset; 767 768 /* 769 * We don't use offset argument of mmap() since the 770 * mapped address has to be page aligned, and we use huge 771 * pages. 772 */ 773 if (track_ramblocks) { 774 /* 775 * In postcopy we're using PROT_NONE here to catch anyone 776 * accessing it before we userfault. 777 */ 778 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, 779 PROT_NONE, MAP_SHARED | MAP_NORESERVE, 780 vmsg->fds[0], 0); 781 } else { 782 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, 783 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, 784 vmsg->fds[0], 0); 785 } 786 787 if (mmap_addr == MAP_FAILED) { 788 vu_panic(dev, "region mmap error: %s", strerror(errno)); 789 } else { 790 dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; 791 DPRINT(" mmap_addr: 0x%016"PRIx64"\n", 792 dev_region->mmap_addr); 793 } 794 795 close(vmsg->fds[0]); 796 797 if (track_ramblocks) { 798 /* 799 * Return the address to QEMU so that it can translate the ufd 800 * fault addresses back. 801 */ 802 msg_region->userspace_addr = (uintptr_t)(mmap_addr + 803 dev_region->mmap_offset); 804 805 /* Send the message back to qemu with the addresses filled in. 
*/ 806 vmsg->fd_num = 0; 807 DPRINT("Successfully added new region in postcopy\n"); 808 dev->nregions++; 809 return true; 810 } else { 811 for (i = 0; i < dev->max_queues; i++) { 812 if (dev->vq[i].vring.desc) { 813 if (map_ring(dev, &dev->vq[i])) { 814 vu_panic(dev, "remapping queue %d for new memory region", 815 i); 816 } 817 } 818 } 819 820 DPRINT("Successfully added new region\n"); 821 dev->nregions++; 822 return false; 823 } 824 } 825 826 static inline bool reg_equal(VuDevRegion *vudev_reg, 827 VhostUserMemoryRegion *msg_reg) 828 { 829 if (vudev_reg->gpa == msg_reg->guest_phys_addr && 830 vudev_reg->qva == msg_reg->userspace_addr && 831 vudev_reg->size == msg_reg->memory_size) { 832 return true; 833 } 834 835 return false; 836 } 837 838 static bool 839 vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { 840 VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; 841 unsigned int i; 842 bool found = false; 843 844 if (vmsg->fd_num > 1) { 845 vmsg_close_fds(vmsg); 846 vu_panic(dev, "VHOST_USER_REM_MEM_REG received %d fds - at most 1 fd " 847 "should be sent for this message type", vmsg->fd_num); 848 return false; 849 } 850 851 if (vmsg->size < VHOST_USER_MEM_REG_SIZE) { 852 vmsg_close_fds(vmsg); 853 vu_panic(dev, "VHOST_USER_REM_MEM_REG requires a message size of at " 854 "least %zu bytes and only %d bytes were received", 855 VHOST_USER_MEM_REG_SIZE, vmsg->size); 856 return false; 857 } 858 859 DPRINT("Removing region:\n"); 860 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", 861 msg_region->guest_phys_addr); 862 DPRINT(" memory_size: 0x%016"PRIx64"\n", 863 msg_region->memory_size); 864 DPRINT(" userspace_addr 0x%016"PRIx64"\n", 865 msg_region->userspace_addr); 866 DPRINT(" mmap_offset 0x%016"PRIx64"\n", 867 msg_region->mmap_offset); 868 869 for (i = 0; i < dev->nregions; i++) { 870 if (reg_equal(&dev->regions[i], msg_region)) { 871 VuDevRegion *r = &dev->regions[i]; 872 void *m = (void *) (uintptr_t) r->mmap_addr; 873 874 if (m) { 875 munmap(m, r->size + r->mmap_offset); 876 } 877 878 /* 879 * Shift all affected entries by 1 to close the hole at index i and 880 * zero out the last entry. 881 */ 882 memmove(dev->regions + i, dev->regions + i + 1, 883 sizeof(VuDevRegion) * (dev->nregions - i - 1)); 884 memset(dev->regions + dev->nregions - 1, 0, sizeof(VuDevRegion)); 885 DPRINT("Successfully removed a region\n"); 886 dev->nregions--; 887 i--; 888 889 found = true; 890 891 /* Continue the search for eventual duplicates. 
*/ 892 } 893 } 894 895 if (!found) { 896 vu_panic(dev, "Specified region not found\n"); 897 } 898 899 vmsg_close_fds(vmsg); 900 901 return false; 902 } 903 904 static bool 905 vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) 906 { 907 unsigned int i; 908 VhostUserMemory m = vmsg->payload.memory, *memory = &m; 909 dev->nregions = memory->nregions; 910 911 DPRINT("Nregions: %u\n", memory->nregions); 912 for (i = 0; i < dev->nregions; i++) { 913 void *mmap_addr; 914 VhostUserMemoryRegion *msg_region = &memory->regions[i]; 915 VuDevRegion *dev_region = &dev->regions[i]; 916 917 DPRINT("Region %d\n", i); 918 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", 919 msg_region->guest_phys_addr); 920 DPRINT(" memory_size: 0x%016"PRIx64"\n", 921 msg_region->memory_size); 922 DPRINT(" userspace_addr 0x%016"PRIx64"\n", 923 msg_region->userspace_addr); 924 DPRINT(" mmap_offset 0x%016"PRIx64"\n", 925 msg_region->mmap_offset); 926 927 dev_region->gpa = msg_region->guest_phys_addr; 928 dev_region->size = msg_region->memory_size; 929 dev_region->qva = msg_region->userspace_addr; 930 dev_region->mmap_offset = msg_region->mmap_offset; 931 932 /* We don't use offset argument of mmap() since the 933 * mapped address has to be page aligned, and we use huge 934 * pages. 935 * In postcopy we're using PROT_NONE here to catch anyone 936 * accessing it before we userfault 937 */ 938 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, 939 PROT_NONE, MAP_SHARED | MAP_NORESERVE, 940 vmsg->fds[i], 0); 941 942 if (mmap_addr == MAP_FAILED) { 943 vu_panic(dev, "region mmap error: %s", strerror(errno)); 944 } else { 945 dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; 946 DPRINT(" mmap_addr: 0x%016"PRIx64"\n", 947 dev_region->mmap_addr); 948 } 949 950 /* Return the address to QEMU so that it can translate the ufd 951 * fault addresses back. 952 */ 953 msg_region->userspace_addr = (uintptr_t)(mmap_addr + 954 dev_region->mmap_offset); 955 close(vmsg->fds[i]); 956 } 957 958 /* Send the message back to qemu with the addresses filled in */ 959 vmsg->fd_num = 0; 960 if (!vu_send_reply(dev, dev->sock, vmsg)) { 961 vu_panic(dev, "failed to respond to set-mem-table for postcopy"); 962 return false; 963 } 964 965 /* Wait for QEMU to confirm that it's registered the handler for the 966 * faults. 
967 */ 968 if (!dev->read_msg(dev, dev->sock, vmsg) || 969 vmsg->size != sizeof(vmsg->payload.u64) || 970 vmsg->payload.u64 != 0) { 971 vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table"); 972 return false; 973 } 974 975 /* OK, now we can go and register the memory and generate faults */ 976 (void)generate_faults(dev); 977 978 return false; 979 } 980 981 static bool 982 vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) 983 { 984 unsigned int i; 985 VhostUserMemory m = vmsg->payload.memory, *memory = &m; 986 987 for (i = 0; i < dev->nregions; i++) { 988 VuDevRegion *r = &dev->regions[i]; 989 void *m = (void *) (uintptr_t) r->mmap_addr; 990 991 if (m) { 992 munmap(m, r->size + r->mmap_offset); 993 } 994 } 995 dev->nregions = memory->nregions; 996 997 if (dev->postcopy_listening) { 998 return vu_set_mem_table_exec_postcopy(dev, vmsg); 999 } 1000 1001 DPRINT("Nregions: %u\n", memory->nregions); 1002 for (i = 0; i < dev->nregions; i++) { 1003 void *mmap_addr; 1004 VhostUserMemoryRegion *msg_region = &memory->regions[i]; 1005 VuDevRegion *dev_region = &dev->regions[i]; 1006 1007 DPRINT("Region %d\n", i); 1008 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", 1009 msg_region->guest_phys_addr); 1010 DPRINT(" memory_size: 0x%016"PRIx64"\n", 1011 msg_region->memory_size); 1012 DPRINT(" userspace_addr 0x%016"PRIx64"\n", 1013 msg_region->userspace_addr); 1014 DPRINT(" mmap_offset 0x%016"PRIx64"\n", 1015 msg_region->mmap_offset); 1016 1017 dev_region->gpa = msg_region->guest_phys_addr; 1018 dev_region->size = msg_region->memory_size; 1019 dev_region->qva = msg_region->userspace_addr; 1020 dev_region->mmap_offset = msg_region->mmap_offset; 1021 1022 /* We don't use offset argument of mmap() since the 1023 * mapped address has to be page aligned, and we use huge 1024 * pages. 
*/ 1025 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, 1026 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, 1027 vmsg->fds[i], 0); 1028 1029 if (mmap_addr == MAP_FAILED) { 1030 vu_panic(dev, "region mmap error: %s", strerror(errno)); 1031 } else { 1032 dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; 1033 DPRINT(" mmap_addr: 0x%016"PRIx64"\n", 1034 dev_region->mmap_addr); 1035 } 1036 1037 close(vmsg->fds[i]); 1038 } 1039 1040 for (i = 0; i < dev->max_queues; i++) { 1041 if (dev->vq[i].vring.desc) { 1042 if (map_ring(dev, &dev->vq[i])) { 1043 vu_panic(dev, "remapping queue %d during setmemtable", i); 1044 } 1045 } 1046 } 1047 1048 return false; 1049 } 1050 1051 static bool 1052 vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg) 1053 { 1054 int fd; 1055 uint64_t log_mmap_size, log_mmap_offset; 1056 void *rc; 1057 1058 if (vmsg->fd_num != 1 || 1059 vmsg->size != sizeof(vmsg->payload.log)) { 1060 vu_panic(dev, "Invalid log_base message"); 1061 return true; 1062 } 1063 1064 fd = vmsg->fds[0]; 1065 log_mmap_offset = vmsg->payload.log.mmap_offset; 1066 log_mmap_size = vmsg->payload.log.mmap_size; 1067 DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset); 1068 DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size); 1069 1070 rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 1071 log_mmap_offset); 1072 close(fd); 1073 if (rc == MAP_FAILED) { 1074 perror("log mmap error"); 1075 } 1076 1077 if (dev->log_table) { 1078 munmap(dev->log_table, dev->log_size); 1079 } 1080 dev->log_table = rc; 1081 dev->log_size = log_mmap_size; 1082 1083 vmsg->size = sizeof(vmsg->payload.u64); 1084 vmsg->fd_num = 0; 1085 1086 return true; 1087 } 1088 1089 static bool 1090 vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg) 1091 { 1092 if (vmsg->fd_num != 1) { 1093 vu_panic(dev, "Invalid log_fd message"); 1094 return false; 1095 } 1096 1097 if (dev->log_call_fd != -1) { 1098 close(dev->log_call_fd); 1099 } 1100 dev->log_call_fd = vmsg->fds[0]; 1101 DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]); 1102 1103 return false; 1104 } 1105 1106 static bool 1107 vu_set_vring_num_exec(VuDev *dev, VhostUserMsg *vmsg) 1108 { 1109 unsigned int index = vmsg->payload.state.index; 1110 unsigned int num = vmsg->payload.state.num; 1111 1112 DPRINT("State.index: %u\n", index); 1113 DPRINT("State.num: %u\n", num); 1114 dev->vq[index].vring.num = num; 1115 1116 return false; 1117 } 1118 1119 static bool 1120 vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg) 1121 { 1122 struct vhost_vring_addr addr = vmsg->payload.addr, *vra = &addr; 1123 unsigned int index = vra->index; 1124 VuVirtq *vq = &dev->vq[index]; 1125 1126 DPRINT("vhost_vring_addr:\n"); 1127 DPRINT(" index: %d\n", vra->index); 1128 DPRINT(" flags: %d\n", vra->flags); 1129 DPRINT(" desc_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->desc_user_addr); 1130 DPRINT(" used_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->used_user_addr); 1131 DPRINT(" avail_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->avail_user_addr); 1132 DPRINT(" log_guest_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->log_guest_addr); 1133 1134 vq->vra = *vra; 1135 vq->vring.flags = vra->flags; 1136 vq->vring.log_guest_addr = vra->log_guest_addr; 1137 1138 1139 if (map_ring(dev, vq)) { 1140 vu_panic(dev, "Invalid vring_addr message"); 1141 return false; 1142 } 1143 1144 vq->used_idx = le16toh(vq->vring.used->idx); 1145 1146 if (vq->last_avail_idx != vq->used_idx) { 1147 bool resume = dev->iface->queue_is_processed_in_order && 1148 dev->iface->queue_is_processed_in_order(dev, 
index); 1149 1150 DPRINT("Last avail index != used index: %u != %u%s\n", 1151 vq->last_avail_idx, vq->used_idx, 1152 resume ? ", resuming" : ""); 1153 1154 if (resume) { 1155 vq->shadow_avail_idx = vq->last_avail_idx = vq->used_idx; 1156 } 1157 } 1158 1159 return false; 1160 } 1161 1162 static bool 1163 vu_set_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg) 1164 { 1165 unsigned int index = vmsg->payload.state.index; 1166 unsigned int num = vmsg->payload.state.num; 1167 1168 DPRINT("State.index: %u\n", index); 1169 DPRINT("State.num: %u\n", num); 1170 dev->vq[index].shadow_avail_idx = dev->vq[index].last_avail_idx = num; 1171 1172 return false; 1173 } 1174 1175 static bool 1176 vu_get_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg) 1177 { 1178 unsigned int index = vmsg->payload.state.index; 1179 1180 DPRINT("State.index: %u\n", index); 1181 vmsg->payload.state.num = dev->vq[index].last_avail_idx; 1182 vmsg->size = sizeof(vmsg->payload.state); 1183 1184 dev->vq[index].started = false; 1185 if (dev->iface->queue_set_started) { 1186 dev->iface->queue_set_started(dev, index, false); 1187 } 1188 1189 if (dev->vq[index].call_fd != -1) { 1190 close(dev->vq[index].call_fd); 1191 dev->vq[index].call_fd = -1; 1192 } 1193 if (dev->vq[index].kick_fd != -1) { 1194 dev->remove_watch(dev, dev->vq[index].kick_fd); 1195 close(dev->vq[index].kick_fd); 1196 dev->vq[index].kick_fd = -1; 1197 } 1198 1199 return true; 1200 } 1201 1202 static bool 1203 vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg) 1204 { 1205 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; 1206 bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; 1207 1208 if (index >= dev->max_queues) { 1209 vmsg_close_fds(vmsg); 1210 vu_panic(dev, "Invalid queue index: %u", index); 1211 return false; 1212 } 1213 1214 if (nofd) { 1215 vmsg_close_fds(vmsg); 1216 return true; 1217 } 1218 1219 if (vmsg->fd_num != 1) { 1220 vmsg_close_fds(vmsg); 1221 vu_panic(dev, "Invalid fds in request: %d", vmsg->request); 1222 return false; 1223 } 1224 1225 return true; 1226 } 1227 1228 static int 1229 inflight_desc_compare(const void *a, const void *b) 1230 { 1231 VuVirtqInflightDesc *desc0 = (VuVirtqInflightDesc *)a, 1232 *desc1 = (VuVirtqInflightDesc *)b; 1233 1234 if (desc1->counter > desc0->counter && 1235 (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) { 1236 return 1; 1237 } 1238 1239 return -1; 1240 } 1241 1242 static int 1243 vu_check_queue_inflights(VuDev *dev, VuVirtq *vq) 1244 { 1245 int i = 0; 1246 1247 if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) { 1248 return 0; 1249 } 1250 1251 if (unlikely(!vq->inflight)) { 1252 return -1; 1253 } 1254 1255 if (unlikely(!vq->inflight->version)) { 1256 /* initialize the buffer */ 1257 vq->inflight->version = INFLIGHT_VERSION; 1258 return 0; 1259 } 1260 1261 vq->used_idx = le16toh(vq->vring.used->idx); 1262 vq->resubmit_num = 0; 1263 vq->resubmit_list = NULL; 1264 vq->counter = 0; 1265 1266 if (unlikely(vq->inflight->used_idx != vq->used_idx)) { 1267 vq->inflight->desc[vq->inflight->last_batch_head].inflight = 0; 1268 1269 barrier(); 1270 1271 vq->inflight->used_idx = vq->used_idx; 1272 } 1273 1274 for (i = 0; i < vq->inflight->desc_num; i++) { 1275 if (vq->inflight->desc[i].inflight == 1) { 1276 vq->inuse++; 1277 } 1278 } 1279 1280 vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx; 1281 1282 if (vq->inuse) { 1283 vq->resubmit_list = calloc(vq->inuse, sizeof(VuVirtqInflightDesc)); 1284 if (!vq->resubmit_list) { 1285 return -1; 1286 } 1287 1288 
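        /*
         * Build the resubmit list from the shared inflight buffer: every
         * descriptor still marked inflight was submitted to the device
         * before the disconnect but never completed, so it must be
         * processed again.  The list is sorted by submission counter
         * (inflight_desc_compare() handles wrap-around) so the resubmission
         * order and the next counter value can be derived from it.
         */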
for (i = 0; i < vq->inflight->desc_num; i++) { 1289 if (vq->inflight->desc[i].inflight) { 1290 vq->resubmit_list[vq->resubmit_num].index = i; 1291 vq->resubmit_list[vq->resubmit_num].counter = 1292 vq->inflight->desc[i].counter; 1293 vq->resubmit_num++; 1294 } 1295 } 1296 1297 if (vq->resubmit_num > 1) { 1298 qsort(vq->resubmit_list, vq->resubmit_num, 1299 sizeof(VuVirtqInflightDesc), inflight_desc_compare); 1300 } 1301 vq->counter = vq->resubmit_list[0].counter + 1; 1302 } 1303 1304 /* in case of I/O hang after reconnecting */ 1305 if (eventfd_write(vq->kick_fd, 1)) { 1306 return -1; 1307 } 1308 1309 return 0; 1310 } 1311 1312 static bool 1313 vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg) 1314 { 1315 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; 1316 bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; 1317 1318 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); 1319 1320 if (!vu_check_queue_msg_file(dev, vmsg)) { 1321 return false; 1322 } 1323 1324 if (dev->vq[index].kick_fd != -1) { 1325 dev->remove_watch(dev, dev->vq[index].kick_fd); 1326 close(dev->vq[index].kick_fd); 1327 dev->vq[index].kick_fd = -1; 1328 } 1329 1330 dev->vq[index].kick_fd = nofd ? -1 : vmsg->fds[0]; 1331 DPRINT("Got kick_fd: %d for vq: %d\n", dev->vq[index].kick_fd, index); 1332 1333 dev->vq[index].started = true; 1334 if (dev->iface->queue_set_started) { 1335 dev->iface->queue_set_started(dev, index, true); 1336 } 1337 1338 if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) { 1339 dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN, 1340 vu_kick_cb, (void *)(long)index); 1341 1342 DPRINT("Waiting for kicks on fd: %d for vq: %d\n", 1343 dev->vq[index].kick_fd, index); 1344 } 1345 1346 if (vu_check_queue_inflights(dev, &dev->vq[index])) { 1347 vu_panic(dev, "Failed to check inflights for vq: %d\n", index); 1348 } 1349 1350 return false; 1351 } 1352 1353 void vu_set_queue_handler(VuDev *dev, VuVirtq *vq, 1354 vu_queue_handler_cb handler) 1355 { 1356 int qidx = vq - dev->vq; 1357 1358 vq->handler = handler; 1359 if (vq->kick_fd >= 0) { 1360 if (handler) { 1361 dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN, 1362 vu_kick_cb, (void *)(long)qidx); 1363 } else { 1364 dev->remove_watch(dev, vq->kick_fd); 1365 } 1366 } 1367 } 1368 1369 bool vu_set_queue_host_notifier(VuDev *dev, VuVirtq *vq, int fd, 1370 int size, int offset) 1371 { 1372 int qidx = vq - dev->vq; 1373 int fd_num = 0; 1374 VhostUserMsg vmsg = { 1375 .request = VHOST_USER_BACKEND_VRING_HOST_NOTIFIER_MSG, 1376 .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK, 1377 .size = sizeof(vmsg.payload.area), 1378 .payload.area = { 1379 .u64 = qidx & VHOST_USER_VRING_IDX_MASK, 1380 .size = size, 1381 .offset = offset, 1382 }, 1383 }; 1384 1385 if (fd == -1) { 1386 vmsg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK; 1387 } else { 1388 vmsg.fds[fd_num++] = fd; 1389 } 1390 1391 vmsg.fd_num = fd_num; 1392 1393 if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_BACKEND_SEND_FD)) { 1394 return false; 1395 } 1396 1397 pthread_mutex_lock(&dev->backend_mutex); 1398 if (!vu_message_write(dev, dev->backend_fd, &vmsg)) { 1399 pthread_mutex_unlock(&dev->backend_mutex); 1400 return false; 1401 } 1402 1403 /* Also unlocks the backend_mutex */ 1404 return vu_process_message_reply(dev, &vmsg); 1405 } 1406 1407 static bool 1408 vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg) 1409 { 1410 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; 1411 bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; 1412 1413 
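    /*
     * As in vu_set_vring_kick_exec() above, the u64 payload packs the vring
     * index (VHOST_USER_VRING_IDX_MASK) together with a "no fd attached"
     * flag (VHOST_USER_VRING_NOFD_MASK); when an eventfd is attached it
     * arrives as ancillary data in vmsg->fds[0].
     */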
    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    if (!vu_check_queue_msg_file(dev, vmsg)) {
        return false;
    }

    if (dev->vq[index].call_fd != -1) {
        close(dev->vq[index].call_fd);
        dev->vq[index].call_fd = -1;
    }

    dev->vq[index].call_fd = nofd ? -1 : vmsg->fds[0];

    /* in case of I/O hang after reconnecting */
    if (dev->vq[index].call_fd != -1 && eventfd_write(vmsg->fds[0], 1)) {
        return false;
    }

    DPRINT("Got call_fd: %d for vq: %d\n", dev->vq[index].call_fd, index);

    return false;
}

static bool
vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
    bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;

    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    if (!vu_check_queue_msg_file(dev, vmsg)) {
        return false;
    }

    if (dev->vq[index].err_fd != -1) {
        close(dev->vq[index].err_fd);
        dev->vq[index].err_fd = -1;
    }

    dev->vq[index].err_fd = nofd ? -1 : vmsg->fds[0];

    return false;
}

static bool
vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    /*
     * Note that we support, but intentionally do not set,
     * VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS. This means that
     * a device implementation can return it in its callback
     * (get_protocol_features) if it wants to use this for
     * simulation, but it is otherwise not desirable (if even
     * implemented by the frontend.)
     */
    uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_MQ |
                        1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
                        1ULL << VHOST_USER_PROTOCOL_F_BACKEND_REQ |
                        1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER |
                        1ULL << VHOST_USER_PROTOCOL_F_BACKEND_SEND_FD |
                        1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
                        1ULL << VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS;

    if (have_userfault()) {
        features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT;
    }

    if (dev->iface->get_config && dev->iface->set_config) {
        features |= 1ULL << VHOST_USER_PROTOCOL_F_CONFIG;
    }

    if (dev->iface->get_protocol_features) {
        features |= dev->iface->get_protocol_features(dev);
    }

    vmsg_set_reply_u64(vmsg, features);
    return true;
}

static bool
vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    uint64_t features = vmsg->payload.u64;

    DPRINT("u64: 0x%016"PRIx64"\n", features);

    dev->protocol_features = vmsg->payload.u64;

    if (vu_has_protocol_feature(dev,
                                VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) &&
        (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_BACKEND_REQ) ||
         !vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_REPLY_ACK))) {
        /*
         * The use case for using messages for kick/call is simulation, to make
         * the kick and call synchronous. To actually get that behaviour, both
         * of the other features are required.
         * Theoretically, one could use only kick messages, or use them without
         * F_REPLY_ACK, but too many (possibly pending) messages on the socket
         * will eventually cause the frontend to hang. To avoid that in setups
         * where this is not desired, require a combination of settings that
         * actually enables the simulation case.
1515 */ 1516 vu_panic(dev, 1517 "F_IN_BAND_NOTIFICATIONS requires F_BACKEND_REQ && F_REPLY_ACK"); 1518 return false; 1519 } 1520 1521 if (dev->iface->set_protocol_features) { 1522 dev->iface->set_protocol_features(dev, features); 1523 } 1524 1525 return false; 1526 } 1527 1528 static bool 1529 vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg) 1530 { 1531 vmsg_set_reply_u64(vmsg, dev->max_queues); 1532 return true; 1533 } 1534 1535 static bool 1536 vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg) 1537 { 1538 unsigned int index = vmsg->payload.state.index; 1539 unsigned int enable = vmsg->payload.state.num; 1540 1541 DPRINT("State.index: %u\n", index); 1542 DPRINT("State.enable: %u\n", enable); 1543 1544 if (index >= dev->max_queues) { 1545 vu_panic(dev, "Invalid vring_enable index: %u", index); 1546 return false; 1547 } 1548 1549 dev->vq[index].enable = enable; 1550 return false; 1551 } 1552 1553 static bool 1554 vu_set_backend_req_fd(VuDev *dev, VhostUserMsg *vmsg) 1555 { 1556 if (vmsg->fd_num != 1) { 1557 vu_panic(dev, "Invalid backend_req_fd message (%d fd's)", vmsg->fd_num); 1558 return false; 1559 } 1560 1561 if (dev->backend_fd != -1) { 1562 close(dev->backend_fd); 1563 } 1564 dev->backend_fd = vmsg->fds[0]; 1565 DPRINT("Got backend_fd: %d\n", vmsg->fds[0]); 1566 1567 return false; 1568 } 1569 1570 static bool 1571 vu_get_config(VuDev *dev, VhostUserMsg *vmsg) 1572 { 1573 int ret = -1; 1574 1575 if (dev->iface->get_config) { 1576 ret = dev->iface->get_config(dev, vmsg->payload.config.region, 1577 vmsg->payload.config.size); 1578 } 1579 1580 if (ret) { 1581 /* resize to zero to indicate an error to frontend */ 1582 vmsg->size = 0; 1583 } 1584 1585 return true; 1586 } 1587 1588 static bool 1589 vu_set_config(VuDev *dev, VhostUserMsg *vmsg) 1590 { 1591 int ret = -1; 1592 1593 if (dev->iface->set_config) { 1594 ret = dev->iface->set_config(dev, vmsg->payload.config.region, 1595 vmsg->payload.config.offset, 1596 vmsg->payload.config.size, 1597 vmsg->payload.config.flags); 1598 if (ret) { 1599 vu_panic(dev, "Set virtio configuration space failed"); 1600 } 1601 } 1602 1603 return false; 1604 } 1605 1606 static bool 1607 vu_set_postcopy_advise(VuDev *dev, VhostUserMsg *vmsg) 1608 { 1609 #ifdef UFFDIO_API 1610 struct uffdio_api api_struct; 1611 1612 dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); 1613 vmsg->size = 0; 1614 #else 1615 dev->postcopy_ufd = -1; 1616 #endif 1617 1618 if (dev->postcopy_ufd == -1) { 1619 vu_panic(dev, "Userfaultfd not available: %s", strerror(errno)); 1620 goto out; 1621 } 1622 1623 #ifdef UFFDIO_API 1624 api_struct.api = UFFD_API; 1625 api_struct.features = 0; 1626 if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) { 1627 vu_panic(dev, "Failed UFFDIO_API: %s", strerror(errno)); 1628 close(dev->postcopy_ufd); 1629 dev->postcopy_ufd = -1; 1630 goto out; 1631 } 1632 /* TODO: Stash feature flags somewhere */ 1633 #endif 1634 1635 out: 1636 /* Return a ufd to the QEMU */ 1637 vmsg->fd_num = 1; 1638 vmsg->fds[0] = dev->postcopy_ufd; 1639 return true; /* = send a reply */ 1640 } 1641 1642 static bool 1643 vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg) 1644 { 1645 if (dev->nregions) { 1646 vu_panic(dev, "Regions already registered at postcopy-listen"); 1647 vmsg_set_reply_u64(vmsg, -1); 1648 return true; 1649 } 1650 dev->postcopy_listening = true; 1651 1652 vmsg_set_reply_u64(vmsg, 0); 1653 return true; 1654 } 1655 1656 static bool 1657 vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg) 1658 { 1659 DPRINT("%s: Entry\n", 
__func__); 1660 dev->postcopy_listening = false; 1661 if (dev->postcopy_ufd > 0) { 1662 close(dev->postcopy_ufd); 1663 dev->postcopy_ufd = -1; 1664 DPRINT("%s: Done close\n", __func__); 1665 } 1666 1667 vmsg_set_reply_u64(vmsg, 0); 1668 DPRINT("%s: exit\n", __func__); 1669 return true; 1670 } 1671 1672 static inline uint64_t 1673 vu_inflight_queue_size(uint16_t queue_size) 1674 { 1675 return ALIGN_UP(sizeof(VuDescStateSplit) * queue_size + 1676 sizeof(uint16_t), INFLIGHT_ALIGNMENT); 1677 } 1678 1679 #ifdef MFD_ALLOW_SEALING 1680 static void * 1681 memfd_alloc(const char *name, size_t size, unsigned int flags, int *fd) 1682 { 1683 void *ptr; 1684 int ret; 1685 1686 *fd = memfd_create(name, MFD_ALLOW_SEALING); 1687 if (*fd < 0) { 1688 return NULL; 1689 } 1690 1691 ret = ftruncate(*fd, size); 1692 if (ret < 0) { 1693 close(*fd); 1694 return NULL; 1695 } 1696 1697 ret = fcntl(*fd, F_ADD_SEALS, flags); 1698 if (ret < 0) { 1699 close(*fd); 1700 return NULL; 1701 } 1702 1703 ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, *fd, 0); 1704 if (ptr == MAP_FAILED) { 1705 close(*fd); 1706 return NULL; 1707 } 1708 1709 return ptr; 1710 } 1711 #endif 1712 1713 static bool 1714 vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg) 1715 { 1716 int fd = -1; 1717 void *addr = NULL; 1718 uint64_t mmap_size; 1719 uint16_t num_queues, queue_size; 1720 1721 if (vmsg->size != sizeof(vmsg->payload.inflight)) { 1722 vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size); 1723 vmsg->payload.inflight.mmap_size = 0; 1724 return true; 1725 } 1726 1727 num_queues = vmsg->payload.inflight.num_queues; 1728 queue_size = vmsg->payload.inflight.queue_size; 1729 1730 DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues); 1731 DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size); 1732 1733 mmap_size = vu_inflight_queue_size(queue_size) * num_queues; 1734 1735 #ifdef MFD_ALLOW_SEALING 1736 addr = memfd_alloc("vhost-inflight", mmap_size, 1737 F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, 1738 &fd); 1739 #else 1740 vu_panic(dev, "Not implemented: memfd support is missing"); 1741 #endif 1742 1743 if (!addr) { 1744 vu_panic(dev, "Failed to alloc vhost inflight area"); 1745 vmsg->payload.inflight.mmap_size = 0; 1746 return true; 1747 } 1748 1749 memset(addr, 0, mmap_size); 1750 1751 dev->inflight_info.addr = addr; 1752 dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size; 1753 dev->inflight_info.fd = vmsg->fds[0] = fd; 1754 vmsg->fd_num = 1; 1755 vmsg->payload.inflight.mmap_offset = 0; 1756 1757 DPRINT("send inflight mmap_size: %"PRId64"\n", 1758 vmsg->payload.inflight.mmap_size); 1759 DPRINT("send inflight mmap offset: %"PRId64"\n", 1760 vmsg->payload.inflight.mmap_offset); 1761 1762 return true; 1763 } 1764 1765 static bool 1766 vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg) 1767 { 1768 int fd, i; 1769 uint64_t mmap_size, mmap_offset; 1770 uint16_t num_queues, queue_size; 1771 void *rc; 1772 1773 if (vmsg->fd_num != 1 || 1774 vmsg->size != sizeof(vmsg->payload.inflight)) { 1775 vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d", 1776 vmsg->size, vmsg->fd_num); 1777 return false; 1778 } 1779 1780 fd = vmsg->fds[0]; 1781 mmap_size = vmsg->payload.inflight.mmap_size; 1782 mmap_offset = vmsg->payload.inflight.mmap_offset; 1783 num_queues = vmsg->payload.inflight.num_queues; 1784 queue_size = vmsg->payload.inflight.queue_size; 1785 1786 DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size); 1787 DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset); 1788 
DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues); 1789 DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size); 1790 1791 rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 1792 fd, mmap_offset); 1793 1794 if (rc == MAP_FAILED) { 1795 vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno)); 1796 return false; 1797 } 1798 1799 if (dev->inflight_info.fd) { 1800 close(dev->inflight_info.fd); 1801 } 1802 1803 if (dev->inflight_info.addr) { 1804 munmap(dev->inflight_info.addr, dev->inflight_info.size); 1805 } 1806 1807 dev->inflight_info.fd = fd; 1808 dev->inflight_info.addr = rc; 1809 dev->inflight_info.size = mmap_size; 1810 1811 for (i = 0; i < num_queues; i++) { 1812 dev->vq[i].inflight = (VuVirtqInflight *)rc; 1813 dev->vq[i].inflight->desc_num = queue_size; 1814 rc = (void *)((char *)rc + vu_inflight_queue_size(queue_size)); 1815 } 1816 1817 return false; 1818 } 1819 1820 static bool 1821 vu_handle_vring_kick(VuDev *dev, VhostUserMsg *vmsg) 1822 { 1823 unsigned int index = vmsg->payload.state.index; 1824 1825 if (index >= dev->max_queues) { 1826 vu_panic(dev, "Invalid queue index: %u", index); 1827 return false; 1828 } 1829 1830 DPRINT("Got kick message: handler:%p idx:%u\n", 1831 dev->vq[index].handler, index); 1832 1833 if (!dev->vq[index].started) { 1834 dev->vq[index].started = true; 1835 1836 if (dev->iface->queue_set_started) { 1837 dev->iface->queue_set_started(dev, index, true); 1838 } 1839 } 1840 1841 if (dev->vq[index].handler) { 1842 dev->vq[index].handler(dev, index); 1843 } 1844 1845 return false; 1846 } 1847 1848 static bool vu_handle_get_max_memslots(VuDev *dev, VhostUserMsg *vmsg) 1849 { 1850 vmsg_set_reply_u64(vmsg, VHOST_USER_MAX_RAM_SLOTS); 1851 1852 DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_USER_MAX_RAM_SLOTS); 1853 1854 return true; 1855 } 1856 1857 static bool 1858 vu_process_message(VuDev *dev, VhostUserMsg *vmsg) 1859 { 1860 int do_reply = 0; 1861 1862 /* Print out generic part of the request. 
*/ 1863 DPRINT("================ Vhost user message ================\n"); 1864 DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request), 1865 vmsg->request); 1866 DPRINT("Flags: 0x%x\n", vmsg->flags); 1867 DPRINT("Size: %u\n", vmsg->size); 1868 1869 if (vmsg->fd_num) { 1870 int i; 1871 DPRINT("Fds:"); 1872 for (i = 0; i < vmsg->fd_num; i++) { 1873 DPRINT(" %d", vmsg->fds[i]); 1874 } 1875 DPRINT("\n"); 1876 } 1877 1878 if (dev->iface->process_msg && 1879 dev->iface->process_msg(dev, vmsg, &do_reply)) { 1880 return do_reply; 1881 } 1882 1883 switch (vmsg->request) { 1884 case VHOST_USER_GET_FEATURES: 1885 return vu_get_features_exec(dev, vmsg); 1886 case VHOST_USER_SET_FEATURES: 1887 return vu_set_features_exec(dev, vmsg); 1888 case VHOST_USER_GET_PROTOCOL_FEATURES: 1889 return vu_get_protocol_features_exec(dev, vmsg); 1890 case VHOST_USER_SET_PROTOCOL_FEATURES: 1891 return vu_set_protocol_features_exec(dev, vmsg); 1892 case VHOST_USER_SET_OWNER: 1893 return vu_set_owner_exec(dev, vmsg); 1894 case VHOST_USER_RESET_OWNER: 1895 return vu_reset_device_exec(dev, vmsg); 1896 case VHOST_USER_SET_MEM_TABLE: 1897 return vu_set_mem_table_exec(dev, vmsg); 1898 case VHOST_USER_SET_LOG_BASE: 1899 return vu_set_log_base_exec(dev, vmsg); 1900 case VHOST_USER_SET_LOG_FD: 1901 return vu_set_log_fd_exec(dev, vmsg); 1902 case VHOST_USER_SET_VRING_NUM: 1903 return vu_set_vring_num_exec(dev, vmsg); 1904 case VHOST_USER_SET_VRING_ADDR: 1905 return vu_set_vring_addr_exec(dev, vmsg); 1906 case VHOST_USER_SET_VRING_BASE: 1907 return vu_set_vring_base_exec(dev, vmsg); 1908 case VHOST_USER_GET_VRING_BASE: 1909 return vu_get_vring_base_exec(dev, vmsg); 1910 case VHOST_USER_SET_VRING_KICK: 1911 return vu_set_vring_kick_exec(dev, vmsg); 1912 case VHOST_USER_SET_VRING_CALL: 1913 return vu_set_vring_call_exec(dev, vmsg); 1914 case VHOST_USER_SET_VRING_ERR: 1915 return vu_set_vring_err_exec(dev, vmsg); 1916 case VHOST_USER_GET_QUEUE_NUM: 1917 return vu_get_queue_num_exec(dev, vmsg); 1918 case VHOST_USER_SET_VRING_ENABLE: 1919 return vu_set_vring_enable_exec(dev, vmsg); 1920 case VHOST_USER_SET_BACKEND_REQ_FD: 1921 return vu_set_backend_req_fd(dev, vmsg); 1922 case VHOST_USER_GET_CONFIG: 1923 return vu_get_config(dev, vmsg); 1924 case VHOST_USER_SET_CONFIG: 1925 return vu_set_config(dev, vmsg); 1926 case VHOST_USER_NONE: 1927 /* if you need processing before exit, override iface->process_msg */ 1928 exit(0); 1929 case VHOST_USER_POSTCOPY_ADVISE: 1930 return vu_set_postcopy_advise(dev, vmsg); 1931 case VHOST_USER_POSTCOPY_LISTEN: 1932 return vu_set_postcopy_listen(dev, vmsg); 1933 case VHOST_USER_POSTCOPY_END: 1934 return vu_set_postcopy_end(dev, vmsg); 1935 case VHOST_USER_GET_INFLIGHT_FD: 1936 return vu_get_inflight_fd(dev, vmsg); 1937 case VHOST_USER_SET_INFLIGHT_FD: 1938 return vu_set_inflight_fd(dev, vmsg); 1939 case VHOST_USER_VRING_KICK: 1940 return vu_handle_vring_kick(dev, vmsg); 1941 case VHOST_USER_GET_MAX_MEM_SLOTS: 1942 return vu_handle_get_max_memslots(dev, vmsg); 1943 case VHOST_USER_ADD_MEM_REG: 1944 return vu_add_mem_reg(dev, vmsg); 1945 case VHOST_USER_REM_MEM_REG: 1946 return vu_rem_mem_reg(dev, vmsg); 1947 default: 1948 vmsg_close_fds(vmsg); 1949 vu_panic(dev, "Unhandled request: %d", vmsg->request); 1950 } 1951 1952 return false; 1953 } 1954 1955 bool 1956 vu_dispatch(VuDev *dev) 1957 { 1958 VhostUserMsg vmsg = { 0, }; 1959 int reply_requested; 1960 bool need_reply, success = false; 1961 1962 if (!dev->read_msg(dev, dev->sock, &vmsg)) { 1963 goto end; 1964 } 1965 1966 need_reply = vmsg.flags & 
VHOST_USER_NEED_REPLY_MASK; 1967 1968 reply_requested = vu_process_message(dev, &vmsg); 1969 if (!reply_requested && need_reply) { 1970 vmsg_set_reply_u64(&vmsg, 0); 1971 reply_requested = 1; 1972 } 1973 1974 if (!reply_requested) { 1975 success = true; 1976 goto end; 1977 } 1978 1979 if (!vu_send_reply(dev, dev->sock, &vmsg)) { 1980 goto end; 1981 } 1982 1983 success = true; 1984 1985 end: 1986 free(vmsg.data); 1987 return success; 1988 } 1989 1990 void 1991 vu_deinit(VuDev *dev) 1992 { 1993 unsigned int i; 1994 1995 for (i = 0; i < dev->nregions; i++) { 1996 VuDevRegion *r = &dev->regions[i]; 1997 void *m = (void *) (uintptr_t) r->mmap_addr; 1998 if (m != MAP_FAILED) { 1999 munmap(m, r->size + r->mmap_offset); 2000 } 2001 } 2002 dev->nregions = 0; 2003 2004 for (i = 0; i < dev->max_queues; i++) { 2005 VuVirtq *vq = &dev->vq[i]; 2006 2007 if (vq->call_fd != -1) { 2008 close(vq->call_fd); 2009 vq->call_fd = -1; 2010 } 2011 2012 if (vq->kick_fd != -1) { 2013 dev->remove_watch(dev, vq->kick_fd); 2014 close(vq->kick_fd); 2015 vq->kick_fd = -1; 2016 } 2017 2018 if (vq->err_fd != -1) { 2019 close(vq->err_fd); 2020 vq->err_fd = -1; 2021 } 2022 2023 if (vq->resubmit_list) { 2024 free(vq->resubmit_list); 2025 vq->resubmit_list = NULL; 2026 } 2027 2028 vq->inflight = NULL; 2029 } 2030 2031 if (dev->inflight_info.addr) { 2032 munmap(dev->inflight_info.addr, dev->inflight_info.size); 2033 dev->inflight_info.addr = NULL; 2034 } 2035 2036 if (dev->inflight_info.fd > 0) { 2037 close(dev->inflight_info.fd); 2038 dev->inflight_info.fd = -1; 2039 } 2040 2041 vu_close_log(dev); 2042 if (dev->backend_fd != -1) { 2043 close(dev->backend_fd); 2044 dev->backend_fd = -1; 2045 } 2046 pthread_mutex_destroy(&dev->backend_mutex); 2047 2048 if (dev->sock != -1) { 2049 close(dev->sock); 2050 } 2051 2052 free(dev->vq); 2053 dev->vq = NULL; 2054 } 2055 2056 bool 2057 vu_init(VuDev *dev, 2058 uint16_t max_queues, 2059 int socket, 2060 vu_panic_cb panic, 2061 vu_read_msg_cb read_msg, 2062 vu_set_watch_cb set_watch, 2063 vu_remove_watch_cb remove_watch, 2064 const VuDevIface *iface) 2065 { 2066 uint16_t i; 2067 2068 assert(max_queues > 0); 2069 assert(socket >= 0); 2070 assert(set_watch); 2071 assert(remove_watch); 2072 assert(iface); 2073 assert(panic); 2074 2075 memset(dev, 0, sizeof(*dev)); 2076 2077 dev->sock = socket; 2078 dev->panic = panic; 2079 dev->read_msg = read_msg ? 
read_msg : vu_message_read_default; 2080 dev->set_watch = set_watch; 2081 dev->remove_watch = remove_watch; 2082 dev->iface = iface; 2083 dev->log_call_fd = -1; 2084 pthread_mutex_init(&dev->backend_mutex, NULL); 2085 dev->backend_fd = -1; 2086 dev->max_queues = max_queues; 2087 2088 dev->vq = malloc(max_queues * sizeof(dev->vq[0])); 2089 if (!dev->vq) { 2090 DPRINT("%s: failed to malloc virtqueues\n", __func__); 2091 return false; 2092 } 2093 2094 for (i = 0; i < max_queues; i++) { 2095 dev->vq[i] = (VuVirtq) { 2096 .call_fd = -1, .kick_fd = -1, .err_fd = -1, 2097 .notification = true, 2098 }; 2099 } 2100 2101 return true; 2102 } 2103 2104 VuVirtq * 2105 vu_get_queue(VuDev *dev, int qidx) 2106 { 2107 assert(qidx < dev->max_queues); 2108 return &dev->vq[qidx]; 2109 } 2110 2111 bool 2112 vu_queue_enabled(VuDev *dev, VuVirtq *vq) 2113 { 2114 return vq->enable; 2115 } 2116 2117 bool 2118 vu_queue_started(const VuDev *dev, const VuVirtq *vq) 2119 { 2120 return vq->started; 2121 } 2122 2123 static inline uint16_t 2124 vring_avail_flags(VuVirtq *vq) 2125 { 2126 return le16toh(vq->vring.avail->flags); 2127 } 2128 2129 static inline uint16_t 2130 vring_avail_idx(VuVirtq *vq) 2131 { 2132 vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); 2133 2134 return vq->shadow_avail_idx; 2135 } 2136 2137 static inline uint16_t 2138 vring_avail_ring(VuVirtq *vq, int i) 2139 { 2140 return le16toh(vq->vring.avail->ring[i]); 2141 } 2142 2143 static inline uint16_t 2144 vring_get_used_event(VuVirtq *vq) 2145 { 2146 return vring_avail_ring(vq, vq->vring.num); 2147 } 2148 2149 static int 2150 virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx) 2151 { 2152 uint16_t num_heads = vring_avail_idx(vq) - idx; 2153 2154 /* Check it isn't doing very strange things with descriptor numbers. */ 2155 if (num_heads > vq->vring.num) { 2156 vu_panic(dev, "Guest moved used index from %u to %u", 2157 idx, vq->shadow_avail_idx); 2158 return -1; 2159 } 2160 if (num_heads) { 2161 /* On success, callers read a descriptor at vq->last_avail_idx. 2162 * Make sure descriptor read does not bypass avail index read. */ 2163 smp_rmb(); 2164 } 2165 2166 return num_heads; 2167 } 2168 2169 static bool 2170 virtqueue_get_head(VuDev *dev, VuVirtq *vq, 2171 unsigned int idx, unsigned int *head) 2172 { 2173 /* Grab the next descriptor number they're advertising, and increment 2174 * the index we've seen. */ 2175 *head = vring_avail_ring(vq, idx % vq->vring.num); 2176 2177 /* If their number is silly, that's a fatal mistake. 
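 * Each entry in the avail ring is an index into the descriptor table, so a
 * value >= vring.num cannot name a real descriptor and can only come from a
 * buggy or malicious driver.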
*/ 2178 if (*head >= vq->vring.num) { 2179 vu_panic(dev, "Guest says index %u is available", *head); 2180 return false; 2181 } 2182 2183 return true; 2184 } 2185 2186 static int 2187 virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc, 2188 uint64_t addr, size_t len) 2189 { 2190 struct vring_desc *ori_desc; 2191 uint64_t read_len; 2192 2193 if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) { 2194 return -1; 2195 } 2196 2197 if (len == 0) { 2198 return -1; 2199 } 2200 2201 while (len) { 2202 read_len = len; 2203 ori_desc = vu_gpa_to_va(dev, &read_len, addr); 2204 if (!ori_desc) { 2205 return -1; 2206 } 2207 2208 memcpy(desc, ori_desc, read_len); 2209 len -= read_len; 2210 addr += read_len; 2211 desc += read_len; 2212 } 2213 2214 return 0; 2215 } 2216 2217 enum { 2218 VIRTQUEUE_READ_DESC_ERROR = -1, 2219 VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */ 2220 VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */ 2221 }; 2222 2223 static int 2224 virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc, 2225 int i, unsigned int max, unsigned int *next) 2226 { 2227 /* If this descriptor says it doesn't chain, we're done. */ 2228 if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) { 2229 return VIRTQUEUE_READ_DESC_DONE; 2230 } 2231 2232 /* Check they're not leading us off end of descriptors. */ 2233 *next = le16toh(desc[i].next); 2234 /* Make sure compiler knows to grab that: we don't want it changing! */ 2235 smp_wmb(); 2236 2237 if (*next >= max) { 2238 vu_panic(dev, "Desc next is %u", *next); 2239 return VIRTQUEUE_READ_DESC_ERROR; 2240 } 2241 2242 return VIRTQUEUE_READ_DESC_MORE; 2243 } 2244 2245 void 2246 vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes, 2247 unsigned int *out_bytes, 2248 unsigned max_in_bytes, unsigned max_out_bytes) 2249 { 2250 unsigned int idx; 2251 unsigned int total_bufs, in_total, out_total; 2252 int rc; 2253 2254 idx = vq->last_avail_idx; 2255 2256 total_bufs = in_total = out_total = 0; 2257 if (unlikely(dev->broken) || 2258 unlikely(!vq->vring.avail)) { 2259 goto done; 2260 } 2261 2262 while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) { 2263 unsigned int max, desc_len, num_bufs, indirect = 0; 2264 uint64_t desc_addr, read_len; 2265 struct vring_desc *desc; 2266 struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; 2267 unsigned int i; 2268 2269 max = vq->vring.num; 2270 num_bufs = total_bufs; 2271 if (!virtqueue_get_head(dev, vq, idx++, &i)) { 2272 goto err; 2273 } 2274 desc = vq->vring.desc; 2275 2276 if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) { 2277 if (le32toh(desc[i].len) % sizeof(struct vring_desc)) { 2278 vu_panic(dev, "Invalid size for indirect buffer table"); 2279 goto err; 2280 } 2281 2282 /* If we've got too many, that implies a descriptor loop. 
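 * A legal chain can reference at most vring.num descriptors (and an indirect
 * table at most its own entry count), so going past that bound means the
 * next pointers must revisit a descriptor.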
*/ 2283 if (num_bufs >= max) { 2284 vu_panic(dev, "Looped descriptor"); 2285 goto err; 2286 } 2287 2288 /* loop over the indirect descriptor table */ 2289 indirect = 1; 2290 desc_addr = le64toh(desc[i].addr); 2291 desc_len = le32toh(desc[i].len); 2292 max = desc_len / sizeof(struct vring_desc); 2293 read_len = desc_len; 2294 desc = vu_gpa_to_va(dev, &read_len, desc_addr); 2295 if (unlikely(desc && read_len != desc_len)) { 2296 /* Failed to use zero copy */ 2297 desc = NULL; 2298 if (!virtqueue_read_indirect_desc(dev, desc_buf, 2299 desc_addr, 2300 desc_len)) { 2301 desc = desc_buf; 2302 } 2303 } 2304 if (!desc) { 2305 vu_panic(dev, "Invalid indirect buffer table"); 2306 goto err; 2307 } 2308 num_bufs = i = 0; 2309 } 2310 2311 do { 2312 /* If we've got too many, that implies a descriptor loop. */ 2313 if (++num_bufs > max) { 2314 vu_panic(dev, "Looped descriptor"); 2315 goto err; 2316 } 2317 2318 if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) { 2319 in_total += le32toh(desc[i].len); 2320 } else { 2321 out_total += le32toh(desc[i].len); 2322 } 2323 if (in_total >= max_in_bytes && out_total >= max_out_bytes) { 2324 goto done; 2325 } 2326 rc = virtqueue_read_next_desc(dev, desc, i, max, &i); 2327 } while (rc == VIRTQUEUE_READ_DESC_MORE); 2328 2329 if (rc == VIRTQUEUE_READ_DESC_ERROR) { 2330 goto err; 2331 } 2332 2333 if (!indirect) { 2334 total_bufs = num_bufs; 2335 } else { 2336 total_bufs++; 2337 } 2338 } 2339 if (rc < 0) { 2340 goto err; 2341 } 2342 done: 2343 if (in_bytes) { 2344 *in_bytes = in_total; 2345 } 2346 if (out_bytes) { 2347 *out_bytes = out_total; 2348 } 2349 return; 2350 2351 err: 2352 in_total = out_total = 0; 2353 goto done; 2354 } 2355 2356 bool 2357 vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes, 2358 unsigned int out_bytes) 2359 { 2360 unsigned int in_total, out_total; 2361 2362 vu_queue_get_avail_bytes(dev, vq, &in_total, &out_total, 2363 in_bytes, out_bytes); 2364 2365 return in_bytes <= in_total && out_bytes <= out_total; 2366 } 2367 2368 /* Fetch avail_idx from VQ memory only when we really need to know if 2369 * guest has added some buffers. */ 2370 bool 2371 vu_queue_empty(VuDev *dev, VuVirtq *vq) 2372 { 2373 if (unlikely(dev->broken) || 2374 unlikely(!vq->vring.avail)) { 2375 return true; 2376 } 2377 2378 if (vq->shadow_avail_idx != vq->last_avail_idx) { 2379 return false; 2380 } 2381 2382 return vring_avail_idx(vq) == vq->last_avail_idx; 2383 } 2384 2385 static bool 2386 vring_notify(VuDev *dev, VuVirtq *vq) 2387 { 2388 uint16_t old, new; 2389 bool v; 2390 2391 /* We need to expose used array entries before checking used event. 
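 * With VIRTIO_RING_F_EVENT_IDX negotiated, the driver publishes a used_event
 * index and only expects a notification once used_idx moves past it;
 * vring_need_event() below is the standard wrap-safe test
 * (uint16_t)(new - event - 1) < (uint16_t)(new - old).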
*/ 2392 smp_mb(); 2393 2394 /* Always notify when queue is empty (when feature acknowledge) */ 2395 if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && 2396 !vq->inuse && vu_queue_empty(dev, vq)) { 2397 return true; 2398 } 2399 2400 if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { 2401 return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); 2402 } 2403 2404 v = vq->signalled_used_valid; 2405 vq->signalled_used_valid = true; 2406 old = vq->signalled_used; 2407 new = vq->signalled_used = vq->used_idx; 2408 return !v || vring_need_event(vring_get_used_event(vq), new, old); 2409 } 2410 2411 static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync) 2412 { 2413 if (unlikely(dev->broken) || 2414 unlikely(!vq->vring.avail)) { 2415 return; 2416 } 2417 2418 if (!vring_notify(dev, vq)) { 2419 DPRINT("skipped notify...\n"); 2420 return; 2421 } 2422 2423 if (vq->call_fd < 0 && 2424 vu_has_protocol_feature(dev, 2425 VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) && 2426 vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_BACKEND_REQ)) { 2427 VhostUserMsg vmsg = { 2428 .request = VHOST_USER_BACKEND_VRING_CALL, 2429 .flags = VHOST_USER_VERSION, 2430 .size = sizeof(vmsg.payload.state), 2431 .payload.state = { 2432 .index = vq - dev->vq, 2433 }, 2434 }; 2435 bool ack = sync && 2436 vu_has_protocol_feature(dev, 2437 VHOST_USER_PROTOCOL_F_REPLY_ACK); 2438 2439 if (ack) { 2440 vmsg.flags |= VHOST_USER_NEED_REPLY_MASK; 2441 } 2442 2443 vu_message_write(dev, dev->backend_fd, &vmsg); 2444 if (ack) { 2445 vu_message_read_default(dev, dev->backend_fd, &vmsg); 2446 } 2447 return; 2448 } 2449 2450 if (eventfd_write(vq->call_fd, 1) < 0) { 2451 vu_panic(dev, "Error writing eventfd: %s", strerror(errno)); 2452 } 2453 } 2454 2455 void vu_queue_notify(VuDev *dev, VuVirtq *vq) 2456 { 2457 _vu_queue_notify(dev, vq, false); 2458 } 2459 2460 void vu_queue_notify_sync(VuDev *dev, VuVirtq *vq) 2461 { 2462 _vu_queue_notify(dev, vq, true); 2463 } 2464 2465 void vu_config_change_msg(VuDev *dev) 2466 { 2467 VhostUserMsg vmsg = { 2468 .request = VHOST_USER_BACKEND_CONFIG_CHANGE_MSG, 2469 .flags = VHOST_USER_VERSION, 2470 }; 2471 2472 vu_message_write(dev, dev->backend_fd, &vmsg); 2473 } 2474 2475 static inline void 2476 vring_used_flags_set_bit(VuVirtq *vq, int mask) 2477 { 2478 uint16_t *flags; 2479 2480 flags = (uint16_t *)((char*)vq->vring.used + 2481 offsetof(struct vring_used, flags)); 2482 *flags = htole16(le16toh(*flags) | mask); 2483 } 2484 2485 static inline void 2486 vring_used_flags_unset_bit(VuVirtq *vq, int mask) 2487 { 2488 uint16_t *flags; 2489 2490 flags = (uint16_t *)((char*)vq->vring.used + 2491 offsetof(struct vring_used, flags)); 2492 *flags = htole16(le16toh(*flags) & ~mask); 2493 } 2494 2495 static inline void 2496 vring_set_avail_event(VuVirtq *vq, uint16_t val) 2497 { 2498 uint16_t val_le = htole16(val); 2499 2500 if (!vq->notification) { 2501 return; 2502 } 2503 2504 memcpy(&vq->vring.used->ring[vq->vring.num], &val_le, sizeof(uint16_t)); 2505 } 2506 2507 void 2508 vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable) 2509 { 2510 vq->notification = enable; 2511 if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { 2512 vring_set_avail_event(vq, vring_avail_idx(vq)); 2513 } else if (enable) { 2514 vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY); 2515 } else { 2516 vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY); 2517 } 2518 if (enable) { 2519 /* Expose avail event/used flags before caller checks the avail idx. 
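 * Without this barrier, the store that re-enables notifications could be
 * reordered after the caller's emptiness check: the driver could add a
 * buffer in that window, still observe notifications as disabled, and skip
 * the kick we are now relying on.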
*/ 2520 smp_mb(); 2521 } 2522 } 2523 2524 static bool 2525 virtqueue_map_desc(VuDev *dev, 2526 unsigned int *p_num_sg, struct iovec *iov, 2527 unsigned int max_num_sg, bool is_write, 2528 uint64_t pa, size_t sz) 2529 { 2530 unsigned num_sg = *p_num_sg; 2531 2532 assert(num_sg <= max_num_sg); 2533 2534 if (!sz) { 2535 vu_panic(dev, "virtio: zero sized buffers are not allowed"); 2536 return false; 2537 } 2538 2539 while (sz) { 2540 uint64_t len = sz; 2541 2542 if (num_sg == max_num_sg) { 2543 vu_panic(dev, "virtio: too many descriptors in indirect table"); 2544 return false; 2545 } 2546 2547 iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa); 2548 if (iov[num_sg].iov_base == NULL) { 2549 vu_panic(dev, "virtio: invalid address for buffers"); 2550 return false; 2551 } 2552 iov[num_sg].iov_len = len; 2553 num_sg++; 2554 sz -= len; 2555 pa += len; 2556 } 2557 2558 *p_num_sg = num_sg; 2559 return true; 2560 } 2561 2562 static void * 2563 virtqueue_alloc_element(size_t sz, 2564 unsigned out_num, unsigned in_num) 2565 { 2566 VuVirtqElement *elem; 2567 size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0])); 2568 size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]); 2569 size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]); 2570 2571 assert(sz >= sizeof(VuVirtqElement)); 2572 elem = malloc(out_sg_end); 2573 if (!elem) { 2574 DPRINT("%s: failed to malloc virtqueue element\n", __func__); 2575 return NULL; 2576 } 2577 elem->out_num = out_num; 2578 elem->in_num = in_num; 2579 elem->in_sg = (void *)elem + in_sg_ofs; 2580 elem->out_sg = (void *)elem + out_sg_ofs; 2581 return elem; 2582 } 2583 2584 static void * 2585 vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz) 2586 { 2587 struct vring_desc *desc = vq->vring.desc; 2588 uint64_t desc_addr, read_len; 2589 unsigned int desc_len; 2590 unsigned int max = vq->vring.num; 2591 unsigned int i = idx; 2592 VuVirtqElement *elem; 2593 unsigned int out_num = 0, in_num = 0; 2594 struct iovec iov[VIRTQUEUE_MAX_SIZE]; 2595 struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; 2596 int rc; 2597 2598 if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) { 2599 if (le32toh(desc[i].len) % sizeof(struct vring_desc)) { 2600 vu_panic(dev, "Invalid size for indirect buffer table"); 2601 return NULL; 2602 } 2603 2604 /* loop over the indirect descriptor table */ 2605 desc_addr = le64toh(desc[i].addr); 2606 desc_len = le32toh(desc[i].len); 2607 max = desc_len / sizeof(struct vring_desc); 2608 read_len = desc_len; 2609 desc = vu_gpa_to_va(dev, &read_len, desc_addr); 2610 if (unlikely(desc && read_len != desc_len)) { 2611 /* Failed to use zero copy */ 2612 desc = NULL; 2613 if (!virtqueue_read_indirect_desc(dev, desc_buf, 2614 desc_addr, 2615 desc_len)) { 2616 desc = desc_buf; 2617 } 2618 } 2619 if (!desc) { 2620 vu_panic(dev, "Invalid indirect buffer table"); 2621 return NULL; 2622 } 2623 i = 0; 2624 } 2625 2626 /* Collect all the descriptors */ 2627 do { 2628 if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) { 2629 if (!virtqueue_map_desc(dev, &in_num, iov + out_num, 2630 VIRTQUEUE_MAX_SIZE - out_num, true, 2631 le64toh(desc[i].addr), 2632 le32toh(desc[i].len))) { 2633 return NULL; 2634 } 2635 } else { 2636 if (in_num) { 2637 vu_panic(dev, "Incorrect order for descriptors"); 2638 return NULL; 2639 } 2640 if (!virtqueue_map_desc(dev, &out_num, iov, 2641 VIRTQUEUE_MAX_SIZE, false, 2642 le64toh(desc[i].addr), 2643 le32toh(desc[i].len))) { 2644 return NULL; 2645 } 2646 } 2647 2648 /* If we've got too many, that implies a descriptor loop. 
*/ 2649 if ((in_num + out_num) > max) { 2650 vu_panic(dev, "Looped descriptor"); 2651 return NULL; 2652 } 2653 rc = virtqueue_read_next_desc(dev, desc, i, max, &i); 2654 } while (rc == VIRTQUEUE_READ_DESC_MORE); 2655 2656 if (rc == VIRTQUEUE_READ_DESC_ERROR) { 2657 vu_panic(dev, "read descriptor error"); 2658 return NULL; 2659 } 2660 2661 /* Now copy what we have collected and mapped */ 2662 elem = virtqueue_alloc_element(sz, out_num, in_num); 2663 if (!elem) { 2664 return NULL; 2665 } 2666 elem->index = idx; 2667 for (i = 0; i < out_num; i++) { 2668 elem->out_sg[i] = iov[i]; 2669 } 2670 for (i = 0; i < in_num; i++) { 2671 elem->in_sg[i] = iov[out_num + i]; 2672 } 2673 2674 return elem; 2675 } 2676 2677 static int 2678 vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx) 2679 { 2680 if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) { 2681 return 0; 2682 } 2683 2684 if (unlikely(!vq->inflight)) { 2685 return -1; 2686 } 2687 2688 vq->inflight->desc[desc_idx].counter = vq->counter++; 2689 vq->inflight->desc[desc_idx].inflight = 1; 2690 2691 return 0; 2692 } 2693 2694 static int 2695 vu_queue_inflight_pre_put(VuDev *dev, VuVirtq *vq, int desc_idx) 2696 { 2697 if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) { 2698 return 0; 2699 } 2700 2701 if (unlikely(!vq->inflight)) { 2702 return -1; 2703 } 2704 2705 vq->inflight->last_batch_head = desc_idx; 2706 2707 return 0; 2708 } 2709 2710 static int 2711 vu_queue_inflight_post_put(VuDev *dev, VuVirtq *vq, int desc_idx) 2712 { 2713 if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) { 2714 return 0; 2715 } 2716 2717 if (unlikely(!vq->inflight)) { 2718 return -1; 2719 } 2720 2721 barrier(); 2722 2723 vq->inflight->desc[desc_idx].inflight = 0; 2724 2725 barrier(); 2726 2727 vq->inflight->used_idx = vq->used_idx; 2728 2729 return 0; 2730 } 2731 2732 void * 2733 vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz) 2734 { 2735 int i; 2736 unsigned int head; 2737 VuVirtqElement *elem; 2738 2739 if (unlikely(dev->broken) || 2740 unlikely(!vq->vring.avail)) { 2741 return NULL; 2742 } 2743 2744 if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) { 2745 i = (--vq->resubmit_num); 2746 elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz); 2747 2748 if (!vq->resubmit_num) { 2749 free(vq->resubmit_list); 2750 vq->resubmit_list = NULL; 2751 } 2752 2753 return elem; 2754 } 2755 2756 if (vu_queue_empty(dev, vq)) { 2757 return NULL; 2758 } 2759 /* 2760 * Needed after virtio_queue_empty(), see comment in 2761 * virtqueue_num_heads(). 
2762 */ 2763 smp_rmb(); 2764 2765 if (vq->inuse >= vq->vring.num) { 2766 vu_panic(dev, "Virtqueue size exceeded"); 2767 return NULL; 2768 } 2769 2770 if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) { 2771 return NULL; 2772 } 2773 2774 if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { 2775 vring_set_avail_event(vq, vq->last_avail_idx); 2776 } 2777 2778 elem = vu_queue_map_desc(dev, vq, head, sz); 2779 2780 if (!elem) { 2781 return NULL; 2782 } 2783 2784 vq->inuse++; 2785 2786 vu_queue_inflight_get(dev, vq, head); 2787 2788 return elem; 2789 } 2790 2791 static void 2792 vu_queue_detach_element(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem, 2793 size_t len) 2794 { 2795 vq->inuse--; 2796 /* unmap, when DMA support is added */ 2797 } 2798 2799 void 2800 vu_queue_unpop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem, 2801 size_t len) 2802 { 2803 vq->last_avail_idx--; 2804 vu_queue_detach_element(dev, vq, elem, len); 2805 } 2806 2807 bool 2808 vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num) 2809 { 2810 if (num > vq->inuse) { 2811 return false; 2812 } 2813 vq->last_avail_idx -= num; 2814 vq->inuse -= num; 2815 return true; 2816 } 2817 2818 static inline 2819 void vring_used_write(VuDev *dev, VuVirtq *vq, 2820 struct vring_used_elem *uelem, int i) 2821 { 2822 struct vring_used *used = vq->vring.used; 2823 2824 used->ring[i] = *uelem; 2825 vu_log_write(dev, vq->vring.log_guest_addr + 2826 offsetof(struct vring_used, ring[i]), 2827 sizeof(used->ring[i])); 2828 } 2829 2830 2831 static void 2832 vu_log_queue_fill(VuDev *dev, VuVirtq *vq, 2833 const VuVirtqElement *elem, 2834 unsigned int len) 2835 { 2836 struct vring_desc *desc = vq->vring.desc; 2837 unsigned int i, max, min, desc_len; 2838 uint64_t desc_addr, read_len; 2839 struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; 2840 unsigned num_bufs = 0; 2841 2842 max = vq->vring.num; 2843 i = elem->index; 2844 2845 if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) { 2846 if (le32toh(desc[i].len) % sizeof(struct vring_desc)) { 2847 vu_panic(dev, "Invalid size for indirect buffer table"); 2848 return; 2849 } 2850 2851 /* loop over the indirect descriptor table */ 2852 desc_addr = le64toh(desc[i].addr); 2853 desc_len = le32toh(desc[i].len); 2854 max = desc_len / sizeof(struct vring_desc); 2855 read_len = desc_len; 2856 desc = vu_gpa_to_va(dev, &read_len, desc_addr); 2857 if (unlikely(desc && read_len != desc_len)) { 2858 /* Failed to use zero copy */ 2859 desc = NULL; 2860 if (!virtqueue_read_indirect_desc(dev, desc_buf, 2861 desc_addr, 2862 desc_len)) { 2863 desc = desc_buf; 2864 } 2865 } 2866 if (!desc) { 2867 vu_panic(dev, "Invalid indirect buffer table"); 2868 return; 2869 } 2870 i = 0; 2871 } 2872 2873 do { 2874 if (++num_bufs > max) { 2875 vu_panic(dev, "Looped descriptor"); 2876 return; 2877 } 2878 2879 if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) { 2880 min = MIN(le32toh(desc[i].len), len); 2881 vu_log_write(dev, le64toh(desc[i].addr), min); 2882 len -= min; 2883 } 2884 2885 } while (len > 0 && 2886 (virtqueue_read_next_desc(dev, desc, i, max, &i) 2887 == VIRTQUEUE_READ_DESC_MORE)); 2888 } 2889 2890 void 2891 vu_queue_fill(VuDev *dev, VuVirtq *vq, 2892 const VuVirtqElement *elem, 2893 unsigned int len, unsigned int idx) 2894 { 2895 struct vring_used_elem uelem; 2896 2897 if (unlikely(dev->broken) || 2898 unlikely(!vq->vring.avail)) { 2899 return; 2900 } 2901 2902 vu_log_queue_fill(dev, vq, elem, len); 2903 2904 idx = (idx + vq->used_idx) % vq->vring.num; 2905 2906 uelem.id = htole32(elem->index); 2907 uelem.len = 
htole32(len); 2908 vring_used_write(dev, vq, &uelem, idx); 2909 } 2910 2911 static inline 2912 void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val) 2913 { 2914 vq->vring.used->idx = htole16(val); 2915 vu_log_write(dev, 2916 vq->vring.log_guest_addr + offsetof(struct vring_used, idx), 2917 sizeof(vq->vring.used->idx)); 2918 2919 vq->used_idx = val; 2920 } 2921 2922 void 2923 vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count) 2924 { 2925 uint16_t old, new; 2926 2927 if (unlikely(dev->broken) || 2928 unlikely(!vq->vring.avail)) { 2929 return; 2930 } 2931 2932 /* Make sure buffer is written before we update index. */ 2933 smp_wmb(); 2934 2935 old = vq->used_idx; 2936 new = old + count; 2937 vring_used_idx_set(dev, vq, new); 2938 vq->inuse -= count; 2939 if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) { 2940 vq->signalled_used_valid = false; 2941 } 2942 } 2943 2944 void 2945 vu_queue_push(VuDev *dev, VuVirtq *vq, 2946 const VuVirtqElement *elem, unsigned int len) 2947 { 2948 vu_queue_fill(dev, vq, elem, len, 0); 2949 vu_queue_inflight_pre_put(dev, vq, elem->index); 2950 vu_queue_flush(dev, vq, 1); 2951 vu_queue_inflight_post_put(dev, vq, elem->index); 2952 } 2953
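
/*
 * Example (editor's sketch, not part of the library): a minimal backend
 * built on the API above -- vu_init(), a vu_dispatch() loop, and a queue
 * handler using vu_queue_pop()/vu_queue_push()/vu_queue_notify().  The
 * names my_panic, my_set_watch, my_remove_watch, my_iface and handle_queue
 * are hypothetical placeholders for application-provided pieces; callback
 * prototypes are abbreviated here, see libvhost-user.h for the
 * authoritative definitions.  In a real backend, handle_queue() would
 * typically be invoked from the watch callback installed for the queue's
 * kick fd rather than called directly.
 *
 *     static void my_panic(VuDev *dev, const char *err)
 *     {
 *         fprintf(stderr, "vhost-user error: %s\n", err);
 *         exit(EXIT_FAILURE);
 *     }
 *
 *     static void handle_queue(VuDev *dev, int qidx)
 *     {
 *         VuVirtq *vq = vu_get_queue(dev, qidx);
 *         VuVirtqElement *elem;
 *
 *         // vu_queue_pop() returns a malloc'ed element (or NULL when the
 *         // ring is empty); the caller frees it after pushing it back.
 *         while ((elem = vu_queue_pop(dev, vq, sizeof(*elem)))) {
 *             // read elem->out_sg[], write elem->in_sg[] as the device
 *             // requires, then report how many bytes were written
 *             vu_queue_push(dev, vq, elem, 0);
 *             free(elem);
 *         }
 *         vu_queue_notify(dev, vq);
 *     }
 *
 *     int run_backend(int sock)
 *     {
 *         VuDev dev;
 *
 *         // passing NULL for read_msg selects vu_message_read_default()
 *         if (!vu_init(&dev, 1, sock, my_panic, NULL,
 *                      my_set_watch, my_remove_watch, &my_iface)) {
 *             return -1;
 *         }
 *
 *         // each successful vu_dispatch() call handles one vhost-user
 *         // message; it returns false once the connection breaks
 *         while (vu_dispatch(&dev)) {
 *         }
 *
 *         vu_deinit(&dev);
 *         return 0;
 *     }
 */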