xref: /qemu/hw/virtio/vhost-vdpa.c (revision b21e2380)
1 /*
2  * vhost-vdpa
3  *
4  *  Copyright(c) 2017-2018 Intel Corporation.
5  *  Copyright(c) 2020 Red Hat, Inc.
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 #include "qemu/osdep.h"
13 #include <linux/vhost.h>
14 #include <linux/vfio.h>
15 #include <sys/eventfd.h>
16 #include <sys/ioctl.h>
17 #include "hw/virtio/vhost.h"
18 #include "hw/virtio/vhost-backend.h"
19 #include "hw/virtio/virtio-net.h"
20 #include "hw/virtio/vhost-shadow-virtqueue.h"
21 #include "hw/virtio/vhost-vdpa.h"
22 #include "exec/address-spaces.h"
23 #include "qemu/main-loop.h"
24 #include "cpu.h"
25 #include "trace.h"
26 #include "qemu-common.h"
27 #include "qapi/error.h"
28 
29 /*
30  * Return one past the end of the end of section. Be careful with uint64_t
31  * conversions!
32  */
33 static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
34 {
35     Int128 llend = int128_make64(section->offset_within_address_space);
36     llend = int128_add(llend, section->size);
37     llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
38 
39     return llend;
40 }
41 
42 static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
43                                                 uint64_t iova_min,
44                                                 uint64_t iova_max)
45 {
46     Int128 llend;
47 
48     if ((!memory_region_is_ram(section->mr) &&
49          !memory_region_is_iommu(section->mr)) ||
50         memory_region_is_protected(section->mr) ||
51         /* vhost-vDPA doesn't allow MMIO to be mapped  */
52         memory_region_is_ram_device(section->mr)) {
53         return true;
54     }
55 
56     if (section->offset_within_address_space < iova_min) {
57         error_report("RAM section out of device range (min=0x%" PRIx64
58                      ", addr=0x%" HWADDR_PRIx ")",
59                      iova_min, section->offset_within_address_space);
60         return true;
61     }
62 
63     llend = vhost_vdpa_section_end(section);
64     if (int128_gt(llend, int128_make64(iova_max))) {
65         error_report("RAM section out of device range (max=0x%" PRIx64
66                      ", end addr=0x%" PRIx64 ")",
67                      iova_max, int128_get64(llend));
68         return true;
69     }
70 
71     return false;
72 }
73 
74 static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
75                               void *vaddr, bool readonly)
76 {
77     struct vhost_msg_v2 msg = {};
78     int fd = v->device_fd;
79     int ret = 0;
80 
81     msg.type = v->msg_type;
82     msg.iotlb.iova = iova;
83     msg.iotlb.size = size;
84     msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
85     msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
86     msg.iotlb.type = VHOST_IOTLB_UPDATE;
87 
88    trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
89                             msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);
90 
91     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
92         error_report("failed to write, fd=%d, errno=%d (%s)",
93             fd, errno, strerror(errno));
94         return -EIO ;
95     }
96 
97     return ret;
98 }
99 
100 static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova,
101                                 hwaddr size)
102 {
103     struct vhost_msg_v2 msg = {};
104     int fd = v->device_fd;
105     int ret = 0;
106 
107     msg.type = v->msg_type;
108     msg.iotlb.iova = iova;
109     msg.iotlb.size = size;
110     msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
111 
112     trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
113                                msg.iotlb.size, msg.iotlb.type);
114 
115     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
116         error_report("failed to write, fd=%d, errno=%d (%s)",
117             fd, errno, strerror(errno));
118         return -EIO ;
119     }
120 
121     return ret;
122 }
123 
124 static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
125 {
126     int fd = v->device_fd;
127     struct vhost_msg_v2 msg = {
128         .type = v->msg_type,
129         .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
130     };
131 
132     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
133         error_report("failed to write, fd=%d, errno=%d (%s)",
134                      fd, errno, strerror(errno));
135     }
136 }
137 
138 static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
139 {
140     if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
141         !v->iotlb_batch_begin_sent) {
142         vhost_vdpa_listener_begin_batch(v);
143     }
144 
145     v->iotlb_batch_begin_sent = true;
146 }
147 
148 static void vhost_vdpa_listener_commit(MemoryListener *listener)
149 {
150     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
151     struct vhost_dev *dev = v->dev;
152     struct vhost_msg_v2 msg = {};
153     int fd = v->device_fd;
154 
155     if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
156         return;
157     }
158 
159     if (!v->iotlb_batch_begin_sent) {
160         return;
161     }
162 
163     msg.type = v->msg_type;
164     msg.iotlb.type = VHOST_IOTLB_BATCH_END;
165 
166     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
167         error_report("failed to write, fd=%d, errno=%d (%s)",
168                      fd, errno, strerror(errno));
169     }
170 
171     v->iotlb_batch_begin_sent = false;
172 }
173 
174 static void vhost_vdpa_listener_region_add(MemoryListener *listener,
175                                            MemoryRegionSection *section)
176 {
177     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
178     hwaddr iova;
179     Int128 llend, llsize;
180     void *vaddr;
181     int ret;
182 
183     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
184                                             v->iova_range.last)) {
185         return;
186     }
187 
188     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
189                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
190         error_report("%s received unaligned region", __func__);
191         return;
192     }
193 
194     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
195     llend = vhost_vdpa_section_end(section);
196     if (int128_ge(int128_make64(iova), llend)) {
197         return;
198     }
199 
200     memory_region_ref(section->mr);
201 
202     /* Here we assume that memory_region_is_ram(section->mr)==true */
203 
204     vaddr = memory_region_get_ram_ptr(section->mr) +
205             section->offset_within_region +
206             (iova - section->offset_within_address_space);
207 
208     trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
209                                          vaddr, section->readonly);
210 
211     llsize = int128_sub(llend, int128_make64(iova));
212     if (v->shadow_vqs_enabled) {
213         DMAMap mem_region = {
214             .translated_addr = (hwaddr)(uintptr_t)vaddr,
215             .size = int128_get64(llsize) - 1,
216             .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
217         };
218 
219         int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
220         if (unlikely(r != IOVA_OK)) {
221             error_report("Can't allocate a mapping (%d)", r);
222             goto fail;
223         }
224 
225         iova = mem_region.iova;
226     }
227 
228     vhost_vdpa_iotlb_batch_begin_once(v);
229     ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
230                              vaddr, section->readonly);
231     if (ret) {
232         error_report("vhost vdpa map fail!");
233         goto fail;
234     }
235 
236     return;
237 
238 fail:
239     /*
240      * On the initfn path, store the first error in the container so we
241      * can gracefully fail.  Runtime, there's not much we can do other
242      * than throw a hardware error.
243      */
244     error_report("vhost-vdpa: DMA mapping failed, unable to continue");
245     return;
246 
247 }
248 
249 static void vhost_vdpa_listener_region_del(MemoryListener *listener,
250                                            MemoryRegionSection *section)
251 {
252     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
253     hwaddr iova;
254     Int128 llend, llsize;
255     int ret;
256 
257     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
258                                             v->iova_range.last)) {
259         return;
260     }
261 
262     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
263                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
264         error_report("%s received unaligned region", __func__);
265         return;
266     }
267 
268     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
269     llend = vhost_vdpa_section_end(section);
270 
271     trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));
272 
273     if (int128_ge(int128_make64(iova), llend)) {
274         return;
275     }
276 
277     llsize = int128_sub(llend, int128_make64(iova));
278 
279     if (v->shadow_vqs_enabled) {
280         const DMAMap *result;
281         const void *vaddr = memory_region_get_ram_ptr(section->mr) +
282             section->offset_within_region +
283             (iova - section->offset_within_address_space);
284         DMAMap mem_region = {
285             .translated_addr = (hwaddr)(uintptr_t)vaddr,
286             .size = int128_get64(llsize) - 1,
287         };
288 
289         result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
290         iova = result->iova;
291         vhost_iova_tree_remove(v->iova_tree, &mem_region);
292     }
293     vhost_vdpa_iotlb_batch_begin_once(v);
294     ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
295     if (ret) {
296         error_report("vhost_vdpa dma unmap error!");
297     }
298 
299     memory_region_unref(section->mr);
300 }
301 /*
302  * IOTLB API is used by vhost-vpda which requires incremental updating
303  * of the mapping. So we can not use generic vhost memory listener which
304  * depends on the addnop().
305  */
306 static const MemoryListener vhost_vdpa_memory_listener = {
307     .name = "vhost-vdpa",
308     .commit = vhost_vdpa_listener_commit,
309     .region_add = vhost_vdpa_listener_region_add,
310     .region_del = vhost_vdpa_listener_region_del,
311 };
312 
313 static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
314                              void *arg)
315 {
316     struct vhost_vdpa *v = dev->opaque;
317     int fd = v->device_fd;
318     int ret;
319 
320     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
321 
322     ret = ioctl(fd, request, arg);
323     return ret < 0 ? -errno : ret;
324 }
325 
326 static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
327 {
328     uint8_t s;
329     int ret;
330 
331     trace_vhost_vdpa_add_status(dev, status);
332     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
333     if (ret < 0) {
334         return ret;
335     }
336 
337     s |= status;
338 
339     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
340     if (ret < 0) {
341         return ret;
342     }
343 
344     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
345     if (ret < 0) {
346         return ret;
347     }
348 
349     if (!(s & status)) {
350         return -EIO;
351     }
352 
353     return 0;
354 }
355 
356 static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
357 {
358     int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
359                               &v->iova_range);
360     if (ret != 0) {
361         v->iova_range.first = 0;
362         v->iova_range.last = UINT64_MAX;
363     }
364 
365     trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
366                                     v->iova_range.last);
367 }
368 
369 static bool vhost_vdpa_one_time_request(struct vhost_dev *dev)
370 {
371     struct vhost_vdpa *v = dev->opaque;
372 
373     return v->index != 0;
374 }
375 
376 static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
377                                        uint64_t *features)
378 {
379     int ret;
380 
381     ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
382     trace_vhost_vdpa_get_features(dev, *features);
383     return ret;
384 }
385 
386 static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
387                                Error **errp)
388 {
389     g_autoptr(GPtrArray) shadow_vqs = NULL;
390     uint64_t dev_features, svq_features;
391     int r;
392     bool ok;
393 
394     if (!v->shadow_vqs_enabled) {
395         return 0;
396     }
397 
398     r = vhost_vdpa_get_dev_features(hdev, &dev_features);
399     if (r != 0) {
400         error_setg_errno(errp, -r, "Can't get vdpa device features");
401         return r;
402     }
403 
404     svq_features = dev_features;
405     ok = vhost_svq_valid_features(svq_features, errp);
406     if (unlikely(!ok)) {
407         return -1;
408     }
409 
410     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
411     for (unsigned n = 0; n < hdev->nvqs; ++n) {
412         g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
413 
414         if (unlikely(!svq)) {
415             error_setg(errp, "Cannot create svq %u", n);
416             return -1;
417         }
418         g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
419     }
420 
421     v->shadow_vqs = g_steal_pointer(&shadow_vqs);
422     return 0;
423 }
424 
425 static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
426 {
427     struct vhost_vdpa *v;
428     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
429     trace_vhost_vdpa_init(dev, opaque);
430     int ret;
431 
432     /*
433      * Similar to VFIO, we end up pinning all guest memory and have to
434      * disable discarding of RAM.
435      */
436     ret = ram_block_discard_disable(true);
437     if (ret) {
438         error_report("Cannot set discarding of RAM broken");
439         return ret;
440     }
441 
442     v = opaque;
443     v->dev = dev;
444     dev->opaque =  opaque ;
445     v->listener = vhost_vdpa_memory_listener;
446     v->msg_type = VHOST_IOTLB_MSG_V2;
447     ret = vhost_vdpa_init_svq(dev, v, errp);
448     if (ret) {
449         goto err;
450     }
451 
452     vhost_vdpa_get_iova_range(v);
453 
454     if (vhost_vdpa_one_time_request(dev)) {
455         return 0;
456     }
457 
458     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
459                                VIRTIO_CONFIG_S_DRIVER);
460 
461     return 0;
462 
463 err:
464     ram_block_discard_disable(false);
465     return ret;
466 }
467 
468 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
469                                             int queue_index)
470 {
471     size_t page_size = qemu_real_host_page_size;
472     struct vhost_vdpa *v = dev->opaque;
473     VirtIODevice *vdev = dev->vdev;
474     VhostVDPAHostNotifier *n;
475 
476     n = &v->notifier[queue_index];
477 
478     if (n->addr) {
479         virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
480         object_unparent(OBJECT(&n->mr));
481         munmap(n->addr, page_size);
482         n->addr = NULL;
483     }
484 }
485 
486 static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
487 {
488     size_t page_size = qemu_real_host_page_size;
489     struct vhost_vdpa *v = dev->opaque;
490     VirtIODevice *vdev = dev->vdev;
491     VhostVDPAHostNotifier *n;
492     int fd = v->device_fd;
493     void *addr;
494     char *name;
495 
496     vhost_vdpa_host_notifier_uninit(dev, queue_index);
497 
498     n = &v->notifier[queue_index];
499 
500     addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
501                 queue_index * page_size);
502     if (addr == MAP_FAILED) {
503         goto err;
504     }
505 
506     name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
507                            v, queue_index);
508     memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
509                                       page_size, addr);
510     g_free(name);
511 
512     if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
513         object_unparent(OBJECT(&n->mr));
514         munmap(addr, page_size);
515         goto err;
516     }
517     n->addr = addr;
518 
519     return 0;
520 
521 err:
522     return -1;
523 }
524 
525 static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
526 {
527     int i;
528 
529     for (i = dev->vq_index; i < dev->vq_index + n; i++) {
530         vhost_vdpa_host_notifier_uninit(dev, i);
531     }
532 }
533 
534 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
535 {
536     struct vhost_vdpa *v = dev->opaque;
537     int i;
538 
539     if (v->shadow_vqs_enabled) {
540         /* FIXME SVQ is not compatible with host notifiers mr */
541         return;
542     }
543 
544     for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
545         if (vhost_vdpa_host_notifier_init(dev, i)) {
546             goto err;
547         }
548     }
549 
550     return;
551 
552 err:
553     vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
554     return;
555 }
556 
557 static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
558 {
559     struct vhost_vdpa *v = dev->opaque;
560     size_t idx;
561 
562     if (!v->shadow_vqs) {
563         return;
564     }
565 
566     for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
567         vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
568     }
569     g_ptr_array_free(v->shadow_vqs, true);
570 }
571 
572 static int vhost_vdpa_cleanup(struct vhost_dev *dev)
573 {
574     struct vhost_vdpa *v;
575     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
576     v = dev->opaque;
577     trace_vhost_vdpa_cleanup(dev, v);
578     vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
579     memory_listener_unregister(&v->listener);
580     vhost_vdpa_svq_cleanup(dev);
581 
582     dev->opaque = NULL;
583     ram_block_discard_disable(false);
584 
585     return 0;
586 }
587 
/* Report the memory slot limit for this backend, which is always INT_MAX. */
static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
{
    const int limit = INT_MAX;

    trace_vhost_vdpa_memslots_limit(dev, limit);

    return limit;
}
593 
594 static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
595                                     struct vhost_memory *mem)
596 {
597     if (vhost_vdpa_one_time_request(dev)) {
598         return 0;
599     }
600 
601     trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
602     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
603         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
604         int i;
605         for (i = 0; i < mem->nregions; i++) {
606             trace_vhost_vdpa_dump_regions(dev, i,
607                                           mem->regions[i].guest_phys_addr,
608                                           mem->regions[i].memory_size,
609                                           mem->regions[i].userspace_addr,
610                                           mem->regions[i].flags_padding);
611         }
612     }
613     if (mem->padding) {
614         return -EINVAL;
615     }
616 
617     return 0;
618 }
619 
620 static int vhost_vdpa_set_features(struct vhost_dev *dev,
621                                    uint64_t features)
622 {
623     struct vhost_vdpa *v = dev->opaque;
624     int ret;
625 
626     if (vhost_vdpa_one_time_request(dev)) {
627         return 0;
628     }
629 
630     if (v->shadow_vqs_enabled) {
631         if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
632             /*
633              * QEMU is just trying to enable or disable logging. SVQ handles
634              * this sepparately, so no need to forward this.
635              */
636             v->acked_features = features;
637             return 0;
638         }
639 
640         v->acked_features = features;
641 
642         /* We must not ack _F_LOG if SVQ is enabled */
643         features &= ~BIT_ULL(VHOST_F_LOG_ALL);
644     }
645 
646     trace_vhost_vdpa_set_features(dev, features);
647     ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
648     if (ret) {
649         return ret;
650     }
651 
652     return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
653 }
654 
655 static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
656 {
657     uint64_t features;
658     uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
659         0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
660     int r;
661 
662     if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
663         return -EFAULT;
664     }
665 
666     features &= f;
667 
668     if (vhost_vdpa_one_time_request(dev)) {
669         r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
670         if (r) {
671             return -EFAULT;
672         }
673     }
674 
675     dev->backend_cap = features;
676 
677     return 0;
678 }
679 
680 static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
681                                     uint32_t *device_id)
682 {
683     int ret;
684     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
685     trace_vhost_vdpa_get_device_id(dev, *device_id);
686     return ret;
687 }
688 
689 static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
690 {
691     if (!v->shadow_vqs_enabled) {
692         return;
693     }
694 
695     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
696         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
697         vhost_svq_stop(svq);
698     }
699 }
700 
701 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
702 {
703     struct vhost_vdpa *v = dev->opaque;
704     int ret;
705     uint8_t status = 0;
706 
707     vhost_vdpa_reset_svq(v);
708 
709     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
710     trace_vhost_vdpa_reset_device(dev, status);
711     return ret;
712 }
713 
714 static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
715 {
716     assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
717 
718     trace_vhost_vdpa_get_vq_index(dev, idx, idx);
719     return idx;
720 }
721 
722 static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
723 {
724     int i;
725     trace_vhost_vdpa_set_vring_ready(dev);
726     for (i = 0; i < dev->nvqs; ++i) {
727         struct vhost_vring_state state = {
728             .index = dev->vq_index + i,
729             .num = 1,
730         };
731         vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
732     }
733     return 0;
734 }
735 
736 static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
737                                    uint32_t config_len)
738 {
739     int b, len;
740     char line[QEMU_HEXDUMP_LINE_LEN];
741 
742     for (b = 0; b < config_len; b += 16) {
743         len = config_len - b;
744         qemu_hexdump_line(line, b, config, len, false);
745         trace_vhost_vdpa_dump_config(dev, line);
746     }
747 }
748 
749 static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
750                                    uint32_t offset, uint32_t size,
751                                    uint32_t flags)
752 {
753     struct vhost_vdpa_config *config;
754     int ret;
755     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
756 
757     trace_vhost_vdpa_set_config(dev, offset, size, flags);
758     config = g_malloc(size + config_size);
759     config->off = offset;
760     config->len = size;
761     memcpy(config->buf, data, size);
762     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
763         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
764         vhost_vdpa_dump_config(dev, data, size);
765     }
766     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
767     g_free(config);
768     return ret;
769 }
770 
771 static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
772                                    uint32_t config_len, Error **errp)
773 {
774     struct vhost_vdpa_config *v_config;
775     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
776     int ret;
777 
778     trace_vhost_vdpa_get_config(dev, config, config_len);
779     v_config = g_malloc(config_len + config_size);
780     v_config->len = config_len;
781     v_config->off = 0;
782     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
783     memcpy(config, v_config->buf, config_len);
784     g_free(v_config);
785     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
786         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
787         vhost_vdpa_dump_config(dev, config, config_len);
788     }
789     return ret;
790  }
791 
792 static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
793                                          struct vhost_vring_state *ring)
794 {
795     trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
796     return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
797 }
798 
799 static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
800                                          struct vhost_vring_file *file)
801 {
802     trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
803     return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
804 }
805 
806 static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
807                                          struct vhost_vring_file *file)
808 {
809     trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
810     return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
811 }
812 
813 static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
814                                          struct vhost_vring_addr *addr)
815 {
816     trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
817                                 addr->desc_user_addr, addr->used_user_addr,
818                                 addr->avail_user_addr,
819                                 addr->log_guest_addr);
820 
821     return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
822 
823 }
824 
825 /**
826  * Set the shadow virtqueue descriptors to the device
827  *
828  * @dev: The vhost device model
829  * @svq: The shadow virtqueue
830  * @idx: The index of the virtqueue in the vhost device
831  * @errp: Error
832  *
833  * Note that this function does not rewind kick file descriptor if cannot set
834  * call one.
835  */
836 static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
837                                   VhostShadowVirtqueue *svq, unsigned idx,
838                                   Error **errp)
839 {
840     struct vhost_vring_file file = {
841         .index = dev->vq_index + idx,
842     };
843     const EventNotifier *event_notifier = &svq->hdev_kick;
844     int r;
845 
846     file.fd = event_notifier_get_fd(event_notifier);
847     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
848     if (unlikely(r != 0)) {
849         error_setg_errno(errp, -r, "Can't set device kick fd");
850         return r;
851     }
852 
853     event_notifier = &svq->hdev_call;
854     file.fd = event_notifier_get_fd(event_notifier);
855     r = vhost_vdpa_set_vring_dev_call(dev, &file);
856     if (unlikely(r != 0)) {
857         error_setg_errno(errp, -r, "Can't set device call fd");
858     }
859 
860     return r;
861 }
862 
863 /**
864  * Unmap a SVQ area in the device
865  */
866 static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
867                                       const DMAMap *needle)
868 {
869     const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
870     hwaddr size;
871     int r;
872 
873     if (unlikely(!result)) {
874         error_report("Unable to find SVQ address to unmap");
875         return false;
876     }
877 
878     size = ROUND_UP(result->size, qemu_real_host_page_size);
879     r = vhost_vdpa_dma_unmap(v, result->iova, size);
880     return r == 0;
881 }
882 
883 static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
884                                        const VhostShadowVirtqueue *svq)
885 {
886     DMAMap needle = {};
887     struct vhost_vdpa *v = dev->opaque;
888     struct vhost_vring_addr svq_addr;
889     bool ok;
890 
891     vhost_svq_get_vring_addr(svq, &svq_addr);
892 
893     needle.translated_addr = svq_addr.desc_user_addr;
894     ok = vhost_vdpa_svq_unmap_ring(v, &needle);
895     if (unlikely(!ok)) {
896         return false;
897     }
898 
899     needle.translated_addr = svq_addr.used_user_addr;
900     return vhost_vdpa_svq_unmap_ring(v, &needle);
901 }
902 
903 /**
904  * Map the SVQ area in the device
905  *
906  * @v: Vhost-vdpa device
907  * @needle: The area to search iova
908  * @errorp: Error pointer
909  */
910 static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
911                                     Error **errp)
912 {
913     int r;
914 
915     r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
916     if (unlikely(r != IOVA_OK)) {
917         error_setg(errp, "Cannot allocate iova (%d)", r);
918         return false;
919     }
920 
921     r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
922                            (void *)(uintptr_t)needle->translated_addr,
923                            needle->perm == IOMMU_RO);
924     if (unlikely(r != 0)) {
925         error_setg_errno(errp, -r, "Cannot map region to device");
926         vhost_iova_tree_remove(v->iova_tree, needle);
927     }
928 
929     return r == 0;
930 }
931 
932 /**
933  * Map the shadow virtqueue rings in the device
934  *
935  * @dev: The vhost device
936  * @svq: The shadow virtqueue
937  * @addr: Assigned IOVA addresses
938  * @errp: Error pointer
939  */
940 static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
941                                      const VhostShadowVirtqueue *svq,
942                                      struct vhost_vring_addr *addr,
943                                      Error **errp)
944 {
945     DMAMap device_region, driver_region;
946     struct vhost_vring_addr svq_addr;
947     struct vhost_vdpa *v = dev->opaque;
948     size_t device_size = vhost_svq_device_area_size(svq);
949     size_t driver_size = vhost_svq_driver_area_size(svq);
950     size_t avail_offset;
951     bool ok;
952 
953     ERRP_GUARD();
954     vhost_svq_get_vring_addr(svq, &svq_addr);
955 
956     driver_region = (DMAMap) {
957         .translated_addr = svq_addr.desc_user_addr,
958         .size = driver_size - 1,
959         .perm = IOMMU_RO,
960     };
961     ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
962     if (unlikely(!ok)) {
963         error_prepend(errp, "Cannot create vq driver region: ");
964         return false;
965     }
966     addr->desc_user_addr = driver_region.iova;
967     avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
968     addr->avail_user_addr = driver_region.iova + avail_offset;
969 
970     device_region = (DMAMap) {
971         .translated_addr = svq_addr.used_user_addr,
972         .size = device_size - 1,
973         .perm = IOMMU_RW,
974     };
975     ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
976     if (unlikely(!ok)) {
977         error_prepend(errp, "Cannot create vq device region: ");
978         vhost_vdpa_svq_unmap_ring(v, &driver_region);
979     }
980     addr->used_user_addr = device_region.iova;
981 
982     return ok;
983 }
984 
985 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
986                                  VhostShadowVirtqueue *svq, unsigned idx,
987                                  Error **errp)
988 {
989     uint16_t vq_index = dev->vq_index + idx;
990     struct vhost_vring_state s = {
991         .index = vq_index,
992     };
993     int r;
994 
995     r = vhost_vdpa_set_dev_vring_base(dev, &s);
996     if (unlikely(r)) {
997         error_setg_errno(errp, -r, "Cannot set vring base");
998         return false;
999     }
1000 
1001     r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
1002     return r == 0;
1003 }
1004 
1005 static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
1006 {
1007     struct vhost_vdpa *v = dev->opaque;
1008     Error *err = NULL;
1009     unsigned i;
1010 
1011     if (!v->shadow_vqs) {
1012         return true;
1013     }
1014 
1015     for (i = 0; i < v->shadow_vqs->len; ++i) {
1016         VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
1017         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1018         struct vhost_vring_addr addr = {
1019             .index = i,
1020         };
1021         int r;
1022         bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
1023         if (unlikely(!ok)) {
1024             goto err;
1025         }
1026 
1027         vhost_svq_start(svq, dev->vdev, vq);
1028         ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
1029         if (unlikely(!ok)) {
1030             goto err_map;
1031         }
1032 
1033         /* Override vring GPA set by vhost subsystem */
1034         r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
1035         if (unlikely(r != 0)) {
1036             error_setg_errno(&err, -r, "Cannot set device address");
1037             goto err_set_addr;
1038         }
1039     }
1040 
1041     return true;
1042 
1043 err_set_addr:
1044     vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
1045 
1046 err_map:
1047     vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
1048 
1049 err:
1050     error_reportf_err(err, "Cannot setup SVQ %u: ", i);
1051     for (unsigned j = 0; j < i; ++j) {
1052         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
1053         vhost_vdpa_svq_unmap_rings(dev, svq);
1054         vhost_svq_stop(svq);
1055     }
1056 
1057     return false;
1058 }
1059 
1060 static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
1061 {
1062     struct vhost_vdpa *v = dev->opaque;
1063 
1064     if (!v->shadow_vqs) {
1065         return true;
1066     }
1067 
1068     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
1069         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1070         bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
1071         if (unlikely(!ok)) {
1072             return false;
1073         }
1074     }
1075 
1076     return true;
1077 }
1078 
1079 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
1080 {
1081     struct vhost_vdpa *v = dev->opaque;
1082     bool ok;
1083     trace_vhost_vdpa_dev_start(dev, started);
1084 
1085     if (started) {
1086         vhost_vdpa_host_notifiers_init(dev);
1087         ok = vhost_vdpa_svqs_start(dev);
1088         if (unlikely(!ok)) {
1089             return -1;
1090         }
1091         vhost_vdpa_set_vring_ready(dev);
1092     } else {
1093         ok = vhost_vdpa_svqs_stop(dev);
1094         if (unlikely(!ok)) {
1095             return -1;
1096         }
1097         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
1098     }
1099 
1100     if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
1101         return 0;
1102     }
1103 
1104     if (started) {
1105         memory_listener_register(&v->listener, &address_space_memory);
1106         return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
1107     } else {
1108         vhost_vdpa_reset_device(dev);
1109         vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
1110                                    VIRTIO_CONFIG_S_DRIVER);
1111         memory_listener_unregister(&v->listener);
1112 
1113         return 0;
1114     }
1115 }
1116 
1117 static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
1118                                      struct vhost_log *log)
1119 {
1120     struct vhost_vdpa *v = dev->opaque;
1121     if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) {
1122         return 0;
1123     }
1124 
1125     trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
1126                                   log->log);
1127     return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
1128 }
1129 
1130 static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
1131                                        struct vhost_vring_addr *addr)
1132 {
1133     struct vhost_vdpa *v = dev->opaque;
1134 
1135     if (v->shadow_vqs_enabled) {
1136         /*
1137          * Device vring addr was set at device start. SVQ base is handled by
1138          * VirtQueue code.
1139          */
1140         return 0;
1141     }
1142 
1143     return vhost_vdpa_set_vring_dev_addr(dev, addr);
1144 }
1145 
1146 static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
1147                                       struct vhost_vring_state *ring)
1148 {
1149     trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
1150     return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
1151 }
1152 
1153 static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
1154                                        struct vhost_vring_state *ring)
1155 {
1156     struct vhost_vdpa *v = dev->opaque;
1157 
1158     if (v->shadow_vqs_enabled) {
1159         /*
1160          * Device vring base was set at device start. SVQ base is handled by
1161          * VirtQueue code.
1162          */
1163         return 0;
1164     }
1165 
1166     return vhost_vdpa_set_dev_vring_base(dev, ring);
1167 }
1168 
1169 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
1170                                        struct vhost_vring_state *ring)
1171 {
1172     struct vhost_vdpa *v = dev->opaque;
1173     int ret;
1174 
1175     if (v->shadow_vqs_enabled) {
1176         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
1177                                                       ring->index);
1178 
1179         /*
1180          * Setting base as last used idx, so destination will see as available
1181          * all the entries that the device did not use, including the in-flight
1182          * processing ones.
1183          *
1184          * TODO: This is ok for networking, but other kinds of devices might
1185          * have problems with these retransmissions.
1186          */
1187         ring->num = svq->last_used_idx;
1188         return 0;
1189     }
1190 
1191     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
1192     trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
1193     return ret;
1194 }
1195 
1196 static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
1197                                        struct vhost_vring_file *file)
1198 {
1199     struct vhost_vdpa *v = dev->opaque;
1200     int vdpa_idx = file->index - dev->vq_index;
1201 
1202     if (v->shadow_vqs_enabled) {
1203         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1204         vhost_svq_set_svq_kick_fd(svq, file->fd);
1205         return 0;
1206     } else {
1207         return vhost_vdpa_set_vring_dev_kick(dev, file);
1208     }
1209 }
1210 
1211 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
1212                                        struct vhost_vring_file *file)
1213 {
1214     struct vhost_vdpa *v = dev->opaque;
1215 
1216     if (v->shadow_vqs_enabled) {
1217         int vdpa_idx = file->index - dev->vq_index;
1218         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1219 
1220         vhost_svq_set_svq_call_fd(svq, file->fd);
1221         return 0;
1222     } else {
1223         return vhost_vdpa_set_vring_dev_call(dev, file);
1224     }
1225 }
1226 
1227 static int vhost_vdpa_get_features(struct vhost_dev *dev,
1228                                      uint64_t *features)
1229 {
1230     struct vhost_vdpa *v = dev->opaque;
1231     int ret = vhost_vdpa_get_dev_features(dev, features);
1232 
1233     if (ret == 0 && v->shadow_vqs_enabled) {
1234         /* Add SVQ logging capabilities */
1235         *features |= BIT_ULL(VHOST_F_LOG_ALL);
1236     }
1237 
1238     return ret;
1239 }
1240 
1241 static int vhost_vdpa_set_owner(struct vhost_dev *dev)
1242 {
1243     if (vhost_vdpa_one_time_request(dev)) {
1244         return 0;
1245     }
1246 
1247     trace_vhost_vdpa_set_owner(dev);
1248     return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
1249 }
1250 
1251 static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
1252                     struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
1253 {
1254     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
1255     addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
1256     addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
1257     addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
1258     trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
1259                                  addr->avail_user_addr, addr->used_user_addr);
1260     return 0;
1261 }
1262 
/**
 * Tell the vhost subsystem whether IOMMU must be forced
 *
 * @dev: The vhost device (unused)
 *
 * Always returns true for vhost-vdpa.
 */
static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
    const bool force = true;

    return force;
}
1267 
/*
 * vhost backend operations for vhost-vdpa.
 *
 * Entries explicitly set to NULL are callbacks this backend does not
 * provide.
 */
const VhostOps vdpa_ops = {
        .backend_type = VHOST_BACKEND_TYPE_VDPA,
        /* Backend lifetime */
        .vhost_backend_init = vhost_vdpa_init,
        .vhost_backend_cleanup = vhost_vdpa_cleanup,
        /* Vring and log setup; several of these return early when SVQ is on */
        .vhost_set_log_base = vhost_vdpa_set_log_base,
        .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
        .vhost_set_vring_num = vhost_vdpa_set_vring_num,
        .vhost_set_vring_base = vhost_vdpa_set_vring_base,
        .vhost_get_vring_base = vhost_vdpa_get_vring_base,
        .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
        .vhost_set_vring_call = vhost_vdpa_set_vring_call,
        /* Feature negotiation and ownership */
        .vhost_get_features = vhost_vdpa_get_features,
        .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
        .vhost_set_owner = vhost_vdpa_set_owner,
        .vhost_set_vring_endian = NULL,
        .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
        .vhost_set_mem_table = vhost_vdpa_set_mem_table,
        .vhost_set_features = vhost_vdpa_set_features,
        .vhost_reset_device = vhost_vdpa_reset_device,
        .vhost_get_vq_index = vhost_vdpa_get_vq_index,
        /* Device config space access */
        .vhost_get_config  = vhost_vdpa_get_config,
        .vhost_set_config = vhost_vdpa_set_config,
        .vhost_requires_shm_log = NULL,
        .vhost_migration_done = NULL,
        .vhost_backend_can_merge = NULL,
        .vhost_net_set_mtu = NULL,
        .vhost_set_iotlb_callback = NULL,
        .vhost_send_device_iotlb_msg = NULL,
        /* Device start/stop and identification */
        .vhost_dev_start = vhost_vdpa_dev_start,
        .vhost_get_device_id = vhost_vdpa_get_device_id,
        .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
        .vhost_force_iommu = vhost_vdpa_force_iommu,
};
1301