/*
 * vhost-vdpa.c
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "clients.h"
#include "hw/virtio/virtio-net.h"
#include "net/vhost_net.h"
#include "net/vhost-vdpa.h"
#include "hw/virtio/vhost-vdpa.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "qemu/option.h"
#include "qapi/error.h"
#include <linux/vhost.h>
#include <sys/ioctl.h>
#include <err.h>
#include "standard-headers/linux/virtio_net.h"
#include "monitor/monitor.h"
#include "hw/virtio/vhost.h"

/* Todo: need to add multiqueue support here */
typedef struct VhostVDPAState {
    NetClientState nc;
    struct vhost_vdpa vhost_vdpa;
    VHostNetState *vhost_net;

    /* Control commands shadow buffers */
    void *cvq_cmd_out_buffer;
    virtio_net_ctrl_ack *status;

    /* The device always has SVQ enabled */
    bool always_svq;
    bool started;
} VhostVDPAState;

const int vdpa_feature_bits[] = {
    VIRTIO_F_NOTIFY_ON_EMPTY,
    VIRTIO_RING_F_INDIRECT_DESC,
    VIRTIO_RING_F_EVENT_IDX,
    VIRTIO_F_ANY_LAYOUT,
    VIRTIO_F_VERSION_1,
    VIRTIO_NET_F_CSUM,
    VIRTIO_NET_F_GUEST_CSUM,
    VIRTIO_NET_F_GSO,
    VIRTIO_NET_F_GUEST_TSO4,
    VIRTIO_NET_F_GUEST_TSO6,
    VIRTIO_NET_F_GUEST_ECN,
    VIRTIO_NET_F_GUEST_UFO,
    VIRTIO_NET_F_HOST_TSO4,
    VIRTIO_NET_F_HOST_TSO6,
    VIRTIO_NET_F_HOST_ECN,
    VIRTIO_NET_F_HOST_UFO,
    VIRTIO_NET_F_MRG_RXBUF,
    VIRTIO_NET_F_MTU,
    VIRTIO_NET_F_CTRL_RX,
    VIRTIO_NET_F_CTRL_RX_EXTRA,
    VIRTIO_NET_F_CTRL_VLAN,
    VIRTIO_NET_F_CTRL_MAC_ADDR,
    VIRTIO_NET_F_MQ,
    VIRTIO_NET_F_CTRL_VQ,
    VIRTIO_F_IOMMU_PLATFORM,
    VIRTIO_F_RING_PACKED,
    VIRTIO_F_RING_RESET,
    VIRTIO_NET_F_RSS,
    VIRTIO_NET_F_HASH_REPORT,
    VIRTIO_NET_F_GUEST_ANNOUNCE,
    VIRTIO_NET_F_STATUS,
    VHOST_INVALID_FEATURE_BIT
};

/** Supported device-specific feature bits with SVQ */
static const uint64_t vdpa_svq_device_features =
    BIT_ULL(VIRTIO_NET_F_CSUM) |
    BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
    BIT_ULL(VIRTIO_NET_F_MTU) |
    BIT_ULL(VIRTIO_NET_F_MAC) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_GUEST_ECN) |
    BIT_ULL(VIRTIO_NET_F_GUEST_UFO) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_HOST_ECN) |
    BIT_ULL(VIRTIO_NET_F_HOST_UFO) |
    BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
    BIT_ULL(VIRTIO_NET_F_STATUS) |
    BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
    BIT_ULL(VIRTIO_NET_F_MQ) |
    BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
    BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
    BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
    BIT_ULL(VIRTIO_NET_F_STANDBY);

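/*
 * Address space id used for the shadowed CVQ: when the device supports
 * VHOST_BACKEND_F_IOTLB_ASID and the CVQ sits in its own VQ group, that group
 * is switched to this ASID so the CVQ shadow buffers can be mapped
 * independently of the guest memory mappings.
 */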
#define VHOST_VDPA_NET_CVQ_ASID 1

VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    return s->vhost_net;
}

static bool vhost_vdpa_net_valid_svq_features(uint64_t features, Error **errp)
{
    uint64_t invalid_dev_features =
        features & ~vdpa_svq_device_features &
        /* Transport features are all accepted at this point */
        ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
                         VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);

    if (invalid_dev_features) {
        error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
                   invalid_dev_features);
        return false;
    }

    return vhost_svq_valid_features(features, errp);
}

static int vhost_vdpa_net_check_device_id(struct vhost_net *net)
{
    uint32_t device_id;
    int ret;
    struct vhost_dev *hdev;

    hdev = (struct vhost_dev *)&net->dev;
    ret = hdev->vhost_ops->vhost_get_device_id(hdev, &device_id);
    if (device_id != VIRTIO_ID_NET) {
        return -ENOTSUP;
    }
    return ret;
}

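/*
 * Create the vhost_net backend for one queue pair range and check, through
 * the backend's vhost_get_device_id op, that the underlying vDPA device is a
 * network device (VIRTIO_ID_NET). Returns 0 on success, -1 on failure.
 */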
static int vhost_vdpa_add(NetClientState *ncs, void *be,
                          int queue_pair_index, int nvqs)
{
    VhostNetOptions options;
    struct vhost_net *net = NULL;
    VhostVDPAState *s;
    int ret;

    options.backend_type = VHOST_BACKEND_TYPE_VDPA;
    assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, ncs);
    options.net_backend = ncs;
    options.opaque      = be;
    options.busyloop_timeout = 0;
    options.nvqs = nvqs;

    net = vhost_net_init(&options);
    if (!net) {
        error_report("failed to init vhost_net for queue");
        goto err_init;
    }
    s->vhost_net = net;
    ret = vhost_vdpa_net_check_device_id(net);
    if (ret) {
        goto err_check;
    }
    return 0;
err_check:
    vhost_net_cleanup(net);
    g_free(net);
err_init:
    return -1;
}

static void vhost_vdpa_cleanup(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_dev *dev = &s->vhost_net->dev;

    qemu_vfree(s->cvq_cmd_out_buffer);
    qemu_vfree(s->status);
    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
    }
    if (s->vhost_net) {
        vhost_net_cleanup(s->vhost_net);
        g_free(s->vhost_net);
        s->vhost_net = NULL;
    }
    if (s->vhost_vdpa.device_fd >= 0) {
        qemu_close(s->vhost_vdpa.device_fd);
        s->vhost_vdpa.device_fd = -1;
    }
}

static bool vhost_vdpa_has_vnet_hdr(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    return true;
}

static bool vhost_vdpa_has_ufo(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    uint64_t features = 0;
    features |= (1ULL << VIRTIO_NET_F_HOST_UFO);
    features = vhost_net_get_features(s->vhost_net, features);
    return !!(features & (1ULL << VIRTIO_NET_F_HOST_UFO));
}

static bool vhost_vdpa_check_peer_type(NetClientState *nc, ObjectClass *oc,
                                       Error **errp)
{
    const char *driver = object_class_get_name(oc);

    if (!g_str_has_prefix(driver, "virtio-net-")) {
        error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
        return false;
    }

    return true;
}

/** Dummy receive in case qemu falls back to userland tap networking */
static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
                                  size_t size)
{
    return size;
}

static NetClientInfo net_vhost_vdpa_info = {
        .type = NET_CLIENT_DRIVER_VHOST_VDPA,
        .size = sizeof(VhostVDPAState),
        .receive = vhost_vdpa_receive,
        .cleanup = vhost_vdpa_cleanup,
        .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
        .has_ufo = vhost_vdpa_has_ufo,
        .check_peer_type = vhost_vdpa_check_peer_type,
};

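/*
 * Query which VQ group a virtqueue belongs to through the
 * VHOST_VDPA_GET_VRING_GROUP ioctl. Returns the group number on success or a
 * negative value on error.
 */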
static int64_t vhost_vdpa_get_vring_group(int device_fd, unsigned vq_index)
{
    struct vhost_vring_state state = {
        .index = vq_index,
    };
    int r = ioctl(device_fd, VHOST_VDPA_GET_VRING_GROUP, &state);

    if (unlikely(r < 0)) {
        error_report("Cannot get VQ %u group: %s", vq_index,
                     g_strerror(errno));
        return r;
    }

    return state.num;
}

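/* Bind a VQ group to the given address space id (VHOST_VDPA_SET_GROUP_ASID) */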
static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v,
                                           unsigned vq_group,
                                           unsigned asid_num)
{
    struct vhost_vring_state asid = {
        .index = vq_group,
        .num = asid_num,
    };
    int r;

    r = ioctl(v->device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid);
    if (unlikely(r < 0)) {
        error_report("Can't set vq group %u asid %u, errno=%d (%s)",
                     asid.index, asid.num, errno, g_strerror(errno));
    }
    return r;
}

static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
{
    VhostIOVATree *tree = v->iova_tree;
    DMAMap needle = {
        /*
         * No need to specify size or to look for more translations since
         * this contiguous chunk was allocated by us.
         */
        .translated_addr = (hwaddr)(uintptr_t)addr,
    };
    const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle);
    int r;

    if (unlikely(!map)) {
        error_report("Cannot locate expected map");
        return;
    }

    r = vhost_vdpa_dma_unmap(v, v->address_space_id, map->iova, map->size + 1);
    if (unlikely(r != 0)) {
        error_report("Device cannot unmap: %s(%d)", g_strerror(r), r);
    }

    vhost_iova_tree_remove(tree, *map);
}

static size_t vhost_vdpa_net_cvq_cmd_len(void)
{
    /*
     * MAC_TABLE_SET is the ctrl command that produces the longest out buffer.
     * The in buffer is always 1 byte, so it fits here.
     */
    return sizeof(struct virtio_net_ctrl_hdr) +
           2 * sizeof(struct virtio_net_ctrl_mac) +
           MAC_TABLE_ENTRIES * ETH_ALEN;
}

static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
{
    return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size());
}

/** Map CVQ buffer. */
static int vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, void *buf, size_t size,
                                  bool write)
{
    DMAMap map = {};
    int r;

    map.translated_addr = (hwaddr)(uintptr_t)buf;
    map.size = size - 1;
    map.perm = write ? IOMMU_RW : IOMMU_RO;
    r = vhost_iova_tree_map_alloc(v->iova_tree, &map);
    if (unlikely(r != IOVA_OK)) {
        error_report("Cannot map injected element");
        return r;
    }

    r = vhost_vdpa_dma_map(v, v->address_space_id, map.iova,
                           vhost_vdpa_net_cvq_cmd_page_len(), buf, !write);
    if (unlikely(r < 0)) {
        goto dma_map_err;
    }

    return 0;

dma_map_err:
    vhost_iova_tree_remove(v->iova_tree, map);
    return r;
}

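/*
 * Decide whether the CVQ must be shadowed and map its bounce buffers.
 *
 * If x-svq was not requested, the CVQ is only shadowed when the backend
 * offers VHOST_BACKEND_F_IOTLB_ASID, the device features are supported by
 * SVQ, and the CVQ lives in a VQ group of its own. In that case the group is
 * moved to VHOST_VDPA_NET_CVQ_ASID with a dedicated IOVA tree.
 */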
static int vhost_vdpa_net_cvq_start(NetClientState *nc)
{
    VhostVDPAState *s;
    struct vhost_vdpa *v;
    uint64_t backend_features;
    int64_t cvq_group;
    int cvq_index, r;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    s = DO_UPCAST(VhostVDPAState, nc, nc);
    v = &s->vhost_vdpa;

    v->shadow_data = s->always_svq;
    v->shadow_vqs_enabled = s->always_svq;
    s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID;

    if (s->always_svq) {
        /* SVQ is already configured for all virtqueues */
        goto out;
    }

    /*
     * If we return early in these cases SVQ will not be enabled, and migration
     * will be blocked as long as the vhost-vdpa backend does not offer _F_LOG.
     *
     * Call VHOST_GET_BACKEND_FEATURES directly since the backend features are
     * not available in v->dev yet.
     */
    r = ioctl(v->device_fd, VHOST_GET_BACKEND_FEATURES, &backend_features);
    if (unlikely(r < 0)) {
        error_report("Cannot get vdpa backend_features: %s(%d)",
            g_strerror(errno), errno);
        return -1;
    }
    if (!(backend_features & VHOST_BACKEND_F_IOTLB_ASID) ||
        !vhost_vdpa_net_valid_svq_features(v->dev->features, NULL)) {
        return 0;
    }

    /*
     * Check if all the virtqueues of the virtio device are in a different VQ
     * group than the last one (the CVQ), whose group is stored in cvq_group.
     */
    cvq_index = v->dev->vq_index_end - 1;
    cvq_group = vhost_vdpa_get_vring_group(v->device_fd, cvq_index);
    if (unlikely(cvq_group < 0)) {
        return cvq_group;
    }
    for (int i = 0; i < cvq_index; ++i) {
        int64_t group = vhost_vdpa_get_vring_group(v->device_fd, i);

        if (unlikely(group < 0)) {
            return group;
        }

        if (group == cvq_group) {
            return 0;
        }
    }

    r = vhost_vdpa_set_address_space_id(v, cvq_group, VHOST_VDPA_NET_CVQ_ASID);
    if (unlikely(r < 0)) {
        return r;
    }

    v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
                                       v->iova_range.last);
    v->shadow_vqs_enabled = true;
    s->vhost_vdpa.address_space_id = VHOST_VDPA_NET_CVQ_ASID;

out:
    if (!s->vhost_vdpa.shadow_vqs_enabled) {
        return 0;
    }

    r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer,
                               vhost_vdpa_net_cvq_cmd_page_len(), false);
    if (unlikely(r < 0)) {
        return r;
    }

    r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->status,
                               vhost_vdpa_net_cvq_cmd_page_len(), true);
    if (unlikely(r < 0)) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
    }

    return r;
}

static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->vhost_vdpa.shadow_vqs_enabled) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->status);
        if (!s->always_svq) {
            /*
             * If only the CVQ is shadowed we can delete this safely.
             * If all the VQs are shadowed this will be needed by the time the
             * device is started again to register SVQ vrings and similar.
             */
            g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
        }
    }
}

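/*
 * Send a control command through the shadow CVQ and wait for the device to
 * use it. The out buffer holds the command, the in buffer receives the
 * virtio_net_ctrl_ack status. Returns the number of bytes written by the
 * device or a negative error.
 */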
static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s, size_t out_len,
                                      size_t in_len)
{
    /* Buffers for the device */
    const struct iovec out = {
        .iov_base = s->cvq_cmd_out_buffer,
        .iov_len = out_len,
    };
    const struct iovec in = {
        .iov_base = s->status,
        .iov_len = sizeof(virtio_net_ctrl_ack),
    };
    VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
    int r;

    r = vhost_svq_add(svq, &out, 1, &in, 1, NULL);
    if (unlikely(r != 0)) {
        if (unlikely(r == -ENOSPC)) {
            qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
                          __func__);
        }
        return r;
    }

    /*
     * We can poll here since we have held the BQL from the time we sent the
     * descriptor. Also, we need to take the answer before SVQ pulls it by
     * itself, when the BQL is released.
     */
    return vhost_svq_poll(svq);
}

static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s, uint8_t class,
                                       uint8_t cmd, const void *data,
                                       size_t data_size)
{
    const struct virtio_net_ctrl_hdr ctrl = {
        .class = class,
        .cmd = cmd,
    };

    assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl));

    memcpy(s->cvq_cmd_out_buffer, &ctrl, sizeof(ctrl));
    memcpy(s->cvq_cmd_out_buffer + sizeof(ctrl), data, data_size);

    return vhost_vdpa_net_cvq_add(s, sizeof(ctrl) + data_size,
                                  sizeof(virtio_net_ctrl_ack));
}

static int vhost_vdpa_net_load_mac(VhostVDPAState *s, const VirtIONet *n)
{
    uint64_t features = n->parent_obj.guest_features;
    if (features & BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR)) {
        ssize_t dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MAC,
                                                  VIRTIO_NET_CTRL_MAC_ADDR_SET,
                                                  n->mac, sizeof(n->mac));
        if (unlikely(dev_written < 0)) {
            return dev_written;
        }

        return *s->status != VIRTIO_NET_OK;
    }

    return 0;
}

static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
                                  const VirtIONet *n)
{
    struct virtio_net_ctrl_mq mq;
    uint64_t features = n->parent_obj.guest_features;
    ssize_t dev_written;

    if (!(features & BIT_ULL(VIRTIO_NET_F_MQ))) {
        return 0;
    }

    mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs);
    dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MQ,
                                          VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &mq,
                                          sizeof(mq));
    if (unlikely(dev_written < 0)) {
        return dev_written;
    }

    return *s->status != VIRTIO_NET_OK;
}

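/*
 * Restore the virtio-net control state (MAC address, number of queue pairs)
 * by replaying CVQ commands through the shadow virtqueue. This is a no-op
 * when SVQ is not enabled.
 */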
static int vhost_vdpa_net_load(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;
    const VirtIONet *n;
    int r;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (!v->shadow_vqs_enabled) {
        return 0;
    }

    n = VIRTIO_NET(v->dev->vdev);
    r = vhost_vdpa_net_load_mac(s, n);
    if (unlikely(r < 0)) {
        return r;
    }
    r = vhost_vdpa_net_load_mq(s, n);
    if (unlikely(r)) {
        return r;
    }

    return 0;
}

static NetClientInfo net_vhost_vdpa_cvq_info = {
    .type = NET_CLIENT_DRIVER_VHOST_VDPA,
    .size = sizeof(VhostVDPAState),
    .receive = vhost_vdpa_receive,
    .start = vhost_vdpa_net_cvq_start,
    .load = vhost_vdpa_net_load,
    .stop = vhost_vdpa_net_cvq_stop,
    .cleanup = vhost_vdpa_cleanup,
    .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
    .has_ufo = vhost_vdpa_has_ufo,
    .check_peer_type = vhost_vdpa_check_peer_type,
};

/**
 * Validate and copy control virtqueue commands.
 *
 * Following QEMU guidelines, we offer a copy of the buffers to the device to
 * prevent TOCTOU bugs.
 */
static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
                                            VirtQueueElement *elem,
                                            void *opaque)
{
    VhostVDPAState *s = opaque;
    size_t in_len;
    virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
    /* Out buffer sent to both the vdpa device and the device model */
    struct iovec out = {
        .iov_base = s->cvq_cmd_out_buffer,
    };
    /* in buffer used for device model */
    const struct iovec in = {
        .iov_base = &status,
        .iov_len = sizeof(status),
    };
    ssize_t dev_written = -EINVAL;

    out.iov_len = iov_to_buf(elem->out_sg, elem->out_num, 0,
                             s->cvq_cmd_out_buffer,
                             vhost_vdpa_net_cvq_cmd_len());
    dev_written = vhost_vdpa_net_cvq_add(s, out.iov_len, sizeof(status));
    if (unlikely(dev_written < 0)) {
        goto out;
    }

    if (unlikely(dev_written < sizeof(status))) {
        error_report("Insufficient written data (%zd)", dev_written);
        goto out;
    }

    if (*s->status != VIRTIO_NET_OK) {
        goto out;
    }

    status = VIRTIO_NET_ERR;
    virtio_net_handle_ctrl_iov(svq->vdev, &in, 1, &out, 1);
    if (status != VIRTIO_NET_OK) {
        error_report("Bad CVQ processing in model");
    }

out:
    in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status,
                          sizeof(status));
    if (unlikely(in_len < sizeof(status))) {
        error_report("Bad device CVQ written length");
    }
    vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
    g_free(elem);
    return dev_written < 0 ? dev_written : 0;
}

static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
    .avail_handler = vhost_vdpa_net_handle_ctrl_avail,
};

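/*
 * Create one vhost-vdpa NetClientState: either a data queue pair client or,
 * when is_datapath is false, the control virtqueue client. CVQ clients also
 * allocate the page-aligned bounce buffers used for shadowed control
 * commands.
 */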
static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
                                       const char *device,
                                       const char *name,
                                       int vdpa_device_fd,
                                       int queue_pair_index,
                                       int nvqs,
                                       bool is_datapath,
                                       bool svq,
                                       struct vhost_vdpa_iova_range iova_range,
                                       VhostIOVATree *iova_tree)
{
    NetClientState *nc = NULL;
    VhostVDPAState *s;
    int ret = 0;
    assert(name);
    if (is_datapath) {
        nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device,
                                 name);
    } else {
        nc = qemu_new_net_control_client(&net_vhost_vdpa_cvq_info, peer,
                                         device, name);
    }
    qemu_set_info_str(nc, TYPE_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, nc);

    s->vhost_vdpa.device_fd = vdpa_device_fd;
    s->vhost_vdpa.index = queue_pair_index;
    s->always_svq = svq;
    s->vhost_vdpa.shadow_vqs_enabled = svq;
    s->vhost_vdpa.iova_range = iova_range;
    s->vhost_vdpa.shadow_data = svq;
    s->vhost_vdpa.iova_tree = iova_tree;
    if (!is_datapath) {
        s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
                                            vhost_vdpa_net_cvq_cmd_page_len());
        memset(s->cvq_cmd_out_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len());
        s->status = qemu_memalign(qemu_real_host_page_size(),
                                  vhost_vdpa_net_cvq_cmd_page_len());
        memset(s->status, 0, vhost_vdpa_net_cvq_cmd_page_len());

        s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
        s->vhost_vdpa.shadow_vq_ops_opaque = s;
    }
    ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
    if (ret) {
        qemu_del_net_client(nc);
        return NULL;
    }
    return nc;
}

static int vhost_vdpa_get_iova_range(int fd,
                                     struct vhost_vdpa_iova_range *iova_range)
{
    int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range);

    return ret < 0 ? -errno : 0;
}

static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
{
    int ret = ioctl(fd, VHOST_GET_FEATURES, features);
    if (unlikely(ret < 0)) {
        error_setg_errno(errp, errno,
                         "Failed to query features from vhost-vDPA device");
    }
    return ret;
}

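/*
 * Read max_virtqueue_pairs from the device config space when _F_MQ is
 * offered, defaulting to 1 otherwise, and report through *has_cvq whether the
 * device offers a control virtqueue.
 */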
static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
                                          int *has_cvq, Error **errp)
{
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    g_autofree struct vhost_vdpa_config *config = NULL;
    __virtio16 *max_queue_pairs;
    int ret;

    if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) {
        *has_cvq = 1;
    } else {
        *has_cvq = 0;
    }

    if (features & (1 << VIRTIO_NET_F_MQ)) {
        config = g_malloc0(config_size + sizeof(*max_queue_pairs));
        config->off = offsetof(struct virtio_net_config, max_virtqueue_pairs);
        config->len = sizeof(*max_queue_pairs);

        ret = ioctl(fd, VHOST_VDPA_GET_CONFIG, config);
        if (ret) {
            error_setg(errp, "Failed to get config from vhost-vDPA device");
            return ret;
        }

        max_queue_pairs = (__virtio16 *)&config->buf;

        return lduw_le_p(max_queue_pairs);
    }

    return 1;
}

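/*
 * -netdev vhost-vdpa entry point: open the vDPA character device (vhostdev=)
 * or use an fd passed from the monitor (vhostfd=), query its features and
 * IOVA range, and create one net client per data queue pair plus one for the
 * CVQ if present.
 *
 * Illustrative command line (the device node name is just an example):
 *   -netdev type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vdpa0 \
 *   -device virtio-net-pci,netdev=vdpa0
 */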
int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
                        NetClientState *peer, Error **errp)
{
    const NetdevVhostVDPAOptions *opts;
    uint64_t features;
    int vdpa_device_fd;
    g_autofree NetClientState **ncs = NULL;
    g_autoptr(VhostIOVATree) iova_tree = NULL;
    struct vhost_vdpa_iova_range iova_range;
    NetClientState *nc;
    int queue_pairs, r, i = 0, has_cvq = 0;

    assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    opts = &netdev->u.vhost_vdpa;
    if (!opts->vhostdev && !opts->vhostfd) {
        error_setg(errp,
                   "vhost-vdpa: neither vhostdev= nor vhostfd= was specified");
        return -1;
    }

    if (opts->vhostdev && opts->vhostfd) {
        error_setg(errp,
                   "vhost-vdpa: vhostdev= and vhostfd= are mutually exclusive");
        return -1;
    }

    if (opts->vhostdev) {
        vdpa_device_fd = qemu_open(opts->vhostdev, O_RDWR, errp);
        if (vdpa_device_fd == -1) {
            return -errno;
        }
    } else {
        /* has_vhostfd */
        vdpa_device_fd = monitor_fd_param(monitor_cur(), opts->vhostfd, errp);
        if (vdpa_device_fd == -1) {
            error_prepend(errp, "vhost-vdpa: unable to parse vhostfd: ");
            return -1;
        }
    }

    r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp);
    if (unlikely(r < 0)) {
        goto err;
    }

    queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features,
                                                 &has_cvq, errp);
    if (queue_pairs < 0) {
        qemu_close(vdpa_device_fd);
        return queue_pairs;
    }

    vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
    if (opts->x_svq) {
        if (!vhost_vdpa_net_valid_svq_features(features, errp)) {
            goto err_svq;
        }

        iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last);
    }

    ncs = g_malloc0(sizeof(*ncs) * queue_pairs);

    for (i = 0; i < queue_pairs; i++) {
        ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                     vdpa_device_fd, i, 2, true, opts->x_svq,
                                     iova_range, iova_tree);
        if (!ncs[i]) {
            goto err;
        }
    }

    if (has_cvq) {
        nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                 vdpa_device_fd, i, 1, false,
                                 opts->x_svq, iova_range, iova_tree);
        if (!nc) {
            goto err;
        }
    }

    /* iova_tree ownership belongs to last NetClientState */
    g_steal_pointer(&iova_tree);
    return 0;

err:
    if (i) {
        for (i--; i >= 0; i--) {
            qemu_del_net_client(ncs[i]);
        }
    }

err_svq:
    qemu_close(vdpa_device_fd);

    return -1;
}