xref: /qemu/net/af-xdp.c (revision bed150be)
1cb039ef3SIlya Maximets /*
2cb039ef3SIlya Maximets  * AF_XDP network backend.
3cb039ef3SIlya Maximets  *
4cb039ef3SIlya Maximets  * Copyright (c) 2023 Red Hat, Inc.
5cb039ef3SIlya Maximets  *
6cb039ef3SIlya Maximets  * Authors:
7cb039ef3SIlya Maximets  *  Ilya Maximets <i.maximets@ovn.org>
8cb039ef3SIlya Maximets  *
9cb039ef3SIlya Maximets  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10cb039ef3SIlya Maximets  * See the COPYING file in the top-level directory.
11cb039ef3SIlya Maximets  */
12cb039ef3SIlya Maximets 
13cb039ef3SIlya Maximets 
14cb039ef3SIlya Maximets #include "qemu/osdep.h"
15cb039ef3SIlya Maximets #include <bpf/bpf.h>
16cb039ef3SIlya Maximets #include <linux/if_link.h>
17cb039ef3SIlya Maximets #include <linux/if_xdp.h>
18cb039ef3SIlya Maximets #include <net/if.h>
19cb039ef3SIlya Maximets #include <xdp/xsk.h>
20cb039ef3SIlya Maximets 
21cb039ef3SIlya Maximets #include "clients.h"
22cb039ef3SIlya Maximets #include "monitor/monitor.h"
23cb039ef3SIlya Maximets #include "net/net.h"
24cb039ef3SIlya Maximets #include "qapi/error.h"
25cb039ef3SIlya Maximets #include "qemu/cutils.h"
26cb039ef3SIlya Maximets #include "qemu/error-report.h"
27cb039ef3SIlya Maximets #include "qemu/iov.h"
28cb039ef3SIlya Maximets #include "qemu/main-loop.h"
29cb039ef3SIlya Maximets #include "qemu/memalign.h"
30cb039ef3SIlya Maximets 
31cb039ef3SIlya Maximets 
32cb039ef3SIlya Maximets typedef struct AFXDPState {
33cb039ef3SIlya Maximets     NetClientState       nc;
34cb039ef3SIlya Maximets 
35cb039ef3SIlya Maximets     struct xsk_socket    *xsk;
36cb039ef3SIlya Maximets     struct xsk_ring_cons rx;
37cb039ef3SIlya Maximets     struct xsk_ring_prod tx;
38cb039ef3SIlya Maximets     struct xsk_ring_cons cq;
39cb039ef3SIlya Maximets     struct xsk_ring_prod fq;
40cb039ef3SIlya Maximets 
41cb039ef3SIlya Maximets     char                 ifname[IFNAMSIZ];
42cb039ef3SIlya Maximets     int                  ifindex;
43cb039ef3SIlya Maximets     bool                 read_poll;
44cb039ef3SIlya Maximets     bool                 write_poll;
45cb039ef3SIlya Maximets     uint32_t             outstanding_tx;
46cb039ef3SIlya Maximets 
47cb039ef3SIlya Maximets     uint64_t             *pool;
48cb039ef3SIlya Maximets     uint32_t             n_pool;
49cb039ef3SIlya Maximets     char                 *buffer;
50cb039ef3SIlya Maximets     struct xsk_umem      *umem;
51cb039ef3SIlya Maximets 
52cb039ef3SIlya Maximets     uint32_t             n_queues;
53cb039ef3SIlya Maximets     uint32_t             xdp_flags;
54cb039ef3SIlya Maximets     bool                 inhibit;
55cb039ef3SIlya Maximets } AFXDPState;
56cb039ef3SIlya Maximets 
/*
 * Upper bound on the rx descriptors peeked by one af_xdp_send() call and
 * the number of frames it asks af_xdp_fq_refill() to put back afterwards.
 */
#define AF_XDP_BATCH_SIZE 64

/* fd handlers, referenced by af_xdp_update_fd_handler() before definition. */
static void af_xdp_writable(void *opaque);
static void af_xdp_send(void *opaque);
61cb039ef3SIlya Maximets 
62cb039ef3SIlya Maximets /* Set the event-loop handlers for the af-xdp backend. */
af_xdp_update_fd_handler(AFXDPState * s)63cb039ef3SIlya Maximets static void af_xdp_update_fd_handler(AFXDPState *s)
64cb039ef3SIlya Maximets {
65cb039ef3SIlya Maximets     qemu_set_fd_handler(xsk_socket__fd(s->xsk),
66cb039ef3SIlya Maximets                         s->read_poll ? af_xdp_send : NULL,
67cb039ef3SIlya Maximets                         s->write_poll ? af_xdp_writable : NULL,
68cb039ef3SIlya Maximets                         s);
69cb039ef3SIlya Maximets }
70cb039ef3SIlya Maximets 
71cb039ef3SIlya Maximets /* Update the read handler. */
af_xdp_read_poll(AFXDPState * s,bool enable)72cb039ef3SIlya Maximets static void af_xdp_read_poll(AFXDPState *s, bool enable)
73cb039ef3SIlya Maximets {
74cb039ef3SIlya Maximets     if (s->read_poll != enable) {
75cb039ef3SIlya Maximets         s->read_poll = enable;
76cb039ef3SIlya Maximets         af_xdp_update_fd_handler(s);
77cb039ef3SIlya Maximets     }
78cb039ef3SIlya Maximets }
79cb039ef3SIlya Maximets 
80cb039ef3SIlya Maximets /* Update the write handler. */
af_xdp_write_poll(AFXDPState * s,bool enable)81cb039ef3SIlya Maximets static void af_xdp_write_poll(AFXDPState *s, bool enable)
82cb039ef3SIlya Maximets {
83cb039ef3SIlya Maximets     if (s->write_poll != enable) {
84cb039ef3SIlya Maximets         s->write_poll = enable;
85cb039ef3SIlya Maximets         af_xdp_update_fd_handler(s);
86cb039ef3SIlya Maximets     }
87cb039ef3SIlya Maximets }
88cb039ef3SIlya Maximets 
/*
 * NetClientInfo .poll callback: switch both the read and the write
 * handler to 'enable' at once, re-registering the fd only if at least
 * one of them actually changes.
 */
static void af_xdp_poll(NetClientState *nc, bool enable)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);

    if (s->read_poll == enable && s->write_poll == enable) {
        /* Nothing to change. */
        return;
    }

    s->read_poll = enable;
    s->write_poll = enable;
    af_xdp_update_fd_handler(s);
}
99cb039ef3SIlya Maximets 
/*
 * Reap the completion ring: every completed frame address goes back on
 * top of the free pool and is no longer counted as outstanding.
 */
static void af_xdp_complete_tx(AFXDPState *s)
{
    uint32_t first = 0;
    uint32_t n_done, k;

    n_done = xsk_ring_cons__peek(&s->cq, XSK_RING_CONS__DEFAULT_NUM_DESCS,
                                 &first);
    if (!n_done) {
        return;
    }

    for (k = 0; k < n_done; k++) {
        s->pool[s->n_pool] = *xsk_ring_cons__comp_addr(&s->cq, first + k);
        s->n_pool++;
        s->outstanding_tx--;
    }

    xsk_ring_cons__release(&s->cq, n_done);
}
118cb039ef3SIlya Maximets 
119cb039ef3SIlya Maximets /*
120cb039ef3SIlya Maximets  * The fd_write() callback, invoked if the fd is marked as writable
121cb039ef3SIlya Maximets  * after a poll.
122cb039ef3SIlya Maximets  */
af_xdp_writable(void * opaque)123cb039ef3SIlya Maximets static void af_xdp_writable(void *opaque)
124cb039ef3SIlya Maximets {
125cb039ef3SIlya Maximets     AFXDPState *s = opaque;
126cb039ef3SIlya Maximets 
127cb039ef3SIlya Maximets     /* Try to recover buffers that are already sent. */
128cb039ef3SIlya Maximets     af_xdp_complete_tx(s);
129cb039ef3SIlya Maximets 
130cb039ef3SIlya Maximets     /*
131cb039ef3SIlya Maximets      * Unregister the handler, unless we still have packets to transmit
132cb039ef3SIlya Maximets      * and kernel needs a wake up.
133cb039ef3SIlya Maximets      */
134cb039ef3SIlya Maximets     if (!s->outstanding_tx || !xsk_ring_prod__needs_wakeup(&s->tx)) {
135cb039ef3SIlya Maximets         af_xdp_write_poll(s, false);
136cb039ef3SIlya Maximets     }
137cb039ef3SIlya Maximets 
138cb039ef3SIlya Maximets     /* Flush any buffered packets. */
139cb039ef3SIlya Maximets     qemu_flush_queued_packets(&s->nc);
140cb039ef3SIlya Maximets }
141cb039ef3SIlya Maximets 
/*
 * NetClientInfo .receive callback: copy one packet into a free umem frame
 * and submit it on the tx ring.
 *
 * Returns 'size' when the packet was submitted, and also when it is larger
 * than a umem frame (it is then reported as consumed without being sent).
 * Returns 0 when no free frame or no tx ring slot is available; write
 * polling is enabled in that case.
 */
static ssize_t af_xdp_receive(NetClientState *nc,
                              const uint8_t *buf, size_t size)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
    struct xdp_desc *tx_desc;
    uint32_t slot;

    /* Get back the buffers of packets that were already transmitted. */
    af_xdp_complete_tx(s);

    if (size > XSK_UMEM__DEFAULT_FRAME_SIZE) {
        /* Does not fit into a single frame, cannot be transmitted. */
        return size;
    }

    if (s->n_pool == 0 || xsk_ring_prod__reserve(&s->tx, 1, &slot) == 0) {
        /*
         * No free buffer or no room in the tx ring.  Wait until the socket
         * is writable; this also kicks the Tx if it was waiting on the CQ.
         */
        af_xdp_write_poll(s, true);
        return 0;
    }

    /* Pop a free frame and describe the packet in the reserved slot. */
    s->n_pool--;
    tx_desc = xsk_ring_prod__tx_desc(&s->tx, slot);
    tx_desc->addr = s->pool[s->n_pool];
    tx_desc->len = size;

    memcpy(xsk_umem__get_data(s->buffer, tx_desc->addr), buf, size);

    xsk_ring_prod__submit(&s->tx, 1);
    s->outstanding_tx++;

    if (xsk_ring_prod__needs_wakeup(&s->tx)) {
        af_xdp_write_poll(s, true);
    }

    return size;
}
183cb039ef3SIlya Maximets 
184cb039ef3SIlya Maximets /*
185cb039ef3SIlya Maximets  * Complete a previous send (backend --> guest) and enable the
186cb039ef3SIlya Maximets  * fd_read callback.
187cb039ef3SIlya Maximets  */
af_xdp_send_completed(NetClientState * nc,ssize_t len)188cb039ef3SIlya Maximets static void af_xdp_send_completed(NetClientState *nc, ssize_t len)
189cb039ef3SIlya Maximets {
190cb039ef3SIlya Maximets     AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
191cb039ef3SIlya Maximets 
192cb039ef3SIlya Maximets     af_xdp_read_poll(s, true);
193cb039ef3SIlya Maximets }
194cb039ef3SIlya Maximets 
/*
 * Move up to 'n' free frames from the pool to the fill ring.
 *
 * When the pool holds more than 'n' frames, exactly 'n' are moved and at
 * least one stays behind for Tx.  Otherwise the request is clamped to the
 * whole pool.  Nothing happens if there is no frame to move or the fill
 * ring cannot take the whole batch.
 */
static void af_xdp_fq_refill(AFXDPState *s, uint32_t n)
{
    uint32_t first = 0;
    uint32_t k;

    if (n + 1 > s->n_pool) {
        n = s->n_pool;
    }

    if (n == 0 || xsk_ring_prod__reserve(&s->fq, n, &first) == 0) {
        return;
    }

    for (k = 0; k < n; k++) {
        s->n_pool--;
        *xsk_ring_prod__fill_addr(&s->fq, first + k) = s->pool[s->n_pool];
    }
    xsk_ring_prod__submit(&s->fq, n);

    if (xsk_ring_prod__needs_wakeup(&s->fq)) {
        /* Receive was stalled for lack of buffers, get it going again. */
        af_xdp_read_poll(s, true);
    }
}
218cb039ef3SIlya Maximets 
af_xdp_send(void * opaque)219cb039ef3SIlya Maximets static void af_xdp_send(void *opaque)
220cb039ef3SIlya Maximets {
221cb039ef3SIlya Maximets     uint32_t i, n_rx, idx = 0;
222cb039ef3SIlya Maximets     AFXDPState *s = opaque;
223cb039ef3SIlya Maximets 
224cb039ef3SIlya Maximets     n_rx = xsk_ring_cons__peek(&s->rx, AF_XDP_BATCH_SIZE, &idx);
225cb039ef3SIlya Maximets     if (!n_rx) {
226cb039ef3SIlya Maximets         return;
227cb039ef3SIlya Maximets     }
228cb039ef3SIlya Maximets 
229cb039ef3SIlya Maximets     for (i = 0; i < n_rx; i++) {
230cb039ef3SIlya Maximets         const struct xdp_desc *desc;
231cb039ef3SIlya Maximets         struct iovec iov;
232cb039ef3SIlya Maximets 
233cb039ef3SIlya Maximets         desc = xsk_ring_cons__rx_desc(&s->rx, idx++);
234cb039ef3SIlya Maximets 
235cb039ef3SIlya Maximets         iov.iov_base = xsk_umem__get_data(s->buffer, desc->addr);
236cb039ef3SIlya Maximets         iov.iov_len = desc->len;
237cb039ef3SIlya Maximets 
238cb039ef3SIlya Maximets         s->pool[s->n_pool++] = desc->addr;
239cb039ef3SIlya Maximets 
240cb039ef3SIlya Maximets         if (!qemu_sendv_packet_async(&s->nc, &iov, 1,
241cb039ef3SIlya Maximets                                      af_xdp_send_completed)) {
242cb039ef3SIlya Maximets             /*
243cb039ef3SIlya Maximets              * The peer does not receive anymore.  Packet is queued, stop
244cb039ef3SIlya Maximets              * reading from the backend until af_xdp_send_completed().
245cb039ef3SIlya Maximets              */
246cb039ef3SIlya Maximets             af_xdp_read_poll(s, false);
247cb039ef3SIlya Maximets 
248cb039ef3SIlya Maximets             /* Return unused descriptors to not break the ring cache. */
249cb039ef3SIlya Maximets             xsk_ring_cons__cancel(&s->rx, n_rx - i - 1);
250cb039ef3SIlya Maximets             n_rx = i + 1;
251cb039ef3SIlya Maximets             break;
252cb039ef3SIlya Maximets         }
253cb039ef3SIlya Maximets     }
254cb039ef3SIlya Maximets 
255cb039ef3SIlya Maximets     /* Release actually sent descriptors and try to re-fill. */
256cb039ef3SIlya Maximets     xsk_ring_cons__release(&s->rx, n_rx);
257cb039ef3SIlya Maximets     af_xdp_fq_refill(s, AF_XDP_BATCH_SIZE);
258cb039ef3SIlya Maximets }
259cb039ef3SIlya Maximets 
260cb039ef3SIlya Maximets /* Flush and close. */
af_xdp_cleanup(NetClientState * nc)261cb039ef3SIlya Maximets static void af_xdp_cleanup(NetClientState *nc)
262cb039ef3SIlya Maximets {
263cb039ef3SIlya Maximets     AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
264cb039ef3SIlya Maximets 
265cb039ef3SIlya Maximets     qemu_purge_queued_packets(nc);
266cb039ef3SIlya Maximets 
267cb039ef3SIlya Maximets     af_xdp_poll(nc, false);
268cb039ef3SIlya Maximets 
269cb039ef3SIlya Maximets     xsk_socket__delete(s->xsk);
270cb039ef3SIlya Maximets     s->xsk = NULL;
271cb039ef3SIlya Maximets     g_free(s->pool);
272cb039ef3SIlya Maximets     s->pool = NULL;
273cb039ef3SIlya Maximets     xsk_umem__delete(s->umem);
274cb039ef3SIlya Maximets     s->umem = NULL;
275cb039ef3SIlya Maximets     qemu_vfree(s->buffer);
276cb039ef3SIlya Maximets     s->buffer = NULL;
277cb039ef3SIlya Maximets 
278cb039ef3SIlya Maximets     /* Remove the program if it's the last open queue. */
279cb039ef3SIlya Maximets     if (!s->inhibit && nc->queue_index == s->n_queues - 1 && s->xdp_flags
280cb039ef3SIlya Maximets         && bpf_xdp_detach(s->ifindex, s->xdp_flags, NULL) != 0) {
281cb039ef3SIlya Maximets         fprintf(stderr,
282cb039ef3SIlya Maximets                 "af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
283cb039ef3SIlya Maximets                 s->ifname, s->ifindex);
284cb039ef3SIlya Maximets     }
285cb039ef3SIlya Maximets }
286cb039ef3SIlya Maximets 
af_xdp_umem_create(AFXDPState * s,int sock_fd,Error ** errp)287cb039ef3SIlya Maximets static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
288cb039ef3SIlya Maximets {
289cb039ef3SIlya Maximets     struct xsk_umem_config config = {
290cb039ef3SIlya Maximets         .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
291cb039ef3SIlya Maximets         .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
292cb039ef3SIlya Maximets         .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
293cb039ef3SIlya Maximets         .frame_headroom = 0,
294cb039ef3SIlya Maximets     };
295cb039ef3SIlya Maximets     uint64_t n_descs;
296cb039ef3SIlya Maximets     uint64_t size;
297cb039ef3SIlya Maximets     int64_t i;
298cb039ef3SIlya Maximets     int ret;
299cb039ef3SIlya Maximets 
300cb039ef3SIlya Maximets     /* Number of descriptors if all 4 queues (rx, tx, cq, fq) are full. */
301cb039ef3SIlya Maximets     n_descs = (XSK_RING_PROD__DEFAULT_NUM_DESCS
302cb039ef3SIlya Maximets                + XSK_RING_CONS__DEFAULT_NUM_DESCS) * 2;
303cb039ef3SIlya Maximets     size = n_descs * XSK_UMEM__DEFAULT_FRAME_SIZE;
304cb039ef3SIlya Maximets 
305cb039ef3SIlya Maximets     s->buffer = qemu_memalign(qemu_real_host_page_size(), size);
306cb039ef3SIlya Maximets     memset(s->buffer, 0, size);
307cb039ef3SIlya Maximets 
308cb039ef3SIlya Maximets     if (sock_fd < 0) {
309cb039ef3SIlya Maximets         ret = xsk_umem__create(&s->umem, s->buffer, size,
310cb039ef3SIlya Maximets                                &s->fq, &s->cq, &config);
311cb039ef3SIlya Maximets     } else {
312cb039ef3SIlya Maximets         ret = xsk_umem__create_with_fd(&s->umem, sock_fd, s->buffer, size,
313cb039ef3SIlya Maximets                                        &s->fq, &s->cq, &config);
314cb039ef3SIlya Maximets     }
315cb039ef3SIlya Maximets 
316cb039ef3SIlya Maximets     if (ret) {
317cb039ef3SIlya Maximets         qemu_vfree(s->buffer);
318cb039ef3SIlya Maximets         error_setg_errno(errp, errno,
319cb039ef3SIlya Maximets                          "failed to create umem for %s queue_index: %d",
320cb039ef3SIlya Maximets                          s->ifname, s->nc.queue_index);
321cb039ef3SIlya Maximets         return -1;
322cb039ef3SIlya Maximets     }
323cb039ef3SIlya Maximets 
324cb039ef3SIlya Maximets     s->pool = g_new(uint64_t, n_descs);
325cb039ef3SIlya Maximets     /* Fill the pool in the opposite order, because it's a LIFO queue. */
326cb039ef3SIlya Maximets     for (i = n_descs; i >= 0; i--) {
327cb039ef3SIlya Maximets         s->pool[i] = i * XSK_UMEM__DEFAULT_FRAME_SIZE;
328cb039ef3SIlya Maximets     }
329cb039ef3SIlya Maximets     s->n_pool = n_descs;
330cb039ef3SIlya Maximets 
331cb039ef3SIlya Maximets     af_xdp_fq_refill(s, XSK_RING_PROD__DEFAULT_NUM_DESCS);
332cb039ef3SIlya Maximets 
333cb039ef3SIlya Maximets     return 0;
334cb039ef3SIlya Maximets }
335cb039ef3SIlya Maximets 
af_xdp_socket_create(AFXDPState * s,const NetdevAFXDPOptions * opts,Error ** errp)336cb039ef3SIlya Maximets static int af_xdp_socket_create(AFXDPState *s,
337cb039ef3SIlya Maximets                                 const NetdevAFXDPOptions *opts, Error **errp)
338cb039ef3SIlya Maximets {
339cb039ef3SIlya Maximets     struct xsk_socket_config cfg = {
340cb039ef3SIlya Maximets         .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
341cb039ef3SIlya Maximets         .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
342cb039ef3SIlya Maximets         .libxdp_flags = 0,
343cb039ef3SIlya Maximets         .bind_flags = XDP_USE_NEED_WAKEUP,
344cb039ef3SIlya Maximets         .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
345cb039ef3SIlya Maximets     };
346cb039ef3SIlya Maximets     int queue_id, error = 0;
347cb039ef3SIlya Maximets 
348cb039ef3SIlya Maximets     s->inhibit = opts->has_inhibit && opts->inhibit;
349cb039ef3SIlya Maximets     if (s->inhibit) {
350cb039ef3SIlya Maximets         cfg.libxdp_flags |= XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD;
351cb039ef3SIlya Maximets     }
352cb039ef3SIlya Maximets 
353cb039ef3SIlya Maximets     if (opts->has_force_copy && opts->force_copy) {
354cb039ef3SIlya Maximets         cfg.bind_flags |= XDP_COPY;
355cb039ef3SIlya Maximets     }
356cb039ef3SIlya Maximets 
357cb039ef3SIlya Maximets     queue_id = s->nc.queue_index;
358cb039ef3SIlya Maximets     if (opts->has_start_queue && opts->start_queue > 0) {
359cb039ef3SIlya Maximets         queue_id += opts->start_queue;
360cb039ef3SIlya Maximets     }
361cb039ef3SIlya Maximets 
362cb039ef3SIlya Maximets     if (opts->has_mode) {
363cb039ef3SIlya Maximets         /* Specific mode requested. */
364cb039ef3SIlya Maximets         cfg.xdp_flags |= (opts->mode == AFXDP_MODE_NATIVE)
365cb039ef3SIlya Maximets                          ? XDP_FLAGS_DRV_MODE : XDP_FLAGS_SKB_MODE;
366cb039ef3SIlya Maximets         if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
367cb039ef3SIlya Maximets                                s->umem, &s->rx, &s->tx, &cfg)) {
368cb039ef3SIlya Maximets             error = errno;
369cb039ef3SIlya Maximets         }
370cb039ef3SIlya Maximets     } else {
371cb039ef3SIlya Maximets         /* No mode requested, try native first. */
372cb039ef3SIlya Maximets         cfg.xdp_flags |= XDP_FLAGS_DRV_MODE;
373cb039ef3SIlya Maximets 
374cb039ef3SIlya Maximets         if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
375cb039ef3SIlya Maximets                                s->umem, &s->rx, &s->tx, &cfg)) {
376cb039ef3SIlya Maximets             /* Can't use native mode, try skb. */
377cb039ef3SIlya Maximets             cfg.xdp_flags &= ~XDP_FLAGS_DRV_MODE;
378cb039ef3SIlya Maximets             cfg.xdp_flags |= XDP_FLAGS_SKB_MODE;
379cb039ef3SIlya Maximets 
380cb039ef3SIlya Maximets             if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
381cb039ef3SIlya Maximets                                    s->umem, &s->rx, &s->tx, &cfg)) {
382cb039ef3SIlya Maximets                 error = errno;
383cb039ef3SIlya Maximets             }
384cb039ef3SIlya Maximets         }
385cb039ef3SIlya Maximets     }
386cb039ef3SIlya Maximets 
387cb039ef3SIlya Maximets     if (error) {
388cb039ef3SIlya Maximets         error_setg_errno(errp, error,
389cb039ef3SIlya Maximets                          "failed to create AF_XDP socket for %s queue_id: %d",
390cb039ef3SIlya Maximets                          s->ifname, queue_id);
391cb039ef3SIlya Maximets         return -1;
392cb039ef3SIlya Maximets     }
393cb039ef3SIlya Maximets 
394cb039ef3SIlya Maximets     s->xdp_flags = cfg.xdp_flags;
395cb039ef3SIlya Maximets 
396cb039ef3SIlya Maximets     return 0;
397cb039ef3SIlya Maximets }
398cb039ef3SIlya Maximets 
399cb039ef3SIlya Maximets /* NetClientInfo methods. */
400cb039ef3SIlya Maximets static NetClientInfo net_af_xdp_info = {
401cb039ef3SIlya Maximets     .type = NET_CLIENT_DRIVER_AF_XDP,
402cb039ef3SIlya Maximets     .size = sizeof(AFXDPState),
403cb039ef3SIlya Maximets     .receive = af_xdp_receive,
404cb039ef3SIlya Maximets     .poll = af_xdp_poll,
405cb039ef3SIlya Maximets     .cleanup = af_xdp_cleanup,
406cb039ef3SIlya Maximets };
407cb039ef3SIlya Maximets 
parse_socket_fds(const char * sock_fds_str,int64_t n_expected,Error ** errp)408cb039ef3SIlya Maximets static int *parse_socket_fds(const char *sock_fds_str,
409cb039ef3SIlya Maximets                              int64_t n_expected, Error **errp)
410cb039ef3SIlya Maximets {
411cb039ef3SIlya Maximets     gchar **substrings = g_strsplit(sock_fds_str, ":", -1);
412cb039ef3SIlya Maximets     int64_t i, n_sock_fds = g_strv_length(substrings);
413cb039ef3SIlya Maximets     int *sock_fds = NULL;
414cb039ef3SIlya Maximets 
415cb039ef3SIlya Maximets     if (n_sock_fds != n_expected) {
416cb039ef3SIlya Maximets         error_setg(errp, "expected %"PRIi64" socket fds, got %"PRIi64,
417cb039ef3SIlya Maximets                    n_expected, n_sock_fds);
418cb039ef3SIlya Maximets         goto exit;
419cb039ef3SIlya Maximets     }
420cb039ef3SIlya Maximets 
421cb039ef3SIlya Maximets     sock_fds = g_new(int, n_sock_fds);
422cb039ef3SIlya Maximets 
423cb039ef3SIlya Maximets     for (i = 0; i < n_sock_fds; i++) {
424cb039ef3SIlya Maximets         sock_fds[i] = monitor_fd_param(monitor_cur(), substrings[i], errp);
425cb039ef3SIlya Maximets         if (sock_fds[i] < 0) {
426cb039ef3SIlya Maximets             g_free(sock_fds);
427cb039ef3SIlya Maximets             sock_fds = NULL;
428cb039ef3SIlya Maximets             goto exit;
429cb039ef3SIlya Maximets         }
430cb039ef3SIlya Maximets     }
431cb039ef3SIlya Maximets 
432cb039ef3SIlya Maximets exit:
433cb039ef3SIlya Maximets     g_strfreev(substrings);
434cb039ef3SIlya Maximets     return sock_fds;
435cb039ef3SIlya Maximets }
436cb039ef3SIlya Maximets 
437cb039ef3SIlya Maximets /*
438cb039ef3SIlya Maximets  * The exported init function.
439cb039ef3SIlya Maximets  *
440cb039ef3SIlya Maximets  * ... -netdev af-xdp,ifname="..."
441cb039ef3SIlya Maximets  */
net_init_af_xdp(const Netdev * netdev,const char * name,NetClientState * peer,Error ** errp)442cb039ef3SIlya Maximets int net_init_af_xdp(const Netdev *netdev,
443cb039ef3SIlya Maximets                     const char *name, NetClientState *peer, Error **errp)
444cb039ef3SIlya Maximets {
445cb039ef3SIlya Maximets     const NetdevAFXDPOptions *opts = &netdev->u.af_xdp;
446cb039ef3SIlya Maximets     NetClientState *nc, *nc0 = NULL;
447cb039ef3SIlya Maximets     unsigned int ifindex;
448cb039ef3SIlya Maximets     uint32_t prog_id = 0;
449*bed150beSPeter Maydell     g_autofree int *sock_fds = NULL;
450cb039ef3SIlya Maximets     int64_t i, queues;
451cb039ef3SIlya Maximets     Error *err = NULL;
452cb039ef3SIlya Maximets     AFXDPState *s;
453cb039ef3SIlya Maximets 
454cb039ef3SIlya Maximets     ifindex = if_nametoindex(opts->ifname);
455cb039ef3SIlya Maximets     if (!ifindex) {
456cb039ef3SIlya Maximets         error_setg_errno(errp, errno, "failed to get ifindex for '%s'",
457cb039ef3SIlya Maximets                          opts->ifname);
458cb039ef3SIlya Maximets         return -1;
459cb039ef3SIlya Maximets     }
460cb039ef3SIlya Maximets 
461cb039ef3SIlya Maximets     queues = opts->has_queues ? opts->queues : 1;
462cb039ef3SIlya Maximets     if (queues < 1) {
463cb039ef3SIlya Maximets         error_setg(errp, "invalid number of queues (%" PRIi64 ") for '%s'",
464cb039ef3SIlya Maximets                    queues, opts->ifname);
465cb039ef3SIlya Maximets         return -1;
466cb039ef3SIlya Maximets     }
467cb039ef3SIlya Maximets 
468cb039ef3SIlya Maximets     if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
469cb039ef3SIlya Maximets         error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
470cb039ef3SIlya Maximets         return -1;
471cb039ef3SIlya Maximets     }
472cb039ef3SIlya Maximets 
473cb039ef3SIlya Maximets     if (opts->sock_fds) {
474cb039ef3SIlya Maximets         sock_fds = parse_socket_fds(opts->sock_fds, queues, errp);
475cb039ef3SIlya Maximets         if (!sock_fds) {
476cb039ef3SIlya Maximets             return -1;
477cb039ef3SIlya Maximets         }
478cb039ef3SIlya Maximets     }
479cb039ef3SIlya Maximets 
480cb039ef3SIlya Maximets     for (i = 0; i < queues; i++) {
481cb039ef3SIlya Maximets         nc = qemu_new_net_client(&net_af_xdp_info, peer, "af-xdp", name);
482cb039ef3SIlya Maximets         qemu_set_info_str(nc, "af-xdp%"PRIi64" to %s", i, opts->ifname);
483cb039ef3SIlya Maximets         nc->queue_index = i;
484cb039ef3SIlya Maximets 
485cb039ef3SIlya Maximets         if (!nc0) {
486cb039ef3SIlya Maximets             nc0 = nc;
487cb039ef3SIlya Maximets         }
488cb039ef3SIlya Maximets 
489cb039ef3SIlya Maximets         s = DO_UPCAST(AFXDPState, nc, nc);
490cb039ef3SIlya Maximets 
491cb039ef3SIlya Maximets         pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
492cb039ef3SIlya Maximets         s->ifindex = ifindex;
493cb039ef3SIlya Maximets         s->n_queues = queues;
494cb039ef3SIlya Maximets 
495cb039ef3SIlya Maximets         if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
496cb039ef3SIlya Maximets             || af_xdp_socket_create(s, opts, errp)) {
497cb039ef3SIlya Maximets             /* Make sure the XDP program will be removed. */
498cb039ef3SIlya Maximets             s->n_queues = i;
499cb039ef3SIlya Maximets             error_propagate(errp, err);
500cb039ef3SIlya Maximets             goto err;
501cb039ef3SIlya Maximets         }
502cb039ef3SIlya Maximets     }
503cb039ef3SIlya Maximets 
504cb039ef3SIlya Maximets     if (nc0) {
505cb039ef3SIlya Maximets         s = DO_UPCAST(AFXDPState, nc, nc0);
506cb039ef3SIlya Maximets         if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
507cb039ef3SIlya Maximets             error_setg_errno(errp, errno,
508cb039ef3SIlya Maximets                              "no XDP program loaded on '%s', ifindex: %d",
509cb039ef3SIlya Maximets                              s->ifname, s->ifindex);
510cb039ef3SIlya Maximets             goto err;
511cb039ef3SIlya Maximets         }
512cb039ef3SIlya Maximets     }
513cb039ef3SIlya Maximets 
514cb039ef3SIlya Maximets     af_xdp_read_poll(s, true); /* Initially only poll for reads. */
515cb039ef3SIlya Maximets 
516cb039ef3SIlya Maximets     return 0;
517cb039ef3SIlya Maximets 
518cb039ef3SIlya Maximets err:
519cb039ef3SIlya Maximets     if (nc0) {
520cb039ef3SIlya Maximets         qemu_del_net_client(nc0);
521cb039ef3SIlya Maximets     }
522cb039ef3SIlya Maximets 
523cb039ef3SIlya Maximets     return -1;
524cb039ef3SIlya Maximets }
525