xref: /linux/net/xdp/xsk.c (revision 7fcf26b3)
// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <linux/vmalloc.h>
#include <net/xdp_sock_drv.h>
#include <net/busy_poll.h>
#include <net/netdev_rx_queue.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

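/* TX_BATCH_SIZE bounds the number of descriptors handled per call to
 * __xsk_generic_xmit(), and MAX_PER_SOCKET_BUDGET bounds the work done
 * per socket in xsk_tx_peek_desc() so that sockets sharing a buffer
 * pool each get a fair share of the Tx budget.
 */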
#define TX_BATCH_SIZE 32
#define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE)

static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);

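/* The need_wakeup flags below are mirrored in the cached_need_wakeup
 * field of the buffer pool so that the rings shared with user space
 * only have to be written when a flag actually changes state. Setting
 * XDP_RING_NEED_WAKEUP in a ring tells user space that it must
 * explicitly wake up the kernel, e.g. via poll() or sendto(), to get
 * packets processed.
 */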
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
		return;

	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);

void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);

void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
{
	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
		return;

	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);

void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);

bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
{
	return pool->uses_need_wakeup;
}
EXPORT_SYMBOL(xsk_uses_need_wakeup);

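/* Look up the buffer pool registered for a given queue id on a device.
 * Rx and Tx queue ids share one number space, so check the Rx side
 * first and fall back to Tx.
 */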
struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
					    u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		return dev->_rx[queue_id].pool;
	if (queue_id < dev->real_num_tx_queues)
		return dev->_tx[queue_id].pool;

	return NULL;
}
EXPORT_SYMBOL(xsk_get_pool_from_qid);

void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
{
	if (queue_id < dev->num_rx_queues)
		dev->_rx[queue_id].pool = NULL;
	if (queue_id < dev->num_tx_queues)
		dev->_tx[queue_id].pool = NULL;
}

/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
 * not know if the device has more tx queues than rx, or the opposite.
 * This might also change during run time.
 */
int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
			u16 queue_id)
{
	if (queue_id >= max_t(unsigned int,
			      dev->real_num_rx_queues,
			      dev->real_num_tx_queues))
		return -EINVAL;

	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].pool = pool;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].pool = pool;

	return 0;
}

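/* Zero-copy Rx: publish the descriptor backing @xskb into the socket's
 * Rx ring. On success the xskb is handed back to the pool with
 * xp_release(); on failure the rx_queue_full statistic is bumped and
 * the caller keeps ownership of the buffer.
 */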
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
			u32 flags)
{
	u64 addr;
	int err;

	addr = xp_get_handle(xskb);
	err = xskq_prod_reserve_desc(xs->rx, addr, len, flags);
	if (err) {
		xs->rx_queue_full++;
		return err;
	}

	xp_release(xskb);
	return 0;
}

static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
	u32 frags = xdp_buff_has_frags(xdp);
	struct xdp_buff_xsk *pos, *tmp;
	struct list_head *xskb_list;
	u32 contd = 0;
	int err;

	if (frags)
		contd = XDP_PKT_CONTD;

	err = __xsk_rcv_zc(xs, xskb, len, contd);
	if (err)
		goto err;
	if (likely(!frags))
		return 0;

	xskb_list = &xskb->pool->xskb_list;
	list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) {
		if (list_is_singular(xskb_list))
			contd = 0;
		len = pos->xdp.data_end - pos->xdp.data;
		err = __xsk_rcv_zc(xs, pos, len, contd);
		if (err)
			goto err;
		list_del(&pos->xskb_list_node);
	}

	return 0;
err:
	xsk_buff_free(xdp);
	return err;
}

static void *xsk_copy_xdp_start(struct xdp_buff *from)
{
	if (unlikely(xdp_data_meta_unsupported(from)))
		return from->data;
	else
		return from->data_meta;
}

static u32 xsk_copy_xdp(void *to, void **from, u32 to_len,
			u32 *from_len, skb_frag_t **frag, u32 rem)
{
	u32 copied = 0;

	while (1) {
		u32 copy_len = min_t(u32, *from_len, to_len);

		memcpy(to, *from, copy_len);
		copied += copy_len;
		if (rem == copied)
			return copied;

		if (*from_len == copy_len) {
			*from = skb_frag_address(*frag);
			*from_len = skb_frag_size((*frag)++);
		} else {
			*from += copy_len;
			*from_len -= copy_len;
		}
		if (to_len == copy_len)
			return copied;

		to_len -= copy_len;
		to += copy_len;
	}
}

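/* Copy-mode Rx: copy the packet (including any metadata) out of the
 * driver's xdp_buff into one or more buffers allocated from the
 * socket's pool. Packets larger than one frame are split over multiple
 * descriptors, all but the last carrying the XDP_PKT_CONTD flag.
 */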
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool);
	void *copy_from = xsk_copy_xdp_start(xdp), *copy_to;
	u32 from_len, meta_len, rem, num_desc;
	struct xdp_buff_xsk *xskb;
	struct xdp_buff *xsk_xdp;
	skb_frag_t *frag;

	from_len = xdp->data_end - copy_from;
	meta_len = xdp->data - copy_from;
	rem = len + meta_len;

	if (len <= frame_size && !xdp_buff_has_frags(xdp)) {
		int err;

		xsk_xdp = xsk_buff_alloc(xs->pool);
		if (!xsk_xdp) {
			xs->rx_dropped++;
			return -ENOMEM;
		}
		memcpy(xsk_xdp->data - meta_len, copy_from, rem);
		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
		err = __xsk_rcv_zc(xs, xskb, len, 0);
		if (err) {
			xsk_buff_free(xsk_xdp);
			return err;
		}

		return 0;
	}

	num_desc = (len - 1) / frame_size + 1;

	if (!xsk_buff_can_alloc(xs->pool, num_desc)) {
		xs->rx_dropped++;
		return -ENOMEM;
	}
	if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
		xs->rx_queue_full++;
		return -ENOBUFS;
	}

	if (xdp_buff_has_frags(xdp)) {
		struct skb_shared_info *sinfo;

		sinfo = xdp_get_shared_info_from_buff(xdp);
		frag = &sinfo->frags[0];
	}

	do {
		u32 to_len = frame_size + meta_len;
		u32 copied;

		xsk_xdp = xsk_buff_alloc(xs->pool);
		copy_to = xsk_xdp->data - meta_len;

		copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem);
		rem -= copied;

		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
		__xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0);
		meta_len = 0;
	} while (rem);

	return 0;
}

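/* The Tx ring is considered writeable as long as no more than half of
 * its entries are outstanding; this is what gates EPOLLOUT and
 * sk_write_space() notifications below.
 */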
static bool xsk_tx_writeable(struct xdp_sock *xs)
{
	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
		return false;

	return true;
}

static bool xsk_is_bound(struct xdp_sock *xs)
{
	if (READ_ONCE(xs->state) == XSK_BOUND) {
		/* Matches smp_wmb() in bind(). */
		smp_rmb();
		return true;
	}
	return false;
}

static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	if (!xsk_is_bound(xs))
		return -ENXIO;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
	return 0;
}

static void xsk_flush(struct xdp_sock *xs)
{
	xskq_prod_submit(xs->rx);
	__xskq_cons_release(xs->pool->fq);
	sock_def_readable(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 len = xdp_get_buff_len(xdp);
	int err;

	spin_lock_bh(&xs->rx_lock);
	err = xsk_rcv_check(xs, xdp, len);
	if (!err) {
		err = __xsk_rcv(xs, xdp, len);
		xsk_flush(xs);
	}
	spin_unlock_bh(&xs->rx_lock);
	return err;
}

static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 len = xdp_get_buff_len(xdp);
	int err;

	err = xsk_rcv_check(xs, xdp, len);
	if (err)
		return err;

	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
		len = xdp->data_end - xdp->data;
		return xsk_rcv_zc(xs, xdp, len);
	}

	err = __xsk_rcv(xs, xdp, len);
	if (!err)
		xdp_return_buff(xdp);
	return err;
}

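/* Redirect path used by the XDP_REDIRECT action: receive the packet
 * into the socket and put the socket on the per-CPU flush list. The
 * Rx ring updates are made visible to user space in one batch when
 * __xsk_map_flush() runs at the end of the NAPI poll.
 */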
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	int err;

	err = xsk_rcv(xs, xdp);
	if (err)
		return err;

	if (!xs->flush_node.prev)
		list_add(&xs->flush_node, flush_list);

	return 0;
}

void __xsk_map_flush(void)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	struct xdp_sock *xs, *tmp;

	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
		xsk_flush(xs);
		__list_del_clearprev(&xs->flush_node);
	}
}

#ifdef CONFIG_DEBUG_NET
bool xsk_map_check_flush(void)
{
	if (list_empty(this_cpu_ptr(&xskmap_flush_list)))
		return false;
	__xsk_map_flush();
	return true;
}
#endif

void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
{
	xskq_prod_submit_n(pool->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_tx_completed);

void xsk_tx_release(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		__xskq_cons_release(xs->tx);
		if (xsk_tx_writeable(xs))
			xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_tx_release);

bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
{
	bool budget_exhausted = false;
	struct xdp_sock *xs;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) {
			budget_exhausted = true;
			continue;
		}

		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
			if (xskq_has_descs(xs->tx))
				xskq_cons_release(xs->tx);
			continue;
		}

		xs->tx_budget_spent++;

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
			goto out;

		xskq_cons_release(xs->tx);
		rcu_read_unlock();
		return true;
	}

	if (budget_exhausted) {
		list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
			xs->tx_budget_spent = 0;

		budget_exhausted = false;
		goto again;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_tx_peek_desc);

static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
{
	struct xdp_desc *descs = pool->tx_descs;
	u32 nb_pkts = 0;

	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
		nb_pkts++;

	xsk_tx_release(pool);
	return nb_pkts;
}

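/* Batched version of xsk_tx_peek_desc(). The fast path only handles
 * the common case of a single socket bound to the pool; with several
 * sockets on the Tx list we fall back to the non-batched version
 * above.
 */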
u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	if (!list_is_singular(&pool->xsk_tx_list)) {
		/* Fallback to the non-batched version */
		rcu_read_unlock();
		return xsk_tx_peek_release_fallback(pool, nb_pkts);
	}

	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
	if (!xs) {
		nb_pkts = 0;
		goto out;
	}

	nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);

	/* This is the backpressure mechanism for the Tx path. Try to
	 * reserve space in the completion queue for all packets, but
	 * if there are fewer slots available, just process that many
	 * packets. This avoids having to implement any buffering in
	 * the Tx path.
	 */
	nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
	if (!nb_pkts)
		goto out;

	nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
	if (!nb_pkts) {
		xs->tx->queue_empty_descs++;
		goto out;
	}

	__xskq_cons_release(xs->tx);
	xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
	xs->sk.sk_write_space(&xs->sk);

out:
	rcu_read_unlock();
	return nb_pkts;
}
EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);

static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
	struct net_device *dev = xs->dev;

	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
}

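/* Completion queue helpers for the generic (copy) Tx path. The cq_lock
 * is taken with interrupts disabled since completions are submitted
 * from the skb destructor, which may run in IRQ context. The number of
 * Tx descriptors an skb consumes is stashed in the skb's
 * destructor_arg so that the right number of completion entries can be
 * submitted or cancelled when the skb is freed.
 */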
static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&xs->pool->cq_lock, flags);
	ret = xskq_prod_reserve_addr(xs->pool->cq, addr);
	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);

	return ret;
}

static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n)
{
	unsigned long flags;

	spin_lock_irqsave(&xs->pool->cq_lock, flags);
	xskq_prod_submit_n(xs->pool->cq, n);
	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
}

static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n)
{
	unsigned long flags;

	spin_lock_irqsave(&xs->pool->cq_lock, flags);
	xskq_prod_cancel_n(xs->pool->cq, n);
	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
}

static u32 xsk_get_num_desc(struct sk_buff *skb)
{
	return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
}

static void xsk_destruct_skb(struct sk_buff *skb)
{
	struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;

	if (compl->tx_timestamp) {
		/* sw completion timestamp, not a real one */
		*compl->tx_timestamp = ktime_get_tai_fast_ns();
	}

	xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb));
	sock_wfree(skb);
}

static void xsk_set_destructor_arg(struct sk_buff *skb)
{
	long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;

	skb_shinfo(skb)->destructor_arg = (void *)num;
}

static void xsk_consume_skb(struct sk_buff *skb)
{
	struct xdp_sock *xs = xdp_sk(skb->sk);

	skb->destructor = sock_wfree;
	xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb));
	/* Free skb without triggering the perf drop trace */
	consume_skb(skb);
	xs->skb = NULL;
}

static void xsk_drop_skb(struct sk_buff *skb)
{
	xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb);
	xsk_consume_skb(skb);
}

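/* Zero-linear skb construction used when the driver sets
 * IFF_TX_SKB_NO_LINEAR: instead of copying, the umem pages backing the
 * descriptor are attached to the skb as page fragments, with a page
 * reference taken for each fragment.
 */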
static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
					      struct xdp_desc *desc)
{
	struct xsk_buff_pool *pool = xs->pool;
	u32 hr, len, ts, offset, copy, copied;
	struct sk_buff *skb = xs->skb;
	struct page *page;
	void *buffer;
	int err, i;
	u64 addr;

	if (!skb) {
		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));

		skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
		if (unlikely(!skb))
			return ERR_PTR(err);

		skb_reserve(skb, hr);
	}

	addr = desc->addr;
	len = desc->len;
	ts = pool->unaligned ? len : pool->chunk_size;

	buffer = xsk_buff_raw_get_data(pool, addr);
	offset = offset_in_page(buffer);
	addr = buffer - pool->addrs;

	for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
		if (unlikely(i >= MAX_SKB_FRAGS))
			return ERR_PTR(-EOVERFLOW);

		page = pool->umem->pgs[addr >> PAGE_SHIFT];
		get_page(page);

		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
		skb_fill_page_desc(skb, i, page, offset, copy);

		copied += copy;
		addr += copy;
		offset = 0;
	}

	skb->len += len;
	skb->data_len += len;
	skb->truesize += ts;

	refcount_add(ts, &xs->sk.sk_wmem_alloc);

	return skb;
}

static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
				     struct xdp_desc *desc)
{
	struct xsk_tx_metadata *meta = NULL;
	struct net_device *dev = xs->dev;
	struct sk_buff *skb = xs->skb;
	bool first_frag = false;
	int err;

	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
		skb = xsk_build_skb_zerocopy(xs, desc);
		if (IS_ERR(skb)) {
			err = PTR_ERR(skb);
			goto free_err;
		}
	} else {
		u32 hr, tr, len;
		void *buffer;

		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
		len = desc->len;

		if (!skb) {
			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
			tr = dev->needed_tailroom;
			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
			if (unlikely(!skb))
				goto free_err;

			skb_reserve(skb, hr);
			skb_put(skb, len);

			err = skb_store_bits(skb, 0, buffer, len);
			if (unlikely(err)) {
				kfree_skb(skb);
				goto free_err;
			}

			first_frag = true;
		} else {
			int nr_frags = skb_shinfo(skb)->nr_frags;
			struct page *page;
			u8 *vaddr;

			if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
				err = -EOVERFLOW;
				goto free_err;
			}

			page = alloc_page(xs->sk.sk_allocation);
			if (unlikely(!page)) {
				err = -EAGAIN;
				goto free_err;
			}

			vaddr = kmap_local_page(page);
			memcpy(vaddr, buffer, len);
			kunmap_local(vaddr);

			skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
		}

		if (first_frag && desc->options & XDP_TX_METADATA) {
			if (unlikely(xs->pool->tx_metadata_len == 0)) {
				err = -EINVAL;
				goto free_err;
			}

			meta = buffer - xs->pool->tx_metadata_len;
			if (unlikely(!xsk_buff_valid_tx_metadata(meta))) {
				err = -EINVAL;
				goto free_err;
			}

			if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
				if (unlikely(meta->request.csum_start +
					     meta->request.csum_offset +
					     sizeof(__sum16) > len)) {
					err = -EINVAL;
					goto free_err;
				}

				skb->csum_start = hr + meta->request.csum_start;
				skb->csum_offset = meta->request.csum_offset;
				skb->ip_summed = CHECKSUM_PARTIAL;

				if (unlikely(xs->pool->tx_sw_csum)) {
					err = skb_checksum_help(skb);
					if (err)
						goto free_err;
				}
			}
		}
	}

	skb->dev = dev;
	skb->priority = READ_ONCE(xs->sk.sk_priority);
	skb->mark = READ_ONCE(xs->sk.sk_mark);
	skb->destructor = xsk_destruct_skb;
	xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
	xsk_set_destructor_arg(skb);

	return skb;

free_err:
	if (err == -EOVERFLOW) {
		/* Drop the packet */
		xsk_set_destructor_arg(xs->skb);
		xsk_drop_skb(xs->skb);
		xskq_cons_release(xs->tx);
	} else {
		/* Let application retry */
		xsk_cq_cancel_locked(xs, 1);
	}

	return ERR_PTR(err);
}

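/* Generic (copy) Tx: pop descriptors off the Tx ring, reserve a
 * completion slot for each, build an skb and push it straight out on
 * the bound queue with __dev_direct_xmit(). A multi-buffer packet is
 * accumulated in xs->skb across iterations until its final descriptor
 * has been seen, i.e. until xp_mb_desc() returns false.
 */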
static int __xsk_generic_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	u32 max_batch = TX_BATCH_SIZE;
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	mutex_lock(&xs->mutex);

	/* Since we dropped the RCU read lock, the socket state might have changed. */
	if (unlikely(!xsk_is_bound(xs))) {
		err = -ENXIO;
		goto out;
	}

	if (xs->queue_id >= xs->dev->real_num_tx_queues)
		goto out;

	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (xsk_cq_reserve_addr_locked(xs, desc.addr))
			goto out;

		skb = xsk_build_skb(xs, &desc);
		if (IS_ERR(skb)) {
			err = PTR_ERR(skb);
			if (err != -EOVERFLOW)
				goto out;
			err = 0;
			continue;
		}

		xskq_cons_release(xs->tx);

		if (xp_mb_desc(&desc)) {
			xs->skb = skb;
			continue;
		}

		err = __dev_direct_xmit(skb, xs->queue_id);
		if (err == NETDEV_TX_BUSY) {
			/* Tell user-space to retry the send */
			xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
			xsk_consume_skb(skb);
			err = -EAGAIN;
			goto out;
		}

		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP) {
			/* SKB completed but not sent */
			err = -EBUSY;
			xs->skb = NULL;
			goto out;
		}

		sent_frame = true;
		xs->skb = NULL;
	}

	if (xskq_has_descs(xs->tx)) {
		if (xs->skb)
			xsk_drop_skb(xs->skb);
		xskq_cons_release(xs->tx);
	}

out:
	if (sent_frame)
		if (xsk_tx_writeable(xs))
			sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int xsk_generic_xmit(struct sock *sk)
{
	int ret;

	/* Drop the RCU lock since the SKB path might sleep. */
	rcu_read_unlock();
	ret = __xsk_generic_xmit(sk);
	/* Reacquire RCU lock before going into common code. */
	rcu_read_lock();

	return ret;
}

static bool xsk_no_wakeup(struct sock *sk)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
	/* Prefer busy-polling, skip the wakeup. */
	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
		READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
#else
	return false;
#endif
}

static int xsk_check_common(struct xdp_sock *xs)
{
	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;

	return 0;
}

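/* AF_XDP sendmsg() is a pure kick: it carries no data itself but
 * either wakes the driver (zero-copy mode) or runs the generic copy
 * transmit path over the descriptors already posted to the Tx ring.
 */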
__xsk_sendmsg(struct socket * sock,struct msghdr * m,size_t total_len)90518b1ab7aSMagnus Karlsson static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
90635fcde7fSMagnus Karlsson {
907ac98d8aaSMagnus Karlsson 	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
90835fcde7fSMagnus Karlsson 	struct sock *sk = sock->sk;
90935fcde7fSMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(sk);
910e3920818SBjörn Töpel 	struct xsk_buff_pool *pool;
9111596dae2SMaciej Fijalkowski 	int err;
91235fcde7fSMagnus Karlsson 
9131596dae2SMaciej Fijalkowski 	err = xsk_check_common(xs);
9141596dae2SMaciej Fijalkowski 	if (err)
9151596dae2SMaciej Fijalkowski 		return err;
916df551058SMagnus Karlsson 	if (unlikely(need_wait))
917ac98d8aaSMagnus Karlsson 		return -EOPNOTSUPP;
9181596dae2SMaciej Fijalkowski 	if (unlikely(!xs->tx))
9191596dae2SMaciej Fijalkowski 		return -ENOBUFS;
92035fcde7fSMagnus Karlsson 
921ca2e1a62SMaciej Fijalkowski 	if (sk_can_busy_loop(sk)) {
922ca2e1a62SMaciej Fijalkowski 		if (xs->zc)
923ca2e1a62SMaciej Fijalkowski 			__sk_mark_napi_id_once(sk, xsk_pool_get_napi_id(xs->pool));
924a0731952SBjörn Töpel 		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
925ca2e1a62SMaciej Fijalkowski 	}
926a0731952SBjörn Töpel 
9278de8b71bSMaciej Fijalkowski 	if (xs->zc && xsk_no_wakeup(sk))
928a0731952SBjörn Töpel 		return 0;
929a0731952SBjörn Töpel 
930e3920818SBjörn Töpel 	pool = xs->pool;
9311596dae2SMaciej Fijalkowski 	if (pool->cached_need_wakeup & XDP_WAKEUP_TX) {
9321596dae2SMaciej Fijalkowski 		if (xs->zc)
9331596dae2SMaciej Fijalkowski 			return xsk_wakeup(xs, XDP_WAKEUP_TX);
9341596dae2SMaciej Fijalkowski 		return xsk_generic_xmit(sk);
9351596dae2SMaciej Fijalkowski 	}
936e3920818SBjörn Töpel 	return 0;
93735fcde7fSMagnus Karlsson }
93835fcde7fSMagnus Karlsson 
xsk_sendmsg(struct socket * sock,struct msghdr * m,size_t total_len)93918b1ab7aSMagnus Karlsson static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
94018b1ab7aSMagnus Karlsson {
94118b1ab7aSMagnus Karlsson 	int ret;
94218b1ab7aSMagnus Karlsson 
94318b1ab7aSMagnus Karlsson 	rcu_read_lock();
94418b1ab7aSMagnus Karlsson 	ret = __xsk_sendmsg(sock, m, total_len);
94518b1ab7aSMagnus Karlsson 	rcu_read_unlock();
94618b1ab7aSMagnus Karlsson 
94718b1ab7aSMagnus Karlsson 	return ret;
94818b1ab7aSMagnus Karlsson }
94918b1ab7aSMagnus Karlsson 
__xsk_recvmsg(struct socket * sock,struct msghdr * m,size_t len,int flags)95018b1ab7aSMagnus Karlsson static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
95145a86681SBjörn Töpel {
95245a86681SBjörn Töpel 	bool need_wait = !(flags & MSG_DONTWAIT);
95345a86681SBjörn Töpel 	struct sock *sk = sock->sk;
95445a86681SBjörn Töpel 	struct xdp_sock *xs = xdp_sk(sk);
9551596dae2SMaciej Fijalkowski 	int err;
95645a86681SBjörn Töpel 
9571596dae2SMaciej Fijalkowski 	err = xsk_check_common(xs);
9581596dae2SMaciej Fijalkowski 	if (err)
9591596dae2SMaciej Fijalkowski 		return err;
96045a86681SBjörn Töpel 	if (unlikely(!xs->rx))
96145a86681SBjörn Töpel 		return -ENOBUFS;
96245a86681SBjörn Töpel 	if (unlikely(need_wait))
96345a86681SBjörn Töpel 		return -EOPNOTSUPP;
96445a86681SBjörn Töpel 
965a0731952SBjörn Töpel 	if (sk_can_busy_loop(sk))
966a0731952SBjörn Töpel 		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
967a0731952SBjörn Töpel 
968a0731952SBjörn Töpel 	if (xsk_no_wakeup(sk))
969a0731952SBjörn Töpel 		return 0;
970a0731952SBjörn Töpel 
97145a86681SBjörn Töpel 	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
97245a86681SBjörn Töpel 		return xsk_wakeup(xs, XDP_WAKEUP_RX);
97345a86681SBjörn Töpel 	return 0;
974c497176cSBjörn Töpel }
975c497176cSBjörn Töpel 
xsk_recvmsg(struct socket * sock,struct msghdr * m,size_t len,int flags)97618b1ab7aSMagnus Karlsson static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
97718b1ab7aSMagnus Karlsson {
97818b1ab7aSMagnus Karlsson 	int ret;
97918b1ab7aSMagnus Karlsson 
98018b1ab7aSMagnus Karlsson 	rcu_read_lock();
98118b1ab7aSMagnus Karlsson 	ret = __xsk_recvmsg(sock, m, len, flags);
98218b1ab7aSMagnus Karlsson 	rcu_read_unlock();
98318b1ab7aSMagnus Karlsson 
98418b1ab7aSMagnus Karlsson 	return ret;
98518b1ab7aSMagnus Karlsson }
98618b1ab7aSMagnus Karlsson 
xsk_poll(struct file * file,struct socket * sock,struct poll_table_struct * wait)9875d946c5aSLuc Van Oostenryck static __poll_t xsk_poll(struct file *file, struct socket *sock,
988a11e1d43SLinus Torvalds 			     struct poll_table_struct *wait)
989c497176cSBjörn Töpel {
990f5da5418SXuan Zhuo 	__poll_t mask = 0;
991df551058SMagnus Karlsson 	struct sock *sk = sock->sk;
992df551058SMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(sk);
993c2d3d6a4SMagnus Karlsson 	struct xsk_buff_pool *pool;
99442fddcc7SBjörn Töpel 
9950706a78fSMagnus Karlsson 	sock_poll_wait(file, sock, wait);
9960706a78fSMagnus Karlsson 
99718b1ab7aSMagnus Karlsson 	rcu_read_lock();
9981596dae2SMaciej Fijalkowski 	if (xsk_check_common(xs))
999e4d008d4SYewon Choi 		goto out;
100042fddcc7SBjörn Töpel 
1001c2d3d6a4SMagnus Karlsson 	pool = xs->pool;
100277cd0d7bSMagnus Karlsson 
1003c2d3d6a4SMagnus Karlsson 	if (pool->cached_need_wakeup) {
100406870682SMaxim Mikityanskiy 		if (xs->zc)
1005c2d3d6a4SMagnus Karlsson 			xsk_wakeup(xs, pool->cached_need_wakeup);
10061596dae2SMaciej Fijalkowski 		else if (xs->tx)
1007df551058SMagnus Karlsson 			/* Poll needs to drive Tx also in copy mode */
10081596dae2SMaciej Fijalkowski 			xsk_generic_xmit(sk);
1009df551058SMagnus Karlsson 	}
1010c497176cSBjörn Töpel 
101159e35e55SMagnus Karlsson 	if (xs->rx && !xskq_prod_is_empty(xs->rx))
10125d946c5aSLuc Van Oostenryck 		mask |= EPOLLIN | EPOLLRDNORM;
10133413f041SXuan Zhuo 	if (xs->tx && xsk_tx_writeable(xs))
10145d946c5aSLuc Van Oostenryck 		mask |= EPOLLOUT | EPOLLWRNORM;
1015e4d008d4SYewon Choi out:
101618b1ab7aSMagnus Karlsson 	rcu_read_unlock();
1017c497176cSBjörn Töpel 	return mask;
1018c497176cSBjörn Töpel }
1019c497176cSBjörn Töpel 
xsk_init_queue(u32 entries,struct xsk_queue ** queue,bool umem_queue)1020b9b6b68eSBjörn Töpel static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
1021b9b6b68eSBjörn Töpel 			  bool umem_queue)
1022423f3832SMagnus Karlsson {
1023423f3832SMagnus Karlsson 	struct xsk_queue *q;
1024423f3832SMagnus Karlsson 
1025423f3832SMagnus Karlsson 	if (entries == 0 || *queue || !is_power_of_2(entries))
1026423f3832SMagnus Karlsson 		return -EINVAL;
1027423f3832SMagnus Karlsson 
1028b9b6b68eSBjörn Töpel 	q = xskq_create(entries, umem_queue);
1029423f3832SMagnus Karlsson 	if (!q)
1030423f3832SMagnus Karlsson 		return -ENOMEM;
1031423f3832SMagnus Karlsson 
103237b07693SBjörn Töpel 	/* Make sure queue is ready before it can be seen by others */
103337b07693SBjörn Töpel 	smp_wmb();
103494a99763SBjörn Töpel 	WRITE_ONCE(*queue, q);
1035423f3832SMagnus Karlsson 	return 0;
1036423f3832SMagnus Karlsson }
1037423f3832SMagnus Karlsson 
xsk_unbind_dev(struct xdp_sock * xs)1038455302d1SIlya Maximets static void xsk_unbind_dev(struct xdp_sock *xs)
1039455302d1SIlya Maximets {
1040455302d1SIlya Maximets 	struct net_device *dev = xs->dev;
1041455302d1SIlya Maximets 
104242fddcc7SBjörn Töpel 	if (xs->state != XSK_BOUND)
1043455302d1SIlya Maximets 		return;
104442fddcc7SBjörn Töpel 	WRITE_ONCE(xs->state, XSK_UNBOUND);
1045455302d1SIlya Maximets 
1046455302d1SIlya Maximets 	/* Wait for driver to stop using the xdp socket. */
1047a5aa8e52SMagnus Karlsson 	xp_del_xsk(xs->pool, xs);
1048455302d1SIlya Maximets 	synchronize_net();
1049455302d1SIlya Maximets 	dev_put(dev);
1050455302d1SIlya Maximets }
1051455302d1SIlya Maximets 
xsk_get_map_list_entry(struct xdp_sock * xs,struct xdp_sock __rcu *** map_entry)10520402acd6SBjörn Töpel static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
1053782347b6SToke Høiland-Jørgensen 					      struct xdp_sock __rcu ***map_entry)
10540402acd6SBjörn Töpel {
10550402acd6SBjörn Töpel 	struct xsk_map *map = NULL;
10560402acd6SBjörn Töpel 	struct xsk_map_node *node;
10570402acd6SBjörn Töpel 
10580402acd6SBjörn Töpel 	*map_entry = NULL;
10590402acd6SBjörn Töpel 
10600402acd6SBjörn Töpel 	spin_lock_bh(&xs->map_list_lock);
10610402acd6SBjörn Töpel 	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
10620402acd6SBjörn Töpel 					node);
10630402acd6SBjörn Töpel 	if (node) {
1064bb1b25caSZhu Yanjun 		bpf_map_inc(&node->map->map);
10650402acd6SBjörn Töpel 		map = node->map;
10660402acd6SBjörn Töpel 		*map_entry = node->map_entry;
10670402acd6SBjörn Töpel 	}
10680402acd6SBjörn Töpel 	spin_unlock_bh(&xs->map_list_lock);
10690402acd6SBjörn Töpel 	return map;
10700402acd6SBjörn Töpel }
10710402acd6SBjörn Töpel 
xsk_delete_from_maps(struct xdp_sock * xs)10720402acd6SBjörn Töpel static void xsk_delete_from_maps(struct xdp_sock *xs)
10730402acd6SBjörn Töpel {
10740402acd6SBjörn Töpel 	/* This function removes the current XDP socket from all the
10750402acd6SBjörn Töpel 	 * maps it resides in. We need to take extra care here, due to
10760402acd6SBjörn Töpel 	 * the two locks involved. Each map has a lock synchronizing
10770402acd6SBjörn Töpel 	 * updates to the entries, and each socket has a lock that
10780402acd6SBjörn Töpel 	 * synchronizes access to the list of maps (map_list). For
10790402acd6SBjörn Töpel 	 * deadlock avoidance the locks need to be taken in the order
10800402acd6SBjörn Töpel 	 * "map lock"->"socket map list lock". We start off by
10810402acd6SBjörn Töpel 	 * accessing the socket map list, and take a reference to the
10820402acd6SBjörn Töpel 	 * map to guarantee existence between the
10830402acd6SBjörn Töpel 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
10840402acd6SBjörn Töpel 	 * calls. Then we ask the map to remove the socket, which
10850402acd6SBjörn Töpel 	 * tries to remove the socket from the map. Note that there
10860402acd6SBjörn Töpel 	 * might be updates to the map between
10870402acd6SBjörn Töpel 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
10880402acd6SBjörn Töpel 	 */
1089782347b6SToke Høiland-Jørgensen 	struct xdp_sock __rcu **map_entry = NULL;
10900402acd6SBjörn Töpel 	struct xsk_map *map;
10910402acd6SBjörn Töpel 
10920402acd6SBjörn Töpel 	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
10930402acd6SBjörn Töpel 		xsk_map_try_sock_delete(map, xs, map_entry);
1094bb1b25caSZhu Yanjun 		bpf_map_put(&map->map);
10950402acd6SBjörn Töpel 	}
10960402acd6SBjörn Töpel }
10970402acd6SBjörn Töpel 
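/* A hedged pseudocode sketch (not this file's exact call chain) of why
 * the map reference is taken before the map lock is needed:
 *
 *	spin_lock_bh(&xs->map_list_lock);
 *	map = first entry of xs->map_list;	// bpf_map_inc() keeps it alive
 *	spin_unlock_bh(&xs->map_list_lock);
 *	xsk_map_try_sock_delete(map, xs, map_entry);
 *						// takes the map lock; the
 *						// map_list_lock is no longer
 *						// held, so the "map lock" ->
 *						// "map list lock" order is
 *						// never inverted
 *	bpf_map_put(&map->map);
 */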
1098c0c77d8fSBjörn Töpel static int xsk_release(struct socket *sock)
1099c0c77d8fSBjörn Töpel {
1100c0c77d8fSBjörn Töpel 	struct sock *sk = sock->sk;
1101965a9909SMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(sk);
1102c0c77d8fSBjörn Töpel 	struct net *net;
1103c0c77d8fSBjörn Töpel 
1104c0c77d8fSBjörn Töpel 	if (!sk)
1105c0c77d8fSBjörn Töpel 		return 0;
1106c0c77d8fSBjörn Töpel 
1107c0c77d8fSBjörn Töpel 	net = sock_net(sk);
1108c0c77d8fSBjörn Töpel 
1109cf24f5a5STirthendu Sarkar 	if (xs->skb)
1110cf24f5a5STirthendu Sarkar 		xsk_drop_skb(xs->skb);
1111cf24f5a5STirthendu Sarkar 
11121d0dc069SBjörn Töpel 	mutex_lock(&net->xdp.lock);
11131d0dc069SBjörn Töpel 	sk_del_node_init_rcu(sk);
11141d0dc069SBjörn Töpel 	mutex_unlock(&net->xdp.lock);
11151d0dc069SBjörn Töpel 
1116c0c77d8fSBjörn Töpel 	sock_prot_inuse_add(net, sk->sk_prot, -1);
1117c0c77d8fSBjörn Töpel 
11180402acd6SBjörn Töpel 	xsk_delete_from_maps(xs);
111942fddcc7SBjörn Töpel 	mutex_lock(&xs->mutex);
1120455302d1SIlya Maximets 	xsk_unbind_dev(xs);
112142fddcc7SBjörn Töpel 	mutex_unlock(&xs->mutex);
1122965a9909SMagnus Karlsson 
1123541d7fddSBjörn Töpel 	xskq_destroy(xs->rx);
1124541d7fddSBjörn Töpel 	xskq_destroy(xs->tx);
11257361f9c3SMagnus Karlsson 	xskq_destroy(xs->fq_tmp);
11267361f9c3SMagnus Karlsson 	xskq_destroy(xs->cq_tmp);
1127541d7fddSBjörn Töpel 
1128c0c77d8fSBjörn Töpel 	sock_orphan(sk);
1129c0c77d8fSBjörn Töpel 	sock->sk = NULL;
1130c0c77d8fSBjörn Töpel 
1131c0c77d8fSBjörn Töpel 	sock_put(sk);
1132c0c77d8fSBjörn Töpel 
1133c0c77d8fSBjörn Töpel 	return 0;
1134c0c77d8fSBjörn Töpel }
1135c0c77d8fSBjörn Töpel 
1136965a9909SMagnus Karlsson static struct socket *xsk_lookup_xsk_from_fd(int fd)
1137965a9909SMagnus Karlsson {
1138965a9909SMagnus Karlsson 	struct socket *sock;
1139965a9909SMagnus Karlsson 	int err;
1140965a9909SMagnus Karlsson 
1141965a9909SMagnus Karlsson 	sock = sockfd_lookup(fd, &err);
1142965a9909SMagnus Karlsson 	if (!sock)
1143965a9909SMagnus Karlsson 		return ERR_PTR(-ENOTSOCK);
1144965a9909SMagnus Karlsson 
1145965a9909SMagnus Karlsson 	if (sock->sk->sk_family != PF_XDP) {
1146965a9909SMagnus Karlsson 		sockfd_put(sock);
1147965a9909SMagnus Karlsson 		return ERR_PTR(-ENOPROTOOPT);
1148965a9909SMagnus Karlsson 	}
1149965a9909SMagnus Karlsson 
1150965a9909SMagnus Karlsson 	return sock;
1151965a9909SMagnus Karlsson }
1152965a9909SMagnus Karlsson 
11537361f9c3SMagnus Karlsson static bool xsk_validate_queues(struct xdp_sock *xs)
11547361f9c3SMagnus Karlsson {
11557361f9c3SMagnus Karlsson 	return xs->fq_tmp && xs->cq_tmp;
11567361f9c3SMagnus Karlsson }
11577361f9c3SMagnus Karlsson 
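/* xsk_validate_queues() means a socket binding with its own umem must
 * have created both a FILL and a COMPLETION ring first. A hedged
 * userspace sketch of the prerequisites checked by xsk_bind() below
 * (illustrative values; buf is an assumed page-aligned buffer, and
 * ring sizes must be powers of two per xsk_init_queue()):
 *
 *	int fd = socket(AF_XDP, SOCK_RAW, 0);
 *	struct xdp_umem_reg mr = {
 *		.addr = (__u64)(uintptr_t)buf,
 *		.len = 2048 * 4096,
 *		.chunk_size = 4096,
 *	};
 *	int entries = 2048;
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &entries, sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &entries, sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_RX_RING, &entries, sizeof(entries));
 */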
1158965a9909SMagnus Karlsson static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
1159965a9909SMagnus Karlsson {
1160965a9909SMagnus Karlsson 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
1161965a9909SMagnus Karlsson 	struct sock *sk = sock->sk;
1162965a9909SMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(sk);
1163959b71dbSBjörn Töpel 	struct net_device *dev;
1164f7306aceSIlya Maximets 	int bound_dev_if;
1165173d3adbSBjörn Töpel 	u32 flags, qid;
1166965a9909SMagnus Karlsson 	int err = 0;
1167965a9909SMagnus Karlsson 
1168965a9909SMagnus Karlsson 	if (addr_len < sizeof(struct sockaddr_xdp))
1169965a9909SMagnus Karlsson 		return -EINVAL;
1170965a9909SMagnus Karlsson 	if (sxdp->sxdp_family != AF_XDP)
1171965a9909SMagnus Karlsson 		return -EINVAL;
1172965a9909SMagnus Karlsson 
1173f54ba391SBjörn Töpel 	flags = sxdp->sxdp_flags;
117477cd0d7bSMagnus Karlsson 	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
117581470b5cSTirthendu Sarkar 		      XDP_USE_NEED_WAKEUP | XDP_USE_SG))
1176f54ba391SBjörn Töpel 		return -EINVAL;
1177f54ba391SBjörn Töpel 
1178f7306aceSIlya Maximets 	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
1179f7306aceSIlya Maximets 	if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
1180f7306aceSIlya Maximets 		return -EINVAL;
1181f7306aceSIlya Maximets 
11825464c3a0SIlya Maximets 	rtnl_lock();
1183965a9909SMagnus Karlsson 	mutex_lock(&xs->mutex);
1184455302d1SIlya Maximets 	if (xs->state != XSK_READY) {
1185959b71dbSBjörn Töpel 		err = -EBUSY;
1186959b71dbSBjörn Töpel 		goto out_release;
1187959b71dbSBjörn Töpel 	}
1188959b71dbSBjörn Töpel 
1189965a9909SMagnus Karlsson 	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
1190965a9909SMagnus Karlsson 	if (!dev) {
1191965a9909SMagnus Karlsson 		err = -ENODEV;
1192965a9909SMagnus Karlsson 		goto out_release;
1193965a9909SMagnus Karlsson 	}
1194965a9909SMagnus Karlsson 
1195f6145903SMagnus Karlsson 	if (!xs->rx && !xs->tx) {
1196965a9909SMagnus Karlsson 		err = -EINVAL;
1197965a9909SMagnus Karlsson 		goto out_unlock;
1198965a9909SMagnus Karlsson 	}
1199965a9909SMagnus Karlsson 
1200173d3adbSBjörn Töpel 	qid = sxdp->sxdp_queue_id;
1201173d3adbSBjörn Töpel 
1202173d3adbSBjörn Töpel 	if (flags & XDP_SHARED_UMEM) {
1203965a9909SMagnus Karlsson 		struct xdp_sock *umem_xs;
1204965a9909SMagnus Karlsson 		struct socket *sock;
1205965a9909SMagnus Karlsson 
120677cd0d7bSMagnus Karlsson 		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
120781470b5cSTirthendu Sarkar 		    (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) {
1208173d3adbSBjörn Töpel 			/* Cannot specify flags for shared sockets. */
1209173d3adbSBjörn Töpel 			err = -EINVAL;
1210173d3adbSBjörn Töpel 			goto out_unlock;
1211173d3adbSBjörn Töpel 		}
1212173d3adbSBjörn Töpel 
1213965a9909SMagnus Karlsson 		if (xs->umem) {
1214965a9909SMagnus Karlsson 			/* We already have our own umem. */
1215965a9909SMagnus Karlsson 			err = -EINVAL;
1216965a9909SMagnus Karlsson 			goto out_unlock;
1217965a9909SMagnus Karlsson 		}
1218965a9909SMagnus Karlsson 
1219965a9909SMagnus Karlsson 		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
1220965a9909SMagnus Karlsson 		if (IS_ERR(sock)) {
1221965a9909SMagnus Karlsson 			err = PTR_ERR(sock);
1222965a9909SMagnus Karlsson 			goto out_unlock;
1223965a9909SMagnus Karlsson 		}
1224965a9909SMagnus Karlsson 
1225965a9909SMagnus Karlsson 		umem_xs = xdp_sk(sock->sk);
122642fddcc7SBjörn Töpel 		if (!xsk_is_bound(umem_xs)) {
1227965a9909SMagnus Karlsson 			err = -EBADF;
1228965a9909SMagnus Karlsson 			sockfd_put(sock);
1229965a9909SMagnus Karlsson 			goto out_unlock;
123042fddcc7SBjörn Töpel 		}
1231965a9909SMagnus Karlsson 
1232a1132430SMagnus Karlsson 		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
1233a1132430SMagnus Karlsson 			/* Share the umem with another socket on another qid
1234a1132430SMagnus Karlsson 			 * and/or device.
1235a1132430SMagnus Karlsson 			 */
1236b5aea28dSMagnus Karlsson 			xs->pool = xp_create_and_assign_umem(xs,
1237b5aea28dSMagnus Karlsson 							     umem_xs->umem);
1238b5aea28dSMagnus Karlsson 			if (!xs->pool) {
12391fd17c8cSMagnus Karlsson 				err = -ENOMEM;
1240b5aea28dSMagnus Karlsson 				sockfd_put(sock);
1241b5aea28dSMagnus Karlsson 				goto out_unlock;
1242b5aea28dSMagnus Karlsson 			}
1243b5aea28dSMagnus Karlsson 
124460240bc2SJalal Mostafa 			err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
124560240bc2SJalal Mostafa 						   qid);
1246b5aea28dSMagnus Karlsson 			if (err) {
1247b5aea28dSMagnus Karlsson 				xp_destroy(xs->pool);
124883cf5c68SMagnus Karlsson 				xs->pool = NULL;
1249b5aea28dSMagnus Karlsson 				sockfd_put(sock);
1250b5aea28dSMagnus Karlsson 				goto out_unlock;
1251b5aea28dSMagnus Karlsson 			}
1252b5aea28dSMagnus Karlsson 		} else {
12531c1efc2aSMagnus Karlsson 			/* Share the buffer pool with the other socket. */
1254b5aea28dSMagnus Karlsson 			if (xs->fq_tmp || xs->cq_tmp) {
1255b5aea28dSMagnus Karlsson 				/* Do not allow setting your own fq or cq. */
1256b5aea28dSMagnus Karlsson 				err = -EINVAL;
1257b5aea28dSMagnus Karlsson 				sockfd_put(sock);
1258b5aea28dSMagnus Karlsson 				goto out_unlock;
1259b5aea28dSMagnus Karlsson 			}
1260b5aea28dSMagnus Karlsson 
12611c1efc2aSMagnus Karlsson 			xp_get_pool(umem_xs->pool);
12621c1efc2aSMagnus Karlsson 			xs->pool = umem_xs->pool;
1263ba3beec2SMaciej Fijalkowski 
1264ba3beec2SMaciej Fijalkowski 			/* If the underlying shared umem was created without a
1265ba3beec2SMaciej Fijalkowski 			 * Tx ring, allocate the Tx descs array that the Tx
1266ba3beec2SMaciej Fijalkowski 			 * batching API utilizes.
1267ba3beec2SMaciej Fijalkowski 			 */
1268ba3beec2SMaciej Fijalkowski 			if (xs->tx && !xs->pool->tx_descs) {
1269ba3beec2SMaciej Fijalkowski 				err = xp_alloc_tx_descs(xs->pool, xs);
1270ba3beec2SMaciej Fijalkowski 				if (err) {
1271ba3beec2SMaciej Fijalkowski 					xp_put_pool(xs->pool);
127285c2c79aSMagnus Karlsson 					xs->pool = NULL;
1273ba3beec2SMaciej Fijalkowski 					sockfd_put(sock);
1274ba3beec2SMaciej Fijalkowski 					goto out_unlock;
1275ba3beec2SMaciej Fijalkowski 				}
1276ba3beec2SMaciej Fijalkowski 			}
1277b5aea28dSMagnus Karlsson 		}
1278b5aea28dSMagnus Karlsson 
1279965a9909SMagnus Karlsson 		xdp_get_umem(umem_xs->umem);
12809764f4b3SBjörn Töpel 		WRITE_ONCE(xs->umem, umem_xs->umem);
1281965a9909SMagnus Karlsson 		sockfd_put(sock);
12827361f9c3SMagnus Karlsson 	} else if (!xs->umem || !xsk_validate_queues(xs)) {
1283965a9909SMagnus Karlsson 		err = -EINVAL;
1284965a9909SMagnus Karlsson 		goto out_unlock;
1285c497176cSBjörn Töpel 	} else {
1286c497176cSBjörn Töpel 		/* This xsk has its own umem. */
12871c1efc2aSMagnus Karlsson 		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
12881c1efc2aSMagnus Karlsson 		if (!xs->pool) {
12891c1efc2aSMagnus Karlsson 			err = -ENOMEM;
1290173d3adbSBjörn Töpel 			goto out_unlock;
1291965a9909SMagnus Karlsson 		}
1292965a9909SMagnus Karlsson 
12931c1efc2aSMagnus Karlsson 		err = xp_assign_dev(xs->pool, dev, qid, flags);
12941c1efc2aSMagnus Karlsson 		if (err) {
12951c1efc2aSMagnus Karlsson 			xp_destroy(xs->pool);
12961c1efc2aSMagnus Karlsson 			xs->pool = NULL;
12971c1efc2aSMagnus Karlsson 			goto out_unlock;
12981c1efc2aSMagnus Karlsson 		}
12991c1efc2aSMagnus Karlsson 	}
13001c1efc2aSMagnus Karlsson 
13018bee6833SMagnus Karlsson 	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
13028bee6833SMagnus Karlsson 	xs->fq_tmp = NULL;
13038bee6833SMagnus Karlsson 	xs->cq_tmp = NULL;
13048bee6833SMagnus Karlsson 
1305965a9909SMagnus Karlsson 	xs->dev = dev;
1306ac98d8aaSMagnus Karlsson 	xs->zc = xs->umem->zc;
1307d609f3d2STirthendu Sarkar 	xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
1308ac98d8aaSMagnus Karlsson 	xs->queue_id = qid;
1309a5aa8e52SMagnus Karlsson 	xp_add_xsk(xs->pool, xs);
1310965a9909SMagnus Karlsson 
1311965a9909SMagnus Karlsson out_unlock:
131242fddcc7SBjörn Töpel 	if (err) {
1313965a9909SMagnus Karlsson 		dev_put(dev);
131442fddcc7SBjörn Töpel 	} else {
131542fddcc7SBjörn Töpel 		/* Matches smp_rmb() in bind() for shared umem
131642fddcc7SBjörn Töpel 		 * sockets, and xsk_is_bound().
131742fddcc7SBjörn Töpel 		 */
131842fddcc7SBjörn Töpel 		smp_wmb();
131942fddcc7SBjörn Töpel 		WRITE_ONCE(xs->state, XSK_BOUND);
132042fddcc7SBjörn Töpel 	}
1321965a9909SMagnus Karlsson out_release:
1322965a9909SMagnus Karlsson 	mutex_unlock(&xs->mutex);
13235464c3a0SIlya Maximets 	rtnl_unlock();
1324965a9909SMagnus Karlsson 	return err;
1325965a9909SMagnus Karlsson }
1326965a9909SMagnus Karlsson 
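/* A hedged sketch of the XDP_SHARED_UMEM path above from userspace:
 * the second socket names the first socket's fd and may set no other
 * bind flags (see the -EINVAL check in xsk_bind()). Illustrative only:
 *
 *	struct sockaddr_xdp sxdp = {
 *		.sxdp_family = AF_XDP,
 *		.sxdp_ifindex = ifindex,
 *		.sxdp_queue_id = queue_id,
 *		.sxdp_flags = XDP_SHARED_UMEM,
 *		.sxdp_shared_umem_fd = first_xsk_fd,
 *	};
 *	bind(second_xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 */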
1327c05cd364SKevin Laatz struct xdp_umem_reg_v1 {
1328c05cd364SKevin Laatz 	__u64 addr; /* Start of packet data area */
1329c05cd364SKevin Laatz 	__u64 len; /* Length of packet data area */
1330c05cd364SKevin Laatz 	__u32 chunk_size;
1331c05cd364SKevin Laatz 	__u32 headroom;
1332c05cd364SKevin Laatz };
1333c05cd364SKevin Laatz 
1334341ac980SStanislav Fomichev struct xdp_umem_reg_v2 {
1335341ac980SStanislav Fomichev 	__u64 addr; /* Start of packet data area */
1336341ac980SStanislav Fomichev 	__u64 len; /* Length of packet data area */
1337341ac980SStanislav Fomichev 	__u32 chunk_size;
1338341ac980SStanislav Fomichev 	__u32 headroom;
1339341ac980SStanislav Fomichev 	__u32 flags;
1340341ac980SStanislav Fomichev };
1341341ac980SStanislav Fomichev 
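/* The _v1/_v2 copies above freeze earlier layouts of the UAPI
 * struct xdp_umem_reg so that the XDP_UMEM_REG case below can accept
 * optlen values from older binaries: optlen selects the largest known
 * layout that fits, and the zero-initialized mr supplies defaults for
 * any newer fields the caller did not pass. The size selection,
 * restated:
 *
 *	optlen <  sizeof(v1)  -> -EINVAL
 *	optlen <  sizeof(v2)  -> copy sizeof(v1) bytes
 *	optlen <  sizeof(mr)  -> copy sizeof(v2) bytes
 *	otherwise             -> copy sizeof(mr) bytes
 */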
1342c0c77d8fSBjörn Töpel static int xsk_setsockopt(struct socket *sock, int level, int optname,
1343a7b75c5aSChristoph Hellwig 			  sockptr_t optval, unsigned int optlen)
1344c0c77d8fSBjörn Töpel {
1345c0c77d8fSBjörn Töpel 	struct sock *sk = sock->sk;
1346c0c77d8fSBjörn Töpel 	struct xdp_sock *xs = xdp_sk(sk);
1347c0c77d8fSBjörn Töpel 	int err;
1348c0c77d8fSBjörn Töpel 
1349c0c77d8fSBjörn Töpel 	if (level != SOL_XDP)
1350c0c77d8fSBjörn Töpel 		return -ENOPROTOOPT;
1351c0c77d8fSBjörn Töpel 
1352c0c77d8fSBjörn Töpel 	switch (optname) {
1353b9b6b68eSBjörn Töpel 	case XDP_RX_RING:
1354f6145903SMagnus Karlsson 	case XDP_TX_RING:
1355b9b6b68eSBjörn Töpel 	{
1356b9b6b68eSBjörn Töpel 		struct xsk_queue **q;
1357b9b6b68eSBjörn Töpel 		int entries;
1358b9b6b68eSBjörn Töpel 
1359b9b6b68eSBjörn Töpel 		if (optlen < sizeof(entries))
1360b9b6b68eSBjörn Töpel 			return -EINVAL;
1361a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1362b9b6b68eSBjörn Töpel 			return -EFAULT;
1363b9b6b68eSBjörn Töpel 
1364b9b6b68eSBjörn Töpel 		mutex_lock(&xs->mutex);
1365455302d1SIlya Maximets 		if (xs->state != XSK_READY) {
1366455302d1SIlya Maximets 			mutex_unlock(&xs->mutex);
1367455302d1SIlya Maximets 			return -EBUSY;
1368455302d1SIlya Maximets 		}
1369f6145903SMagnus Karlsson 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
1370b9b6b68eSBjörn Töpel 		err = xsk_init_queue(entries, q, false);
137177cd0d7bSMagnus Karlsson 		if (!err && optname == XDP_TX_RING)
137277cd0d7bSMagnus Karlsson 			/* Tx needs to be explicitly woken up the first time */
137377cd0d7bSMagnus Karlsson 			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
1374b9b6b68eSBjörn Töpel 		mutex_unlock(&xs->mutex);
1375b9b6b68eSBjörn Töpel 		return err;
1376b9b6b68eSBjörn Töpel 	}
1377c0c77d8fSBjörn Töpel 	case XDP_UMEM_REG:
1378c0c77d8fSBjörn Töpel 	{
1379c05cd364SKevin Laatz 		size_t mr_size = sizeof(struct xdp_umem_reg);
1380c05cd364SKevin Laatz 		struct xdp_umem_reg mr = {};
1381c0c77d8fSBjörn Töpel 		struct xdp_umem *umem;
1382c0c77d8fSBjörn Töpel 
1383c05cd364SKevin Laatz 		if (optlen < sizeof(struct xdp_umem_reg_v1))
1384c05cd364SKevin Laatz 			return -EINVAL;
1385341ac980SStanislav Fomichev 		else if (optlen < sizeof(struct xdp_umem_reg_v2))
1386c05cd364SKevin Laatz 			mr_size = sizeof(struct xdp_umem_reg_v1);
1387341ac980SStanislav Fomichev 		else if (optlen < sizeof(mr))
1388341ac980SStanislav Fomichev 			mr_size = sizeof(struct xdp_umem_reg_v2);
1389c05cd364SKevin Laatz 
1390a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&mr, optval, mr_size))
1391c0c77d8fSBjörn Töpel 			return -EFAULT;
1392c0c77d8fSBjörn Töpel 
1393c0c77d8fSBjörn Töpel 		mutex_lock(&xs->mutex);
1394455302d1SIlya Maximets 		if (xs->state != XSK_READY || xs->umem) {
1395c0c77d8fSBjörn Töpel 			mutex_unlock(&xs->mutex);
1396a49049eaSBjörn Töpel 			return -EBUSY;
1397a49049eaSBjörn Töpel 		}
1398a49049eaSBjörn Töpel 
1399a49049eaSBjörn Töpel 		umem = xdp_umem_create(&mr);
1400a49049eaSBjörn Töpel 		if (IS_ERR(umem)) {
1401a49049eaSBjörn Töpel 			mutex_unlock(&xs->mutex);
1402a49049eaSBjörn Töpel 			return PTR_ERR(umem);
1403c0c77d8fSBjörn Töpel 		}
1404c0c77d8fSBjörn Töpel 
1405c0c77d8fSBjörn Töpel 		/* Make sure umem is ready before it can be seen by others */
1406c0c77d8fSBjörn Töpel 		smp_wmb();
14079764f4b3SBjörn Töpel 		WRITE_ONCE(xs->umem, umem);
1408c0c77d8fSBjörn Töpel 		mutex_unlock(&xs->mutex);
1409c0c77d8fSBjörn Töpel 		return 0;
1410c0c77d8fSBjörn Töpel 	}
1411423f3832SMagnus Karlsson 	case XDP_UMEM_FILL_RING:
1412fe230832SMagnus Karlsson 	case XDP_UMEM_COMPLETION_RING:
1413423f3832SMagnus Karlsson 	{
1414423f3832SMagnus Karlsson 		struct xsk_queue **q;
1415423f3832SMagnus Karlsson 		int entries;
1416423f3832SMagnus Karlsson 
1417237f3cf1SEric Dumazet 		if (optlen < sizeof(entries))
1418237f3cf1SEric Dumazet 			return -EINVAL;
1419a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1420423f3832SMagnus Karlsson 			return -EFAULT;
1421423f3832SMagnus Karlsson 
1422423f3832SMagnus Karlsson 		mutex_lock(&xs->mutex);
1423455302d1SIlya Maximets 		if (xs->state != XSK_READY) {
1424455302d1SIlya Maximets 			mutex_unlock(&xs->mutex);
1425455302d1SIlya Maximets 			return -EBUSY;
1426455302d1SIlya Maximets 		}
1427a49049eaSBjörn Töpel 
14287361f9c3SMagnus Karlsson 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
14297361f9c3SMagnus Karlsson 			&xs->cq_tmp;
1430b9b6b68eSBjörn Töpel 		err = xsk_init_queue(entries, q, true);
1431423f3832SMagnus Karlsson 		mutex_unlock(&xs->mutex);
1432423f3832SMagnus Karlsson 		return err;
1433423f3832SMagnus Karlsson 	}
1434c0c77d8fSBjörn Töpel 	default:
1435c0c77d8fSBjörn Töpel 		break;
1436c0c77d8fSBjörn Töpel 	}
1437c0c77d8fSBjörn Töpel 
1438c0c77d8fSBjörn Töpel 	return -ENOPROTOOPT;
1439c0c77d8fSBjörn Töpel }
1440c0c77d8fSBjörn Töpel 
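/* Because the XDP_TX_RING case above creates the Tx ring with
 * XDP_RING_NEED_WAKEUP already set, the first transmit must kick the
 * kernel explicitly. A hedged userspace sketch, where tx_flags is
 * assumed to point at the mmap'ed Tx ring's flags word:
 *
 *	if (*tx_flags & XDP_RING_NEED_WAKEUP)
 *		sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
 */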
144177cd0d7bSMagnus Karlsson static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
144277cd0d7bSMagnus Karlsson {
144377cd0d7bSMagnus Karlsson 	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
144477cd0d7bSMagnus Karlsson 	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
144577cd0d7bSMagnus Karlsson 	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
144677cd0d7bSMagnus Karlsson }
144777cd0d7bSMagnus Karlsson 
144877cd0d7bSMagnus Karlsson static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
144977cd0d7bSMagnus Karlsson {
145077cd0d7bSMagnus Karlsson 	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
145177cd0d7bSMagnus Karlsson 	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
145277cd0d7bSMagnus Karlsson 	ring->desc = offsetof(struct xdp_umem_ring, desc);
145377cd0d7bSMagnus Karlsson }
145477cd0d7bSMagnus Karlsson 
14558aa5a335SCiara Loftus struct xdp_statistics_v1 {
14568aa5a335SCiara Loftus 	__u64 rx_dropped;
14578aa5a335SCiara Loftus 	__u64 rx_invalid_descs;
14588aa5a335SCiara Loftus 	__u64 tx_invalid_descs;
14598aa5a335SCiara Loftus };
14608aa5a335SCiara Loftus 
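/* xdp_statistics_v1 above freezes the pre-extension layout so that
 * xsk_getsockopt() below can serve old binaries that pass a shorter
 * optlen; for them, rx_queue_full is folded into rx_dropped instead
 * of being reported separately. A hedged sketch of reading the full
 * statistics from userspace:
 *
 *	struct xdp_statistics stats;
 *	socklen_t optlen = sizeof(stats);
 *	getsockopt(xsk_fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen);
 *	// optlen now holds the size the kernel actually filled in
 */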
1461af75d9e0SMagnus Karlsson static int xsk_getsockopt(struct socket *sock, int level, int optname,
1462af75d9e0SMagnus Karlsson 			  char __user *optval, int __user *optlen)
1463af75d9e0SMagnus Karlsson {
1464af75d9e0SMagnus Karlsson 	struct sock *sk = sock->sk;
1465af75d9e0SMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(sk);
1466af75d9e0SMagnus Karlsson 	int len;
1467af75d9e0SMagnus Karlsson 
1468af75d9e0SMagnus Karlsson 	if (level != SOL_XDP)
1469af75d9e0SMagnus Karlsson 		return -ENOPROTOOPT;
1470af75d9e0SMagnus Karlsson 
1471af75d9e0SMagnus Karlsson 	if (get_user(len, optlen))
1472af75d9e0SMagnus Karlsson 		return -EFAULT;
1473af75d9e0SMagnus Karlsson 	if (len < 0)
1474af75d9e0SMagnus Karlsson 		return -EINVAL;
1475af75d9e0SMagnus Karlsson 
1476af75d9e0SMagnus Karlsson 	switch (optname) {
1477af75d9e0SMagnus Karlsson 	case XDP_STATISTICS:
1478af75d9e0SMagnus Karlsson 	{
14793c4f850eSPeilin Ye 		struct xdp_statistics stats = {};
14808aa5a335SCiara Loftus 		bool extra_stats = true;
14818aa5a335SCiara Loftus 		size_t stats_size;
1482af75d9e0SMagnus Karlsson 
14838aa5a335SCiara Loftus 		if (len < sizeof(struct xdp_statistics_v1)) {
1484af75d9e0SMagnus Karlsson 			return -EINVAL;
14858aa5a335SCiara Loftus 		} else if (len < sizeof(stats)) {
14868aa5a335SCiara Loftus 			extra_stats = false;
14878aa5a335SCiara Loftus 			stats_size = sizeof(struct xdp_statistics_v1);
14888aa5a335SCiara Loftus 		} else {
14898aa5a335SCiara Loftus 			stats_size = sizeof(stats);
14908aa5a335SCiara Loftus 		}
1491af75d9e0SMagnus Karlsson 
1492af75d9e0SMagnus Karlsson 		mutex_lock(&xs->mutex);
1493af75d9e0SMagnus Karlsson 		stats.rx_dropped = xs->rx_dropped;
14948aa5a335SCiara Loftus 		if (extra_stats) {
14958aa5a335SCiara Loftus 			stats.rx_ring_full = xs->rx_queue_full;
14968aa5a335SCiara Loftus 			stats.rx_fill_ring_empty_descs =
14977361f9c3SMagnus Karlsson 				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
14988aa5a335SCiara Loftus 			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
14998aa5a335SCiara Loftus 		} else {
15008aa5a335SCiara Loftus 			stats.rx_dropped += xs->rx_queue_full;
15018aa5a335SCiara Loftus 		}
1502af75d9e0SMagnus Karlsson 		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
1503af75d9e0SMagnus Karlsson 		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
1504af75d9e0SMagnus Karlsson 		mutex_unlock(&xs->mutex);
1505af75d9e0SMagnus Karlsson 
15068aa5a335SCiara Loftus 		if (copy_to_user(optval, &stats, stats_size))
1507af75d9e0SMagnus Karlsson 			return -EFAULT;
15088aa5a335SCiara Loftus 		if (put_user(stats_size, optlen))
1509af75d9e0SMagnus Karlsson 			return -EFAULT;
1510af75d9e0SMagnus Karlsson 
1511af75d9e0SMagnus Karlsson 		return 0;
1512af75d9e0SMagnus Karlsson 	}
1513b3a9e0beSBjörn Töpel 	case XDP_MMAP_OFFSETS:
1514b3a9e0beSBjörn Töpel 	{
1515b3a9e0beSBjörn Töpel 		struct xdp_mmap_offsets off;
151677cd0d7bSMagnus Karlsson 		struct xdp_mmap_offsets_v1 off_v1;
151777cd0d7bSMagnus Karlsson 		bool flags_supported = true;
151877cd0d7bSMagnus Karlsson 		void *to_copy;
1519b3a9e0beSBjörn Töpel 
152077cd0d7bSMagnus Karlsson 		if (len < sizeof(off_v1))
1521b3a9e0beSBjörn Töpel 			return -EINVAL;
152277cd0d7bSMagnus Karlsson 		else if (len < sizeof(off))
152377cd0d7bSMagnus Karlsson 			flags_supported = false;
1524b3a9e0beSBjörn Töpel 
152577cd0d7bSMagnus Karlsson 		if (flags_supported) {
152677cd0d7bSMagnus Karlsson 			/* xdp_ring_offset is identical to xdp_ring_offset_v1
152777cd0d7bSMagnus Karlsson 			 * except for the flags field added to the end.
152877cd0d7bSMagnus Karlsson 			 */
152977cd0d7bSMagnus Karlsson 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
153077cd0d7bSMagnus Karlsson 					       &off.rx);
153177cd0d7bSMagnus Karlsson 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
153277cd0d7bSMagnus Karlsson 					       &off.tx);
153377cd0d7bSMagnus Karlsson 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
153477cd0d7bSMagnus Karlsson 					       &off.fr);
153577cd0d7bSMagnus Karlsson 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
153677cd0d7bSMagnus Karlsson 					       &off.cr);
153777cd0d7bSMagnus Karlsson 			off.rx.flags = offsetof(struct xdp_rxtx_ring,
153877cd0d7bSMagnus Karlsson 						ptrs.flags);
153977cd0d7bSMagnus Karlsson 			off.tx.flags = offsetof(struct xdp_rxtx_ring,
154077cd0d7bSMagnus Karlsson 						ptrs.flags);
154177cd0d7bSMagnus Karlsson 			off.fr.flags = offsetof(struct xdp_umem_ring,
154277cd0d7bSMagnus Karlsson 						ptrs.flags);
154377cd0d7bSMagnus Karlsson 			off.cr.flags = offsetof(struct xdp_umem_ring,
154477cd0d7bSMagnus Karlsson 						ptrs.flags);
1545b3a9e0beSBjörn Töpel 
1546b3a9e0beSBjörn Töpel 			len = sizeof(off);
154777cd0d7bSMagnus Karlsson 			to_copy = &off;
154877cd0d7bSMagnus Karlsson 		} else {
154977cd0d7bSMagnus Karlsson 			xsk_enter_rxtx_offsets(&off_v1.rx);
155077cd0d7bSMagnus Karlsson 			xsk_enter_rxtx_offsets(&off_v1.tx);
155177cd0d7bSMagnus Karlsson 			xsk_enter_umem_offsets(&off_v1.fr);
155277cd0d7bSMagnus Karlsson 			xsk_enter_umem_offsets(&off_v1.cr);
155377cd0d7bSMagnus Karlsson 
155477cd0d7bSMagnus Karlsson 			len = sizeof(off_v1);
155577cd0d7bSMagnus Karlsson 			to_copy = &off_v1;
155677cd0d7bSMagnus Karlsson 		}
155777cd0d7bSMagnus Karlsson 
155877cd0d7bSMagnus Karlsson 		if (copy_to_user(optval, to_copy, len))
1559b3a9e0beSBjörn Töpel 			return -EFAULT;
1560b3a9e0beSBjörn Töpel 		if (put_user(len, optlen))
1561b3a9e0beSBjörn Töpel 			return -EFAULT;
1562b3a9e0beSBjörn Töpel 
1563b3a9e0beSBjörn Töpel 		return 0;
1564b3a9e0beSBjörn Töpel 	}
15652640d3c8SMaxim Mikityanskiy 	case XDP_OPTIONS:
15662640d3c8SMaxim Mikityanskiy 	{
15672640d3c8SMaxim Mikityanskiy 		struct xdp_options opts = {};
15682640d3c8SMaxim Mikityanskiy 
15692640d3c8SMaxim Mikityanskiy 		if (len < sizeof(opts))
15702640d3c8SMaxim Mikityanskiy 			return -EINVAL;
15712640d3c8SMaxim Mikityanskiy 
15722640d3c8SMaxim Mikityanskiy 		mutex_lock(&xs->mutex);
15732640d3c8SMaxim Mikityanskiy 		if (xs->zc)
15742640d3c8SMaxim Mikityanskiy 			opts.flags |= XDP_OPTIONS_ZEROCOPY;
15752640d3c8SMaxim Mikityanskiy 		mutex_unlock(&xs->mutex);
15762640d3c8SMaxim Mikityanskiy 
15772640d3c8SMaxim Mikityanskiy 		len = sizeof(opts);
15782640d3c8SMaxim Mikityanskiy 		if (copy_to_user(optval, &opts, len))
15792640d3c8SMaxim Mikityanskiy 			return -EFAULT;
15802640d3c8SMaxim Mikityanskiy 		if (put_user(len, optlen))
15812640d3c8SMaxim Mikityanskiy 			return -EFAULT;
15822640d3c8SMaxim Mikityanskiy 
15832640d3c8SMaxim Mikityanskiy 		return 0;
15842640d3c8SMaxim Mikityanskiy 	}
1585af75d9e0SMagnus Karlsson 	default:
1586af75d9e0SMagnus Karlsson 		break;
1587af75d9e0SMagnus Karlsson 	}
1588af75d9e0SMagnus Karlsson 
1589af75d9e0SMagnus Karlsson 	return -EOPNOTSUPP;
1590af75d9e0SMagnus Karlsson }
1591af75d9e0SMagnus Karlsson 
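/* A hedged sketch of how userspace combines XDP_MMAP_OFFSETS with
 * xsk_mmap() below: each ring is mapped at a fixed page offset, and
 * the returned offsets locate the producer/consumer indices and the
 * descriptor array inside the mapping (entries is the ring size
 * requested earlier; error handling omitted):
 *
 *	struct xdp_mmap_offsets off;
 *	socklen_t optlen = sizeof(off);
 *	getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
 *
 *	size_t len = off.rx.desc + entries * sizeof(struct xdp_desc);
 *	void *rx = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			MAP_SHARED | MAP_POPULATE, fd, XDP_PGOFF_RX_RING);
 *	__u32 *producer = (__u32 *)((char *)rx + off.rx.producer);
 *	__u32 *consumer = (__u32 *)((char *)rx + off.rx.consumer);
 *	struct xdp_desc *descs = (void *)((char *)rx + off.rx.desc);
 */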
1592423f3832SMagnus Karlsson static int xsk_mmap(struct file *file, struct socket *sock,
1593423f3832SMagnus Karlsson 		    struct vm_area_struct *vma)
1594423f3832SMagnus Karlsson {
1595a5a16e43SGeert Uytterhoeven 	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1596423f3832SMagnus Karlsson 	unsigned long size = vma->vm_end - vma->vm_start;
1597423f3832SMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(sock->sk);
15985f5a7d8dSNuno Gonçalves 	int state = READ_ONCE(xs->state);
1599423f3832SMagnus Karlsson 	struct xsk_queue *q = NULL;
1600423f3832SMagnus Karlsson 
16015f5a7d8dSNuno Gonçalves 	if (state != XSK_READY && state != XSK_BOUND)
1602455302d1SIlya Maximets 		return -EBUSY;
1603455302d1SIlya Maximets 
1604b9b6b68eSBjörn Töpel 	if (offset == XDP_PGOFF_RX_RING) {
160537b07693SBjörn Töpel 		q = READ_ONCE(xs->rx);
1606f6145903SMagnus Karlsson 	} else if (offset == XDP_PGOFF_TX_RING) {
160737b07693SBjörn Töpel 		q = READ_ONCE(xs->tx);
1608b9b6b68eSBjörn Töpel 	} else {
1609e6762c8bSMagnus Karlsson 		/* Matches the smp_wmb() in XDP_UMEM_REG */
1610e6762c8bSMagnus Karlsson 		smp_rmb();
1611423f3832SMagnus Karlsson 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
16125f5a7d8dSNuno Gonçalves 			q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
16135f5a7d8dSNuno Gonçalves 						 READ_ONCE(xs->pool->fq);
1614fe230832SMagnus Karlsson 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
16155f5a7d8dSNuno Gonçalves 			q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
16165f5a7d8dSNuno Gonçalves 						 READ_ONCE(xs->pool->cq);
1617b9b6b68eSBjörn Töpel 	}
1618423f3832SMagnus Karlsson 
1619423f3832SMagnus Karlsson 	if (!q)
1620423f3832SMagnus Karlsson 		return -EINVAL;
1621423f3832SMagnus Karlsson 
1622e6762c8bSMagnus Karlsson 	/* Matches the smp_wmb() in xsk_init_queue */
1623e6762c8bSMagnus Karlsson 	smp_rmb();
16249f78bf33SXuan Zhuo 	if (size > q->ring_vmalloc_size)
1625423f3832SMagnus Karlsson 		return -EINVAL;
1626423f3832SMagnus Karlsson 
16279f78bf33SXuan Zhuo 	return remap_vmalloc_range(vma, q->ring, 0);
1628423f3832SMagnus Karlsson }
1629423f3832SMagnus Karlsson 
1630455302d1SIlya Maximets static int xsk_notifier(struct notifier_block *this,
1631455302d1SIlya Maximets 			unsigned long msg, void *ptr)
1632455302d1SIlya Maximets {
1633455302d1SIlya Maximets 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1634455302d1SIlya Maximets 	struct net *net = dev_net(dev);
1635455302d1SIlya Maximets 	struct sock *sk;
1636455302d1SIlya Maximets 
1637455302d1SIlya Maximets 	switch (msg) {
1638455302d1SIlya Maximets 	case NETDEV_UNREGISTER:
1639455302d1SIlya Maximets 		mutex_lock(&net->xdp.lock);
1640455302d1SIlya Maximets 		sk_for_each(sk, &net->xdp.list) {
1641455302d1SIlya Maximets 			struct xdp_sock *xs = xdp_sk(sk);
1642455302d1SIlya Maximets 
1643455302d1SIlya Maximets 			mutex_lock(&xs->mutex);
1644455302d1SIlya Maximets 			if (xs->dev == dev) {
1645455302d1SIlya Maximets 				sk->sk_err = ENETDOWN;
1646455302d1SIlya Maximets 				if (!sock_flag(sk, SOCK_DEAD))
1647e3ae2365SAlexander Aring 					sk_error_report(sk);
1648455302d1SIlya Maximets 
1649455302d1SIlya Maximets 				xsk_unbind_dev(xs);
1650455302d1SIlya Maximets 
16511c1efc2aSMagnus Karlsson 				/* Clear device references. */
16521c1efc2aSMagnus Karlsson 				xp_clear_dev(xs->pool);
1653455302d1SIlya Maximets 			}
1654455302d1SIlya Maximets 			mutex_unlock(&xs->mutex);
1655455302d1SIlya Maximets 		}
1656455302d1SIlya Maximets 		mutex_unlock(&net->xdp.lock);
1657455302d1SIlya Maximets 		break;
1658455302d1SIlya Maximets 	}
1659455302d1SIlya Maximets 	return NOTIFY_DONE;
1660455302d1SIlya Maximets }
1661455302d1SIlya Maximets 
1662c0c77d8fSBjörn Töpel static struct proto xsk_proto = {
1663c0c77d8fSBjörn Töpel 	.name =		"XDP",
1664c0c77d8fSBjörn Töpel 	.owner =	THIS_MODULE,
1665c0c77d8fSBjörn Töpel 	.obj_size =	sizeof(struct xdp_sock),
1666c0c77d8fSBjörn Töpel };
1667c0c77d8fSBjörn Töpel 
1668c0c77d8fSBjörn Töpel static const struct proto_ops xsk_proto_ops = {
1669c0c77d8fSBjörn Töpel 	.family		= PF_XDP,
1670c0c77d8fSBjörn Töpel 	.owner		= THIS_MODULE,
1671c0c77d8fSBjörn Töpel 	.release	= xsk_release,
1672965a9909SMagnus Karlsson 	.bind		= xsk_bind,
1673c0c77d8fSBjörn Töpel 	.connect	= sock_no_connect,
1674c0c77d8fSBjörn Töpel 	.socketpair	= sock_no_socketpair,
1675c0c77d8fSBjörn Töpel 	.accept		= sock_no_accept,
1676c0c77d8fSBjörn Töpel 	.getname	= sock_no_getname,
1677a11e1d43SLinus Torvalds 	.poll		= xsk_poll,
1678c0c77d8fSBjörn Töpel 	.ioctl		= sock_no_ioctl,
1679c0c77d8fSBjörn Töpel 	.listen		= sock_no_listen,
1680c0c77d8fSBjörn Töpel 	.shutdown	= sock_no_shutdown,
1681c0c77d8fSBjörn Töpel 	.setsockopt	= xsk_setsockopt,
1682af75d9e0SMagnus Karlsson 	.getsockopt	= xsk_getsockopt,
168335fcde7fSMagnus Karlsson 	.sendmsg	= xsk_sendmsg,
168445a86681SBjörn Töpel 	.recvmsg	= xsk_recvmsg,
1685423f3832SMagnus Karlsson 	.mmap		= xsk_mmap,
1686c0c77d8fSBjörn Töpel };
1687c0c77d8fSBjörn Töpel 
168811fe9262SBjörn Töpel static void xsk_destruct(struct sock *sk)
168911fe9262SBjörn Töpel {
169011fe9262SBjörn Töpel 	struct xdp_sock *xs = xdp_sk(sk);
169111fe9262SBjörn Töpel 
169211fe9262SBjörn Töpel 	if (!sock_flag(sk, SOCK_DEAD))
169311fe9262SBjörn Töpel 		return;
169411fe9262SBjörn Töpel 
1695e5e1a4bcSMagnus Karlsson 	if (!xp_put_pool(xs->pool))
1696537cf4e3SMagnus Karlsson 		xdp_put_umem(xs->umem, !xs->pool);
169711fe9262SBjörn Töpel }
169811fe9262SBjörn Töpel 
1699c0c77d8fSBjörn Töpel static int xsk_create(struct net *net, struct socket *sock, int protocol,
1700c0c77d8fSBjörn Töpel 		      int kern)
1701c0c77d8fSBjörn Töpel {
1702c0c77d8fSBjörn Töpel 	struct xdp_sock *xs;
17031c1efc2aSMagnus Karlsson 	struct sock *sk;
1704c0c77d8fSBjörn Töpel 
1705c0c77d8fSBjörn Töpel 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
1706c0c77d8fSBjörn Töpel 		return -EPERM;
1707c0c77d8fSBjörn Töpel 	if (sock->type != SOCK_RAW)
1708c0c77d8fSBjörn Töpel 		return -ESOCKTNOSUPPORT;
1709c0c77d8fSBjörn Töpel 
1710c0c77d8fSBjörn Töpel 	if (protocol)
1711c0c77d8fSBjörn Töpel 		return -EPROTONOSUPPORT;
1712c0c77d8fSBjörn Töpel 
1713c0c77d8fSBjörn Töpel 	sock->state = SS_UNCONNECTED;
1714c0c77d8fSBjörn Töpel 
1715c0c77d8fSBjörn Töpel 	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1716c0c77d8fSBjörn Töpel 	if (!sk)
1717c0c77d8fSBjörn Töpel 		return -ENOBUFS;
1718c0c77d8fSBjörn Töpel 
1719c0c77d8fSBjörn Töpel 	sock->ops = &xsk_proto_ops;
1720c0c77d8fSBjörn Töpel 
1721c0c77d8fSBjörn Töpel 	sock_init_data(sock, sk);
1722c0c77d8fSBjörn Töpel 
1723c0c77d8fSBjörn Töpel 	sk->sk_family = PF_XDP;
1724c0c77d8fSBjörn Töpel 
172511fe9262SBjörn Töpel 	sk->sk_destruct = xsk_destruct;
172611fe9262SBjörn Töpel 
1727cee27167SBjörn Töpel 	sock_set_flag(sk, SOCK_RCU_FREE);
1728cee27167SBjörn Töpel 
1729c0c77d8fSBjörn Töpel 	xs = xdp_sk(sk);
1730455302d1SIlya Maximets 	xs->state = XSK_READY;
1731c0c77d8fSBjörn Töpel 	mutex_init(&xs->mutex);
1732bf0bdd13SIlya Maximets 	spin_lock_init(&xs->rx_lock);
1733c0c77d8fSBjörn Töpel 
17340402acd6SBjörn Töpel 	INIT_LIST_HEAD(&xs->map_list);
17350402acd6SBjörn Töpel 	spin_lock_init(&xs->map_list_lock);
17360402acd6SBjörn Töpel 
17371d0dc069SBjörn Töpel 	mutex_lock(&net->xdp.lock);
17381d0dc069SBjörn Töpel 	sk_add_node_rcu(sk, &net->xdp.list);
17391d0dc069SBjörn Töpel 	mutex_unlock(&net->xdp.lock);
17401d0dc069SBjörn Töpel 
1741c0c77d8fSBjörn Töpel 	sock_prot_inuse_add(net, &xsk_proto, 1);
1742c0c77d8fSBjörn Töpel 
1743c0c77d8fSBjörn Töpel 	return 0;
1744c0c77d8fSBjörn Töpel }
1745c0c77d8fSBjörn Töpel 
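/* Creating the socket from userspace, matching the checks in
 * xsk_create() above (sketch; socket() returns -1 and sets errno):
 *
 *	int fd = socket(AF_XDP, SOCK_RAW, 0);
 *	// errno is EPERM without CAP_NET_RAW, ESOCKTNOSUPPORT for a
 *	// type other than SOCK_RAW, EPROTONOSUPPORT for protocol != 0
 */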
1746c0c77d8fSBjörn Töpel static const struct net_proto_family xsk_family_ops = {
1747c0c77d8fSBjörn Töpel 	.family = PF_XDP,
1748c0c77d8fSBjörn Töpel 	.create = xsk_create,
1749c0c77d8fSBjörn Töpel 	.owner	= THIS_MODULE,
1750c0c77d8fSBjörn Töpel };
1751c0c77d8fSBjörn Töpel 
1752455302d1SIlya Maximets static struct notifier_block xsk_netdev_notifier = {
1753455302d1SIlya Maximets 	.notifier_call	= xsk_notifier,
1754455302d1SIlya Maximets };
1755455302d1SIlya Maximets 
17561d0dc069SBjörn Töpel static int __net_init xsk_net_init(struct net *net)
17571d0dc069SBjörn Töpel {
17581d0dc069SBjörn Töpel 	mutex_init(&net->xdp.lock);
17591d0dc069SBjörn Töpel 	INIT_HLIST_HEAD(&net->xdp.list);
17601d0dc069SBjörn Töpel 	return 0;
17611d0dc069SBjörn Töpel }
17621d0dc069SBjörn Töpel 
17631d0dc069SBjörn Töpel static void __net_exit xsk_net_exit(struct net *net)
17641d0dc069SBjörn Töpel {
17651d0dc069SBjörn Töpel 	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
17661d0dc069SBjörn Töpel }
17671d0dc069SBjörn Töpel 
17681d0dc069SBjörn Töpel static struct pernet_operations xsk_net_ops = {
17691d0dc069SBjörn Töpel 	.init = xsk_net_init,
17701d0dc069SBjörn Töpel 	.exit = xsk_net_exit,
17711d0dc069SBjörn Töpel };
17721d0dc069SBjörn Töpel 
1773c0c77d8fSBjörn Töpel static int __init xsk_init(void)
1774c0c77d8fSBjörn Töpel {
1775e312b9e7SBjörn Töpel 	int err, cpu;
1776c0c77d8fSBjörn Töpel 
1777c0c77d8fSBjörn Töpel 	err = proto_register(&xsk_proto, 0 /* no slab */);
1778c0c77d8fSBjörn Töpel 	if (err)
1779c0c77d8fSBjörn Töpel 		goto out;
1780c0c77d8fSBjörn Töpel 
1781c0c77d8fSBjörn Töpel 	err = sock_register(&xsk_family_ops);
1782c0c77d8fSBjörn Töpel 	if (err)
1783c0c77d8fSBjörn Töpel 		goto out_proto;
1784c0c77d8fSBjörn Töpel 
17851d0dc069SBjörn Töpel 	err = register_pernet_subsys(&xsk_net_ops);
17861d0dc069SBjörn Töpel 	if (err)
17871d0dc069SBjörn Töpel 		goto out_sk;
1788455302d1SIlya Maximets 
1789455302d1SIlya Maximets 	err = register_netdevice_notifier(&xsk_netdev_notifier);
1790455302d1SIlya Maximets 	if (err)
1791455302d1SIlya Maximets 		goto out_pernet;
1792455302d1SIlya Maximets 
1793e312b9e7SBjörn Töpel 	for_each_possible_cpu(cpu)
1794e312b9e7SBjörn Töpel 		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
1795c0c77d8fSBjörn Töpel 	return 0;
1796c0c77d8fSBjörn Töpel 
1797455302d1SIlya Maximets out_pernet:
1798455302d1SIlya Maximets 	unregister_pernet_subsys(&xsk_net_ops);
17991d0dc069SBjörn Töpel out_sk:
18001d0dc069SBjörn Töpel 	sock_unregister(PF_XDP);
1801c0c77d8fSBjörn Töpel out_proto:
1802c0c77d8fSBjörn Töpel 	proto_unregister(&xsk_proto);
1803c0c77d8fSBjörn Töpel out:
1804c0c77d8fSBjörn Töpel 	return err;
1805c0c77d8fSBjörn Töpel }
1806c0c77d8fSBjörn Töpel 
1807c0c77d8fSBjörn Töpel fs_initcall(xsk_init);
1808