xref: /linux/net/xdp/xsk.c (revision f7306ace)
1c0c77d8fSBjörn Töpel // SPDX-License-Identifier: GPL-2.0
2c0c77d8fSBjörn Töpel /* XDP sockets
3c0c77d8fSBjörn Töpel  *
4c0c77d8fSBjörn Töpel  * AF_XDP sockets allows a channel between XDP programs and userspace
5c0c77d8fSBjörn Töpel  * applications.
6c0c77d8fSBjörn Töpel  * Copyright(c) 2018 Intel Corporation.
7c0c77d8fSBjörn Töpel  *
8c0c77d8fSBjörn Töpel  * Author(s): Björn Töpel <bjorn.topel@intel.com>
9c0c77d8fSBjörn Töpel  *	      Magnus Karlsson <magnus.karlsson@intel.com>
10c0c77d8fSBjörn Töpel  */
11c0c77d8fSBjörn Töpel 
12c0c77d8fSBjörn Töpel #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13c0c77d8fSBjörn Töpel 
14c0c77d8fSBjörn Töpel #include <linux/if_xdp.h>
15c0c77d8fSBjörn Töpel #include <linux/init.h>
16c0c77d8fSBjörn Töpel #include <linux/sched/mm.h>
17c0c77d8fSBjörn Töpel #include <linux/sched/signal.h>
18c0c77d8fSBjörn Töpel #include <linux/sched/task.h>
19c0c77d8fSBjörn Töpel #include <linux/socket.h>
20c0c77d8fSBjörn Töpel #include <linux/file.h>
21c0c77d8fSBjörn Töpel #include <linux/uaccess.h>
22c0c77d8fSBjörn Töpel #include <linux/net.h>
23c0c77d8fSBjörn Töpel #include <linux/netdevice.h>
24ac98d8aaSMagnus Karlsson #include <linux/rculist.h>
25951bce29SXuan Zhuo #include <linux/vmalloc.h>
26a71506a4SMagnus Karlsson #include <net/xdp_sock_drv.h>
27a0731952SBjörn Töpel #include <net/busy_poll.h>
28b9b6b68eSBjörn Töpel #include <net/xdp.h>
29c0c77d8fSBjörn Töpel 
30423f3832SMagnus Karlsson #include "xsk_queue.h"
31c0c77d8fSBjörn Töpel #include "xdp_umem.h"
32a36b38aaSBjörn Töpel #include "xsk.h"
33c0c77d8fSBjörn Töpel 
34e7a1c130SLi RongQing #define TX_BATCH_SIZE 32
3535fcde7fSMagnus Karlsson 
36e312b9e7SBjörn Töpel static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
37e312b9e7SBjörn Töpel 
38c4655761SMagnus Karlsson void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
3977cd0d7bSMagnus Karlsson {
40c2d3d6a4SMagnus Karlsson 	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
4177cd0d7bSMagnus Karlsson 		return;
4277cd0d7bSMagnus Karlsson 
437361f9c3SMagnus Karlsson 	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
44c2d3d6a4SMagnus Karlsson 	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
4577cd0d7bSMagnus Karlsson }
4677cd0d7bSMagnus Karlsson EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
4777cd0d7bSMagnus Karlsson 
48c4655761SMagnus Karlsson void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
4977cd0d7bSMagnus Karlsson {
5077cd0d7bSMagnus Karlsson 	struct xdp_sock *xs;
5177cd0d7bSMagnus Karlsson 
52c2d3d6a4SMagnus Karlsson 	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
5377cd0d7bSMagnus Karlsson 		return;
5477cd0d7bSMagnus Karlsson 
5577cd0d7bSMagnus Karlsson 	rcu_read_lock();
56a5aa8e52SMagnus Karlsson 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
5777cd0d7bSMagnus Karlsson 		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
5877cd0d7bSMagnus Karlsson 	}
5977cd0d7bSMagnus Karlsson 	rcu_read_unlock();
6077cd0d7bSMagnus Karlsson 
61c2d3d6a4SMagnus Karlsson 	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
6277cd0d7bSMagnus Karlsson }
6377cd0d7bSMagnus Karlsson EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
6477cd0d7bSMagnus Karlsson 
65c4655761SMagnus Karlsson void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
6677cd0d7bSMagnus Karlsson {
67c2d3d6a4SMagnus Karlsson 	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
6877cd0d7bSMagnus Karlsson 		return;
6977cd0d7bSMagnus Karlsson 
707361f9c3SMagnus Karlsson 	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
71c2d3d6a4SMagnus Karlsson 	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
7277cd0d7bSMagnus Karlsson }
7377cd0d7bSMagnus Karlsson EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
7477cd0d7bSMagnus Karlsson 
75c4655761SMagnus Karlsson void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
7677cd0d7bSMagnus Karlsson {
7777cd0d7bSMagnus Karlsson 	struct xdp_sock *xs;
7877cd0d7bSMagnus Karlsson 
79c2d3d6a4SMagnus Karlsson 	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
8077cd0d7bSMagnus Karlsson 		return;
8177cd0d7bSMagnus Karlsson 
8277cd0d7bSMagnus Karlsson 	rcu_read_lock();
83a5aa8e52SMagnus Karlsson 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
8477cd0d7bSMagnus Karlsson 		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
8577cd0d7bSMagnus Karlsson 	}
8677cd0d7bSMagnus Karlsson 	rcu_read_unlock();
8777cd0d7bSMagnus Karlsson 
88c2d3d6a4SMagnus Karlsson 	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
8977cd0d7bSMagnus Karlsson }
9077cd0d7bSMagnus Karlsson EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
9177cd0d7bSMagnus Karlsson 
92c4655761SMagnus Karlsson bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
9377cd0d7bSMagnus Karlsson {
94c2d3d6a4SMagnus Karlsson 	return pool->uses_need_wakeup;
9577cd0d7bSMagnus Karlsson }
96c4655761SMagnus Karlsson EXPORT_SYMBOL(xsk_uses_need_wakeup);
9777cd0d7bSMagnus Karlsson 
981c1efc2aSMagnus Karlsson struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
991c1efc2aSMagnus Karlsson 					    u16 queue_id)
1001c1efc2aSMagnus Karlsson {
1011c1efc2aSMagnus Karlsson 	if (queue_id < dev->real_num_rx_queues)
1021c1efc2aSMagnus Karlsson 		return dev->_rx[queue_id].pool;
1031c1efc2aSMagnus Karlsson 	if (queue_id < dev->real_num_tx_queues)
1041c1efc2aSMagnus Karlsson 		return dev->_tx[queue_id].pool;
1051c1efc2aSMagnus Karlsson 
1061c1efc2aSMagnus Karlsson 	return NULL;
1071c1efc2aSMagnus Karlsson }
1081c1efc2aSMagnus Karlsson EXPORT_SYMBOL(xsk_get_pool_from_qid);
1091c1efc2aSMagnus Karlsson 
1101c1efc2aSMagnus Karlsson void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
1111c1efc2aSMagnus Karlsson {
112b425e24aSMaxim Mikityanskiy 	if (queue_id < dev->num_rx_queues)
1131c1efc2aSMagnus Karlsson 		dev->_rx[queue_id].pool = NULL;
114b425e24aSMaxim Mikityanskiy 	if (queue_id < dev->num_tx_queues)
1151c1efc2aSMagnus Karlsson 		dev->_tx[queue_id].pool = NULL;
1161c1efc2aSMagnus Karlsson }
1171c1efc2aSMagnus Karlsson 
1181c1efc2aSMagnus Karlsson /* The buffer pool is stored both in the _rx struct and the _tx struct as we do
1191c1efc2aSMagnus Karlsson  * not know if the device has more tx queues than rx, or the opposite.
1201c1efc2aSMagnus Karlsson  * This might also change during run time.
1211c1efc2aSMagnus Karlsson  */
1221c1efc2aSMagnus Karlsson int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
1231c1efc2aSMagnus Karlsson 			u16 queue_id)
1241c1efc2aSMagnus Karlsson {
1251c1efc2aSMagnus Karlsson 	if (queue_id >= max_t(unsigned int,
1261c1efc2aSMagnus Karlsson 			      dev->real_num_rx_queues,
1271c1efc2aSMagnus Karlsson 			      dev->real_num_tx_queues))
1281c1efc2aSMagnus Karlsson 		return -EINVAL;
1291c1efc2aSMagnus Karlsson 
1301c1efc2aSMagnus Karlsson 	if (queue_id < dev->real_num_rx_queues)
1311c1efc2aSMagnus Karlsson 		dev->_rx[queue_id].pool = pool;
1321c1efc2aSMagnus Karlsson 	if (queue_id < dev->real_num_tx_queues)
1331c1efc2aSMagnus Karlsson 		dev->_tx[queue_id].pool = pool;
1341c1efc2aSMagnus Karlsson 
1351c1efc2aSMagnus Karlsson 	return 0;
1361c1efc2aSMagnus Karlsson }
1371c1efc2aSMagnus Karlsson 
1382b43470aSBjörn Töpel static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
139c05cd364SKevin Laatz {
1402b43470aSBjörn Töpel 	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
1412b43470aSBjörn Töpel 	u64 addr;
1422b43470aSBjörn Töpel 	int err;
143c05cd364SKevin Laatz 
1442b43470aSBjörn Töpel 	addr = xp_get_handle(xskb);
1452b43470aSBjörn Töpel 	err = xskq_prod_reserve_desc(xs->rx, addr, len);
1462b43470aSBjörn Töpel 	if (err) {
1478aa5a335SCiara Loftus 		xs->rx_queue_full++;
1482b43470aSBjörn Töpel 		return err;
1492b43470aSBjörn Töpel 	}
150c05cd364SKevin Laatz 
1512b43470aSBjörn Töpel 	xp_release(xskb);
1522b43470aSBjörn Töpel 	return 0;
1532b43470aSBjörn Töpel }
154c05cd364SKevin Laatz 
1552b43470aSBjörn Töpel static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
1562b43470aSBjörn Töpel {
1572b43470aSBjörn Töpel 	void *from_buf, *to_buf;
1582b43470aSBjörn Töpel 	u32 metalen;
1592b43470aSBjörn Töpel 
1602b43470aSBjörn Töpel 	if (unlikely(xdp_data_meta_unsupported(from))) {
1612b43470aSBjörn Töpel 		from_buf = from->data;
1622b43470aSBjörn Töpel 		to_buf = to->data;
1632b43470aSBjörn Töpel 		metalen = 0;
1642b43470aSBjörn Töpel 	} else {
1652b43470aSBjörn Töpel 		from_buf = from->data_meta;
1662b43470aSBjörn Töpel 		metalen = from->data - from->data_meta;
1672b43470aSBjörn Töpel 		to_buf = to->data - metalen;
168c05cd364SKevin Laatz 	}
169c05cd364SKevin Laatz 
170c05cd364SKevin Laatz 	memcpy(to_buf, from_buf, len + metalen);
171c05cd364SKevin Laatz }
172c05cd364SKevin Laatz 
173458f7272SBjörn Töpel static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
174173d3adbSBjörn Töpel {
1752b43470aSBjörn Töpel 	struct xdp_buff *xsk_xdp;
1764e64c835SBjörn Töpel 	int err;
177458f7272SBjörn Töpel 	u32 len;
178c497176cSBjörn Töpel 
179458f7272SBjörn Töpel 	len = xdp->data_end - xdp->data;
180c4655761SMagnus Karlsson 	if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
181a509a955SBjörn Töpel 		xs->rx_dropped++;
182c497176cSBjörn Töpel 		return -ENOSPC;
183a509a955SBjörn Töpel 	}
184c497176cSBjörn Töpel 
185c4655761SMagnus Karlsson 	xsk_xdp = xsk_buff_alloc(xs->pool);
1862b43470aSBjörn Töpel 	if (!xsk_xdp) {
1872b43470aSBjörn Töpel 		xs->rx_dropped++;
188c6c1f11bSBjörn Töpel 		return -ENOMEM;
18918baed26SBjörn Töpel 	}
19018baed26SBjörn Töpel 
1912b43470aSBjörn Töpel 	xsk_copy_xdp(xsk_xdp, xdp, len);
1922b43470aSBjörn Töpel 	err = __xsk_rcv_zc(xs, xsk_xdp, len);
1932b43470aSBjörn Töpel 	if (err) {
1942b43470aSBjörn Töpel 		xsk_buff_free(xsk_xdp);
1952b43470aSBjörn Töpel 		return err;
1962b43470aSBjörn Töpel 	}
197173d3adbSBjörn Töpel 	return 0;
198173d3adbSBjörn Töpel }
199173d3adbSBjörn Töpel 
2003413f041SXuan Zhuo static bool xsk_tx_writeable(struct xdp_sock *xs)
2013413f041SXuan Zhuo {
2023413f041SXuan Zhuo 	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
2033413f041SXuan Zhuo 		return false;
2043413f041SXuan Zhuo 
2053413f041SXuan Zhuo 	return true;
2063413f041SXuan Zhuo }
2073413f041SXuan Zhuo 
20842fddcc7SBjörn Töpel static bool xsk_is_bound(struct xdp_sock *xs)
20942fddcc7SBjörn Töpel {
21042fddcc7SBjörn Töpel 	if (READ_ONCE(xs->state) == XSK_BOUND) {
21142fddcc7SBjörn Töpel 		/* Matches smp_wmb() in bind(). */
21242fddcc7SBjörn Töpel 		smp_rmb();
21342fddcc7SBjörn Töpel 		return true;
21442fddcc7SBjörn Töpel 	}
21542fddcc7SBjörn Töpel 	return false;
21642fddcc7SBjörn Töpel }
21742fddcc7SBjörn Töpel 
218458f7272SBjörn Töpel static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp)
219c497176cSBjörn Töpel {
22042fddcc7SBjörn Töpel 	if (!xsk_is_bound(xs))
2212be4a677SMaciej Fijalkowski 		return -ENXIO;
22242fddcc7SBjörn Töpel 
223173d3adbSBjörn Töpel 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
224173d3adbSBjörn Töpel 		return -EINVAL;
225c497176cSBjörn Töpel 
226b02e5a0eSBjörn Töpel 	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
227458f7272SBjörn Töpel 	return 0;
228c497176cSBjörn Töpel }
229c497176cSBjörn Töpel 
230d817991cSBjörn Töpel static void xsk_flush(struct xdp_sock *xs)
231c497176cSBjörn Töpel {
23259e35e55SMagnus Karlsson 	xskq_prod_submit(xs->rx);
2337361f9c3SMagnus Karlsson 	__xskq_cons_release(xs->pool->fq);
23443a825afSBjörn Töpel 	sock_def_readable(&xs->sk);
235c497176cSBjörn Töpel }
236c497176cSBjörn Töpel 
237c497176cSBjörn Töpel int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
238c497176cSBjörn Töpel {
239c497176cSBjörn Töpel 	int err;
240c497176cSBjörn Töpel 
241bf0bdd13SIlya Maximets 	spin_lock_bh(&xs->rx_lock);
242458f7272SBjörn Töpel 	err = xsk_rcv_check(xs, xdp);
243458f7272SBjörn Töpel 	if (!err) {
244458f7272SBjörn Töpel 		err = __xsk_rcv(xs, xdp);
2452b43470aSBjörn Töpel 		xsk_flush(xs);
246458f7272SBjörn Töpel 	}
247bf0bdd13SIlya Maximets 	spin_unlock_bh(&xs->rx_lock);
248c497176cSBjörn Töpel 	return err;
249c497176cSBjörn Töpel }
250c497176cSBjörn Töpel 
251458f7272SBjörn Töpel static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
252458f7272SBjörn Töpel {
253458f7272SBjörn Töpel 	int err;
254458f7272SBjörn Töpel 	u32 len;
255458f7272SBjörn Töpel 
256458f7272SBjörn Töpel 	err = xsk_rcv_check(xs, xdp);
257458f7272SBjörn Töpel 	if (err)
258458f7272SBjörn Töpel 		return err;
259458f7272SBjörn Töpel 
260458f7272SBjörn Töpel 	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
261458f7272SBjörn Töpel 		len = xdp->data_end - xdp->data;
262458f7272SBjörn Töpel 		return __xsk_rcv_zc(xs, xdp, len);
263458f7272SBjörn Töpel 	}
264458f7272SBjörn Töpel 
265458f7272SBjörn Töpel 	err = __xsk_rcv(xs, xdp);
266458f7272SBjörn Töpel 	if (!err)
267458f7272SBjörn Töpel 		xdp_return_buff(xdp);
268458f7272SBjörn Töpel 	return err;
269458f7272SBjörn Töpel }
270458f7272SBjörn Töpel 
271e312b9e7SBjörn Töpel int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
272d817991cSBjörn Töpel {
273e312b9e7SBjörn Töpel 	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
274d817991cSBjörn Töpel 	int err;
275d817991cSBjörn Töpel 
276458f7272SBjörn Töpel 	err = xsk_rcv(xs, xdp);
277d817991cSBjörn Töpel 	if (err)
278d817991cSBjörn Töpel 		return err;
279d817991cSBjörn Töpel 
280d817991cSBjörn Töpel 	if (!xs->flush_node.prev)
281d817991cSBjörn Töpel 		list_add(&xs->flush_node, flush_list);
282d817991cSBjörn Töpel 
283d817991cSBjörn Töpel 	return 0;
284d817991cSBjörn Töpel }
285d817991cSBjörn Töpel 
286e312b9e7SBjörn Töpel void __xsk_map_flush(void)
287d817991cSBjörn Töpel {
288e312b9e7SBjörn Töpel 	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
289d817991cSBjörn Töpel 	struct xdp_sock *xs, *tmp;
290d817991cSBjörn Töpel 
291d817991cSBjörn Töpel 	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
292d817991cSBjörn Töpel 		xsk_flush(xs);
293d817991cSBjörn Töpel 		__list_del_clearprev(&xs->flush_node);
294d817991cSBjörn Töpel 	}
295d817991cSBjörn Töpel }
296d817991cSBjörn Töpel 
297c4655761SMagnus Karlsson void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
298ac98d8aaSMagnus Karlsson {
2997361f9c3SMagnus Karlsson 	xskq_prod_submit_n(pool->cq, nb_entries);
300ac98d8aaSMagnus Karlsson }
301c4655761SMagnus Karlsson EXPORT_SYMBOL(xsk_tx_completed);
302ac98d8aaSMagnus Karlsson 
303c4655761SMagnus Karlsson void xsk_tx_release(struct xsk_buff_pool *pool)
304ac98d8aaSMagnus Karlsson {
305ac98d8aaSMagnus Karlsson 	struct xdp_sock *xs;
306ac98d8aaSMagnus Karlsson 
307ac98d8aaSMagnus Karlsson 	rcu_read_lock();
308a5aa8e52SMagnus Karlsson 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
30930744a68SMagnus Karlsson 		__xskq_cons_release(xs->tx);
3103413f041SXuan Zhuo 		if (xsk_tx_writeable(xs))
311ac98d8aaSMagnus Karlsson 			xs->sk.sk_write_space(&xs->sk);
312ac98d8aaSMagnus Karlsson 	}
313ac98d8aaSMagnus Karlsson 	rcu_read_unlock();
314ac98d8aaSMagnus Karlsson }
315c4655761SMagnus Karlsson EXPORT_SYMBOL(xsk_tx_release);
316ac98d8aaSMagnus Karlsson 
317c4655761SMagnus Karlsson bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
318ac98d8aaSMagnus Karlsson {
319ac98d8aaSMagnus Karlsson 	struct xdp_sock *xs;
320ac98d8aaSMagnus Karlsson 
321ac98d8aaSMagnus Karlsson 	rcu_read_lock();
322a5aa8e52SMagnus Karlsson 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
3231c1efc2aSMagnus Karlsson 		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
3248aa5a335SCiara Loftus 			xs->tx->queue_empty_descs++;
325ac98d8aaSMagnus Karlsson 			continue;
3268aa5a335SCiara Loftus 		}
327ac98d8aaSMagnus Karlsson 
3280a05861fSTobias Klauser 		/* This is the backpressure mechanism for the Tx path.
32915d8c916SMagnus Karlsson 		 * Reserve space in the completion queue and only proceed
33015d8c916SMagnus Karlsson 		 * if there is space in it. This avoids having to implement
33115d8c916SMagnus Karlsson 		 * any buffering in the Tx path.
33215d8c916SMagnus Karlsson 		 */
3337361f9c3SMagnus Karlsson 		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
334ac98d8aaSMagnus Karlsson 			goto out;
335ac98d8aaSMagnus Karlsson 
336c5ed924bSMagnus Karlsson 		xskq_cons_release(xs->tx);
337ac98d8aaSMagnus Karlsson 		rcu_read_unlock();
338ac98d8aaSMagnus Karlsson 		return true;
339ac98d8aaSMagnus Karlsson 	}
340ac98d8aaSMagnus Karlsson 
341ac98d8aaSMagnus Karlsson out:
342ac98d8aaSMagnus Karlsson 	rcu_read_unlock();
343ac98d8aaSMagnus Karlsson 	return false;
344ac98d8aaSMagnus Karlsson }
345c4655761SMagnus Karlsson EXPORT_SYMBOL(xsk_tx_peek_desc);
346ac98d8aaSMagnus Karlsson 
347d1bc532eSMagnus Karlsson static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
3489349eb3aSMagnus Karlsson {
349d1bc532eSMagnus Karlsson 	struct xdp_desc *descs = pool->tx_descs;
3509349eb3aSMagnus Karlsson 	u32 nb_pkts = 0;
3519349eb3aSMagnus Karlsson 
3529349eb3aSMagnus Karlsson 	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
3539349eb3aSMagnus Karlsson 		nb_pkts++;
3549349eb3aSMagnus Karlsson 
3559349eb3aSMagnus Karlsson 	xsk_tx_release(pool);
3569349eb3aSMagnus Karlsson 	return nb_pkts;
3579349eb3aSMagnus Karlsson }
3589349eb3aSMagnus Karlsson 
359c00c4461SMaciej Fijalkowski u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
3609349eb3aSMagnus Karlsson {
3619349eb3aSMagnus Karlsson 	struct xdp_sock *xs;
3629349eb3aSMagnus Karlsson 
3639349eb3aSMagnus Karlsson 	rcu_read_lock();
3649349eb3aSMagnus Karlsson 	if (!list_is_singular(&pool->xsk_tx_list)) {
3659349eb3aSMagnus Karlsson 		/* Fallback to the non-batched version */
3669349eb3aSMagnus Karlsson 		rcu_read_unlock();
367c00c4461SMaciej Fijalkowski 		return xsk_tx_peek_release_fallback(pool, nb_pkts);
3689349eb3aSMagnus Karlsson 	}
3699349eb3aSMagnus Karlsson 
3709349eb3aSMagnus Karlsson 	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
3719349eb3aSMagnus Karlsson 	if (!xs) {
3729349eb3aSMagnus Karlsson 		nb_pkts = 0;
3739349eb3aSMagnus Karlsson 		goto out;
3749349eb3aSMagnus Karlsson 	}
3759349eb3aSMagnus Karlsson 
376c00c4461SMaciej Fijalkowski 	nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);
3779349eb3aSMagnus Karlsson 
3789349eb3aSMagnus Karlsson 	/* This is the backpressure mechanism for the Tx path. Try to
3799349eb3aSMagnus Karlsson 	 * reserve space in the completion queue for all packets, but
3809349eb3aSMagnus Karlsson 	 * if there are fewer slots available, just process that many
3819349eb3aSMagnus Karlsson 	 * packets. This avoids having to implement any buffering in
3829349eb3aSMagnus Karlsson 	 * the Tx path.
3839349eb3aSMagnus Karlsson 	 */
384c00c4461SMaciej Fijalkowski 	nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
3859349eb3aSMagnus Karlsson 	if (!nb_pkts)
3869349eb3aSMagnus Karlsson 		goto out;
3879349eb3aSMagnus Karlsson 
388c00c4461SMaciej Fijalkowski 	nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
389c00c4461SMaciej Fijalkowski 	if (!nb_pkts) {
390c00c4461SMaciej Fijalkowski 		xs->tx->queue_empty_descs++;
391c00c4461SMaciej Fijalkowski 		goto out;
392c00c4461SMaciej Fijalkowski 	}
393c00c4461SMaciej Fijalkowski 
3949349eb3aSMagnus Karlsson 	__xskq_cons_release(xs->tx);
395c00c4461SMaciej Fijalkowski 	xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
3969349eb3aSMagnus Karlsson 	xs->sk.sk_write_space(&xs->sk);
3979349eb3aSMagnus Karlsson 
3989349eb3aSMagnus Karlsson out:
3999349eb3aSMagnus Karlsson 	rcu_read_unlock();
4009349eb3aSMagnus Karlsson 	return nb_pkts;
4019349eb3aSMagnus Karlsson }
4029349eb3aSMagnus Karlsson EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
4039349eb3aSMagnus Karlsson 
40406870682SMaxim Mikityanskiy static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
405ac98d8aaSMagnus Karlsson {
406ac98d8aaSMagnus Karlsson 	struct net_device *dev = xs->dev;
407ac98d8aaSMagnus Karlsson 
40818b1ab7aSMagnus Karlsson 	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
409ac98d8aaSMagnus Karlsson }
410ac98d8aaSMagnus Karlsson 
41135fcde7fSMagnus Karlsson static void xsk_destruct_skb(struct sk_buff *skb)
41235fcde7fSMagnus Karlsson {
413bbff2f32SBjörn Töpel 	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
41435fcde7fSMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(skb->sk);
415a9744f7cSMagnus Karlsson 	unsigned long flags;
41635fcde7fSMagnus Karlsson 
417f09ced40SMagnus Karlsson 	spin_lock_irqsave(&xs->pool->cq_lock, flags);
4187361f9c3SMagnus Karlsson 	xskq_prod_submit_addr(xs->pool->cq, addr);
419f09ced40SMagnus Karlsson 	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
42035fcde7fSMagnus Karlsson 
42135fcde7fSMagnus Karlsson 	sock_wfree(skb);
42235fcde7fSMagnus Karlsson }
42335fcde7fSMagnus Karlsson 
4249c8f21e6SXuan Zhuo static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
4259c8f21e6SXuan Zhuo 					      struct xdp_desc *desc)
4269c8f21e6SXuan Zhuo {
4279c8f21e6SXuan Zhuo 	struct xsk_buff_pool *pool = xs->pool;
4289c8f21e6SXuan Zhuo 	u32 hr, len, ts, offset, copy, copied;
4299c8f21e6SXuan Zhuo 	struct sk_buff *skb;
4309c8f21e6SXuan Zhuo 	struct page *page;
4319c8f21e6SXuan Zhuo 	void *buffer;
4329c8f21e6SXuan Zhuo 	int err, i;
4339c8f21e6SXuan Zhuo 	u64 addr;
4349c8f21e6SXuan Zhuo 
4359c8f21e6SXuan Zhuo 	hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
4369c8f21e6SXuan Zhuo 
4379c8f21e6SXuan Zhuo 	skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
4389c8f21e6SXuan Zhuo 	if (unlikely(!skb))
4399c8f21e6SXuan Zhuo 		return ERR_PTR(err);
4409c8f21e6SXuan Zhuo 
4419c8f21e6SXuan Zhuo 	skb_reserve(skb, hr);
4429c8f21e6SXuan Zhuo 
4439c8f21e6SXuan Zhuo 	addr = desc->addr;
4449c8f21e6SXuan Zhuo 	len = desc->len;
4459c8f21e6SXuan Zhuo 	ts = pool->unaligned ? len : pool->chunk_size;
4469c8f21e6SXuan Zhuo 
4479c8f21e6SXuan Zhuo 	buffer = xsk_buff_raw_get_data(pool, addr);
4489c8f21e6SXuan Zhuo 	offset = offset_in_page(buffer);
4499c8f21e6SXuan Zhuo 	addr = buffer - pool->addrs;
4509c8f21e6SXuan Zhuo 
4519c8f21e6SXuan Zhuo 	for (copied = 0, i = 0; copied < len; i++) {
4529c8f21e6SXuan Zhuo 		page = pool->umem->pgs[addr >> PAGE_SHIFT];
4539c8f21e6SXuan Zhuo 		get_page(page);
4549c8f21e6SXuan Zhuo 
4559c8f21e6SXuan Zhuo 		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
4569c8f21e6SXuan Zhuo 		skb_fill_page_desc(skb, i, page, offset, copy);
4579c8f21e6SXuan Zhuo 
4589c8f21e6SXuan Zhuo 		copied += copy;
4599c8f21e6SXuan Zhuo 		addr += copy;
4609c8f21e6SXuan Zhuo 		offset = 0;
4619c8f21e6SXuan Zhuo 	}
4629c8f21e6SXuan Zhuo 
4639c8f21e6SXuan Zhuo 	skb->len += len;
4649c8f21e6SXuan Zhuo 	skb->data_len += len;
4659c8f21e6SXuan Zhuo 	skb->truesize += ts;
4669c8f21e6SXuan Zhuo 
4679c8f21e6SXuan Zhuo 	refcount_add(ts, &xs->sk.sk_wmem_alloc);
4689c8f21e6SXuan Zhuo 
4699c8f21e6SXuan Zhuo 	return skb;
4709c8f21e6SXuan Zhuo }
4719c8f21e6SXuan Zhuo 
4729c8f21e6SXuan Zhuo static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
4739c8f21e6SXuan Zhuo 				     struct xdp_desc *desc)
4749c8f21e6SXuan Zhuo {
4759c8f21e6SXuan Zhuo 	struct net_device *dev = xs->dev;
4769c8f21e6SXuan Zhuo 	struct sk_buff *skb;
4779c8f21e6SXuan Zhuo 
4789c8f21e6SXuan Zhuo 	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
4799c8f21e6SXuan Zhuo 		skb = xsk_build_skb_zerocopy(xs, desc);
4809c8f21e6SXuan Zhuo 		if (IS_ERR(skb))
4819c8f21e6SXuan Zhuo 			return skb;
4829c8f21e6SXuan Zhuo 	} else {
4839c8f21e6SXuan Zhuo 		u32 hr, tr, len;
4849c8f21e6SXuan Zhuo 		void *buffer;
4859c8f21e6SXuan Zhuo 		int err;
4869c8f21e6SXuan Zhuo 
4879c8f21e6SXuan Zhuo 		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
4889c8f21e6SXuan Zhuo 		tr = dev->needed_tailroom;
4899c8f21e6SXuan Zhuo 		len = desc->len;
4909c8f21e6SXuan Zhuo 
4919c8f21e6SXuan Zhuo 		skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
4929c8f21e6SXuan Zhuo 		if (unlikely(!skb))
4939c8f21e6SXuan Zhuo 			return ERR_PTR(err);
4949c8f21e6SXuan Zhuo 
4959c8f21e6SXuan Zhuo 		skb_reserve(skb, hr);
4969c8f21e6SXuan Zhuo 		skb_put(skb, len);
4979c8f21e6SXuan Zhuo 
4989c8f21e6SXuan Zhuo 		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
4999c8f21e6SXuan Zhuo 		err = skb_store_bits(skb, 0, buffer, len);
5009c8f21e6SXuan Zhuo 		if (unlikely(err)) {
5019c8f21e6SXuan Zhuo 			kfree_skb(skb);
5029c8f21e6SXuan Zhuo 			return ERR_PTR(err);
5039c8f21e6SXuan Zhuo 		}
5049c8f21e6SXuan Zhuo 	}
5059c8f21e6SXuan Zhuo 
5069c8f21e6SXuan Zhuo 	skb->dev = dev;
5079c8f21e6SXuan Zhuo 	skb->priority = xs->sk.sk_priority;
5089c8f21e6SXuan Zhuo 	skb->mark = xs->sk.sk_mark;
5099c8f21e6SXuan Zhuo 	skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
5109c8f21e6SXuan Zhuo 	skb->destructor = xsk_destruct_skb;
5119c8f21e6SXuan Zhuo 
5129c8f21e6SXuan Zhuo 	return skb;
5139c8f21e6SXuan Zhuo }
5149c8f21e6SXuan Zhuo 
5151596dae2SMaciej Fijalkowski static int __xsk_generic_xmit(struct sock *sk)
51635fcde7fSMagnus Karlsson {
51735fcde7fSMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(sk);
518df551058SMagnus Karlsson 	u32 max_batch = TX_BATCH_SIZE;
51935fcde7fSMagnus Karlsson 	bool sent_frame = false;
52035fcde7fSMagnus Karlsson 	struct xdp_desc desc;
52135fcde7fSMagnus Karlsson 	struct sk_buff *skb;
522f09ced40SMagnus Karlsson 	unsigned long flags;
52335fcde7fSMagnus Karlsson 	int err = 0;
52435fcde7fSMagnus Karlsson 
52535fcde7fSMagnus Karlsson 	mutex_lock(&xs->mutex);
52635fcde7fSMagnus Karlsson 
52718b1ab7aSMagnus Karlsson 	/* Since we dropped the RCU read lock, the socket state might have changed. */
52818b1ab7aSMagnus Karlsson 	if (unlikely(!xsk_is_bound(xs))) {
52918b1ab7aSMagnus Karlsson 		err = -ENXIO;
53018b1ab7aSMagnus Karlsson 		goto out;
53118b1ab7aSMagnus Karlsson 	}
53218b1ab7aSMagnus Karlsson 
53367571640SIlya Maximets 	if (xs->queue_id >= xs->dev->real_num_tx_queues)
53467571640SIlya Maximets 		goto out;
53567571640SIlya Maximets 
5361c1efc2aSMagnus Karlsson 	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
53735fcde7fSMagnus Karlsson 		if (max_batch-- == 0) {
53835fcde7fSMagnus Karlsson 			err = -EAGAIN;
53935fcde7fSMagnus Karlsson 			goto out;
54035fcde7fSMagnus Karlsson 		}
54135fcde7fSMagnus Karlsson 
5420a05861fSTobias Klauser 		/* This is the backpressure mechanism for the Tx path.
54315d8c916SMagnus Karlsson 		 * Reserve space in the completion queue and only proceed
54415d8c916SMagnus Karlsson 		 * if there is space in it. This avoids having to implement
54515d8c916SMagnus Karlsson 		 * any buffering in the Tx path.
54615d8c916SMagnus Karlsson 		 */
547f09ced40SMagnus Karlsson 		spin_lock_irqsave(&xs->pool->cq_lock, flags);
5489c8f21e6SXuan Zhuo 		if (xskq_prod_reserve(xs->pool->cq)) {
549f09ced40SMagnus Karlsson 			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
55035fcde7fSMagnus Karlsson 			goto out;
55135fcde7fSMagnus Karlsson 		}
552f09ced40SMagnus Karlsson 		spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
55335fcde7fSMagnus Karlsson 
554a6e944f2SCiara Loftus 		skb = xsk_build_skb(xs, &desc);
555a6e944f2SCiara Loftus 		if (IS_ERR(skb)) {
556a6e944f2SCiara Loftus 			err = PTR_ERR(skb);
557a6e944f2SCiara Loftus 			spin_lock_irqsave(&xs->pool->cq_lock, flags);
558a6e944f2SCiara Loftus 			xskq_prod_cancel(xs->pool->cq);
559a6e944f2SCiara Loftus 			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
560a6e944f2SCiara Loftus 			goto out;
561a6e944f2SCiara Loftus 		}
562a6e944f2SCiara Loftus 
56336ccdf85SBjörn Töpel 		err = __dev_direct_xmit(skb, xs->queue_id);
564642e450bSMagnus Karlsson 		if  (err == NETDEV_TX_BUSY) {
565642e450bSMagnus Karlsson 			/* Tell user-space to retry the send */
566642e450bSMagnus Karlsson 			skb->destructor = sock_wfree;
567b1b95cb5SMagnus Karlsson 			spin_lock_irqsave(&xs->pool->cq_lock, flags);
568b1b95cb5SMagnus Karlsson 			xskq_prod_cancel(xs->pool->cq);
569b1b95cb5SMagnus Karlsson 			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
570642e450bSMagnus Karlsson 			/* Free skb without triggering the perf drop trace */
571642e450bSMagnus Karlsson 			consume_skb(skb);
572642e450bSMagnus Karlsson 			err = -EAGAIN;
573642e450bSMagnus Karlsson 			goto out;
574642e450bSMagnus Karlsson 		}
575642e450bSMagnus Karlsson 
576c5ed924bSMagnus Karlsson 		xskq_cons_release(xs->tx);
57735fcde7fSMagnus Karlsson 		/* Ignore NET_XMIT_CN as packet might have been sent */
578642e450bSMagnus Karlsson 		if (err == NET_XMIT_DROP) {
579fe588685SMagnus Karlsson 			/* SKB completed but not sent */
580fe588685SMagnus Karlsson 			err = -EBUSY;
58135fcde7fSMagnus Karlsson 			goto out;
58235fcde7fSMagnus Karlsson 		}
58335fcde7fSMagnus Karlsson 
58435fcde7fSMagnus Karlsson 		sent_frame = true;
58535fcde7fSMagnus Karlsson 	}
58635fcde7fSMagnus Karlsson 
5878aa5a335SCiara Loftus 	xs->tx->queue_empty_descs++;
5888aa5a335SCiara Loftus 
58935fcde7fSMagnus Karlsson out:
59035fcde7fSMagnus Karlsson 	if (sent_frame)
5913413f041SXuan Zhuo 		if (xsk_tx_writeable(xs))
59235fcde7fSMagnus Karlsson 			sk->sk_write_space(sk);
59335fcde7fSMagnus Karlsson 
59435fcde7fSMagnus Karlsson 	mutex_unlock(&xs->mutex);
59535fcde7fSMagnus Karlsson 	return err;
59635fcde7fSMagnus Karlsson }
59735fcde7fSMagnus Karlsson 
5981596dae2SMaciej Fijalkowski static int xsk_generic_xmit(struct sock *sk)
599df551058SMagnus Karlsson {
60018b1ab7aSMagnus Karlsson 	int ret;
601df551058SMagnus Karlsson 
60218b1ab7aSMagnus Karlsson 	/* Drop the RCU lock since the SKB path might sleep. */
60318b1ab7aSMagnus Karlsson 	rcu_read_unlock();
6041596dae2SMaciej Fijalkowski 	ret = __xsk_generic_xmit(sk);
60518b1ab7aSMagnus Karlsson 	/* Reaquire RCU lock before going into common code. */
60618b1ab7aSMagnus Karlsson 	rcu_read_lock();
60718b1ab7aSMagnus Karlsson 
60818b1ab7aSMagnus Karlsson 	return ret;
609df551058SMagnus Karlsson }
610df551058SMagnus Karlsson 
611a0731952SBjörn Töpel static bool xsk_no_wakeup(struct sock *sk)
612a0731952SBjörn Töpel {
613a0731952SBjörn Töpel #ifdef CONFIG_NET_RX_BUSY_POLL
614a0731952SBjörn Töpel 	/* Prefer busy-polling, skip the wakeup. */
615a0731952SBjörn Töpel 	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
616a0731952SBjörn Töpel 		READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
617a0731952SBjörn Töpel #else
618a0731952SBjörn Töpel 	return false;
619a0731952SBjörn Töpel #endif
620a0731952SBjörn Töpel }
621a0731952SBjörn Töpel 
6221596dae2SMaciej Fijalkowski static int xsk_check_common(struct xdp_sock *xs)
6231596dae2SMaciej Fijalkowski {
6241596dae2SMaciej Fijalkowski 	if (unlikely(!xsk_is_bound(xs)))
6251596dae2SMaciej Fijalkowski 		return -ENXIO;
6261596dae2SMaciej Fijalkowski 	if (unlikely(!(xs->dev->flags & IFF_UP)))
6271596dae2SMaciej Fijalkowski 		return -ENETDOWN;
6281596dae2SMaciej Fijalkowski 
6291596dae2SMaciej Fijalkowski 	return 0;
6301596dae2SMaciej Fijalkowski }
6311596dae2SMaciej Fijalkowski 
63218b1ab7aSMagnus Karlsson static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
63335fcde7fSMagnus Karlsson {
634ac98d8aaSMagnus Karlsson 	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
63535fcde7fSMagnus Karlsson 	struct sock *sk = sock->sk;
63635fcde7fSMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(sk);
637e3920818SBjörn Töpel 	struct xsk_buff_pool *pool;
6381596dae2SMaciej Fijalkowski 	int err;
63935fcde7fSMagnus Karlsson 
6401596dae2SMaciej Fijalkowski 	err = xsk_check_common(xs);
6411596dae2SMaciej Fijalkowski 	if (err)
6421596dae2SMaciej Fijalkowski 		return err;
643df551058SMagnus Karlsson 	if (unlikely(need_wait))
644ac98d8aaSMagnus Karlsson 		return -EOPNOTSUPP;
6451596dae2SMaciej Fijalkowski 	if (unlikely(!xs->tx))
6461596dae2SMaciej Fijalkowski 		return -ENOBUFS;
64735fcde7fSMagnus Karlsson 
648ca2e1a62SMaciej Fijalkowski 	if (sk_can_busy_loop(sk)) {
649ca2e1a62SMaciej Fijalkowski 		if (xs->zc)
650ca2e1a62SMaciej Fijalkowski 			__sk_mark_napi_id_once(sk, xsk_pool_get_napi_id(xs->pool));
651a0731952SBjörn Töpel 		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
652ca2e1a62SMaciej Fijalkowski 	}
653a0731952SBjörn Töpel 
6548de8b71bSMaciej Fijalkowski 	if (xs->zc && xsk_no_wakeup(sk))
655a0731952SBjörn Töpel 		return 0;
656a0731952SBjörn Töpel 
657e3920818SBjörn Töpel 	pool = xs->pool;
6581596dae2SMaciej Fijalkowski 	if (pool->cached_need_wakeup & XDP_WAKEUP_TX) {
6591596dae2SMaciej Fijalkowski 		if (xs->zc)
6601596dae2SMaciej Fijalkowski 			return xsk_wakeup(xs, XDP_WAKEUP_TX);
6611596dae2SMaciej Fijalkowski 		return xsk_generic_xmit(sk);
6621596dae2SMaciej Fijalkowski 	}
663e3920818SBjörn Töpel 	return 0;
66435fcde7fSMagnus Karlsson }
66535fcde7fSMagnus Karlsson 
66618b1ab7aSMagnus Karlsson static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
66718b1ab7aSMagnus Karlsson {
66818b1ab7aSMagnus Karlsson 	int ret;
66918b1ab7aSMagnus Karlsson 
67018b1ab7aSMagnus Karlsson 	rcu_read_lock();
67118b1ab7aSMagnus Karlsson 	ret = __xsk_sendmsg(sock, m, total_len);
67218b1ab7aSMagnus Karlsson 	rcu_read_unlock();
67318b1ab7aSMagnus Karlsson 
67418b1ab7aSMagnus Karlsson 	return ret;
67518b1ab7aSMagnus Karlsson }
67618b1ab7aSMagnus Karlsson 
67718b1ab7aSMagnus Karlsson static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
67845a86681SBjörn Töpel {
67945a86681SBjörn Töpel 	bool need_wait = !(flags & MSG_DONTWAIT);
68045a86681SBjörn Töpel 	struct sock *sk = sock->sk;
68145a86681SBjörn Töpel 	struct xdp_sock *xs = xdp_sk(sk);
6821596dae2SMaciej Fijalkowski 	int err;
68345a86681SBjörn Töpel 
6841596dae2SMaciej Fijalkowski 	err = xsk_check_common(xs);
6851596dae2SMaciej Fijalkowski 	if (err)
6861596dae2SMaciej Fijalkowski 		return err;
68745a86681SBjörn Töpel 	if (unlikely(!xs->rx))
68845a86681SBjörn Töpel 		return -ENOBUFS;
68945a86681SBjörn Töpel 	if (unlikely(need_wait))
69045a86681SBjörn Töpel 		return -EOPNOTSUPP;
69145a86681SBjörn Töpel 
692a0731952SBjörn Töpel 	if (sk_can_busy_loop(sk))
693a0731952SBjörn Töpel 		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
694a0731952SBjörn Töpel 
695a0731952SBjörn Töpel 	if (xsk_no_wakeup(sk))
696a0731952SBjörn Töpel 		return 0;
697a0731952SBjörn Töpel 
69845a86681SBjörn Töpel 	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
69945a86681SBjörn Töpel 		return xsk_wakeup(xs, XDP_WAKEUP_RX);
70045a86681SBjörn Töpel 	return 0;
701c497176cSBjörn Töpel }
702c497176cSBjörn Töpel 
70318b1ab7aSMagnus Karlsson static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
70418b1ab7aSMagnus Karlsson {
70518b1ab7aSMagnus Karlsson 	int ret;
70618b1ab7aSMagnus Karlsson 
70718b1ab7aSMagnus Karlsson 	rcu_read_lock();
70818b1ab7aSMagnus Karlsson 	ret = __xsk_recvmsg(sock, m, len, flags);
70918b1ab7aSMagnus Karlsson 	rcu_read_unlock();
71018b1ab7aSMagnus Karlsson 
71118b1ab7aSMagnus Karlsson 	return ret;
71218b1ab7aSMagnus Karlsson }
71318b1ab7aSMagnus Karlsson 
7145d946c5aSLuc Van Oostenryck static __poll_t xsk_poll(struct file *file, struct socket *sock,
715a11e1d43SLinus Torvalds 			     struct poll_table_struct *wait)
716c497176cSBjörn Töpel {
717f5da5418SXuan Zhuo 	__poll_t mask = 0;
718df551058SMagnus Karlsson 	struct sock *sk = sock->sk;
719df551058SMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(sk);
720c2d3d6a4SMagnus Karlsson 	struct xsk_buff_pool *pool;
72142fddcc7SBjörn Töpel 
7220706a78fSMagnus Karlsson 	sock_poll_wait(file, sock, wait);
7230706a78fSMagnus Karlsson 
72418b1ab7aSMagnus Karlsson 	rcu_read_lock();
7251596dae2SMaciej Fijalkowski 	if (xsk_check_common(xs))
7261596dae2SMaciej Fijalkowski 		goto skip_tx;
72742fddcc7SBjörn Töpel 
728c2d3d6a4SMagnus Karlsson 	pool = xs->pool;
72977cd0d7bSMagnus Karlsson 
730c2d3d6a4SMagnus Karlsson 	if (pool->cached_need_wakeup) {
73106870682SMaxim Mikityanskiy 		if (xs->zc)
732c2d3d6a4SMagnus Karlsson 			xsk_wakeup(xs, pool->cached_need_wakeup);
7331596dae2SMaciej Fijalkowski 		else if (xs->tx)
734df551058SMagnus Karlsson 			/* Poll needs to drive Tx also in copy mode */
7351596dae2SMaciej Fijalkowski 			xsk_generic_xmit(sk);
736df551058SMagnus Karlsson 	}
737c497176cSBjörn Töpel 
7381596dae2SMaciej Fijalkowski skip_tx:
73959e35e55SMagnus Karlsson 	if (xs->rx && !xskq_prod_is_empty(xs->rx))
7405d946c5aSLuc Van Oostenryck 		mask |= EPOLLIN | EPOLLRDNORM;
7413413f041SXuan Zhuo 	if (xs->tx && xsk_tx_writeable(xs))
7425d946c5aSLuc Van Oostenryck 		mask |= EPOLLOUT | EPOLLWRNORM;
743c497176cSBjörn Töpel 
74418b1ab7aSMagnus Karlsson 	rcu_read_unlock();
745c497176cSBjörn Töpel 	return mask;
746c497176cSBjörn Töpel }
747c497176cSBjörn Töpel 
748b9b6b68eSBjörn Töpel static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
749b9b6b68eSBjörn Töpel 			  bool umem_queue)
750423f3832SMagnus Karlsson {
751423f3832SMagnus Karlsson 	struct xsk_queue *q;
752423f3832SMagnus Karlsson 
753423f3832SMagnus Karlsson 	if (entries == 0 || *queue || !is_power_of_2(entries))
754423f3832SMagnus Karlsson 		return -EINVAL;
755423f3832SMagnus Karlsson 
756b9b6b68eSBjörn Töpel 	q = xskq_create(entries, umem_queue);
757423f3832SMagnus Karlsson 	if (!q)
758423f3832SMagnus Karlsson 		return -ENOMEM;
759423f3832SMagnus Karlsson 
76037b07693SBjörn Töpel 	/* Make sure queue is ready before it can be seen by others */
76137b07693SBjörn Töpel 	smp_wmb();
76294a99763SBjörn Töpel 	WRITE_ONCE(*queue, q);
763423f3832SMagnus Karlsson 	return 0;
764423f3832SMagnus Karlsson }
765423f3832SMagnus Karlsson 
766455302d1SIlya Maximets static void xsk_unbind_dev(struct xdp_sock *xs)
767455302d1SIlya Maximets {
768455302d1SIlya Maximets 	struct net_device *dev = xs->dev;
769455302d1SIlya Maximets 
77042fddcc7SBjörn Töpel 	if (xs->state != XSK_BOUND)
771455302d1SIlya Maximets 		return;
77242fddcc7SBjörn Töpel 	WRITE_ONCE(xs->state, XSK_UNBOUND);
773455302d1SIlya Maximets 
774455302d1SIlya Maximets 	/* Wait for driver to stop using the xdp socket. */
775a5aa8e52SMagnus Karlsson 	xp_del_xsk(xs->pool, xs);
776455302d1SIlya Maximets 	synchronize_net();
777455302d1SIlya Maximets 	dev_put(dev);
778455302d1SIlya Maximets }
779455302d1SIlya Maximets 
7800402acd6SBjörn Töpel static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
781782347b6SToke Høiland-Jørgensen 					      struct xdp_sock __rcu ***map_entry)
7820402acd6SBjörn Töpel {
7830402acd6SBjörn Töpel 	struct xsk_map *map = NULL;
7840402acd6SBjörn Töpel 	struct xsk_map_node *node;
7850402acd6SBjörn Töpel 
7860402acd6SBjörn Töpel 	*map_entry = NULL;
7870402acd6SBjörn Töpel 
7880402acd6SBjörn Töpel 	spin_lock_bh(&xs->map_list_lock);
7890402acd6SBjörn Töpel 	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
7900402acd6SBjörn Töpel 					node);
7910402acd6SBjörn Töpel 	if (node) {
792bb1b25caSZhu Yanjun 		bpf_map_inc(&node->map->map);
7930402acd6SBjörn Töpel 		map = node->map;
7940402acd6SBjörn Töpel 		*map_entry = node->map_entry;
7950402acd6SBjörn Töpel 	}
7960402acd6SBjörn Töpel 	spin_unlock_bh(&xs->map_list_lock);
7970402acd6SBjörn Töpel 	return map;
7980402acd6SBjörn Töpel }
7990402acd6SBjörn Töpel 
8000402acd6SBjörn Töpel static void xsk_delete_from_maps(struct xdp_sock *xs)
8010402acd6SBjörn Töpel {
8020402acd6SBjörn Töpel 	/* This function removes the current XDP socket from all the
8030402acd6SBjörn Töpel 	 * maps it resides in. We need to take extra care here, due to
8040402acd6SBjörn Töpel 	 * the two locks involved. Each map has a lock synchronizing
8050402acd6SBjörn Töpel 	 * updates to the entries, and each socket has a lock that
8060402acd6SBjörn Töpel 	 * synchronizes access to the list of maps (map_list). For
8070402acd6SBjörn Töpel 	 * deadlock avoidance the locks need to be taken in the order
8080402acd6SBjörn Töpel 	 * "map lock"->"socket map list lock". We start off by
8090402acd6SBjörn Töpel 	 * accessing the socket map list, and take a reference to the
8100402acd6SBjörn Töpel 	 * map to guarantee existence between the
8110402acd6SBjörn Töpel 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
8120402acd6SBjörn Töpel 	 * calls. Then we ask the map to remove the socket, which
8130402acd6SBjörn Töpel 	 * tries to remove the socket from the map. Note that there
8140402acd6SBjörn Töpel 	 * might be updates to the map between
8150402acd6SBjörn Töpel 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
8160402acd6SBjörn Töpel 	 */
817782347b6SToke Høiland-Jørgensen 	struct xdp_sock __rcu **map_entry = NULL;
8180402acd6SBjörn Töpel 	struct xsk_map *map;
8190402acd6SBjörn Töpel 
8200402acd6SBjörn Töpel 	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
8210402acd6SBjörn Töpel 		xsk_map_try_sock_delete(map, xs, map_entry);
822bb1b25caSZhu Yanjun 		bpf_map_put(&map->map);
8230402acd6SBjörn Töpel 	}
8240402acd6SBjörn Töpel }
8250402acd6SBjörn Töpel 
826c0c77d8fSBjörn Töpel static int xsk_release(struct socket *sock)
827c0c77d8fSBjörn Töpel {
828c0c77d8fSBjörn Töpel 	struct sock *sk = sock->sk;
829965a9909SMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(sk);
830c0c77d8fSBjörn Töpel 	struct net *net;
831c0c77d8fSBjörn Töpel 
832c0c77d8fSBjörn Töpel 	if (!sk)
833c0c77d8fSBjörn Töpel 		return 0;
834c0c77d8fSBjörn Töpel 
835c0c77d8fSBjörn Töpel 	net = sock_net(sk);
836c0c77d8fSBjörn Töpel 
8371d0dc069SBjörn Töpel 	mutex_lock(&net->xdp.lock);
8381d0dc069SBjörn Töpel 	sk_del_node_init_rcu(sk);
8391d0dc069SBjörn Töpel 	mutex_unlock(&net->xdp.lock);
8401d0dc069SBjörn Töpel 
841c0c77d8fSBjörn Töpel 	sock_prot_inuse_add(net, sk->sk_prot, -1);
842c0c77d8fSBjörn Töpel 
8430402acd6SBjörn Töpel 	xsk_delete_from_maps(xs);
84442fddcc7SBjörn Töpel 	mutex_lock(&xs->mutex);
845455302d1SIlya Maximets 	xsk_unbind_dev(xs);
84642fddcc7SBjörn Töpel 	mutex_unlock(&xs->mutex);
847965a9909SMagnus Karlsson 
848541d7fddSBjörn Töpel 	xskq_destroy(xs->rx);
849541d7fddSBjörn Töpel 	xskq_destroy(xs->tx);
8507361f9c3SMagnus Karlsson 	xskq_destroy(xs->fq_tmp);
8517361f9c3SMagnus Karlsson 	xskq_destroy(xs->cq_tmp);
852541d7fddSBjörn Töpel 
853c0c77d8fSBjörn Töpel 	sock_orphan(sk);
854c0c77d8fSBjörn Töpel 	sock->sk = NULL;
855c0c77d8fSBjörn Töpel 
856c0c77d8fSBjörn Töpel 	sock_put(sk);
857c0c77d8fSBjörn Töpel 
858c0c77d8fSBjörn Töpel 	return 0;
859c0c77d8fSBjörn Töpel }
860c0c77d8fSBjörn Töpel 
861965a9909SMagnus Karlsson static struct socket *xsk_lookup_xsk_from_fd(int fd)
862965a9909SMagnus Karlsson {
863965a9909SMagnus Karlsson 	struct socket *sock;
864965a9909SMagnus Karlsson 	int err;
865965a9909SMagnus Karlsson 
866965a9909SMagnus Karlsson 	sock = sockfd_lookup(fd, &err);
867965a9909SMagnus Karlsson 	if (!sock)
868965a9909SMagnus Karlsson 		return ERR_PTR(-ENOTSOCK);
869965a9909SMagnus Karlsson 
870965a9909SMagnus Karlsson 	if (sock->sk->sk_family != PF_XDP) {
871965a9909SMagnus Karlsson 		sockfd_put(sock);
872965a9909SMagnus Karlsson 		return ERR_PTR(-ENOPROTOOPT);
873965a9909SMagnus Karlsson 	}
874965a9909SMagnus Karlsson 
875965a9909SMagnus Karlsson 	return sock;
876965a9909SMagnus Karlsson }
877965a9909SMagnus Karlsson 
8787361f9c3SMagnus Karlsson static bool xsk_validate_queues(struct xdp_sock *xs)
8797361f9c3SMagnus Karlsson {
8807361f9c3SMagnus Karlsson 	return xs->fq_tmp && xs->cq_tmp;
8817361f9c3SMagnus Karlsson }
8827361f9c3SMagnus Karlsson 
883965a9909SMagnus Karlsson static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
884965a9909SMagnus Karlsson {
885965a9909SMagnus Karlsson 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
886965a9909SMagnus Karlsson 	struct sock *sk = sock->sk;
887965a9909SMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(sk);
888959b71dbSBjörn Töpel 	struct net_device *dev;
889*f7306aceSIlya Maximets 	int bound_dev_if;
890173d3adbSBjörn Töpel 	u32 flags, qid;
891965a9909SMagnus Karlsson 	int err = 0;
892965a9909SMagnus Karlsson 
893965a9909SMagnus Karlsson 	if (addr_len < sizeof(struct sockaddr_xdp))
894965a9909SMagnus Karlsson 		return -EINVAL;
895965a9909SMagnus Karlsson 	if (sxdp->sxdp_family != AF_XDP)
896965a9909SMagnus Karlsson 		return -EINVAL;
897965a9909SMagnus Karlsson 
898f54ba391SBjörn Töpel 	flags = sxdp->sxdp_flags;
89977cd0d7bSMagnus Karlsson 	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
90077cd0d7bSMagnus Karlsson 		      XDP_USE_NEED_WAKEUP))
901f54ba391SBjörn Töpel 		return -EINVAL;
902f54ba391SBjörn Töpel 
903*f7306aceSIlya Maximets 	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
904*f7306aceSIlya Maximets 	if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
905*f7306aceSIlya Maximets 		return -EINVAL;
906*f7306aceSIlya Maximets 
9075464c3a0SIlya Maximets 	rtnl_lock();
908965a9909SMagnus Karlsson 	mutex_lock(&xs->mutex);
909455302d1SIlya Maximets 	if (xs->state != XSK_READY) {
910959b71dbSBjörn Töpel 		err = -EBUSY;
911959b71dbSBjörn Töpel 		goto out_release;
912959b71dbSBjörn Töpel 	}
913959b71dbSBjörn Töpel 
914965a9909SMagnus Karlsson 	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
915965a9909SMagnus Karlsson 	if (!dev) {
916965a9909SMagnus Karlsson 		err = -ENODEV;
917965a9909SMagnus Karlsson 		goto out_release;
918965a9909SMagnus Karlsson 	}
919965a9909SMagnus Karlsson 
920f6145903SMagnus Karlsson 	if (!xs->rx && !xs->tx) {
921965a9909SMagnus Karlsson 		err = -EINVAL;
922965a9909SMagnus Karlsson 		goto out_unlock;
923965a9909SMagnus Karlsson 	}
924965a9909SMagnus Karlsson 
925173d3adbSBjörn Töpel 	qid = sxdp->sxdp_queue_id;
926173d3adbSBjörn Töpel 
927173d3adbSBjörn Töpel 	if (flags & XDP_SHARED_UMEM) {
928965a9909SMagnus Karlsson 		struct xdp_sock *umem_xs;
929965a9909SMagnus Karlsson 		struct socket *sock;
930965a9909SMagnus Karlsson 
93177cd0d7bSMagnus Karlsson 		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
93277cd0d7bSMagnus Karlsson 		    (flags & XDP_USE_NEED_WAKEUP)) {
933173d3adbSBjörn Töpel 			/* Cannot specify flags for shared sockets. */
934173d3adbSBjörn Töpel 			err = -EINVAL;
935173d3adbSBjörn Töpel 			goto out_unlock;
936173d3adbSBjörn Töpel 		}
937173d3adbSBjörn Töpel 
938965a9909SMagnus Karlsson 		if (xs->umem) {
939965a9909SMagnus Karlsson 			/* We have already our own. */
940965a9909SMagnus Karlsson 			err = -EINVAL;
941965a9909SMagnus Karlsson 			goto out_unlock;
942965a9909SMagnus Karlsson 		}
943965a9909SMagnus Karlsson 
944965a9909SMagnus Karlsson 		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
945965a9909SMagnus Karlsson 		if (IS_ERR(sock)) {
946965a9909SMagnus Karlsson 			err = PTR_ERR(sock);
947965a9909SMagnus Karlsson 			goto out_unlock;
948965a9909SMagnus Karlsson 		}
949965a9909SMagnus Karlsson 
950965a9909SMagnus Karlsson 		umem_xs = xdp_sk(sock->sk);
95142fddcc7SBjörn Töpel 		if (!xsk_is_bound(umem_xs)) {
952965a9909SMagnus Karlsson 			err = -EBADF;
953965a9909SMagnus Karlsson 			sockfd_put(sock);
954965a9909SMagnus Karlsson 			goto out_unlock;
95542fddcc7SBjörn Töpel 		}
956965a9909SMagnus Karlsson 
957a1132430SMagnus Karlsson 		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
958a1132430SMagnus Karlsson 			/* Share the umem with another socket on another qid
959a1132430SMagnus Karlsson 			 * and/or device.
960a1132430SMagnus Karlsson 			 */
961b5aea28dSMagnus Karlsson 			xs->pool = xp_create_and_assign_umem(xs,
962b5aea28dSMagnus Karlsson 							     umem_xs->umem);
963b5aea28dSMagnus Karlsson 			if (!xs->pool) {
9641fd17c8cSMagnus Karlsson 				err = -ENOMEM;
965b5aea28dSMagnus Karlsson 				sockfd_put(sock);
966b5aea28dSMagnus Karlsson 				goto out_unlock;
967b5aea28dSMagnus Karlsson 			}
968b5aea28dSMagnus Karlsson 
96960240bc2SJalal Mostafa 			err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
97060240bc2SJalal Mostafa 						   qid);
971b5aea28dSMagnus Karlsson 			if (err) {
972b5aea28dSMagnus Karlsson 				xp_destroy(xs->pool);
97383cf5c68SMagnus Karlsson 				xs->pool = NULL;
974b5aea28dSMagnus Karlsson 				sockfd_put(sock);
975b5aea28dSMagnus Karlsson 				goto out_unlock;
976b5aea28dSMagnus Karlsson 			}
977b5aea28dSMagnus Karlsson 		} else {
9781c1efc2aSMagnus Karlsson 			/* Share the buffer pool with the other socket. */
979b5aea28dSMagnus Karlsson 			if (xs->fq_tmp || xs->cq_tmp) {
980b5aea28dSMagnus Karlsson 				/* Do not allow setting your own fq or cq. */
981b5aea28dSMagnus Karlsson 				err = -EINVAL;
982b5aea28dSMagnus Karlsson 				sockfd_put(sock);
983b5aea28dSMagnus Karlsson 				goto out_unlock;
984b5aea28dSMagnus Karlsson 			}
985b5aea28dSMagnus Karlsson 
9861c1efc2aSMagnus Karlsson 			xp_get_pool(umem_xs->pool);
9871c1efc2aSMagnus Karlsson 			xs->pool = umem_xs->pool;
988ba3beec2SMaciej Fijalkowski 
989ba3beec2SMaciej Fijalkowski 			/* If underlying shared umem was created without Tx
990ba3beec2SMaciej Fijalkowski 			 * ring, allocate Tx descs array that Tx batching API
991ba3beec2SMaciej Fijalkowski 			 * utilizes
992ba3beec2SMaciej Fijalkowski 			 */
993ba3beec2SMaciej Fijalkowski 			if (xs->tx && !xs->pool->tx_descs) {
994ba3beec2SMaciej Fijalkowski 				err = xp_alloc_tx_descs(xs->pool, xs);
995ba3beec2SMaciej Fijalkowski 				if (err) {
996ba3beec2SMaciej Fijalkowski 					xp_put_pool(xs->pool);
997ba3beec2SMaciej Fijalkowski 					sockfd_put(sock);
998ba3beec2SMaciej Fijalkowski 					goto out_unlock;
999ba3beec2SMaciej Fijalkowski 				}
1000ba3beec2SMaciej Fijalkowski 			}
1001b5aea28dSMagnus Karlsson 		}
1002b5aea28dSMagnus Karlsson 
1003965a9909SMagnus Karlsson 		xdp_get_umem(umem_xs->umem);
10049764f4b3SBjörn Töpel 		WRITE_ONCE(xs->umem, umem_xs->umem);
1005965a9909SMagnus Karlsson 		sockfd_put(sock);
10067361f9c3SMagnus Karlsson 	} else if (!xs->umem || !xsk_validate_queues(xs)) {
1007965a9909SMagnus Karlsson 		err = -EINVAL;
1008965a9909SMagnus Karlsson 		goto out_unlock;
1009c497176cSBjörn Töpel 	} else {
1010c497176cSBjörn Töpel 		/* This xsk has its own umem. */
10111c1efc2aSMagnus Karlsson 		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
10121c1efc2aSMagnus Karlsson 		if (!xs->pool) {
10131c1efc2aSMagnus Karlsson 			err = -ENOMEM;
1014173d3adbSBjörn Töpel 			goto out_unlock;
1015965a9909SMagnus Karlsson 		}
1016965a9909SMagnus Karlsson 
10171c1efc2aSMagnus Karlsson 		err = xp_assign_dev(xs->pool, dev, qid, flags);
10181c1efc2aSMagnus Karlsson 		if (err) {
10191c1efc2aSMagnus Karlsson 			xp_destroy(xs->pool);
10201c1efc2aSMagnus Karlsson 			xs->pool = NULL;
10211c1efc2aSMagnus Karlsson 			goto out_unlock;
10221c1efc2aSMagnus Karlsson 		}
10231c1efc2aSMagnus Karlsson 	}
10241c1efc2aSMagnus Karlsson 
10258bee6833SMagnus Karlsson 	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
10268bee6833SMagnus Karlsson 	xs->fq_tmp = NULL;
10278bee6833SMagnus Karlsson 	xs->cq_tmp = NULL;
10288bee6833SMagnus Karlsson 
1029965a9909SMagnus Karlsson 	xs->dev = dev;
1030ac98d8aaSMagnus Karlsson 	xs->zc = xs->umem->zc;
1031ac98d8aaSMagnus Karlsson 	xs->queue_id = qid;
1032a5aa8e52SMagnus Karlsson 	xp_add_xsk(xs->pool, xs);
1033965a9909SMagnus Karlsson 
1034965a9909SMagnus Karlsson out_unlock:
103542fddcc7SBjörn Töpel 	if (err) {
1036965a9909SMagnus Karlsson 		dev_put(dev);
103742fddcc7SBjörn Töpel 	} else {
103842fddcc7SBjörn Töpel 		/* Matches smp_rmb() in bind() for shared umem
103942fddcc7SBjörn Töpel 		 * sockets, and xsk_is_bound().
104042fddcc7SBjörn Töpel 		 */
104142fddcc7SBjörn Töpel 		smp_wmb();
104242fddcc7SBjörn Töpel 		WRITE_ONCE(xs->state, XSK_BOUND);
104342fddcc7SBjörn Töpel 	}
1044965a9909SMagnus Karlsson out_release:
1045965a9909SMagnus Karlsson 	mutex_unlock(&xs->mutex);
10465464c3a0SIlya Maximets 	rtnl_unlock();
1047965a9909SMagnus Karlsson 	return err;
1048965a9909SMagnus Karlsson }
1049965a9909SMagnus Karlsson 
1050c05cd364SKevin Laatz struct xdp_umem_reg_v1 {
1051c05cd364SKevin Laatz 	__u64 addr; /* Start of packet data area */
1052c05cd364SKevin Laatz 	__u64 len; /* Length of packet data area */
1053c05cd364SKevin Laatz 	__u32 chunk_size;
1054c05cd364SKevin Laatz 	__u32 headroom;
1055c05cd364SKevin Laatz };
1056c05cd364SKevin Laatz 
1057c0c77d8fSBjörn Töpel static int xsk_setsockopt(struct socket *sock, int level, int optname,
1058a7b75c5aSChristoph Hellwig 			  sockptr_t optval, unsigned int optlen)
1059c0c77d8fSBjörn Töpel {
1060c0c77d8fSBjörn Töpel 	struct sock *sk = sock->sk;
1061c0c77d8fSBjörn Töpel 	struct xdp_sock *xs = xdp_sk(sk);
1062c0c77d8fSBjörn Töpel 	int err;
1063c0c77d8fSBjörn Töpel 
1064c0c77d8fSBjörn Töpel 	if (level != SOL_XDP)
1065c0c77d8fSBjörn Töpel 		return -ENOPROTOOPT;
1066c0c77d8fSBjörn Töpel 
1067c0c77d8fSBjörn Töpel 	switch (optname) {
1068b9b6b68eSBjörn Töpel 	case XDP_RX_RING:
1069f6145903SMagnus Karlsson 	case XDP_TX_RING:
1070b9b6b68eSBjörn Töpel 	{
1071b9b6b68eSBjörn Töpel 		struct xsk_queue **q;
1072b9b6b68eSBjörn Töpel 		int entries;
1073b9b6b68eSBjörn Töpel 
1074b9b6b68eSBjörn Töpel 		if (optlen < sizeof(entries))
1075b9b6b68eSBjörn Töpel 			return -EINVAL;
1076a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1077b9b6b68eSBjörn Töpel 			return -EFAULT;
1078b9b6b68eSBjörn Töpel 
1079b9b6b68eSBjörn Töpel 		mutex_lock(&xs->mutex);
1080455302d1SIlya Maximets 		if (xs->state != XSK_READY) {
1081455302d1SIlya Maximets 			mutex_unlock(&xs->mutex);
1082455302d1SIlya Maximets 			return -EBUSY;
1083455302d1SIlya Maximets 		}
1084f6145903SMagnus Karlsson 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
1085b9b6b68eSBjörn Töpel 		err = xsk_init_queue(entries, q, false);
108677cd0d7bSMagnus Karlsson 		if (!err && optname == XDP_TX_RING)
108777cd0d7bSMagnus Karlsson 			/* Tx needs to be explicitly woken up the first time */
108877cd0d7bSMagnus Karlsson 			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
1089b9b6b68eSBjörn Töpel 		mutex_unlock(&xs->mutex);
1090b9b6b68eSBjörn Töpel 		return err;
1091b9b6b68eSBjörn Töpel 	}
1092c0c77d8fSBjörn Töpel 	case XDP_UMEM_REG:
1093c0c77d8fSBjörn Töpel 	{
1094c05cd364SKevin Laatz 		size_t mr_size = sizeof(struct xdp_umem_reg);
1095c05cd364SKevin Laatz 		struct xdp_umem_reg mr = {};
1096c0c77d8fSBjörn Töpel 		struct xdp_umem *umem;
1097c0c77d8fSBjörn Töpel 
1098c05cd364SKevin Laatz 		if (optlen < sizeof(struct xdp_umem_reg_v1))
1099c05cd364SKevin Laatz 			return -EINVAL;
1100c05cd364SKevin Laatz 		else if (optlen < sizeof(mr))
1101c05cd364SKevin Laatz 			mr_size = sizeof(struct xdp_umem_reg_v1);
1102c05cd364SKevin Laatz 
1103a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&mr, optval, mr_size))
1104c0c77d8fSBjörn Töpel 			return -EFAULT;
1105c0c77d8fSBjörn Töpel 
1106c0c77d8fSBjörn Töpel 		mutex_lock(&xs->mutex);
1107455302d1SIlya Maximets 		if (xs->state != XSK_READY || xs->umem) {
1108c0c77d8fSBjörn Töpel 			mutex_unlock(&xs->mutex);
1109a49049eaSBjörn Töpel 			return -EBUSY;
1110a49049eaSBjörn Töpel 		}
1111a49049eaSBjörn Töpel 
1112a49049eaSBjörn Töpel 		umem = xdp_umem_create(&mr);
1113a49049eaSBjörn Töpel 		if (IS_ERR(umem)) {
1114a49049eaSBjörn Töpel 			mutex_unlock(&xs->mutex);
1115a49049eaSBjörn Töpel 			return PTR_ERR(umem);
1116c0c77d8fSBjörn Töpel 		}
1117c0c77d8fSBjörn Töpel 
1118c0c77d8fSBjörn Töpel 		/* Make sure umem is ready before it can be seen by others */
1119c0c77d8fSBjörn Töpel 		smp_wmb();
11209764f4b3SBjörn Töpel 		WRITE_ONCE(xs->umem, umem);
1121c0c77d8fSBjörn Töpel 		mutex_unlock(&xs->mutex);
1122c0c77d8fSBjörn Töpel 		return 0;
1123c0c77d8fSBjörn Töpel 	}
1124423f3832SMagnus Karlsson 	case XDP_UMEM_FILL_RING:
1125fe230832SMagnus Karlsson 	case XDP_UMEM_COMPLETION_RING:
1126423f3832SMagnus Karlsson 	{
1127423f3832SMagnus Karlsson 		struct xsk_queue **q;
1128423f3832SMagnus Karlsson 		int entries;
1129423f3832SMagnus Karlsson 
1130a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1131423f3832SMagnus Karlsson 			return -EFAULT;
1132423f3832SMagnus Karlsson 
1133423f3832SMagnus Karlsson 		mutex_lock(&xs->mutex);
1134455302d1SIlya Maximets 		if (xs->state != XSK_READY) {
1135455302d1SIlya Maximets 			mutex_unlock(&xs->mutex);
1136455302d1SIlya Maximets 			return -EBUSY;
1137455302d1SIlya Maximets 		}
1138a49049eaSBjörn Töpel 
11397361f9c3SMagnus Karlsson 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
11407361f9c3SMagnus Karlsson 			&xs->cq_tmp;
1141b9b6b68eSBjörn Töpel 		err = xsk_init_queue(entries, q, true);
1142423f3832SMagnus Karlsson 		mutex_unlock(&xs->mutex);
1143423f3832SMagnus Karlsson 		return err;
1144423f3832SMagnus Karlsson 	}
1145c0c77d8fSBjörn Töpel 	default:
1146c0c77d8fSBjörn Töpel 		break;
1147c0c77d8fSBjörn Töpel 	}
1148c0c77d8fSBjörn Töpel 
1149c0c77d8fSBjörn Töpel 	return -ENOPROTOOPT;
1150c0c77d8fSBjörn Töpel }
1151c0c77d8fSBjörn Töpel 
115277cd0d7bSMagnus Karlsson static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
115377cd0d7bSMagnus Karlsson {
115477cd0d7bSMagnus Karlsson 	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
115577cd0d7bSMagnus Karlsson 	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
115677cd0d7bSMagnus Karlsson 	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
115777cd0d7bSMagnus Karlsson }
115877cd0d7bSMagnus Karlsson 
115977cd0d7bSMagnus Karlsson static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
116077cd0d7bSMagnus Karlsson {
116177cd0d7bSMagnus Karlsson 	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
116277cd0d7bSMagnus Karlsson 	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
116377cd0d7bSMagnus Karlsson 	ring->desc = offsetof(struct xdp_umem_ring, desc);
116477cd0d7bSMagnus Karlsson }
116577cd0d7bSMagnus Karlsson 
11668aa5a335SCiara Loftus struct xdp_statistics_v1 {
11678aa5a335SCiara Loftus 	__u64 rx_dropped;
11688aa5a335SCiara Loftus 	__u64 rx_invalid_descs;
11698aa5a335SCiara Loftus 	__u64 tx_invalid_descs;
11708aa5a335SCiara Loftus };
11718aa5a335SCiara Loftus 
1172af75d9e0SMagnus Karlsson static int xsk_getsockopt(struct socket *sock, int level, int optname,
1173af75d9e0SMagnus Karlsson 			  char __user *optval, int __user *optlen)
1174af75d9e0SMagnus Karlsson {
1175af75d9e0SMagnus Karlsson 	struct sock *sk = sock->sk;
1176af75d9e0SMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(sk);
1177af75d9e0SMagnus Karlsson 	int len;
1178af75d9e0SMagnus Karlsson 
1179af75d9e0SMagnus Karlsson 	if (level != SOL_XDP)
1180af75d9e0SMagnus Karlsson 		return -ENOPROTOOPT;
1181af75d9e0SMagnus Karlsson 
1182af75d9e0SMagnus Karlsson 	if (get_user(len, optlen))
1183af75d9e0SMagnus Karlsson 		return -EFAULT;
1184af75d9e0SMagnus Karlsson 	if (len < 0)
1185af75d9e0SMagnus Karlsson 		return -EINVAL;
1186af75d9e0SMagnus Karlsson 
1187af75d9e0SMagnus Karlsson 	switch (optname) {
1188af75d9e0SMagnus Karlsson 	case XDP_STATISTICS:
1189af75d9e0SMagnus Karlsson 	{
11903c4f850eSPeilin Ye 		struct xdp_statistics stats = {};
11918aa5a335SCiara Loftus 		bool extra_stats = true;
11928aa5a335SCiara Loftus 		size_t stats_size;
1193af75d9e0SMagnus Karlsson 
11948aa5a335SCiara Loftus 		if (len < sizeof(struct xdp_statistics_v1)) {
1195af75d9e0SMagnus Karlsson 			return -EINVAL;
11968aa5a335SCiara Loftus 		} else if (len < sizeof(stats)) {
11978aa5a335SCiara Loftus 			extra_stats = false;
11988aa5a335SCiara Loftus 			stats_size = sizeof(struct xdp_statistics_v1);
11998aa5a335SCiara Loftus 		} else {
12008aa5a335SCiara Loftus 			stats_size = sizeof(stats);
12018aa5a335SCiara Loftus 		}
1202af75d9e0SMagnus Karlsson 
1203af75d9e0SMagnus Karlsson 		mutex_lock(&xs->mutex);
1204af75d9e0SMagnus Karlsson 		stats.rx_dropped = xs->rx_dropped;
12058aa5a335SCiara Loftus 		if (extra_stats) {
12068aa5a335SCiara Loftus 			stats.rx_ring_full = xs->rx_queue_full;
12078aa5a335SCiara Loftus 			stats.rx_fill_ring_empty_descs =
12087361f9c3SMagnus Karlsson 				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
12098aa5a335SCiara Loftus 			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
12108aa5a335SCiara Loftus 		} else {
12118aa5a335SCiara Loftus 			stats.rx_dropped += xs->rx_queue_full;
12128aa5a335SCiara Loftus 		}
1213af75d9e0SMagnus Karlsson 		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
1214af75d9e0SMagnus Karlsson 		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
1215af75d9e0SMagnus Karlsson 		mutex_unlock(&xs->mutex);
1216af75d9e0SMagnus Karlsson 
12178aa5a335SCiara Loftus 		if (copy_to_user(optval, &stats, stats_size))
1218af75d9e0SMagnus Karlsson 			return -EFAULT;
12198aa5a335SCiara Loftus 		if (put_user(stats_size, optlen))
1220af75d9e0SMagnus Karlsson 			return -EFAULT;
1221af75d9e0SMagnus Karlsson 
1222af75d9e0SMagnus Karlsson 		return 0;
1223af75d9e0SMagnus Karlsson 	}
1224b3a9e0beSBjörn Töpel 	case XDP_MMAP_OFFSETS:
1225b3a9e0beSBjörn Töpel 	{
1226b3a9e0beSBjörn Töpel 		struct xdp_mmap_offsets off;
122777cd0d7bSMagnus Karlsson 		struct xdp_mmap_offsets_v1 off_v1;
122877cd0d7bSMagnus Karlsson 		bool flags_supported = true;
122977cd0d7bSMagnus Karlsson 		void *to_copy;
1230b3a9e0beSBjörn Töpel 
123177cd0d7bSMagnus Karlsson 		if (len < sizeof(off_v1))
1232b3a9e0beSBjörn Töpel 			return -EINVAL;
123377cd0d7bSMagnus Karlsson 		else if (len < sizeof(off))
123477cd0d7bSMagnus Karlsson 			flags_supported = false;
1235b3a9e0beSBjörn Töpel 
123677cd0d7bSMagnus Karlsson 		if (flags_supported) {
123777cd0d7bSMagnus Karlsson 			/* xdp_ring_offset is identical to xdp_ring_offset_v1
123877cd0d7bSMagnus Karlsson 			 * except for the flags field added to the end.
123977cd0d7bSMagnus Karlsson 			 */
124077cd0d7bSMagnus Karlsson 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
124177cd0d7bSMagnus Karlsson 					       &off.rx);
124277cd0d7bSMagnus Karlsson 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
124377cd0d7bSMagnus Karlsson 					       &off.tx);
124477cd0d7bSMagnus Karlsson 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
124577cd0d7bSMagnus Karlsson 					       &off.fr);
124677cd0d7bSMagnus Karlsson 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
124777cd0d7bSMagnus Karlsson 					       &off.cr);
124877cd0d7bSMagnus Karlsson 			off.rx.flags = offsetof(struct xdp_rxtx_ring,
124977cd0d7bSMagnus Karlsson 						ptrs.flags);
125077cd0d7bSMagnus Karlsson 			off.tx.flags = offsetof(struct xdp_rxtx_ring,
125177cd0d7bSMagnus Karlsson 						ptrs.flags);
125277cd0d7bSMagnus Karlsson 			off.fr.flags = offsetof(struct xdp_umem_ring,
125377cd0d7bSMagnus Karlsson 						ptrs.flags);
125477cd0d7bSMagnus Karlsson 			off.cr.flags = offsetof(struct xdp_umem_ring,
125577cd0d7bSMagnus Karlsson 						ptrs.flags);
1256b3a9e0beSBjörn Töpel 
1257b3a9e0beSBjörn Töpel 			len = sizeof(off);
125877cd0d7bSMagnus Karlsson 			to_copy = &off;
125977cd0d7bSMagnus Karlsson 		} else {
126077cd0d7bSMagnus Karlsson 			xsk_enter_rxtx_offsets(&off_v1.rx);
126177cd0d7bSMagnus Karlsson 			xsk_enter_rxtx_offsets(&off_v1.tx);
126277cd0d7bSMagnus Karlsson 			xsk_enter_umem_offsets(&off_v1.fr);
126377cd0d7bSMagnus Karlsson 			xsk_enter_umem_offsets(&off_v1.cr);
126477cd0d7bSMagnus Karlsson 
126577cd0d7bSMagnus Karlsson 			len = sizeof(off_v1);
126677cd0d7bSMagnus Karlsson 			to_copy = &off_v1;
126777cd0d7bSMagnus Karlsson 		}
126877cd0d7bSMagnus Karlsson 
126977cd0d7bSMagnus Karlsson 		if (copy_to_user(optval, to_copy, len))
1270b3a9e0beSBjörn Töpel 			return -EFAULT;
1271b3a9e0beSBjörn Töpel 		if (put_user(len, optlen))
1272b3a9e0beSBjörn Töpel 			return -EFAULT;
1273b3a9e0beSBjörn Töpel 
1274b3a9e0beSBjörn Töpel 		return 0;
1275b3a9e0beSBjörn Töpel 	}
12762640d3c8SMaxim Mikityanskiy 	case XDP_OPTIONS:
12772640d3c8SMaxim Mikityanskiy 	{
12782640d3c8SMaxim Mikityanskiy 		struct xdp_options opts = {};
12792640d3c8SMaxim Mikityanskiy 
12802640d3c8SMaxim Mikityanskiy 		if (len < sizeof(opts))
12812640d3c8SMaxim Mikityanskiy 			return -EINVAL;
12822640d3c8SMaxim Mikityanskiy 
12832640d3c8SMaxim Mikityanskiy 		mutex_lock(&xs->mutex);
12842640d3c8SMaxim Mikityanskiy 		if (xs->zc)
12852640d3c8SMaxim Mikityanskiy 			opts.flags |= XDP_OPTIONS_ZEROCOPY;
12862640d3c8SMaxim Mikityanskiy 		mutex_unlock(&xs->mutex);
12872640d3c8SMaxim Mikityanskiy 
12882640d3c8SMaxim Mikityanskiy 		len = sizeof(opts);
12892640d3c8SMaxim Mikityanskiy 		if (copy_to_user(optval, &opts, len))
12902640d3c8SMaxim Mikityanskiy 			return -EFAULT;
12912640d3c8SMaxim Mikityanskiy 		if (put_user(len, optlen))
12922640d3c8SMaxim Mikityanskiy 			return -EFAULT;
12932640d3c8SMaxim Mikityanskiy 
12942640d3c8SMaxim Mikityanskiy 		return 0;
12952640d3c8SMaxim Mikityanskiy 	}
1296af75d9e0SMagnus Karlsson 	default:
1297af75d9e0SMagnus Karlsson 		break;
1298af75d9e0SMagnus Karlsson 	}
1299af75d9e0SMagnus Karlsson 
1300af75d9e0SMagnus Karlsson 	return -EOPNOTSUPP;
1301af75d9e0SMagnus Karlsson }
1302af75d9e0SMagnus Karlsson 
1303423f3832SMagnus Karlsson static int xsk_mmap(struct file *file, struct socket *sock,
1304423f3832SMagnus Karlsson 		    struct vm_area_struct *vma)
1305423f3832SMagnus Karlsson {
1306a5a16e43SGeert Uytterhoeven 	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1307423f3832SMagnus Karlsson 	unsigned long size = vma->vm_end - vma->vm_start;
1308423f3832SMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(sock->sk);
13095f5a7d8dSNuno Gonçalves 	int state = READ_ONCE(xs->state);
1310423f3832SMagnus Karlsson 	struct xsk_queue *q = NULL;
1311423f3832SMagnus Karlsson 
13125f5a7d8dSNuno Gonçalves 	if (state != XSK_READY && state != XSK_BOUND)
1313455302d1SIlya Maximets 		return -EBUSY;
1314455302d1SIlya Maximets 
1315b9b6b68eSBjörn Töpel 	if (offset == XDP_PGOFF_RX_RING) {
131637b07693SBjörn Töpel 		q = READ_ONCE(xs->rx);
1317f6145903SMagnus Karlsson 	} else if (offset == XDP_PGOFF_TX_RING) {
131837b07693SBjörn Töpel 		q = READ_ONCE(xs->tx);
1319b9b6b68eSBjörn Töpel 	} else {
1320e6762c8bSMagnus Karlsson 		/* Matches the smp_wmb() in XDP_UMEM_REG */
1321e6762c8bSMagnus Karlsson 		smp_rmb();
1322423f3832SMagnus Karlsson 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
13235f5a7d8dSNuno Gonçalves 			q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
13245f5a7d8dSNuno Gonçalves 						 READ_ONCE(xs->pool->fq);
1325fe230832SMagnus Karlsson 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
13265f5a7d8dSNuno Gonçalves 			q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
13275f5a7d8dSNuno Gonçalves 						 READ_ONCE(xs->pool->cq);
1328b9b6b68eSBjörn Töpel 	}
1329423f3832SMagnus Karlsson 
1330423f3832SMagnus Karlsson 	if (!q)
1331423f3832SMagnus Karlsson 		return -EINVAL;
1332423f3832SMagnus Karlsson 
1333e6762c8bSMagnus Karlsson 	/* Matches the smp_wmb() in xsk_init_queue */
1334e6762c8bSMagnus Karlsson 	smp_rmb();
13359f78bf33SXuan Zhuo 	if (size > q->ring_vmalloc_size)
1336423f3832SMagnus Karlsson 		return -EINVAL;
1337423f3832SMagnus Karlsson 
13389f78bf33SXuan Zhuo 	return remap_vmalloc_range(vma, q->ring, 0);
1339423f3832SMagnus Karlsson }
1340423f3832SMagnus Karlsson 
1341455302d1SIlya Maximets static int xsk_notifier(struct notifier_block *this,
1342455302d1SIlya Maximets 			unsigned long msg, void *ptr)
1343455302d1SIlya Maximets {
1344455302d1SIlya Maximets 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1345455302d1SIlya Maximets 	struct net *net = dev_net(dev);
1346455302d1SIlya Maximets 	struct sock *sk;
1347455302d1SIlya Maximets 
1348455302d1SIlya Maximets 	switch (msg) {
1349455302d1SIlya Maximets 	case NETDEV_UNREGISTER:
1350455302d1SIlya Maximets 		mutex_lock(&net->xdp.lock);
1351455302d1SIlya Maximets 		sk_for_each(sk, &net->xdp.list) {
1352455302d1SIlya Maximets 			struct xdp_sock *xs = xdp_sk(sk);
1353455302d1SIlya Maximets 
1354455302d1SIlya Maximets 			mutex_lock(&xs->mutex);
1355455302d1SIlya Maximets 			if (xs->dev == dev) {
1356455302d1SIlya Maximets 				sk->sk_err = ENETDOWN;
1357455302d1SIlya Maximets 				if (!sock_flag(sk, SOCK_DEAD))
1358e3ae2365SAlexander Aring 					sk_error_report(sk);
1359455302d1SIlya Maximets 
1360455302d1SIlya Maximets 				xsk_unbind_dev(xs);
1361455302d1SIlya Maximets 
13621c1efc2aSMagnus Karlsson 				/* Clear device references. */
13631c1efc2aSMagnus Karlsson 				xp_clear_dev(xs->pool);
1364455302d1SIlya Maximets 			}
1365455302d1SIlya Maximets 			mutex_unlock(&xs->mutex);
1366455302d1SIlya Maximets 		}
1367455302d1SIlya Maximets 		mutex_unlock(&net->xdp.lock);
1368455302d1SIlya Maximets 		break;
1369455302d1SIlya Maximets 	}
1370455302d1SIlya Maximets 	return NOTIFY_DONE;
1371455302d1SIlya Maximets }
1372455302d1SIlya Maximets 
1373c0c77d8fSBjörn Töpel static struct proto xsk_proto = {
1374c0c77d8fSBjörn Töpel 	.name =		"XDP",
1375c0c77d8fSBjörn Töpel 	.owner =	THIS_MODULE,
1376c0c77d8fSBjörn Töpel 	.obj_size =	sizeof(struct xdp_sock),
1377c0c77d8fSBjörn Töpel };
1378c0c77d8fSBjörn Töpel 
1379c0c77d8fSBjörn Töpel static const struct proto_ops xsk_proto_ops = {
1380c0c77d8fSBjörn Töpel 	.family		= PF_XDP,
1381c0c77d8fSBjörn Töpel 	.owner		= THIS_MODULE,
1382c0c77d8fSBjörn Töpel 	.release	= xsk_release,
1383965a9909SMagnus Karlsson 	.bind		= xsk_bind,
1384c0c77d8fSBjörn Töpel 	.connect	= sock_no_connect,
1385c0c77d8fSBjörn Töpel 	.socketpair	= sock_no_socketpair,
1386c0c77d8fSBjörn Töpel 	.accept		= sock_no_accept,
1387c0c77d8fSBjörn Töpel 	.getname	= sock_no_getname,
1388a11e1d43SLinus Torvalds 	.poll		= xsk_poll,
1389c0c77d8fSBjörn Töpel 	.ioctl		= sock_no_ioctl,
1390c0c77d8fSBjörn Töpel 	.listen		= sock_no_listen,
1391c0c77d8fSBjörn Töpel 	.shutdown	= sock_no_shutdown,
1392c0c77d8fSBjörn Töpel 	.setsockopt	= xsk_setsockopt,
1393af75d9e0SMagnus Karlsson 	.getsockopt	= xsk_getsockopt,
139435fcde7fSMagnus Karlsson 	.sendmsg	= xsk_sendmsg,
139545a86681SBjörn Töpel 	.recvmsg	= xsk_recvmsg,
1396423f3832SMagnus Karlsson 	.mmap		= xsk_mmap,
1397c0c77d8fSBjörn Töpel };
1398c0c77d8fSBjörn Töpel 
139911fe9262SBjörn Töpel static void xsk_destruct(struct sock *sk)
140011fe9262SBjörn Töpel {
140111fe9262SBjörn Töpel 	struct xdp_sock *xs = xdp_sk(sk);
140211fe9262SBjörn Töpel 
140311fe9262SBjörn Töpel 	if (!sock_flag(sk, SOCK_DEAD))
140411fe9262SBjörn Töpel 		return;
140511fe9262SBjörn Töpel 
1406e5e1a4bcSMagnus Karlsson 	if (!xp_put_pool(xs->pool))
1407537cf4e3SMagnus Karlsson 		xdp_put_umem(xs->umem, !xs->pool);
140811fe9262SBjörn Töpel }
140911fe9262SBjörn Töpel 
1410c0c77d8fSBjörn Töpel static int xsk_create(struct net *net, struct socket *sock, int protocol,
1411c0c77d8fSBjörn Töpel 		      int kern)
1412c0c77d8fSBjörn Töpel {
1413c0c77d8fSBjörn Töpel 	struct xdp_sock *xs;
14141c1efc2aSMagnus Karlsson 	struct sock *sk;
1415c0c77d8fSBjörn Töpel 
1416c0c77d8fSBjörn Töpel 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
1417c0c77d8fSBjörn Töpel 		return -EPERM;
1418c0c77d8fSBjörn Töpel 	if (sock->type != SOCK_RAW)
1419c0c77d8fSBjörn Töpel 		return -ESOCKTNOSUPPORT;
1420c0c77d8fSBjörn Töpel 
1421c0c77d8fSBjörn Töpel 	if (protocol)
1422c0c77d8fSBjörn Töpel 		return -EPROTONOSUPPORT;
1423c0c77d8fSBjörn Töpel 
1424c0c77d8fSBjörn Töpel 	sock->state = SS_UNCONNECTED;
1425c0c77d8fSBjörn Töpel 
1426c0c77d8fSBjörn Töpel 	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1427c0c77d8fSBjörn Töpel 	if (!sk)
1428c0c77d8fSBjörn Töpel 		return -ENOBUFS;
1429c0c77d8fSBjörn Töpel 
1430c0c77d8fSBjörn Töpel 	sock->ops = &xsk_proto_ops;
1431c0c77d8fSBjörn Töpel 
1432c0c77d8fSBjörn Töpel 	sock_init_data(sock, sk);
1433c0c77d8fSBjörn Töpel 
1434c0c77d8fSBjörn Töpel 	sk->sk_family = PF_XDP;
1435c0c77d8fSBjörn Töpel 
143611fe9262SBjörn Töpel 	sk->sk_destruct = xsk_destruct;
143711fe9262SBjörn Töpel 
1438cee27167SBjörn Töpel 	sock_set_flag(sk, SOCK_RCU_FREE);
1439cee27167SBjörn Töpel 
1440c0c77d8fSBjörn Töpel 	xs = xdp_sk(sk);
1441455302d1SIlya Maximets 	xs->state = XSK_READY;
1442c0c77d8fSBjörn Töpel 	mutex_init(&xs->mutex);
1443bf0bdd13SIlya Maximets 	spin_lock_init(&xs->rx_lock);
1444c0c77d8fSBjörn Töpel 
14450402acd6SBjörn Töpel 	INIT_LIST_HEAD(&xs->map_list);
14460402acd6SBjörn Töpel 	spin_lock_init(&xs->map_list_lock);
14470402acd6SBjörn Töpel 
14481d0dc069SBjörn Töpel 	mutex_lock(&net->xdp.lock);
14491d0dc069SBjörn Töpel 	sk_add_node_rcu(sk, &net->xdp.list);
14501d0dc069SBjörn Töpel 	mutex_unlock(&net->xdp.lock);
14511d0dc069SBjörn Töpel 
1452c0c77d8fSBjörn Töpel 	sock_prot_inuse_add(net, &xsk_proto, 1);
1453c0c77d8fSBjörn Töpel 
1454c0c77d8fSBjörn Töpel 	return 0;
1455c0c77d8fSBjörn Töpel }
1456c0c77d8fSBjörn Töpel 
1457c0c77d8fSBjörn Töpel static const struct net_proto_family xsk_family_ops = {
1458c0c77d8fSBjörn Töpel 	.family = PF_XDP,
1459c0c77d8fSBjörn Töpel 	.create = xsk_create,
1460c0c77d8fSBjörn Töpel 	.owner	= THIS_MODULE,
1461c0c77d8fSBjörn Töpel };
1462c0c77d8fSBjörn Töpel 
1463455302d1SIlya Maximets static struct notifier_block xsk_netdev_notifier = {
1464455302d1SIlya Maximets 	.notifier_call	= xsk_notifier,
1465455302d1SIlya Maximets };
1466455302d1SIlya Maximets 
14671d0dc069SBjörn Töpel static int __net_init xsk_net_init(struct net *net)
14681d0dc069SBjörn Töpel {
14691d0dc069SBjörn Töpel 	mutex_init(&net->xdp.lock);
14701d0dc069SBjörn Töpel 	INIT_HLIST_HEAD(&net->xdp.list);
14711d0dc069SBjörn Töpel 	return 0;
14721d0dc069SBjörn Töpel }
14731d0dc069SBjörn Töpel 
14741d0dc069SBjörn Töpel static void __net_exit xsk_net_exit(struct net *net)
14751d0dc069SBjörn Töpel {
14761d0dc069SBjörn Töpel 	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
14771d0dc069SBjörn Töpel }
14781d0dc069SBjörn Töpel 
14791d0dc069SBjörn Töpel static struct pernet_operations xsk_net_ops = {
14801d0dc069SBjörn Töpel 	.init = xsk_net_init,
14811d0dc069SBjörn Töpel 	.exit = xsk_net_exit,
14821d0dc069SBjörn Töpel };
14831d0dc069SBjörn Töpel 
1484c0c77d8fSBjörn Töpel static int __init xsk_init(void)
1485c0c77d8fSBjörn Töpel {
1486e312b9e7SBjörn Töpel 	int err, cpu;
1487c0c77d8fSBjörn Töpel 
1488c0c77d8fSBjörn Töpel 	err = proto_register(&xsk_proto, 0 /* no slab */);
1489c0c77d8fSBjörn Töpel 	if (err)
1490c0c77d8fSBjörn Töpel 		goto out;
1491c0c77d8fSBjörn Töpel 
1492c0c77d8fSBjörn Töpel 	err = sock_register(&xsk_family_ops);
1493c0c77d8fSBjörn Töpel 	if (err)
1494c0c77d8fSBjörn Töpel 		goto out_proto;
1495c0c77d8fSBjörn Töpel 
14961d0dc069SBjörn Töpel 	err = register_pernet_subsys(&xsk_net_ops);
14971d0dc069SBjörn Töpel 	if (err)
14981d0dc069SBjörn Töpel 		goto out_sk;
1499455302d1SIlya Maximets 
1500455302d1SIlya Maximets 	err = register_netdevice_notifier(&xsk_netdev_notifier);
1501455302d1SIlya Maximets 	if (err)
1502455302d1SIlya Maximets 		goto out_pernet;
1503455302d1SIlya Maximets 
1504e312b9e7SBjörn Töpel 	for_each_possible_cpu(cpu)
1505e312b9e7SBjörn Töpel 		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
1506c0c77d8fSBjörn Töpel 	return 0;
1507c0c77d8fSBjörn Töpel 
1508455302d1SIlya Maximets out_pernet:
1509455302d1SIlya Maximets 	unregister_pernet_subsys(&xsk_net_ops);
15101d0dc069SBjörn Töpel out_sk:
15111d0dc069SBjörn Töpel 	sock_unregister(PF_XDP);
1512c0c77d8fSBjörn Töpel out_proto:
1513c0c77d8fSBjörn Töpel 	proto_unregister(&xsk_proto);
1514c0c77d8fSBjörn Töpel out:
1515c0c77d8fSBjörn Töpel 	return err;
1516c0c77d8fSBjörn Töpel }
1517c0c77d8fSBjörn Töpel 
1518c0c77d8fSBjörn Töpel fs_initcall(xsk_init);
1519