xref: /linux/net/xdp/xsk.c (revision 642e450b)
1c0c77d8fSBjörn Töpel // SPDX-License-Identifier: GPL-2.0
2c0c77d8fSBjörn Töpel /* XDP sockets
3c0c77d8fSBjörn Töpel  *
4c0c77d8fSBjörn Töpel  * AF_XDP sockets allows a channel between XDP programs and userspace
5c0c77d8fSBjörn Töpel  * applications.
6c0c77d8fSBjörn Töpel  * Copyright(c) 2018 Intel Corporation.
7c0c77d8fSBjörn Töpel  *
8c0c77d8fSBjörn Töpel  * Author(s): Björn Töpel <bjorn.topel@intel.com>
9c0c77d8fSBjörn Töpel  *	      Magnus Karlsson <magnus.karlsson@intel.com>
10c0c77d8fSBjörn Töpel  */
11c0c77d8fSBjörn Töpel 
12c0c77d8fSBjörn Töpel #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13c0c77d8fSBjörn Töpel 
14c0c77d8fSBjörn Töpel #include <linux/if_xdp.h>
15c0c77d8fSBjörn Töpel #include <linux/init.h>
16c0c77d8fSBjörn Töpel #include <linux/sched/mm.h>
17c0c77d8fSBjörn Töpel #include <linux/sched/signal.h>
18c0c77d8fSBjörn Töpel #include <linux/sched/task.h>
19c0c77d8fSBjörn Töpel #include <linux/socket.h>
20c0c77d8fSBjörn Töpel #include <linux/file.h>
21c0c77d8fSBjörn Töpel #include <linux/uaccess.h>
22c0c77d8fSBjörn Töpel #include <linux/net.h>
23c0c77d8fSBjörn Töpel #include <linux/netdevice.h>
24ac98d8aaSMagnus Karlsson #include <linux/rculist.h>
25a71506a4SMagnus Karlsson #include <net/xdp_sock_drv.h>
26b9b6b68eSBjörn Töpel #include <net/xdp.h>
27c0c77d8fSBjörn Töpel 
28423f3832SMagnus Karlsson #include "xsk_queue.h"
29c0c77d8fSBjörn Töpel #include "xdp_umem.h"
30a36b38aaSBjörn Töpel #include "xsk.h"
31c0c77d8fSBjörn Töpel 
3235fcde7fSMagnus Karlsson #define TX_BATCH_SIZE 16
3335fcde7fSMagnus Karlsson 
34e312b9e7SBjörn Töpel static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
35e312b9e7SBjörn Töpel 
36fbfc504aSBjörn Töpel bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
37fbfc504aSBjörn Töpel {
38173d3adbSBjörn Töpel 	return READ_ONCE(xs->rx) &&  READ_ONCE(xs->umem) &&
39173d3adbSBjörn Töpel 		READ_ONCE(xs->umem->fq);
40fbfc504aSBjörn Töpel }
41fbfc504aSBjörn Töpel 
4277cd0d7bSMagnus Karlsson void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
4377cd0d7bSMagnus Karlsson {
4477cd0d7bSMagnus Karlsson 	if (umem->need_wakeup & XDP_WAKEUP_RX)
4577cd0d7bSMagnus Karlsson 		return;
4677cd0d7bSMagnus Karlsson 
4777cd0d7bSMagnus Karlsson 	umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
4877cd0d7bSMagnus Karlsson 	umem->need_wakeup |= XDP_WAKEUP_RX;
4977cd0d7bSMagnus Karlsson }
5077cd0d7bSMagnus Karlsson EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
5177cd0d7bSMagnus Karlsson 
/* Set the NEED_WAKEUP flag on the Tx ring of every socket sharing this
 * umem, then record the Tx wakeup bit in umem->need_wakeup. No-op if
 * the bit is already set.
 */
void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	if (umem->need_wakeup & XDP_WAKEUP_TX)
		return;

	/* xsk_tx_list is an RCU-protected list, so walk it under
	 * rcu_read_lock().
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	umem->need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
6877cd0d7bSMagnus Karlsson 
6977cd0d7bSMagnus Karlsson void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
7077cd0d7bSMagnus Karlsson {
7177cd0d7bSMagnus Karlsson 	if (!(umem->need_wakeup & XDP_WAKEUP_RX))
7277cd0d7bSMagnus Karlsson 		return;
7377cd0d7bSMagnus Karlsson 
7477cd0d7bSMagnus Karlsson 	umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
7577cd0d7bSMagnus Karlsson 	umem->need_wakeup &= ~XDP_WAKEUP_RX;
7677cd0d7bSMagnus Karlsson }
7777cd0d7bSMagnus Karlsson EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
7877cd0d7bSMagnus Karlsson 
/* Clear the NEED_WAKEUP flag on the Tx ring of every socket sharing
 * this umem, then clear the Tx wakeup bit in umem->need_wakeup. No-op
 * if the bit is not currently set.
 */
void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	if (!(umem->need_wakeup & XDP_WAKEUP_TX))
		return;

	/* xsk_tx_list is an RCU-protected list, so walk it under
	 * rcu_read_lock().
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	umem->need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
9577cd0d7bSMagnus Karlsson 
9677cd0d7bSMagnus Karlsson bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
9777cd0d7bSMagnus Karlsson {
9877cd0d7bSMagnus Karlsson 	return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
9977cd0d7bSMagnus Karlsson }
10077cd0d7bSMagnus Karlsson EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
10177cd0d7bSMagnus Karlsson 
10226062b18SBjörn Töpel void xp_release(struct xdp_buff_xsk *xskb)
10326062b18SBjörn Töpel {
10426062b18SBjörn Töpel 	xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
10526062b18SBjörn Töpel }
10626062b18SBjörn Töpel 
10726062b18SBjörn Töpel static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
10826062b18SBjörn Töpel {
10926062b18SBjörn Töpel 	u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
11026062b18SBjörn Töpel 
11126062b18SBjörn Töpel 	offset += xskb->pool->headroom;
11226062b18SBjörn Töpel 	if (!xskb->pool->unaligned)
11326062b18SBjörn Töpel 		return xskb->orig_addr + offset;
11426062b18SBjörn Töpel 	return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
11526062b18SBjörn Töpel }
11626062b18SBjörn Töpel 
/* Zero-copy receive: publish the buffer's umem handle and length on the
 * socket's Rx ring, then return the buffer to its pool's free list.
 * Fails (without consuming the buffer) if no Rx descriptor slot could
 * be reserved.
 */
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
	u64 addr;
	int err;

	addr = xp_get_handle(xskb);
	err = xskq_prod_reserve_desc(xs->rx, addr, len);
	if (err) {
		/* Rx ring full: account it for the socket statistics. */
		xs->rx_queue_full++;
		return err;
	}

	xp_release(xskb);
	return 0;
}
133c05cd364SKevin Laatz 
1342b43470aSBjörn Töpel static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
1352b43470aSBjörn Töpel {
1362b43470aSBjörn Töpel 	void *from_buf, *to_buf;
1372b43470aSBjörn Töpel 	u32 metalen;
1382b43470aSBjörn Töpel 
1392b43470aSBjörn Töpel 	if (unlikely(xdp_data_meta_unsupported(from))) {
1402b43470aSBjörn Töpel 		from_buf = from->data;
1412b43470aSBjörn Töpel 		to_buf = to->data;
1422b43470aSBjörn Töpel 		metalen = 0;
1432b43470aSBjörn Töpel 	} else {
1442b43470aSBjörn Töpel 		from_buf = from->data_meta;
1452b43470aSBjörn Töpel 		metalen = from->data - from->data_meta;
1462b43470aSBjörn Töpel 		to_buf = to->data - metalen;
147c05cd364SKevin Laatz 	}
148c05cd364SKevin Laatz 
149c05cd364SKevin Laatz 	memcpy(to_buf, from_buf, len + metalen);
150c05cd364SKevin Laatz }
151c05cd364SKevin Laatz 
/* Copy-mode receive: copy the packet (and metadata) out of the driver's
 * xdp_buff into a buffer freshly allocated from the socket's umem, and
 * publish it on the Rx ring. Returns -ENOSPC (and counts a drop) when
 * the packet does not fit an umem frame or no buffer is available.
 * When @explicit_free is set, the original buffer is returned to its
 * owner after a successful copy.
 */
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
		     bool explicit_free)
{
	struct xdp_buff *xsk_xdp;
	int err;

	if (len > xsk_umem_get_rx_frame_size(xs->umem)) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_xdp = xsk_buff_alloc(xs->umem);
	if (!xsk_xdp) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_copy_xdp(xsk_xdp, xdp, len);
	err = __xsk_rcv_zc(xs, xsk_xdp, len);
	if (err) {
		/* Rx ring was full: give the copy back to the umem. */
		xsk_buff_free(xsk_xdp);
		return err;
	}
	if (explicit_free)
		xdp_return_buff(xdp);
	return 0;
}
179173d3adbSBjörn Töpel 
18042fddcc7SBjörn Töpel static bool xsk_is_bound(struct xdp_sock *xs)
18142fddcc7SBjörn Töpel {
18242fddcc7SBjörn Töpel 	if (READ_ONCE(xs->state) == XSK_BOUND) {
18342fddcc7SBjörn Töpel 		/* Matches smp_wmb() in bind(). */
18442fddcc7SBjörn Töpel 		smp_rmb();
18542fddcc7SBjörn Töpel 		return true;
18642fddcc7SBjörn Töpel 	}
18742fddcc7SBjörn Töpel 	return false;
18842fddcc7SBjörn Töpel }
18942fddcc7SBjörn Töpel 
/* Common receive entry: validate that the socket is bound and that the
 * packet arrived on the bound device/queue, then dispatch to the
 * zero-copy path (buffer already from an XSK buffer pool) or the copy
 * path.
 */
static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
		   bool explicit_free)
{
	u32 len;

	if (!xsk_is_bound(xs))
		return -EINVAL;

	/* Only accept packets from the device/queue we are bound to. */
	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	len = xdp->data_end - xdp->data;

	return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
		__xsk_rcv_zc(xs, xdp, len) :
		__xsk_rcv(xs, xdp, len, explicit_free);
}
207c497176cSBjörn Töpel 
/* Make queued Rx descriptors visible to user space, release consumed
 * fill-queue entries, and wake up any reader blocked on the socket.
 */
static void xsk_flush(struct xdp_sock *xs)
{
	xskq_prod_submit(xs->rx);
	__xskq_cons_release(xs->umem->fq);
	sock_def_readable(&xs->sk);
}
214c497176cSBjörn Töpel 
/* Receive entry point for the generic (copy) XDP path. Serialized with
 * xs->rx_lock (BHs disabled), and flushed after every packet.
 */
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	spin_lock_bh(&xs->rx_lock);
	err = xsk_rcv(xs, xdp, false);
	xsk_flush(xs);
	spin_unlock_bh(&xs->rx_lock);
	return err;
}
225c497176cSBjörn Töpel 
/* Called when an XDP program redirects a packet into an xskmap. Queues
 * the packet on the socket and, on success, puts the socket on this
 * CPU's flush list so __xsk_map_flush() completes delivery later.
 */
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	int err;

	err = xsk_rcv(xs, xdp, true);
	if (err)
		return err;

	/* A NULL prev pointer means the socket is not yet on the list. */
	if (!xs->flush_node.prev)
		list_add(&xs->flush_node, flush_list);

	return 0;
}
240d817991cSBjörn Töpel 
/* Flush every socket queued on this CPU's flush list. Each node's prev
 * pointer is cleared on removal so __xsk_map_redirect() sees it as "not
 * queued" again.
 */
void __xsk_map_flush(void)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	struct xdp_sock *xs, *tmp;

	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
		xsk_flush(xs);
		__list_del_clearprev(&xs->flush_node);
	}
}
251d817991cSBjörn Töpel 
/* Driver API: publish @nb_entries completed Tx descriptors on the
 * umem's completion queue so user space can reuse the frames.
 */
void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
	xskq_prod_submit_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);
257ac98d8aaSMagnus Karlsson 
/* Driver API: release consumed Tx-ring entries on every socket sharing
 * this umem and wake writers waiting for ring space.
 */
void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
		__xskq_cons_release(xs->tx);
		xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);
270ac98d8aaSMagnus Karlsson 
/* Driver API: fetch the next Tx descriptor from any socket sharing this
 * umem. Returns true and fills @desc on success; false when no socket
 * had a descriptor available or the completion queue was full.
 */
bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
		if (!xskq_cons_peek_desc(xs->tx, desc, umem)) {
			xs->tx->queue_empty_descs++;
			continue;
		}

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (xskq_prod_reserve_addr(umem->cq, desc->addr))
			goto out;

		/* Only release the Tx entry once completion space is
		 * guaranteed.
		 */
		xskq_cons_release(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);
300ac98d8aaSMagnus Karlsson 
/* Kick the driver to process the socket's queue; @flags selects Rx, Tx
 * or both (XDP_WAKEUP_*). The rcu_read_lock() pairs with the
 * synchronize_net() in xsk_unbind_dev().
 */
static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
	struct net_device *dev = xs->dev;
	int err;

	rcu_read_lock();
	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
	rcu_read_unlock();

	return err;
}
31206870682SMaxim Mikityanskiy 
/* Zero-copy Tx: all the work is done by the driver, just kick it. */
static int xsk_zc_xmit(struct xdp_sock *xs)
{
	return xsk_wakeup(xs, XDP_WAKEUP_TX);
}
317ac98d8aaSMagnus Karlsson 
/* skb destructor for copy-mode Tx: report the frame as completed on the
 * umem's completion queue. The frame's umem address was stashed in
 * destructor_arg when the skb was built in xsk_generic_xmit().
 */
static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->tx_completion_lock, flags);
	xskq_prod_submit_addr(xs->umem->cq, addr);
	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

	sock_wfree(skb);
}
33035fcde7fSMagnus Karlsson 
/* Copy-mode Tx: drain up to TX_BATCH_SIZE descriptors from the Tx ring,
 * copying each frame into an skb and handing it straight to the bound
 * device's Tx queue. Returns 0 when the ring has been drained, or a
 * negative errno; -EAGAIN tells user space to retry the send.
 */
static int xsk_generic_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	u32 max_batch = TX_BATCH_SIZE;
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	mutex_lock(&xs->mutex);

	if (xs->queue_id >= xs->dev->real_num_tx_queues)
		goto out;

	while (xskq_cons_peek_desc(xs->tx, &desc, xs->umem)) {
		char *buffer;
		u64 addr;
		u32 len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		/* Non-blocking skb allocation (noblock = 1). */
		len = desc.len;
		skb = sock_alloc_send_skb(sk, len, 1, &err);
		if (unlikely(!skb))
			goto out;

		skb_put(skb, len);
		addr = desc.addr;
		buffer = xsk_buff_raw_get_data(xs->umem, addr);
		err = skb_store_bits(skb, 0, buffer, len);
		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		/* Stash the umem address so xsk_destruct_skb() can
		 * complete the right frame.
		 */
		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
		skb->destructor = xsk_destruct_skb;

		/* Hinder dev_direct_xmit from freeing the packet and
		 * therefore completing it in the destructor
		 */
		refcount_inc(&skb->users);
		err = dev_direct_xmit(skb, xs->queue_id);
		if  (err == NETDEV_TX_BUSY) {
			/* Tell user-space to retry the send */
			skb->destructor = sock_wfree;
			/* Free skb without triggering the perf drop trace */
			consume_skb(skb);
			err = -EAGAIN;
			goto out;
		}

		xskq_cons_release(xs->tx);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP) {
			/* SKB completed but not sent */
			kfree_skb(skb);
			err = -EBUSY;
			goto out;
		}

		/* Drop the reference taken above before xmit. */
		consume_skb(skb);
		sent_frame = true;
	}

	xs->tx->queue_empty_descs++;

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}
41635fcde7fSMagnus Karlsson 
417df551058SMagnus Karlsson static int __xsk_sendmsg(struct sock *sk)
418df551058SMagnus Karlsson {
419df551058SMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(sk);
420df551058SMagnus Karlsson 
421df551058SMagnus Karlsson 	if (unlikely(!(xs->dev->flags & IFF_UP)))
422df551058SMagnus Karlsson 		return -ENETDOWN;
423df551058SMagnus Karlsson 	if (unlikely(!xs->tx))
424df551058SMagnus Karlsson 		return -ENOBUFS;
425df551058SMagnus Karlsson 
426df551058SMagnus Karlsson 	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
427df551058SMagnus Karlsson }
428df551058SMagnus Karlsson 
/* sendmsg() handler. The socket must already be bound, and blocking
 * sends (absence of MSG_DONTWAIT) are not supported.
 */
static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	return __xsk_sendmsg(sk);
}
44235fcde7fSMagnus Karlsson 
/* poll() handler: readable when the Rx ring has entries, writable while
 * the Tx ring has free slots. If wakeup flags are pending, poll also
 * drives the datapath (driver kick in zero-copy mode, direct Tx in copy
 * mode).
 */
static __poll_t xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	__poll_t mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xdp_umem *umem;

	if (unlikely(!xsk_is_bound(xs)))
		return mask;

	umem = xs->umem;

	if (umem->need_wakeup) {
		if (xs->zc)
			xsk_wakeup(xs, umem->need_wakeup);
		else
			/* Poll needs to drive Tx also in copy mode */
			__xsk_sendmsg(sk);
	}

	if (xs->rx && !xskq_prod_is_empty(xs->rx))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (xs->tx && !xskq_cons_is_full(xs->tx))
		mask |= EPOLLOUT | EPOLLWRNORM;

	return mask;
}
471c497176cSBjörn Töpel 
/* Allocate a descriptor ring of @entries elements (must be a non-zero
 * power of two) and publish it through @queue. Fails with -EINVAL if a
 * ring has already been installed.
 */
static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	WRITE_ONCE(*queue, q);
	return 0;
}
489423f3832SMagnus Karlsson 
/* Detach the socket from its device. The state leaves XSK_BOUND first
 * so no new datapath work starts, then synchronize_net() waits out any
 * in-flight users before the device reference is dropped.
 */
static void xsk_unbind_dev(struct xdp_sock *xs)
{
	struct net_device *dev = xs->dev;

	if (xs->state != XSK_BOUND)
		return;
	WRITE_ONCE(xs->state, XSK_UNBOUND);

	/* Wait for driver to stop using the xdp socket. */
	xdp_del_sk_umem(xs->umem, xs);
	xs->dev = NULL;
	synchronize_net();
	dev_put(dev);
}
504455302d1SIlya Maximets 
/* Fetch the first map this socket is a member of, taking an extra
 * reference on the map, and report the address of the socket's slot in
 * it via @map_entry. Returns NULL when the socket's map list is empty.
 */
static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
					      struct xdp_sock ***map_entry)
{
	struct xsk_map *map = NULL;
	struct xsk_map_node *node;

	*map_entry = NULL;

	spin_lock_bh(&xs->map_list_lock);
	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
					node);
	if (node) {
		/* Should never fail while the node is on our list. */
		WARN_ON(xsk_map_inc(node->map));
		map = node->map;
		*map_entry = node->map_entry;
	}
	spin_unlock_bh(&xs->map_list_lock);
	return map;
}
5240402acd6SBjörn Töpel 
static void xsk_delete_from_maps(struct xdp_sock *xs)
{
	/* This function removes the current XDP socket from all the
	 * maps it resides in. We need to take extra care here, due to
	 * the two locks involved. Each map has a lock synchronizing
	 * updates to the entries, and each socket has a lock that
	 * synchronizes access to the list of maps (map_list). For
	 * deadlock avoidance the locks need to be taken in the order
	 * "map lock"->"socket map list lock". We start off by
	 * accessing the socket map list, and take a reference to the
	 * map to guarantee existence between the
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
	 * calls. Then we ask the map to remove the socket, which
	 * tries to remove the socket from the map. Note that there
	 * might be updates to the map between
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
	 */
	struct xdp_sock **map_entry = NULL;
	struct xsk_map *map;

	/* Loop until the socket's map list is drained, dropping the
	 * temporary map reference taken by xsk_get_map_list_entry().
	 */
	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
		xsk_map_try_sock_delete(map, xs, map_entry);
		xsk_map_put(map);
	}
}
5500402acd6SBjörn Töpel 
/* release() handler: tear the socket down. It is removed from the
 * per-netns socket list and all xskmaps, unbound from its device, and
 * its rings destroyed before the final sock reference is dropped.
 */
static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	mutex_lock(&net->xdp.lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->xdp.lock);

	/* Update the protocol's in-use counter with BHs disabled. */
	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	xsk_delete_from_maps(xs);
	mutex_lock(&xs->mutex);
	xsk_unbind_dev(xs);
	mutex_unlock(&xs->mutex);

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}
586c0c77d8fSBjörn Töpel 
587965a9909SMagnus Karlsson static struct socket *xsk_lookup_xsk_from_fd(int fd)
588965a9909SMagnus Karlsson {
589965a9909SMagnus Karlsson 	struct socket *sock;
590965a9909SMagnus Karlsson 	int err;
591965a9909SMagnus Karlsson 
592965a9909SMagnus Karlsson 	sock = sockfd_lookup(fd, &err);
593965a9909SMagnus Karlsson 	if (!sock)
594965a9909SMagnus Karlsson 		return ERR_PTR(-ENOTSOCK);
595965a9909SMagnus Karlsson 
596965a9909SMagnus Karlsson 	if (sock->sk->sk_family != PF_XDP) {
597965a9909SMagnus Karlsson 		sockfd_put(sock);
598965a9909SMagnus Karlsson 		return ERR_PTR(-ENOPROTOOPT);
599965a9909SMagnus Karlsson 	}
600965a9909SMagnus Karlsson 
601965a9909SMagnus Karlsson 	return sock;
602965a9909SMagnus Karlsson }
603965a9909SMagnus Karlsson 
604965a9909SMagnus Karlsson static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
605965a9909SMagnus Karlsson {
606965a9909SMagnus Karlsson 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
607965a9909SMagnus Karlsson 	struct sock *sk = sock->sk;
608965a9909SMagnus Karlsson 	struct xdp_sock *xs = xdp_sk(sk);
609959b71dbSBjörn Töpel 	struct net_device *dev;
610173d3adbSBjörn Töpel 	u32 flags, qid;
611965a9909SMagnus Karlsson 	int err = 0;
612965a9909SMagnus Karlsson 
613965a9909SMagnus Karlsson 	if (addr_len < sizeof(struct sockaddr_xdp))
614965a9909SMagnus Karlsson 		return -EINVAL;
615965a9909SMagnus Karlsson 	if (sxdp->sxdp_family != AF_XDP)
616965a9909SMagnus Karlsson 		return -EINVAL;
617965a9909SMagnus Karlsson 
618f54ba391SBjörn Töpel 	flags = sxdp->sxdp_flags;
61977cd0d7bSMagnus Karlsson 	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
62077cd0d7bSMagnus Karlsson 		      XDP_USE_NEED_WAKEUP))
621f54ba391SBjörn Töpel 		return -EINVAL;
622f54ba391SBjörn Töpel 
6235464c3a0SIlya Maximets 	rtnl_lock();
624965a9909SMagnus Karlsson 	mutex_lock(&xs->mutex);
625455302d1SIlya Maximets 	if (xs->state != XSK_READY) {
626959b71dbSBjörn Töpel 		err = -EBUSY;
627959b71dbSBjörn Töpel 		goto out_release;
628959b71dbSBjörn Töpel 	}
629959b71dbSBjörn Töpel 
630965a9909SMagnus Karlsson 	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
631965a9909SMagnus Karlsson 	if (!dev) {
632965a9909SMagnus Karlsson 		err = -ENODEV;
633965a9909SMagnus Karlsson 		goto out_release;
634965a9909SMagnus Karlsson 	}
635965a9909SMagnus Karlsson 
636f6145903SMagnus Karlsson 	if (!xs->rx && !xs->tx) {
637965a9909SMagnus Karlsson 		err = -EINVAL;
638965a9909SMagnus Karlsson 		goto out_unlock;
639965a9909SMagnus Karlsson 	}
640965a9909SMagnus Karlsson 
641173d3adbSBjörn Töpel 	qid = sxdp->sxdp_queue_id;
642173d3adbSBjörn Töpel 
643173d3adbSBjörn Töpel 	if (flags & XDP_SHARED_UMEM) {
644965a9909SMagnus Karlsson 		struct xdp_sock *umem_xs;
645965a9909SMagnus Karlsson 		struct socket *sock;
646965a9909SMagnus Karlsson 
64777cd0d7bSMagnus Karlsson 		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
64877cd0d7bSMagnus Karlsson 		    (flags & XDP_USE_NEED_WAKEUP)) {
649173d3adbSBjörn Töpel 			/* Cannot specify flags for shared sockets. */
650173d3adbSBjörn Töpel 			err = -EINVAL;
651173d3adbSBjörn Töpel 			goto out_unlock;
652173d3adbSBjörn Töpel 		}
653173d3adbSBjörn Töpel 
654965a9909SMagnus Karlsson 		if (xs->umem) {
655965a9909SMagnus Karlsson 			/* We have already our own. */
656965a9909SMagnus Karlsson 			err = -EINVAL;
657965a9909SMagnus Karlsson 			goto out_unlock;
658965a9909SMagnus Karlsson 		}
659965a9909SMagnus Karlsson 
660965a9909SMagnus Karlsson 		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
661965a9909SMagnus Karlsson 		if (IS_ERR(sock)) {
662965a9909SMagnus Karlsson 			err = PTR_ERR(sock);
663965a9909SMagnus Karlsson 			goto out_unlock;
664965a9909SMagnus Karlsson 		}
665965a9909SMagnus Karlsson 
666965a9909SMagnus Karlsson 		umem_xs = xdp_sk(sock->sk);
66742fddcc7SBjörn Töpel 		if (!xsk_is_bound(umem_xs)) {
668965a9909SMagnus Karlsson 			err = -EBADF;
669965a9909SMagnus Karlsson 			sockfd_put(sock);
670965a9909SMagnus Karlsson 			goto out_unlock;
67142fddcc7SBjörn Töpel 		}
67242fddcc7SBjörn Töpel 		if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
673965a9909SMagnus Karlsson 			err = -EINVAL;
674965a9909SMagnus Karlsson 			sockfd_put(sock);
675965a9909SMagnus Karlsson 			goto out_unlock;
676965a9909SMagnus Karlsson 		}
677965a9909SMagnus Karlsson 
678965a9909SMagnus Karlsson 		xdp_get_umem(umem_xs->umem);
6799764f4b3SBjörn Töpel 		WRITE_ONCE(xs->umem, umem_xs->umem);
680965a9909SMagnus Karlsson 		sockfd_put(sock);
681965a9909SMagnus Karlsson 	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
682965a9909SMagnus Karlsson 		err = -EINVAL;
683965a9909SMagnus Karlsson 		goto out_unlock;
684c497176cSBjörn Töpel 	} else {
685c497176cSBjörn Töpel 		/* This xsk has its own umem. */
686173d3adbSBjörn Töpel 		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
687173d3adbSBjörn Töpel 		if (err)
688173d3adbSBjörn Töpel 			goto out_unlock;
689965a9909SMagnus Karlsson 	}
690965a9909SMagnus Karlsson 
691965a9909SMagnus Karlsson 	xs->dev = dev;
692ac98d8aaSMagnus Karlsson 	xs->zc = xs->umem->zc;
693ac98d8aaSMagnus Karlsson 	xs->queue_id = qid;
694ac98d8aaSMagnus Karlsson 	xdp_add_sk_umem(xs->umem, xs);
695965a9909SMagnus Karlsson 
696965a9909SMagnus Karlsson out_unlock:
69742fddcc7SBjörn Töpel 	if (err) {
698965a9909SMagnus Karlsson 		dev_put(dev);
69942fddcc7SBjörn Töpel 	} else {
70042fddcc7SBjörn Töpel 		/* Matches smp_rmb() in bind() for shared umem
70142fddcc7SBjörn Töpel 		 * sockets, and xsk_is_bound().
70242fddcc7SBjörn Töpel 		 */
70342fddcc7SBjörn Töpel 		smp_wmb();
70442fddcc7SBjörn Töpel 		WRITE_ONCE(xs->state, XSK_BOUND);
70542fddcc7SBjörn Töpel 	}
706965a9909SMagnus Karlsson out_release:
707965a9909SMagnus Karlsson 	mutex_unlock(&xs->mutex);
7085464c3a0SIlya Maximets 	rtnl_unlock();
709965a9909SMagnus Karlsson 	return err;
710965a9909SMagnus Karlsson }
711965a9909SMagnus Karlsson 
/* Legacy layout of struct xdp_umem_reg as it existed before additional
 * fields were appended to the uapi struct. xsk_setsockopt(XDP_UMEM_REG)
 * accepts this shorter layout from older applications and leaves the
 * remainder of the full struct zero-initialized.
 */
struct xdp_umem_reg_v1 {
	__u64 addr; /* Start of packet data area */
	__u64 len; /* Length of packet data area */
	__u32 chunk_size;
	__u32 headroom;
};
718c05cd364SKevin Laatz 
719c0c77d8fSBjörn Töpel static int xsk_setsockopt(struct socket *sock, int level, int optname,
720a7b75c5aSChristoph Hellwig 			  sockptr_t optval, unsigned int optlen)
721c0c77d8fSBjörn Töpel {
722c0c77d8fSBjörn Töpel 	struct sock *sk = sock->sk;
723c0c77d8fSBjörn Töpel 	struct xdp_sock *xs = xdp_sk(sk);
724c0c77d8fSBjörn Töpel 	int err;
725c0c77d8fSBjörn Töpel 
726c0c77d8fSBjörn Töpel 	if (level != SOL_XDP)
727c0c77d8fSBjörn Töpel 		return -ENOPROTOOPT;
728c0c77d8fSBjörn Töpel 
729c0c77d8fSBjörn Töpel 	switch (optname) {
730b9b6b68eSBjörn Töpel 	case XDP_RX_RING:
731f6145903SMagnus Karlsson 	case XDP_TX_RING:
732b9b6b68eSBjörn Töpel 	{
733b9b6b68eSBjörn Töpel 		struct xsk_queue **q;
734b9b6b68eSBjörn Töpel 		int entries;
735b9b6b68eSBjörn Töpel 
736b9b6b68eSBjörn Töpel 		if (optlen < sizeof(entries))
737b9b6b68eSBjörn Töpel 			return -EINVAL;
738a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
739b9b6b68eSBjörn Töpel 			return -EFAULT;
740b9b6b68eSBjörn Töpel 
741b9b6b68eSBjörn Töpel 		mutex_lock(&xs->mutex);
742455302d1SIlya Maximets 		if (xs->state != XSK_READY) {
743455302d1SIlya Maximets 			mutex_unlock(&xs->mutex);
744455302d1SIlya Maximets 			return -EBUSY;
745455302d1SIlya Maximets 		}
746f6145903SMagnus Karlsson 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
747b9b6b68eSBjörn Töpel 		err = xsk_init_queue(entries, q, false);
74877cd0d7bSMagnus Karlsson 		if (!err && optname == XDP_TX_RING)
74977cd0d7bSMagnus Karlsson 			/* Tx needs to be explicitly woken up the first time */
75077cd0d7bSMagnus Karlsson 			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
751b9b6b68eSBjörn Töpel 		mutex_unlock(&xs->mutex);
752b9b6b68eSBjörn Töpel 		return err;
753b9b6b68eSBjörn Töpel 	}
754c0c77d8fSBjörn Töpel 	case XDP_UMEM_REG:
755c0c77d8fSBjörn Töpel 	{
756c05cd364SKevin Laatz 		size_t mr_size = sizeof(struct xdp_umem_reg);
757c05cd364SKevin Laatz 		struct xdp_umem_reg mr = {};
758c0c77d8fSBjörn Töpel 		struct xdp_umem *umem;
759c0c77d8fSBjörn Töpel 
760c05cd364SKevin Laatz 		if (optlen < sizeof(struct xdp_umem_reg_v1))
761c05cd364SKevin Laatz 			return -EINVAL;
762c05cd364SKevin Laatz 		else if (optlen < sizeof(mr))
763c05cd364SKevin Laatz 			mr_size = sizeof(struct xdp_umem_reg_v1);
764c05cd364SKevin Laatz 
765a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&mr, optval, mr_size))
766c0c77d8fSBjörn Töpel 			return -EFAULT;
767c0c77d8fSBjörn Töpel 
768c0c77d8fSBjörn Töpel 		mutex_lock(&xs->mutex);
769455302d1SIlya Maximets 		if (xs->state != XSK_READY || xs->umem) {
770c0c77d8fSBjörn Töpel 			mutex_unlock(&xs->mutex);
771a49049eaSBjörn Töpel 			return -EBUSY;
772a49049eaSBjörn Töpel 		}
773a49049eaSBjörn Töpel 
774a49049eaSBjörn Töpel 		umem = xdp_umem_create(&mr);
775a49049eaSBjörn Töpel 		if (IS_ERR(umem)) {
776a49049eaSBjörn Töpel 			mutex_unlock(&xs->mutex);
777a49049eaSBjörn Töpel 			return PTR_ERR(umem);
778c0c77d8fSBjörn Töpel 		}
779c0c77d8fSBjörn Töpel 
780c0c77d8fSBjörn Töpel 		/* Make sure umem is ready before it can be seen by others */
781c0c77d8fSBjörn Töpel 		smp_wmb();
7829764f4b3SBjörn Töpel 		WRITE_ONCE(xs->umem, umem);
783c0c77d8fSBjörn Töpel 		mutex_unlock(&xs->mutex);
784c0c77d8fSBjörn Töpel 		return 0;
785c0c77d8fSBjörn Töpel 	}
786423f3832SMagnus Karlsson 	case XDP_UMEM_FILL_RING:
787fe230832SMagnus Karlsson 	case XDP_UMEM_COMPLETION_RING:
788423f3832SMagnus Karlsson 	{
789423f3832SMagnus Karlsson 		struct xsk_queue **q;
790423f3832SMagnus Karlsson 		int entries;
791423f3832SMagnus Karlsson 
792a7b75c5aSChristoph Hellwig 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
793423f3832SMagnus Karlsson 			return -EFAULT;
794423f3832SMagnus Karlsson 
795423f3832SMagnus Karlsson 		mutex_lock(&xs->mutex);
796455302d1SIlya Maximets 		if (xs->state != XSK_READY) {
797455302d1SIlya Maximets 			mutex_unlock(&xs->mutex);
798455302d1SIlya Maximets 			return -EBUSY;
799455302d1SIlya Maximets 		}
800a49049eaSBjörn Töpel 		if (!xs->umem) {
801a49049eaSBjörn Töpel 			mutex_unlock(&xs->mutex);
802a49049eaSBjörn Töpel 			return -EINVAL;
803a49049eaSBjörn Töpel 		}
804a49049eaSBjörn Töpel 
805fe230832SMagnus Karlsson 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
806fe230832SMagnus Karlsson 			&xs->umem->cq;
807b9b6b68eSBjörn Töpel 		err = xsk_init_queue(entries, q, true);
8082b43470aSBjörn Töpel 		if (optname == XDP_UMEM_FILL_RING)
8092b43470aSBjörn Töpel 			xp_set_fq(xs->umem->pool, *q);
810423f3832SMagnus Karlsson 		mutex_unlock(&xs->mutex);
811423f3832SMagnus Karlsson 		return err;
812423f3832SMagnus Karlsson 	}
813c0c77d8fSBjörn Töpel 	default:
814c0c77d8fSBjörn Töpel 		break;
815c0c77d8fSBjörn Töpel 	}
816c0c77d8fSBjörn Töpel 
817c0c77d8fSBjörn Töpel 	return -ENOPROTOOPT;
818c0c77d8fSBjörn Töpel }
819c0c77d8fSBjörn Töpel 
82077cd0d7bSMagnus Karlsson static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
82177cd0d7bSMagnus Karlsson {
82277cd0d7bSMagnus Karlsson 	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
82377cd0d7bSMagnus Karlsson 	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
82477cd0d7bSMagnus Karlsson 	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
82577cd0d7bSMagnus Karlsson }
82677cd0d7bSMagnus Karlsson 
82777cd0d7bSMagnus Karlsson static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
82877cd0d7bSMagnus Karlsson {
82977cd0d7bSMagnus Karlsson 	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
83077cd0d7bSMagnus Karlsson 	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
83177cd0d7bSMagnus Karlsson 	ring->desc = offsetof(struct xdp_umem_ring, desc);
83277cd0d7bSMagnus Karlsson }
83377cd0d7bSMagnus Karlsson 
/* Layout of struct xdp_statistics before the extra ring counters were
 * appended. xsk_getsockopt(XDP_STATISTICS) copies only this much when
 * the caller's buffer is too small for the full struct.
 */
struct xdp_statistics_v1 {
	__u64 rx_dropped;
	__u64 rx_invalid_descs;
	__u64 tx_invalid_descs;
};
8398aa5a335SCiara Loftus 
/* Per-socket getsockopt() handler for SOL_XDP.
 *
 * Supports XDP_STATISTICS, XDP_MMAP_OFFSETS and XDP_OPTIONS. The first
 * two handle both the current uapi layout and the older, shorter v1
 * layout; which one is used is selected by the buffer length the
 * caller supplied via *optlen.
 */
static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats = {};
		bool extra_stats = true;
		size_t stats_size;

		/* Buffers smaller than the v1 struct are an error; buffers
		 * that fit v1 but not the full struct get the v1 layout.
		 */
		if (len < sizeof(struct xdp_statistics_v1)) {
			return -EINVAL;
		} else if (len < sizeof(stats)) {
			extra_stats = false;
			stats_size = sizeof(struct xdp_statistics_v1);
		} else {
			stats_size = sizeof(stats);
		}

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		if (extra_stats) {
			stats.rx_ring_full = xs->rx_queue_full;
			stats.rx_fill_ring_empty_descs =
				xs->umem ? xskq_nb_queue_empty_descs(xs->umem->fq) : 0;
			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
		} else {
			/* v1 callers cannot see rx_ring_full separately, so
			 * fold those drops into rx_dropped instead.
			 */
			stats.rx_dropped += xs->rx_queue_full;
		}
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, stats_size))
			return -EFAULT;
		if (put_user(stats_size, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;
		struct xdp_mmap_offsets_v1 off_v1;
		bool flags_supported = true;
		void *to_copy;

		if (len < sizeof(off_v1))
			return -EINVAL;
		else if (len < sizeof(off))
			flags_supported = false;

		if (flags_supported) {
			/* xdp_ring_offset is identical to xdp_ring_offset_v1
			 * except for the flags field added to the end.
			 */
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.rx);
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.tx);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.fr);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.cr);
			off.rx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.tx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.fr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);
			off.cr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);

			len = sizeof(off);
			to_copy = &off;
		} else {
			xsk_enter_rxtx_offsets(&off_v1.rx);
			xsk_enter_rxtx_offsets(&off_v1.tx);
			xsk_enter_umem_offsets(&off_v1.fr);
			xsk_enter_umem_offsets(&off_v1.cr);

			len = sizeof(off_v1);
			to_copy = &off_v1;
		}

		if (copy_to_user(optval, to_copy, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_OPTIONS:
	{
		struct xdp_options opts = {};

		if (len < sizeof(opts))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		if (xs->zc)
			opts.flags |= XDP_OPTIONS_ZEROCOPY;
		mutex_unlock(&xs->mutex);

		len = sizeof(opts);
		if (copy_to_user(optval, &opts, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}
970af75d9e0SMagnus Karlsson 
/* mmap() handler: map one of the four rings (rx, tx, fill, completion)
 * into userspace. The ring is selected by the page offset the caller
 * passed to mmap(); the vma size must not exceed the ring allocation.
 */
static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	struct xdp_umem *umem;
	unsigned long pfn;
	struct page *qpg;

	/* Rings can only be mapped while the socket is still in
	 * XSK_READY state, i.e. before bind().
	 */
	if (READ_ONCE(xs->state) != XSK_READY)
		return -EBUSY;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		/* Fill/completion rings live in the umem, which must have
		 * been registered with XDP_UMEM_REG first.
		 */
		umem = READ_ONCE(xs->umem);
		if (!umem)
			return -EINVAL;

		/* Matches the smp_wmb() in XDP_UMEM_REG */
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(umem->fq);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(umem->cq);
	}

	if (!q)
		return -EINVAL;

	/* Matches the smp_wmb() in xsk_init_queue */
	smp_rmb();
	qpg = virt_to_head_page(q->ring);
	if (size > page_size(qpg))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}
1015423f3832SMagnus Karlsson 
/* Netdevice notifier: on NETDEV_UNREGISTER, walk every XDP socket in
 * the device's netns and forcibly unbind the ones bound to that
 * device, reporting ENETDOWN to their owners.
 */
static int xsk_notifier(struct notifier_block *this,
			unsigned long msg, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct sock *sk;

	switch (msg) {
	case NETDEV_UNREGISTER:
		/* net->xdp.lock protects the per-netns socket list; each
		 * socket's own mutex protects its bound state.
		 */
		mutex_lock(&net->xdp.lock);
		sk_for_each(sk, &net->xdp.list) {
			struct xdp_sock *xs = xdp_sk(sk);

			mutex_lock(&xs->mutex);
			if (xs->dev == dev) {
				sk->sk_err = ENETDOWN;
				if (!sock_flag(sk, SOCK_DEAD))
					sk->sk_error_report(sk);

				xsk_unbind_dev(xs);

				/* Clear device references in umem. */
				xdp_umem_clear_dev(xs->umem);
			}
			mutex_unlock(&xs->mutex);
		}
		mutex_unlock(&net->xdp.lock);
		break;
	}
	return NOTIFY_DONE;
}
1047455302d1SIlya Maximets 
/* Protocol definition for AF_XDP sockets; obj_size makes sk_alloc()
 * allocate a full struct xdp_sock per socket.
 */
static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};
1053c0c77d8fSBjörn Töpel 
/* Socket operations for AF_XDP. Connection-oriented calls are stubbed
 * out with the sock_no_* helpers; bind, poll, setsockopt, getsockopt,
 * sendmsg and mmap have real implementations.
 */
static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};
1074c0c77d8fSBjörn Töpel 
107511fe9262SBjörn Töpel static void xsk_destruct(struct sock *sk)
107611fe9262SBjörn Töpel {
107711fe9262SBjörn Töpel 	struct xdp_sock *xs = xdp_sk(sk);
107811fe9262SBjörn Töpel 
107911fe9262SBjörn Töpel 	if (!sock_flag(sk, SOCK_DEAD))
108011fe9262SBjörn Töpel 		return;
108111fe9262SBjörn Töpel 
108211fe9262SBjörn Töpel 	xdp_put_umem(xs->umem);
108311fe9262SBjörn Töpel 
108411fe9262SBjörn Töpel 	sk_refcnt_debug_dec(sk);
108511fe9262SBjörn Töpel }
108611fe9262SBjörn Töpel 
/* Handler for socket(AF_XDP, SOCK_RAW, 0): allocate and initialize a
 * new XDP socket and add it to the per-netns socket list.
 *
 * Requires CAP_NET_RAW in the netns' user namespace; only SOCK_RAW
 * with protocol 0 is supported.
 */
static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	/* SOCK_RCU_FREE: the sock is freed only after an RCU grace period */
	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	/* New sockets start in XSK_READY: rings may be created and
	 * mmap()ed until bind() moves the state on.
	 */
	xs->state = XSK_READY;
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->rx_lock);
	spin_lock_init(&xs->tx_completion_lock);

	INIT_LIST_HEAD(&xs->map_list);
	spin_lock_init(&xs->map_list_lock);

	mutex_lock(&net->xdp.lock);
	sk_add_node_rcu(sk, &net->xdp.list);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}
1137c0c77d8fSBjörn Töpel 
/* Registered with sock_register() so socket(AF_XDP, ...) reaches
 * xsk_create().
 */
static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};
1143c0c77d8fSBjörn Töpel 
/* Notifier block for netdevice events; see xsk_notifier(). */
static struct notifier_block xsk_netdev_notifier = {
	.notifier_call	= xsk_notifier,
};
1147455302d1SIlya Maximets 
11481d0dc069SBjörn Töpel static int __net_init xsk_net_init(struct net *net)
11491d0dc069SBjörn Töpel {
11501d0dc069SBjörn Töpel 	mutex_init(&net->xdp.lock);
11511d0dc069SBjörn Töpel 	INIT_HLIST_HEAD(&net->xdp.list);
11521d0dc069SBjörn Töpel 	return 0;
11531d0dc069SBjörn Töpel }
11541d0dc069SBjörn Töpel 
/* Per-netns teardown: all sockets should be gone before the netns
 * dies, so just sanity-check that the list is empty.
 */
static void __net_exit xsk_net_exit(struct net *net)
{
	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}
11591d0dc069SBjörn Töpel 
/* Per-network-namespace init/exit hooks for AF_XDP state. */
static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};
11641d0dc069SBjörn Töpel 
/* Module init: register the protocol, socket family, pernet hooks and
 * netdevice notifier (unwinding in reverse order on any failure), then
 * initialize the per-cpu xskmap flush lists.
 */
static int __init xsk_init(void)
{
	int err, cpu;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	err = register_pernet_subsys(&xsk_net_ops);
	if (err)
		goto out_sk;

	err = register_netdevice_notifier(&xsk_netdev_notifier);
	if (err)
		goto out_pernet;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
	return 0;

out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
out_sk:
	sock_unregister(PF_XDP);
out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}
1198c0c77d8fSBjörn Töpel 
/* Register at fs_initcall level, i.e. early during boot. */
fs_initcall(xsk_init);
1200