/*	$OpenBSD: ifq.h,v 1.41 2023/11/10 15:51:24 bluhm Exp $ */

/*
 * Copyright (c) 2015 David Gwynne <dlg@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#ifndef _NET_IFQ_H_
#define _NET_IFQ_H_

struct ifnet;
struct kstat;

struct ifq_ops;

struct ifqueue {
	struct ifnet		*ifq_if;
	struct taskq		*ifq_softnet;
	union {
		void			*_ifq_softc;
		/*
		 * a ring's sndq is found by looking up an array of pointers.
		 * by default we only have one sndq and the default drivers
		 * don't use ifq_softc, so we can borrow it for the map until
		 * we need to allocate a proper map.
		 */
		struct ifqueue		*_ifq_ifqs[1];
	} _ifq_ptr;
#define ifq_softc		 _ifq_ptr._ifq_softc
#define ifq_ifqs		 _ifq_ptr._ifq_ifqs

	/* mbuf handling */
	struct mutex		 ifq_mtx;
	const struct ifq_ops	*ifq_ops;
	void			*ifq_q;
	struct mbuf_list	 ifq_free;
	unsigned int		 ifq_len;
	unsigned int		 ifq_oactive;

	/* statistics */
	uint64_t		 ifq_packets;
	uint64_t		 ifq_bytes;
	uint64_t		 ifq_qdrops;
	uint64_t		 ifq_errors;
	uint64_t		 ifq_mcasts;
	uint32_t		 ifq_oactives;

	struct kstat		*ifq_kstat;

	/* work serialisation */
	struct mutex		 ifq_task_mtx;
	struct task_list	 ifq_task_list;
	void			*ifq_serializer;
	struct task		 ifq_bundle;

	/* work to be serialised */
	struct task		 ifq_start;
	struct task		 ifq_restart;

	/* properties */
	unsigned int		 ifq_maxlen;
	unsigned int		 ifq_idx;
};

struct ifiqueue {
	struct ifnet		*ifiq_if;
	struct taskq		*ifiq_softnet;
	union {
		void			*_ifiq_softc;
		struct ifiqueue		*_ifiq_ifiqs[1];
	} _ifiq_ptr;
#define ifiq_softc		 _ifiq_ptr._ifiq_softc
#define ifiq_ifiqs		 _ifiq_ptr._ifiq_ifiqs

	struct mutex		 ifiq_mtx;
	struct mbuf_list	 ifiq_ml;
	struct task		 ifiq_task;
	unsigned int		 ifiq_pressure;

	/* counters */
	uint64_t		 ifiq_packets;
	uint64_t		 ifiq_bytes;
	uint64_t		 ifiq_fdrops;
	uint64_t		 ifiq_qdrops;
	uint64_t		 ifiq_errors;
	uint64_t		 ifiq_mcasts;
	uint64_t		 ifiq_noproto;

	/* number of times a list of packets was put on ifiq_ml */
	uint64_t		 ifiq_enqueues;
	/* number of times a list of packets was pulled off ifiq_ml */
	uint64_t		 ifiq_dequeues;

	struct kstat		*ifiq_kstat;

	/* properties */
	unsigned int		 ifiq_idx;
};

#ifdef _KERNEL

#define IFQ_MAXLEN		256

/*
 *
 * Interface Send Queues
 *
 * struct ifqueue sits between the network stack and a driver's
 * transmission of packets. The high level view is that when the stack
 * has finished generating a packet it hands it to a driver for
 * transmission. It does this by queueing the packet on an ifqueue and
 * notifying the driver to start transmission of the queued packets.
 *
 * A network device may have multiple contexts for the transmission
 * of packets, ie, independent transmit rings. Such a network device,
 * represented by a struct ifnet, would then have multiple ifqueue
 * structures, each of which maps to an independent transmit ring.
 *
 * struct ifqueue also provides the point where conditioning of
 * traffic (ie, priq and hfsc) is implemented, and provides some
 * infrastructure to assist in the implementation of network drivers.
 *
 * = ifq API
 *
 * The ifq API provides functions for three distinct consumers:
 *
 * 1. The network stack
 * 2. Traffic QoS/conditioning implementations
 * 3. Network drivers
 *
 * == Network Stack API
 *
 * The network stack is responsible for initialising and destroying
 * the ifqueue structures, changing the traffic conditioner on an
 * interface, enqueuing packets for transmission, and notifying
 * the driver to start transmission of a particular ifqueue.
 *
 * === ifq_init()
 *
 * During if_attach(), the network stack calls ifq_init to initialise
 * the ifqueue structure. By default it configures the priq traffic
 * conditioner.
 *
 * === ifq_destroy()
 *
 * The network stack calls ifq_destroy() during if_detach() to tear down
 * the ifqueue structure. It frees the traffic conditioner state, and
 * frees any mbufs that were left queued.
 *
 * === ifq_attach()
 *
 * ifq_attach() is used to replace the current traffic conditioner on
 * the ifqueue. All the pending mbufs are removed from the previous
 * conditioner and requeued on the new one.
 *
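 * As a sketch, a conditioner might be attached to an interface's
 * default send queue like this (cond_ops and cond_state are
 * hypothetical names, set up by the conditioner beforehand):
 *
 *	ifq_attach(&ifp->if_snd, &cond_ops, cond_state);
 *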
 * === ifq_idx()
 *
 * ifq_idx() selects a specific ifqueue from the current ifnet
 * structure for use in the transmission of the mbuf.
 *
 * === ifq_enqueue()
 *
 * ifq_enqueue() attempts to fit an mbuf onto the ifqueue. The
 * current traffic conditioner may drop a packet to make space on the
 * queue.
 *
 * === ifq_start()
 *
 * Once a packet has been successfully queued with ifq_enqueue(),
 * the network card is notified with a call to ifq_start().
 * Calls to ifq_start() run in the ifqueue serialisation context,
 * guaranteeing that only one instance of ifp->if_qstart() will be
 * running on behalf of a specific ifqueue in the system at any point
 * in time.
 *
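 * As a sketch, the path from the stack into a driver might look like
 * this (loosely modelled on if_enqueue_ifq(); the m and error
 * variables are assumed to exist in the surrounding function):
 *
 *	struct ifqueue *ifq = &ifp->if_snd;
 *
 *	if (ifp->if_nifqs > 1) {
 *		unsigned int idx;
 *
 *		idx = ifq_idx(&ifp->if_snd, ifp->if_nifqs, m);
 *		ifq = ifp->if_ifqs[idx];
 *	}
 *
 *	error = ifq_enqueue(ifq, m);
 *	if (error)
 *		return (error);
 *
 *	ifq_start(ifq);
 *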
 * == Traffic conditioners API
 *
 * The majority of interaction between struct ifqueue and a traffic
 * conditioner occurs via the callbacks a traffic conditioner provides
 * in an instance of struct ifq_ops.
 *
 * XXX document ifqop_*
 *
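 * As a sketch, a conditioner fills in a struct ifq_ops (defined below)
 * with its callbacks; the cond_* names here are hypothetical:
 *
 *	static const struct ifq_ops cond_ops = {
 *		.ifqop_idx = cond_idx,
 *		.ifqop_enq = cond_enq,
 *		.ifqop_deq_begin = cond_deq_begin,
 *		.ifqop_deq_commit = cond_deq_commit,
 *		.ifqop_purge = cond_purge,
 *		.ifqop_alloc = cond_alloc,
 *		.ifqop_free = cond_free,
 *	};
 *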
 * The ifqueue API implements the locking on behalf of the conditioning
 * implementations so conditioners only have to reject or keep mbufs.
 * If something needs to inspect a conditioner's internals, the queue lock
 * needs to be taken to allow for a consistent or safe view. The queue
 * lock may be taken and released with ifq_q_enter() and ifq_q_leave().
 *
 * === ifq_q_enter()
 *
 * Code wishing to access a conditioner's internals may take the queue
 * lock with ifq_q_enter(). The caller must pass a reference to the
 * conditioner's ifq_ops structure so the infrastructure can ensure the
 * caller is able to understand the internals. ifq_q_enter() returns
 * a pointer to the conditioner's internal structures, or NULL if the
 * ifq_ops did not match the current conditioner.
 *
 * === ifq_q_leave()
 *
 * The queue lock acquired with ifq_q_enter() is released with
 * ifq_q_leave().
 *
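 * As a sketch, assuming the hypothetical cond_ops conditioner above
 * keeps its state in a struct cond:
 *
 *	struct cond *c;
 *
 *	c = ifq_q_enter(ifq, &cond_ops);
 *	if (c == NULL)
 *		return (EBUSY); // a different conditioner is attached
 *
 *	// inspect the conditioner state under the queue lock
 *
 *	ifq_q_leave(ifq, c);
 *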
 * === ifq_mfreem() and ifq_mfreeml()
 *
 * A goal of the API is to avoid freeing an mbuf while mutexes are
 * held. Because the ifq API manages the lock on behalf of the backend
 * ifqops, the backend should not directly free mbufs. If a conditioner
 * backend needs to drop a packet during the handling of ifqop_deq_begin,
 * it may free it by calling ifq_mfreem(). This accounts for the drop,
 * and schedules the free of the mbuf outside the hold of ifq_mtx.
 * ifq_mfreeml() takes an mbuf list as an argument instead.
 *
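 * As a sketch, a hypothetical conditioner dropping stale packets
 * while beginning a dequeue (cond_first, cond_stale, and cond_remove
 * are made-up internals):
 *
 *	static struct mbuf *
 *	cond_deq_begin(struct ifqueue *ifq, void **cookiep)
 *	{
 *		struct cond *c = ifq->ifq_q;
 *		struct mbuf *m;
 *
 *		while ((m = cond_first(c)) != NULL) {
 *			if (!cond_stale(c, m))
 *				break;
 *			cond_remove(c, m);
 *			ifq_mfreem(ifq, m); // accounted, freed outside ifq_mtx
 *		}
 *
 *		*cookiep = m;
 *		return (m);
 *	}
 *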
 *
 * == Network Driver API
 *
 * The API used by network drivers is mostly documented in the
 * ifq_dequeue(9) manpage except for ifq_serialize().
 *
 * === ifq_serialize()
 *
 * A driver may run arbitrary work in the ifqueue serialiser context
 * via ifq_serialize(). The work to be done is represented by a task
 * that has been prepared with task_set().
 *
 * The work will be run in series with any other work dispatched by
 * ifq_start(), ifq_restart(), or other ifq_serialize() calls.
 *
 * Because the work may be run on another CPU, the lifetime of the
 * task and the work it represents can extend beyond the end of the
 * call to ifq_serialize() that dispatched it.
 *
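 * As a sketch, assuming a hypothetical driver task sc_sync prepared
 * during attach:
 *
 *	task_set(&sc->sc_sync, drv_sync, sc); // drv_sync is hypothetical
 *	...
 *	ifq_serialize(ifq, &sc->sc_sync);
 *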
 *
 * = ifqueue work serialisation
 *
 * ifqueues provide a mechanism to dispatch work to be run in a single
 * context. Work in this mechanism is represented by task structures.
 *
 * The tasks are run in a context similar to a taskq serviced by a
 * single kernel thread, except the work is run immediately by the
 * first CPU that dispatches work. If a second CPU attempts to dispatch
 * additional tasks while the first is still running, they will be
 * queued to be run by the first CPU. The second CPU returns immediately.
 *
 * = MP Safe Network Drivers
 *
 * An MP safe network driver is one whose start routine can be
 * called by the network stack without holding the big kernel lock.
 *
 * == Attach
 *
 * A driver advertises its ability to run its start routine without
 * the kernel lock by setting the IFXF_MPSAFE flag in ifp->if_xflags
 * before calling if_attach(). Advertising an MPSAFE start routine
 * also implies that the driver understands that a network card can
 * have multiple rings or transmit queues, and therefore provides an
 * if_qstart function (which takes an ifqueue pointer) instead of an
 * if_start function (which takes an ifnet pointer).
 *
 * If the hardware supports multiple transmit rings, it advertises
 * support for multiple rings to the network stack with if_attach_queues()
 * after the call to if_attach(). if_attach_queues allocates a struct
 * ifqueue for each hardware ring, which can then be initialised by
 * the driver with data for each ring.
 *
 *	void	drv_start(struct ifqueue *);
 *
 *	void
 *	drv_attach()
 *	{
 *	...
 *		ifp->if_xflags = IFXF_MPSAFE;
 *		ifp->if_qstart = drv_start;
 *		if_attach(ifp);
 *
 *		if_attach_queues(ifp, DRV_NUM_TX_RINGS);
 *		for (i = 0; i < DRV_NUM_TX_RINGS; i++) {
 *			struct ifqueue *ifq = ifp->if_ifqs[i];
 *			struct drv_tx_ring *ring = &sc->sc_tx_rings[i];
 *
 *			ifq->ifq_softc = ring;
 *			ring->ifq = ifq;
 *		}
 *	}
 *
 * The network stack will then call ifp->if_qstart via ifq_start()
 * to guarantee there is only one instance of that function running
 * for each ifq in the system, and to serialise it with other work
 * the driver may provide.
 *
 * == Initialise
 *
 * When the stack requests an interface be brought up (ie, drv_ioctl()
 * is called to handle SIOCSIFFLAGS with IFF_UP set in ifp->if_flags)
 * drivers should set IFF_RUNNING in ifp->if_flags, and then call
 * ifq_clr_oactive() against each ifq.
 *
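 * As a sketch, using the hypothetical drv_softc layout from the
 * examples in this comment:
 *
 *	SET(ifp->if_flags, IFF_RUNNING);
 *
 *	for (i = 0; i < sc->sc_num_queues; i++)
 *		ifq_clr_oactive(ifp->if_ifqs[i]);
 *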
 * == if_start
 *
 * ifq_start() checks that IFF_RUNNING is set in ifp->if_flags, that
 * ifq_is_oactive() does not return true, and that there are pending
 * packets to transmit via a call to ifq_len(). Therefore, drivers are
 * no longer responsible for doing this themselves.
 *
 * If a driver should not transmit packets while its link is down, use
 * ifq_purge() to flush pending packets from the transmit queue.
 *
 * Drivers for hardware should use the following pattern to transmit
 * packets:
 *
 *	void
 *	drv_start(struct ifqueue *ifq)
 *	{
 *		struct drv_tx_ring *ring = ifq->ifq_softc;
 *		struct ifnet *ifp = ifq->ifq_if;
 *		struct drv_softc *sc = ifp->if_softc;
 *		struct mbuf *m;
 *		int kick = 0;
 *
 *		if (NO_LINK) {
 *			ifq_purge(ifq);
 *			return;
 *		}
 *
 *		for (;;) {
 *			if (NO_SPACE(ring)) {
 *				ifq_set_oactive(ifq);
 *				break;
 *			}
 *
 *			m = ifq_dequeue(ifq);
 *			if (m == NULL)
 *				break;
 *
 *			if (drv_encap(sc, ring, m) != 0) { // map and fill ring
 *				m_freem(m);
 *				continue;
 *			}
 *
 *			bpf_mtap();
 *			kick = 1;
 *		}
 *
 *		if (kick)
 *			drv_kick(ring); // notify hw of new descriptors on the ring
 *	}
 *
 * == Transmission completion
 *
 * The following pattern should be used for transmit queue interrupt
 * processing:
 *
 *	void
 *	drv_txeof(struct drv_tx_ring *ring)
 *	{
 *		struct ifqueue *ifq = ring->ifq;
 *
 *		while (COMPLETED_PKTS(ring)) {
 *			// unmap packets, m_freem() the mbufs.
 *		}
 *
 *		if (ifq_is_oactive(ifq))
 *			ifq_restart(ifq);
 *	}
 *
 * == Stop
 *
 * Bringing an interface down (ie, IFF_UP was cleared in ifp->if_flags)
 * should clear IFF_RUNNING in ifp->if_flags, and guarantee the start
 * routine is not running before freeing any resources it uses:
 *
 *	void
 *	drv_down(struct drv_softc *sc)
 *	{
 *		struct ifnet *ifp = &sc->sc_if;
 *		struct ifqueue *ifq;
 *		int i;
 *
 *		CLR(ifp->if_flags, IFF_RUNNING);
 *		DISABLE_INTERRUPTS();
 *
 *		for (i = 0; i < sc->sc_num_queues; i++) {
 *			ifq = ifp->if_ifqs[i];
 *			ifq_barrier(ifq);
 *		}
 *
 *		intr_barrier(sc->sc_ih);
 *
 *		FREE_RESOURCES();
 *
 *		for (i = 0; i < sc->sc_num_queues; i++) {
 *			ifq = ifp->if_ifqs[i];
 *			ifq_clr_oactive(ifq);
 *		}
 *	}
 *
 */

struct ifq_ops {
	unsigned int		 (*ifqop_idx)(unsigned int,
				    const struct mbuf *);
	struct mbuf		*(*ifqop_enq)(struct ifqueue *, struct mbuf *);
	struct mbuf		*(*ifqop_deq_begin)(struct ifqueue *, void **);
	void			 (*ifqop_deq_commit)(struct ifqueue *,
				    struct mbuf *, void *);
	void			 (*ifqop_purge)(struct ifqueue *,
				    struct mbuf_list *);
	void			*(*ifqop_alloc)(unsigned int, void *);
	void			 (*ifqop_free)(unsigned int, void *);
};

extern const struct ifq_ops * const ifq_priq_ops;

/*
 * Interface send queues.
 */

void		 ifq_init(struct ifqueue *, struct ifnet *, unsigned int);
void		 ifq_attach(struct ifqueue *, const struct ifq_ops *, void *);
void		 ifq_destroy(struct ifqueue *);
void		 ifq_add_data(struct ifqueue *, struct if_data *);
int		 ifq_enqueue(struct ifqueue *, struct mbuf *);
void		 ifq_start(struct ifqueue *);
struct mbuf	*ifq_deq_begin(struct ifqueue *);
void		 ifq_deq_commit(struct ifqueue *, struct mbuf *);
void		 ifq_deq_rollback(struct ifqueue *, struct mbuf *);
struct mbuf	*ifq_dequeue(struct ifqueue *);
int		 ifq_hdatalen(struct ifqueue *);
void		 ifq_init_maxlen(struct ifqueue *, unsigned int);
void		 ifq_mfreem(struct ifqueue *, struct mbuf *);
void		 ifq_mfreeml(struct ifqueue *, struct mbuf_list *);
unsigned int	 ifq_purge(struct ifqueue *);
void		*ifq_q_enter(struct ifqueue *, const struct ifq_ops *);
void		 ifq_q_leave(struct ifqueue *, void *);
void		 ifq_serialize(struct ifqueue *, struct task *);
void		 ifq_barrier(struct ifqueue *);
void		 ifq_set_oactive(struct ifqueue *);

int		 ifq_deq_sleep(struct ifqueue *, struct mbuf **, int, int,
		     const char *, volatile unsigned int *,
		     volatile unsigned int *);

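/*
 * A sketch of the peek/commit pattern from ifq_dequeue(9): a driver
 * may look at the next packet with ifq_deq_begin() and either take
 * it with ifq_deq_commit() or put it back with ifq_deq_rollback().
 * NOT_ENOUGH_SPACE() here is a hypothetical placeholder:
 *
 *	m = ifq_deq_begin(ifq);
 *	if (m == NULL)
 *		return;
 *
 *	if (NOT_ENOUGH_SPACE(ring, m)) {
 *		ifq_deq_rollback(ifq, m);
 *		return;
 *	}
 *
 *	ifq_deq_commit(ifq, m);
 */
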
#define ifq_len(_ifq)		READ_ONCE((_ifq)->ifq_len)
#define ifq_empty(_ifq)		(ifq_len(_ifq) == 0)

static inline int
ifq_is_priq(struct ifqueue *ifq)
{
	return (ifq->ifq_ops == ifq_priq_ops);
}

static inline void
ifq_clr_oactive(struct ifqueue *ifq)
{
	ifq->ifq_oactive = 0;
}

static inline unsigned int
ifq_is_oactive(struct ifqueue *ifq)
{
	return (ifq->ifq_oactive);
}

static inline void
ifq_restart(struct ifqueue *ifq)
{
	ifq_serialize(ifq, &ifq->ifq_restart);
}

static inline unsigned int
ifq_idx(struct ifqueue *ifq, unsigned int nifqs, const struct mbuf *m)
{
	return ((*ifq->ifq_ops->ifqop_idx)(nifqs, m));
}

/* ifiq */

void		 ifiq_init(struct ifiqueue *, struct ifnet *, unsigned int);
void		 ifiq_destroy(struct ifiqueue *);
int		 ifiq_input(struct ifiqueue *, struct mbuf_list *);
int		 ifiq_enqueue(struct ifiqueue *, struct mbuf *);
void		 ifiq_add_data(struct ifiqueue *, struct if_data *);

#define ifiq_len(_ifiq)		READ_ONCE(ml_len(&(_ifiq)->ifiq_ml))
#define ifiq_empty(_ifiq)	(ifiq_len(_ifiq) == 0)
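
/*
 * A sketch of rx processing: a driver gathers received packets on an
 * mbuf_list and hands them to the stack with ifiq_input(). A nonzero
 * return indicates the stack is under pressure and the driver should
 * slow down. ring->ifiq, COMPLETED_RX_PKTS() and SLOW_DOWN() are
 * hypothetical placeholders:
 *
 *	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
 *	struct mbuf *m;
 *
 *	while ((m = COMPLETED_RX_PKTS(ring)) != NULL)
 *		ml_enqueue(&ml, m);
 *
 *	if (ifiq_input(ring->ifiq, &ml))
 *		SLOW_DOWN(ring); // eg, shrink the rx ring
 */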

#endif /* _KERNEL */

#endif /* _NET_IFQ_H_ */