/*	$OpenBSD: ifq.h,v 1.42 2024/11/20 02:18:45 dlg Exp $ */

/*
 * Copyright (c) 2015 David Gwynne <dlg@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#ifndef _NET_IFQ_H_
#define _NET_IFQ_H_

struct ifnet;
struct kstat;

struct ifq_ops;

struct ifqueue {
	struct ifnet		*ifq_if;
	struct taskq		*ifq_softnet;
	union {
		void			*_ifq_softc;
		/*
		 * a ring's sndq is found by looking up an array of pointers.
		 * by default we only have one sndq and the default drivers
		 * don't use ifq_softc, so we can borrow it for the map until
		 * we need to allocate a proper map.
		 */
		struct ifqueue		*_ifq_ifqs[1];
	} _ifq_ptr;
#define ifq_softc	 _ifq_ptr._ifq_softc
#define ifq_ifqs	 _ifq_ptr._ifq_ifqs

	/* mbuf handling */
	struct mutex		 ifq_mtx;
	const struct ifq_ops	*ifq_ops;
	void			*ifq_q;
	struct mbuf_list	 ifq_free;
	unsigned int		 ifq_len;
	unsigned int		 ifq_oactive;

	/* statistics */
	uint64_t		 ifq_packets;
	uint64_t		 ifq_bytes;
	uint64_t		 ifq_qdrops;
	uint64_t		 ifq_errors;
	uint64_t		 ifq_mcasts;
	uint32_t		 ifq_oactives;

	struct kstat		*ifq_kstat;

	/* work serialisation */
	struct mutex		 ifq_task_mtx;
	struct task_list	 ifq_task_list;
	void			*ifq_serializer;
	struct task		 ifq_bundle;

	/* work to be serialised */
	struct task		 ifq_start;
	struct task		 ifq_restart;

	/* properties */
	unsigned int		 ifq_maxlen;
	unsigned int		 ifq_idx;
};

struct ifiqueue {
	struct ifnet		*ifiq_if;
	struct taskq		*ifiq_softnet;
	union {
		void			*_ifiq_softc;
		struct ifiqueue		*_ifiq_ifiqs[1];
	} _ifiq_ptr;
#define ifiq_softc	 _ifiq_ptr._ifiq_softc
#define ifiq_ifiqs	 _ifiq_ptr._ifiq_ifiqs

	struct mutex		 ifiq_mtx;
	struct mbuf_list	 ifiq_ml;
	struct task		 ifiq_task;
	unsigned int		 ifiq_pressure;

	/* counters */
	uint64_t		 ifiq_packets;
	uint64_t		 ifiq_bytes;
	uint64_t		 ifiq_fdrops;
	uint64_t		 ifiq_qdrops;
	uint64_t		 ifiq_errors;
	uint64_t		 ifiq_mcasts;
	uint64_t		 ifiq_noproto;

	/* number of times a list of packets was put on ifiq_ml */
	uint64_t		 ifiq_enqueues;
	/* number of times a list of packets was pulled off ifiq_ml */
	uint64_t		 ifiq_dequeues;

	struct kstat		*ifiq_kstat;

	/* properties */
	unsigned int		 ifiq_idx;
};

#ifdef _KERNEL

#define IFQ_MAXLEN		256

/*
 * Interface Send Queues
 *
 * struct ifqueue sits between the network stack and a driver's
 * transmission of packets. The high level view is that when the stack
 * has finished generating a packet it hands it to a driver for
 * transmission. It does this by queueing the packet on an ifqueue and
 * notifying the driver to start transmission of the queued packets.
 *
 * A network device may have multiple contexts for the transmission
 * of packets, ie, independent transmit rings. Such a network device,
 * represented by a struct ifnet, would then have multiple ifqueue
 * structures, each of which maps to an independent transmit ring.
 *
 * struct ifqueue also provides the point where conditioning of
 * traffic (ie, priq and hfsc) is implemented, and provides some
 * infrastructure to assist in the implementation of network drivers.
 *
 * = ifq API
 *
 * The ifq API provides functions for three distinct consumers:
 *
 * 1. The network stack
 * 2. Traffic QoS/conditioning implementations
 * 3. Network drivers
 *
 * == Network Stack API
 *
 * The network stack is responsible for initialising and destroying
 * the ifqueue structures, changing the traffic conditioner on an
 * interface, enqueuing packets for transmission, and notifying
 * the driver to start transmission of a particular ifqueue.
 *
 * === ifq_init()
 *
 * During if_attach(), the network stack calls ifq_init() to initialise
 * the ifqueue structure. By default it configures the priq traffic
 * conditioner.
 *
 * === ifq_destroy()
 *
 * The network stack calls ifq_destroy() during if_detach() to tear down
 * the ifqueue structure. It frees the traffic conditioner state, and
 * frees any mbufs that were left queued.
 *
 * === ifq_attach()
 *
 * ifq_attach() is used to replace the current traffic conditioner on
 * the ifqueue. All the pending mbufs are removed from the previous
 * conditioner and requeued on the new one.
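 *
 * A minimal sketch, assuming the default priq conditioner is being
 * put (back) in place and needs no allocation argument:
 *
 *	ifq_attach(ifq, ifq_priq_ops, NULL);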
 *
 * === ifq_idx()
 *
 * ifq_idx() selects a specific ifqueue from the current ifnet
 * structure for use in the transmission of the mbuf.
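 *
 * A minimal sketch of mapping an mbuf to one of an interface's
 * transmit queues with it:
 *
 *	ifq = ifp->if_ifqs[ifq_idx(&ifp->if_snd, ifp->if_nifqs, m)];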
 *
 * === ifq_enqueue()
 *
 * ifq_enqueue() attempts to fit an mbuf onto the ifqueue. The
 * current traffic conditioner may drop a packet to make space on the
 * queue.
 *
 * === ifq_start()
 *
 * Once a packet has been successfully queued with ifq_enqueue(),
 * the network card is notified with a call to ifq_start().
 * Calls to ifq_start() run in the ifqueue serialisation context,
 * guaranteeing that only one instance of ifp->if_qstart() will be
 * running on behalf of a specific ifqueue in the system at any point
 * in time.
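 *
 * A minimal sketch of the stack side transmit path built from these
 * two calls (on error the conditioner has already consumed the mbuf,
 * so the caller must not free it):
 *
 *	error = ifq_enqueue(ifq, m);
 *	if (error != 0)
 *		return (error);
 *
 *	ifq_start(ifq);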
 *
 * == Traffic conditioners API
 *
 * The majority of interaction between struct ifqueue and a traffic
 * conditioner occurs via the callbacks a traffic conditioner provides
 * in an instance of struct ifq_ops.
 *
 * XXX document ifqop_*
 *
 * The ifqueue API implements the locking on behalf of the conditioning
 * implementations, so conditioners only have to decide whether to keep
 * or reject mbufs. If something needs to inspect a conditioner's
 * internals, the queue lock needs to be taken to allow for a consistent
 * or safe view. The queue lock may be taken and released with
 * ifq_q_enter() and ifq_q_leave().
 *
 * === ifq_q_enter()
 *
 * Code wishing to access a conditioner's internals may take the queue
 * lock with ifq_q_enter(). The caller must pass a reference to the
 * conditioner's ifq_ops structure so the infrastructure can ensure the
 * caller is able to understand the internals. ifq_q_enter() returns
 * a pointer to the conditioner's internal structures, or NULL if the
 * ifq_ops did not match the current conditioner.
 *
 * === ifq_q_leave()
 *
 * The queue lock acquired with ifq_q_enter() is released with
 * ifq_q_leave().
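 *
 * A hedged sketch of the pattern, assuming the default priq
 * conditioner is attached (struct priq itself lives in ifq.c):
 *
 *	struct priq *pq;
 *
 *	pq = ifq_q_enter(ifq, ifq_priq_ops);
 *	if (pq == NULL)
 *		return;
 *
 *	// look at the priq internals here
 *
 *	ifq_q_leave(ifq, pq);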
 *
 * === ifq_mfreem() and ifq_mfreeml()
 *
 * A goal of the API is to avoid freeing an mbuf while mutexes are
 * held. Because the ifq API manages the lock on behalf of the backend
 * ifqops, the backend should not directly free mbufs. If a conditioner
 * backend needs to drop a packet during the handling of ifqop_deq_begin,
 * it may free it by calling ifq_mfreem(). This accounts for the drop,
 * and schedules the free of the mbuf outside the hold of ifq_mtx.
 * ifq_mfreeml() takes an mbuf list as an argument instead.
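 *
 * A hedged sketch of a conditioner dropping stale packets from inside
 * its deq_begin handler (the cond_* names and PKT_IS_STALE are
 * hypothetical):
 *
 *	struct mbuf *
 *	cond_deq_begin(struct ifqueue *ifq, void **cookiep)
 *	{
 *		struct cond *c = ifq->ifq_q;
 *		struct mbuf *m;
 *
 *		while ((m = MBUF_LIST_FIRST(&c->c_ml)) != NULL) {
 *			if (!PKT_IS_STALE(c, m))
 *				break;
 *
 *			ml_dequeue(&c->c_ml);
 *			// accounts the drop, defers the free past ifq_mtx
 *			ifq_mfreem(ifq, m);
 *		}
 *
 *		if (m != NULL)
 *			*cookiep = c;
 *
 *		return (m);
 *	}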
 *
 *
 * == Network Driver API
 *
 * The API used by network drivers is mostly documented in the
 * ifq_dequeue(9) manpage except for ifq_serialize().
 *
 * === ifq_serialize()
 *
 * A driver may run arbitrary work in the ifqueue serialiser context
 * via ifq_serialize(). The work to be done is represented by a task
 * that has been prepared with task_set().
 *
 * The work will be run in series with any other work dispatched by
 * ifq_start(), ifq_restart(), or other ifq_serialize() calls.
 *
 * Because the work may be run on another CPU, the lifetime of the
 * task and the work it represents can extend beyond the end of the
 * call to ifq_serialize() that dispatched it.
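 *
 * A minimal sketch, assuming the driver keeps a prepared task in its
 * softc (sc_ttick and drv_tick are hypothetical names):
 *
 *	task_set(&sc->sc_ttick, drv_tick, sc);	// once, during attach
 *	...
 *	ifq_serialize(ifq, &sc->sc_ttick);	// dispatch the work later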
 *
 *
 * = ifqueue work serialisation
 *
 * ifqueues provide a mechanism to dispatch work to be run in a single
 * context. Work in this mechanism is represented by task structures.
 *
 * The tasks are run in a context similar to a taskq serviced by a
 * single kernel thread, except the work is run immediately by the
 * first CPU that dispatches work. If a second CPU attempts to dispatch
 * additional tasks while the first is still running, those tasks will
 * be queued to be run by the first CPU, and the second CPU will return
 * immediately.
 *
 * = MP Safe Network Drivers
 *
 * An MP safe network driver is one in which its start routine can be
 * called by the network stack without holding the big kernel lock.
 *
 * == Attach
 *
 * A driver advertises its ability to run its start routine without
 * the kernel lock by setting the IFXF_MPSAFE flag in ifp->if_xflags
 * before calling if_attach(). Advertising an MPSAFE start routine
 * also implies that the driver understands that a network card can
 * have multiple rings or transmit queues, and therefore provides an
 * if_qstart function (which takes an ifqueue pointer) instead of an
 * if_start function (which takes an ifnet pointer).
 *
 * If the hardware supports multiple transmit rings, it advertises
 * support for multiple rings to the network stack with if_attach_queues()
 * after the call to if_attach(). if_attach_queues() allocates a struct
 * ifqueue for each hardware ring, which can then be initialised by
 * the driver with data for each ring.
 *
 *	void	drv_start(struct ifqueue *);
 *
 *	void
 *	drv_attach()
 *	{
 *	...
 *		ifp->if_xflags = IFXF_MPSAFE;
 *		ifp->if_qstart = drv_start;
 *		if_attach(ifp);
 *
 *		if_attach_queues(ifp, DRV_NUM_TX_RINGS);
 *		for (i = 0; i < DRV_NUM_TX_RINGS; i++) {
 *			struct ifqueue *ifq = ifp->if_ifqs[i];
 *			struct drv_tx_ring *ring = &sc->sc_tx_rings[i];
 *
 *			ifq->ifq_softc = ring;
 *			ring->ifq = ifq;
 *		}
 *	}
 *
 * The network stack will then call ifp->if_qstart via ifq_start()
 * to guarantee there is only one instance of that function running
 * for each ifq in the system, and to serialise it with other work
 * the driver may provide.
 *
 * == Initialise
 *
 * When the stack requests an interface be brought up (ie, drv_ioctl()
 * is called to handle SIOCSIFFLAGS with IFF_UP set in ifp->if_flags)
 * drivers should set IFF_RUNNING in ifp->if_flags, and then call
 * ifq_clr_oactive() against each ifq.
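 *
 * A minimal sketch of that part of the ioctl path (sc_num_queues is
 * a hypothetical name):
 *
 *	SET(ifp->if_flags, IFF_RUNNING);
 *
 *	for (i = 0; i < sc->sc_num_queues; i++)
 *		ifq_clr_oactive(ifp->if_ifqs[i]);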
 *
 * == if_start
 *
 * ifq_start() checks that IFF_RUNNING is set in ifp->if_flags, that
 * ifq_is_oactive() does not return true, and, via a call to ifq_len(),
 * that there are pending packets to transmit. Therefore, drivers are
 * no longer responsible for doing this themselves.
 *
 * If a driver should not transmit packets while its link is down, use
 * ifq_purge() to flush pending packets from the transmit queue.
 *
 * Drivers for hardware should use the following pattern to transmit
 * packets:
 *
 *	void
 *	drv_start(struct ifqueue *ifq)
 *	{
 *		struct drv_tx_ring *ring = ifq->ifq_softc;
 *		struct ifnet *ifp = ifq->ifq_if;
 *		struct drv_softc *sc = ifp->if_softc;
 *		struct mbuf *m;
 *		int kick = 0;
 *
 *		if (NO_LINK) {
 *			ifq_purge(ifq);
 *			return;
 *		}
 *
 *		for (;;) {
 *			if (NO_SPACE(ring)) {
 *				ifq_set_oactive(ifq);
 *				break;
 *			}
 *
 *			m = ifq_dequeue(ifq);
 *			if (m == NULL)
 *				break;
 *
 *			if (drv_encap(sc, ring, m) != 0) { // map and fill ring
 *				m_freem(m);
 *				continue;
 *			}
 *
 *			if (ifp->if_bpf)
 *				bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
 *			kick = 1;
 *		}
 *
 *		if (kick)
 *			drv_kick(ring); // notify hw of new ring descriptors
 *	}
 *
 * == Transmission completion
 *
 * The following pattern should be used for transmit queue interrupt
 * processing:
 *
 *	void
 *	drv_txeof(struct drv_tx_ring *ring)
 *	{
 *		struct ifqueue *ifq = ring->ifq;
 *
 *		while (COMPLETED_PKTS(ring)) {
 *			// unmap packets, m_freem() the mbufs.
 *		}
 *
 *		if (ifq_is_oactive(ifq))
 *			ifq_restart(ifq);
 *	}
 *
 * == Stop
 *
 * Bringing an interface down (ie, clearing IFF_UP in ifp->if_flags)
 * should clear IFF_RUNNING in ifp->if_flags, and guarantee the start
 * routine is not running before freeing any resources it uses:
 *
 *	void
 *	drv_down(struct drv_softc *sc)
 *	{
 *		struct ifnet *ifp = &sc->sc_if;
 *		struct ifqueue *ifq;
 *		int i;
 *
 *		CLR(ifp->if_flags, IFF_RUNNING);
 *		DISABLE_INTERRUPTS();
 *
 *		for (i = 0; i < sc->sc_num_queues; i++) {
 *			ifq = ifp->if_ifqs[i];
 *			ifq_barrier(ifq);
 *		}
 *
 *		intr_barrier(sc->sc_ih);
 *
 *		FREE_RESOURCES();
 *
 *		for (i = 0; i < sc->sc_num_queues; i++) {
 *			ifq = ifp->if_ifqs[i];
 *			ifq_clr_oactive(ifq);
 *		}
 *	}
 *
 */

struct ifq_ops {
	unsigned int		 (*ifqop_idx)(unsigned int,
				    const struct mbuf *);
	struct mbuf		*(*ifqop_enq)(struct ifqueue *, struct mbuf *);
	struct mbuf		*(*ifqop_deq_begin)(struct ifqueue *, void **);
	void			 (*ifqop_deq_commit)(struct ifqueue *,
				    struct mbuf *, void *);
	void			 (*ifqop_purge)(struct ifqueue *,
				    struct mbuf_list *);
	void			*(*ifqop_alloc)(unsigned int, void *);
	void			 (*ifqop_free)(unsigned int, void *);
};
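
/*
 * A hedged sketch of how the ifqop_deq_begin/ifqop_deq_commit pair is
 * expected to behave, assuming a plain FIFO backend whose ifq_q points
 * at a struct mbuf_list (the fifo_* names are hypothetical; priq in
 * ifq.c is the reference implementation): deq_begin peeks at the next
 * mbuf and hands back a cookie, deq_commit removes the committed mbuf.
 *
 *	struct mbuf *
 *	fifo_deq_begin(struct ifqueue *ifq, void **cookiep)
 *	{
 *		struct mbuf_list *ml = ifq->ifq_q;
 *		struct mbuf *m;
 *
 *		m = MBUF_LIST_FIRST(ml);	// peek, do not remove yet
 *		if (m != NULL)
 *			*cookiep = ml;
 *
 *		return (m);
 *	}
 *
 *	void
 *	fifo_deq_commit(struct ifqueue *ifq, struct mbuf *m, void *cookie)
 *	{
 *		struct mbuf_list *ml = cookie;
 *
 *		KASSERT(MBUF_LIST_FIRST(ml) == m);
 *		ml_dequeue(ml);			// now actually remove it
 *	}
 */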

extern const struct ifq_ops * const ifq_priq_ops;

/*
 * Interface send queues.
 */

void		 ifq_init(struct ifqueue *, struct ifnet *, unsigned int);
void		 ifq_attach(struct ifqueue *, const struct ifq_ops *, void *);
void		 ifq_destroy(struct ifqueue *);
void		 ifq_add_data(struct ifqueue *, struct if_data *);
int		 ifq_enqueue(struct ifqueue *, struct mbuf *);
void		 ifq_start(struct ifqueue *);
struct mbuf	*ifq_deq_begin(struct ifqueue *);
void		 ifq_deq_commit(struct ifqueue *, struct mbuf *);
void		 ifq_deq_rollback(struct ifqueue *, struct mbuf *);
struct mbuf	*ifq_dequeue(struct ifqueue *);
int		 ifq_hdatalen(struct ifqueue *);
void		 ifq_init_maxlen(struct ifqueue *, unsigned int);
void		 ifq_mfreem(struct ifqueue *, struct mbuf *);
void		 ifq_mfreeml(struct ifqueue *, struct mbuf_list *);
unsigned int	 ifq_purge(struct ifqueue *);
void		*ifq_q_enter(struct ifqueue *, const struct ifq_ops *);
void		 ifq_q_leave(struct ifqueue *, void *);
void		 ifq_serialize(struct ifqueue *, struct task *);
void		 ifq_barrier(struct ifqueue *);
void		 ifq_set_oactive(struct ifqueue *);
void		 ifq_deq_set_oactive(struct ifqueue *);

int		 ifq_deq_sleep(struct ifqueue *, struct mbuf **, int, int,
		     const char *, volatile unsigned int *,
		     volatile unsigned int *);

#define ifq_len(_ifq)		READ_ONCE((_ifq)->ifq_len)
#define ifq_empty(_ifq)		(ifq_len(_ifq) == 0)

static inline int
ifq_is_priq(struct ifqueue *ifq)
{
	return (ifq->ifq_ops == ifq_priq_ops);
}

static inline void
ifq_clr_oactive(struct ifqueue *ifq)
{
	ifq->ifq_oactive = 0;
}

static inline unsigned int
ifq_is_oactive(struct ifqueue *ifq)
{
	return (ifq->ifq_oactive);
}

static inline void
ifq_restart(struct ifqueue *ifq)
{
	ifq_serialize(ifq, &ifq->ifq_restart);
}

static inline unsigned int
ifq_idx(struct ifqueue *ifq, unsigned int nifqs, const struct mbuf *m)
{
	return ((*ifq->ifq_ops->ifqop_idx)(nifqs, m));
}

/* ifiq */

void		 ifiq_init(struct ifiqueue *, struct ifnet *, unsigned int);
void		 ifiq_destroy(struct ifiqueue *);
int		 ifiq_input(struct ifiqueue *, struct mbuf_list *);
int		 ifiq_enqueue(struct ifiqueue *, struct mbuf *);
void		 ifiq_add_data(struct ifiqueue *, struct if_data *);

#define ifiq_len(_ifiq)		READ_ONCE(ml_len(&(_ifiq)->ifiq_ml))
#define ifiq_empty(_ifiq)	(ifiq_len(_ifiq) == 0)
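
/*
 * A hedged sketch of an rx interrupt path feeding one ifiq with a
 * list of packets (drv_rxeof, DRV_RX_PKT, and DRV_SLOW_RING are
 * hypothetical names); a nonzero return from ifiq_input() indicates
 * the stack is backing up and the driver may want to slow the ring:
 *
 *	void
 *	drv_rxeof(struct drv_rx_ring *ring)
 *	{
 *		struct mbuf_list ml = MBUF_LIST_INITIALIZER();
 *		struct mbuf *m;
 *
 *		while ((m = DRV_RX_PKT(ring)) != NULL)
 *			ml_enqueue(&ml, m);
 *
 *		if (ifiq_input(ring->ifiq, &ml))
 *			DRV_SLOW_RING(ring);
 *	}
 */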

#endif /* _KERNEL */

#endif /* _NET_IFQ_H_ */