xref: /freebsd/sys/net/iflib.c (revision 0e6acb26)
1 /*-
2  * Copyright (c) 2014-2017, Matthew Macy <mmacy@nextbsd.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *  1. Redistributions of source code must retain the above copyright notice,
9  *     this list of conditions and the following disclaimer.
10  *
11  *  2. Neither the name of Matthew Macy nor the names of its
12  *     contributors may be used to endorse or promote products derived from
13  *     this software without specific prior written permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25  * POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 #include "opt_acpi.h"
34 
35 #include <sys/param.h>
36 #include <sys/types.h>
37 #include <sys/bus.h>
38 #include <sys/eventhandler.h>
39 #include <sys/sockio.h>
40 #include <sys/kernel.h>
41 #include <sys/lock.h>
42 #include <sys/mutex.h>
43 #include <sys/module.h>
44 #include <sys/kobj.h>
45 #include <sys/rman.h>
46 #include <sys/sbuf.h>
47 #include <sys/smp.h>
48 #include <sys/socket.h>
49 #include <sys/sysctl.h>
50 #include <sys/syslog.h>
51 #include <sys/taskqueue.h>
52 #include <sys/limits.h>
53 
54 
55 #include <net/if.h>
56 #include <net/if_var.h>
57 #include <net/if_types.h>
58 #include <net/if_media.h>
59 #include <net/bpf.h>
60 #include <net/ethernet.h>
61 #include <net/mp_ring.h>
62 
63 #include <netinet/in.h>
64 #include <netinet/in_pcb.h>
65 #include <netinet/tcp_lro.h>
66 #include <netinet/in_systm.h>
67 #include <netinet/if_ether.h>
68 #include <netinet/ip.h>
69 #include <netinet/ip6.h>
70 #include <netinet/tcp.h>
71 
72 #include <machine/bus.h>
73 #include <machine/in_cksum.h>
74 
75 #include <vm/vm.h>
76 #include <vm/pmap.h>
77 
78 #include <dev/led/led.h>
79 #include <dev/pci/pcireg.h>
80 #include <dev/pci/pcivar.h>
81 #include <dev/pci/pci_private.h>
82 
83 #include <net/iflib.h>
84 
85 #include "ifdi_if.h"
86 
87 #if defined(__i386__) || defined(__amd64__)
88 #include <sys/memdesc.h>
89 #include <machine/bus.h>
90 #include <machine/md_var.h>
91 #include <machine/specialreg.h>
92 #include <x86/include/busdma_impl.h>
93 #include <x86/iommu/busdma_dmar.h>
94 #endif
95 
96 /*
97  * Enable accounting of every mbuf as it comes into and goes out of
98  * iflib's software descriptor references
99  */
100 #define MEMORY_LOGGING 0
101 /*
102  * Enable mbuf vectors for compressing long mbuf chains
103  */
104 
105 /*
106  * NB:
107  * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead
108  *   we prefetch needs to be determined by the time spent in m_free vis-à-vis
109  *   the cost of a prefetch. This will of course vary based on the workload:
110  *      - NFLX's m_free path is dominated by vm-based M_EXT manipulation, which
111  *        is quite expensive, thus suggesting very little prefetch.
112  *      - small packet forwarding, which just returns a single mbuf to
113  *        UMA, will typically be very fast vis-à-vis the cost of a memory
114  *        access.
115  */
116 
117 
118 /*
119  * File organization:
120  *  - private structures
121  *  - iflib private utility functions
122  *  - ifnet functions
123  *  - vlan registry and other exported functions
124  *  - iflib public core functions
125  *
126  *
127  */
128 static MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library");
129 
130 struct iflib_txq;
131 typedef struct iflib_txq *iflib_txq_t;
132 struct iflib_rxq;
133 typedef struct iflib_rxq *iflib_rxq_t;
134 struct iflib_fl;
135 typedef struct iflib_fl *iflib_fl_t;
136 
137 struct iflib_ctx;
138 
139 typedef struct iflib_filter_info {
140 	driver_filter_t *ifi_filter;
141 	void *ifi_filter_arg;
142 	struct grouptask *ifi_task;
143 	void *ifi_ctx;
144 } *iflib_filter_info_t;
145 
146 struct iflib_ctx {
147 	KOBJ_FIELDS;
148 	/*
149 	 * Pointer to hardware driver's softc
150 	 */
151 	void *ifc_softc;
152 	device_t ifc_dev;
153 	if_t ifc_ifp;
154 
155 	cpuset_t ifc_cpus;
156 	if_shared_ctx_t ifc_sctx;
157 	struct if_softc_ctx ifc_softc_ctx;
158 
159 	struct mtx ifc_mtx;
160 
161 	uint16_t ifc_nhwtxqs;
162 	uint16_t ifc_nhwrxqs;
163 
164 	iflib_txq_t ifc_txqs;
165 	iflib_rxq_t ifc_rxqs;
166 	uint32_t ifc_if_flags;
167 	uint32_t ifc_flags;
168 	uint32_t ifc_max_fl_buf_size;
169 	int ifc_in_detach;
170 
171 	int ifc_link_state;
172 	int ifc_link_irq;
173 	int ifc_pause_frames;
174 	int ifc_watchdog_events;
175 	struct cdev *ifc_led_dev;
176 	struct resource *ifc_msix_mem;
177 
178 	struct if_irq ifc_legacy_irq;
179 	struct grouptask ifc_admin_task;
180 	struct grouptask ifc_vflr_task;
181 	struct iflib_filter_info ifc_filter_info;
182 	struct ifmedia	ifc_media;
183 
184 	struct sysctl_oid *ifc_sysctl_node;
185 	uint16_t ifc_sysctl_ntxqs;
186 	uint16_t ifc_sysctl_nrxqs;
187 	uint16_t ifc_sysctl_qs_eq_override;
188 
189 	qidx_t ifc_sysctl_ntxds[8];
190 	qidx_t ifc_sysctl_nrxds[8];
191 	struct if_txrx ifc_txrx;
192 #define isc_txd_encap  ifc_txrx.ift_txd_encap
193 #define isc_txd_flush  ifc_txrx.ift_txd_flush
194 #define isc_txd_credits_update  ifc_txrx.ift_txd_credits_update
195 #define isc_rxd_available ifc_txrx.ift_rxd_available
196 #define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get
197 #define isc_rxd_refill ifc_txrx.ift_rxd_refill
198 #define isc_rxd_flush ifc_txrx.ift_rxd_flush
201 #define isc_legacy_intr ifc_txrx.ift_legacy_intr
202 	eventhandler_tag ifc_vlan_attach_event;
203 	eventhandler_tag ifc_vlan_detach_event;
204 	uint8_t ifc_mac[ETHER_ADDR_LEN];
205 	char ifc_mtx_name[16];
206 };
207 
208 
209 void *
210 iflib_get_softc(if_ctx_t ctx)
211 {
212 
213 	return (ctx->ifc_softc);
214 }
215 
216 device_t
217 iflib_get_dev(if_ctx_t ctx)
218 {
219 
220 	return (ctx->ifc_dev);
221 }
222 
223 if_t
224 iflib_get_ifp(if_ctx_t ctx)
225 {
226 
227 	return (ctx->ifc_ifp);
228 }
229 
230 struct ifmedia *
231 iflib_get_media(if_ctx_t ctx)
232 {
233 
234 	return (&ctx->ifc_media);
235 }
236 
237 void
238 iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN])
239 {
240 
241 	bcopy(mac, ctx->ifc_mac, ETHER_ADDR_LEN);
242 }
243 
244 if_softc_ctx_t
245 iflib_get_softc_ctx(if_ctx_t ctx)
246 {
247 
248 	return (&ctx->ifc_softc_ctx);
249 }
250 
251 if_shared_ctx_t
252 iflib_get_sctx(if_ctx_t ctx)
253 {
254 
255 	return (ctx->ifc_sctx);
256 }
257 
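/*
 * IP_ALIGNED() is true when the mbuf's data pointer sits two bytes into a
 * 32-bit word (the ETHER_ALIGN offset), which leaves the IP header 4-byte
 * aligned after the 14-byte Ethernet header.
 */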
258 #define IP_ALIGNED(m) ((((uintptr_t)(m)->m_data) & 0x3) == 0x2)
259 #define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*))
260 #define CACHE_PTR_NEXT(ptr) ((void *)(((uintptr_t)(ptr)+CACHE_LINE_SIZE-1) & ~(CACHE_LINE_SIZE-1)))
261 
262 #define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP)
263 #define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
264 
265 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
266 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
267 #define RX_SW_DESC_INUSE        (1 << 3)
268 #define TX_SW_DESC_MAPPED       (1 << 4)
269 
270 typedef struct iflib_sw_rx_desc_array {
271 	bus_dmamap_t	*ifsd_map;         /* bus_dma maps for packet */
272 	struct mbuf	**ifsd_m;           /* pkthdr mbufs */
273 	caddr_t		*ifsd_cl;          /* direct cluster pointer for rx */
274 	uint8_t		*ifsd_flags;
275 } iflib_rxsd_array_t;
276 
277 typedef struct iflib_sw_tx_desc_array {
278 	bus_dmamap_t    *ifsd_map;         /* bus_dma maps for packet */
279 	struct mbuf    **ifsd_m;           /* pkthdr mbufs */
280 	uint8_t		*ifsd_flags;
281 } if_txsd_vec_t;
282 
283 
284 /* magic number that should be high enough for any hardware */
285 #define IFLIB_MAX_TX_SEGS		128
286 #define IFLIB_MAX_RX_SEGS		32
287 #define IFLIB_RX_COPY_THRESH		128
288 #define IFLIB_MAX_RX_REFRESH		32
289 /* The minimum number of descriptors per second before we start coalescing */
290 #define IFLIB_MIN_DESC_SEC		16384
291 #define IFLIB_DEFAULT_TX_UPDATE_FREQ	16
292 #define IFLIB_QUEUE_IDLE		0
293 #define IFLIB_QUEUE_HUNG		1
294 #define IFLIB_QUEUE_WORKING		2
295 /* maximum number of txqs that can share an rx interrupt */
296 #define IFLIB_MAX_TX_SHARED_INTR	4
297 
298 /* this should really scale with ring size - this is a fairly arbitrary value */
299 #define TX_BATCH_SIZE			32
300 
301 #define IFLIB_RESTART_BUDGET		8
302 
303 #define	IFC_LEGACY		0x001
304 #define	IFC_QFLUSH		0x002
305 #define	IFC_MULTISEG		0x004
306 #define	IFC_DMAR		0x008
307 #define	IFC_SC_ALLOCATED	0x010
308 #define	IFC_INIT_DONE		0x020
309 #define	IFC_PREFETCH		0x040
310 #define	IFC_DO_RESET		0x080
311 #define	IFC_CHECK_HUNG		0x100
312 
313 #define CSUM_OFFLOAD		(CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \
314 				 CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \
315 				 CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP)
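/*
 * Per-queue transmit state: producer/consumer indices into the hardware
 * descriptor ring, software descriptor arrays, the mp_ring used to stage
 * mbufs from the stack, and per-queue statistics.
 */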
316 struct iflib_txq {
317 	qidx_t		ift_in_use;
318 	qidx_t		ift_cidx;
319 	qidx_t		ift_cidx_processed;
320 	qidx_t		ift_pidx;
321 	uint8_t		ift_gen;
322 	uint8_t		ift_br_offset;
323 	uint16_t	ift_npending;
324 	uint16_t	ift_db_pending;
325 	uint16_t	ift_rs_pending;
326 	/* implicit pad */
327 	uint8_t		ift_txd_size[8];
328 	uint64_t	ift_processed;
329 	uint64_t	ift_cleaned;
330 	uint64_t	ift_cleaned_prev;
331 #if MEMORY_LOGGING
332 	uint64_t	ift_enqueued;
333 	uint64_t	ift_dequeued;
334 #endif
335 	uint64_t	ift_no_tx_dma_setup;
336 	uint64_t	ift_no_desc_avail;
337 	uint64_t	ift_mbuf_defrag_failed;
338 	uint64_t	ift_mbuf_defrag;
339 	uint64_t	ift_map_failed;
340 	uint64_t	ift_txd_encap_efbig;
341 	uint64_t	ift_pullups;
342 
343 	struct mtx	ift_mtx;
344 	struct mtx	ift_db_mtx;
345 
346 	/* constant values */
347 	if_ctx_t	ift_ctx;
348 	struct ifmp_ring        *ift_br;
349 	struct grouptask	ift_task;
350 	qidx_t		ift_size;
351 	uint16_t	ift_id;
352 	struct callout	ift_timer;
353 
354 	if_txsd_vec_t	ift_sds;
355 	uint8_t		ift_qstatus;
356 	uint8_t		ift_closed;
357 	uint8_t		ift_update_freq;
358 	struct iflib_filter_info ift_filter_info;
359 	bus_dma_tag_t		ift_desc_tag;
360 	bus_dma_tag_t		ift_tso_desc_tag;
361 	iflib_dma_info_t	ift_ifdi;
362 #define MTX_NAME_LEN 16
363 	char                    ift_mtx_name[MTX_NAME_LEN];
364 	char                    ift_db_mtx_name[MTX_NAME_LEN];
365 	bus_dma_segment_t	ift_segs[IFLIB_MAX_TX_SEGS]  __aligned(CACHE_LINE_SIZE);
366 #ifdef IFLIB_DIAGNOSTICS
367 	uint64_t ift_cpu_exec_count[256];
368 #endif
369 } __aligned(CACHE_LINE_SIZE);
370 
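/*
 * A free list tracks the receive buffers (mbufs and clusters) posted to the
 * hardware for a single buffer size; each rxq owns one or more free lists.
 */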
371 struct iflib_fl {
372 	qidx_t		ifl_cidx;
373 	qidx_t		ifl_pidx;
374 	qidx_t		ifl_credits;
375 	uint8_t		ifl_gen;
376 	uint8_t		ifl_rxd_size;
377 #if MEMORY_LOGGING
378 	uint64_t	ifl_m_enqueued;
379 	uint64_t	ifl_m_dequeued;
380 	uint64_t	ifl_cl_enqueued;
381 	uint64_t	ifl_cl_dequeued;
382 #endif
383 	/* implicit pad */
384 
385 	/* constant */
386 	qidx_t		ifl_size;
387 	uint16_t	ifl_buf_size;
388 	uint16_t	ifl_cltype;
389 	uma_zone_t	ifl_zone;
390 	iflib_rxsd_array_t	ifl_sds;
391 	iflib_rxq_t	ifl_rxq;
392 	uint8_t		ifl_id;
393 	bus_dma_tag_t           ifl_desc_tag;
394 	iflib_dma_info_t	ifl_ifdi;
395 	uint64_t	ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE);
396 	caddr_t		ifl_vm_addrs[IFLIB_MAX_RX_REFRESH];
397 	qidx_t	ifl_rxd_idxs[IFLIB_MAX_RX_REFRESH];
398 }  __aligned(CACHE_LINE_SIZE);
399 
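/*
 * Number of descriptors outstanding between the consumer index (cidx) and
 * the producer index (pidx).  Since both indices wrap, pidx == cidx is
 * ambiguous; the generation bit distinguishes an empty ring (gen == 0) from
 * a full one (gen == 1).  E.g. with size 1024, cidx 1000 and pidx 8 the ring
 * holds 1024 - 1000 + 8 = 32 in-use descriptors.
 */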
400 static inline qidx_t
401 get_inuse(int size, qidx_t cidx, qidx_t pidx, uint8_t gen)
402 {
403 	qidx_t used;
404 
405 	if (pidx > cidx)
406 		used = pidx - cidx;
407 	else if (pidx < cidx)
408 		used = size - cidx + pidx;
409 	else if (gen == 0 && pidx == cidx)
410 		used = 0;
411 	else if (gen == 1 && pidx == cidx)
412 		used = size;
413 	else
414 		panic("bad state");
415 
416 	return (used);
417 }
418 
419 #define TXQ_AVAIL(txq) (txq->ift_size - get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen))
420 
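/* number of slots from tail to head on a ring of "wrap" entries, handling wraparound */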
421 #define IDXDIFF(head, tail, wrap) \
422 	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
423 
424 struct iflib_rxq {
425 	/* If there is a separate completion queue -
426 	 * these are the cq cidx and pidx. Otherwise
427 	 * these are unused.
428 	 */
429 	qidx_t		ifr_size;
430 	qidx_t		ifr_cq_cidx;
431 	qidx_t		ifr_cq_pidx;
432 	uint8_t		ifr_cq_gen;
433 	uint8_t		ifr_fl_offset;
434 
435 	if_ctx_t	ifr_ctx;
436 	iflib_fl_t	ifr_fl;
437 	uint64_t	ifr_rx_irq;
438 	uint16_t	ifr_id;
439 	uint8_t		ifr_lro_enabled;
440 	uint8_t		ifr_nfl;
441 	uint8_t		ifr_ntxqirq;
442 	uint8_t		ifr_txqid[IFLIB_MAX_TX_SHARED_INTR];
443 	struct lro_ctrl			ifr_lc;
444 	struct grouptask        ifr_task;
445 	struct iflib_filter_info ifr_filter_info;
446 	iflib_dma_info_t		ifr_ifdi;
447 
448 	/* dynamically allocate if any drivers need a value substantially larger than this */
449 	struct if_rxd_frag	ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE);
450 #ifdef IFLIB_DIAGNOSTICS
451 	uint64_t ifr_cpu_exec_count[256];
452 #endif
453 }  __aligned(CACHE_LINE_SIZE);
454 
455 typedef struct if_rxsd {
456 	caddr_t *ifsd_cl;
457 	struct mbuf **ifsd_m;
458 	iflib_fl_t ifsd_fl;
459 	qidx_t ifsd_cidx;
460 } *if_rxsd_t;
461 
462 /* multiple of word size */
463 #ifdef __LP64__
464 #define PKT_INFO_SIZE	6
465 #define RXD_INFO_SIZE	5
466 #define PKT_TYPE uint64_t
467 #else
468 #define PKT_INFO_SIZE	11
469 #define RXD_INFO_SIZE	8
470 #define PKT_TYPE uint32_t
471 #endif
472 #define PKT_LOOP_BOUND  ((PKT_INFO_SIZE/3)*3)
473 #define RXD_LOOP_BOUND  ((RXD_INFO_SIZE/4)*4)
474 
475 typedef struct if_pkt_info_pad {
476 	PKT_TYPE pkt_val[PKT_INFO_SIZE];
477 } *if_pkt_info_pad_t;
478 typedef struct if_rxd_info_pad {
479 	PKT_TYPE rxd_val[RXD_INFO_SIZE];
480 } *if_rxd_info_pad_t;
481 
482 CTASSERT(sizeof(struct if_pkt_info_pad) == sizeof(struct if_pkt_info));
483 CTASSERT(sizeof(struct if_rxd_info_pad) == sizeof(struct if_rxd_info));
484 
485 
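/*
 * The *_pad overlays above let pkt_info_zero() and rxd_info_zero() clear the
 * structures with a handful of word-sized stores instead of a bzero() call;
 * the CTASSERTs guarantee that the overlays and the real structures stay the
 * same size.
 */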
486 static inline void
487 pkt_info_zero(if_pkt_info_t pi)
488 {
489 	if_pkt_info_pad_t pi_pad;
490 
491 	pi_pad = (if_pkt_info_pad_t)pi;
492 	pi_pad->pkt_val[0] = 0; pi_pad->pkt_val[1] = 0; pi_pad->pkt_val[2] = 0;
493 	pi_pad->pkt_val[3] = 0; pi_pad->pkt_val[4] = 0; pi_pad->pkt_val[5] = 0;
494 #ifndef __LP64__
495 	pi_pad->pkt_val[6] = 0; pi_pad->pkt_val[7] = 0; pi_pad->pkt_val[8] = 0;
496 	pi_pad->pkt_val[9] = 0; pi_pad->pkt_val[10] = 0;
497 #endif
498 }
499 
500 static inline void
501 rxd_info_zero(if_rxd_info_t ri)
502 {
503 	if_rxd_info_pad_t ri_pad;
504 	int i;
505 
506 	ri_pad = (if_rxd_info_pad_t)ri;
507 	for (i = 0; i < RXD_LOOP_BOUND; i += 4) {
508 		ri_pad->rxd_val[i] = 0;
509 		ri_pad->rxd_val[i+1] = 0;
510 		ri_pad->rxd_val[i+2] = 0;
511 		ri_pad->rxd_val[i+3] = 0;
512 	}
513 #ifdef __LP64__
514 	ri_pad->rxd_val[RXD_INFO_SIZE-1] = 0;
515 #endif
516 }
517 
518 /*
519  * Only allow a single packet to take up at most 1/nth of the tx ring
520  */
521 #define MAX_SINGLE_PACKET_FRACTION 12
522 #define IF_BAD_DMA (bus_addr_t)-1
523 
524 #define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING))
525 
526 #define CTX_LOCK_INIT(_sc, _name)  mtx_init(&(_sc)->ifc_mtx, _name, "iflib ctx lock", MTX_DEF)
527 
528 #define CTX_LOCK(ctx) mtx_lock(&(ctx)->ifc_mtx)
529 #define CTX_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_mtx)
530 #define CTX_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_mtx)
531 
532 
533 #define CALLOUT_LOCK(txq)	mtx_lock(&txq->ift_mtx)
534 #define CALLOUT_UNLOCK(txq) 	mtx_unlock(&txq->ift_mtx)
535 
536 
537 /* Our boot-time initialization hook */
538 static int	iflib_module_event_handler(module_t, int, void *);
539 
540 static moduledata_t iflib_moduledata = {
541 	"iflib",
542 	iflib_module_event_handler,
543 	NULL
544 };
545 
546 DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY);
547 MODULE_VERSION(iflib, 1);
548 
549 MODULE_DEPEND(iflib, pci, 1, 1, 1);
550 MODULE_DEPEND(iflib, ether, 1, 1, 1);
551 
552 TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
553 TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
554 
555 #ifndef IFLIB_DEBUG_COUNTERS
556 #ifdef INVARIANTS
557 #define IFLIB_DEBUG_COUNTERS 1
558 #else
559 #define IFLIB_DEBUG_COUNTERS 0
560 #endif /* !INVARIANTS */
561 #endif
562 
563 static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD, 0,
564                    "iflib driver parameters");
565 
566 /*
567  * XXX need to ensure that this can't accidentally cause the head to be moved backwards
568  */
569 static int iflib_min_tx_latency = 0;
570 SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW,
571 		   &iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput");
572 static int iflib_no_tx_batch = 0;
573 SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW,
574 		   &iflib_no_tx_batch, 0, "avoid transmit batching at the possible expense of throughput");
575 
576 
577 #if IFLIB_DEBUG_COUNTERS
578 
579 static int iflib_tx_seen;
580 static int iflib_tx_sent;
581 static int iflib_tx_encap;
582 static int iflib_rx_allocs;
583 static int iflib_fl_refills;
584 static int iflib_fl_refills_large;
585 static int iflib_tx_frees;
586 
587 SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD,
588 		   &iflib_tx_seen, 0, "# tx mbufs seen");
589 SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD,
590 		   &iflib_tx_sent, 0, "# tx mbufs sent");
591 SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD,
592 		   &iflib_tx_encap, 0, "# tx mbufs encapped");
593 SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD,
594 		   &iflib_tx_frees, 0, "# tx frees");
595 SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD,
596 		   &iflib_rx_allocs, 0, "# rx allocations");
597 SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD,
598 		   &iflib_fl_refills, 0, "# refills");
599 SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD,
600 		   &iflib_fl_refills_large, 0, "# large refills");
601 
602 
603 static int iflib_txq_drain_flushing;
604 static int iflib_txq_drain_oactive;
605 static int iflib_txq_drain_notready;
606 static int iflib_txq_drain_encapfail;
607 
608 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD,
609 		   &iflib_txq_drain_flushing, 0, "# drain flushes");
610 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD,
611 		   &iflib_txq_drain_oactive, 0, "# drain oactives");
612 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD,
613 		   &iflib_txq_drain_notready, 0, "# drain notready");
614 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_encapfail, CTLFLAG_RD,
615 		   &iflib_txq_drain_encapfail, 0, "# drain encap fails");
616 
617 
618 static int iflib_encap_load_mbuf_fail;
619 static int iflib_encap_txq_avail_fail;
620 static int iflib_encap_txd_encap_fail;
621 
622 SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD,
623 		   &iflib_encap_load_mbuf_fail, 0, "# busdma load failures");
624 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD,
625 		   &iflib_encap_txq_avail_fail, 0, "# txq avail failures");
626 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD,
627 		   &iflib_encap_txd_encap_fail, 0, "# driver encap failures");
628 
629 static int iflib_task_fn_rxs;
630 static int iflib_rx_intr_enables;
631 static int iflib_fast_intrs;
632 static int iflib_intr_link;
633 static int iflib_intr_msix;
634 static int iflib_rx_unavail;
635 static int iflib_rx_ctx_inactive;
636 static int iflib_rx_zero_len;
637 static int iflib_rx_if_input;
638 static int iflib_rx_mbuf_null;
639 static int iflib_rxd_flush;
640 
641 static int iflib_verbose_debug;
642 
643 SYSCTL_INT(_net_iflib, OID_AUTO, intr_link, CTLFLAG_RD,
644 		   &iflib_intr_link, 0, "# intr link calls");
645 SYSCTL_INT(_net_iflib, OID_AUTO, intr_msix, CTLFLAG_RD,
646 		   &iflib_intr_msix, 0, "# intr msix calls");
647 SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD,
648 		   &iflib_task_fn_rxs, 0, "# task_fn_rx calls");
649 SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD,
650 		   &iflib_rx_intr_enables, 0, "# rx intr enables");
651 SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD,
652 		   &iflib_fast_intrs, 0, "# fast_intr calls");
653 SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD,
654 		   &iflib_rx_unavail, 0, "# times rxeof called with no available data");
655 SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD,
656 		   &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context");
657 SYSCTL_INT(_net_iflib, OID_AUTO, rx_zero_len, CTLFLAG_RD,
658 		   &iflib_rx_zero_len, 0, "# times rxeof saw zero len mbuf");
659 SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD,
660 		   &iflib_rx_if_input, 0, "# times rxeof called if_input");
661 SYSCTL_INT(_net_iflib, OID_AUTO, rx_mbuf_null, CTLFLAG_RD,
662 		   &iflib_rx_mbuf_null, 0, "# times rxeof got null mbuf");
663 SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD,
664 	         &iflib_rxd_flush, 0, "# times rxd_flush called");
665 SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
666 		   &iflib_verbose_debug, 0, "enable verbose debugging");
667 
668 #define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1)
669 static void
670 iflib_debug_reset(void)
671 {
672 	iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs =
673 		iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees =
674 		iflib_txq_drain_flushing = iflib_txq_drain_oactive =
675 		iflib_txq_drain_notready = iflib_txq_drain_encapfail =
676 		iflib_encap_load_mbuf_fail = iflib_encap_txq_avail_fail =
677 		iflib_encap_txd_encap_fail = iflib_task_fn_rxs = iflib_rx_intr_enables =
678 		iflib_fast_intrs = iflib_intr_link = iflib_intr_msix = iflib_rx_unavail =
679 		iflib_rx_ctx_inactive = iflib_rx_zero_len = iflib_rx_if_input =
680 		iflib_rx_mbuf_null = iflib_rxd_flush = 0;
681 }
682 
683 #else
684 #define DBG_COUNTER_INC(name)
685 static void iflib_debug_reset(void) {}
686 #endif
687 
688 
689 
690 #define IFLIB_DEBUG 0
691 
692 static void iflib_tx_structures_free(if_ctx_t ctx);
693 static void iflib_rx_structures_free(if_ctx_t ctx);
694 static int iflib_queues_alloc(if_ctx_t ctx);
695 static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq);
696 static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget);
697 static int iflib_qset_structures_setup(if_ctx_t ctx);
698 static int iflib_msix_init(if_ctx_t ctx);
699 static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, char *str);
700 static void iflib_txq_check_drain(iflib_txq_t txq, int budget);
701 static uint32_t iflib_txq_can_drain(struct ifmp_ring *);
702 static int iflib_register(if_ctx_t);
703 static void iflib_init_locked(if_ctx_t ctx);
704 static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
705 static void iflib_add_device_sysctl_post(if_ctx_t ctx);
706 static void iflib_ifmp_purge(iflib_txq_t txq);
707 static void _iflib_pre_assert(if_softc_ctx_t scctx);
708 static void iflib_stop(if_ctx_t ctx);
709 static void iflib_if_init_locked(if_ctx_t ctx);
710 #ifndef __NO_STRICT_ALIGNMENT
711 static struct mbuf * iflib_fixup_rx(struct mbuf *m);
712 #endif
713 
714 #ifdef DEV_NETMAP
715 #include <sys/selinfo.h>
716 #include <net/netmap.h>
717 #include <dev/netmap/netmap_kern.h>
718 
719 MODULE_DEPEND(iflib, netmap, 1, 1, 1);
720 
721 /*
722  * device-specific sysctl variables:
723  *
724  * iflib_crcstrip: 0: keep CRC in rx frames, 1: strip it (default).
725  *	During regular operations the CRC is stripped, but on some
726  *	hardware, reception of frames whose length is not a multiple of 64
727  *	is slower, so using crcstrip=0 helps in benchmarks.
728  *
729  * iflib_rx_miss, iflib_rx_miss_bufs:
730  *	count packets that might be missed due to lost interrupts.
731  */
732 SYSCTL_DECL(_dev_netmap);
733 /*
734  * The xl driver by default strips CRCs and we do not override it.
735  */
736 
737 int iflib_crcstrip = 1;
738 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip,
739     CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on rx frames");
740 
741 int iflib_rx_miss, iflib_rx_miss_bufs;
742 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss,
743     CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed rx intr");
744 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs,
745     CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed rx intr bufs");
746 
747 /*
748  * Register/unregister. We are already under netmap lock.
749  * Only called on the first register or the last unregister.
750  */
751 static int
752 iflib_netmap_register(struct netmap_adapter *na, int onoff)
753 {
754 	struct ifnet *ifp = na->ifp;
755 	if_ctx_t ctx = ifp->if_softc;
756 	int status;
757 
758 	CTX_LOCK(ctx);
759 	IFDI_INTR_DISABLE(ctx);
760 
761 	/* Tell the stack that the interface is no longer active */
762 	ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
763 
764 	if (!CTX_IS_VF(ctx))
765 		IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip);
766 
767 	/* enable or disable flags and callbacks in na and ifp */
768 	if (onoff) {
769 		nm_set_native_flags(na);
770 	} else {
771 		nm_clear_native_flags(na);
772 	}
773 	iflib_stop(ctx);
774 	iflib_init_locked(ctx);
775 	IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ?
776 	status = ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1;
777 	if (status)
778 		nm_clear_native_flags(na);
779 	CTX_UNLOCK(ctx);
780 	return (status);
781 }
782 
783 /*
784  * Reconcile kernel and user view of the transmit ring.
785  *
786  * All information is in the kring.
787  * Userspace wants to send packets up to the one before kring->rhead,
788  * kernel knows kring->nr_hwcur is the first unsent packet.
789  *
790  * Here we push packets out (as many as possible), and possibly
791  * reclaim buffers from previously completed transmission.
792  *
793  * The caller (netmap) guarantees that there is only one instance
794  * running at any time. Any interference with other driver
795  * methods should be handled by the individual drivers.
796  */
797 static int
798 iflib_netmap_txsync(struct netmap_kring *kring, int flags)
799 {
800 	struct netmap_adapter *na = kring->na;
801 	struct ifnet *ifp = na->ifp;
802 	struct netmap_ring *ring = kring->ring;
803 	u_int nm_i;	/* index into the netmap ring */
804 	u_int nic_i;	/* index into the NIC ring */
805 	u_int n;
806 	u_int const lim = kring->nkr_num_slots - 1;
807 	u_int const head = kring->rhead;
808 	struct if_pkt_info pi;
809 
810 	/*
811 	 * interrupts on every tx packet are expensive so request
812 	 * them every half ring, or where NS_REPORT is set
813 	 */
814 	u_int report_frequency = kring->nkr_num_slots >> 1;
815 	/* device-specific */
816 	if_ctx_t ctx = ifp->if_softc;
817 	iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id];
818 
819 	if (txq->ift_sds.ifsd_map)
820 		bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map,
821 				BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
822 
823 
824 	/*
825 	 * First part: process new packets to send.
826 	 * nm_i is the current index in the netmap ring,
827 	 * nic_i is the corresponding index in the NIC ring.
828 	 *
829 	 * If we have packets to send (nm_i != head)
830 	 * iterate over the netmap ring, fetch length and update
831 	 * the corresponding slot in the NIC ring. Some drivers also
832 	 * need to update the buffer's physical address in the NIC slot
833 	 * even if NS_BUF_CHANGED is not set (PNMB computes the addresses).
834 	 *
835 	 * The netmap_reload_map() call is especially expensive,
836 	 * even when (as in this case) the tag is 0, so only do it
837 	 * when the buffer has actually changed.
838 	 *
839 	 * If possible do not set the report/intr bit on all slots,
840 	 * but only a few times per ring or when NS_REPORT is set.
841 	 *
842 	 * Finally, on 10G and faster drivers, it might be useful
843 	 * to prefetch the next slot and txr entry.
844 	 */
845 
846 	nm_i = kring->nr_hwcur;
847 	pkt_info_zero(&pi);
848 	pi.ipi_segs = txq->ift_segs;
849 	pi.ipi_qsidx = kring->ring_id;
850 	if (nm_i != head) {	/* we have new packets to send */
851 		nic_i = netmap_idx_k2n(kring, nm_i);
852 
853 		__builtin_prefetch(&ring->slot[nm_i]);
854 		__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]);
855 		if (txq->ift_sds.ifsd_map)
856 			__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]);
857 
858 		for (n = 0; nm_i != head; n++) {
859 			struct netmap_slot *slot = &ring->slot[nm_i];
860 			u_int len = slot->len;
861 			uint64_t paddr;
862 			void *addr = PNMB(na, slot, &paddr);
863 			int flags = (slot->flags & NS_REPORT ||
864 				nic_i == 0 || nic_i == report_frequency) ?
865 				IPI_TX_INTR : 0;
866 
867 			/* device-specific */
868 			pi.ipi_len = len;
869 			pi.ipi_segs[0].ds_addr = paddr;
870 			pi.ipi_segs[0].ds_len = len;
871 			pi.ipi_nsegs = 1;
872 			pi.ipi_ndescs = 0;
873 			pi.ipi_pidx = nic_i;
874 			pi.ipi_flags = flags;
875 
876 			/* Fill the slot in the NIC ring. */
877 			ctx->isc_txd_encap(ctx->ifc_softc, &pi);
878 
879 			/* prefetch for next round */
880 			__builtin_prefetch(&ring->slot[nm_i + 1]);
881 			__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]);
882 			if (txq->ift_sds.ifsd_map) {
883 				__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]);
884 
885 				NM_CHECK_ADDR_LEN(na, addr, len);
886 
887 				if (slot->flags & NS_BUF_CHANGED) {
888 					/* buffer has changed, reload map */
889 					netmap_reload_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[nic_i], addr);
890 				}
891 				/* make sure changes to the buffer are synced */
892 				bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_sds.ifsd_map[nic_i],
893 						BUS_DMASYNC_PREWRITE);
894 			}
895 			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
896 			nm_i = nm_next(nm_i, lim);
897 			nic_i = nm_next(nic_i, lim);
898 		}
899 		kring->nr_hwcur = head;
900 
901 		/* synchronize the NIC ring */
902 		if (txq->ift_sds.ifsd_map)
903 			bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map,
904 						BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
905 
906 		/* (re)start the tx unit up to slot nic_i (excluded) */
907 		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i);
908 	}
909 
910 	/*
911 	 * Second part: reclaim buffers for completed transmissions.
912 	 */
913 	if (iflib_tx_credits_update(ctx, txq)) {
914 		/* some tx completed, increment avail */
915 		nic_i = txq->ift_cidx_processed;
916 		kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
917 	}
918 	return (0);
919 }
920 
921 /*
922  * Reconcile kernel and user view of the receive ring.
923  * Same as for the txsync, this routine must be efficient.
924 	 * The caller guarantees a single invocation, but races against
925  * the rest of the driver should be handled here.
926  *
927  * On call, kring->rhead is the first packet that userspace wants
928  * to keep, and kring->rcur is the wakeup point.
929  * The kernel has previously reported packets up to kring->rtail.
930  *
931  * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
932  * of whether or not we received an interrupt.
933  */
934 static int
935 iflib_netmap_rxsync(struct netmap_kring *kring, int flags)
936 {
937 	struct netmap_adapter *na = kring->na;
938 	struct netmap_ring *ring = kring->ring;
939 	uint32_t nm_i;	/* index into the netmap ring */
940 	uint32_t nic_i, nic_i_start;	/* index into the NIC ring */
941 	u_int i, n;
942 	u_int const lim = kring->nkr_num_slots - 1;
943 	u_int const head = kring->rhead;
944 	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
945 	struct if_rxd_info ri;
946 	struct if_rxd_update iru;
947 
948 	struct ifnet *ifp = na->ifp;
949 	if_ctx_t ctx = ifp->if_softc;
950 	iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id];
951 	iflib_fl_t fl = rxq->ifr_fl;
952 	if (head > lim)
953 		return netmap_ring_reinit(kring);
954 
955 	/* XXX check sync modes */
956 	for (i = 0, fl = rxq->ifr_fl; i < rxq->ifr_nfl; i++, fl++) {
957 		if (fl->ifl_sds.ifsd_map == NULL)
958 			continue;
959 		bus_dmamap_sync(rxq->ifr_fl[i].ifl_desc_tag, fl->ifl_ifdi->idi_map,
960 				BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
961 	}
962 	/*
963 	 * First part: import newly received packets.
964 	 *
965 	 * nm_i is the index of the next free slot in the netmap ring,
966 	 * nic_i is the index of the next received packet in the NIC ring,
967 	 * and they may differ in case if_init() has been called while
968 	 * in netmap mode. For the receive ring we have
969 	 *
970 	 *	nic_i = fl->ifl_cidx;
971 	 *	nm_i = kring->nr_hwtail (previous)
972 	 * and
973 	 *	nm_i == (nic_i + kring->nkr_hwofs) % ring_size
974 	 *
975 	 * fl->ifl_cidx is set to 0 on a ring reinit
976 	 */
977 	if (netmap_no_pendintr || force_update) {
978 		int crclen = iflib_crcstrip ? 0 : 4;
979 		int error, avail;
980 		uint16_t slot_flags = kring->nkr_slot_flags;
981 
982 		for (fl = rxq->ifr_fl, i = 0; i < rxq->ifr_nfl; i++, fl++) {
983 			nic_i = fl->ifl_cidx;
984 			nm_i = netmap_idx_n2k(kring, nic_i);
985 			avail = iflib_rxd_avail(ctx, rxq, nic_i, USHRT_MAX);
986 			for (n = 0; avail > 0; n++, avail--) {
987 				rxd_info_zero(&ri);
988 				ri.iri_frags = rxq->ifr_frags;
989 				ri.iri_qsidx = kring->ring_id;
990 				ri.iri_ifp = ctx->ifc_ifp;
991 				ri.iri_cidx = nic_i;
992 
993 				error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
994 				ring->slot[nm_i].len = error ? 0 : ri.iri_len - crclen;
995 				ring->slot[nm_i].flags = slot_flags;
996 				if (fl->ifl_sds.ifsd_map)
997 					bus_dmamap_sync(fl->ifl_ifdi->idi_tag,
998 							fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD);
999 				nm_i = nm_next(nm_i, lim);
1000 				nic_i = nm_next(nic_i, lim);
1001 			}
1002 			if (n) { /* update the state variables */
1003 				if (netmap_no_pendintr && !force_update) {
1004 					/* diagnostics */
1005 					iflib_rx_miss ++;
1006 					iflib_rx_miss_bufs += n;
1007 				}
1008 				fl->ifl_cidx = nic_i;
1009 				kring->nr_hwtail = nm_i;
1010 			}
1011 			kring->nr_kflags &= ~NKR_PENDINTR;
1012 		}
1013 	}
1014 	/*
1015 	 * Second part: skip past packets that userspace has released.
1016 	 * (kring->nr_hwcur to head excluded),
1017 	 * and make the buffers available for reception.
1018 	 * As usual nm_i is the index in the netmap ring,
1019 	 * nic_i is the index in the NIC ring, and
1020 	 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size
1021 	 */
1022 	/* XXX not sure how this will work with multiple free lists */
1023 	nm_i = kring->nr_hwcur;
1024 	if (nm_i == head)
1025 		return (0);
1026 
1027 	iru.iru_paddrs = fl->ifl_bus_addrs;
1028 	iru.iru_vaddrs = &fl->ifl_vm_addrs[0];
1029 	iru.iru_idxs = fl->ifl_rxd_idxs;
1030 	iru.iru_qsidx = rxq->ifr_id;
1031 	iru.iru_buf_size = fl->ifl_buf_size;
1032 	iru.iru_flidx = fl->ifl_id;
1033 	nic_i_start = nic_i = netmap_idx_k2n(kring, nm_i);
1034 	for (i = 0; nm_i != head; i++) {
1035 		struct netmap_slot *slot = &ring->slot[nm_i];
1036 		void *addr = PNMB(na, slot, &fl->ifl_bus_addrs[i]);
1037 
1038 		if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
1039 			goto ring_reset;
1040 
1041 		fl->ifl_vm_addrs[i] = addr;
1042 		if (fl->ifl_sds.ifsd_map && (slot->flags & NS_BUF_CHANGED)) {
1043 			/* buffer has changed, reload map */
1044 			netmap_reload_map(na, fl->ifl_ifdi->idi_tag, fl->ifl_sds.ifsd_map[nic_i], addr);
1045 		}
1046 		slot->flags &= ~NS_BUF_CHANGED;
1047 
1048 		nm_i = nm_next(nm_i, lim);
1049 		fl->ifl_rxd_idxs[i] = nic_i = nm_next(nic_i, lim);
1050 		if (nm_i != head && i < IFLIB_MAX_RX_REFRESH)
1051 			continue;
1052 
1053 		iru.iru_pidx = nic_i_start;
1054 		iru.iru_count = i;
1055 		i = 0;
1056 		ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
1057 		if (fl->ifl_sds.ifsd_map == NULL) {
1058 			nic_i_start = nic_i;
1059 			continue;
1060 		}
1061 		nic_i = nic_i_start;
1062 		for (n = 0; n < iru.iru_count; n++) {
1063 			bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_sds.ifsd_map[nic_i],
1064 					BUS_DMASYNC_PREREAD);
1065 			nic_i = nm_next(nic_i, lim);
1066 		}
1067 		nic_i_start = nic_i;
1068 	}
1069 	kring->nr_hwcur = head;
1070 
1071 	if (fl->ifl_sds.ifsd_map)
1072 		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
1073 				BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1074 	/*
1075 	 * IMPORTANT: we must leave one free slot in the ring,
1076 	 * so move nic_i back by one unit
1077 	 */
1078 	nic_i = nm_prev(nic_i, lim);
1079 	ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i);
1080 	return 0;
1081 
1082 ring_reset:
1083 	return netmap_ring_reinit(kring);
1084 }
1085 
1086 static void
1087 iflib_netmap_intr(struct netmap_adapter *na, int onoff)
1088 {
1089 	struct ifnet *ifp = na->ifp;
1090 	if_ctx_t ctx = ifp->if_softc;
1091 
1092 	CTX_LOCK(ctx);
1093 	if (onoff) {
1094 		IFDI_INTR_ENABLE(ctx);
1095 	} else {
1096 		IFDI_INTR_DISABLE(ctx);
1097 	}
1098 	CTX_UNLOCK(ctx);
1099 }
1100 
1101 
1102 static int
1103 iflib_netmap_attach(if_ctx_t ctx)
1104 {
1105 	struct netmap_adapter na;
1106 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1107 
1108 	bzero(&na, sizeof(na));
1109 
1110 	na.ifp = ctx->ifc_ifp;
1111 	na.na_flags = NAF_BDG_MAYSLEEP;
1112 	MPASS(ctx->ifc_softc_ctx.isc_ntxqsets);
1113 	MPASS(ctx->ifc_softc_ctx.isc_nrxqsets);
1114 
1115 	na.num_tx_desc = scctx->isc_ntxd[0];
1116 	na.num_rx_desc = scctx->isc_nrxd[0];
1117 	na.nm_txsync = iflib_netmap_txsync;
1118 	na.nm_rxsync = iflib_netmap_rxsync;
1119 	na.nm_register = iflib_netmap_register;
1120 	na.nm_intr = iflib_netmap_intr;
1121 	na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
1122 	na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
1123 	return (netmap_attach(&na));
1124 }
1125 
1126 static void
1127 iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq)
1128 {
1129 	struct netmap_adapter *na = NA(ctx->ifc_ifp);
1130 	struct netmap_slot *slot;
1131 
1132 	slot = netmap_reset(na, NR_TX, txq->ift_id, 0);
1133 	if (slot == NULL)
1134 		return;
1135 	if (txq->ift_sds.ifsd_map == NULL)
1136 		return;
1137 
1138 	for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) {
1139 
1140 		/*
1141 		 * In netmap mode, set the map for the packet buffer.
1142 		 * NOTE: Some drivers (not this one) also need to set
1143 		 * the physical buffer address in the NIC ring.
1144 		 * netmap_idx_n2k() maps a nic index, i, into the corresponding
1145 		 * netmap slot index, si
1146 		 */
1147 		int si = netmap_idx_n2k(&na->tx_rings[txq->ift_id], i);
1148 		netmap_load_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[i], NMB(na, slot + si));
1149 	}
1150 }
1151 static void
1152 iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq)
1153 {
1154 	struct netmap_adapter *na = NA(ctx->ifc_ifp);
1155 	struct netmap_slot *slot;
1156 	struct if_rxd_update iru;
1157 	iflib_fl_t fl;
1158 	bus_dmamap_t *map;
1159 	int nrxd;
1160 	uint32_t i, j, pidx_start;
1161 
1162 	slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0);
1163 	if (slot == NULL)
1164 		return;
1165 	fl = &rxq->ifr_fl[0];
1166 	map = fl->ifl_sds.ifsd_map;
1167 	nrxd = ctx->ifc_softc_ctx.isc_nrxd[0];
1168 	iru.iru_paddrs = fl->ifl_bus_addrs;
1169 	iru.iru_vaddrs = &fl->ifl_vm_addrs[0];
1170 	iru.iru_idxs = fl->ifl_rxd_idxs;
1171 	iru.iru_qsidx = rxq->ifr_id;
1172 	iru.iru_buf_size = rxq->ifr_fl[0].ifl_buf_size;
1173 	iru.iru_flidx = 0;
1174 
1175 	for (pidx_start = i = j = 0; i < nrxd; i++, j++) {
1176 		int sj = netmap_idx_n2k(&na->rx_rings[rxq->ifr_id], i);
1177 		void *addr;
1178 
1179 		fl->ifl_rxd_idxs[j] = i;
1180 		addr = fl->ifl_vm_addrs[j] = PNMB(na, slot + sj, &fl->ifl_bus_addrs[j]);
1181 		if (map) {
1182 			netmap_load_map(na, rxq->ifr_fl[0].ifl_ifdi->idi_tag, *map, addr);
1183 			map++;
1184 		}
1185 
1186 		if (j < IFLIB_MAX_RX_REFRESH && i < nrxd - 1)
1187 			continue;
1188 
1189 		iru.iru_pidx = pidx_start;
1190 		pidx_start = i;
1191 		iru.iru_count = j;
1192 		j = 0;
1193 		MPASS(pidx_start + j <= nrxd);
1194 		/* Update descriptors and the cached value */
1195 		ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
1196 	}
1197 	/* preserve queue */
1198 	if (ctx->ifc_ifp->if_capenable & IFCAP_NETMAP) {
1199 		struct netmap_kring *kring = &na->rx_rings[rxq->ifr_id];
1200 		int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring);
1201 		ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, t);
1202 	} else
1203 		ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, nrxd-1);
1204 }
1205 
1206 #define iflib_netmap_detach(ifp) netmap_detach(ifp)
1207 
1208 #else
1209 #define iflib_netmap_txq_init(ctx, txq)
1210 #define iflib_netmap_rxq_init(ctx, rxq)
1211 #define iflib_netmap_detach(ifp)
1212 
1213 #define iflib_netmap_attach(ctx) (0)
1214 #define netmap_rx_irq(ifp, qid, budget) (0)
1215 #define netmap_tx_irq(ifp, qid) do {} while (0)
1216 
1217 #endif
1218 
1219 #if defined(__i386__) || defined(__amd64__)
1220 static __inline void
1221 prefetch(void *x)
1222 {
1223 	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
1224 }
1225 #else
1226 #define prefetch(x)
1227 #endif
1228 
1229 static void
1230 _iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
1231 {
1232 	if (err)
1233 		return;
1234 	*(bus_addr_t *) arg = segs[0].ds_addr;
1235 }
1236 
1237 int
1238 iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags)
1239 {
1240 	int err;
1241 	if_shared_ctx_t sctx = ctx->ifc_sctx;
1242 	device_t dev = ctx->ifc_dev;
1243 
1244 	KASSERT(sctx->isc_q_align != 0, ("alignment value not initialized"));
1245 
1246 	err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
1247 				sctx->isc_q_align, 0,	/* alignment, bounds */
1248 				BUS_SPACE_MAXADDR,	/* lowaddr */
1249 				BUS_SPACE_MAXADDR,	/* highaddr */
1250 				NULL, NULL,		/* filter, filterarg */
1251 				size,			/* maxsize */
1252 				1,			/* nsegments */
1253 				size,			/* maxsegsize */
1254 				BUS_DMA_ALLOCNOW,	/* flags */
1255 				NULL,			/* lockfunc */
1256 				NULL,			/* lockarg */
1257 				&dma->idi_tag);
1258 	if (err) {
1259 		device_printf(dev,
1260 		    "%s: bus_dma_tag_create failed: %d\n",
1261 		    __func__, err);
1262 		goto fail_0;
1263 	}
1264 
1265 	err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr,
1266 	    BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map);
1267 	if (err) {
1268 		device_printf(dev,
1269 		    "%s: bus_dmamem_alloc(%ju) failed: %d\n",
1270 		    __func__, (uintmax_t)size, err);
1271 		goto fail_1;
1272 	}
1273 
1274 	dma->idi_paddr = IF_BAD_DMA;
1275 	err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr,
1276 	    size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT);
1277 	if (err || dma->idi_paddr == IF_BAD_DMA) {
1278 		device_printf(dev,
1279 		    "%s: bus_dmamap_load failed: %d\n",
1280 		    __func__, err);
1281 		goto fail_2;
1282 	}
1283 
1284 	dma->idi_size = size;
1285 	return (0);
1286 
1287 fail_2:
1288 	bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
1289 fail_1:
1290 	bus_dma_tag_destroy(dma->idi_tag);
1291 fail_0:
1292 	dma->idi_tag = NULL;
1293 
1294 	return (err);
1295 }
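
/*
 * Example (illustrative, not from the original source): a caller typically
 * backs a descriptor ring with one coherent allocation and keeps the
 * returned KVA and bus address from the dma info:
 *
 *	struct iflib_dma_info dma;
 *
 *	if (iflib_dma_alloc(ctx, ndesc * sizeof(union ring_desc), &dma, 0))
 *		return (ENOMEM);
 *	ring = dma.idi_vaddr;
 *	ring_busaddr = dma.idi_paddr;
 *
 * and releases it with iflib_dma_free(&dma) on detach; "ring_desc", "ndesc",
 * "ring" and "ring_busaddr" are placeholders for driver-specific names.
 */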
1296 
1297 int
1298 iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count)
1299 {
1300 	int i, err;
1301 	iflib_dma_info_t *dmaiter;
1302 
1303 	dmaiter = dmalist;
1304 	for (i = 0; i < count; i++, dmaiter++) {
1305 		if ((err = iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0)
1306 			break;
1307 	}
1308 	if (err)
1309 		iflib_dma_free_multi(dmalist, i);
1310 	return (err);
1311 }
1312 
1313 void
1314 iflib_dma_free(iflib_dma_info_t dma)
1315 {
1316 	if (dma->idi_tag == NULL)
1317 		return;
1318 	if (dma->idi_paddr != IF_BAD_DMA) {
1319 		bus_dmamap_sync(dma->idi_tag, dma->idi_map,
1320 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1321 		bus_dmamap_unload(dma->idi_tag, dma->idi_map);
1322 		dma->idi_paddr = IF_BAD_DMA;
1323 	}
1324 	if (dma->idi_vaddr != NULL) {
1325 		bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
1326 		dma->idi_vaddr = NULL;
1327 	}
1328 	bus_dma_tag_destroy(dma->idi_tag);
1329 	dma->idi_tag = NULL;
1330 }
1331 
1332 void
1333 iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count)
1334 {
1335 	int i;
1336 	iflib_dma_info_t *dmaiter = dmalist;
1337 
1338 	for (i = 0; i < count; i++, dmaiter++)
1339 		iflib_dma_free(*dmaiter);
1340 }
1341 
1342 #ifdef EARLY_AP_STARTUP
1343 static const int iflib_started = 1;
1344 #else
1345 /*
1346  * We used to abuse the smp_started flag to decide if the queues have been
1347  * fully initialized (by late taskqgroup_adjust() calls in a SYSINIT()).
1348  * That gave bad races, since the SYSINIT() runs strictly after smp_started
1349  * is set.  Run a SYSINIT() strictly after that to just set a usable
1350  * completion flag.
1351  */
1352 
1353 static int iflib_started;
1354 
1355 static void
1356 iflib_record_started(void *arg)
1357 {
1358 	iflib_started = 1;
1359 }
1360 
1361 SYSINIT(iflib_record_started, SI_SUB_SMP + 1, SI_ORDER_FIRST,
1362 	iflib_record_started, NULL);
1363 #endif
1364 
1365 static int
1366 iflib_fast_intr(void *arg)
1367 {
1368 	iflib_filter_info_t info = arg;
1369 	struct grouptask *gtask = info->ifi_task;
1370 	if (!iflib_started)
1371 		return (FILTER_HANDLED);
1372 
1373 	DBG_COUNTER_INC(fast_intrs);
1374 	if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED)
1375 		return (FILTER_HANDLED);
1376 
1377 	GROUPTASK_ENQUEUE(gtask);
1378 	return (FILTER_HANDLED);
1379 }
1380 
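/*
 * Interrupt filter used when an rx queue shares its MSI-X vector with one or
 * more tx queues: for each sharing txq, schedule its task if completed
 * descriptors are pending and otherwise re-enable its interrupt; then
 * schedule the rx task if receive descriptors are available and otherwise
 * re-enable the rx queue interrupt.
 */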
1381 static int
1382 iflib_fast_intr_rxtx(void *arg)
1383 {
1384 	iflib_filter_info_t info = arg;
1385 	struct grouptask *gtask = info->ifi_task;
1386 	iflib_rxq_t rxq = (iflib_rxq_t)info->ifi_ctx;
1387 	if_ctx_t ctx;
1388 	int i, cidx;
1389 
1390 	if (!iflib_started)
1391 		return (FILTER_HANDLED);
1392 
1393 	DBG_COUNTER_INC(fast_intrs);
1394 	if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED)
1395 		return (FILTER_HANDLED);
1396 
1397 	for (i = 0; i < rxq->ifr_ntxqirq; i++) {
1398 		qidx_t txqid = rxq->ifr_txqid[i];
1399 
1400 		ctx = rxq->ifr_ctx;
1401 
1402 		if (!ctx->isc_txd_credits_update(ctx->ifc_softc, txqid, false)) {
1403 			IFDI_TX_QUEUE_INTR_ENABLE(ctx, txqid);
1404 			continue;
1405 		}
1406 		GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task);
1407 	}
1408 	if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_RXCQ)
1409 		cidx = rxq->ifr_cq_cidx;
1410 	else
1411 		cidx = rxq->ifr_fl[0].ifl_cidx;
1412 	if (iflib_rxd_avail(ctx, rxq, cidx, 1))
1413 		GROUPTASK_ENQUEUE(gtask);
1414 	else
1415 		IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
1416 	return (FILTER_HANDLED);
1417 }
1418 
1419 
1420 static int
1421 iflib_fast_intr_ctx(void *arg)
1422 {
1423 	iflib_filter_info_t info = arg;
1424 	struct grouptask *gtask = info->ifi_task;
1425 
1426 	if (!iflib_started)
1427 		return (FILTER_HANDLED);
1428 
1429 	DBG_COUNTER_INC(fast_intrs);
1430 	if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED)
1431 		return (FILTER_HANDLED);
1432 
1433 	GROUPTASK_ENQUEUE(gtask);
1434 	return (FILTER_HANDLED);
1435 }
1436 
1437 static int
1438 _iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
1439 	driver_filter_t filter, driver_intr_t handler, void *arg,
1440 				 char *name)
1441 {
1442 	int rc, flags;
1443 	struct resource *res;
1444 	void *tag = NULL;
1445 	device_t dev = ctx->ifc_dev;
1446 
1447 	flags = RF_ACTIVE;
1448 	if (ctx->ifc_flags & IFC_LEGACY)
1449 		flags |= RF_SHAREABLE;
1450 	MPASS(rid < 512);
1451 	irq->ii_rid = rid;
1452 	res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &irq->ii_rid, flags);
1453 	if (res == NULL) {
1454 		device_printf(dev,
1455 		    "failed to allocate IRQ for rid %d, name %s.\n", rid, name);
1456 		return (ENOMEM);
1457 	}
1458 	irq->ii_res = res;
1459 	KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL"));
1460 	rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET,
1461 						filter, handler, arg, &tag);
1462 	if (rc != 0) {
1463 		device_printf(dev,
1464 		    "failed to setup interrupt for rid %d, name %s: %d\n",
1465 					  rid, name ? name : "unknown", rc);
1466 		return (rc);
1467 	} else if (name)
1468 		bus_describe_intr(dev, res, tag, "%s", name);
1469 
1470 	irq->ii_tag = tag;
1471 	return (0);
1472 }
1473 
1474 
1475 /*********************************************************************
1476  *
1477  *  Allocate memory for tx_buffer structures. The tx_buffer stores all
1478  *  the information needed to transmit a packet on the wire. This is
1479  *  called only once at attach, setup is done every reset.
1480  *
1481  **********************************************************************/
1482 
1483 static int
1484 iflib_txsd_alloc(iflib_txq_t txq)
1485 {
1486 	if_ctx_t ctx = txq->ift_ctx;
1487 	if_shared_ctx_t sctx = ctx->ifc_sctx;
1488 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1489 	device_t dev = ctx->ifc_dev;
1490 	int err, nsegments, ntsosegments;
1491 
1492 	nsegments = scctx->isc_tx_nsegments;
1493 	ntsosegments = scctx->isc_tx_tso_segments_max;
1494 	MPASS(scctx->isc_ntxd[0] > 0);
1495 	MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0);
1496 	MPASS(nsegments > 0);
1497 	MPASS(ntsosegments > 0);
1498 	/*
1499 	 * Setup DMA descriptor areas.
1500 	 */
1501 	if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
1502 			       1, 0,			/* alignment, bounds */
1503 			       BUS_SPACE_MAXADDR,	/* lowaddr */
1504 			       BUS_SPACE_MAXADDR,	/* highaddr */
1505 			       NULL, NULL,		/* filter, filterarg */
1506 			       sctx->isc_tx_maxsize,		/* maxsize */
1507 			       nsegments,	/* nsegments */
1508 			       sctx->isc_tx_maxsegsize,	/* maxsegsize */
1509 			       0,			/* flags */
1510 			       NULL,			/* lockfunc */
1511 			       NULL,			/* lockfuncarg */
1512 			       &txq->ift_desc_tag))) {
1513 		device_printf(dev,"Unable to allocate TX DMA tag: %d\n", err);
1514 		device_printf(dev,"maxsize: %zd nsegments: %d maxsegsize: %zd\n",
1515 					  sctx->isc_tx_maxsize, nsegments, sctx->isc_tx_maxsegsize);
1516 		goto fail;
1517 	}
1518 	if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
1519 			       1, 0,			/* alignment, bounds */
1520 			       BUS_SPACE_MAXADDR,	/* lowaddr */
1521 			       BUS_SPACE_MAXADDR,	/* highaddr */
1522 			       NULL, NULL,		/* filter, filterarg */
1523 			       scctx->isc_tx_tso_size_max,		/* maxsize */
1524 			       ntsosegments,	/* nsegments */
1525 			       scctx->isc_tx_tso_segsize_max,	/* maxsegsize */
1526 			       0,			/* flags */
1527 			       NULL,			/* lockfunc */
1528 			       NULL,			/* lockfuncarg */
1529 			       &txq->ift_tso_desc_tag))) {
1530 		device_printf(dev,"Unable to allocate TX TSO DMA tag: %d\n", err);
1531 
1532 		goto fail;
1533 	}
1534 	if (!(txq->ift_sds.ifsd_flags =
1535 	    (uint8_t *) malloc(sizeof(uint8_t) *
1536 	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1537 		device_printf(dev, "Unable to allocate tx_buffer memory\n");
1538 		err = ENOMEM;
1539 		goto fail;
1540 	}
1541 	if (!(txq->ift_sds.ifsd_m =
1542 	    (struct mbuf **) malloc(sizeof(struct mbuf *) *
1543 	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1544 		device_printf(dev, "Unable to allocate tx_buffer memory\n");
1545 		err = ENOMEM;
1546 		goto fail;
1547 	}
1548 
1549         /* Create the descriptor buffer dma maps */
1550 #if defined(ACPI_DMAR) || (! (defined(__i386__) || defined(__amd64__)))
1551 	if ((ctx->ifc_flags & IFC_DMAR) == 0)
1552 		return (0);
1553 
1554 	if (!(txq->ift_sds.ifsd_map =
1555 	    (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1556 		device_printf(dev, "Unable to allocate tx_buffer map memory\n");
1557 		err = ENOMEM;
1558 		goto fail;
1559 	}
1560 
1561 	for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) {
1562 		err = bus_dmamap_create(txq->ift_desc_tag, 0, &txq->ift_sds.ifsd_map[i]);
1563 		if (err != 0) {
1564 			device_printf(dev, "Unable to create TX DMA map\n");
1565 			goto fail;
1566 		}
1567 	}
1568 #endif
1569 	return (0);
1570 fail:
1571 	/* We free all, it handles case where we are in the middle */
1572 	iflib_tx_structures_free(ctx);
1573 	return (err);
1574 }
1575 
1576 static void
1577 iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i)
1578 {
1579 	bus_dmamap_t map;
1580 
1581 	map = NULL;
1582 	if (txq->ift_sds.ifsd_map != NULL)
1583 		map = txq->ift_sds.ifsd_map[i];
1584 	if (map != NULL) {
1585 		bus_dmamap_unload(txq->ift_desc_tag, map);
1586 		bus_dmamap_destroy(txq->ift_desc_tag, map);
1587 		txq->ift_sds.ifsd_map[i] = NULL;
1588 	}
1589 }
1590 
1591 static void
1592 iflib_txq_destroy(iflib_txq_t txq)
1593 {
1594 	if_ctx_t ctx = txq->ift_ctx;
1595 
1596 	for (int i = 0; i < txq->ift_size; i++)
1597 		iflib_txsd_destroy(ctx, txq, i);
1598 	if (txq->ift_sds.ifsd_map != NULL) {
1599 		free(txq->ift_sds.ifsd_map, M_IFLIB);
1600 		txq->ift_sds.ifsd_map = NULL;
1601 	}
1602 	if (txq->ift_sds.ifsd_m != NULL) {
1603 		free(txq->ift_sds.ifsd_m, M_IFLIB);
1604 		txq->ift_sds.ifsd_m = NULL;
1605 	}
1606 	if (txq->ift_sds.ifsd_flags != NULL) {
1607 		free(txq->ift_sds.ifsd_flags, M_IFLIB);
1608 		txq->ift_sds.ifsd_flags = NULL;
1609 	}
1610 	if (txq->ift_desc_tag != NULL) {
1611 		bus_dma_tag_destroy(txq->ift_desc_tag);
1612 		txq->ift_desc_tag = NULL;
1613 	}
1614 	if (txq->ift_tso_desc_tag != NULL) {
1615 		bus_dma_tag_destroy(txq->ift_tso_desc_tag);
1616 		txq->ift_tso_desc_tag = NULL;
1617 	}
1618 }
1619 
1620 static void
1621 iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i)
1622 {
1623 	struct mbuf **mp;
1624 
1625 	mp = &txq->ift_sds.ifsd_m[i];
1626 	if (*mp == NULL)
1627 		return;
1628 
1629 	if (txq->ift_sds.ifsd_map != NULL) {
1630 		bus_dmamap_sync(txq->ift_desc_tag,
1631 				txq->ift_sds.ifsd_map[i],
1632 				BUS_DMASYNC_POSTWRITE);
1633 		bus_dmamap_unload(txq->ift_desc_tag,
1634 				  txq->ift_sds.ifsd_map[i]);
1635 	}
1636 	m_free(*mp);
1637 	DBG_COUNTER_INC(tx_frees);
1638 	*mp = NULL;
1639 }
1640 
1641 static int
1642 iflib_txq_setup(iflib_txq_t txq)
1643 {
1644 	if_ctx_t ctx = txq->ift_ctx;
1645 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1646 	iflib_dma_info_t di;
1647 	int i;
1648 
1649 	/* Set number of descriptors available */
1650 	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
1651 	/* XXX make configurable */
1652 	txq->ift_update_freq = IFLIB_DEFAULT_TX_UPDATE_FREQ;
1653 
1654 	/* Reset indices */
1655 	txq->ift_cidx_processed = 0;
1656 	txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0;
1657 	txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset];
1658 
1659 	for (i = 0, di = txq->ift_ifdi; i < ctx->ifc_nhwtxqs; i++, di++)
1660 		bzero((void *)di->idi_vaddr, di->idi_size);
1661 
1662 	IFDI_TXQ_SETUP(ctx, txq->ift_id);
1663 	for (i = 0, di = txq->ift_ifdi; i < ctx->ifc_nhwtxqs; i++, di++)
1664 		bus_dmamap_sync(di->idi_tag, di->idi_map,
1665 						BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1666 	return (0);
1667 }
1668 
1669 /*********************************************************************
1670  *
1671  *  Allocate memory for rx_buffer structures. Since we use one
1672  *  rx_buffer per received packet, the maximum number of rx_buffer's
1673  *  that we'll need is equal to the number of receive descriptors
1674  *  that we've allocated.
1675  *
1676  **********************************************************************/
1677 static int
1678 iflib_rxsd_alloc(iflib_rxq_t rxq)
1679 {
1680 	if_ctx_t ctx = rxq->ifr_ctx;
1681 	if_shared_ctx_t sctx = ctx->ifc_sctx;
1682 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1683 	device_t dev = ctx->ifc_dev;
1684 	iflib_fl_t fl;
1685 	int			err;
1686 
1687 	MPASS(scctx->isc_nrxd[0] > 0);
1688 	MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0);
1689 
1690 	fl = rxq->ifr_fl;
1691 	for (int i = 0; i <  rxq->ifr_nfl; i++, fl++) {
1692 		fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */
1693 		err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
1694 					 1, 0,			/* alignment, bounds */
1695 					 BUS_SPACE_MAXADDR,	/* lowaddr */
1696 					 BUS_SPACE_MAXADDR,	/* highaddr */
1697 					 NULL, NULL,		/* filter, filterarg */
1698 					 sctx->isc_rx_maxsize,	/* maxsize */
1699 					 sctx->isc_rx_nsegments,	/* nsegments */
1700 					 sctx->isc_rx_maxsegsize,	/* maxsegsize */
1701 					 0,			/* flags */
1702 					 NULL,			/* lockfunc */
1703 					 NULL,			/* lockarg */
1704 					 &fl->ifl_desc_tag);
1705 		if (err) {
1706 			device_printf(dev, "%s: bus_dma_tag_create failed %d\n",
1707 				__func__, err);
1708 			goto fail;
1709 		}
1710 		if (!(fl->ifl_sds.ifsd_flags =
1711 		      (uint8_t *) malloc(sizeof(uint8_t) *
1712 					 scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1713 			device_printf(dev, "Unable to allocate rx_buffer memory\n");
1714 			err = ENOMEM;
1715 			goto fail;
1716 		}
1717 		if (!(fl->ifl_sds.ifsd_m =
1718 		      (struct mbuf **) malloc(sizeof(struct mbuf *) *
1719 					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1720 			device_printf(dev, "Unable to allocate rx_buffer memory\n");
1721 			err = ENOMEM;
1722 			goto fail;
1723 		}
1724 		if (!(fl->ifl_sds.ifsd_cl =
1725 		      (caddr_t *) malloc(sizeof(caddr_t) *
1726 					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1727 			device_printf(dev, "Unable to allocate rx_buffer memory\n");
1728 			err = ENOMEM;
1729 			goto fail;
1730 		}
1731 
1732 		/* Create the descriptor buffer dma maps */
1733 #if defined(ACPI_DMAR) || (! (defined(__i386__) || defined(__amd64__)))
1734 		if ((ctx->ifc_flags & IFC_DMAR) == 0)
1735 			continue;
1736 
1737 		if (!(fl->ifl_sds.ifsd_map =
1738 		      (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1739 			device_printf(dev, "Unable to allocate rx_buffer map memory\n");
1740 			err = ENOMEM;
1741 			goto fail;
1742 		}
1743 
1744 		for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++) {
1745 			err = bus_dmamap_create(fl->ifl_desc_tag, 0, &fl->ifl_sds.ifsd_map[i]);
1746 			if (err != 0) {
1747 				device_printf(dev, "Unable to create RX buffer DMA map\n");
1748 				goto fail;
1749 			}
1750 		}
1751 #endif
1752 	}
1753 	return (0);
1754 
1755 fail:
1756 	iflib_rx_structures_free(ctx);
1757 	return (err);
1758 }
1759 
1760 
1761 /*
1762  * Internal service routines
1763  */
1764 
1765 struct rxq_refill_cb_arg {
1766 	int               error;
1767 	bus_dma_segment_t seg;
1768 	int               nseg;
1769 };
1770 
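/*
 * bus_dmamap_load() callback used when refilling a free list: record the
 * first DMA segment, the segment count, and any error for the caller.
 */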
1771 static void
1772 _rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
1773 {
1774 	struct rxq_refill_cb_arg *cb_arg = arg;
1775 
1776 	cb_arg->error = error;
1777 	cb_arg->seg = segs[0];
1778 	cb_arg->nseg = nseg;
1779 }
1780 
1781 
1782 #ifdef ACPI_DMAR
1783 #define IS_DMAR(ctx) (ctx->ifc_flags & IFC_DMAR)
1784 #else
1785 #define IS_DMAR(ctx) (0)
1786 #endif
1787 
1788 /**
1789  *	_iflib_fl_refill - refill an rxq free-buffer list
1790  *	@ctx: the iflib context
1791  *	@fl: the free list to refill
1792  *	@count: the number of new buffers to allocate
1793  *
1794  *	(Re)populate an rxq free-buffer list with up to @count new packet buffers.
1795  *	The caller must ensure that @count does not exceed the queue's capacity.
1796  */
1797 static void
1798 _iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count)
1799 {
1800 	struct mbuf *m;
1801 	int idx, pidx = fl->ifl_pidx;
1802 	caddr_t cl, *sd_cl;
1803 	struct mbuf **sd_m;
1804 	uint8_t *sd_flags;
1805 	struct if_rxd_update iru;
1806 	bus_dmamap_t *sd_map;
1807 	int n, i = 0;
1808 	uint64_t bus_addr;
1809 	int err;
1810 
1811 	sd_m = fl->ifl_sds.ifsd_m;
1812 	sd_map = fl->ifl_sds.ifsd_map;
1813 	sd_cl = fl->ifl_sds.ifsd_cl;
1814 	sd_flags = fl->ifl_sds.ifsd_flags;
1815 	idx = pidx;
1816 
1817 	n = count;
1818 	MPASS(n > 0);
1819 	MPASS(fl->ifl_credits + n <= fl->ifl_size);
1820 
1821 	if (pidx < fl->ifl_cidx)
1822 		MPASS(pidx + n <= fl->ifl_cidx);
1823 	if (pidx == fl->ifl_cidx && (fl->ifl_credits < fl->ifl_size))
1824 		MPASS(fl->ifl_gen == 0);
1825 	if (pidx > fl->ifl_cidx)
1826 		MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx);
1827 
1828 	DBG_COUNTER_INC(fl_refills);
1829 	if (n > 8)
1830 		DBG_COUNTER_INC(fl_refills_large);
1831 	iru.iru_paddrs = fl->ifl_bus_addrs;
1832 	iru.iru_vaddrs = &fl->ifl_vm_addrs[0];
1833 	iru.iru_idxs = fl->ifl_rxd_idxs;
1834 	iru.iru_qsidx = fl->ifl_rxq->ifr_id;
1835 	iru.iru_buf_size = fl->ifl_buf_size;
1836 	iru.iru_flidx = fl->ifl_id;
1837 	while (n--) {
1838 		/*
1839 		 * We allocate an uninitialized mbuf + cluster; the mbuf is
1840 		 * initialized after rx.
1841 		 *
1842 		 * If the cluster is still set, we know a minimum-sized packet was received
1843 		 */
1844 		if ((cl = sd_cl[idx]) == NULL) {
1845 			if ((cl = sd_cl[idx] = m_cljget(NULL, M_NOWAIT, fl->ifl_buf_size)) == NULL)
1846 				break;
1847 #if MEMORY_LOGGING
1848 			fl->ifl_cl_enqueued++;
1849 #endif
1850 		}
1851 		if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
1852 			break;
1853 		}
1854 #if MEMORY_LOGGING
1855 		fl->ifl_m_enqueued++;
1856 #endif
1857 
1858 		DBG_COUNTER_INC(rx_allocs);
1859 #if defined(__i386__) || defined(__amd64__)
1860 		if (!IS_DMAR(ctx)) {
1861 			bus_addr = pmap_kextract((vm_offset_t)cl);
1862 		} else
1863 #endif
1864 		{
1865 			struct rxq_refill_cb_arg cb_arg;
1866 			iflib_rxq_t q;
1867 
1868 			cb_arg.error = 0;
1869 			q = fl->ifl_rxq;
1870 			MPASS(sd_map != NULL);
1871 			MPASS(sd_map[idx] != NULL);
1872 			err = bus_dmamap_load(fl->ifl_desc_tag, sd_map[idx],
1873 		         cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg, 0);
1874 			bus_dmamap_sync(fl->ifl_desc_tag, sd_map[idx], BUS_DMASYNC_PREREAD);
1875 
1876 			if (err != 0 || cb_arg.error) {
1877 				/*
1878 				 * !zone_pack ?
1879 				 */
1880 				if (fl->ifl_zone == zone_pack)
1881 					uma_zfree(fl->ifl_zone, cl);
1882 				m_free(m);
1883 				n = 0;
1884 				goto done;
1885 			}
1886 			bus_addr = cb_arg.seg.ds_addr;
1887 		}
1888 		sd_flags[idx] |= RX_SW_DESC_INUSE;
1889 
1890 		MPASS(sd_m[idx] == NULL);
1891 		sd_cl[idx] = cl;
1892 		sd_m[idx] = m;
1893 		fl->ifl_rxd_idxs[i] = idx;
1894 		fl->ifl_bus_addrs[i] = bus_addr;
1895 		fl->ifl_vm_addrs[i] = cl;
1896 		fl->ifl_credits++;
1897 		i++;
1898 		MPASS(fl->ifl_credits <= fl->ifl_size);
1899 		if (++idx == fl->ifl_size) {
1900 			fl->ifl_gen = 1;
1901 			idx = 0;
1902 		}
1903 		if (n == 0 || i == IFLIB_MAX_RX_REFRESH) {
1904 			iru.iru_pidx = pidx;
1905 			iru.iru_count = i;
1906 			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
1907 			i = 0;
1908 			pidx = idx;
1909 		}
1910 		fl->ifl_pidx = idx;
1911 
1912 	}
1913 done:
1914 	DBG_COUNTER_INC(rxd_flush);
1915 	if (fl->ifl_pidx == 0)
1916 		pidx = fl->ifl_size - 1;
1917 	else
1918 		pidx = fl->ifl_pidx - 1;
1919 
1920 	if (sd_map)
1921 		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
1922 				BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1923 	ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx);
1924 }
1925 
1926 static __inline void
1927 __iflib_fl_refill_lt(if_ctx_t ctx, iflib_fl_t fl, int max)
1928 {
1929 	/* we avoid allowing pidx to catch up with cidx as it confuses ixl */
1930 	int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1;
1931 #ifdef INVARIANTS
1932 	int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1;
1933 #endif
1934 
1935 	MPASS(fl->ifl_credits <= fl->ifl_size);
1936 	MPASS(reclaimable == delta);
1937 
1938 	if (reclaimable > 0)
1939 		_iflib_fl_refill(ctx, fl, min(max, reclaimable));
1940 }
1941 
1942 static void
1943 iflib_fl_bufs_free(iflib_fl_t fl)
1944 {
1945 	iflib_dma_info_t idi = fl->ifl_ifdi;
1946 	uint32_t i;
1947 
1948 	for (i = 0; i < fl->ifl_size; i++) {
1949 		struct mbuf **sd_m = &fl->ifl_sds.ifsd_m[i];
1950 		uint8_t *sd_flags = &fl->ifl_sds.ifsd_flags[i];
1951 		caddr_t *sd_cl = &fl->ifl_sds.ifsd_cl[i];
1952 
1953 		if (*sd_flags & RX_SW_DESC_INUSE) {
1954 			if (fl->ifl_sds.ifsd_map != NULL) {
1955 				bus_dmamap_t sd_map = fl->ifl_sds.ifsd_map[i];
1956 				bus_dmamap_unload(fl->ifl_desc_tag, sd_map);
1957 				bus_dmamap_destroy(fl->ifl_desc_tag, sd_map);
1958 			}
1959 			if (*sd_m != NULL) {
1960 				m_init(*sd_m, M_NOWAIT, MT_DATA, 0);
1961 				uma_zfree(zone_mbuf, *sd_m);
1962 			}
1963 			if (*sd_cl != NULL)
1964 				uma_zfree(fl->ifl_zone, *sd_cl);
1965 			*sd_flags = 0;
1966 		} else {
1967 			MPASS(*sd_cl == NULL);
1968 			MPASS(*sd_m == NULL);
1969 		}
1970 #if MEMORY_LOGGING
1971 		fl->ifl_m_dequeued++;
1972 		fl->ifl_cl_dequeued++;
1973 #endif
1974 		*sd_cl = NULL;
1975 		*sd_m = NULL;
1976 	}
1977 #ifdef INVARIANTS
1978 	for (i = 0; i < fl->ifl_size; i++) {
1979 		MPASS(fl->ifl_sds.ifsd_flags[i] == 0);
1980 		MPASS(fl->ifl_sds.ifsd_cl[i] == NULL);
1981 		MPASS(fl->ifl_sds.ifsd_m[i] == NULL);
1982 	}
1983 #endif
1984 	/*
1985 	 * Reset free list values
1986 	 */
1987 	fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = 0;
1988 	bzero(idi->idi_vaddr, idi->idi_size);
1989 }
1990 
1991 /*********************************************************************
1992  *
1993  *  Initialize a receive ring and its buffers.
1994  *
1995  **********************************************************************/
1996 static int
1997 iflib_fl_setup(iflib_fl_t fl)
1998 {
1999 	iflib_rxq_t rxq = fl->ifl_rxq;
2000 	if_ctx_t ctx = rxq->ifr_ctx;
2001 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
2002 
2003 	/*
2004 	** Free current RX buffer structs and their mbufs
2005 	*/
2006 	iflib_fl_bufs_free(fl);
2007 	/* Now replenish the mbufs */
2008 	MPASS(fl->ifl_credits == 0);
2009 	/*
2010 	 * XXX don't set the max_frame_size to larger
2011 	 * than the hardware can handle
2012 	 */
2013 	if (sctx->isc_max_frame_size <= 2048)
2014 		fl->ifl_buf_size = MCLBYTES;
2015 #ifndef CONTIGMALLOC_WORKS
2016 	else
2017 		fl->ifl_buf_size = MJUMPAGESIZE;
2018 #else
2019 	else if (sctx->isc_max_frame_size <= 4096)
2020 		fl->ifl_buf_size = MJUMPAGESIZE;
2021 	else if (sctx->isc_max_frame_size <= 9216)
2022 		fl->ifl_buf_size = MJUM9BYTES;
2023 	else
2024 		fl->ifl_buf_size = MJUM16BYTES;
2025 #endif
2026 	if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size)
2027 		ctx->ifc_max_fl_buf_size = fl->ifl_buf_size;
2028 	fl->ifl_cltype = m_gettype(fl->ifl_buf_size);
2029 	fl->ifl_zone = m_getzone(fl->ifl_buf_size);
2030 
2031 
2032 	/* avoid pre-allocating zillions of clusters to an idle card
2033 	 * potentially speeding up attach
2034 	 */
2035 	_iflib_fl_refill(ctx, fl, min(128, fl->ifl_size));
2036 	MPASS(min(128, fl->ifl_size) == fl->ifl_credits);
2037 	if (min(128, fl->ifl_size) != fl->ifl_credits)
2038 		return (ENOBUFS);
2039 	/*
2040 	 * handle failure
2041 	 */
2042 	MPASS(rxq != NULL);
2043 	MPASS(fl->ifl_ifdi != NULL);
2044 	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
2045 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
2046 	return (0);
2047 }
2048 
2049 /*********************************************************************
2050  *
2051  *  Free receive ring data structures
2052  *
2053  **********************************************************************/
2054 static void
2055 iflib_rx_sds_free(iflib_rxq_t rxq)
2056 {
2057 	iflib_fl_t fl;
2058 	int i;
2059 
2060 	if (rxq->ifr_fl != NULL) {
2061 		for (i = 0; i < rxq->ifr_nfl; i++) {
2062 			fl = &rxq->ifr_fl[i];
2063 			if (fl->ifl_desc_tag != NULL) {
2064 				bus_dma_tag_destroy(fl->ifl_desc_tag);
2065 				fl->ifl_desc_tag = NULL;
2066 			}
2067 			free(fl->ifl_sds.ifsd_m, M_IFLIB);
2068 			free(fl->ifl_sds.ifsd_cl, M_IFLIB);
2069 			/* XXX destroy maps first */
2070 			free(fl->ifl_sds.ifsd_map, M_IFLIB);
2071 			fl->ifl_sds.ifsd_m = NULL;
2072 			fl->ifl_sds.ifsd_cl = NULL;
2073 			fl->ifl_sds.ifsd_map = NULL;
2074 		}
2075 		free(rxq->ifr_fl, M_IFLIB);
2076 		rxq->ifr_fl = NULL;
2077 		rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0;
2078 	}
2079 }
2080 
2081 /*
2082  * MI independent logic
2083  *
2084  */
2085 static void
2086 iflib_timer(void *arg)
2087 {
2088 	iflib_txq_t txq = arg;
2089 	if_ctx_t ctx = txq->ift_ctx;
2090 
2091 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
2092 		return;
2093 	/*
2094 	** Check on the state of the TX queue(s); this
2095 	** can be done without the lock because it's RO
2096 	** and the HUNG state will be static if set.
2097 	*/
2098 	IFDI_TIMER(ctx, txq->ift_id);
2099 	if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) &&
2100 	    ((txq->ift_cleaned_prev == txq->ift_cleaned) ||
2101 	     (ctx->ifc_pause_frames == 0)))
2102 		goto hung;
2103 
2104 	if (ifmp_ring_is_stalled(txq->ift_br))
2105 		txq->ift_qstatus = IFLIB_QUEUE_HUNG;
2106 	txq->ift_cleaned_prev = txq->ift_cleaned;
2107 	/* handle any laggards */
2108 	if (txq->ift_db_pending)
2109 		GROUPTASK_ENQUEUE(&txq->ift_task);
2110 
2111 	ctx->ifc_pause_frames = 0;
2112 	if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)
2113 		callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu);
2114 	return;
2115 hung:
2116 	CTX_LOCK(ctx);
2117 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
2118 	device_printf(ctx->ifc_dev,  "TX(%d) desc avail = %d, pidx = %d\n",
2119 				  txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx);
2120 
2121 	IFDI_WATCHDOG_RESET(ctx);
2122 	ctx->ifc_watchdog_events++;
2123 	ctx->ifc_pause_frames = 0;
2124 
2125 	ctx->ifc_flags |= IFC_DO_RESET;
2126 	iflib_admin_intr_deferred(ctx);
2127 	CTX_UNLOCK(ctx);
2128 }
2129 
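/*
 * Bring the interface up: program the enabled offloads into if_hwassist,
 * (re)initialize the TX/RX queues and their netmap state, call the driver's
 * init routine, replenish the free lists, and restart the watchdog timers.
 */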
2130 static void
2131 iflib_init_locked(if_ctx_t ctx)
2132 {
2133 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
2134 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2135 	if_t ifp = ctx->ifc_ifp;
2136 	iflib_fl_t fl;
2137 	iflib_txq_t txq;
2138 	iflib_rxq_t rxq;
2139 	int i, j, tx_ip_csum_flags, tx_ip6_csum_flags;
2140 
2141 
2142 	if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
2143 	IFDI_INTR_DISABLE(ctx);
2144 
2145 	tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP);
2146 	tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP);
2147 	/* Set hardware offload abilities */
2148 	if_clearhwassist(ifp);
2149 	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
2150 		if_sethwassistbits(ifp, tx_ip_csum_flags, 0);
2151 	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
2152 		if_sethwassistbits(ifp,  tx_ip6_csum_flags, 0);
2153 	if (if_getcapenable(ifp) & IFCAP_TSO4)
2154 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
2155 	if (if_getcapenable(ifp) & IFCAP_TSO6)
2156 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
2157 
2158 	for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) {
2159 		CALLOUT_LOCK(txq);
2160 		callout_stop(&txq->ift_timer);
2161 		CALLOUT_UNLOCK(txq);
2162 		iflib_netmap_txq_init(ctx, txq);
2163 	}
2164 	for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) {
2165 		MPASS(rxq->ifr_id == i);
2166 		iflib_netmap_rxq_init(ctx, rxq);
2167 	}
2168 #ifdef INVARIANTS
2169 	i = if_getdrvflags(ifp);
2170 #endif
2171 	IFDI_INIT(ctx);
2172 	MPASS(if_getdrvflags(ifp) == i);
2173 	for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) {
2174 		/* XXX this should really be done on a per-queue basis */
2175 		if (if_getcapenable(ifp) & IFCAP_NETMAP)
2176 			continue;
2177 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
2178 			if (iflib_fl_setup(fl)) {
2179 				device_printf(ctx->ifc_dev, "freelist setup failed - check cluster settings\n");
2180 				goto done;
2181 			}
2182 		}
2183 	}
2184 	done:
2185 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
2186 	IFDI_INTR_ENABLE(ctx);
2187 	txq = ctx->ifc_txqs;
2188 	for (i = 0; i < sctx->isc_ntxqsets; i++, txq++)
2189 		callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq,
2190 			txq->ift_timer.c_cpu);
2191 }
2192 
2193 static int
2194 iflib_media_change(if_t ifp)
2195 {
2196 	if_ctx_t ctx = if_getsoftc(ifp);
2197 	int err;
2198 
2199 	CTX_LOCK(ctx);
2200 	if ((err = IFDI_MEDIA_CHANGE(ctx)) == 0)
2201 		iflib_init_locked(ctx);
2202 	CTX_UNLOCK(ctx);
2203 	return (err);
2204 }
2205 
2206 static void
2207 iflib_media_status(if_t ifp, struct ifmediareq *ifmr)
2208 {
2209 	if_ctx_t ctx = if_getsoftc(ifp);
2210 
2211 	CTX_LOCK(ctx);
2212 	IFDI_UPDATE_ADMIN_STATUS(ctx);
2213 	IFDI_MEDIA_STATUS(ctx, ifmr);
2214 	CTX_UNLOCK(ctx);
2215 }
2216 
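/*
 * Bring the interface down: mark it inactive, stop the hardware, then purge
 * the TX rings and free lists and reset all per-queue software state.
 */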
2217 static void
2218 iflib_stop(if_ctx_t ctx)
2219 {
2220 	iflib_txq_t txq = ctx->ifc_txqs;
2221 	iflib_rxq_t rxq = ctx->ifc_rxqs;
2222 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2223 	iflib_dma_info_t di;
2224 	iflib_fl_t fl;
2225 	int i, j;
2226 
2227 	/* Tell the stack that the interface is no longer active */
2228 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
2229 
2230 	IFDI_INTR_DISABLE(ctx);
2231 	DELAY(1000);
2232 	IFDI_STOP(ctx);
2233 	DELAY(1000);
2234 
2235 	iflib_debug_reset();
2236 	/* Wait for current tx queue users to exit to disarm watchdog timer. */
2237 	for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) {
2238 		/* make sure all transmitters have completed before proceeding XXX */
2239 
2240 		/* clean any enqueued buffers */
2241 		iflib_ifmp_purge(txq);
2242 		/* Free any existing tx buffers. */
2243 		for (j = 0; j < txq->ift_size; j++) {
2244 			iflib_txsd_free(ctx, txq, j);
2245 		}
2246 		txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0;
2247 		txq->ift_in_use = txq->ift_gen = txq->ift_cidx = txq->ift_pidx = txq->ift_no_desc_avail = 0;
2248 		txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0;
2249 		txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0;
2250 		txq->ift_pullups = 0;
2251 		ifmp_ring_reset_stats(txq->ift_br);
2252 		for (j = 0, di = txq->ift_ifdi; j < ctx->ifc_nhwtxqs; j++, di++)
2253 			bzero((void *)di->idi_vaddr, di->idi_size);
2254 	}
2255 	for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) {
2256 		/* make sure all transmitters have completed before proceeding XXX */
2257 
2258 		for (j = 0, di = rxq->ifr_ifdi; j < ctx->ifc_nhwrxqs; j++, di++)
2259 			bzero((void *)di->idi_vaddr, di->idi_size);
2260 		/* also resets the free lists pidx/cidx */
2261 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
2262 			iflib_fl_bufs_free(fl);
2263 	}
2264 }
2265 
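/*
 * Return the address of the next cache line within the RX descriptor ring
 * relative to the descriptor at cidx, wrapping to the start of the ring,
 * so that it can be prefetched ahead of use.
 */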
2266 static inline caddr_t
2267 calc_next_rxd(iflib_fl_t fl, int cidx)
2268 {
2269 	qidx_t size;
2270 	int nrxd;
2271 	caddr_t start, end, cur, next;
2272 
2273 	nrxd = fl->ifl_size;
2274 	size = fl->ifl_rxd_size;
2275 	start = fl->ifl_ifdi->idi_vaddr;
2276 
2277 	if (__predict_false(size == 0))
2278 		return (start);
2279 	cur = start + size*cidx;
2280 	end = start + size*nrxd;
2281 	next = CACHE_PTR_NEXT(cur);
2282 	return (next < end ? next : start);
2283 }
2284 
2285 static inline void
2286 prefetch_pkts(iflib_fl_t fl, int cidx)
2287 {
2288 	int nextptr;
2289 	int nrxd = fl->ifl_size;
2290 	caddr_t next_rxd;
2291 
2292 
2293 	nextptr = (cidx + CACHE_PTR_INCREMENT) & (nrxd-1);
2294 	prefetch(&fl->ifl_sds.ifsd_m[nextptr]);
2295 	prefetch(&fl->ifl_sds.ifsd_cl[nextptr]);
2296 	next_rxd = calc_next_rxd(fl, cidx);
2297 	prefetch(next_rxd);
2298 	prefetch(fl->ifl_sds.ifsd_m[(cidx + 1) & (nrxd-1)]);
2299 	prefetch(fl->ifl_sds.ifsd_m[(cidx + 2) & (nrxd-1)]);
2300 	prefetch(fl->ifl_sds.ifsd_m[(cidx + 3) & (nrxd-1)]);
2301 	prefetch(fl->ifl_sds.ifsd_m[(cidx + 4) & (nrxd-1)]);
2302 	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 1) & (nrxd-1)]);
2303 	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 2) & (nrxd-1)]);
2304 	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 3) & (nrxd-1)]);
2305 	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]);
2306 }
2307 
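/*
 * Translate a fragment reported by the driver (free list id + index) into
 * its software descriptor state: point sd at the saved mbuf and cluster,
 * optionally unload the DMA map, and advance the free list consumer index.
 */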
2308 static void
2309 rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int unload, if_rxsd_t sd)
2310 {
2311 	int flid, cidx;
2312 	bus_dmamap_t map;
2313 	iflib_fl_t fl;
2314 	iflib_dma_info_t di;
2315 	int next;
2316 
2317 	map = NULL;
2318 	flid = irf->irf_flid;
2319 	cidx = irf->irf_idx;
2320 	fl = &rxq->ifr_fl[flid];
2321 	sd->ifsd_fl = fl;
2322 	sd->ifsd_cidx = cidx;
2323 	sd->ifsd_m = &fl->ifl_sds.ifsd_m[cidx];
2324 	sd->ifsd_cl = &fl->ifl_sds.ifsd_cl[cidx];
2325 	fl->ifl_credits--;
2326 #if MEMORY_LOGGING
2327 	fl->ifl_m_dequeued++;
2328 #endif
2329 	if (rxq->ifr_ctx->ifc_flags & IFC_PREFETCH)
2330 		prefetch_pkts(fl, cidx);
2331 	if (fl->ifl_sds.ifsd_map != NULL) {
2332 		next = (cidx + CACHE_PTR_INCREMENT) & (fl->ifl_size-1);
2333 		prefetch(&fl->ifl_sds.ifsd_map[next]);
2334 		map = fl->ifl_sds.ifsd_map[cidx];
2335 		di = fl->ifl_ifdi;
2336 		next = (cidx + CACHE_LINE_SIZE) & (fl->ifl_size-1);
2337 		prefetch(&fl->ifl_sds.ifsd_flags[next]);
2338 		bus_dmamap_sync(di->idi_tag, di->idi_map,
2339 				BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
2340 
2341 		/* not a valid assert if bxe really does SGE from non-contiguous elements */
2342 		MPASS(fl->ifl_cidx == cidx);
2343 		if (unload)
2344 			bus_dmamap_unload(fl->ifl_desc_tag, map);
2345 	}
2346 	fl->ifl_cidx = (fl->ifl_cidx + 1) & (fl->ifl_size-1);
2347 	if (__predict_false(fl->ifl_cidx == 0))
2348 		fl->ifl_gen = 0;
2349 	if (map != NULL)
2350 		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
2351 			BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
2352 }
2353 
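/*
 * Link the fragments of a multi-descriptor packet into a single mbuf chain.
 * Zero-length fragments are discarded and pad bytes are assumed to be
 * present only on the first fragment.
 */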
2354 static struct mbuf *
2355 assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri, if_rxsd_t sd)
2356 {
2357 	int i, padlen, flags;
2358 	struct mbuf *m, *mh, *mt;
2359 	caddr_t cl;
2360 
2361 	i = 0;
2362 	mh = NULL;
2363 	do {
2364 		rxd_frag_to_sd(rxq, &ri->iri_frags[i], TRUE, sd);
2365 
2366 		MPASS(*sd->ifsd_cl != NULL);
2367 		MPASS(*sd->ifsd_m != NULL);
2368 
2369 		/* Don't include zero-length frags */
2370 		if (ri->iri_frags[i].irf_len == 0) {
2371 			/* XXX we can save the cluster here, but not the mbuf */
2372 			m_init(*sd->ifsd_m, M_NOWAIT, MT_DATA, 0);
2373 			m_free(*sd->ifsd_m);
2374 			*sd->ifsd_m = NULL;
2375 			continue;
2376 		}
2377 		m = *sd->ifsd_m;
2378 		*sd->ifsd_m = NULL;
2379 		if (mh == NULL) {
2380 			flags = M_PKTHDR|M_EXT;
2381 			mh = mt = m;
2382 			padlen = ri->iri_pad;
2383 		} else {
2384 			flags = M_EXT;
2385 			mt->m_next = m;
2386 			mt = m;
2387 			/* assuming padding is only on the first fragment */
2388 			padlen = 0;
2389 		}
2390 		cl = *sd->ifsd_cl;
2391 		*sd->ifsd_cl = NULL;
2392 
2393 		/* Can these two be made one ? */
2394 		m_init(m, M_NOWAIT, MT_DATA, flags);
2395 		m_cljset(m, cl, sd->ifsd_fl->ifl_cltype);
2396 		/*
2397 		 * These must follow m_init and m_cljset
2398 		 */
2399 		m->m_data += padlen;
2400 		ri->iri_len -= padlen;
2401 		m->m_len = ri->iri_frags[i].irf_len;
2402 	} while (++i < ri->iri_nfrags);
2403 
2404 	return (mh);
2405 }
2406 
2407 /*
2408  * Process one software descriptor
2409  */
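/*
 * Small single-fragment packets are copied into the descriptor's mbuf and
 * the cluster is left attached to the free list slot for reuse; everything
 * else is assembled into an mbuf chain by assemble_segments().
 */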
2410 static struct mbuf *
2411 iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri)
2412 {
2413 	struct if_rxsd sd;
2414 	struct mbuf *m;
2415 
2416 	/* should I merge this back in now that the two paths are basically duplicated? */
2417 	if (ri->iri_nfrags == 1 &&
2418 	    ri->iri_frags[0].irf_len <= IFLIB_RX_COPY_THRESH) {
2419 		rxd_frag_to_sd(rxq, &ri->iri_frags[0], FALSE, &sd);
2420 		m = *sd.ifsd_m;
2421 		*sd.ifsd_m = NULL;
2422 		m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);
2423 #ifndef __NO_STRICT_ALIGNMENT
2424 		if (!IP_ALIGNED(m))
2425 			m->m_data += 2;
2426 #endif
2427 		memcpy(m->m_data, *sd.ifsd_cl, ri->iri_len);
2428 		m->m_len = ri->iri_frags[0].irf_len;
2429 	} else {
2430 		m = assemble_segments(rxq, ri, &sd);
2431 	}
2432 	m->m_pkthdr.len = ri->iri_len;
2433 	m->m_pkthdr.rcvif = ri->iri_ifp;
2434 	m->m_flags |= ri->iri_flags;
2435 	m->m_pkthdr.ether_vtag = ri->iri_vtag;
2436 	m->m_pkthdr.flowid = ri->iri_flowid;
2437 	M_HASHTYPE_SET(m, ri->iri_rsstype);
2438 	m->m_pkthdr.csum_flags = ri->iri_csum_flags;
2439 	m->m_pkthdr.csum_data = ri->iri_csum_data;
2440 	return (m);
2441 }
2442 
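/*
 * Service up to 'budget' receive descriptors: have the driver fill in the
 * packet info for each completed descriptor, turn it into an mbuf (chain),
 * top up the free lists, and pass the packets to LRO or if_input().
 * Returns true if more work remains on the queue.
 */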
2443 static bool
2444 iflib_rxeof(iflib_rxq_t rxq, qidx_t budget)
2445 {
2446 	if_ctx_t ctx = rxq->ifr_ctx;
2447 	if_shared_ctx_t sctx = ctx->ifc_sctx;
2448 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2449 	int avail, i;
2450 	qidx_t *cidxp;
2451 	struct if_rxd_info ri;
2452 	int err, budget_left, rx_bytes, rx_pkts;
2453 	iflib_fl_t fl;
2454 	struct ifnet *ifp;
2455 	int lro_enabled;
2456 
2457 	/*
2458 	 * XXX early demux data packets so that if_input processing only handles
2459 	 * acks in interrupt context
2460 	 */
2461 	struct mbuf *m, *mh, *mt;
2462 
2463 	ifp = ctx->ifc_ifp;
2464 #ifdef DEV_NETMAP
2465 	if (ifp->if_capenable & IFCAP_NETMAP) {
2466 		u_int work = 0;
2467 		if (netmap_rx_irq(ifp, rxq->ifr_id, &work))
2468 			return (FALSE);
2469 	}
2470 #endif
2471 
2472 	mh = mt = NULL;
2473 	MPASS(budget > 0);
2474 	rx_pkts	= rx_bytes = 0;
2475 	if (sctx->isc_flags & IFLIB_HAS_RXCQ)
2476 		cidxp = &rxq->ifr_cq_cidx;
2477 	else
2478 		cidxp = &rxq->ifr_fl[0].ifl_cidx;
2479 	if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) {
2480 		for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
2481 			__iflib_fl_refill_lt(ctx, fl, budget + 8);
2482 		DBG_COUNTER_INC(rx_unavail);
2483 		return (false);
2484 	}
2485 
2486 	for (budget_left = budget; (budget_left > 0) && (avail > 0); budget_left--, avail--) {
2487 		if (__predict_false(!CTX_ACTIVE(ctx))) {
2488 			DBG_COUNTER_INC(rx_ctx_inactive);
2489 			break;
2490 		}
2491 		/*
2492 		 * Reset client set fields to their default values
2493 		 */
2494 		rxd_info_zero(&ri);
2495 		ri.iri_qsidx = rxq->ifr_id;
2496 		ri.iri_cidx = *cidxp;
2497 		ri.iri_ifp = ifp;
2498 		ri.iri_frags = rxq->ifr_frags;
2499 		err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
2500 
2501 		if (err)
2502 			goto err;
2503 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
2504 			*cidxp = ri.iri_cidx;
2505 			/* Update our consumer index */
2506 			/* XXX NB: shurd - check if this is still safe */
2507 			while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0]) {
2508 				rxq->ifr_cq_cidx -= scctx->isc_nrxd[0];
2509 				rxq->ifr_cq_gen = 0;
2510 			}
2511 			/* was this only a completion queue message? */
2512 			if (__predict_false(ri.iri_nfrags == 0))
2513 				continue;
2514 		}
2515 		MPASS(ri.iri_nfrags != 0);
2516 		MPASS(ri.iri_len != 0);
2517 
2518 		/* will advance the cidx on the corresponding free lists */
2519 		m = iflib_rxd_pkt_get(rxq, &ri);
2520 		if (avail == 0 && budget_left)
2521 			avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left);
2522 
2523 		if (__predict_false(m == NULL)) {
2524 			DBG_COUNTER_INC(rx_mbuf_null);
2525 			continue;
2526 		}
2527 		/* imm_pkt: -- cxgb */
2528 		if (mh == NULL)
2529 			mh = mt = m;
2530 		else {
2531 			mt->m_nextpkt = m;
2532 			mt = m;
2533 		}
2534 	}
2535 	/* make sure that we can refill faster than drain */
2536 	for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
2537 		__iflib_fl_refill_lt(ctx, fl, budget + 8);
2538 
2539 	lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO);
2540 	while (mh != NULL) {
2541 		m = mh;
2542 		mh = mh->m_nextpkt;
2543 		m->m_nextpkt = NULL;
2544 #ifndef __NO_STRICT_ALIGNMENT
2545 		if (!IP_ALIGNED(m) && (m = iflib_fixup_rx(m)) == NULL)
2546 			continue;
2547 #endif
2548 		rx_bytes += m->m_pkthdr.len;
2549 		rx_pkts++;
2550 #if defined(INET6) || defined(INET)
2551 		if (lro_enabled && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0)
2552 			continue;
2553 #endif
2554 		DBG_COUNTER_INC(rx_if_input);
2555 		ifp->if_input(ifp, m);
2556 	}
2557 
2558 	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
2559 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
2560 
2561 	/*
2562 	 * Flush any outstanding LRO work
2563 	 */
2564 #if defined(INET6) || defined(INET)
2565 	tcp_lro_flush_all(&rxq->ifr_lc);
2566 #endif
2567 	if (avail)
2568 		return (true);
2569 	return (iflib_rxd_avail(ctx, rxq, *cidxp, 1));
2570 err:
2571 	CTX_LOCK(ctx);
2572 	ctx->ifc_flags |= IFC_DO_RESET;
2573 	iflib_admin_intr_deferred(ctx);
2574 	CTX_UNLOCK(ctx);
2575 	return (false);
2576 }
2577 
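/*
 * Doorbell/report-status deferral heuristic: the busier the ring (in_use),
 * the more descriptors we allow to accumulate before flushing the doorbell
 * or requesting a TX completion interrupt.
 */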
2578 #define TXD_NOTIFY_COUNT(txq) (((txq)->ift_size / (txq)->ift_update_freq)-1)
2579 static inline qidx_t
2580 txq_max_db_deferred(iflib_txq_t txq, qidx_t in_use)
2581 {
2582 	qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
2583 	qidx_t minthresh = txq->ift_size / 8;
2584 	if (in_use > 4*minthresh)
2585 		return (notify_count);
2586 	if (in_use > 2*minthresh)
2587 		return (notify_count >> 1);
2588 	if (in_use > minthresh)
2589 		return (notify_count >> 3);
2590 	return (0);
2591 }
2592 
2593 static inline qidx_t
2594 txq_max_rs_deferred(iflib_txq_t txq)
2595 {
2596 	qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
2597 	qidx_t minthresh = txq->ift_size / 8;
2598 	if (txq->ift_in_use > 4*minthresh)
2599 		return (notify_count);
2600 	if (txq->ift_in_use > 2*minthresh)
2601 		return (notify_count >> 1);
2602 	if (txq->ift_in_use > minthresh)
2603 		return (notify_count >> 2);
2604 	return (2);
2605 }
2606 
2607 #define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags)
2608 #define M_HAS_VLANTAG(m) (m->m_flags & M_VLANTAG)
2609 
2610 #define TXQ_MAX_DB_DEFERRED(txq, in_use) txq_max_db_deferred((txq), (in_use))
2611 #define TXQ_MAX_RS_DEFERRED(txq) txq_max_rs_deferred(txq)
2612 #define TXQ_MAX_DB_CONSUMED(size) (size >> 4)
2613 
2614 /* forward compatibility for cxgb */
2615 #define FIRST_QSET(ctx) 0
2616 #define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets)
2617 #define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets)
2618 #define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
2619 #define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))
2620 
2621 /* XXX we should be setting this to something other than zero */
2622 #define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
2623 #define MAX_TX_DESC(ctx) ((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max)
2624 
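/*
 * Flush pending descriptors to the hardware (ring the doorbell) either when
 * the caller forces it or when enough descriptors have accumulated since the
 * last flush; returns whether the doorbell was actually rung.
 */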
2625 static inline bool
2626 iflib_txd_db_check(if_ctx_t ctx, iflib_txq_t txq, int ring, qidx_t in_use)
2627 {
2628 	qidx_t dbval, max;
2629 	bool rang;
2630 
2631 	rang = false;
2632 	max = TXQ_MAX_DB_DEFERRED(txq, in_use);
2633 	if (ring || txq->ift_db_pending >= max) {
2634 		dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx;
2635 		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval);
2636 		txq->ift_db_pending = txq->ift_npending = 0;
2637 		rang = true;
2638 	}
2639 	return (rang);
2640 }
2641 
2642 #ifdef PKT_DEBUG
2643 static void
2644 print_pkt(if_pkt_info_t pi)
2645 {
2646 	printf("pi len:  %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n",
2647 	       pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs, pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx);
2648 	printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n",
2649 	       pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz, pi->ipi_mflags, pi->ipi_vtag);
2650 	printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n",
2651 	       pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto);
2652 }
2653 #endif
2654 
2655 #define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO)
2656 #define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO)
2657 
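/*
 * Parse the Ethernet/IP/TCP headers of an outbound packet into the packet
 * info structure used for checksum and TSO offload, pulling the mbuf up as
 * needed so that the headers being examined are contiguous.
 */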
2658 static int
2659 iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
2660 {
2661 	if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx;
2662 	struct ether_vlan_header *eh;
2663 	struct mbuf *m, *n;
2664 
2665 	n = m = *mp;
2666 	if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) &&
2667 	    M_WRITABLE(m) == 0) {
2668 		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
2669 			return (ENOMEM);
2670 		} else {
2671 			m_freem(*mp);
2672 			n = *mp = m;
2673 		}
2674 	}
2675 
2676 	/*
2677 	 * Determine where frame payload starts.
2678 	 * Jump over vlan headers if already present,
2679 	 * helpful for QinQ too.
2680 	 */
2681 	if (__predict_false(m->m_len < sizeof(*eh))) {
2682 		txq->ift_pullups++;
2683 		if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL))
2684 			return (ENOMEM);
2685 	}
2686 	eh = mtod(m, struct ether_vlan_header *);
2687 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
2688 		pi->ipi_etype = ntohs(eh->evl_proto);
2689 		pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
2690 	} else {
2691 		pi->ipi_etype = ntohs(eh->evl_encap_proto);
2692 		pi->ipi_ehdrlen = ETHER_HDR_LEN;
2693 	}
2694 
2695 	switch (pi->ipi_etype) {
2696 #ifdef INET
2697 	case ETHERTYPE_IP:
2698 	{
2699 		struct ip *ip = NULL;
2700 		struct tcphdr *th = NULL;
2701 		int minthlen;
2702 
2703 		minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th));
2704 		if (__predict_false(m->m_len < minthlen)) {
2705 			/*
2706 			 * if this code bloat is causing too much of a hit
2707 			 * move it to a separate function and mark it noinline
2708 			 */
2709 			if (m->m_len == pi->ipi_ehdrlen) {
2710 				n = m->m_next;
2711 				MPASS(n);
2712 				if (n->m_len >= sizeof(*ip))  {
2713 					ip = (struct ip *)n->m_data;
2714 					if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th))
2715 						th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
2716 				} else {
2717 					txq->ift_pullups++;
2718 					if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
2719 						return (ENOMEM);
2720 					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
2721 				}
2722 			} else {
2723 				txq->ift_pullups++;
2724 				if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
2725 					return (ENOMEM);
2726 				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
2727 				if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
2728 					th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
2729 			}
2730 		} else {
2731 			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
2732 			if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
2733 				th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
2734 		}
2735 		pi->ipi_ip_hlen = ip->ip_hl << 2;
2736 		pi->ipi_ipproto = ip->ip_p;
2737 		pi->ipi_flags |= IPI_TX_IPV4;
2738 
2739 		if (pi->ipi_csum_flags & CSUM_IP)
2740 			ip->ip_sum = 0;
2741 
2742 		if (pi->ipi_ipproto == IPPROTO_TCP) {
2743 			if (__predict_false(th == NULL)) {
2744 				txq->ift_pullups++;
2745 				if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL))
2746 					return (ENOMEM);
2747 				th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen);
2748 			}
2749 			pi->ipi_tcp_hflags = th->th_flags;
2750 			pi->ipi_tcp_hlen = th->th_off << 2;
2751 			pi->ipi_tcp_seq = th->th_seq;
2752 		}
2753 		if (IS_TSO4(pi)) {
2754 			if (__predict_false(ip->ip_p != IPPROTO_TCP))
2755 				return (ENXIO);
2756 			th->th_sum = in_pseudo(ip->ip_src.s_addr,
2757 					       ip->ip_dst.s_addr, htons(IPPROTO_TCP));
2758 			pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
2759 			if (sctx->isc_flags & IFLIB_TSO_INIT_IP) {
2760 				ip->ip_sum = 0;
2761 				ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz);
2762 			}
2763 		}
2764 		break;
2765 	}
2766 #endif
2767 #ifdef INET6
2768 	case ETHERTYPE_IPV6:
2769 	{
2770 		struct ip6_hdr *ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
2771 		struct tcphdr *th;
2772 		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
2773 
2774 		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
2775 			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
2776 				return (ENOMEM);
2777 		}
2778 		th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
2779 
2780 		/* XXX-BZ this will go badly in case of ext hdrs. */
2781 		pi->ipi_ipproto = ip6->ip6_nxt;
2782 		pi->ipi_flags |= IPI_TX_IPV6;
2783 
2784 		if (pi->ipi_ipproto == IPPROTO_TCP) {
2785 			if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) {
2786 				if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL))
2787 					return (ENOMEM);
2788 			}
2789 			pi->ipi_tcp_hflags = th->th_flags;
2790 			pi->ipi_tcp_hlen = th->th_off << 2;
2791 		}
2792 		if (IS_TSO6(pi)) {
2793 
2794 			if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP))
2795 				return (ENXIO);
2796 			/*
2797 			 * The corresponding flag is set by the stack in the IPv4
2798 			 * TSO case, but not in IPv6 (at least in FreeBSD 10.2).
2799 			 * So, set it here because the rest of the flow requires it.
2800 			 */
2801 			pi->ipi_csum_flags |= CSUM_TCP_IPV6;
2802 			th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
2803 			pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
2804 		}
2805 		break;
2806 	}
2807 #endif
2808 	default:
2809 		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
2810 		pi->ipi_ip_hlen = 0;
2811 		break;
2812 	}
2813 	*mp = m;
2814 
2815 	return (0);
2816 }
2817 
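/*
 * Strip the zero-length mbufs that immediately follow the packet header;
 * if the first data mbuf is not cluster-backed the chain is defragmented,
 * otherwise the header contents are copied into it and it becomes the head.
 */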
2818 static __noinline struct mbuf *
2819 collapse_pkthdr(struct mbuf *m0)
2820 {
2821 	struct mbuf *m, *m_next, *tmp;
2822 
2823 	m = m0;
2824 	m_next = m->m_next;
2825 	while (m_next != NULL && m_next->m_len == 0) {
2826 		tmp = m_next->m_next;
2827 		m_next->m_next = NULL;
2828 		m_free(m_next);
2829 		m_next = tmp;
2830 	}
2831 	m = m0;
2832 	m->m_next = m_next;
2833 	if ((m_next->m_flags & M_EXT) == 0) {
2834 		m = m_defrag(m, M_NOWAIT);
2835 	} else {
2836 		tmp = m_next->m_next;
2837 		memcpy(m_next, m, MPKTHSIZE);
2838 		m = m_next;
2839 		m->m_next = tmp;
2840 	}
2841 	return (m);
2842 }
2843 
2844 /*
2845  * If dodgy hardware rejects the scatter-gather chain we've handed it,
2846  * we'll need to remove the mbuf chain from ifsd_m[] before we can add the
2847  * m_defrag'd mbufs
2848  */
2849 static __noinline struct mbuf *
2850 iflib_remove_mbuf(iflib_txq_t txq)
2851 {
2852 	int ntxd, i, pidx;
2853 	struct mbuf *m, *mh, **ifsd_m;
2854 
2855 	pidx = txq->ift_pidx;
2856 	ifsd_m = txq->ift_sds.ifsd_m;
2857 	ntxd = txq->ift_size;
2858 	mh = m = ifsd_m[pidx];
2859 	ifsd_m[pidx] = NULL;
2860 #if MEMORY_LOGGING
2861 	txq->ift_dequeued++;
2862 #endif
2863 	i = 1;
2864 
2865 	while (m) {
2866 		ifsd_m[(pidx + i) & (ntxd -1)] = NULL;
2867 #if MEMORY_LOGGING
2868 		txq->ift_dequeued++;
2869 #endif
2870 		m = m->m_next;
2871 		i++;
2872 	}
2873 	return (mh);
2874 }
2875 
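/*
 * Map an mbuf chain for transmission.  With a busdma map present the chain
 * is loaded via bus_dmamap_load_mbuf_sg(); otherwise (no IOMMU or bouncing
 * required) the physical segments are derived directly with pmap_kextract().
 * Each mbuf is recorded in ifsd_m[] so it can be freed on completion.
 */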
2876 static int
2877 iflib_busdma_load_mbuf_sg(iflib_txq_t txq, bus_dma_tag_t tag, bus_dmamap_t map,
2878 			  struct mbuf **m0, bus_dma_segment_t *segs, int *nsegs,
2879 			  int max_segs, int flags)
2880 {
2881 	if_ctx_t ctx;
2882 	if_shared_ctx_t		sctx;
2883 	if_softc_ctx_t		scctx;
2884 	int i, next, pidx, mask, err, maxsegsz, ntxd, count;
2885 	struct mbuf *m, *tmp, **ifsd_m, **mp;
2886 
2887 	m = *m0;
2888 
2889 	/*
2890 	 * Please don't ever do this
2891 	 */
2892 	if (__predict_false(m->m_len == 0))
2893 		*m0 = m = collapse_pkthdr(m);
2894 
2895 	ctx = txq->ift_ctx;
2896 	sctx = ctx->ifc_sctx;
2897 	scctx = &ctx->ifc_softc_ctx;
2898 	ifsd_m = txq->ift_sds.ifsd_m;
2899 	ntxd = txq->ift_size;
2900 	pidx = txq->ift_pidx;
2901 	if (map != NULL) {
2902 		uint8_t *ifsd_flags = txq->ift_sds.ifsd_flags;
2903 
2904 		err = bus_dmamap_load_mbuf_sg(tag, map,
2905 					      *m0, segs, nsegs, BUS_DMA_NOWAIT);
2906 		if (err)
2907 			return (err);
2908 		ifsd_flags[pidx] |= TX_SW_DESC_MAPPED;
2909 		i = 0;
2910 		next = pidx;
2911 		mask = (txq->ift_size-1);
2912 		m = *m0;
2913 		do {
2914 			mp = &ifsd_m[next];
2915 			*mp = m;
2916 			m = m->m_next;
2917 			if (__predict_false((*mp)->m_len == 0)) {
2918 				m_free(*mp);
2919 				*mp = NULL;
2920 			} else
2921 				next = (pidx + ++i) & (ntxd-1);
2922 		} while (m != NULL);
2923 	} else {
2924 		int buflen, sgsize, max_sgsize;
2925 		vm_offset_t vaddr;
2926 		vm_paddr_t curaddr;
2927 
2928 		count = i = 0;
2929 		maxsegsz = sctx->isc_tx_maxsize;
2930 		m = *m0;
2931 		do {
2932 			if (__predict_false(m->m_len <= 0)) {
2933 				tmp = m;
2934 				m = m->m_next;
2935 				tmp->m_next = NULL;
2936 				m_free(tmp);
2937 				continue;
2938 			}
2939 			buflen = m->m_len;
2940 			vaddr = (vm_offset_t)m->m_data;
2941 			/*
2942 			 * see if we can't be smarter about physically
2943 			 * contiguous mappings
2944 			 */
2945 			next = (pidx + count) & (ntxd-1);
2946 			MPASS(ifsd_m[next] == NULL);
2947 #if MEMORY_LOGGING
2948 			txq->ift_enqueued++;
2949 #endif
2950 			ifsd_m[next] = m;
2951 			while (buflen > 0) {
2952 				max_sgsize = MIN(buflen, maxsegsz);
2953 				curaddr = pmap_kextract(vaddr);
2954 				sgsize = PAGE_SIZE - (curaddr & PAGE_MASK);
2955 				sgsize = MIN(sgsize, max_sgsize);
2956 				segs[i].ds_addr = curaddr;
2957 				segs[i].ds_len = sgsize;
2958 				vaddr += sgsize;
2959 				buflen -= sgsize;
2960 				i++;
2961 				if (i >= max_segs)
2962 					goto err;
2963 			}
2964 			count++;
2965 			tmp = m;
2966 			m = m->m_next;
2967 		} while (m != NULL);
2968 		*nsegs = i;
2969 	}
2970 	return (0);
2971 err:
2972 	*m0 = iflib_remove_mbuf(txq);
2973 	return (EFBIG);
2974 }
2975 
2976 static inline caddr_t
2977 calc_next_txd(iflib_txq_t txq, int cidx, uint8_t qid)
2978 {
2979 	qidx_t size;
2980 	int ntxd;
2981 	caddr_t start, end, cur, next;
2982 
2983 	ntxd = txq->ift_size;
2984 	size = txq->ift_txd_size[qid];
2985 	start = txq->ift_ifdi[qid].idi_vaddr;
2986 
2987 	if (__predict_false(size == 0))
2988 		return (start);
2989 	cur = start + size*cidx;
2990 	end = start + size*ntxd;
2991 	next = CACHE_PTR_NEXT(cur);
2992 	return (next < end ? next : start);
2993 }
2994 
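/*
 * Encapsulate a single packet: DMA-map the chain, fill in the packet info,
 * and hand it to the driver's txd_encap method.  EFBIG failures are retried
 * after m_collapse() and then m_defrag(); descriptor accounting and the
 * report-status/interrupt deferral are updated on success.
 */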
2995 static int
2996 iflib_encap(iflib_txq_t txq, struct mbuf **m_headp)
2997 {
2998 	if_ctx_t		ctx;
2999 	if_shared_ctx_t		sctx;
3000 	if_softc_ctx_t		scctx;
3001 	bus_dma_segment_t	*segs;
3002 	struct mbuf		*m_head;
3003 	void			*next_txd;
3004 	bus_dmamap_t		map;
3005 	struct if_pkt_info	pi;
3006 	int remap = 0;
3007 	int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd;
3008 	bus_dma_tag_t desc_tag;
3009 
3010 	segs = txq->ift_segs;
3011 	ctx = txq->ift_ctx;
3012 	sctx = ctx->ifc_sctx;
3013 	scctx = &ctx->ifc_softc_ctx;
3014 	segs = txq->ift_segs;
3015 	ntxd = txq->ift_size;
3016 	m_head = *m_headp;
3017 	map = NULL;
3018 
3019 	/*
3020 	 * If we're doing TSO the next descriptor to clean may be quite far ahead
3021 	 */
3022 	cidx = txq->ift_cidx;
3023 	pidx = txq->ift_pidx;
3024 	if (ctx->ifc_flags & IFC_PREFETCH) {
3025 		next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1);
3026 		if (!(ctx->ifc_flags & IFLIB_HAS_TXCQ)) {
3027 			next_txd = calc_next_txd(txq, cidx, 0);
3028 			prefetch(next_txd);
3029 		}
3030 
3031 		/* prefetch the next cache line of mbuf pointers and flags */
3032 		prefetch(&txq->ift_sds.ifsd_m[next]);
3033 		if (txq->ift_sds.ifsd_map != NULL) {
3034 			prefetch(&txq->ift_sds.ifsd_map[next]);
3035 			next = (cidx + CACHE_LINE_SIZE) & (ntxd-1);
3036 			prefetch(&txq->ift_sds.ifsd_flags[next]);
3037 		}
3038 	} else if (txq->ift_sds.ifsd_map != NULL)
3039 		map = txq->ift_sds.ifsd_map[pidx];
3040 
3041 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3042 		desc_tag = txq->ift_tso_desc_tag;
3043 		max_segs = scctx->isc_tx_tso_segments_max;
3044 	} else {
3045 		desc_tag = txq->ift_desc_tag;
3046 		max_segs = scctx->isc_tx_nsegments;
3047 	}
3048 	m_head = *m_headp;
3049 
3050 	pkt_info_zero(&pi);
3051 	pi.ipi_len = m_head->m_pkthdr.len;
3052 	pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST));
3053 	pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags;
3054 	pi.ipi_vtag = (m_head->m_flags & M_VLANTAG) ? m_head->m_pkthdr.ether_vtag : 0;
3055 	pi.ipi_pidx = pidx;
3056 	pi.ipi_qsidx = txq->ift_id;
3057 
3058 	/* deliberate bitwise OR to make one condition */
3059 	if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) {
3060 		if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0))
3061 			return (err);
3062 		m_head = *m_headp;
3063 	}
3064 
3065 retry:
3066 	err = iflib_busdma_load_mbuf_sg(txq, desc_tag, map, m_headp, segs, &nsegs, max_segs, BUS_DMA_NOWAIT);
3067 defrag:
3068 	if (__predict_false(err)) {
3069 		switch (err) {
3070 		case EFBIG:
3071 			/* try collapse once and defrag once */
3072 			if (remap == 0)
3073 				m_head = m_collapse(*m_headp, M_NOWAIT, max_segs);
3074 			if (remap == 1)
3075 				m_head = m_defrag(*m_headp, M_NOWAIT);
3076 			remap++;
3077 			if (__predict_false(m_head == NULL))
3078 				goto defrag_failed;
3079 			txq->ift_mbuf_defrag++;
3080 			*m_headp = m_head;
3081 			goto retry;
3082 			break;
3083 		case ENOMEM:
3084 			txq->ift_no_tx_dma_setup++;
3085 			break;
3086 		default:
3087 			txq->ift_no_tx_dma_setup++;
3088 			m_freem(*m_headp);
3089 			DBG_COUNTER_INC(tx_frees);
3090 			*m_headp = NULL;
3091 			break;
3092 		}
3093 		txq->ift_map_failed++;
3094 		DBG_COUNTER_INC(encap_load_mbuf_fail);
3095 		return (err);
3096 	}
3097 
3098 	/*
3099 	 * XXX assumes a 1 to 1 relationship between segments and
3100 	 *        descriptors - this does not hold true on all drivers, e.g.
3101 	 *        cxgb
3102 	 */
3103 	if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
3104 		txq->ift_no_desc_avail++;
3105 		if (map != NULL)
3106 			bus_dmamap_unload(desc_tag, map);
3107 		DBG_COUNTER_INC(encap_txq_avail_fail);
3108 		if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
3109 			GROUPTASK_ENQUEUE(&txq->ift_task);
3110 		return (ENOBUFS);
3111 	}
3112 	/*
3113 	 * On Intel cards we can greatly reduce the number of TX interrupts
3114 	 * we see by only setting report status on every Nth descriptor.
3115 	 * However, this also means that the driver will need to keep track
3116 	 * of the descriptors that RS was set on to check them for the DD bit.
3117 	 */
3118 	txq->ift_rs_pending += nsegs + 1;
3119 	if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) ||
3120 	     iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs - 1) <= MAX_TX_DESC(ctx)) {
3121 		pi.ipi_flags |= IPI_TX_INTR;
3122 		txq->ift_rs_pending = 0;
3123 	}
3124 
3125 	pi.ipi_segs = segs;
3126 	pi.ipi_nsegs = nsegs;
3127 
3128 	MPASS(pidx >= 0 && pidx < txq->ift_size);
3129 #ifdef PKT_DEBUG
3130 	print_pkt(&pi);
3131 #endif
3132 	if (map != NULL)
3133 		bus_dmamap_sync(desc_tag, map, BUS_DMASYNC_PREWRITE);
3134 	if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) {
3135 		if (map != NULL)
3136 			bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
3137 					BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
3138 		DBG_COUNTER_INC(tx_encap);
3139 		MPASS(pi.ipi_new_pidx < txq->ift_size);
3140 
3141 		ndesc = pi.ipi_new_pidx - pi.ipi_pidx;
3142 		if (pi.ipi_new_pidx < pi.ipi_pidx) {
3143 			ndesc += txq->ift_size;
3144 			txq->ift_gen = 1;
3145 		}
3146 		/*
3147 		 * drivers can need as many as
3148 		 * two sentinels
3149 		 */
3150 		MPASS(ndesc <= pi.ipi_nsegs + 2);
3151 		MPASS(pi.ipi_new_pidx != pidx);
3152 		MPASS(ndesc > 0);
3153 		txq->ift_in_use += ndesc;
3154 
3155 		/*
3156 		 * We update the last software descriptor again here because there may
3157 		 * be a sentinel and/or there may be more mbufs than segments
3158 		 */
3159 		txq->ift_pidx = pi.ipi_new_pidx;
3160 		txq->ift_npending += pi.ipi_ndescs;
3161 	} else if (__predict_false(err == EFBIG && remap < 2)) {
3162 		*m_headp = m_head = iflib_remove_mbuf(txq);
3163 		remap = 1;
3164 		txq->ift_txd_encap_efbig++;
3165 		goto defrag;
3166 	} else
3167 		DBG_COUNTER_INC(encap_txd_encap_fail);
3168 	return (err);
3169 
3170 defrag_failed:
3171 	txq->ift_mbuf_defrag_failed++;
3172 	txq->ift_map_failed++;
3173 	m_freem(*m_headp);
3174 	DBG_COUNTER_INC(tx_frees);
3175 	*m_headp = NULL;
3176 	return (ENOMEM);
3177 }
3178 
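/*
 * Reclaim the software state for 'n' completed descriptors starting at the
 * consumer index: unload any DMA maps and free the mbufs recorded for them.
 */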
3179 static void
3180 iflib_tx_desc_free(iflib_txq_t txq, int n)
3181 {
3182 	int hasmap;
3183 	uint32_t qsize, cidx, mask, gen;
3184 	struct mbuf *m, **ifsd_m;
3185 	uint8_t *ifsd_flags;
3186 	bus_dmamap_t *ifsd_map;
3187 	bool do_prefetch;
3188 
3189 	cidx = txq->ift_cidx;
3190 	gen = txq->ift_gen;
3191 	qsize = txq->ift_size;
3192 	mask = qsize-1;
3193 	hasmap = txq->ift_sds.ifsd_map != NULL;
3194 	ifsd_flags = txq->ift_sds.ifsd_flags;
3195 	ifsd_m = txq->ift_sds.ifsd_m;
3196 	ifsd_map = txq->ift_sds.ifsd_map;
3197 	do_prefetch = (txq->ift_ctx->ifc_flags & IFC_PREFETCH);
3198 
3199 	while (n--) {
3200 		if (do_prefetch) {
3201 			prefetch(ifsd_m[(cidx + 3) & mask]);
3202 			prefetch(ifsd_m[(cidx + 4) & mask]);
3203 		}
3204 		if (ifsd_m[cidx] != NULL) {
3205 			prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]);
3206 			prefetch(&ifsd_flags[(cidx + CACHE_PTR_INCREMENT) & mask]);
3207 			if (hasmap && (ifsd_flags[cidx] & TX_SW_DESC_MAPPED)) {
3208 				/*
3209 				 * does it matter if it's not the TSO tag? If so we'll
3210 				 * have to add the type to flags
3211 				 */
3212 				bus_dmamap_unload(txq->ift_desc_tag, ifsd_map[cidx]);
3213 				ifsd_flags[cidx] &= ~TX_SW_DESC_MAPPED;
3214 			}
3215 			if ((m = ifsd_m[cidx]) != NULL) {
3216 				/* XXX we don't support any drivers that batch packets yet */
3217 				MPASS(m->m_nextpkt == NULL);
3218 
3219 				m_free(m);
3220 				ifsd_m[cidx] = NULL;
3221 #if MEMORY_LOGGING
3222 				txq->ift_dequeued++;
3223 #endif
3224 				DBG_COUNTER_INC(tx_frees);
3225 			}
3226 		}
3227 		if (__predict_false(++cidx == qsize)) {
3228 			cidx = 0;
3229 			gen = 0;
3230 		}
3231 	}
3232 	txq->ift_cidx = cidx;
3233 	txq->ift_gen = gen;
3234 }
3235 
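/*
 * Update completion credits from the driver and, if more than 'thresh'
 * descriptors are reclaimable, free their software state and return the
 * number of descriptors reclaimed (0 otherwise).
 */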
3236 static __inline int
3237 iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh)
3238 {
3239 	int reclaim;
3240 	if_ctx_t ctx = txq->ift_ctx;
3241 
3242 	KASSERT(thresh >= 0, ("invalid threshold to reclaim"));
3243 	MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size);
3244 
3245 	/*
3246 	 * Need a rate-limiting check so that this isn't called every time
3247 	 */
3248 	iflib_tx_credits_update(ctx, txq);
3249 	reclaim = DESC_RECLAIMABLE(txq);
3250 
3251 	if (reclaim <= thresh /* + MAX_TX_DESC(txq->ift_ctx) */) {
3252 #ifdef INVARIANTS
3253 		if (iflib_verbose_debug) {
3254 			printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__,
3255 			       txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments,
3256 			       reclaim, thresh);
3257 
3258 		}
3259 #endif
3260 		return (0);
3261 	}
3262 	iflib_tx_desc_free(txq, reclaim);
3263 	txq->ift_cleaned += reclaim;
3264 	txq->ift_in_use -= reclaim;
3265 
3266 	return (reclaim);
3267 }
3268 
3269 static struct mbuf **
3270 _ring_peek_one(struct ifmp_ring *r, int cidx, int offset, int remaining)
3271 {
3272 	int next, size;
3273 	struct mbuf **items;
3274 
3275 	size = r->size;
3276 	next = (cidx + CACHE_PTR_INCREMENT) & (size-1);
3277 	items = __DEVOLATILE(struct mbuf **, &r->items[0]);
3278 
3279 	prefetch(items[(cidx + offset) & (size-1)]);
3280 	if (remaining > 1) {
3281 		prefetch(&items[next]);
3282 		prefetch(items[(cidx + offset + 1) & (size-1)]);
3283 		prefetch(items[(cidx + offset + 2) & (size-1)]);
3284 		prefetch(items[(cidx + offset + 3) & (size-1)]);
3285 	}
3286 	return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (size-1)]));
3287 }
3288 
3289 static void
3290 iflib_txq_check_drain(iflib_txq_t txq, int budget)
3291 {
3292 
3293 	ifmp_ring_check_drainage(txq->ift_br, budget);
3294 }
3295 
3296 static uint32_t
3297 iflib_txq_can_drain(struct ifmp_ring *r)
3298 {
3299 	iflib_txq_t txq = r->cookie;
3300 	if_ctx_t ctx = txq->ift_ctx;
3301 
3302 	return ((TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2) ||
3303 		ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, false));
3304 }
3305 
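/*
 * mp_ring drain handler for a TX queue: reclaim completed descriptors,
 * encapsulate up to TX_BATCH_SIZE queued mbufs, ring the doorbell as needed,
 * and return the number of ring entries consumed.
 */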
3306 static uint32_t
3307 iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
3308 {
3309 	iflib_txq_t txq = r->cookie;
3310 	if_ctx_t ctx = txq->ift_ctx;
3311 	struct ifnet *ifp = ctx->ifc_ifp;
3312 	struct mbuf **mp, *m;
3313 	int i, count, consumed, pkt_sent, bytes_sent, mcast_sent, avail;
3314 	int reclaimed, err, in_use_prev, desc_used;
3315 	bool do_prefetch, ring, rang;
3316 
3317 	if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) ||
3318 			    !LINK_ACTIVE(ctx))) {
3319 		DBG_COUNTER_INC(txq_drain_notready);
3320 		return (0);
3321 	}
3322 	reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
3323 	rang = iflib_txd_db_check(ctx, txq, reclaimed, txq->ift_in_use);
3324 	avail = IDXDIFF(pidx, cidx, r->size);
3325 	if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) {
3326 		DBG_COUNTER_INC(txq_drain_flushing);
3327 		for (i = 0; i < avail; i++) {
3328 			m_free(r->items[(cidx + i) & (r->size-1)]);
3329 			r->items[(cidx + i) & (r->size-1)] = NULL;
3330 		}
3331 		return (avail);
3332 	}
3333 
3334 	if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
3335 		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
3336 		CALLOUT_LOCK(txq);
3337 		callout_stop(&txq->ift_timer);
3338 		CALLOUT_UNLOCK(txq);
3339 		DBG_COUNTER_INC(txq_drain_oactive);
3340 		return (0);
3341 	}
3342 	if (reclaimed)
3343 		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
3344 	consumed = mcast_sent = bytes_sent = pkt_sent = 0;
3345 	count = MIN(avail, TX_BATCH_SIZE);
3346 #ifdef INVARIANTS
3347 	if (iflib_verbose_debug)
3348 		printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__,
3349 		       avail, ctx->ifc_flags, TXQ_AVAIL(txq));
3350 #endif
3351 	do_prefetch = (ctx->ifc_flags & IFC_PREFETCH);
3352 	avail = TXQ_AVAIL(txq);
3353 	for (desc_used = i = 0; i < count && avail > MAX_TX_DESC(ctx) + 2; i++) {
3354 		int pidx_prev, rem = do_prefetch ? count - i : 0;
3355 
3356 		mp = _ring_peek_one(r, cidx, i, rem);
3357 		MPASS(mp != NULL && *mp != NULL);
3358 		if (__predict_false(*mp == (struct mbuf *)txq)) {
3359 			consumed++;
3360 			reclaimed++;
3361 			continue;
3362 		}
3363 		in_use_prev = txq->ift_in_use;
3364 		pidx_prev = txq->ift_pidx;
3365 		err = iflib_encap(txq, mp);
3366 		if (__predict_false(err)) {
3367 			DBG_COUNTER_INC(txq_drain_encapfail);
3368 			/* no room - bail out */
3369 			if (err == ENOBUFS)
3370 				break;
3371 			consumed++;
3373 			/* we can't send this packet - skip it */
3374 			continue;
3375 		}
3376 		consumed++;
3377 		pkt_sent++;
3378 		m = *mp;
3379 		DBG_COUNTER_INC(tx_sent);
3380 		bytes_sent += m->m_pkthdr.len;
3381 		mcast_sent += !!(m->m_flags & M_MCAST);
3382 		avail = TXQ_AVAIL(txq);
3383 
3384 		txq->ift_db_pending += (txq->ift_in_use - in_use_prev);
3385 		desc_used += (txq->ift_in_use - in_use_prev);
3386 		ETHER_BPF_MTAP(ifp, m);
3387 		if (__predict_false(!(ifp->if_drv_flags & IFF_DRV_RUNNING)))
3388 			break;
3389 		rang = iflib_txd_db_check(ctx, txq, false, in_use_prev);
3390 	}
3391 
3392 	/* deliberate use of bitwise or to avoid gratuitous short-circuit */
3393 	ring = rang ? false  : (iflib_min_tx_latency | err) || (TXQ_AVAIL(txq) < MAX_TX_DESC(ctx));
3394 	iflib_txd_db_check(ctx, txq, ring, txq->ift_in_use);
3395 	if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
3396 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
3397 	if (mcast_sent)
3398 		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent);
3399 #ifdef INVARIANTS
3400 	if (iflib_verbose_debug)
3401 		printf("consumed=%d\n", consumed);
3402 #endif
3403 	return (consumed);
3404 }
3405 
3406 static uint32_t
3407 iflib_txq_drain_always(struct ifmp_ring *r)
3408 {
3409 	return (1);
3410 }
3411 
3412 static uint32_t
3413 iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
3414 {
3415 	int i, avail;
3416 	struct mbuf **mp;
3417 	iflib_txq_t txq;
3418 
3419 	txq = r->cookie;
3420 
3421 	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
3422 	CALLOUT_LOCK(txq);
3423 	callout_stop(&txq->ift_timer);
3424 	CALLOUT_UNLOCK(txq);
3425 
3426 	avail = IDXDIFF(pidx, cidx, r->size);
3427 	for (i = 0; i < avail; i++) {
3428 		mp = _ring_peek_one(r, cidx, i, avail - i);
3429 		if (__predict_false(*mp == (struct mbuf *)txq))
3430 			continue;
3431 		m_freem(*mp);
3432 	}
3433 	MPASS(ifmp_ring_is_stalled(r) == 0);
3434 	return (avail);
3435 }
3436 
3437 static void
3438 iflib_ifmp_purge(iflib_txq_t txq)
3439 {
3440 	struct ifmp_ring *r;
3441 
3442 	r = txq->ift_br;
3443 	r->drain = iflib_txq_drain_free;
3444 	r->can_drain = iflib_txq_drain_always;
3445 
3446 	ifmp_ring_check_drainage(r, r->size);
3447 
3448 	r->drain = iflib_txq_drain;
3449 	r->can_drain = iflib_txq_can_drain;
3450 }
3451 
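/*
 * Deferred TX task: for netmap queues just update credits and re-enable the
 * interrupt; otherwise push the queue's doorbell marker onto the mp_ring (or
 * check it for drainage) and then re-enable the TX queue interrupt.
 */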
3452 static void
3453 _task_fn_tx(void *context)
3454 {
3455 	iflib_txq_t txq = context;
3456 	if_ctx_t ctx = txq->ift_ctx;
3457 	struct ifnet *ifp = ctx->ifc_ifp;
3458 	int rc;
3459 
3460 #ifdef IFLIB_DIAGNOSTICS
3461 	txq->ift_cpu_exec_count[curcpu]++;
3462 #endif
3463 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
3464 		return;
3465 	if ((ifp->if_capenable & IFCAP_NETMAP)) {
3466 		if (ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, false))
3467 			netmap_tx_irq(ifp, txq->ift_id);
3468 		IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id);
3469 		return;
3470 	}
3471 	if (txq->ift_db_pending)
3472 		ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE);
3473 	else
3474 		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
3475 	if (ctx->ifc_flags & IFC_LEGACY)
3476 		IFDI_INTR_ENABLE(ctx);
3477 	else {
3478 		rc = IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id);
3479 		KASSERT(rc != ENOTSUP, ("MSI-X support requires queue_intr_enable, but not implemented in driver"));
3480 	}
3481 }
3482 
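/*
 * Deferred RX task: process a batch of received descriptors and either
 * re-enable the queue interrupt or reschedule itself if more work remains.
 */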
3483 static void
3484 _task_fn_rx(void *context)
3485 {
3486 	iflib_rxq_t rxq = context;
3487 	if_ctx_t ctx = rxq->ifr_ctx;
3488 	bool more;
3489 	int rc;
3490 
3491 #ifdef IFLIB_DIAGNOSTICS
3492 	rxq->ifr_cpu_exec_count[curcpu]++;
3493 #endif
3494 	DBG_COUNTER_INC(task_fn_rxs);
3495 	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
3496 		return;
3497 	if ((more = iflib_rxeof(rxq, 16 /* XXX */)) == false) {
3498 		if (ctx->ifc_flags & IFC_LEGACY)
3499 			IFDI_INTR_ENABLE(ctx);
3500 		else {
3501 			DBG_COUNTER_INC(rx_intr_enables);
3502 			rc = IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
3503 			KASSERT(rc != ENOTSUP, ("MSI-X support requires queue_intr_enable, but not implemented in driver"));
3504 		}
3505 	}
3506 	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
3507 		return;
3508 	if (more)
3509 		GROUPTASK_ENQUEUE(&rxq->ifr_task);
3510 }
3511 
3512 static void
3513 _task_fn_admin(void *context)
3514 {
3515 	if_ctx_t ctx = context;
3516 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
3517 	iflib_txq_t txq;
3518 	int i;
3519 
3520 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) {
3521 		if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
3522 			return;
3523 		}
3524 	}
3525 
3526 	CTX_LOCK(ctx);
3527 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
3528 		CALLOUT_LOCK(txq);
3529 		callout_stop(&txq->ift_timer);
3530 		CALLOUT_UNLOCK(txq);
3531 	}
3532 	IFDI_UPDATE_ADMIN_STATUS(ctx);
3533 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
3534 		callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu);
3535 	IFDI_LINK_INTR_ENABLE(ctx);
3536 	if (ctx->ifc_flags & IFC_DO_RESET) {
3537 		ctx->ifc_flags &= ~IFC_DO_RESET;
3538 		iflib_if_init_locked(ctx);
3539 	}
3540 	CTX_UNLOCK(ctx);
3541 
3542 	if (LINK_ACTIVE(ctx) == 0)
3543 		return;
3544 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
3545 		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
3546 }
3547 
3548 
3549 static void
3550 _task_fn_iov(void *context)
3551 {
3552 	if_ctx_t ctx = context;
3553 
3554 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
3555 		return;
3556 
3557 	CTX_LOCK(ctx);
3558 	IFDI_VFLR_HANDLE(ctx);
3559 	CTX_UNLOCK(ctx);
3560 }
3561 
3562 static int
3563 iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
3564 {
3565 	int err;
3566 	if_int_delay_info_t info;
3567 	if_ctx_t ctx;
3568 
3569 	info = (if_int_delay_info_t)arg1;
3570 	ctx = info->iidi_ctx;
3571 	info->iidi_req = req;
3572 	info->iidi_oidp = oidp;
3573 	CTX_LOCK(ctx);
3574 	err = IFDI_SYSCTL_INT_DELAY(ctx, info);
3575 	CTX_UNLOCK(ctx);
3576 	return (err);
3577 }
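
/*
 * A hypothetical sketch (not taken from any particular driver) of how this
 * handler gets wired up: the driver fills in an if_int_delay_info structure
 * and registers it through iflib_add_int_delay_sysctl() (defined later in
 * this file), which points the sysctl at iflib_sysctl_int_delay() above.
 * "sc", "tx_itr_info" and "MYDEV_TX_ITR_REG" are illustrative names only:
 *
 *	iflib_add_int_delay_sysctl(ctx, "tx_itr",
 *	    "TX interrupt moderation value", &sc->tx_itr_info,
 *	    MYDEV_TX_ITR_REG, 128);
 *
 * The driver's IFDI_SYSCTL_INT_DELAY() method then consumes
 * iidi_offset/iidi_value to program the corresponding hardware register.
 */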
3578 
3579 /*********************************************************************
3580  *
3581  *  IFNET FUNCTIONS
3582  *
3583  **********************************************************************/
3584 
3585 static void
3586 iflib_if_init_locked(if_ctx_t ctx)
3587 {
3588 	iflib_stop(ctx);
3589 	iflib_init_locked(ctx);
3590 }
3591 
3592 
3593 static void
3594 iflib_if_init(void *arg)
3595 {
3596 	if_ctx_t ctx = arg;
3597 
3598 	CTX_LOCK(ctx);
3599 	iflib_if_init_locked(ctx);
3600 	CTX_UNLOCK(ctx);
3601 }
3602 
3603 static int
3604 iflib_if_transmit(if_t ifp, struct mbuf *m)
3605 {
3606 	if_ctx_t	ctx = if_getsoftc(ifp);
3607 
3608 	iflib_txq_t txq;
3609 	int err, qidx;
3610 
3611 	if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) {
3612 		DBG_COUNTER_INC(tx_frees);
3613 		m_freem(m);
3614 		return (ENOBUFS);
3615 	}
3616 
3617 	MPASS(m->m_nextpkt == NULL);
3618 	qidx = 0;
3619 	if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m))
3620 		qidx = QIDX(ctx, m);
3621 	/*
3622 	 * XXX calculate buf_ring based on flowid (divvy up bits?)
3623 	 */
3624 	txq = &ctx->ifc_txqs[qidx];
3625 
3626 #ifdef DRIVER_BACKPRESSURE
3627 	if (txq->ift_closed) {
3628 		while (m != NULL) {
3629 			next = m->m_nextpkt;
3630 			m->m_nextpkt = NULL;
3631 			m_freem(m);
3632 			m = next;
3633 		}
3634 		return (ENOBUFS);
3635 	}
3636 #endif
3637 #ifdef notyet
3638 	qidx = count = 0;
3639 	mp = marr;
3640 	next = m;
3641 	do {
3642 		count++;
3643 		next = next->m_nextpkt;
3644 	} while (next != NULL);
3645 
3646 	if (count > nitems(marr))
3647 		if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) {
3648 			/* XXX check nextpkt */
3649 			m_freem(m);
3650 			/* XXX simplify for now */
3651 			DBG_COUNTER_INC(tx_frees);
3652 			return (ENOBUFS);
3653 		}
3654 	for (next = m, i = 0; next != NULL; i++) {
3655 		mp[i] = next;
3656 		next = next->m_nextpkt;
3657 		mp[i]->m_nextpkt = NULL;
3658 	}
3659 #endif
3660 	DBG_COUNTER_INC(tx_seen);
3661 	err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE);
3662 
3663 	if (err) {
3664 		GROUPTASK_ENQUEUE(&txq->ift_task);
3665 		/* backpressure support forthcoming */
3666 #ifdef DRIVER_BACKPRESSURE
3667 		txq->ift_closed = TRUE;
3668 #endif
3669 		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
3670 		m_freem(m);
3671 	} else if (TXQ_AVAIL(txq) < (txq->ift_size >> 1)) {
3672 		GROUPTASK_ENQUEUE(&txq->ift_task);
3673 	}
3674 
3675 	return (err);
3676 }
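
/*
 * Note on the queue selection above: when multiple TX queue sets exist and
 * the mbuf carries a valid flow hash, QIDX() maps the flowid onto a queue
 * set, conceptually something like (a simplification - the actual macro may
 * differ):
 *
 *	qidx = m->m_pkthdr.flowid % NTXQSETS(ctx);
 *
 * so that all packets of a given flow land on the same ift_br ring and are
 * drained by the same grouptask.
 */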
3677 
3678 static void
3679 iflib_if_qflush(if_t ifp)
3680 {
3681 	if_ctx_t ctx = if_getsoftc(ifp);
3682 	iflib_txq_t txq = ctx->ifc_txqs;
3683 	int i;
3684 
3685 	CTX_LOCK(ctx);
3686 	ctx->ifc_flags |= IFC_QFLUSH;
3687 	CTX_UNLOCK(ctx);
3688 	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
3689 		while (!(ifmp_ring_is_idle(txq->ift_br) || ifmp_ring_is_stalled(txq->ift_br)))
3690 			iflib_txq_check_drain(txq, 0);
3691 	CTX_LOCK(ctx);
3692 	ctx->ifc_flags &= ~IFC_QFLUSH;
3693 	CTX_UNLOCK(ctx);
3694 
3695 	if_qflush(ifp);
3696 }
3697 
3698 
3699 #define IFCAP_FLAGS (IFCAP_TXCSUM_IPV6 | IFCAP_RXCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \
3700 		     IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_VLAN_HWTAGGING |	\
3701 		     IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | IFCAP_VLAN_HWTSO)
3702 
3703 static int
3704 iflib_if_ioctl(if_t ifp, u_long command, caddr_t data)
3705 {
3706 	if_ctx_t ctx = if_getsoftc(ifp);
3707 	struct ifreq	*ifr = (struct ifreq *)data;
3708 #if defined(INET) || defined(INET6)
3709 	struct ifaddr	*ifa = (struct ifaddr *)data;
3710 #endif
3711 	bool		avoid_reset = FALSE;
3712 	int		err = 0, reinit = 0, bits;
3713 
3714 	switch (command) {
3715 	case SIOCSIFADDR:
3716 #ifdef INET
3717 		if (ifa->ifa_addr->sa_family == AF_INET)
3718 			avoid_reset = TRUE;
3719 #endif
3720 #ifdef INET6
3721 		if (ifa->ifa_addr->sa_family == AF_INET6)
3722 			avoid_reset = TRUE;
3723 #endif
3724 		/*
3725 		** Calling init results in link renegotiation,
3726 		** so we avoid doing it when possible.
3727 		*/
3728 		if (avoid_reset) {
3729 			if_setflagbits(ifp, IFF_UP,0);
3730 			if (!(if_getdrvflags(ifp)& IFF_DRV_RUNNING))
3731 				reinit = 1;
3732 #ifdef INET
3733 			if (!(if_getflags(ifp) & IFF_NOARP))
3734 				arp_ifinit(ifp, ifa);
3735 #endif
3736 		} else
3737 			err = ether_ioctl(ifp, command, data);
3738 		break;
3739 	case SIOCSIFMTU:
3740 		CTX_LOCK(ctx);
3741 		if (ifr->ifr_mtu == if_getmtu(ifp)) {
3742 			CTX_UNLOCK(ctx);
3743 			break;
3744 		}
3745 		bits = if_getdrvflags(ifp);
3746 		/* stop the driver and free any clusters before proceeding */
3747 		iflib_stop(ctx);
3748 
3749 		if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) {
3750 			if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size)
3751 				ctx->ifc_flags |= IFC_MULTISEG;
3752 			else
3753 				ctx->ifc_flags &= ~IFC_MULTISEG;
3754 			err = if_setmtu(ifp, ifr->ifr_mtu);
3755 		}
3756 		iflib_init_locked(ctx);
3757 		if_setdrvflags(ifp, bits);
3758 		CTX_UNLOCK(ctx);
3759 		break;
3760 	case SIOCSIFFLAGS:
3761 		CTX_LOCK(ctx);
3762 		if (if_getflags(ifp) & IFF_UP) {
3763 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3764 				if ((if_getflags(ifp) ^ ctx->ifc_if_flags) &
3765 				    (IFF_PROMISC | IFF_ALLMULTI)) {
3766 					err = IFDI_PROMISC_SET(ctx, if_getflags(ifp));
3767 				}
3768 			} else
3769 				reinit = 1;
3770 		} else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3771 			iflib_stop(ctx);
3772 		}
3773 		ctx->ifc_if_flags = if_getflags(ifp);
3774 		CTX_UNLOCK(ctx);
3775 		break;
3776 	case SIOCADDMULTI:
3777 	case SIOCDELMULTI:
3778 		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3779 			CTX_LOCK(ctx);
3780 			IFDI_INTR_DISABLE(ctx);
3781 			IFDI_MULTI_SET(ctx);
3782 			IFDI_INTR_ENABLE(ctx);
3783 			CTX_UNLOCK(ctx);
3784 		}
3785 		break;
3786 	case SIOCSIFMEDIA:
3787 		CTX_LOCK(ctx);
3788 		IFDI_MEDIA_SET(ctx);
3789 		CTX_UNLOCK(ctx);
3790 		/* FALLTHROUGH */
3791 	case SIOCGIFMEDIA:
3792 		err = ifmedia_ioctl(ifp, ifr, &ctx->ifc_media, command);
3793 		break;
3794 	case SIOCGI2C:
3795 	{
3796 		struct ifi2creq i2c;
3797 
3798 		err = copyin(ifr->ifr_data, &i2c, sizeof(i2c));
3799 		if (err != 0)
3800 			break;
3801 		if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
3802 			err = EINVAL;
3803 			break;
3804 		}
3805 		if (i2c.len > sizeof(i2c.data)) {
3806 			err = EINVAL;
3807 			break;
3808 		}
3809 
3810 		if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0)
3811 			err = copyout(&i2c, ifr->ifr_data, sizeof(i2c));
3812 		break;
3813 	}
3814 	case SIOCSIFCAP:
3815 	{
3816 		int mask, setmask;
3817 
3818 		mask = ifr->ifr_reqcap ^ if_getcapenable(ifp);
3819 		setmask = 0;
3820 #ifdef TCP_OFFLOAD
3821 		setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6);
3822 #endif
3823 		setmask |= (mask & IFCAP_FLAGS);
3824 
3825 		if (setmask  & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6))
3826 			setmask |= (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6);
3827 		if ((mask & IFCAP_WOL) &&
3828 		    (if_getcapabilities(ifp) & IFCAP_WOL) != 0)
3829 			setmask |= (mask & (IFCAP_WOL_MCAST|IFCAP_WOL_MAGIC));
3830 		if_vlancap(ifp);
3831 		/*
3832 		 * We want to ensure that traffic has stopped before we change any of the flags.
3833 		 */
3834 		if (setmask) {
3835 			CTX_LOCK(ctx);
3836 			bits = if_getdrvflags(ifp);
3837 			if (bits & IFF_DRV_RUNNING)
3838 				iflib_stop(ctx);
3839 			if_togglecapenable(ifp, setmask);
3840 			if (bits & IFF_DRV_RUNNING)
3841 				iflib_init_locked(ctx);
3842 			if_setdrvflags(ifp, bits);
3843 			CTX_UNLOCK(ctx);
3844 		}
3845 		break;
3846 	    }
3847 	case SIOCGPRIVATE_0:
3848 	case SIOCSDRVSPEC:
3849 	case SIOCGDRVSPEC:
3850 		CTX_LOCK(ctx);
3851 		err = IFDI_PRIV_IOCTL(ctx, command, data);
3852 		CTX_UNLOCK(ctx);
3853 		break;
3854 	default:
3855 		err = ether_ioctl(ifp, command, data);
3856 		break;
3857 	}
3858 	if (reinit)
3859 		iflib_if_init(ctx);
3860 	return (err);
3861 }
3862 
3863 static uint64_t
3864 iflib_if_get_counter(if_t ifp, ift_counter cnt)
3865 {
3866 	if_ctx_t ctx = if_getsoftc(ifp);
3867 
3868 	return (IFDI_GET_COUNTER(ctx, cnt));
3869 }
3870 
3871 /*********************************************************************
3872  *
3873  *  OTHER FUNCTIONS EXPORTED TO THE STACK
3874  *
3875  **********************************************************************/
3876 
3877 static void
3878 iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag)
3879 {
3880 	if_ctx_t ctx = if_getsoftc(ifp);
3881 
3882 	if ((void *)ctx != arg)
3883 		return;
3884 
3885 	if ((vtag == 0) || (vtag > 4095))
3886 		return;
3887 
3888 	CTX_LOCK(ctx);
3889 	IFDI_VLAN_REGISTER(ctx, vtag);
3890 	/* Re-init to load the changes */
3891 	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER)
3892 		iflib_init_locked(ctx);
3893 	CTX_UNLOCK(ctx);
3894 }
3895 
3896 static void
3897 iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag)
3898 {
3899 	if_ctx_t ctx = if_getsoftc(ifp);
3900 
3901 	if ((void *)ctx != arg)
3902 		return;
3903 
3904 	if ((vtag == 0) || (vtag > 4095))
3905 		return;
3906 
3907 	CTX_LOCK(ctx);
3908 	IFDI_VLAN_UNREGISTER(ctx, vtag);
3909 	/* Re-init to load the changes */
3910 	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER)
3911 		iflib_init_locked(ctx);
3912 	CTX_UNLOCK(ctx);
3913 }
3914 
3915 static void
3916 iflib_led_func(void *arg, int onoff)
3917 {
3918 	if_ctx_t ctx = arg;
3919 
3920 	CTX_LOCK(ctx);
3921 	IFDI_LED_FUNC(ctx, onoff);
3922 	CTX_UNLOCK(ctx);
3923 }
3924 
3925 /*********************************************************************
3926  *
3927  *  BUS FUNCTION DEFINITIONS
3928  *
3929  **********************************************************************/
3930 
3931 int
3932 iflib_device_probe(device_t dev)
3933 {
3934 	pci_vendor_info_t *ent;
3935 
3936 	uint16_t	pci_vendor_id, pci_device_id;
3937 	uint16_t	pci_subvendor_id, pci_subdevice_id;
3938 	uint16_t	pci_rev_id;
3939 	if_shared_ctx_t sctx;
3940 
3941 	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
3942 		return (ENOTSUP);
3943 
3944 	pci_vendor_id = pci_get_vendor(dev);
3945 	pci_device_id = pci_get_device(dev);
3946 	pci_subvendor_id = pci_get_subvendor(dev);
3947 	pci_subdevice_id = pci_get_subdevice(dev);
3948 	pci_rev_id = pci_get_revid(dev);
3949 	if (sctx->isc_parse_devinfo != NULL)
3950 		sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id);
3951 
3952 	ent = sctx->isc_vendor_info;
3953 	while (ent->pvi_vendor_id != 0) {
3954 		if (pci_vendor_id != ent->pvi_vendor_id) {
3955 			ent++;
3956 			continue;
3957 		}
3958 		if ((pci_device_id == ent->pvi_device_id) &&
3959 		    ((pci_subvendor_id == ent->pvi_subvendor_id) ||
3960 		     (ent->pvi_subvendor_id == 0)) &&
3961 		    ((pci_subdevice_id == ent->pvi_subdevice_id) ||
3962 		     (ent->pvi_subdevice_id == 0)) &&
3963 		    ((pci_rev_id == ent->pvi_rev_id) ||
3964 		     (ent->pvi_rev_id == 0))) {
3965 
3966 			device_set_desc_copy(dev, ent->pvi_name);
3967 			/* This needs to be changed to zero if the bus probing code
3968 			 * ever stops re-probing on best match, because the sctx
3969 			 * may have its values overwritten by register calls
3970 			 * in subsequent probes.
3971 			 */
3972 			return (BUS_PROBE_DEFAULT);
3973 		}
3974 		ent++;
3975 	}
3976 	return (ENXIO);
3977 }
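
/*
 * The vendor table walked above is supplied by the driver through its shared
 * context.  A minimal, purely illustrative table (the device ID and name are
 * made up) built with the PVID/PVID_END convenience macros from iflib.h might
 * look like:
 *
 *	static pci_vendor_info_t mydrv_vendor_info_array[] = {
 *		PVID(0x8086, 0x10A7, "Example Ethernet Adapter"),
 *		PVID_END
 *	};
 *
 * An entry with a zero pvi_vendor_id terminates the walk; zero sub-IDs or a
 * zero revision act as wildcards in the match above.
 */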
3978 
3979 int
3980 iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp)
3981 {
3982 	int err, rid, msix, msix_bar;
3983 	if_ctx_t ctx;
3984 	if_t ifp;
3985 	if_softc_ctx_t scctx;
3986 	int i;
3987 	uint16_t main_txq;
3988 	uint16_t main_rxq;
3989 
3990 
3991 	ctx = malloc(sizeof(* ctx), M_IFLIB, M_WAITOK|M_ZERO);
3992 
3993 	if (sc == NULL) {
3994 		sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
3995 		device_set_softc(dev, ctx);
3996 		ctx->ifc_flags |= IFC_SC_ALLOCATED;
3997 	}
3998 
3999 	ctx->ifc_sctx = sctx;
4000 	ctx->ifc_dev = dev;
4001 	ctx->ifc_softc = sc;
4002 
4003 	if ((err = iflib_register(ctx)) != 0) {
4004 		device_printf(dev, "iflib_register failed %d\n", err);
4005 		return (err);
4006 	}
4007 	iflib_add_device_sysctl_pre(ctx);
4008 
4009 	scctx = &ctx->ifc_softc_ctx;
4010 	ifp = ctx->ifc_ifp;
4011 
4012 	/*
4013 	 * XXX sanity check that ntxd & nrxd are a power of 2
4014 	 */
4015 	if (ctx->ifc_sysctl_ntxqs != 0)
4016 		scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs;
4017 	if (ctx->ifc_sysctl_nrxqs != 0)
4018 		scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs;
4019 
4020 	for (i = 0; i < sctx->isc_ntxqs; i++) {
4021 		if (ctx->ifc_sysctl_ntxds[i] != 0)
4022 			scctx->isc_ntxd[i] = ctx->ifc_sysctl_ntxds[i];
4023 		else
4024 			scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
4025 	}
4026 
4027 	for (i = 0; i < sctx->isc_nrxqs; i++) {
4028 		if (ctx->ifc_sysctl_nrxds[i] != 0)
4029 			scctx->isc_nrxd[i] = ctx->ifc_sysctl_nrxds[i];
4030 		else
4031 			scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
4032 	}
4033 
4034 	for (i = 0; i < sctx->isc_nrxqs; i++) {
4035 		if (scctx->isc_nrxd[i] < sctx->isc_nrxd_min[i]) {
4036 			device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n",
4037 				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_min[i]);
4038 			scctx->isc_nrxd[i] = sctx->isc_nrxd_min[i];
4039 		}
4040 		if (scctx->isc_nrxd[i] > sctx->isc_nrxd_max[i]) {
4041 			device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n",
4042 				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_max[i]);
4043 			scctx->isc_nrxd[i] = sctx->isc_nrxd_max[i];
4044 		}
4045 	}
4046 
4047 	for (i = 0; i < sctx->isc_ntxqs; i++) {
4048 		if (scctx->isc_ntxd[i] < sctx->isc_ntxd_min[i]) {
4049 			device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n",
4050 				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_min[i]);
4051 			scctx->isc_ntxd[i] = sctx->isc_ntxd_min[i];
4052 		}
4053 		if (scctx->isc_ntxd[i] > sctx->isc_ntxd_max[i]) {
4054 			device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n",
4055 				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_max[i]);
4056 			scctx->isc_ntxd[i] = sctx->isc_ntxd_max[i];
4057 		}
4058 	}
4059 
4060 	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
4061 		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
4062 		return (err);
4063 	}
4064 	_iflib_pre_assert(scctx);
4065 	ctx->ifc_txrx = *scctx->isc_txrx;
4066 
4067 #ifdef INVARIANTS
4068 	MPASS(scctx->isc_capenable);
4069 	if (scctx->isc_capenable & IFCAP_TXCSUM)
4070 		MPASS(scctx->isc_tx_csum_flags);
4071 #endif
4072 
4073 	if_setcapabilities(ifp, scctx->isc_capenable);
4074 	if_setcapenable(ifp, scctx->isc_capenable);
4075 
4076 	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
4077 		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
4078 	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
4079 		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
4080 
4081 #ifdef ACPI_DMAR
4082 	if (dmar_get_dma_tag(device_get_parent(dev), dev) != NULL)
4083 		ctx->ifc_flags |= IFC_DMAR;
4084 #elif !(defined(__i386__) || defined(__amd64__))
4085 	/* set unconditionally for !x86 */
4086 	ctx->ifc_flags |= IFC_DMAR;
4087 #endif
4088 
4089 	msix_bar = scctx->isc_msix_bar;
4090 	main_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0;
4091 	main_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0;
4092 
4093 	/* XXX change for per-queue sizes */
4094 	device_printf(dev, "using %d tx descriptors and %d rx descriptors\n",
4095 		      scctx->isc_ntxd[main_txq], scctx->isc_nrxd[main_rxq]);
4096 	for (i = 0; i < sctx->isc_nrxqs; i++) {
4097 		if (!powerof2(scctx->isc_nrxd[i])) {
4098 			/* round down instead? */
4099 			device_printf(dev, "# rx descriptors must be a power of 2\n");
4100 			err = EINVAL;
4101 			goto fail;
4102 		}
4103 	}
4104 	for (i = 0; i < sctx->isc_ntxqs; i++) {
4105 		if (!powerof2(scctx->isc_ntxd[i])) {
4106 			device_printf(dev,
4107 			    "# tx descriptors must be a power of 2");
4108 			err = EINVAL;
4109 			goto fail;
4110 		}
4111 	}
4112 
4113 	if (scctx->isc_tx_nsegments > scctx->isc_ntxd[main_txq] /
4114 	    MAX_SINGLE_PACKET_FRACTION)
4115 		scctx->isc_tx_nsegments = max(1, scctx->isc_ntxd[main_txq] /
4116 		    MAX_SINGLE_PACKET_FRACTION);
4117 	if (scctx->isc_tx_tso_segments_max > scctx->isc_ntxd[main_txq] /
4118 	    MAX_SINGLE_PACKET_FRACTION)
4119 		scctx->isc_tx_tso_segments_max = max(1,
4120 		    scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION);
4121 
4122 	/*
4123 	 * Protect the stack from hardware that advertises a larger TSO size than the stack can handle.
4124 	 */
4125 	if (scctx->isc_tx_tso_size_max > FREEBSD_TSO_SIZE_MAX)
4126 		scctx->isc_tx_tso_size_max = FREEBSD_TSO_SIZE_MAX;
4127 
4128 	/* TSO parameters - these simply correspond to the DMA tag setup; dig the limits out of the data sheet */
4129 	ifp->if_hw_tsomaxsegcount = scctx->isc_tx_tso_segments_max;
4130 	ifp->if_hw_tsomax = scctx->isc_tx_tso_size_max;
4131 	ifp->if_hw_tsomaxsegsize = scctx->isc_tx_tso_segsize_max;
4132 	if (scctx->isc_rss_table_size == 0)
4133 		scctx->isc_rss_table_size = 64;
4134 	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
4135 
4136 	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
4137 	/* XXX format name */
4138 	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx, -1, "admin");
4139 	/*
4140 	** Now setup MSI or MSI/X, should
4141 	** return us the number of supported
4142 	** vectors. (Will be 1 for MSI)
4143 	*/
4144 	if (sctx->isc_flags & IFLIB_SKIP_MSIX) {
4145 		msix = scctx->isc_vectors;
4146 	} else if (scctx->isc_msix_bar != 0)
4147 	       /*
4148 		* The simple fact that isc_msix_bar is not 0 does not mean
4149 		* we have a good value there that is known to work.
4150 		*/
4151 		msix = iflib_msix_init(ctx);
4152 	else {
4153 		scctx->isc_vectors = 1;
4154 		scctx->isc_ntxqsets = 1;
4155 		scctx->isc_nrxqsets = 1;
4156 		scctx->isc_intr = IFLIB_INTR_LEGACY;
4157 		msix = 0;
4158 	}
4159 	/* Get memory for the station queues */
4160 	if ((err = iflib_queues_alloc(ctx))) {
4161 		device_printf(dev, "Unable to allocate queue memory\n");
4162 		goto fail;
4163 	}
4164 
4165 	if ((err = iflib_qset_structures_setup(ctx))) {
4166 		device_printf(dev, "qset structure setup failed %d\n", err);
4167 		goto fail_queues;
4168 	}
4169 
4170 	/*
4171 	 * Group taskqueues aren't properly set up until SMP is started,
4172 	 * so we disable interrupts until we can handle them post
4173 	 * SI_SUB_SMP.
4174 	 *
4175 	 * XXX: disabling interrupts doesn't actually work, at least for
4176 	 * the non-MSI case.  When they occur before SI_SUB_SMP completes,
4177 	 * we do null handling and depend on this not causing too large an
4178 	 * interrupt storm.
4179 	 */
4180 	IFDI_INTR_DISABLE(ctx);
4181 	if (msix > 1 && (err = IFDI_MSIX_INTR_ASSIGN(ctx, msix)) != 0) {
4182 		device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n", err);
4183 		goto fail_intr_free;
4184 	}
4185 	if (msix <= 1) {
4186 		rid = 0;
4187 		if (scctx->isc_intr == IFLIB_INTR_MSI) {
4188 			MPASS(msix == 1);
4189 			rid = 1;
4190 		}
4191 		if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) {
4192 			device_printf(dev, "iflib_legacy_setup failed %d\n", err);
4193 			goto fail_intr_free;
4194 		}
4195 	}
4196 	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac);
4197 	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
4198 		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
4199 		goto fail_detach;
4200 	}
4201 	if ((err = iflib_netmap_attach(ctx))) {
4202 		device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err);
4203 		goto fail_detach;
4204 	}
4205 	*ctxp = ctx;
4206 
4207 	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
4208 	iflib_add_device_sysctl_post(ctx);
4209 	ctx->ifc_flags |= IFC_INIT_DONE;
4210 	return (0);
4211 fail_detach:
4212 	ether_ifdetach(ctx->ifc_ifp);
4213 fail_intr_free:
4214 	if (scctx->isc_intr == IFLIB_INTR_MSIX || scctx->isc_intr == IFLIB_INTR_MSI)
4215 		pci_release_msi(ctx->ifc_dev);
4216 fail_queues:
4217 	/* XXX free queues */
4218 fail:
4219 	IFDI_DETACH(ctx);
4220 	return (err);
4221 }
4222 
4223 int
4224 iflib_device_attach(device_t dev)
4225 {
4226 	if_ctx_t ctx;
4227 	if_shared_ctx_t sctx;
4228 
4229 	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
4230 		return (ENOTSUP);
4231 
4232 	pci_enable_busmaster(dev);
4233 
4234 	return (iflib_device_register(dev, NULL, sctx, &ctx));
4235 }
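
/*
 * A hedged sketch of how a PCI driver typically plugs into the entry points
 * above: its device_method_t table forwards the standard bus methods to
 * iflib, and only device_register points back at the driver.  The "mydrv"
 * names are hypothetical.
 *
 *	static device_method_t mydrv_methods[] = {
 *		DEVMETHOD(device_register, mydrv_register),
 *		DEVMETHOD(device_probe,    iflib_device_probe),
 *		DEVMETHOD(device_attach,   iflib_device_attach),
 *		DEVMETHOD(device_detach,   iflib_device_detach),
 *		DEVMETHOD(device_shutdown, iflib_device_shutdown),
 *		DEVMETHOD(device_suspend,  iflib_device_suspend),
 *		DEVMETHOD(device_resume,   iflib_device_resume),
 *		DEVMETHOD_END
 *	};
 *
 * DEVICE_REGISTER() in iflib_device_probe()/iflib_device_attach() is what
 * calls back into mydrv_register() to obtain the if_shared_ctx_t.
 */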
4236 
4237 int
4238 iflib_device_deregister(if_ctx_t ctx)
4239 {
4240 	if_t ifp = ctx->ifc_ifp;
4241 	iflib_txq_t txq;
4242 	iflib_rxq_t rxq;
4243 	device_t dev = ctx->ifc_dev;
4244 	int i;
4245 	struct taskqgroup *tqg;
4246 
4247 	/* Make sure VLANS are not using driver */
4248 	if (if_vlantrunkinuse(ifp)) {
4249 		device_printf(dev,"Vlan in use, detach first\n");
4250 		return (EBUSY);
4251 	}
4252 
4253 	CTX_LOCK(ctx);
4254 	ctx->ifc_in_detach = 1;
4255 	iflib_stop(ctx);
4256 	CTX_UNLOCK(ctx);
4257 
4258 	/* Unregister VLAN events */
4259 	if (ctx->ifc_vlan_attach_event != NULL)
4260 		EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event);
4261 	if (ctx->ifc_vlan_detach_event != NULL)
4262 		EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event);
4263 
4264 	iflib_netmap_detach(ifp);
4265 	ether_ifdetach(ifp);
4266 	/* ether_ifdetach calls if_qflush - the lock must be destroyed afterwards */
4267 	CTX_LOCK_DESTROY(ctx);
4268 	if (ctx->ifc_led_dev != NULL)
4269 		led_destroy(ctx->ifc_led_dev);
4270 	/* XXX drain any dependent tasks */
4271 	tqg = qgroup_if_io_tqg;
4272 	for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
4273 		callout_drain(&txq->ift_timer);
4274 		if (txq->ift_task.gt_uniq != NULL)
4275 			taskqgroup_detach(tqg, &txq->ift_task);
4276 	}
4277 	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
4278 		if (rxq->ifr_task.gt_uniq != NULL)
4279 			taskqgroup_detach(tqg, &rxq->ifr_task);
4280 	}
4281 	tqg = qgroup_if_config_tqg;
4282 	if (ctx->ifc_admin_task.gt_uniq != NULL)
4283 		taskqgroup_detach(tqg, &ctx->ifc_admin_task);
4284 	if (ctx->ifc_vflr_task.gt_uniq != NULL)
4285 		taskqgroup_detach(tqg, &ctx->ifc_vflr_task);
4286 
4287 	IFDI_DETACH(ctx);
4288 	device_set_softc(ctx->ifc_dev, NULL);
4289 	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_LEGACY) {
4290 		pci_release_msi(dev);
4291 	}
4292 	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_MSIX) {
4293 		iflib_irq_free(ctx, &ctx->ifc_legacy_irq);
4294 	}
4295 	if (ctx->ifc_msix_mem != NULL) {
4296 		bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY,
4297 			ctx->ifc_softc_ctx.isc_msix_bar, ctx->ifc_msix_mem);
4298 		ctx->ifc_msix_mem = NULL;
4299 	}
4300 
4301 	bus_generic_detach(dev);
4302 	if_free(ifp);
4303 
4304 	iflib_tx_structures_free(ctx);
4305 	iflib_rx_structures_free(ctx);
4306 	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
4307 		free(ctx->ifc_softc, M_IFLIB);
4308 	free(ctx, M_IFLIB);
4309 	return (0);
4310 }
4311 
4312 
4313 int
4314 iflib_device_detach(device_t dev)
4315 {
4316 	if_ctx_t ctx = device_get_softc(dev);
4317 
4318 	return (iflib_device_deregister(ctx));
4319 }
4320 
4321 int
4322 iflib_device_suspend(device_t dev)
4323 {
4324 	if_ctx_t ctx = device_get_softc(dev);
4325 
4326 	CTX_LOCK(ctx);
4327 	IFDI_SUSPEND(ctx);
4328 	CTX_UNLOCK(ctx);
4329 
4330 	return bus_generic_suspend(dev);
4331 }
4332 int
4333 iflib_device_shutdown(device_t dev)
4334 {
4335 	if_ctx_t ctx = device_get_softc(dev);
4336 
4337 	CTX_LOCK(ctx);
4338 	IFDI_SHUTDOWN(ctx);
4339 	CTX_UNLOCK(ctx);
4340 
4341 	return bus_generic_suspend(dev);
4342 }
4343 
4344 
4345 int
4346 iflib_device_resume(device_t dev)
4347 {
4348 	if_ctx_t ctx = device_get_softc(dev);
4349 	iflib_txq_t txq = ctx->ifc_txqs;
4350 
4351 	CTX_LOCK(ctx);
4352 	IFDI_RESUME(ctx);
4353 	iflib_init_locked(ctx);
4354 	CTX_UNLOCK(ctx);
4355 	for (int i = 0; i < NTXQSETS(ctx); i++, txq++)
4356 		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
4357 
4358 	return (bus_generic_resume(dev));
4359 }
4360 
4361 int
4362 iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params)
4363 {
4364 	int error;
4365 	if_ctx_t ctx = device_get_softc(dev);
4366 
4367 	CTX_LOCK(ctx);
4368 	error = IFDI_IOV_INIT(ctx, num_vfs, params);
4369 	CTX_UNLOCK(ctx);
4370 
4371 	return (error);
4372 }
4373 
4374 void
4375 iflib_device_iov_uninit(device_t dev)
4376 {
4377 	if_ctx_t ctx = device_get_softc(dev);
4378 
4379 	CTX_LOCK(ctx);
4380 	IFDI_IOV_UNINIT(ctx);
4381 	CTX_UNLOCK(ctx);
4382 }
4383 
4384 int
4385 iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params)
4386 {
4387 	int error;
4388 	if_ctx_t ctx = device_get_softc(dev);
4389 
4390 	CTX_LOCK(ctx);
4391 	error = IFDI_IOV_VF_ADD(ctx, vfnum, params);
4392 	CTX_UNLOCK(ctx);
4393 
4394 	return (error);
4395 }
4396 
4397 /*********************************************************************
4398  *
4399  *  MODULE FUNCTION DEFINITIONS
4400  *
4401  **********************************************************************/
4402 
4403 /*
4404  * - Start a fast taskqueue thread for each core
4405  * - Start a taskqueue for control operations
4406  */
4407 static int
4408 iflib_module_init(void)
4409 {
4410 	return (0);
4411 }
4412 
4413 static int
4414 iflib_module_event_handler(module_t mod, int what, void *arg)
4415 {
4416 	int err;
4417 
4418 	switch (what) {
4419 	case MOD_LOAD:
4420 		if ((err = iflib_module_init()) != 0)
4421 			return (err);
4422 		break;
4423 	case MOD_UNLOAD:
4424 		return (EBUSY);
4425 	default:
4426 		return (EOPNOTSUPP);
4427 	}
4428 
4429 	return (0);
4430 }
4431 
4432 /*********************************************************************
4433  *
4434  *  PUBLIC FUNCTION DEFINITIONS
4435  *     ordered as in iflib.h
4436  *
4437  **********************************************************************/
4438 
4439 
4440 static void
4441 _iflib_assert(if_shared_ctx_t sctx)
4442 {
4443 	MPASS(sctx->isc_tx_maxsize);
4444 	MPASS(sctx->isc_tx_maxsegsize);
4445 
4446 	MPASS(sctx->isc_rx_maxsize);
4447 	MPASS(sctx->isc_rx_nsegments);
4448 	MPASS(sctx->isc_rx_maxsegsize);
4449 
4450 	MPASS(sctx->isc_nrxd_min[0]);
4451 	MPASS(sctx->isc_nrxd_max[0]);
4452 	MPASS(sctx->isc_nrxd_default[0]);
4453 	MPASS(sctx->isc_ntxd_min[0]);
4454 	MPASS(sctx->isc_ntxd_max[0]);
4455 	MPASS(sctx->isc_ntxd_default[0]);
4456 }
4457 
4458 static void
4459 _iflib_pre_assert(if_softc_ctx_t scctx)
4460 {
4461 
4462 	MPASS(scctx->isc_txrx->ift_txd_encap);
4463 	MPASS(scctx->isc_txrx->ift_txd_flush);
4464 	MPASS(scctx->isc_txrx->ift_txd_credits_update);
4465 	MPASS(scctx->isc_txrx->ift_rxd_available);
4466 	MPASS(scctx->isc_txrx->ift_rxd_pkt_get);
4467 	MPASS(scctx->isc_txrx->ift_rxd_refill);
4468 	MPASS(scctx->isc_txrx->ift_rxd_flush);
4469 }
4470 
4471 static int
4472 iflib_register(if_ctx_t ctx)
4473 {
4474 	if_shared_ctx_t sctx = ctx->ifc_sctx;
4475 	driver_t *driver = sctx->isc_driver;
4476 	device_t dev = ctx->ifc_dev;
4477 	if_t ifp;
4478 
4479 	_iflib_assert(sctx);
4480 
4481 	CTX_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev));
4482 
4483 	ifp = ctx->ifc_ifp = if_gethandle(IFT_ETHER);
4484 	if (ifp == NULL) {
4485 		device_printf(dev, "can not allocate ifnet structure\n");
4486 		return (ENOMEM);
4487 	}
4488 
4489 	/*
4490 	 * Initialize our context's device specific methods
4491 	 */
4492 	kobj_init((kobj_t) ctx, (kobj_class_t) driver);
4493 	kobj_class_compile((kobj_class_t) driver);
4494 	driver->refs++;
4495 
4496 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4497 	if_setsoftc(ifp, ctx);
4498 	if_setdev(ifp, dev);
4499 	if_setinitfn(ifp, iflib_if_init);
4500 	if_setioctlfn(ifp, iflib_if_ioctl);
4501 	if_settransmitfn(ifp, iflib_if_transmit);
4502 	if_setqflushfn(ifp, iflib_if_qflush);
4503 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
4504 
4505 	ctx->ifc_vlan_attach_event =
4506 		EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx,
4507 							  EVENTHANDLER_PRI_FIRST);
4508 	ctx->ifc_vlan_detach_event =
4509 		EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx,
4510 							  EVENTHANDLER_PRI_FIRST);
4511 
4512 	ifmedia_init(&ctx->ifc_media, IFM_IMASK,
4513 					 iflib_media_change, iflib_media_status);
4514 
4515 	return (0);
4516 }
4517 
4518 
4519 static int
4520 iflib_queues_alloc(if_ctx_t ctx)
4521 {
4522 	if_shared_ctx_t sctx = ctx->ifc_sctx;
4523 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
4524 	device_t dev = ctx->ifc_dev;
4525 	int nrxqsets = scctx->isc_nrxqsets;
4526 	int ntxqsets = scctx->isc_ntxqsets;
4527 	iflib_txq_t txq;
4528 	iflib_rxq_t rxq;
4529 	iflib_fl_t fl = NULL;
4530 	int i, j, cpu, err, txconf, rxconf;
4531 	iflib_dma_info_t ifdip;
4532 	uint32_t *rxqsizes = scctx->isc_rxqsizes;
4533 	uint32_t *txqsizes = scctx->isc_txqsizes;
4534 	uint8_t nrxqs = sctx->isc_nrxqs;
4535 	uint8_t ntxqs = sctx->isc_ntxqs;
4536 	int nfree_lists = sctx->isc_nfl ? sctx->isc_nfl : 1;
4537 	caddr_t *vaddrs;
4538 	uint64_t *paddrs;
4539 	struct ifmp_ring **brscp;
4540 
4541 	KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1"));
4542 	KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1"));
4543 
4544 	brscp = NULL;
4545 	txq = NULL;
4546 	rxq = NULL;
4547 
4548 	/* Allocate the TX ring struct memory */
4549 	if (!(txq =
4550 	    (iflib_txq_t) malloc(sizeof(struct iflib_txq) *
4551 	    ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
4552 		device_printf(dev, "Unable to allocate TX ring memory\n");
4553 		err = ENOMEM;
4554 		goto fail;
4555 	}
4556 
4557 	/* Now allocate the RX */
4558 	if (!(rxq =
4559 	    (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) *
4560 	    nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
4561 		device_printf(dev, "Unable to allocate RX ring memory\n");
4562 		err = ENOMEM;
4563 		goto rx_fail;
4564 	}
4565 
4566 	ctx->ifc_txqs = txq;
4567 	ctx->ifc_rxqs = rxq;
4568 
4569 	/*
4570 	 * XXX handle allocation failure
4571 	 */
4572 	for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) {
4573 		/* Set up some basics */
4574 
4575 		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs, M_IFLIB, M_WAITOK|M_ZERO)) == NULL) {
4576 			device_printf(dev, "failed to allocate iflib_dma_info\n");
4577 			err = ENOMEM;
4578 			goto err_tx_desc;
4579 		}
4580 		txq->ift_ifdi = ifdip;
4581 		for (j = 0; j < ntxqs; j++, ifdip++) {
4582 			if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, BUS_DMA_NOWAIT)) {
4583 				device_printf(dev, "Unable to allocate Descriptor memory\n");
4584 				err = ENOMEM;
4585 				goto err_tx_desc;
4586 			}
4587 			txq->ift_txd_size[j] = scctx->isc_txd_size[j];
4588 			bzero((void *)ifdip->idi_vaddr, txqsizes[j]);
4589 		}
4590 		txq->ift_ctx = ctx;
4591 		txq->ift_id = i;
4592 		if (sctx->isc_flags & IFLIB_HAS_TXCQ) {
4593 			txq->ift_br_offset = 1;
4594 		} else {
4595 			txq->ift_br_offset = 0;
4596 		}
4597 		/* XXX fix this */
4598 		txq->ift_timer.c_cpu = cpu;
4599 
4600 		if (iflib_txsd_alloc(txq)) {
4601 			device_printf(dev, "Critical Failure setting up TX buffers\n");
4602 			err = ENOMEM;
4603 			goto err_tx_desc;
4604 		}
4605 
4606 		/* Initialize the TX lock */
4607 		snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:tx(%d):callout",
4608 		    device_get_nameunit(dev), txq->ift_id);
4609 		mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF);
4610 		callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0);
4611 
4612 		snprintf(txq->ift_db_mtx_name, MTX_NAME_LEN, "%s:tx(%d):db",
4613 			 device_get_nameunit(dev), txq->ift_id);
4614 
4615 		err = ifmp_ring_alloc(&txq->ift_br, 2048, txq, iflib_txq_drain,
4616 				      iflib_txq_can_drain, M_IFLIB, M_WAITOK);
4617 		if (err) {
4618 			/* XXX free any allocated rings */
4619 			device_printf(dev, "Unable to allocate buf_ring\n");
4620 			goto err_tx_desc;
4621 		}
4622 	}
4623 
4624 	for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) {
4625 		/* Set up some basics */
4626 
4627 		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs, M_IFLIB, M_WAITOK|M_ZERO)) == NULL) {
4628 			device_printf(dev, "failed to allocate iflib_dma_info\n");
4629 			err = ENOMEM;
4630 			goto err_tx_desc;
4631 		}
4632 
4633 		rxq->ifr_ifdi = ifdip;
4634 		/* XXX this needs to be changed if #rx queues != #tx queues */
4635 		rxq->ifr_ntxqirq = 1;
4636 		rxq->ifr_txqid[0] = i;
4637 		for (j = 0; j < nrxqs; j++, ifdip++) {
4638 			if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, BUS_DMA_NOWAIT)) {
4639 				device_printf(dev, "Unable to allocate Descriptor memory\n");
4640 				err = ENOMEM;
4641 				goto err_tx_desc;
4642 			}
4643 			bzero((void *)ifdip->idi_vaddr, rxqsizes[j]);
4644 		}
4645 		rxq->ifr_ctx = ctx;
4646 		rxq->ifr_id = i;
4647 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
4648 			rxq->ifr_fl_offset = 1;
4649 		} else {
4650 			rxq->ifr_fl_offset = 0;
4651 		}
4652 		rxq->ifr_nfl = nfree_lists;
4653 		if (!(fl =
4654 			  (iflib_fl_t) malloc(sizeof(struct iflib_fl) * nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) {
4655 			device_printf(dev, "Unable to allocate free list memory\n");
4656 			err = ENOMEM;
4657 			goto err_tx_desc;
4658 		}
4659 		rxq->ifr_fl = fl;
4660 		for (j = 0; j < nfree_lists; j++) {
4661 			fl[j].ifl_rxq = rxq;
4662 			fl[j].ifl_id = j;
4663 			fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
4664 			fl[j].ifl_rxd_size = scctx->isc_rxd_size[j];
4665 		}
4666 		/* Allocate receive buffers for the ring */
4667 		if (iflib_rxsd_alloc(rxq)) {
4668 			device_printf(dev,
4669 			    "Critical Failure setting up receive buffers\n");
4670 			err = ENOMEM;
4671 			goto err_rx_desc;
4672 		}
4673 	}
4674 
4675 	/* TXQs */
4676 	vaddrs = malloc(sizeof(caddr_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
4677 	paddrs = malloc(sizeof(uint64_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
4678 	for (i = 0; i < ntxqsets; i++) {
4679 		iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi;
4680 
4681 		for (j = 0; j < ntxqs; j++, di++) {
4682 			vaddrs[i*ntxqs + j] = di->idi_vaddr;
4683 			paddrs[i*ntxqs + j] = di->idi_paddr;
4684 		}
4685 	}
4686 	if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) {
4687 		device_printf(ctx->ifc_dev, "device queue allocation failed\n");
4688 		iflib_tx_structures_free(ctx);
4689 		free(vaddrs, M_IFLIB);
4690 		free(paddrs, M_IFLIB);
4691 		goto err_rx_desc;
4692 	}
4693 	free(vaddrs, M_IFLIB);
4694 	free(paddrs, M_IFLIB);
4695 
4696 	/* RXQs */
4697 	vaddrs = malloc(sizeof(caddr_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
4698 	paddrs = malloc(sizeof(uint64_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
4699 	for (i = 0; i < nrxqsets; i++) {
4700 		iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi;
4701 
4702 		for (j = 0; j < nrxqs; j++, di++) {
4703 			vaddrs[i*nrxqs + j] = di->idi_vaddr;
4704 			paddrs[i*nrxqs + j] = di->idi_paddr;
4705 		}
4706 	}
4707 	if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) {
4708 		device_printf(ctx->ifc_dev, "device queue allocation failed\n");
4709 		iflib_tx_structures_free(ctx);
4710 		free(vaddrs, M_IFLIB);
4711 		free(paddrs, M_IFLIB);
4712 		goto err_rx_desc;
4713 	}
4714 	free(vaddrs, M_IFLIB);
4715 	free(paddrs, M_IFLIB);
4716 
4717 	return (0);
4718 
4719 /* XXX handle allocation failure changes */
4720 err_rx_desc:
4721 err_tx_desc:
4722 	if (ctx->ifc_rxqs != NULL)
4723 		free(ctx->ifc_rxqs, M_IFLIB);
4724 	ctx->ifc_rxqs = NULL;
4725 	if (ctx->ifc_txqs != NULL)
4726 		free(ctx->ifc_txqs, M_IFLIB);
4727 	ctx->ifc_txqs = NULL;
4728 rx_fail:
4729 	if (brscp != NULL)
4730 		free(brscp, M_IFLIB);
4731 	if (rxq != NULL)
4732 		free(rxq, M_IFLIB);
4733 	if (txq != NULL)
4734 		free(txq, M_IFLIB);
4735 fail:
4736 	return (err);
4737 }
4738 
4739 static int
4740 iflib_tx_structures_setup(if_ctx_t ctx)
4741 {
4742 	iflib_txq_t txq = ctx->ifc_txqs;
4743 	int i;
4744 
4745 	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
4746 		iflib_txq_setup(txq);
4747 
4748 	return (0);
4749 }
4750 
4751 static void
4752 iflib_tx_structures_free(if_ctx_t ctx)
4753 {
4754 	iflib_txq_t txq = ctx->ifc_txqs;
4755 	int i, j;
4756 
4757 	for (i = 0; i < NTXQSETS(ctx); i++, txq++) {
4758 		iflib_txq_destroy(txq);
4759 		for (j = 0; j < ctx->ifc_nhwtxqs; j++)
4760 			iflib_dma_free(&txq->ift_ifdi[j]);
4761 	}
4762 	free(ctx->ifc_txqs, M_IFLIB);
4763 	ctx->ifc_txqs = NULL;
4764 	IFDI_QUEUES_FREE(ctx);
4765 }
4766 
4767 /*********************************************************************
4768  *
4769  *  Initialize all receive rings.
4770  *
4771  **********************************************************************/
4772 static int
4773 iflib_rx_structures_setup(if_ctx_t ctx)
4774 {
4775 	iflib_rxq_t rxq = ctx->ifc_rxqs;
4776 	int q;
4777 #if defined(INET6) || defined(INET)
4778 	int i, err;
4779 #endif
4780 
4781 	for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++, rxq++) {
4782 #if defined(INET6) || defined(INET)
4783 		tcp_lro_free(&rxq->ifr_lc);
4784 		if ((err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp,
4785 		    TCP_LRO_ENTRIES, min(1024,
4786 		    ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]))) != 0) {
4787 			device_printf(ctx->ifc_dev, "LRO Initialization failed!\n");
4788 			goto fail;
4789 		}
4790 		rxq->ifr_lro_enabled = TRUE;
4791 #endif
4792 		IFDI_RXQ_SETUP(ctx, rxq->ifr_id);
4793 	}
4794 	return (0);
4795 #if defined(INET6) || defined(INET)
4796 fail:
4797 	/*
4798 	 * Free the RX software descriptors allocated so far.  We only handle
4799 	 * the rings that completed; the failing ring will have cleaned up
4800 	 * after itself.  'q' failed, so it is the terminus.
4801 	 */
4802 	rxq = ctx->ifc_rxqs;
4803 	for (i = 0; i < q; ++i, rxq++) {
4804 		iflib_rx_sds_free(rxq);
4805 		rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0;
4806 	}
4807 	return (err);
4808 #endif
4809 }
4810 
4811 /*********************************************************************
4812  *
4813  *  Free all receive rings.
4814  *
4815  **********************************************************************/
4816 static void
4817 iflib_rx_structures_free(if_ctx_t ctx)
4818 {
4819 	iflib_rxq_t rxq = ctx->ifc_rxqs;
4820 
4821 	for (int i = 0; i < ctx->ifc_softc_ctx.isc_nrxqsets; i++, rxq++) {
4822 		iflib_rx_sds_free(rxq);
4823 	}
4824 }
4825 
4826 static int
4827 iflib_qset_structures_setup(if_ctx_t ctx)
4828 {
4829 	int err;
4830 
4831 	if ((err = iflib_tx_structures_setup(ctx)) != 0)
4832 		return (err);
4833 
4834 	if ((err = iflib_rx_structures_setup(ctx)) != 0) {
4835 		device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err);
4836 		iflib_tx_structures_free(ctx);
4837 		iflib_rx_structures_free(ctx);
4838 	}
4839 	return (err);
4840 }
4841 
4842 int
4843 iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
4844 				driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, char *name)
4845 {
4846 
4847 	return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
4848 }
4849 
4850 static int
4851 find_nth(if_ctx_t ctx, cpuset_t *cpus, int qid)
4852 {
4853 	int i, cpuid, eqid, count;
4854 
4855 	CPU_COPY(&ctx->ifc_cpus, cpus);
4856 	count = CPU_COUNT(&ctx->ifc_cpus);
4857 	eqid = qid % count;
4858 	/* clear up to the qid'th bit */
4859 	for (i = 0; i < eqid; i++) {
4860 		cpuid = CPU_FFS(cpus);
4861 		MPASS(cpuid != 0);
4862 		CPU_CLR(cpuid-1, cpus);
4863 	}
4864 	cpuid = CPU_FFS(cpus);
4865 	MPASS(cpuid != 0);
4866 	return (cpuid-1);
4867 }
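
/*
 * Worked example (illustrative): with an 8-CPU set {0..7} and qid = 10,
 * count = 8 and eqid = 10 % 8 = 2, so the two lowest set bits (CPUs 0 and 1)
 * are cleared and CPU 2 is returned.  Queue ids therefore round-robin across
 * the CPUs in ifc_cpus.
 */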
4868 
4869 int
4870 iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
4871 						iflib_intr_type_t type, driver_filter_t *filter,
4872 						void *filter_arg, int qid, char *name)
4873 {
4874 	struct grouptask *gtask;
4875 	struct taskqgroup *tqg;
4876 	iflib_filter_info_t info;
4877 	cpuset_t cpus;
4878 	gtask_fn_t *fn;
4879 	int tqrid, err, cpuid;
4880 	driver_filter_t *intr_fast;
4881 	void *q;
4882 
4883 	info = &ctx->ifc_filter_info;
4884 	tqrid = rid;
4885 
4886 	switch (type) {
4887 	/* XXX merge tx/rx for netmap? */
4888 	case IFLIB_INTR_TX:
4889 		q = &ctx->ifc_txqs[qid];
4890 		info = &ctx->ifc_txqs[qid].ift_filter_info;
4891 		gtask = &ctx->ifc_txqs[qid].ift_task;
4892 		tqg = qgroup_if_io_tqg;
4893 		fn = _task_fn_tx;
4894 		intr_fast = iflib_fast_intr;
4895 		GROUPTASK_INIT(gtask, 0, fn, q);
4896 		break;
4897 	case IFLIB_INTR_RX:
4898 		q = &ctx->ifc_rxqs[qid];
4899 		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
4900 		gtask = &ctx->ifc_rxqs[qid].ifr_task;
4901 		tqg = qgroup_if_io_tqg;
4902 		fn = _task_fn_rx;
4903 		intr_fast = iflib_fast_intr;
4904 		GROUPTASK_INIT(gtask, 0, fn, q);
4905 		break;
4906 	case IFLIB_INTR_RXTX:
4907 		q = &ctx->ifc_rxqs[qid];
4908 		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
4909 		gtask = &ctx->ifc_rxqs[qid].ifr_task;
4910 		tqg = qgroup_if_io_tqg;
4911 		fn = _task_fn_rx;
4912 		intr_fast = iflib_fast_intr_rxtx;
4913 		GROUPTASK_INIT(gtask, 0, fn, q);
4914 		break;
4915 	case IFLIB_INTR_ADMIN:
4916 		q = ctx;
4917 		tqrid = -1;
4918 		info = &ctx->ifc_filter_info;
4919 		gtask = &ctx->ifc_admin_task;
4920 		tqg = qgroup_if_config_tqg;
4921 		fn = _task_fn_admin;
4922 		intr_fast = iflib_fast_intr_ctx;
4923 		break;
4924 	default:
4925 		panic("unknown net intr type");
4926 	}
4927 
4928 	info->ifi_filter = filter;
4929 	info->ifi_filter_arg = filter_arg;
4930 	info->ifi_task = gtask;
4931 	info->ifi_ctx = q;
4932 
4933 	err = _iflib_irq_alloc(ctx, irq, rid, intr_fast, NULL, info,  name);
4934 	if (err != 0) {
4935 		device_printf(ctx->ifc_dev, "_iflib_irq_alloc failed %d\n", err);
4936 		return (err);
4937 	}
4938 	if (type == IFLIB_INTR_ADMIN)
4939 		return (0);
4940 
4941 	if (tqrid != -1) {
4942 		cpuid = find_nth(ctx, &cpus, qid);
4943 		taskqgroup_attach_cpu(tqg, gtask, q, cpuid, irq->ii_rid, name);
4944 	} else {
4945 		taskqgroup_attach(tqg, gtask, q, tqrid, name);
4946 	}
4947 
4948 	return (0);
4949 }
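
/*
 * Illustrative (not taken from any specific driver) use of the routine above
 * from a driver's IFDI_MSIX_INTR_ASSIGN() method, allocating one RX/TX vector
 * per queue set plus an admin vector.  "sc", the rid layout (i + 1, adminrid)
 * and the mydrv_* filters are assumptions:
 *
 *	for (i = 0; i < sc->num_queues; i++) {
 *		snprintf(buf, sizeof(buf), "rxq%d", i);
 *		err = iflib_irq_alloc_generic(ctx, &sc->queues[i].irq,
 *		    i + 1, IFLIB_INTR_RXTX, mydrv_msix_que,
 *		    &sc->queues[i], i, buf);
 *		if (err != 0)
 *			return (err);
 *		iflib_softirq_alloc_generic(ctx, i + 1, IFLIB_INTR_TX,
 *		    &sc->queues[i], i, "tx");
 *	}
 *	err = iflib_irq_alloc_generic(ctx, &sc->adm_irq, adminrid,
 *	    IFLIB_INTR_ADMIN, mydrv_msix_adm, sc, 0, "adm");
 */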
4950 
4951 void
4952 iflib_softirq_alloc_generic(if_ctx_t ctx, int rid, iflib_intr_type_t type,  void *arg, int qid, char *name)
4953 {
4954 	struct grouptask *gtask;
4955 	struct taskqgroup *tqg;
4956 	gtask_fn_t *fn;
4957 	void *q;
4958 
4959 	switch (type) {
4960 	case IFLIB_INTR_TX:
4961 		q = &ctx->ifc_txqs[qid];
4962 		gtask = &ctx->ifc_txqs[qid].ift_task;
4963 		tqg = qgroup_if_io_tqg;
4964 		fn = _task_fn_tx;
4965 		break;
4966 	case IFLIB_INTR_RX:
4967 		q = &ctx->ifc_rxqs[qid];
4968 		gtask = &ctx->ifc_rxqs[qid].ifr_task;
4969 		tqg = qgroup_if_io_tqg;
4970 		fn = _task_fn_rx;
4971 		break;
4972 	case IFLIB_INTR_IOV:
4973 		q = ctx;
4974 		gtask = &ctx->ifc_vflr_task;
4975 		tqg = qgroup_if_config_tqg;
4976 		rid = -1;
4977 		fn = _task_fn_iov;
4978 		break;
4979 	default:
4980 		panic("unknown net intr type");
4981 	}
4982 	GROUPTASK_INIT(gtask, 0, fn, q);
4983 	taskqgroup_attach(tqg, gtask, q, rid, name);
4984 }
4985 
4986 void
4987 iflib_irq_free(if_ctx_t ctx, if_irq_t irq)
4988 {
4989 	if (irq->ii_tag)
4990 		bus_teardown_intr(ctx->ifc_dev, irq->ii_res, irq->ii_tag);
4991 
4992 	if (irq->ii_res)
4993 		bus_release_resource(ctx->ifc_dev, SYS_RES_IRQ, irq->ii_rid, irq->ii_res);
4994 }
4995 
4996 static int
4997 iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, char *name)
4998 {
4999 	iflib_txq_t txq = ctx->ifc_txqs;
5000 	iflib_rxq_t rxq = ctx->ifc_rxqs;
5001 	if_irq_t irq = &ctx->ifc_legacy_irq;
5002 	iflib_filter_info_t info;
5003 	struct grouptask *gtask;
5004 	struct taskqgroup *tqg;
5005 	gtask_fn_t *fn;
5006 	int tqrid;
5007 	void *q;
5008 	int err;
5009 
5010 	q = &ctx->ifc_rxqs[0];
5011 	info = &rxq[0].ifr_filter_info;
5012 	gtask = &rxq[0].ifr_task;
5013 	tqg = qgroup_if_io_tqg;
5014 	tqrid = irq->ii_rid = *rid;
5015 	fn = _task_fn_rx;
5016 
5017 	ctx->ifc_flags |= IFC_LEGACY;
5018 	info->ifi_filter = filter;
5019 	info->ifi_filter_arg = filter_arg;
5020 	info->ifi_task = gtask;
5021 	info->ifi_ctx = ctx;
5022 
5023 	/* We allocate a single interrupt resource */
5024 	if ((err = _iflib_irq_alloc(ctx, irq, tqrid, iflib_fast_intr_ctx, NULL, info, name)) != 0)
5025 		return (err);
5026 	GROUPTASK_INIT(gtask, 0, fn, q);
5027 	taskqgroup_attach(tqg, gtask, q, tqrid, name);
5028 
5029 	GROUPTASK_INIT(&txq->ift_task, 0, _task_fn_tx, txq);
5030 	taskqgroup_attach(qgroup_if_io_tqg, &txq->ift_task, txq, tqrid, "tx");
5031 	return (0);
5032 }
5033 
5034 void
5035 iflib_led_create(if_ctx_t ctx)
5036 {
5037 
5038 	ctx->ifc_led_dev = led_create(iflib_led_func, ctx,
5039 	    device_get_nameunit(ctx->ifc_dev));
5040 }
5041 
5042 void
5043 iflib_tx_intr_deferred(if_ctx_t ctx, int txqid)
5044 {
5045 
5046 	GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task);
5047 }
5048 
5049 void
5050 iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid)
5051 {
5052 
5053 	GROUPTASK_ENQUEUE(&ctx->ifc_rxqs[rxqid].ifr_task);
5054 }
5055 
5056 void
5057 iflib_admin_intr_deferred(if_ctx_t ctx)
5058 {
5059 #ifdef INVARIANTS
5060 	struct grouptask *gtask;
5061 
5062 	gtask = &ctx->ifc_admin_task;
5063 	MPASS(gtask->gt_taskqueue != NULL);
5064 #endif
5065 
5066 	GROUPTASK_ENQUEUE(&ctx->ifc_admin_task);
5067 }
5068 
5069 void
5070 iflib_iov_intr_deferred(if_ctx_t ctx)
5071 {
5072 
5073 	GROUPTASK_ENQUEUE(&ctx->ifc_vflr_task);
5074 }
5075 
5076 void
5077 iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, char *name)
5078 {
5079 
5080 	taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, -1, name);
5081 }
5082 
5083 void
5084 iflib_config_gtask_init(if_ctx_t ctx, struct grouptask *gtask, gtask_fn_t *fn,
5085 	char *name)
5086 {
5087 
5088 	GROUPTASK_INIT(gtask, 0, fn, ctx);
5089 	taskqgroup_attach(qgroup_if_config_tqg, gtask, gtask, -1, name);
5090 }
5091 
5092 void
5093 iflib_config_gtask_deinit(struct grouptask *gtask)
5094 {
5095 
5096 	taskqgroup_detach(qgroup_if_config_tqg, gtask);
5097 }
5098 
5099 void
5100 iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate)
5101 {
5102 	if_t ifp = ctx->ifc_ifp;
5103 	iflib_txq_t txq = ctx->ifc_txqs;
5104 
5105 	if_setbaudrate(ifp, baudrate);
5106 	if (baudrate >= IF_Gbps(10))
5107 		ctx->ifc_flags |= IFC_PREFETCH;
5108 
5109 	/* If link down, disable watchdog */
5110 	if ((ctx->ifc_link_state == LINK_STATE_UP) && (link_state == LINK_STATE_DOWN)) {
5111 		for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxqsets; i++, txq++)
5112 			txq->ift_qstatus = IFLIB_QUEUE_IDLE;
5113 	}
5114 	ctx->ifc_link_state = link_state;
5115 	if_link_state_change(ifp, link_state);
5116 }
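
/*
 * Drivers report link transitions by calling the function above, typically
 * from their IFDI_UPDATE_ADMIN_STATUS() method once the hardware link
 * registers have been read, e.g. (illustrative only; "sc" fields assumed):
 *
 *	iflib_link_state_change(ctx, sc->link_up ?
 *	    LINK_STATE_UP : LINK_STATE_DOWN, sc->link_speed);
 */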
5117 
5118 static int
5119 iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq)
5120 {
5121 	int credits;
5122 #ifdef INVARIANTS
5123 	int credits_pre = txq->ift_cidx_processed;
5124 #endif
5125 
5126 	if (ctx->isc_txd_credits_update == NULL)
5127 		return (0);
5128 
5129 	if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, true)) == 0)
5130 		return (0);
5131 
5132 	txq->ift_processed += credits;
5133 	txq->ift_cidx_processed += credits;
5134 
5135 	MPASS(credits_pre + credits == txq->ift_cidx_processed);
5136 	if (txq->ift_cidx_processed >= txq->ift_size)
5137 		txq->ift_cidx_processed -= txq->ift_size;
5138 	return (credits);
5139 }
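
/*
 * Example of the wrap handled above (numbers are illustrative): with
 * ift_size = 1024, ift_cidx_processed = 1000 and a credit return of 100,
 * the intermediate value 1100 exceeds the ring size and is reduced to 76,
 * keeping the processed consumer index within [0, ift_size).
 */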
5140 
5141 static int
5142 iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget)
5143 {
5144 
5145 	return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx,
5146 	    budget));
5147 }
5148 
5149 void
5150 iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name,
5151 	const char *description, if_int_delay_info_t info,
5152 	int offset, int value)
5153 {
5154 	info->iidi_ctx = ctx;
5155 	info->iidi_offset = offset;
5156 	info->iidi_value = value;
5157 	SYSCTL_ADD_PROC(device_get_sysctl_ctx(ctx->ifc_dev),
5158 	    SYSCTL_CHILDREN(device_get_sysctl_tree(ctx->ifc_dev)),
5159 	    OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW,
5160 	    info, 0, iflib_sysctl_int_delay, "I", description);
5161 }
5162 
5163 struct mtx *
5164 iflib_ctx_lock_get(if_ctx_t ctx)
5165 {
5166 
5167 	return (&ctx->ifc_mtx);
5168 }
5169 
5170 static int
5171 iflib_msix_init(if_ctx_t ctx)
5172 {
5173 	device_t dev = ctx->ifc_dev;
5174 	if_shared_ctx_t sctx = ctx->ifc_sctx;
5175 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
5176 	int vectors, queues, rx_queues, tx_queues, queuemsgs, msgs;
5177 	int iflib_num_tx_queues, iflib_num_rx_queues;
5178 	int err, admincnt, bar;
5179 
5180 	iflib_num_tx_queues = scctx->isc_ntxqsets;
5181 	iflib_num_rx_queues = scctx->isc_nrxqsets;
5182 
5183 	device_printf(dev, "msix_init qsets capped at %d\n", iflib_num_tx_queues);
5184 
5185 	bar = ctx->ifc_softc_ctx.isc_msix_bar;
5186 	admincnt = sctx->isc_admin_intrcnt;
5187 	/* Overridden by tunable */
5188 	if (scctx->isc_disable_msix)
5189 		goto msi;
5190 
5191 	/*
5192 	** When used in a virtualized environment the
5193 	** PCI BUSMASTER capability may not be set,
5194 	** so explicitly set it here and rewrite
5195 	** the ENABLE bit in the MSI-X control register
5196 	** at this point to cause the host to
5197 	** successfully initialize us.
5198 	*/
5199 	{
5200 		int msix_ctrl, rid;
5201 
5202 		pci_enable_busmaster(dev);
5203 		rid = 0;
5204 		if (pci_find_cap(dev, PCIY_MSIX, &rid) == 0 && rid != 0) {
5205 			rid += PCIR_MSIX_CTRL;
5206 			msix_ctrl = pci_read_config(dev, rid, 2);
5207 			msix_ctrl |= PCIM_MSIXCTRL_MSIX_ENABLE;
5208 			pci_write_config(dev, rid, msix_ctrl, 2);
5209 		} else {
5210 			device_printf(dev, "PCIY_MSIX capability not found; "
5211 			                   "or rid %d == 0.\n", rid);
5212 			goto msi;
5213 		}
5214 	}
5215 
5216 	/*
5217 	 * bar == -1 => "trust me I know what I'm doing"
5218 	 * https://www.youtube.com/watch?v=nnwWKkNau4I
5219 	 * Some drivers are for hardware that is so shoddily
5220 	 * documented that no one knows which BARs are which,
5221 	 * so the developer has to map all bars. This hack
5222 	 * allows shoddy garbage to use MSI-X in this framework.
5223 	 */
5224 	if (bar != -1) {
5225 		ctx->ifc_msix_mem = bus_alloc_resource_any(dev,
5226 	            SYS_RES_MEMORY, &bar, RF_ACTIVE);
5227 		if (ctx->ifc_msix_mem == NULL) {
5228 			/* May not be enabled */
5229 			device_printf(dev, "Unable to map MSIX table \n");
5230 			goto msi;
5231 		}
5232 	}
5233 	/* First try MSI/X */
5234 	if ((msgs = pci_msix_count(dev)) == 0) { /* system has MSI-X disabled */
5235 		device_printf(dev, "System has MSI-X disabled\n");
5236 		bus_release_resource(dev, SYS_RES_MEMORY,
5237 		    bar, ctx->ifc_msix_mem);
5238 		ctx->ifc_msix_mem = NULL;
5239 		goto msi;
5240 	}
5241 #if IFLIB_DEBUG
5242 	/* use only 1 qset in debug mode */
5243 	queuemsgs = min(msgs - admincnt, 1);
5244 #else
5245 	queuemsgs = msgs - admincnt;
5246 #endif
5247 	if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) == 0) {
5248 #ifdef RSS
5249 		queues = imin(queuemsgs, rss_getnumbuckets());
5250 #else
5251 		queues = queuemsgs;
5252 #endif
5253 		queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues);
5254 		device_printf(dev, "pxm cpus: %d queue msgs: %d admincnt: %d\n",
5255 					  CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt);
5256 	} else {
5257 		device_printf(dev, "Unable to fetch CPU list\n");
5258 		/* Figure out a reasonable auto config value */
5259 		queues = min(queuemsgs, mp_ncpus);
5260 	}
5261 #ifdef  RSS
5262 	/* If we're doing RSS, clamp at the number of RSS buckets */
5263 	if (queues > rss_getnumbuckets())
5264 		queues = rss_getnumbuckets();
5265 #endif
5266 	if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt)
5267 		rx_queues = iflib_num_rx_queues;
5268 	else
5269 		rx_queues = queues;
5270 	/*
5271 	 * We want this to be all logical CPUs by default
5272 	 */
5273 	if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues)
5274 		tx_queues = iflib_num_tx_queues;
5275 	else
5276 		tx_queues = mp_ncpus;
5277 
5278 	if (ctx->ifc_sysctl_qs_eq_override == 0) {
5279 #ifdef INVARIANTS
5280 		if (tx_queues != rx_queues)
5281 			device_printf(dev, "queue equality override not set, capping rx_queues at %d and tx_queues at %d\n",
5282 				      min(rx_queues, tx_queues), min(rx_queues, tx_queues));
5283 #endif
5284 		tx_queues = min(rx_queues, tx_queues);
5285 		rx_queues = min(rx_queues, tx_queues);
5286 	}
5287 
5288 	device_printf(dev, "using %d rx queues %d tx queues \n", rx_queues, tx_queues);
5289 
5290 	vectors = rx_queues + admincnt;
5291 	if ((err = pci_alloc_msix(dev, &vectors)) == 0) {
5292 		device_printf(dev,
5293 					  "Using MSIX interrupts with %d vectors\n", vectors);
5294 		scctx->isc_vectors = vectors;
5295 		scctx->isc_nrxqsets = rx_queues;
5296 		scctx->isc_ntxqsets = tx_queues;
5297 		scctx->isc_intr = IFLIB_INTR_MSIX;
5298 
5299 		return (vectors);
5300 	} else {
5301 		device_printf(dev, "failed to allocate %d msix vectors, err: %d - using MSI\n", vectors, err);
5302 	}
5303 msi:
5304 	vectors = pci_msi_count(dev);
5305 	scctx->isc_nrxqsets = 1;
5306 	scctx->isc_ntxqsets = 1;
5307 	scctx->isc_vectors = vectors;
5308 	if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) {
5309 		device_printf(dev,"Using an MSI interrupt\n");
5310 		scctx->isc_intr = IFLIB_INTR_MSI;
5311 	} else {
5312 		device_printf(dev,"Using a Legacy interrupt\n");
5313 		scctx->isc_intr = IFLIB_INTR_LEGACY;
5314 	}
5315 
5316 	return (vectors);
5317 }
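
/*
 * Illustrative vector accounting for the function above: on a 16-CPU system
 * where the device exposes 32 MSI-X messages and isc_admin_intrcnt is 1,
 * queuemsgs = 31 and queues = min(16, 31) = 16; absent sysctl overrides,
 * rx_queues = tx_queues = 16, so pci_alloc_msix() is asked for
 * 16 + 1 = 17 vectors.
 */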
5318 
5319 char * ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" };
5320 
5321 static int
5322 mp_ring_state_handler(SYSCTL_HANDLER_ARGS)
5323 {
5324 	int rc;
5325 	uint16_t *state = ((uint16_t *)oidp->oid_arg1);
5326 	struct sbuf *sb;
5327 	char *ring_state = "UNKNOWN";
5328 
5329 	/* XXX needed ? */
5330 	rc = sysctl_wire_old_buffer(req, 0);
5331 	MPASS(rc == 0);
5332 	if (rc != 0)
5333 		return (rc);
5334 	sb = sbuf_new_for_sysctl(NULL, NULL, 80, req);
5335 	MPASS(sb != NULL);
5336 	if (sb == NULL)
5337 		return (ENOMEM);
5338 	if (state[3] <= 3)
5339 		ring_state = ring_states[state[3]];
5340 
5341 	sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s",
5342 		    state[0], state[1], state[2], ring_state);
5343 	rc = sbuf_finish(sb);
5344 	sbuf_delete(sb);
5345 	return (rc);
5346 }
5347 
5348 enum iflib_ndesc_handler {
5349 	IFLIB_NTXD_HANDLER,
5350 	IFLIB_NRXD_HANDLER,
5351 };
5352 
5353 static int
5354 mp_ndesc_handler(SYSCTL_HANDLER_ARGS)
5355 {
5356 	if_ctx_t ctx = (void *)arg1;
5357 	enum iflib_ndesc_handler type = arg2;
5358 	char buf[256] = {0};
5359 	qidx_t *ndesc;
5360 	char *p, *next;
5361 	int nqs, rc, i;
5362 
5363 	MPASS(type == IFLIB_NTXD_HANDLER || type == IFLIB_NRXD_HANDLER);
5364 
5365 	nqs = 8;
5366 	switch(type) {
5367 	case IFLIB_NTXD_HANDLER:
5368 		ndesc = ctx->ifc_sysctl_ntxds;
5369 		if (ctx->ifc_sctx)
5370 			nqs = ctx->ifc_sctx->isc_ntxqs;
5371 		break;
5372 	case IFLIB_NRXD_HANDLER:
5373 		ndesc = ctx->ifc_sysctl_nrxds;
5374 		if (ctx->ifc_sctx)
5375 			nqs = ctx->ifc_sctx->isc_nrxqs;
5376 		break;
5377 	}
5378 	if (nqs == 0)
5379 		nqs = 8;
5380 
5381 	for (i=0; i<8; i++) {
5382 		if (i >= nqs)
5383 			break;
5384 		if (i)
5385 			strcat(buf, ",");
5386 		sprintf(strchr(buf, 0), "%d", ndesc[i]);
5387 	}
5388 
5389 	rc = sysctl_handle_string(oidp, buf, sizeof(buf), req);
5390 	if (rc || req->newptr == NULL)
5391 		return rc;
5392 
5393 	for (i = 0, next = buf, p = strsep(&next, " ,"); i < 8 && p;
5394 	    i++, p = strsep(&next, " ,")) {
5395 		ndesc[i] = strtoul(p, NULL, 10);
5396 	}
5397 
5398 	return (rc);
5399 }
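
/*
 * The handler above parses a comma- or space-separated list, one entry per
 * hardware queue in the set.  A hypothetical loader.conf line (driver name
 * made up) for a device with a single TX queue per set might be:
 *
 *	dev.mydrv.0.iflib.override_ntxds="4096"
 *
 * while a device with two queues per set (e.g. separate descriptor and
 * completion rings) would take "4096,4096".  Unset entries fall back to the
 * driver defaults, as handled in iflib_device_register().
 */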
5400 
5401 #define NAME_BUFLEN 32
5402 static void
5403 iflib_add_device_sysctl_pre(if_ctx_t ctx)
5404 {
5405 	device_t dev = iflib_get_dev(ctx);
5406 	struct sysctl_oid_list *child, *oid_list;
5407 	struct sysctl_ctx_list *ctx_list;
5408 	struct sysctl_oid *node;
5409 
5410 	ctx_list = device_get_sysctl_ctx(dev);
5411 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
5412 	ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, "iflib",
5413 						      CTLFLAG_RD, NULL, "IFLIB fields");
5414 	oid_list = SYSCTL_CHILDREN(node);
5415 
5416 	SYSCTL_ADD_STRING(ctx_list, oid_list, OID_AUTO, "driver_version",
5417 		       CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version, 0,
5418 		       "driver version");
5419 
5420 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs",
5421 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0,
5422 		       "# of txqs to use, 0 => use default #");
5423 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs",
5424 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0,
5425 		       "# of rxqs to use, 0 => use default #");
5426 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable",
5427 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0,
5428 		       "permit #txq != #rxq");
5429 	SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "disable_msix",
5430 		       CTLFLAG_RWTUN, &ctx->ifc_softc_ctx.isc_disable_msix, 0,
5431 		       "disable MSIX (default 0)");
5432 
5433 	/* XXX change for per-queue sizes */
5434 	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
5435 		       CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NTXD_HANDLER,
5436 		       mp_ndesc_handler, "A",
5437 		       "list of # of tx descriptors to use, 0 = use default #");
5438 	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds",
5439 		       CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NRXD_HANDLER,
5440 		       mp_ndesc_handler, "A",
5441 		       "list of # of rx descriptors to use, 0 = use default #");
5442 }
5443 
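/*
 * Once the queue sets exist, attach per-txq and per-rxq child nodes with
 * mp_ring statistics, descriptor indices and freelist state under the
 * "iflib" sysctl node created in iflib_add_device_sysctl_pre().
 */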
5444 static void
5445 iflib_add_device_sysctl_post(if_ctx_t ctx)
5446 {
5447 	if_shared_ctx_t sctx = ctx->ifc_sctx;
5448 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
5449 	device_t dev = iflib_get_dev(ctx);
5450 	struct sysctl_oid_list *child;
5451 	struct sysctl_ctx_list *ctx_list;
5452 	iflib_fl_t fl;
5453 	iflib_txq_t txq;
5454 	iflib_rxq_t rxq;
5455 	int i, j;
5456 	char namebuf[NAME_BUFLEN];
5457 	char *qfmt;
5458 	struct sysctl_oid *queue_node, *fl_node, *node;
5459 	struct sysctl_oid_list *queue_list, *fl_list;
5460 	ctx_list = device_get_sysctl_ctx(dev);
5461 
5462 	node = ctx->ifc_sysctl_node;
5463 	child = SYSCTL_CHILDREN(node);
5464 
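	/* Pick a queue-name format wide enough for the number of txq sets. */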
5465 	if (scctx->isc_ntxqsets > 100)
5466 		qfmt = "txq%03d";
5467 	else if (scctx->isc_ntxqsets > 10)
5468 		qfmt = "txq%02d";
5469 	else
5470 		qfmt = "txq%d";
5471 	for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) {
5472 		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
5473 		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
5474 					     CTLFLAG_RD, NULL, "Queue Name");
5475 		queue_list = SYSCTL_CHILDREN(queue_node);
5476 #if MEMORY_LOGGING
5477 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued",
5478 				CTLFLAG_RD,
5479 				&txq->ift_dequeued, "total mbufs freed");
5480 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued",
5481 				CTLFLAG_RD,
5482 				&txq->ift_enqueued, "total mbufs enqueued");
5483 #endif
5484 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag",
5485 				   CTLFLAG_RD,
5486 				   &txq->ift_mbuf_defrag, "# of times m_defrag was called");
5487 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "m_pullups",
5488 				   CTLFLAG_RD,
5489 				   &txq->ift_pullups, "# of times m_pullup was called");
5490 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag_failed",
5491 				   CTLFLAG_RD,
5492 				   &txq->ift_mbuf_defrag_failed, "# of times m_defrag failed");
5493 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_desc_avail",
5494 				   CTLFLAG_RD,
5495 				   &txq->ift_no_desc_avail, "# of times no descriptors were available");
5496 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "tx_map_failed",
5497 				   CTLFLAG_RD,
5498 				   &txq->ift_map_failed, "# of times dma map failed");
5499 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txd_encap_efbig",
5500 				   CTLFLAG_RD,
5501 				   &txq->ift_txd_encap_efbig, "# of times txd_encap returned EFBIG");
5502 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_tx_dma_setup",
5503 				   CTLFLAG_RD,
5504 				   &txq->ift_no_tx_dma_setup, "# of times map failed for other than EFBIG");
5505 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx",
5506 				   CTLFLAG_RD,
5507 				   &txq->ift_pidx, 1, "Producer Index");
5508 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx",
5509 				   CTLFLAG_RD,
5510 				   &txq->ift_cidx, 1, "Consumer Index");
5511 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx_processed",
5512 				   CTLFLAG_RD,
5513 				   &txq->ift_cidx_processed, 1, "Consumer Index seen by credit update");
5514 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use",
5515 				   CTLFLAG_RD,
5516 				   &txq->ift_in_use, 1, "descriptors in use");
5517 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_processed",
5518 				   CTLFLAG_RD,
5519 				   &txq->ift_processed, "descriptors processed for clean");
5520 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned",
5521 				   CTLFLAG_RD,
5522 				   &txq->ift_cleaned, "total cleaned");
5523 		SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state",
5524 				CTLTYPE_STRING | CTLFLAG_RD, __DEVOLATILE(uint64_t *, &txq->ift_br->state),
5525 				0, mp_ring_state_handler, "A", "soft ring state");
5526 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_enqueues",
5527 				       CTLFLAG_RD, &txq->ift_br->enqueues,
5528 				       "# of enqueues to the mp_ring for this queue");
5529 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_drops",
5530 				       CTLFLAG_RD, &txq->ift_br->drops,
5531 				       "# of drops in the mp_ring for this queue");
5532 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_starts",
5533 				       CTLFLAG_RD, &txq->ift_br->starts,
5534 				       "# of normal consumer starts in the mp_ring for this queue");
5535 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_stalls",
5536 				       CTLFLAG_RD, &txq->ift_br->stalls,
5537 					       "# of consumer stalls in the mp_ring for this queue");
5538 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_restarts",
5539 			       CTLFLAG_RD, &txq->ift_br->restarts,
5540 				       "# of consumer restarts in the mp_ring for this queue");
5541 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_abdications",
5542 				       CTLFLAG_RD, &txq->ift_br->abdications,
5543 				       "# of consumer abdications in the mp_ring for this queue");
5544 	}
5545 
5546 	if (scctx->isc_nrxqsets > 100)
5547 		qfmt = "rxq%03d";
5548 	else if (scctx->isc_nrxqsets > 10)
5549 		qfmt = "rxq%02d";
5550 	else
5551 		qfmt = "rxq%d";
5552 	for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) {
5553 		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
5554 		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
5555 					     CTLFLAG_RD, NULL, "Queue Name");
5556 		queue_list = SYSCTL_CHILDREN(queue_node);
5557 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
5558 			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_pidx",
5559 				       CTLFLAG_RD,
5560 				       &rxq->ifr_cq_pidx, 1, "Producer Index");
5561 			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx",
5562 				       CTLFLAG_RD,
5563 				       &rxq->ifr_cq_cidx, 1, "Consumer Index");
5564 		}
5565 
5566 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
5567 			snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j);
5568 			fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list, OID_AUTO, namebuf,
5569 						     CTLFLAG_RD, NULL, "freelist Name");
5570 			fl_list = SYSCTL_CHILDREN(fl_node);
5571 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx",
5572 				       CTLFLAG_RD,
5573 				       &fl->ifl_pidx, 1, "Producer Index");
5574 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx",
5575 				       CTLFLAG_RD,
5576 				       &fl->ifl_cidx, 1, "Consumer Index");
5577 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits",
5578 				       CTLFLAG_RD,
5579 				       &fl->ifl_credits, 1, "credits available");
5580 #if MEMORY_LOGGING
5581 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_enqueued",
5582 					CTLFLAG_RD,
5583 					&fl->ifl_m_enqueued, "mbufs allocated");
5584 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_dequeued",
5585 					CTLFLAG_RD,
5586 					&fl->ifl_m_dequeued, "mbufs freed");
5587 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_enqueued",
5588 					CTLFLAG_RD,
5589 					&fl->ifl_cl_enqueued, "clusters allocated");
5590 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_dequeued",
5591 					CTLFLAG_RD,
5592 					&fl->ifl_cl_dequeued, "clusters freed");
5593 #endif
5594 
5595 		}
5596 	}
5597 
5598 }
5599 
5600 #ifndef __NO_STRICT_ALIGNMENT
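/*
 * On strict-alignment architectures the 14-byte Ethernet header leaves the
 * following payload misaligned.  Small frames are slid forward within their
 * cluster; larger frames have the Ethernet header copied into a separate
 * mbuf header that is prepended to the chain.
 */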
5601 static struct mbuf *
5602 iflib_fixup_rx(struct mbuf *m)
5603 {
5604 	struct mbuf *n;
5605 
5606 	if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) {
5607 		bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len);
5608 		m->m_data += ETHER_HDR_LEN;
5609 		n = m;
5610 	} else {
5611 		MGETHDR(n, M_NOWAIT, MT_DATA);
5612 		if (n == NULL) {
5613 			m_freem(m);
5614 			return (NULL);
5615 		}
5616 		bcopy(m->m_data, n->m_data, ETHER_HDR_LEN);
5617 		m->m_data += ETHER_HDR_LEN;
5618 		m->m_len -= ETHER_HDR_LEN;
5619 		n->m_len = ETHER_HDR_LEN;
5620 		M_MOVE_PKTHDR(n, m);
5621 		n->m_next = m;
5622 	}
5623 	return (n);
5624 }
5625 #endif
5626