xref: /freebsd/sys/dev/cxgbe/t4_sge.c (revision aa0a1e58)
1 /*-
2  * Copyright (c) 2011 Chelsio Communications, Inc.
3  * All rights reserved.
4  * Written by: Navdeep Parhar <np@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_inet.h"
32 
33 #include <sys/types.h>
34 #include <sys/mbuf.h>
35 #include <sys/socket.h>
36 #include <sys/kernel.h>
37 #include <sys/malloc.h>
38 #include <sys/queue.h>
39 #include <sys/taskqueue.h>
40 #include <sys/sysctl.h>
41 #include <net/bpf.h>
42 #include <net/ethernet.h>
43 #include <net/if.h>
44 #include <net/if_vlan_var.h>
45 #include <netinet/in.h>
46 #include <netinet/ip.h>
47 #include <netinet/tcp.h>
48 
49 #include "common/common.h"
50 #include "common/t4_regs.h"
51 #include "common/t4_regs_values.h"
52 #include "common/t4_msg.h"
53 #include "common/t4fw_interface.h"
54 
55 struct fl_buf_info {
56 	int size;
57 	int type;
58 	uma_zone_t zone;
59 };
60 
61 /* Filled up by t4_sge_modload */
62 static struct fl_buf_info fl_buf_info[FL_BUF_SIZES];
63 
64 #define FL_BUF_SIZE(x)	(fl_buf_info[x].size)
65 #define FL_BUF_TYPE(x)	(fl_buf_info[x].type)
66 #define FL_BUF_ZONE(x)	(fl_buf_info[x].zone)
67 
68 enum {
69 	FL_PKTSHIFT = 2
70 };
71 
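/*
 * FL_ALIGN is written to the SGE as the ingress padding boundary.  SPG_LEN is
 * the length of the status page the SGE keeps at the end of each egress queue
 * and freelist ring; it is bumped to 128B on systems with cache lines larger
 * than 64B so that the page covers whole cache lines.
 */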
72 #define FL_ALIGN	min(CACHE_LINE_SIZE, 32)
73 #if CACHE_LINE_SIZE > 64
74 #define SPG_LEN		128
75 #else
76 #define SPG_LEN		64
77 #endif
78 
79 /* Used to track a coalesced tx work request as it is built up */
80 struct txpkts {
81 	uint64_t *flitp;	/* ptr to flit where next pkt should start */
82 	uint8_t npkt;		/* # of packets in this work request */
83 	uint8_t nflits;		/* # of flits used by this work request */
84 	uint16_t plen;		/* total payload (sum of all packets) */
85 };
86 
87 /* A packet's SGL.  This + m_pkthdr has all info needed for tx */
88 struct sgl {
89 	int nsegs;		/* # of segments in the SGL, 0 means imm. tx */
90 	int nflits;		/* # of flits needed for the SGL */
91 	bus_dma_segment_t seg[TX_SGL_SEGS];
92 };
93 
94 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int,
95     int, iq_intr_handler_t *, char *);
96 static inline void init_fl(struct sge_fl *, int, char *);
97 static inline void init_txq(struct sge_txq *, int, char *);
98 static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
99     bus_addr_t *, void **);
100 static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
101     void *);
102 static int alloc_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *,
103     int);
104 static int free_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *);
105 static int alloc_iq(struct sge_iq *, int);
106 static int free_iq(struct sge_iq *);
107 static int alloc_rxq(struct port_info *, struct sge_rxq *, int, int);
108 static int free_rxq(struct port_info *, struct sge_rxq *);
109 static int alloc_txq(struct port_info *, struct sge_txq *, int);
110 static int free_txq(struct port_info *, struct sge_txq *);
111 static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
112 static inline bool is_new_response(const struct sge_iq *, struct rsp_ctrl **);
113 static inline void iq_next(struct sge_iq *);
114 static inline void ring_fl_db(struct adapter *, struct sge_fl *);
115 static void refill_fl(struct sge_fl *, int);
116 static int alloc_fl_sdesc(struct sge_fl *);
117 static void free_fl_sdesc(struct sge_fl *);
118 static int alloc_eq_maps(struct sge_eq *);
119 static void free_eq_maps(struct sge_eq *);
120 static void set_fl_tag_idx(struct sge_fl *, int);
121 
122 static int get_pkt_sgl(struct sge_txq *, struct mbuf **, struct sgl *, int);
123 static int free_pkt_sgl(struct sge_txq *, struct sgl *);
124 static int write_txpkt_wr(struct port_info *, struct sge_txq *, struct mbuf *,
125     struct sgl *);
126 static int add_to_txpkts(struct port_info *, struct sge_txq *, struct txpkts *,
127     struct mbuf *, struct sgl *);
128 static void write_txpkts_wr(struct sge_txq *, struct txpkts *);
129 static inline void write_ulp_cpl_sgl(struct port_info *, struct sge_txq *,
130     struct txpkts *, struct mbuf *, struct sgl *);
131 static int write_sgl_to_txd(struct sge_eq *, struct sgl *, caddr_t *);
132 static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
133 static inline void ring_tx_db(struct adapter *, struct sge_eq *);
134 static inline int reclaimable(struct sge_eq *);
135 static int reclaim_tx_descs(struct sge_eq *, int, int);
136 static void write_eqflush_wr(struct sge_eq *);
137 static __be64 get_flit(bus_dma_segment_t *, int, int);
138 static int handle_sge_egr_update(struct adapter *,
139     const struct cpl_sge_egr_update *);
140 
141 /*
142  * Called on MOD_LOAD and fills up fl_buf_info[].
143  */
144 void
145 t4_sge_modload(void)
146 {
147 	int i;
148 	int bufsize[FL_BUF_SIZES] = {
149 		MCLBYTES,
150 #if MJUMPAGESIZE != MCLBYTES
151 		MJUMPAGESIZE,
152 #endif
153 		MJUM9BYTES,
154 		MJUM16BYTES
155 	};
156 
157 	for (i = 0; i < FL_BUF_SIZES; i++) {
158 		FL_BUF_SIZE(i) = bufsize[i];
159 		FL_BUF_TYPE(i) = m_gettype(bufsize[i]);
160 		FL_BUF_ZONE(i) = m_getzone(bufsize[i]);
161 	}
162 }
163 
164 /**
165  *	t4_sge_init - initialize SGE
166  *	@sc: the adapter
167  *
168  *	Performs SGE initialization needed every time after a chip reset.
169  *	We do not initialize any of the queues here; the top-level driver
170  *	must request them individually.
171  */
172 void
173 t4_sge_init(struct adapter *sc)
174 {
175 	struct sge *s = &sc->sge;
176 	int i;
177 
178 	t4_set_reg_field(sc, A_SGE_CONTROL, V_PKTSHIFT(M_PKTSHIFT) |
179 			 V_INGPADBOUNDARY(M_INGPADBOUNDARY) |
180 			 F_EGRSTATUSPAGESIZE,
181 			 V_INGPADBOUNDARY(ilog2(FL_ALIGN) - 5) |
182 			 V_PKTSHIFT(FL_PKTSHIFT) |
183 			 F_RXPKTCPLMODE |
184 			 V_EGRSTATUSPAGESIZE(SPG_LEN == 128));
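	/*
	 * The PF's host page size is encoded as log2(page size) - 10, i.e. a
	 * value of 2 means 4KB pages.
	 */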
185 	t4_set_reg_field(sc, A_SGE_HOST_PAGE_SIZE,
186 			 V_HOSTPAGESIZEPF0(M_HOSTPAGESIZEPF0),
187 			 V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10));
188 
189 	for (i = 0; i < FL_BUF_SIZES; i++) {
190 		t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i),
191 		    FL_BUF_SIZE(i));
192 	}
193 
194 	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD,
195 		     V_THRESHOLD_0(s->counter_val[0]) |
196 		     V_THRESHOLD_1(s->counter_val[1]) |
197 		     V_THRESHOLD_2(s->counter_val[2]) |
198 		     V_THRESHOLD_3(s->counter_val[3]));
199 
200 	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1,
201 		     V_TIMERVALUE0(us_to_core_ticks(sc, s->timer_val[0])) |
202 		     V_TIMERVALUE1(us_to_core_ticks(sc, s->timer_val[1])));
203 	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3,
204 		     V_TIMERVALUE2(us_to_core_ticks(sc, s->timer_val[2])) |
205 		     V_TIMERVALUE3(us_to_core_ticks(sc, s->timer_val[3])));
206 	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5,
207 		     V_TIMERVALUE4(us_to_core_ticks(sc, s->timer_val[4])) |
208 		     V_TIMERVALUE5(us_to_core_ticks(sc, s->timer_val[5])));
209 }
210 
211 int
212 t4_create_dma_tag(struct adapter *sc)
213 {
214 	int rc;
215 
216 	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
217 	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
218 	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
219 	    NULL, &sc->dmat);
220 	if (rc != 0) {
221 		device_printf(sc->dev,
222 		    "failed to create main DMA tag: %d\n", rc);
223 	}
224 
225 	return (rc);
226 }
227 
228 int
229 t4_destroy_dma_tag(struct adapter *sc)
230 {
231 	if (sc->dmat)
232 		bus_dma_tag_destroy(sc->dmat);
233 
234 	return (0);
235 }
236 
237 /*
238  * Allocate and initialize the firmware event queue and the forwarded interrupt
239  * queues, if any.  The adapter owns all these queues as they are not associated
240  * with any particular port.
241  *
242  * Returns errno on failure.  Resources allocated up to that point may still be
243  * allocated.  Caller is responsible for cleanup in case this function fails.
244  */
245 int
246 t4_setup_adapter_iqs(struct adapter *sc)
247 {
248 	int i, rc;
249 	struct sge_iq *iq, *fwq;
250 	iq_intr_handler_t *handler;
251 	char name[16];
252 
253 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
254 
255 	fwq = &sc->sge.fwq;
256 	if (sc->flags & INTR_FWD) {
257 		iq = &sc->sge.fiq[0];
258 
259 		/*
260 		 * Forwarded interrupt queues - allocate 1 if there's only 1
261 		 * vector available, one less than the number of vectors
262 		 * otherwise (the first vector is reserved for the error
263 		 * interrupt in that case).
264 		 */
265 		i = sc->intr_count > 1 ? 1 : 0;
266 		for (; i < sc->intr_count; i++, iq++) {
267 
268 			snprintf(name, sizeof(name), "%s fiq%d",
269 			    device_get_nameunit(sc->dev), i);
270 			init_iq(iq, sc, 0, 0, (sc->sge.nrxq + 1) * 2, 16, NULL,
271 			    name);
272 
273 			rc = alloc_iq(iq, i);
274 			if (rc != 0) {
275 				device_printf(sc->dev,
276 				    "failed to create fwd intr queue %d: %d\n",
277 				    i, rc);
278 				return (rc);
279 			}
280 		}
281 
282 		handler = t4_intr_evt;
283 		i = 0;	/* forward fwq's interrupt to the first fiq */
284 	} else {
285 		handler = NULL;
286 		i = 1;	/* fwq should use vector 1 (0 is used by error) */
287 	}
288 
289 	snprintf(name, sizeof(name), "%s fwq", device_get_nameunit(sc->dev));
290 	init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE, FW_IQ_ESIZE, handler, name);
291 	rc = alloc_iq(fwq, i);
292 	if (rc != 0) {
293 		device_printf(sc->dev,
294 		    "failed to create firmware event queue: %d\n", rc);
295 	}
296 
297 	return (rc);
298 }
299 
300 /*
301  * Idempotent
302  */
303 int
304 t4_teardown_adapter_iqs(struct adapter *sc)
305 {
306 	int i;
307 	struct sge_iq *iq;
308 
309 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
310 
311 	iq = &sc->sge.fwq;
312 	free_iq(iq);
313 	if (sc->flags & INTR_FWD) {
314 		for (i = 0; i < NFIQ(sc); i++) {
315 			iq = &sc->sge.fiq[i];
316 			free_iq(iq);
317 		}
318 	}
319 
320 	return (0);
321 }
322 
323 int
324 t4_setup_eth_queues(struct port_info *pi)
325 {
326 	int rc = 0, i, intr_idx;
327 	struct sge_rxq *rxq;
328 	struct sge_txq *txq;
329 	char name[16];
330 	struct adapter *sc = pi->adapter;
331 
332 	if (sysctl_ctx_init(&pi->ctx) == 0) {
333 		struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev);
334 		struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
335 
336 		pi->oid_rxq = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO,
337 		    "rxq", CTLFLAG_RD, NULL, "rx queues");
338 		pi->oid_txq = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO,
339 		    "txq", CTLFLAG_RD, NULL, "tx queues");
340 	}
341 
342 	for_each_rxq(pi, i, rxq) {
343 
344 		snprintf(name, sizeof(name), "%s rxq%d-iq",
345 		    device_get_nameunit(pi->dev), i);
346 		init_iq(&rxq->iq, sc, pi->tmr_idx, pi->pktc_idx,
347 		    pi->qsize_rxq, RX_IQ_ESIZE,
348 		    sc->flags & INTR_FWD ? t4_intr_data: NULL, name);
349 
350 		snprintf(name, sizeof(name), "%s rxq%d-fl",
351 		    device_get_nameunit(pi->dev), i);
352 		init_fl(&rxq->fl, pi->qsize_rxq / 8, name);
353 
354 		if (sc->flags & INTR_FWD)
355 			intr_idx = (pi->first_rxq + i) % NFIQ(sc);
356 		else
357 			intr_idx = pi->first_rxq + i + 2;
358 
359 		rc = alloc_rxq(pi, rxq, intr_idx, i);
360 		if (rc != 0)
361 			goto done;
362 
363 		intr_idx++;
364 	}
365 
366 	for_each_txq(pi, i, txq) {
367 
368 		snprintf(name, sizeof(name), "%s txq%d",
369 		    device_get_nameunit(pi->dev), i);
370 		init_txq(txq, pi->qsize_txq, name);
371 
372 		rc = alloc_txq(pi, txq, i);
373 		if (rc != 0)
374 			goto done;
375 	}
376 
377 done:
378 	if (rc)
379 		t4_teardown_eth_queues(pi);
380 
381 	return (rc);
382 }
383 
384 /*
385  * Idempotent
386  */
387 int
388 t4_teardown_eth_queues(struct port_info *pi)
389 {
390 	int i;
391 	struct sge_rxq *rxq;
392 	struct sge_txq *txq;
393 
394 	/* Do this before freeing the queues */
395 	if (pi->oid_txq || pi->oid_rxq) {
396 		sysctl_ctx_free(&pi->ctx);
397 		pi->oid_txq = pi->oid_rxq = NULL;
398 	}
399 
400 	for_each_txq(pi, i, txq) {
401 		free_txq(pi, txq);
402 	}
403 
404 	for_each_rxq(pi, i, rxq) {
405 		free_rxq(pi, rxq);
406 	}
407 
408 	return (0);
409 }
410 
411 /* Deals with errors and forwarded interrupts */
412 void
413 t4_intr_all(void *arg)
414 {
415 	struct adapter *sc = arg;
416 
417 	t4_intr_err(arg);
418 	t4_intr_fwd(&sc->sge.fiq[0]);
419 }
420 
421 /* Deals with forwarded interrupts on the given ingress queue */
422 void
423 t4_intr_fwd(void *arg)
424 {
425 	struct sge_iq *iq = arg, *q;
426 	struct adapter *sc = iq->adapter;
427 	struct rsp_ctrl *ctrl;
428 	int ndesc_pending = 0, ndesc_total = 0;
429 	int qid;
430 
431 	while (is_new_response(iq, &ctrl)) {
432 
433 		rmb();
434 
435 		/* Only interrupt muxing expected on this queue */
436 		KASSERT(G_RSPD_TYPE(ctrl->u.type_gen) == X_RSPD_TYPE_INTR,
437 		    ("unexpected event on forwarded interrupt queue: %x",
438 		    G_RSPD_TYPE(ctrl->u.type_gen)));
439 
440 		qid = ntohl(ctrl->pldbuflen_qid) - sc->sge.iq_start;
441 		q = sc->sge.iqmap[qid];
442 
443 		q->handler(q);
444 
445 		ndesc_total++;
446 		if (++ndesc_pending >= iq->qsize / 4) {
447 			t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS),
448 			    V_CIDXINC(ndesc_pending) |
449 			    V_INGRESSQID(iq->cntxt_id) |
450 			    V_SEINTARM(
451 				V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
452 			ndesc_pending = 0;
453 		}
454 
455 		iq_next(iq);
456 	}
457 
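	/*
	 * Return any credits not yet returned inside the loop and re-arm the
	 * queue with its normal holdoff parameters.  The updates inside the
	 * loop used X_TIMERREG_UPDATE_CIDX, which returns credits without
	 * re-arming.
	 */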
458 	if (ndesc_total > 0) {
459 		t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS),
460 		    V_CIDXINC(ndesc_pending) | V_INGRESSQID((u32)iq->cntxt_id) |
461 		    V_SEINTARM(iq->intr_params));
462 	}
463 }
464 
465 /* Deals with error interrupts */
466 void
467 t4_intr_err(void *arg)
468 {
469 	struct adapter *sc = arg;
470 
471 	if (sc->intr_type == 1)
472 		t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
473 
474 	t4_slow_intr_handler(sc);
475 }
476 
477 /* Deals with the firmware event queue */
478 void
479 t4_intr_evt(void *arg)
480 {
481 	struct sge_iq *iq = arg;
482 	struct adapter *sc = iq->adapter;
483 	struct rsp_ctrl *ctrl;
484 	const struct rss_header *rss;
485 	int ndesc_pending = 0, ndesc_total = 0;
486 
487 	KASSERT(iq == &sc->sge.fwq, ("%s: unexpected ingress queue", __func__));
488 
489 	while (is_new_response(iq, &ctrl)) {
490 
491 		rmb();
492 
493 		rss = (const void *)iq->cdesc;
494 
495 		/* Should only get CPL on this queue */
496 		KASSERT(G_RSPD_TYPE(ctrl->u.type_gen) == X_RSPD_TYPE_CPL,
497 		    ("%s: unexpected type %d", __func__,
498 		    G_RSPD_TYPE(ctrl->u.type_gen)));
499 
500 		switch (rss->opcode) {
501 		case CPL_FW4_MSG:
502 		case CPL_FW6_MSG: {
503 			const struct cpl_fw6_msg *cpl;
504 
505 			cpl = (const void *)(rss + 1);
506 			if (cpl->type == FW6_TYPE_CMD_RPL)
507 				t4_handle_fw_rpl(sc, cpl->data);
508 
509 			break;
510 			}
511 		case CPL_SGE_EGR_UPDATE:
512 			handle_sge_egr_update(sc, (const void *)(rss + 1));
513 			break;
514 
515 		default:
516 			device_printf(sc->dev,
517 			    "can't handle CPL opcode %d.\n", rss->opcode);
518 		}
519 
520 		ndesc_total++;
521 		if (++ndesc_pending >= iq->qsize / 4) {
522 			t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS),
523 			    V_CIDXINC(ndesc_pending) |
524 			    V_INGRESSQID(iq->cntxt_id) |
525 			    V_SEINTARM(
526 				V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
527 			ndesc_pending = 0;
528 		}
529 		iq_next(iq);
530 	}
531 
532 	if (ndesc_total > 0) {
533 		t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS),
534 		    V_CIDXINC(ndesc_pending) | V_INGRESSQID(iq->cntxt_id) |
535 		    V_SEINTARM(iq->intr_params));
536 	}
537 }
538 
539 void
540 t4_intr_data(void *arg)
541 {
542 	struct sge_rxq *rxq = arg;
543 	struct sge_iq *iq = arg;
544 	struct adapter *sc = iq->adapter;
545 	struct rsp_ctrl *ctrl;
546 	struct ifnet *ifp = rxq->ifp;
547 	struct sge_fl *fl = &rxq->fl;
548 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx], *sd_next;
549 	const struct rss_header *rss;
550 	const struct cpl_rx_pkt *cpl;
551 	uint32_t len;
552 	int ndescs = 0, i;
553 	struct mbuf *m0, *m;
554 #ifdef INET
555 	struct lro_ctrl *lro = &rxq->lro;
556 	struct lro_entry *l;
557 #endif
558 
559 	prefetch(sd->m);
560 	prefetch(sd->cl);
561 
562 	iq->intr_next = iq->intr_params;
563 	while (is_new_response(iq, &ctrl)) {
564 
565 		rmb();
566 
567 		rss = (const void *)iq->cdesc;
568 		i = G_RSPD_TYPE(ctrl->u.type_gen);
569 
570 		if (__predict_false(i == X_RSPD_TYPE_CPL)) {
571 
572 			/* Can't be anything except an egress update */
573 			KASSERT(rss->opcode == CPL_SGE_EGR_UPDATE,
574 			    ("%s: unexpected CPL %x", __func__, rss->opcode));
575 
576 			handle_sge_egr_update(sc, (const void *)(rss + 1));
577 			goto nextdesc;
578 		}
579 		KASSERT(i == X_RSPD_TYPE_FLBUF && rss->opcode == CPL_RX_PKT,
580 		    ("%s: unexpected CPL %x rsp %d", __func__, rss->opcode, i));
581 
582 		sd_next = sd + 1;
583 		if (__predict_false(fl->cidx + 1 == fl->cap))
584 			sd_next = fl->sdesc;
585 		prefetch(sd_next->m);
586 		prefetch(sd_next->cl);
587 
588 		cpl = (const void *)(rss + 1);
589 
590 		m0 = sd->m;
591 		sd->m = NULL;	/* consumed */
592 
593 		len = be32toh(ctrl->pldbuflen_qid);
594 		if (__predict_false((len & F_RSPD_NEWBUF) == 0))
595 			panic("%s: cannot handle packed frames", __func__);
596 		len = G_RSPD_LEN(len);
597 
598 		bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map,
599 		    BUS_DMASYNC_POSTREAD);
600 
601 		m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR);
602 		if (len < MINCLSIZE) {
603 			/* copy data to mbuf, buffer will be recycled */
604 			bcopy(sd->cl, mtod(m0, caddr_t), len);
605 			m0->m_len = len;
606 		} else {
607 			bus_dmamap_unload(fl->tag[sd->tag_idx], sd->map);
608 			m_cljset(m0, sd->cl, FL_BUF_TYPE(sd->tag_idx));
609 			sd->cl = NULL;	/* consumed */
610 			m0->m_len = min(len, FL_BUF_SIZE(sd->tag_idx));
611 		}
612 
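		/*
		 * The chip wrote the frame FL_PKTSHIFT bytes into the buffer
		 * (see PKTSHIFT in t4_sge_init), which leaves the IP header of
		 * a plain Ethernet frame 4-byte aligned.  Skip the padding.
		 */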
613 		len -= FL_PKTSHIFT;
614 		m0->m_len -= FL_PKTSHIFT;
615 		m0->m_data += FL_PKTSHIFT;
616 
617 		m0->m_pkthdr.len = len;
618 		m0->m_pkthdr.rcvif = ifp;
619 		m0->m_flags |= M_FLOWID;
620 		m0->m_pkthdr.flowid = rss->hash_val;
621 
622 		if (cpl->csum_calc && !cpl->err_vec &&
623 		    ifp->if_capenable & IFCAP_RXCSUM) {
624 			m0->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED |
625 			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
626 			if (cpl->ip_frag)
627 				m0->m_pkthdr.csum_data = be16toh(cpl->csum);
628 			else
629 				m0->m_pkthdr.csum_data = 0xffff;
630 			rxq->rxcsum++;
631 		}
632 
633 		if (cpl->vlan_ex) {
634 			m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
635 			m0->m_flags |= M_VLANTAG;
636 			rxq->vlan_extraction++;
637 		}
638 
639 		i = 1;	/* # of fl sdesc used */
640 		sd = sd_next;
641 		if (__predict_false(++fl->cidx == fl->cap))
642 			fl->cidx = 0;
643 
644 		len -= m0->m_len;
645 		m = m0;
646 		while (len) {
647 			i++;
648 
649 			sd_next = sd + 1;
650 			if (__predict_false(fl->cidx + 1 == fl->cap))
651 				sd_next = fl->sdesc;
652 			prefetch(sd_next->m);
653 			prefetch(sd_next->cl);
654 
655 			m->m_next = sd->m;
656 			sd->m = NULL;	/* consumed */
657 			m = m->m_next;
658 
659 			bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map,
660 			    BUS_DMASYNC_POSTREAD);
661 
662 			m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0);
663 			if (len <= MLEN) {
664 				bcopy(sd->cl, mtod(m, caddr_t), len);
665 				m->m_len = len;
666 			} else {
667 				bus_dmamap_unload(fl->tag[sd->tag_idx],
668 				    sd->map);
669 				m_cljset(m, sd->cl, FL_BUF_TYPE(sd->tag_idx));
670 				sd->cl = NULL;	/* consumed */
671 				m->m_len = min(len, FL_BUF_SIZE(sd->tag_idx));
672 			}
673 
675 			sd = sd_next;
676 			if (__predict_false(++fl->cidx == fl->cap))
677 				fl->cidx = 0;
678 
679 			len -= m->m_len;
680 		}
681 
682 #ifdef INET
683 		if (cpl->l2info & htobe32(F_RXF_LRO) &&
684 		    rxq->flags & RXQ_LRO_ENABLED &&
685 		    tcp_lro_rx(lro, m0, 0) == 0) {
686 			/* queued for LRO */
687 		} else
688 #endif
689 		ifp->if_input(ifp, m0);
690 
691 		FL_LOCK(fl);
692 		fl->needed += i;
693 		if (fl->needed >= 32)
694 			refill_fl(fl, 64);
695 		if (fl->pending >= 32)
696 			ring_fl_db(sc, fl);
697 		FL_UNLOCK(fl);
698 
699 nextdesc:	ndescs++;
700 		iq_next(iq);
701 
702 		if (ndescs > 32) {
703 			t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS),
704 			    V_CIDXINC(ndescs) |
705 			    V_INGRESSQID((u32)iq->cntxt_id) |
706 			    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
707 			ndescs = 0;
708 		}
709 	}
710 
711 #ifdef INET
712 	while (!SLIST_EMPTY(&lro->lro_active)) {
713 		l = SLIST_FIRST(&lro->lro_active);
714 		SLIST_REMOVE_HEAD(&lro->lro_active, next);
715 		tcp_lro_flush(lro, l);
716 	}
717 #endif
718 
719 	t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_CIDXINC(ndescs) |
720 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_next));
721 
722 	FL_LOCK(fl);
723 	if (fl->needed >= 32)
724 		refill_fl(fl, 128);
725 	if (fl->pending >= 8)
726 		ring_fl_db(sc, fl);
727 	FL_UNLOCK(fl);
728 }
729 
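/* A flit is 8 bytes; all the work request header sizes below are in flits. */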
730 /* Per-packet header in a coalesced tx WR, before the SGL starts (in flits) */
731 #define TXPKTS_PKT_HDR ((\
732     sizeof(struct ulp_txpkt) + \
733     sizeof(struct ulptx_idata) + \
734     sizeof(struct cpl_tx_pkt_core) \
735     ) / 8)
736 
737 /* Header of a coalesced tx WR, before SGL of first packet (in flits) */
738 #define TXPKTS_WR_HDR (\
739     sizeof(struct fw_eth_tx_pkts_wr) / 8 + \
740     TXPKTS_PKT_HDR)
741 
742 /* Header of a tx WR, before SGL of first packet (in flits) */
743 #define TXPKT_WR_HDR ((\
744     sizeof(struct fw_eth_tx_pkt_wr) + \
745     sizeof(struct cpl_tx_pkt_core) \
746     ) / 8 )
747 
748 /* Header of a tx LSO WR, before SGL of first packet (in flits) */
749 #define TXPKT_LSO_WR_HDR ((\
750     sizeof(struct fw_eth_tx_pkt_wr) + \
751     sizeof(struct cpl_tx_pkt_lso) + \
752     sizeof(struct cpl_tx_pkt_core) \
753     ) / 8 )
754 
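/*
 * The main Ethernet transmit routine.  Called with the txq lock held; sends m
 * and then whatever else is queued up on the port's buf_ring, coalescing
 * eligible frames into txpkts work requests along the way.
 */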
755 int
756 t4_eth_tx(struct ifnet *ifp, struct sge_txq *txq, struct mbuf *m)
757 {
758 	struct port_info *pi = (void *)ifp->if_softc;
759 	struct adapter *sc = pi->adapter;
760 	struct sge_eq *eq = &txq->eq;
761 	struct buf_ring *br = eq->br;
762 	struct mbuf *next;
763 	int rc, coalescing, can_reclaim;
764 	struct txpkts txpkts;
765 	struct sgl sgl;
766 
767 	TXQ_LOCK_ASSERT_OWNED(txq);
768 	KASSERT(m, ("%s: called with nothing to do.", __func__));
769 
770 	prefetch(&eq->desc[eq->pidx]);
771 	prefetch(&eq->sdesc[eq->pidx]);
772 
773 	txpkts.npkt = 0;/* indicates there's nothing in txpkts */
774 	coalescing = 0;
775 
776 	if (eq->avail < 8)
777 		reclaim_tx_descs(eq, 0, 8);
778 
779 	for (; m; m = next ? next : drbr_dequeue(ifp, br)) {
780 
781 		if (eq->avail < 8)
782 			break;
783 
784 		next = m->m_nextpkt;
785 		m->m_nextpkt = NULL;
786 
787 		if (next || buf_ring_peek(br))
788 			coalescing = 1;
789 
790 		rc = get_pkt_sgl(txq, &m, &sgl, coalescing);
791 		if (rc != 0) {
792 			if (rc == ENOMEM) {
793 
794 				/* Short of resources, suspend tx */
795 
796 				m->m_nextpkt = next;
797 				break;
798 			}
799 
800 			/*
801 			 * Unrecoverable error for this packet, throw it away
802 			 * and move on to the next.  get_pkt_sgl may already
803 			 * have freed m (it will be NULL in that case and the
804 			 * m_freem here is still safe).
805 			 */
806 
807 			m_freem(m);
808 			continue;
809 		}
810 
811 		if (coalescing &&
812 		    add_to_txpkts(pi, txq, &txpkts, m, &sgl) == 0) {
813 
814 			/* Successfully absorbed into txpkts */
815 
816 			write_ulp_cpl_sgl(pi, txq, &txpkts, m, &sgl);
817 			goto doorbell;
818 		}
819 
820 		/*
821 		 * We weren't coalescing to begin with, or current frame could
822 		 * not be coalesced (add_to_txpkts flushes txpkts if a frame
823 		 * given to it can't be coalesced).  Either way there should be
824 		 * nothing in txpkts.
825 		 */
826 		KASSERT(txpkts.npkt == 0,
827 		    ("%s: txpkts not empty: %d", __func__, txpkts.npkt));
828 
829 		/* We're sending out individual packets now */
830 		coalescing = 0;
831 
832 		if (eq->avail < 8)
833 			reclaim_tx_descs(eq, 0, 8);
834 		rc = write_txpkt_wr(pi, txq, m, &sgl);
835 		if (rc != 0) {
836 
837 			/* Short of hardware descriptors, suspend tx */
838 
839 			/*
840 			 * This is an unlikely but expensive failure.  We've
841 			 * done all the hard work (DMA mappings etc.) and now we
842 			 * can't send out the packet.  What's worse, we have to
843 			 * spend even more time freeing up everything in sgl.
844 			 */
845 			txq->no_desc++;
846 			free_pkt_sgl(txq, &sgl);
847 
848 			m->m_nextpkt = next;
849 			break;
850 		}
851 
852 		ETHER_BPF_MTAP(ifp, m);
853 		if (sgl.nsegs == 0)
854 			m_freem(m);
855 
856 doorbell:
857 		/* Fewer and fewer doorbells as the queue fills up */
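		/*
		 * (The threshold grows as roughly the square root of the
		 * number of descriptors already in use.)
		 */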
858 		if (eq->pending >= (1 << (fls(eq->qsize - eq->avail) / 2)))
859 		    ring_tx_db(sc, eq);
860 
861 		can_reclaim = reclaimable(eq);
862 		if (can_reclaim >= 32)
863 			reclaim_tx_descs(eq, can_reclaim, 32);
864 	}
865 
866 	if (txpkts.npkt > 0)
867 		write_txpkts_wr(txq, &txpkts);
868 
869 	/*
870 	 * m not NULL means there was an error but we haven't thrown it away.
871 	 * This can happen when we're short of tx descriptors (no_desc) or maybe
872 	 * even DMA maps (no_dmamap).  Either way, a credit flush and reclaim
873 	 * will get things going again.
874 	 *
875 	 * If eq->avail is already 0 we know a credit flush was requested in the
876 	 * WR that reduced it to 0 so we don't need another flush (we don't have
877 	 * any descriptor for a flush WR anyway, duh).
878 	 */
879 	if (m && eq->avail > 0)
880 		write_eqflush_wr(eq);
881 	txq->m = m;
882 
883 	if (eq->pending)
884 		ring_tx_db(sc, eq);
885 
886 	can_reclaim = reclaimable(eq);
887 	if (can_reclaim >= 32)
888 		reclaim_tx_descs(eq, can_reclaim, 128);
889 
890 	return (0);
891 }
892 
893 void
894 t4_update_fl_bufsize(struct ifnet *ifp)
895 {
896 	struct port_info *pi = ifp->if_softc;
897 	struct sge_rxq *rxq;
898 	struct sge_fl *fl;
899 	int i;
900 
901 	for_each_rxq(pi, i, rxq) {
902 		fl = &rxq->fl;
903 
904 		FL_LOCK(fl);
905 		set_fl_tag_idx(fl, ifp->if_mtu);
906 		FL_UNLOCK(fl);
907 	}
908 }
909 
910 /*
911  * A non-NULL handler indicates this iq will not receive direct interrupts;
912  * its handler will be invoked by a forwarded interrupt queue instead.
913  */
914 static inline void
915 init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
916     int qsize, int esize, iq_intr_handler_t *handler, char *name)
917 {
918 	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
919 	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
920 	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
921 	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
922 
923 	iq->flags = 0;
924 	iq->adapter = sc;
925 	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx) |
926 	    V_QINTR_CNT_EN(pktc_idx >= 0);
927 	iq->intr_pktc_idx = pktc_idx;
928 	iq->qsize = roundup(qsize, 16);		/* See FW_IQ_CMD/iqsize */
929 	iq->esize = max(esize, 16);		/* See FW_IQ_CMD/iqesize */
930 	iq->handler = handler;
931 	strlcpy(iq->lockname, name, sizeof(iq->lockname));
932 }
933 
934 static inline void
935 init_fl(struct sge_fl *fl, int qsize, char *name)
936 {
937 	fl->qsize = qsize;
938 	strlcpy(fl->lockname, name, sizeof(fl->lockname));
939 }
940 
941 static inline void
942 init_txq(struct sge_txq *txq, int qsize, char *name)
943 {
944 	txq->eq.qsize = qsize;
945 	strlcpy(txq->eq.lockname, name, sizeof(txq->eq.lockname));
946 }
947 
948 static int
949 alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
950     bus_dmamap_t *map, bus_addr_t *pa, void **va)
951 {
952 	int rc;
953 
954 	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
955 	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
956 	if (rc != 0) {
957 		device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc);
958 		goto done;
959 	}
960 
961 	rc = bus_dmamem_alloc(*tag, va,
962 	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
963 	if (rc != 0) {
964 		device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc);
965 		goto done;
966 	}
967 
968 	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
969 	if (rc != 0) {
970 		device_printf(sc->dev, "cannot load DMA map: %d\n", rc);
971 		goto done;
972 	}
973 done:
974 	if (rc)
975 		free_ring(sc, *tag, *map, *pa, *va);
976 
977 	return (rc);
978 }
979 
980 static int
981 free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
982     bus_addr_t pa, void *va)
983 {
984 	if (pa)
985 		bus_dmamap_unload(tag, map);
986 	if (va)
987 		bus_dmamem_free(tag, va, map);
988 	if (tag)
989 		bus_dma_tag_destroy(tag);
990 
991 	return (0);
992 }
993 
994 /*
995  * Allocates the ring for an ingress queue and an optional freelist.  If the
996  * freelist is specified it will be allocated and then associated with the
997  * ingress queue.
998  *
999  * Returns errno on failure.  Resources allocated up to that point may still be
1000  * allocated.  Caller is responsible for cleanup in case this function fails.
1001  *
1002  * If the ingress queue will take interrupts directly (iq->handler == NULL) then
1003  * the intr_idx specifies the vector, starting from 0.  Otherwise it specifies
1004  * the index of the queue to which its interrupts will be forwarded.
1005  */
1006 static int
1007 alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl,
1008     int intr_idx)
1009 {
1010 	int rc, i, cntxt_id;
1011 	size_t len;
1012 	struct fw_iq_cmd c;
1013 	struct adapter *sc = iq->adapter;
1014 	__be32 v = 0;
1015 
1016 	/* The adapter queues are nominally allocated in port[0]'s name */
1017 	if (pi == NULL)
1018 		pi = sc->port[0];
1019 
1020 	mtx_init(&iq->iq_lock, iq->lockname, NULL, MTX_DEF);
1021 
1022 	len = iq->qsize * iq->esize;
1023 	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
1024 	    (void **)&iq->desc);
1025 	if (rc != 0)
1026 		return (rc);
1027 
1028 	bzero(&c, sizeof(c));
1029 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
1030 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
1031 	    V_FW_IQ_CMD_VFN(0));
1032 
1033 	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
1034 	    FW_LEN16(c));
1035 
1036 	/* Special handling for firmware event queue */
1037 	if (iq == &sc->sge.fwq)
1038 		v |= F_FW_IQ_CMD_IQASYNCH;
1039 
1040 	if (iq->handler) {
1041 		KASSERT(intr_idx < NFIQ(sc),
1042 		    ("%s: invalid indirect intr_idx %d", __func__, intr_idx));
1043 		v |= F_FW_IQ_CMD_IQANDST;
1044 		v |= V_FW_IQ_CMD_IQANDSTINDEX(sc->sge.fiq[intr_idx].abs_id);
1045 	} else {
1046 		KASSERT(intr_idx < sc->intr_count,
1047 		    ("%s: invalid direct intr_idx %d", __func__, intr_idx));
1048 		v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
1049 	}
1050 
1051 	c.type_to_iqandstindex = htobe32(v |
1052 	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
1053 	    V_FW_IQ_CMD_VIID(pi->viid) |
1054 	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
1055 	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
1056 	    F_FW_IQ_CMD_IQGTSMODE |
1057 	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
1058 	    V_FW_IQ_CMD_IQESIZE(ilog2(iq->esize) - 4));
1059 	c.iqsize = htobe16(iq->qsize);
1060 	c.iqaddr = htobe64(iq->ba);
1061 
1062 	if (fl) {
1063 		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
1064 
1065 		for (i = 0; i < FL_BUF_SIZES; i++) {
1066 
1067 			/*
1068 			 * A freelist buffer must be 16 byte aligned as the SGE
1069 			 * uses the low 4 bits of the bus addr to figure out the
1070 			 * buffer size.
1071 			 */
1072 			rc = bus_dma_tag_create(sc->dmat, 16, 0,
1073 			    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
1074 			    FL_BUF_SIZE(i), 1, FL_BUF_SIZE(i), BUS_DMA_ALLOCNOW,
1075 			    NULL, NULL, &fl->tag[i]);
1076 			if (rc != 0) {
1077 				device_printf(sc->dev,
1078 				    "failed to create fl DMA tag[%d]: %d\n",
1079 				    i, rc);
1080 				return (rc);
1081 			}
1082 		}
1083 		len = fl->qsize * RX_FL_ESIZE;
1084 		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
1085 		    &fl->ba, (void **)&fl->desc);
1086 		if (rc)
1087 			return (rc);
1088 
1089 		/* Allocate space for one software descriptor per buffer. */
1090 		fl->cap = (fl->qsize - SPG_LEN / RX_FL_ESIZE) * 8;
1091 		FL_LOCK(fl);
1092 		set_fl_tag_idx(fl, pi->ifp->if_mtu);
1093 		rc = alloc_fl_sdesc(fl);
1094 		FL_UNLOCK(fl);
1095 		if (rc != 0) {
1096 			device_printf(sc->dev,
1097 			    "failed to setup fl software descriptors: %d\n",
1098 			    rc);
1099 			return (rc);
1100 		}
1101 		fl->needed = fl->cap - 1; /* one less to avoid cidx = pidx */
1102 
1103 		c.iqns_to_fl0congen =
1104 		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE));
1105 		c.fl0dcaen_to_fl0cidxfthresh =
1106 		    htobe16(V_FW_IQ_CMD_FL0FBMIN(X_FETCHBURSTMIN_64B) |
1107 			V_FW_IQ_CMD_FL0FBMAX(X_FETCHBURSTMAX_512B));
1108 		c.fl0size = htobe16(fl->qsize);
1109 		c.fl0addr = htobe64(fl->ba);
1110 	}
1111 
1112 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
1113 	if (rc != 0) {
1114 		device_printf(sc->dev,
1115 		    "failed to create ingress queue: %d\n", rc);
1116 		return (rc);
1117 	}
1118 
1119 	iq->cdesc = iq->desc;
1120 	iq->cidx = 0;
1121 	iq->gen = 1;
1122 	iq->intr_next = iq->intr_params;
1123 	iq->cntxt_id = be16toh(c.iqid);
1124 	iq->abs_id = be16toh(c.physiqid);
1125 	iq->flags |= (IQ_ALLOCATED | IQ_STARTED);
1126 
1127 	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
1128 	KASSERT(cntxt_id < sc->sge.niq,
1129 	    ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
1130 	    cntxt_id, sc->sge.niq - 1));
1131 	sc->sge.iqmap[cntxt_id] = iq;
1132 
1133 	if (fl) {
1134 		fl->cntxt_id = be16toh(c.fl0id);
1135 		fl->pidx = fl->cidx = 0;
1136 
1137 		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
1138 		KASSERT(cntxt_id < sc->sge.neq,
1139 		    ("%s: fl->cntxt_id (%d) more than the max (%d)", __func__,
1140 		    cntxt_id, sc->sge.neq - 1));
1141 		sc->sge.eqmap[cntxt_id] = (void *)fl;
1142 
1143 		FL_LOCK(fl);
1144 		refill_fl(fl, -1);
1145 		if (fl->pending >= 8)
1146 			ring_fl_db(sc, fl);
1147 		FL_UNLOCK(fl);
1148 	}
1149 
1150 	/* Enable IQ interrupts */
1151 	t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_SEINTARM(iq->intr_params) |
1152 	    V_INGRESSQID(iq->cntxt_id));
1153 
1154 	return (0);
1155 }
1156 
1157 /*
1158  * This can be called with the iq/fl in any state - fully allocated and
1159  * functional, partially allocated, even all-zeroed out.
1160  */
1161 static int
1162 free_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl)
1163 {
1164 	int i, rc;
1165 	struct adapter *sc = iq->adapter;
1166 	device_t dev;
1167 
1168 	if (sc == NULL)
1169 		return (0);	/* nothing to do */
1170 
1171 	dev = pi ? pi->dev : sc->dev;
1172 
1173 	if (iq->flags & IQ_STARTED) {
1174 		rc = -t4_iq_start_stop(sc, sc->mbox, 0, sc->pf, 0,
1175 		    iq->cntxt_id, fl ? fl->cntxt_id : 0xffff, 0xffff);
1176 		if (rc != 0) {
1177 			device_printf(dev,
1178 			    "failed to stop queue %p: %d\n", iq, rc);
1179 			return (rc);
1180 		}
1181 		iq->flags &= ~IQ_STARTED;
1182 	}
1183 
1184 	if (iq->flags & IQ_ALLOCATED) {
1185 
1186 		rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
1187 		    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
1188 		    fl ? fl->cntxt_id : 0xffff, 0xffff);
1189 		if (rc != 0) {
1190 			device_printf(dev,
1191 			    "failed to free queue %p: %d\n", iq, rc);
1192 			return (rc);
1193 		}
1194 		iq->flags &= ~IQ_ALLOCATED;
1195 	}
1196 
1197 	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
1198 
1199 	if (mtx_initialized(&iq->iq_lock))
1200 		mtx_destroy(&iq->iq_lock);
1201 
1202 	bzero(iq, sizeof(*iq));
1203 
1204 	if (fl) {
1205 		free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba,
1206 		    fl->desc);
1207 
1208 		if (fl->sdesc) {
1209 			FL_LOCK(fl);
1210 			free_fl_sdesc(fl);
1211 			FL_UNLOCK(fl);
1212 		}
1213 
1214 		if (mtx_initialized(&fl->fl_lock))
1215 			mtx_destroy(&fl->fl_lock);
1216 
1217 		for (i = 0; i < FL_BUF_SIZES; i++) {
1218 			if (fl->tag[i])
1219 				bus_dma_tag_destroy(fl->tag[i]);
1220 		}
1221 
1222 		bzero(fl, sizeof(*fl));
1223 	}
1224 
1225 	return (0);
1226 }
1227 
1228 static int
1229 alloc_iq(struct sge_iq *iq, int intr_idx)
1230 {
1231 	return alloc_iq_fl(NULL, iq, NULL, intr_idx);
1232 }
1233 
1234 static int
1235 free_iq(struct sge_iq *iq)
1236 {
1237 	return free_iq_fl(NULL, iq, NULL);
1238 }
1239 
1240 static int
1241 alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, int intr_idx, int idx)
1242 {
1243 	int rc;
1244 	struct sysctl_oid *oid;
1245 	struct sysctl_oid_list *children;
1246 	char name[16];
1247 
1248 	rc = alloc_iq_fl(pi, &rxq->iq, &rxq->fl, intr_idx);
1249 	if (rc != 0)
1250 		return (rc);
1251 
1252 #ifdef INET
1253 	rc = tcp_lro_init(&rxq->lro);
1254 	if (rc != 0)
1255 		return (rc);
1256 	rxq->lro.ifp = pi->ifp; /* also indicates LRO init'ed */
1257 
1258 	if (pi->ifp->if_capenable & IFCAP_LRO)
1259 		rxq->flags |= RXQ_LRO_ENABLED;
1260 #endif
1261 	rxq->ifp = pi->ifp;
1262 
1263 	children = SYSCTL_CHILDREN(pi->oid_rxq);
1264 
1265 	snprintf(name, sizeof(name), "%d", idx);
1266 	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
1267 	    NULL, "rx queue");
1268 	children = SYSCTL_CHILDREN(oid);
1269 
1270 #ifdef INET
1271 	SYSCTL_ADD_INT(&pi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
1272 	    &rxq->lro.lro_queued, 0, NULL);
1273 	SYSCTL_ADD_INT(&pi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
1274 	    &rxq->lro.lro_flushed, 0, NULL);
1275 #endif
1276 	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
1277 	    &rxq->rxcsum, "# of times hardware assisted with checksum");
1278 	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "vlan_extraction",
1279 	    CTLFLAG_RD, &rxq->vlan_extraction,
1280 	    "# of times hardware extracted 802.1Q tag");
1281 
1282 	return (rc);
1283 }
1284 
1285 static int
1286 free_rxq(struct port_info *pi, struct sge_rxq *rxq)
1287 {
1288 	int rc;
1289 
1290 #ifdef INET
1291 	if (rxq->lro.ifp) {
1292 		tcp_lro_free(&rxq->lro);
1293 		rxq->lro.ifp = NULL;
1294 	}
1295 #endif
1296 
1297 	rc = free_iq_fl(pi, &rxq->iq, &rxq->fl);
1298 	if (rc == 0)
1299 		bzero(rxq, sizeof(*rxq));
1300 
1301 	return (rc);
1302 }
1303 
1304 static int
1305 alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx)
1306 {
1307 	int rc, cntxt_id;
1308 	size_t len;
1309 	struct adapter *sc = pi->adapter;
1310 	struct fw_eq_eth_cmd c;
1311 	struct sge_eq *eq = &txq->eq;
1312 	char name[16];
1313 	struct sysctl_oid *oid;
1314 	struct sysctl_oid_list *children;
1315 
1316 	txq->ifp = pi->ifp;
1317 	TASK_INIT(&txq->resume_tx, 0, cxgbe_txq_start, txq);
1318 
1319 	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
1320 
1321 	len = eq->qsize * TX_EQ_ESIZE;
1322 	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
1323 	    &eq->ba, (void **)&eq->desc);
1324 	if (rc)
1325 		return (rc);
1326 
1327 	eq->cap = eq->qsize - SPG_LEN / TX_EQ_ESIZE;
1328 	eq->spg = (void *)&eq->desc[eq->cap];
1329 	eq->avail = eq->cap - 1;	/* one less to avoid cidx = pidx */
1330 	eq->sdesc = malloc(eq->cap * sizeof(struct tx_sdesc), M_CXGBE,
1331 	    M_ZERO | M_WAITOK);
1332 	eq->br = buf_ring_alloc(eq->qsize, M_CXGBE, M_WAITOK, &eq->eq_lock);
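	/*
	 * Egress update notifications for this eq will be delivered to the iq
	 * of the port's first rx queue (see CPL_SGE_EGR_UPDATE handling).
	 */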
1333 	eq->iqid = sc->sge.rxq[pi->first_rxq].iq.cntxt_id;
1334 
1335 	rc = bus_dma_tag_create(sc->dmat, 1, 0, BUS_SPACE_MAXADDR,
1336 	    BUS_SPACE_MAXADDR, NULL, NULL, 64 * 1024, TX_SGL_SEGS,
1337 	    BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &eq->tx_tag);
1338 	if (rc != 0) {
1339 		device_printf(sc->dev,
1340 		    "failed to create tx DMA tag: %d\n", rc);
1341 		return (rc);
1342 	}
1343 
1344 	rc = alloc_eq_maps(eq);
1345 	if (rc != 0) {
1346 		device_printf(sc->dev, "failed to setup tx DMA maps: %d\n", rc);
1347 		return (rc);
1348 	}
1349 
1350 	bzero(&c, sizeof(c));
1351 
1352 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
1353 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
1354 	    V_FW_EQ_ETH_CMD_VFN(0));
1355 	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
1356 	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
1357 	c.viid_pkd = htobe32(V_FW_EQ_ETH_CMD_VIID(pi->viid));
1358 	c.fetchszm_to_iqid =
1359 	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
1360 		V_FW_EQ_ETH_CMD_PCIECHN(pi->tx_chan) |
1361 		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
1362 	c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
1363 		      V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
1364 		      V_FW_EQ_ETH_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) |
1365 		      V_FW_EQ_ETH_CMD_EQSIZE(eq->qsize));
1366 	c.eqaddr = htobe64(eq->ba);
1367 
1368 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
1369 	if (rc != 0) {
1370 		device_printf(pi->dev,
1371 		    "failed to create egress queue: %d\n", rc);
1372 		return (rc);
1373 	}
1374 
1375 	eq->pidx = eq->cidx = 0;
1376 	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
1377 	eq->flags |= (EQ_ALLOCATED | EQ_STARTED);
1378 
1379 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
1380 	KASSERT(cntxt_id < sc->sge.neq,
1381 	    ("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
1382 	    cntxt_id, sc->sge.neq - 1));
1383 	sc->sge.eqmap[cntxt_id] = eq;
1384 
1385 	children = SYSCTL_CHILDREN(pi->oid_txq);
1386 
1387 	snprintf(name, sizeof(name), "%d", idx);
1388 	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
1389 	    NULL, "tx queue");
1390 	children = SYSCTL_CHILDREN(oid);
1391 
1392 	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
1393 	    &txq->txcsum, "# of times hardware assisted with checksum");
1394 	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "vlan_insertion",
1395 	    CTLFLAG_RD, &txq->vlan_insertion,
1396 	    "# of times hardware inserted 802.1Q tag");
1397 	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
1398 	    &txq->tso_wrs, "# of IPv4 TSO work requests");
1399 	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
1400 	    &txq->imm_wrs, "# of work requests with immediate data");
1401 	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
1402 	    &txq->sgl_wrs, "# of work requests with direct SGL");
1403 	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
1404 	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
1405 	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts_wrs", CTLFLAG_RD,
1406 	    &txq->txpkts_wrs, "# of txpkts work requests (multiple pkts/WR)");
1407 	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts_pkts", CTLFLAG_RD,
1408 	    &txq->txpkts_pkts, "# of frames tx'd using txpkts work requests");
1409 
1410 	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "no_dmamap", CTLFLAG_RD,
1411 	    &txq->no_dmamap, 0, "# of times txq ran out of DMA maps");
1412 	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "no_desc", CTLFLAG_RD,
1413 	    &txq->no_desc, 0, "# of times txq ran out of hardware descriptors");
1414 	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "egr_update", CTLFLAG_RD,
1415 	    &txq->egr_update, 0, "egress update notifications from the SGE");
1416 
1417 	return (rc);
1418 }
1419 
1420 static int
1421 free_txq(struct port_info *pi, struct sge_txq *txq)
1422 {
1423 	int rc;
1424 	struct adapter *sc = pi->adapter;
1425 	struct sge_eq *eq = &txq->eq;
1426 
1427 	if (eq->flags & (EQ_ALLOCATED | EQ_STARTED)) {
1428 		rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id);
1429 		if (rc != 0) {
1430 			device_printf(pi->dev,
1431 			    "failed to free egress queue %p: %d\n", eq, rc);
1432 			return (rc);
1433 		}
1434 		eq->flags &= ~(EQ_ALLOCATED | EQ_STARTED);
1435 	}
1436 
1437 	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
1438 
1439 	free(eq->sdesc, M_CXGBE);
1440 
1441 	if (eq->maps)
1442 		free_eq_maps(eq);
1443 
1444 	buf_ring_free(eq->br, M_CXGBE);
1445 
1446 	if (eq->tx_tag)
1447 		bus_dma_tag_destroy(eq->tx_tag);
1448 
1449 	if (mtx_initialized(&eq->eq_lock))
1450 		mtx_destroy(&eq->eq_lock);
1451 
1452 	bzero(txq, sizeof(*txq));
1453 	return (0);
1454 }
1455 
1456 static void
1457 oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
1458 {
1459 	bus_addr_t *ba = arg;
1460 
1461 	KASSERT(nseg == 1,
1462 	    ("%s meant for single segment mappings only.", __func__));
1463 
1464 	*ba = error ? 0 : segs->ds_addr;
1465 }
1466 
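/*
 * The rsp_ctrl at the tail of the current descriptor holds a generation bit.
 * It matches iq->gen (toggled on every wrap in iq_next) only if the hardware
 * has written a new response into that slot.
 */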
1467 static inline bool
1468 is_new_response(const struct sge_iq *iq, struct rsp_ctrl **ctrl)
1469 {
1470 	*ctrl = (void *)((uintptr_t)iq->cdesc +
1471 	    (iq->esize - sizeof(struct rsp_ctrl)));
1472 
1473 	return (((*ctrl)->u.type_gen >> S_RSPD_GEN) == iq->gen);
1474 }
1475 
1476 static inline void
1477 iq_next(struct sge_iq *iq)
1478 {
1479 	iq->cdesc = (void *) ((uintptr_t)iq->cdesc + iq->esize);
1480 	if (__predict_false(++iq->cidx == iq->qsize - 1)) {
1481 		iq->cidx = 0;
1482 		iq->gen ^= 1;
1483 		iq->cdesc = iq->desc;
1484 	}
1485 }
1486 
1487 static inline void
1488 ring_fl_db(struct adapter *sc, struct sge_fl *fl)
1489 {
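	/*
	 * Buffers are handed to the hardware in groups of 8 descriptors; any
	 * remainder stays in fl->pending until the next doorbell.
	 */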
1490 	int ndesc = fl->pending / 8;
1491 
1492 	/* Caller responsible for ensuring there's something useful to do */
1493 	KASSERT(ndesc > 0, ("%s called with no useful work to do.", __func__));
1494 
1495 	wmb();
1496 
1497 	t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), F_DBPRIO |
1498 	    V_QID(fl->cntxt_id) | V_PIDX(ndesc));
1499 
1500 	fl->pending &= 7;
1501 }
1502 
1503 static void
1504 refill_fl(struct sge_fl *fl, int nbufs)
1505 {
1506 	__be64 *d = &fl->desc[fl->pidx];
1507 	struct fl_sdesc *sd = &fl->sdesc[fl->pidx];
1508 	bus_dma_tag_t tag;
1509 	bus_addr_t pa;
1510 	caddr_t cl;
1511 	int rc;
1512 
1513 	FL_LOCK_ASSERT_OWNED(fl);
1514 
1515 	if (nbufs < 0 || nbufs > fl->needed)
1516 		nbufs = fl->needed;
1517 
1518 	while (nbufs--) {
1519 
1520 		if (sd->cl != NULL) {
1521 
1522 			/*
1523 			 * This happens when a frame small enough to fit
1524 			 * entirely in an mbuf was received in cl last time.
1525 			 * We'd held on to cl and can reuse it now.  Note that
1526 			 * we reuse a cluster of the old size if fl->tag_idx is
1527 			 * no longer the same as sd->tag_idx.
1528 			 */
1529 
1530 			KASSERT(*d == sd->ba_tag,
1531 			    ("%s: recycling problem at pidx %d",
1532 			    __func__, fl->pidx));
1533 
1534 			d++;
1535 			goto recycled;
1536 		}
1537 
1538 
1539 		if (fl->tag_idx != sd->tag_idx) {
1540 			bus_dmamap_t map;
1541 			bus_dma_tag_t newtag = fl->tag[fl->tag_idx];
1542 			bus_dma_tag_t oldtag = fl->tag[sd->tag_idx];
1543 
1544 			/*
1545 			 * An MTU change can get us here.  Discard the old map
1546 			 * which was created with the old tag, but only if
1547 			 * we're able to get a new one.
1548 			 */
1549 			rc = bus_dmamap_create(newtag, 0, &map);
1550 			if (rc == 0) {
1551 				bus_dmamap_destroy(oldtag, sd->map);
1552 				sd->map = map;
1553 				sd->tag_idx = fl->tag_idx;
1554 			}
1555 		}
1556 
1557 		tag = fl->tag[sd->tag_idx];
1558 
1559 		cl = m_cljget(NULL, M_NOWAIT, FL_BUF_SIZE(sd->tag_idx));
1560 		if (cl == NULL)
1561 			break;
1562 
1563 		rc = bus_dmamap_load(tag, sd->map, cl, FL_BUF_SIZE(sd->tag_idx),
1564 		    oneseg_dma_callback, &pa, 0);
1565 		if (rc != 0 || pa == 0) {
1566 			fl->dmamap_failed++;
1567 			uma_zfree(FL_BUF_ZONE(sd->tag_idx), cl);
1568 			break;
1569 		}
1570 
1571 		sd->cl = cl;
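		/*
		 * The buffer size index rides in the low bits of the bus
		 * address (the fl DMA tags enforce 16 byte alignment, see
		 * alloc_iq_fl).
		 */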
1572 		*d++ = htobe64(pa | sd->tag_idx);
1573 
1574 #ifdef INVARIANTS
1575 		sd->ba_tag = htobe64(pa | sd->tag_idx);
1576 #endif
1577 
1578 recycled:
1579 		/* sd->m is never recycled, should always be NULL */
1580 		KASSERT(sd->m == NULL, ("%s: stray mbuf", __func__));
1581 
1582 		sd->m = m_gethdr(M_NOWAIT, MT_NOINIT);
1583 		if (sd->m == NULL)
1584 			break;
1585 
1586 		fl->pending++;
1587 		fl->needed--;
1588 		sd++;
1589 		if (++fl->pidx == fl->cap) {
1590 			fl->pidx = 0;
1591 			sd = fl->sdesc;
1592 			d = fl->desc;
1593 		}
1594 	}
1595 }
1596 
1597 static int
1598 alloc_fl_sdesc(struct sge_fl *fl)
1599 {
1600 	struct fl_sdesc *sd;
1601 	bus_dma_tag_t tag;
1602 	int i, rc;
1603 
1604 	FL_LOCK_ASSERT_OWNED(fl);
1605 
1606 	fl->sdesc = malloc(fl->cap * sizeof(struct fl_sdesc), M_CXGBE,
1607 	    M_ZERO | M_WAITOK);
1608 
1609 	tag = fl->tag[fl->tag_idx];
1610 	sd = fl->sdesc;
1611 	for (i = 0; i < fl->cap; i++, sd++) {
1612 
1613 		sd->tag_idx = fl->tag_idx;
1614 		rc = bus_dmamap_create(tag, 0, &sd->map);
1615 		if (rc != 0)
1616 			goto failed;
1617 	}
1618 
1619 	return (0);
1620 failed:
1621 	while (--i >= 0) {
1622 		sd--;
1623 		bus_dmamap_destroy(tag, sd->map);
1624 		if (sd->m) {
1625 			m_init(sd->m, NULL, 0, M_NOWAIT, MT_DATA, 0);
1626 			m_free(sd->m);
1627 			sd->m = NULL;
1628 		}
1629 	}
1630 	KASSERT(sd == fl->sdesc, ("%s: EDOOFUS", __func__));
1631 
1632 	free(fl->sdesc, M_CXGBE);
1633 	fl->sdesc = NULL;
1634 
1635 	return (rc);
1636 }
1637 
1638 static void
1639 free_fl_sdesc(struct sge_fl *fl)
1640 {
1641 	struct fl_sdesc *sd;
1642 	int i;
1643 
1644 	FL_LOCK_ASSERT_OWNED(fl);
1645 
1646 	sd = fl->sdesc;
1647 	for (i = 0; i < fl->cap; i++, sd++) {
1648 
1649 		if (sd->m) {
1650 			m_init(sd->m, NULL, 0, M_NOWAIT, MT_DATA, 0);
1651 			m_free(sd->m);
1652 			sd->m = NULL;
1653 		}
1654 
1655 		if (sd->cl) {
1656 			bus_dmamap_unload(fl->tag[sd->tag_idx], sd->map);
1657 			uma_zfree(FL_BUF_ZONE(sd->tag_idx), sd->cl);
1658 			sd->cl = NULL;
1659 		}
1660 
1661 		bus_dmamap_destroy(fl->tag[sd->tag_idx], sd->map);
1662 	}
1663 
1664 	free(fl->sdesc, M_CXGBE);
1665 	fl->sdesc = NULL;
1666 }
1667 
1668 static int
1669 alloc_eq_maps(struct sge_eq *eq)
1670 {
1671 	struct tx_map *txm;
1672 	int i, rc, count;
1673 
1674 	/*
1675 	 * We can stuff ~10 frames in an 8-descriptor txpkts WR (8 is the SGE
1676 	 * limit for any WR).  txq->no_dmamap events shouldn't occur if maps is
1677 	 * sized for the worst case.
1678 	 */
1679 	count = eq->qsize * 10 / 8;
1680 	eq->map_total = eq->map_avail = count;
1681 	eq->map_cidx = eq->map_pidx = 0;
1682 
1683 	eq->maps = malloc(count * sizeof(struct tx_map), M_CXGBE,
1684 	    M_ZERO | M_WAITOK);
1685 
1686 	txm = eq->maps;
1687 	for (i = 0; i < count; i++, txm++) {
1688 		rc = bus_dmamap_create(eq->tx_tag, 0, &txm->map);
1689 		if (rc != 0)
1690 			goto failed;
1691 	}
1692 
1693 	return (0);
1694 failed:
1695 	while (--i >= 0) {
1696 		txm--;
1697 		bus_dmamap_destroy(eq->tx_tag, txm->map);
1698 	}
1699 	KASSERT(txm == eq->maps, ("%s: EDOOFUS", __func__));
1700 
1701 	free(eq->maps, M_CXGBE);
1702 	eq->maps = NULL;
1703 
1704 	return (rc);
1705 }
1706 
1707 static void
1708 free_eq_maps(struct sge_eq *eq)
1709 {
1710 	struct tx_map *txm;
1711 	int i;
1712 
1713 	txm = eq->maps;
1714 	for (i = 0; i < eq->map_total; i++, txm++) {
1715 
1716 		if (txm->m) {
1717 			bus_dmamap_unload(eq->tx_tag, txm->map);
1718 			m_freem(txm->m);
1719 			txm->m = NULL;
1720 		}
1721 
1722 		bus_dmamap_destroy(eq->tx_tag, txm->map);
1723 	}
1724 
1725 	free(eq->maps, M_CXGBE);
1726 	eq->maps = NULL;
1727 }
1728 
1729 /*
1730  * We'll do immediate data tx for non-TSO, but only when not coalescing.  We're
1731  * willing to use up to 2 hardware descriptors, which means a maximum of 96 bytes
1732  * of immediate data.
1733  */
1734 #define IMM_LEN ( \
1735       2 * TX_EQ_ESIZE \
1736     - sizeof(struct fw_eth_tx_pkt_wr) \
1737     - sizeof(struct cpl_tx_pkt_core))
1738 
1739 /*
1740  * Returns non-zero on failure, no need to cleanup anything in that case.
1741  *
1742  * Note 1: We always try to defrag the mbuf if required and return EFBIG only
1743  * if the resulting chain still has more segments than a tx SGL can hold.
1744  *
1745  * Note 2: We'll pullup the mbuf chain if TSO is requested and the first mbuf
1746  * does not have the TCP header in it.
1747  */
1748 static int
1749 get_pkt_sgl(struct sge_txq *txq, struct mbuf **fp, struct sgl *sgl,
1750     int sgl_only)
1751 {
1752 	struct mbuf *m = *fp;
1753 	struct sge_eq *eq = &txq->eq;
1754 	struct tx_map *txm;
1755 	int rc, defragged = 0, n;
1756 
1757 	TXQ_LOCK_ASSERT_OWNED(txq);
1758 
1759 	if (m->m_pkthdr.tso_segsz)
1760 		sgl_only = 1;	/* Do not allow immediate data with LSO */
1761 
1762 start:	sgl->nsegs = 0;
1763 
1764 	if (m->m_pkthdr.len <= IMM_LEN && !sgl_only)
1765 		return (0);	/* nsegs = 0 tells caller to use imm. tx */
1766 
1767 	if (eq->map_avail == 0) {
1768 		txq->no_dmamap++;
1769 		return (ENOMEM);
1770 	}
1771 	txm = &eq->maps[eq->map_pidx];
1772 
1773 	if (m->m_pkthdr.tso_segsz && m->m_len < 50) {
1774 		*fp = m_pullup(m, 50);
1775 		m = *fp;
1776 		if (m == NULL)
1777 			return (ENOBUFS);
1778 	}
1779 
1780 	rc = bus_dmamap_load_mbuf_sg(eq->tx_tag, txm->map, m, sgl->seg,
1781 	    &sgl->nsegs, BUS_DMA_NOWAIT);
1782 	if (rc == EFBIG && defragged == 0) {
1783 		m = m_defrag(m, M_DONTWAIT);
1784 		if (m == NULL)
1785 			return (EFBIG);
1786 
1787 		defragged = 1;
1788 		*fp = m;
1789 		goto start;
1790 	}
1791 	if (rc != 0)
1792 		return (rc);
1793 
1794 	txm->m = m;
1795 	eq->map_avail--;
1796 	if (++eq->map_pidx == eq->map_total)
1797 		eq->map_pidx = 0;
1798 
1799 	KASSERT(sgl->nsegs > 0 && sgl->nsegs <= TX_SGL_SEGS,
1800 	    ("%s: bad DMA mapping (%d segments)", __func__, sgl->nsegs));
1801 
1802 	/*
1803 	 * Store the # of flits required to hold this frame's SGL in nflits.  An
1804 	 * SGL has a (ULPTX header + len0, addr0) tuple optionally followed by
1805 	 * multiple (len1 + len2, addr1, addr2) tuples.  If addr2 is not used
1806 	 * then len2 must be set to 0.
1807 	 */
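	/*
	 * 2 flits cover the ULPTX header + len0 and addr0.  Every additional
	 * pair of segments needs 3 more flits (one shared length flit plus two
	 * address flits), and a leftover odd segment needs 2.
	 */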
1808 	n = sgl->nsegs - 1;
1809 	sgl->nflits = (3 * n) / 2 + (n & 1) + 2;
1810 
1811 	return (0);
1812 }
1813 
1814 
1815 /*
1816  * Releases all the txq resources used up in the specified sgl.
1817  */
1818 static int
1819 free_pkt_sgl(struct sge_txq *txq, struct sgl *sgl)
1820 {
1821 	struct sge_eq *eq = &txq->eq;
1822 	struct tx_map *txm;
1823 
1824 	TXQ_LOCK_ASSERT_OWNED(txq);
1825 
1826 	if (sgl->nsegs == 0)
1827 		return (0);	/* didn't use any map */
1828 
1829 	/* 1 pkt uses exactly 1 map, back it out */
1830 
1831 	eq->map_avail++;
1832 	if (eq->map_pidx > 0)
1833 		eq->map_pidx--;
1834 	else
1835 		eq->map_pidx = eq->map_total - 1;
1836 
1837 	txm = &eq->maps[eq->map_pidx];
1838 	bus_dmamap_unload(eq->tx_tag, txm->map);
1839 	txm->m = NULL;
1840 
1841 	return (0);
1842 }
1843 
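/*
 * Writes a work request for a single frame (FW_ETH_TX_PKT_WR, with an LSO CPL
 * when TSO is requested) to the descriptor ring.  Returns ENOMEM if there
 * aren't enough hardware descriptors available.
 */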
1844 static int
1845 write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, struct mbuf *m,
1846     struct sgl *sgl)
1847 {
1848 	struct sge_eq *eq = &txq->eq;
1849 	struct fw_eth_tx_pkt_wr *wr;
1850 	struct cpl_tx_pkt_core *cpl;
1851 	uint32_t ctrl;	/* used in many unrelated places */
1852 	uint64_t ctrl1;
1853 	int nflits, ndesc, pktlen;
1854 	struct tx_sdesc *txsd;
1855 	caddr_t dst;
1856 
1857 	TXQ_LOCK_ASSERT_OWNED(txq);
1858 
1859 	pktlen = m->m_pkthdr.len;
1860 
1861 	/*
1862 	 * Do we have enough flits to send this frame out?
1863 	 */
1864 	ctrl = sizeof(struct cpl_tx_pkt_core);
1865 	if (m->m_pkthdr.tso_segsz) {
1866 		nflits = TXPKT_LSO_WR_HDR;
1867 		ctrl += sizeof(struct cpl_tx_pkt_lso);
1868 	} else
1869 		nflits = TXPKT_WR_HDR;
1870 	if (sgl->nsegs > 0)
1871 		nflits += sgl->nflits;
1872 	else {
1873 		nflits += howmany(pktlen, 8);
1874 		ctrl += pktlen;
1875 	}
1876 	ndesc = howmany(nflits, 8);
1877 	if (ndesc > eq->avail)
1878 		return (ENOMEM);
1879 
1880 	/* Firmware work request header */
1881 	wr = (void *)&eq->desc[eq->pidx];
1882 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
1883 	    V_FW_WR_IMMDLEN(ctrl));
1884 	ctrl = V_FW_WR_LEN16(howmany(nflits, 2));
1885 	if (eq->avail == ndesc)
1886 		ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ;
1887 	wr->equiq_to_len16 = htobe32(ctrl);
1888 	wr->r3 = 0;
1889 
1890 	if (m->m_pkthdr.tso_segsz) {
1891 		struct cpl_tx_pkt_lso *lso = (void *)(wr + 1);
1892 		struct ether_header *eh;
1893 		struct ip *ip;
1894 		struct tcphdr *tcp;
1895 
1896 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
1897 		    F_LSO_LAST_SLICE;
1898 
1899 		eh = mtod(m, struct ether_header *);
1900 		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
1901 			ctrl |= V_LSO_ETHHDR_LEN(1);
1902 			ip = (void *)((struct ether_vlan_header *)eh + 1);
1903 		} else
1904 			ip = (void *)(eh + 1);
1905 
1906 		tcp = (void *)((uintptr_t)ip + ip->ip_hl * 4);
1907 		ctrl |= V_LSO_IPHDR_LEN(ip->ip_hl) |
1908 		    V_LSO_TCPHDR_LEN(tcp->th_off);
1909 
1910 		lso->lso_ctrl = htobe32(ctrl);
1911 		lso->ipid_ofst = htobe16(0);
1912 		lso->mss = htobe16(m->m_pkthdr.tso_segsz);
1913 		lso->seqno_offset = htobe32(0);
1914 		lso->len = htobe32(pktlen);
1915 
1916 		cpl = (void *)(lso + 1);
1917 
1918 		txq->tso_wrs++;
1919 	} else
1920 		cpl = (void *)(wr + 1);
1921 
1922 	/* Checksum offload */
1923 	ctrl1 = 0;
1924 	if (!(m->m_pkthdr.csum_flags & CSUM_IP))
1925 		ctrl1 |= F_TXPKT_IPCSUM_DIS;
1926 	if (!(m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP)))
1927 		ctrl1 |= F_TXPKT_L4CSUM_DIS;
1928 	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP))
1929 		txq->txcsum++;	/* some hardware assistance provided */
1930 
1931 	/* VLAN tag insertion */
1932 	if (m->m_flags & M_VLANTAG) {
1933 		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
1934 		txq->vlan_insertion++;
1935 	}
1936 
1937 	/* CPL header */
1938 	cpl->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
1939 	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf));
1940 	cpl->pack = 0;
1941 	cpl->len = htobe16(pktlen);
1942 	cpl->ctrl1 = htobe64(ctrl1);
1943 
1944 	/* Software descriptor */
1945 	txsd = &eq->sdesc[eq->pidx];
1946 	txsd->desc_used = ndesc;
1947 
1948 	eq->pending += ndesc;
1949 	eq->avail -= ndesc;
1950 	eq->pidx += ndesc;
1951 	if (eq->pidx >= eq->cap)
1952 		eq->pidx -= eq->cap;
1953 
1954 	/* SGL */
1955 	dst = (void *)(cpl + 1);
1956 	if (sgl->nsegs > 0) {
1957 		txsd->map_used = 1;
1958 		txq->sgl_wrs++;
1959 		write_sgl_to_txd(eq, sgl, &dst);
1960 	} else {
1961 		txsd->map_used = 0;
1962 		txq->imm_wrs++;
1963 		for (; m; m = m->m_next) {
1964 			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
1965 #ifdef INVARIANTS
1966 			pktlen -= m->m_len;
1967 #endif
1968 		}
1969 #ifdef INVARIANTS
1970 		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
1971 #endif
1972 
1973 	}
1974 
1975 	txq->txpkt_wrs++;
1976 	return (0);
1977 }
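
/*
 * Editor's illustrative sketch (hypothetical helper): the descriptor count
 * for a single-frame work request.  A hardware descriptor is 64 bytes, i.e.
 * 8 flits, so the total flit count is rounded up to a multiple of 8.
 * "hdr_flits" stands in for TXPKT_WR_HDR or TXPKT_LSO_WR_HDR, depending on
 * whether an LSO CPL precedes the cpl_tx_pkt_core.
 */
static inline int
example_txpkt_ndesc(int hdr_flits, int sgl_nflits, int pktlen)
{
	int nflits = hdr_flits;

	if (sgl_nflits > 0)
		nflits += sgl_nflits;		/* gather list follows the CPL */
	else
		nflits += (pktlen + 7) / 8;	/* payload sent as immediate data */

	return ((nflits + 7) / 8);		/* 8 flits per descriptor */
}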
1978 
1979 /*
1980  * Returns 0 to indicate that m has been accepted into a coalesced tx work
1981  * request.  It has either been folded into txpkts or txpkts was flushed and m
1982  * has started a new coalesced work request (as the first frame in a fresh
1983  * txpkts).
1984  *
1985  * Returns non-zero to indicate a failure; the caller is then responsible for
1986  * transmitting m.  If there was anything in txpkts it has been flushed.
1987  */
1988 static int
1989 add_to_txpkts(struct port_info *pi, struct sge_txq *txq, struct txpkts *txpkts,
1990     struct mbuf *m, struct sgl *sgl)
1991 {
1992 	struct sge_eq *eq = &txq->eq;
1993 	int can_coalesce;
1994 	struct tx_sdesc *txsd;
1995 	int flits;
1996 
1997 	TXQ_LOCK_ASSERT_OWNED(txq);
1998 
1999 	if (txpkts->npkt > 0) {
2000 		flits = TXPKTS_PKT_HDR + sgl->nflits;
2001 		can_coalesce = m->m_pkthdr.tso_segsz == 0 &&
2002 		    txpkts->nflits + flits <= TX_WR_FLITS &&
2003 		    txpkts->nflits + flits <= eq->avail * 8 &&
2004 		    txpkts->plen + m->m_pkthdr.len < 65536;
2005 
2006 		if (can_coalesce) {
2007 			txpkts->npkt++;
2008 			txpkts->nflits += flits;
2009 			txpkts->plen += m->m_pkthdr.len;
2010 
2011 			txsd = &eq->sdesc[eq->pidx];
2012 			txsd->map_used++;
2013 
2014 			return (0);
2015 		}
2016 
2017 		/*
2018 		 * Couldn't coalesce m into txpkts.  The first order of business
2019 		 * is to send txpkts on its way.  Then we'll revisit m.
2020 		 */
2021 		write_txpkts_wr(txq, txpkts);
2022 	}
2023 
2024 	/*
2025 	 * Check if we can start a new coalesced tx work request with m as
2026 	 * the first packet in it.
2027 	 */
2028 
2029 	KASSERT(txpkts->npkt == 0, ("%s: txpkts not empty", __func__));
2030 
2031 	flits = TXPKTS_WR_HDR + sgl->nflits;
2032 	can_coalesce = m->m_pkthdr.tso_segsz == 0 &&
2033 	    flits <= eq->avail * 8 && flits <= TX_WR_FLITS;
2034 
2035 	if (can_coalesce == 0)
2036 		return (EINVAL);
2037 
2038 	/*
2039 	 * Start a fresh coalesced tx WR with m as the first frame in it.
2040 	 */
2041 	txpkts->npkt = 1;
2042 	txpkts->nflits = flits;
2043 	txpkts->flitp = &eq->desc[eq->pidx].flit[2];
2044 	txpkts->plen = m->m_pkthdr.len;
2045 
2046 	txsd = &eq->sdesc[eq->pidx];
2047 	txsd->map_used = 1;
2048 
2049 	return (0);
2050 }
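
/*
 * Editor's illustrative sketch (hypothetical helper): the conditions under
 * which one more frame fits into an existing coalesced WR, mirroring the
 * checks in add_to_txpkts.  The 65536 limit reflects the 16-bit plen field
 * of the fw_eth_tx_pkts_wr; the factor of 8 converts available descriptors
 * to flits.
 */
static inline int
example_can_coalesce(int is_tso, int wr_nflits, int frame_nflits,
    int wr_plen, int frame_plen, int avail_desc)
{
	return (!is_tso &&
	    wr_nflits + frame_nflits <= TX_WR_FLITS &&
	    wr_nflits + frame_nflits <= avail_desc * 8 &&
	    wr_plen + frame_plen < 65536);
}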
2051 
2052 /*
2053  * Note that write_txpkts_wr can never run out of hardware descriptors (but
2054  * write_txpkt_wr can).  add_to_txpkts ensures that a frame is accepted for
2055  * coalescing only if sufficient hardware descriptors are available.
2056  */
2057 static void
2058 write_txpkts_wr(struct sge_txq *txq, struct txpkts *txpkts)
2059 {
2060 	struct sge_eq *eq = &txq->eq;
2061 	struct fw_eth_tx_pkts_wr *wr;
2062 	struct tx_sdesc *txsd;
2063 	uint32_t ctrl;
2064 	int ndesc;
2065 
2066 	TXQ_LOCK_ASSERT_OWNED(txq);
2067 
2068 	ndesc = howmany(txpkts->nflits, 8);
2069 
2070 	wr = (void *)&eq->desc[eq->pidx];
2071 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR) |
2072 	    V_FW_WR_IMMDLEN(0)); /* immdlen does not matter in this WR */
2073 	ctrl = V_FW_WR_LEN16(howmany(txpkts->nflits, 2));
2074 	if (eq->avail == ndesc)
2075 		ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ;
2076 	wr->equiq_to_len16 = htobe32(ctrl);
2077 	wr->plen = htobe16(txpkts->plen);
2078 	wr->npkt = txpkts->npkt;
2079 	wr->r3 = wr->r4 = 0;
2080 
2081 	/* Everything else already written */
2082 
2083 	txsd = &eq->sdesc[eq->pidx];
2084 	txsd->desc_used = ndesc;
2085 
2086 	KASSERT(eq->avail >= ndesc, ("%s: out of descriptors", __func__));
2087 
2088 	eq->pending += ndesc;
2089 	eq->avail -= ndesc;
2090 	eq->pidx += ndesc;
2091 	if (eq->pidx >= eq->cap)
2092 		eq->pidx -= eq->cap;
2093 
2094 	txq->txpkts_pkts += txpkts->npkt;
2095 	txq->txpkts_wrs++;
2096 	txpkts->npkt = 0;	/* emptied */
2097 }
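
/*
 * Editor's illustrative sketch (hypothetical helper): the LEN16 field of a
 * work request is expressed in 16-byte units, i.e. pairs of 8-byte flits,
 * which is why both write_txpkt_wr and write_txpkts_wr pass
 * howmany(nflits, 2) to V_FW_WR_LEN16.
 */
static inline int
example_wr_len16(int nflits)
{
	return ((nflits + 1) / 2);	/* round up to whole 16-byte units */
}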
2098 
2099 static inline void
2100 write_ulp_cpl_sgl(struct port_info *pi, struct sge_txq *txq,
2101     struct txpkts *txpkts, struct mbuf *m, struct sgl *sgl)
2102 {
2103 	struct ulp_txpkt *ulpmc;
2104 	struct ulptx_idata *ulpsc;
2105 	struct cpl_tx_pkt_core *cpl;
2106 	struct sge_eq *eq = &txq->eq;
2107 	uintptr_t flitp, start, end;
2108 	uint64_t ctrl;
2109 	caddr_t dst;
2110 
2111 	KASSERT(txpkts->npkt > 0, ("%s: txpkts is empty", __func__));
2112 
2113 	start = (uintptr_t)eq->desc;
2114 	end = (uintptr_t)eq->spg;
2115 
2116 	/* Checksum offload */
2117 	ctrl = 0;
2118 	if (!(m->m_pkthdr.csum_flags & CSUM_IP))
2119 		ctrl |= F_TXPKT_IPCSUM_DIS;
2120 	if (!(m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP)))
2121 		ctrl |= F_TXPKT_L4CSUM_DIS;
2122 	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP))
2123 		txq->txcsum++;	/* some hardware assistance provided */
2124 
2125 	/* VLAN tag insertion */
2126 	if (m->m_flags & M_VLANTAG) {
2127 		ctrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
2128 		txq->vlan_insertion++;
2129 	}
2130 
2131 	/*
2132 	 * The previous packet's SGL must have ended at a 16 byte boundary (this
2133 	 * is required by the firmware/hardware).  It follows that flitp cannot
2134 	 * wrap around between the ULPTX master command and ULPTX subcommand (8
2135 	 * bytes each), and that it cannot wrap around in the middle of the
2136 	 * cpl_tx_pkt_core either.
2137 	 */
2138 	flitp = (uintptr_t)txpkts->flitp;
2139 	KASSERT((flitp & 0xf) == 0,
2140 	    ("%s: last SGL did not end at 16 byte boundary: %p",
2141 	    __func__, txpkts->flitp));
2142 
2143 	/* ULP master command */
2144 	ulpmc = (void *)flitp;
2145 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0) |
2146 	    V_ULP_TXPKT_FID(eq->iqid));
2147 	ulpmc->len = htonl(howmany(sizeof(*ulpmc) + sizeof(*ulpsc) +
2148 	    sizeof(*cpl) + 8 * sgl->nflits, 16));
2149 
2150 	/* ULP subcommand */
2151 	ulpsc = (void *)(ulpmc + 1);
2152 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD((u32)ULP_TX_SC_IMM) |
2153 	    F_ULP_TX_SC_MORE);
2154 	ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
2155 
2156 	flitp += sizeof(*ulpmc) + sizeof(*ulpsc);
2157 	if (flitp == end)
2158 		flitp = start;
2159 
2160 	/* CPL_TX_PKT */
2161 	cpl = (void *)flitp;
2162 	cpl->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
2163 	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf));
2164 	cpl->pack = 0;
2165 	cpl->len = htobe16(m->m_pkthdr.len);
2166 	cpl->ctrl1 = htobe64(ctrl);
2167 
2168 	flitp += sizeof(*cpl);
2169 	if (flitp == end)
2170 		flitp = start;
2171 
2172 	/* SGL for this frame */
2173 	dst = (caddr_t)flitp;
2174 	txpkts->nflits += write_sgl_to_txd(eq, sgl, &dst);
2175 	txpkts->flitp = (void *)dst;
2176 
2177 	KASSERT(((uintptr_t)dst & 0xf) == 0,
2178 	    ("%s: SGL ends at %p (not a 16 byte boundary)", __func__, dst));
2179 }
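
/*
 * Editor's illustrative sketch (hypothetical helper): advancing a pointer
 * through the descriptor ring and wrapping it from the status page ("end")
 * back to the ring base ("start"), as write_ulp_cpl_sgl does with flitp.
 * The 16 byte alignment guarantee means an increment never straddles the
 * boundary, so an exact-equality check suffices.
 */
static inline uintptr_t
example_advance_flitp(uintptr_t p, size_t incr, uintptr_t start,
    uintptr_t end)
{
	p += incr;
	return (p == end ? start : p);
}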
2180 
2181 /*
2182  * If the SGL ends on an address that is not 16 byte aligned, this function
2183  * appends a zero-filled pad flit and returns 1; otherwise it returns 0.
2184  */
2185 static int
2186 write_sgl_to_txd(struct sge_eq *eq, struct sgl *sgl, caddr_t *to)
2187 {
2188 	__be64 *flitp, *end;
2189 	struct ulptx_sgl *usgl;
2190 	bus_dma_segment_t *seg;
2191 	int i, padded;
2192 
2193 	KASSERT(sgl->nsegs > 0 && sgl->nflits > 0,
2194 	    ("%s: bad SGL - nsegs=%d, nflits=%d",
2195 	    __func__, sgl->nsegs, sgl->nflits));
2196 
2197 	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
2198 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
2199 
2200 	flitp = (__be64 *)(*to);
2201 	end = flitp + sgl->nflits;
2202 	seg = &sgl->seg[0];
2203 	usgl = (void *)flitp;
2204 
2205 	/*
2206 	 * We start at a 16 byte boundary somewhere inside the tx descriptor
2207 	 * ring, so we're at least 16 bytes away from the status page.  There is
2208 	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
2209 	 */
2210 
2211 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
2212 	    V_ULPTX_NSGE(sgl->nsegs));
2213 	usgl->len0 = htobe32(seg->ds_len);
2214 	usgl->addr0 = htobe64(seg->ds_addr);
2215 	seg++;
2216 
2217 	if ((uintptr_t)end <= (uintptr_t)eq->spg) {
2218 
2219 		/* Won't wrap around at all */
2220 
2221 		for (i = 0; i < sgl->nsegs - 1; i++, seg++) {
2222 			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ds_len);
2223 			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ds_addr);
2224 		}
2225 		if (i & 1)
2226 			usgl->sge[i / 2].len[1] = htobe32(0);
2227 	} else {
2228 
2229 		/* Will wrap somewhere in the rest of the SGL */
2230 
2231 		/* 2 flits already written, write the rest flit by flit */
2232 		flitp = (void *)(usgl + 1);
2233 		for (i = 0; i < sgl->nflits - 2; i++) {
2234 			if ((uintptr_t)flitp == (uintptr_t)eq->spg)
2235 				flitp = (void *)eq->desc;
2236 			*flitp++ = get_flit(seg, sgl->nsegs - 1, i);
2237 		}
2238 		end = flitp;
2239 	}
2240 
2241 	if ((uintptr_t)end & 0xf) {
2242 		*(uint64_t *)end = 0;
2243 		end++;
2244 		padded = 1;
2245 	} else
2246 		padded = 0;
2247 
2248 	if ((uintptr_t)end == (uintptr_t)eq->spg)
2249 		*to = (void *)eq->desc;
2250 	else
2251 		*to = (void *)end;
2252 
2253 	return (padded);
2254 }
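
/*
 * Editor's illustrative sketch (hypothetical helper): an SGL that starts on
 * a 16 byte boundary ends unaligned exactly when it occupies an odd number
 * of 8-byte flits, which is when write_sgl_to_txd appends the pad flit and
 * returns 1.
 */
static inline int
example_sgl_needs_pad(int nflits)
{
	return (nflits & 1);
}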
2255 
2256 static inline void
2257 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
2258 {
2259 	if ((uintptr_t)(*to) + len <= (uintptr_t)eq->spg) {
2260 		bcopy(from, *to, len);
2261 		(*to) += len;
2262 	} else {
2263 		int portion = (uintptr_t)eq->spg - (uintptr_t)(*to);
2264 
2265 		bcopy(from, *to, portion);
2266 		from += portion;
2267 		portion = len - portion;	/* remaining */
2268 		bcopy(from, (void *)eq->desc, portion);
2269 		(*to) = (caddr_t)eq->desc + portion;
2270 	}
2271 }
2272 
2273 static inline void
2274 ring_tx_db(struct adapter *sc, struct sge_eq *eq)
2275 {
2276 	wmb();
2277 	t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL),
2278 	    V_QID(eq->cntxt_id) | V_PIDX(eq->pending));
2279 	eq->pending = 0;
2280 }
2281 
2282 static inline int
2283 reclaimable(struct sge_eq *eq)
2284 {
2285 	unsigned int cidx;
2286 
2287 	cidx = eq->spg->cidx;	/* stable snapshot */
2288 	cidx = be16_to_cpu(cidx);
2289 
2290 	if (cidx >= eq->cidx)
2291 		return (cidx - eq->cidx);
2292 	else
2293 		return (cidx + eq->cap - eq->cidx);
2294 }
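
/*
 * Editor's illustrative sketch (hypothetical helper): the distance from the
 * driver's consumer index to the hardware's consumer index on a ring of
 * "cap" entries, which is the quantity reclaimable() computes from the
 * status page snapshot.
 */
static inline unsigned int
example_ring_distance(unsigned int hw_cidx, unsigned int sw_cidx,
    unsigned int cap)
{
	return (hw_cidx >= sw_cidx ? hw_cidx - sw_cidx :
	    hw_cidx + cap - sw_cidx);
}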
2295 
2296 /*
2297  * There are "can_reclaim" tx descriptors ready to be reclaimed.  Reclaim as
2298  * many as possible but stop when there are around "n" mbufs to free.
2299  *
2300  * The actual number reclaimed is provided as the return value.
2301  */
2302 static int
2303 reclaim_tx_descs(struct sge_eq *eq, int can_reclaim, int n)
2304 {
2305 	struct tx_sdesc *txsd;
2306 	struct tx_map *txm;
2307 	unsigned int reclaimed, maps;
2308 
2309 	EQ_LOCK_ASSERT_OWNED(eq);
2310 
2311 	if (can_reclaim == 0)
2312 		can_reclaim = reclaimable(eq);
2313 
2314 	maps = reclaimed = 0;
2315 	while (can_reclaim && maps < n) {
2316 		int ndesc;
2317 
2318 		txsd = &eq->sdesc[eq->cidx];
2319 		ndesc = txsd->desc_used;
2320 
2321 		/* Firmware doesn't return "partial" credits. */
2322 		KASSERT(can_reclaim >= ndesc,
2323 		    ("%s: unexpected number of credits: %d, %d",
2324 		    __func__, can_reclaim, ndesc));
2325 
2326 		maps += txsd->map_used;
2327 
2328 		reclaimed += ndesc;
2329 		can_reclaim -= ndesc;
2330 
2331 		eq->cidx += ndesc;
2332 		if (__predict_false(eq->cidx >= eq->cap))
2333 			eq->cidx -= eq->cap;
2334 	}
2335 
2336 	txm = &eq->maps[eq->map_cidx];
2337 	if (maps)
2338 		prefetch(txm->m);
2339 
2340 	eq->avail += reclaimed;
2341 	KASSERT(eq->avail < eq->cap,	/* avail tops out at (cap - 1) */
2342 	    ("%s: too many descriptors available", __func__));
2343 
2344 	eq->map_avail += maps;
2345 	KASSERT(eq->map_avail <= eq->map_total,
2346 	    ("%s: too many maps available", __func__));
2347 
2348 	while (maps--) {
2349 		struct tx_map *next;
2350 
2351 		next = txm + 1;
2352 		if (__predict_false(eq->map_cidx + 1 == eq->map_total))
2353 			next = eq->maps;
2354 		prefetch(next->m);
2355 
2356 		bus_dmamap_unload(eq->tx_tag, txm->map);
2357 		m_freem(txm->m);
2358 		txm->m = NULL;
2359 
2360 		txm = next;
2361 		if (__predict_false(++eq->map_cidx == eq->map_total))
2362 			eq->map_cidx = 0;
2363 	}
2364 
2365 	return (reclaimed);
2366 }
2367 
2368 static void
2369 write_eqflush_wr(struct sge_eq *eq)
2370 {
2371 	struct fw_eq_flush_wr *wr;
2372 	struct tx_sdesc *txsd;
2373 
2374 	EQ_LOCK_ASSERT_OWNED(eq);
2375 	KASSERT(eq->avail > 0, ("%s: no descriptors left.", __func__));
2376 
2377 	wr = (void *)&eq->desc[eq->pidx];
2378 	bzero(wr, sizeof(*wr));
2379 	wr->opcode = FW_EQ_FLUSH_WR;
2380 	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(sizeof(*wr) / 16) |
2381 	    F_FW_WR_EQUEQ | F_FW_WR_EQUIQ);
2382 
2383 	txsd = &eq->sdesc[eq->pidx];
2384 	txsd->desc_used = 1;
2385 	txsd->map_used = 0;
2386 
2387 	eq->pending++;
2388 	eq->avail--;
2389 	if (++eq->pidx == eq->cap)
2390 		eq->pidx = 0;
2391 }
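
/*
 * Editor's illustrative sketch (hypothetical helper): advancing a producer
 * index by ndesc slots on a ring of "cap" entries, the pattern used by
 * write_txpkt_wr, write_txpkts_wr and write_eqflush_wr above (ndesc never
 * exceeds cap, so a single subtraction is enough).
 */
static inline unsigned int
example_ring_advance(unsigned int pidx, unsigned int ndesc,
    unsigned int cap)
{
	pidx += ndesc;
	return (pidx >= cap ? pidx - cap : pidx);
}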
2392 
2393 static __be64
2394 get_flit(bus_dma_segment_t *sgl, int nsegs, int idx)
2395 {
2396 	int i = (idx / 3) * 2;
2397 
2398 	switch (idx % 3) {
2399 	case 0: {
2400 		__be64 rc;
2401 
2402 		rc = htobe32(sgl[i].ds_len);
2403 		if (i + 1 < nsegs)
2404 			rc |= (uint64_t)htobe32(sgl[i + 1].ds_len) << 32;
2405 
2406 		return (rc);
2407 	}
2408 	case 1:
2409 		return htobe64(sgl[i].ds_addr);
2410 	case 2:
2411 		return htobe64(sgl[i + 1].ds_addr);
2412 	}
2413 
2414 	return (0);
2415 }
2416 
2417 static void
2418 set_fl_tag_idx(struct sge_fl *fl, int mtu)
2419 {
2420 	int i;
2421 
2422 	FL_LOCK_ASSERT_OWNED(fl);
2423 
2424 	for (i = 0; i < FL_BUF_SIZES - 1; i++) {
2425 		if (FL_BUF_SIZE(i) >= (mtu + FL_PKTSHIFT))
2426 			break;
2427 	}
2428 
2429 	fl->tag_idx = i;
2430 }
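
/*
 * Editor's illustrative sketch (hypothetical helper): choosing the smallest
 * free-list buffer class whose size covers the MTU plus the packet-shift
 * padding, falling back to the largest class, exactly as set_fl_tag_idx
 * does with the fl_buf_info table.
 */
static inline int
example_pick_buf_class(const int *sizes, int nsizes, int mtu)
{
	int i;

	for (i = 0; i < nsizes - 1; i++) {
		if (sizes[i] >= mtu + FL_PKTSHIFT)
			break;
	}
	return (i);
}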
2431 
2432 static int
2433 handle_sge_egr_update(struct adapter *sc, const struct cpl_sge_egr_update *cpl)
2434 {
2435 	unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid));
2436 	struct sge *s = &sc->sge;
2437 	struct sge_txq *txq;
2438 	struct port_info *pi;
2439 
2440 	txq = (void *)s->eqmap[qid - s->eq_start];
2441 	pi = txq->ifp->if_softc;
2442 	taskqueue_enqueue(pi->tq, &txq->resume_tx);
2443 	txq->egr_update++;
2444 
2445 	return (0);
2446 }
2447