xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 7790c8c1)
1 /**************************************************************************
2 SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 
4 Copyright (c) 2007-2009, Chelsio Inc.
5 All rights reserved.
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12 
13  2. Neither the name of the Chelsio Corporation nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16 
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 POSSIBILITY OF SUCH DAMAGE.
28 
29 ***************************************************************************/
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_inet6.h"
35 #include "opt_inet.h"
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/module.h>
41 #include <sys/bus.h>
42 #include <sys/conf.h>
43 #include <machine/bus.h>
44 #include <machine/resource.h>
45 #include <sys/rman.h>
46 #include <sys/queue.h>
47 #include <sys/sysctl.h>
48 #include <sys/taskqueue.h>
49 
50 #include <sys/proc.h>
51 #include <sys/sbuf.h>
52 #include <sys/sched.h>
53 #include <sys/smp.h>
54 #include <sys/systm.h>
55 #include <sys/syslog.h>
56 #include <sys/socket.h>
57 #include <sys/sglist.h>
58 
59 #include <net/if.h>
60 #include <net/if_var.h>
61 #include <net/bpf.h>
62 #include <net/ethernet.h>
63 #include <net/if_vlan_var.h>
64 
65 #include <netinet/in_systm.h>
66 #include <netinet/in.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip6.h>
69 #include <netinet/tcp.h>
70 
71 #include <dev/pci/pcireg.h>
72 #include <dev/pci/pcivar.h>
73 
74 #include <vm/vm.h>
75 #include <vm/pmap.h>
76 
77 #include <cxgb_include.h>
78 #include <sys/mvec.h>
79 
80 int	txq_fills = 0;
81 int	multiq_tx_enable = 1;
82 
83 #ifdef TCP_OFFLOAD
84 CTASSERT(NUM_CPL_HANDLERS >= NUM_CPL_CMDS);
85 #endif
86 
87 extern struct sysctl_oid_list sysctl__hw_cxgb_children;
88 int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
89 SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
90     "size of per-queue mbuf ring");
91 
92 static int cxgb_tx_coalesce_force = 0;
93 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RWTUN,
94     &cxgb_tx_coalesce_force, 0,
95     "coalesce small packets into a single work request regardless of ring state");
96 
97 #define	COALESCE_START_DEFAULT		(TX_ETH_Q_SIZE >> 1)
98 #define	COALESCE_START_MAX		(TX_ETH_Q_SIZE - (TX_ETH_Q_SIZE >> 3))
99 #define	COALESCE_STOP_DEFAULT		(TX_ETH_Q_SIZE >> 2)
100 #define	COALESCE_STOP_MIN		(TX_ETH_Q_SIZE >> 5)
101 #define	TX_RECLAIM_DEFAULT		(TX_ETH_Q_SIZE >> 5)
102 #define	TX_RECLAIM_MAX			(TX_ETH_Q_SIZE >> 2)
103 #define	TX_RECLAIM_MIN			(TX_ETH_Q_SIZE >> 6)
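
/*
 * For illustration, assuming TX_ETH_Q_SIZE is 1024 descriptors, the
 * thresholds above work out to: start coalescing once 512 descriptors are
 * in use (clamped to at most 896), stop once usage falls back to 256 or
 * fewer (clamped to at least 32), and reclaim completed descriptors 32 at
 * a time by default (clamped between 16 and 256).
 */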
104 
105 
106 static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
107 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RWTUN,
108     &cxgb_tx_coalesce_enable_start, 0,
109     "coalesce enable threshold");
110 static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
111 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RWTUN,
112     &cxgb_tx_coalesce_enable_stop, 0,
113     "coalesce disable threshold");
114 static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
115 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RWTUN,
116     &cxgb_tx_reclaim_threshold, 0,
117     "tx cleaning minimum threshold");
118 
119 /*
120  * XXX don't re-enable this until TOE stops assuming
121  * we have an m_ext
122  */
123 static int recycle_enable = 0;
124 
125 extern int cxgb_use_16k_clusters;
126 extern int nmbjumbop;
127 extern int nmbjumbo9;
128 extern int nmbjumbo16;
129 
130 #define USE_GTS 0
131 
132 #define SGE_RX_SM_BUF_SIZE	1536
133 #define SGE_RX_DROP_THRES	16
134 #define SGE_RX_COPY_THRES	128
135 
136 /*
137  * Period of the Tx buffer reclaim timer.  This timer does not need to run
138  * frequently as Tx buffers are usually reclaimed by new Tx packets.
139  */
140 #define TX_RECLAIM_PERIOD       (hz >> 1)
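/* With hz ticks per second, (hz >> 1) amounts to roughly half a second. */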
141 
142 /*
143  * Values for sge_txq.flags
144  */
145 enum {
146 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
147 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
148 };
149 
150 struct tx_desc {
151 	uint64_t	flit[TX_DESC_FLITS];
152 } __packed;
153 
154 struct rx_desc {
155 	uint32_t	addr_lo;
156 	uint32_t	len_gen;
157 	uint32_t	gen2;
158 	uint32_t	addr_hi;
159 } __packed;
160 
161 struct rsp_desc {               /* response queue descriptor */
162 	struct rss_header	rss_hdr;
163 	uint32_t		flags;
164 	uint32_t		len_cq;
165 	uint8_t			imm_data[47];
166 	uint8_t			intr_gen;
167 } __packed;
168 
169 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
170 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
171 #define RX_SW_DESC_INUSE        (1 << 3)
172 #define TX_SW_DESC_MAPPED       (1 << 4)
173 
174 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
175 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
176 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
177 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
178 
179 struct tx_sw_desc {                /* SW state per Tx descriptor */
180 	struct mbuf	*m;
181 	bus_dmamap_t	map;
182 	int		flags;
183 };
184 
185 struct rx_sw_desc {                /* SW state per Rx descriptor */
186 	caddr_t		rxsd_cl;
187 	struct mbuf	*m;
188 	bus_dmamap_t	map;
189 	int		flags;
190 };
191 
192 struct txq_state {
193 	unsigned int	compl;
194 	unsigned int	gen;
195 	unsigned int	pidx;
196 };
197 
198 struct refill_fl_cb_arg {
199 	int               error;
200 	bus_dma_segment_t seg;
201 	int               nseg;
202 };
203 
204 
205 /*
206  * Maps a number of flits to the number of Tx descriptors that can hold them.
207  * The formula is
208  *
209  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
210  *
211  * HW allows up to 4 descriptors to be combined into a WR.
212  */
213 static uint8_t flit_desc_map[] = {
214 	0,
215 #if SGE_NUM_GENBITS == 1
216 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
217 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
218 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
219 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
220 #elif SGE_NUM_GENBITS == 2
221 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
222 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
223 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
224 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
225 #else
226 # error "SGE_NUM_GENBITS must be 1 or 2"
227 #endif
228 };
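
/*
 * Reading the SGE_NUM_GENBITS == 2 table: flit counts 1-15 map to one
 * descriptor, 16-29 to two, 30-43 to three and 44-57 to four.  That is
 * consistent with the formula above for a WR_FLITS of 15: the first
 * descriptor carries up to 15 flits and each continuation descriptor 14
 * more, since one flit goes to the chained WR header (see
 * write_wr_hdr_sgl()) and the descriptor's last flit holds the generation
 * word when two generation bits are in use (see wr_gen2()).
 */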
229 
230 #define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
231 #define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
232 #define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
233 #define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
234 #define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
235 #define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
236 	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
237 #define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
238 #define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
239 	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
240 #define	TXQ_RING_DEQUEUE(qs) \
241 	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
242 
243 int cxgb_debug = 0;
244 
245 static void sge_timer_cb(void *arg);
246 static void sge_timer_reclaim(void *arg, int ncount);
247 static void sge_txq_reclaim_handler(void *arg, int ncount);
248 static void cxgb_start_locked(struct sge_qset *qs);
249 
250 /*
251  * XXX need to cope with bursty scheduling by looking at a wider
252  * window than we do now when determining the need for coalescing
253  *
254  */
255 static __inline uint64_t
256 check_pkt_coalesce(struct sge_qset *qs)
257 {
258         struct adapter *sc;
259         struct sge_txq *txq;
260 	uint8_t *fill;
261 
262 	if (__predict_false(cxgb_tx_coalesce_force))
263 		return (1);
264 	txq = &qs->txq[TXQ_ETH];
265         sc = qs->port->adapter;
266 	fill = &sc->tunq_fill[qs->idx];
267 
268 	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
269 		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
270 	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
271 		cxgb_tx_coalesce_enable_stop = COALESCE_STOP_MIN;
272 	/*
273 	 * if the hardware transmit queue is more than 1/8 full
274 	 * we mark it as coalescing - we drop back from coalescing
275 	 * when we go below 1/32 full and there are no packets enqueued,
276 	 * this provides us with some degree of hysteresis
277 	 */
278         if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
279 	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
280                 *fill = 0;
281         else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
282                 *fill = 1;
283 
284 	return (sc->tunq_coalesce);
285 }
286 
287 #ifdef __LP64__
288 static void
289 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
290 {
291 	uint64_t wr_hilo;
292 #if _BYTE_ORDER == _LITTLE_ENDIAN
293 	wr_hilo = wr_hi;
294 	wr_hilo |= (((uint64_t)wr_lo)<<32);
295 #else
296 	wr_hilo = wr_lo;
297 	wr_hilo |= (((uint64_t)wr_hi)<<32);
298 #endif
299 	wrp->wrh_hilo = wr_hilo;
300 }
301 #else
302 static void
303 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
304 {
305 
306 	wrp->wrh_hi = wr_hi;
307 	wmb();
308 	wrp->wrh_lo = wr_lo;
309 }
310 #endif
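
/*
 * Both variants above exist because callers place V_WR_GEN() in the low
 * word of the header: on LP64 a single aligned 64-bit store publishes both
 * halves at once, while the 32-bit version writes wrh_hi first and issues
 * wmb() so the hardware should not see the new generation bit in wrh_lo
 * before the rest of the header has been written.
 */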
311 
312 struct coalesce_info {
313 	int count;
314 	int nbytes;
315 };
316 
317 static int
318 coalesce_check(struct mbuf *m, void *arg)
319 {
320 	struct coalesce_info *ci = arg;
321 	int *count = &ci->count;
322 	int *nbytes = &ci->nbytes;
323 
324 	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
325 		(*count < 7) && (m->m_next == NULL))) {
326 		*count += 1;
327 		*nbytes += m->m_len;
328 		return (1);
329 	}
330 	return (0);
331 }
332 
333 static struct mbuf *
334 cxgb_dequeue(struct sge_qset *qs)
335 {
336 	struct mbuf *m, *m_head, *m_tail;
337 	struct coalesce_info ci;
338 
339 
340 	if (check_pkt_coalesce(qs) == 0)
341 		return TXQ_RING_DEQUEUE(qs);
342 
343 	m_head = m_tail = NULL;
344 	ci.count = ci.nbytes = 0;
345 	do {
346 		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
347 		if (m_head == NULL) {
348 			m_tail = m_head = m;
349 		} else if (m != NULL) {
350 			m_tail->m_nextpkt = m;
351 			m_tail = m;
352 		}
353 	} while (m != NULL);
354 	if (ci.count > 7)
355 		panic("trying to coalesce %d packets into one WR", ci.count);
356 	return (m_head);
357 }
358 
359 /**
360  *	reclaim_completed_tx - reclaims completed Tx descriptors
361  *	@qs: the queue set containing the Tx queue
362  *	@queue: the Tx queue to reclaim completed descriptors from
363  *
364  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
365  *	and frees the associated buffers if possible.  Called with the Tx
366  *	queue's lock held.
367  */
368 static __inline int
369 reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
370 {
371 	struct sge_txq *q = &qs->txq[queue];
372 	int reclaim = desc_reclaimable(q);
373 
374 	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
375 	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
376 		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
377 
378 	if (reclaim < reclaim_min)
379 		return (0);
380 
381 	mtx_assert(&qs->lock, MA_OWNED);
382 	if (reclaim > 0) {
383 		t3_free_tx_desc(qs, reclaim, queue);
384 		q->cleaned += reclaim;
385 		q->in_use -= reclaim;
386 	}
387 	if (isset(&qs->txq_stopped, TXQ_ETH))
388                 clrbit(&qs->txq_stopped, TXQ_ETH);
389 
390 	return (reclaim);
391 }
392 
393 #ifdef DEBUGNET
394 int
395 cxgb_debugnet_poll_tx(struct sge_qset *qs)
396 {
397 
398 	return (reclaim_completed_tx(qs, TX_RECLAIM_MAX, TXQ_ETH));
399 }
400 #endif
401 
402 /**
403  *	should_restart_tx - are there enough resources to restart a Tx queue?
404  *	@q: the Tx queue
405  *
406  *	Checks if there are enough descriptors to restart a suspended Tx queue.
407  */
408 static __inline int
409 should_restart_tx(const struct sge_txq *q)
410 {
411 	unsigned int r = q->processed - q->cleaned;
412 
413 	return q->in_use - r < (q->size >> 1);
414 }
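
/*
 * Here r is the number of descriptors the SGE has completed but the driver
 * has not yet reclaimed, so in_use - r counts descriptors genuinely
 * outstanding; the queue is considered restartable once that drops below
 * half the ring size.
 */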
415 
416 /**
417  *	t3_sge_init - initialize SGE
418  *	@adap: the adapter
419  *	@p: the SGE parameters
420  *
421  *	Performs SGE initialization needed every time after a chip reset.
422  *	We do not initialize any of the queue sets here, instead the driver
423  *	top-level must request those individually.  We also do not enable DMA
424  *	here, that should be done after the queues have been set up.
425  */
426 void
427 t3_sge_init(adapter_t *adap, struct sge_params *p)
428 {
429 	u_int ctrl, ups;
430 
431 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
432 
433 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
434 	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
435 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
436 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
437 #if SGE_NUM_GENBITS == 1
438 	ctrl |= F_EGRGENCTRL;
439 #endif
440 	if (adap->params.rev > 0) {
441 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
442 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
443 	}
444 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
445 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
446 		     V_LORCQDRBTHRSH(512));
447 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
448 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
449 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
450 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
451 		     adap->params.rev < T3_REV_C ? 1000 : 500);
452 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
453 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
454 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
455 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
456 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
457 }
458 
459 
460 /**
461  *	sgl_len - calculates the size of an SGL of the given capacity
462  *	@n: the number of SGL entries
463  *
464  *	Calculates the number of flits needed for a scatter/gather list that
465  *	can hold the given number of entries.
466  */
467 static __inline unsigned int
468 sgl_len(unsigned int n)
469 {
470 	return ((3 * n) / 2 + (n & 1));
471 }
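
/*
 * Each pair of SGL entries packs into 3 flits (two 64-bit addresses plus
 * two 32-bit lengths, cf. make_sgl()), and a trailing odd entry takes 2
 * flits, hence (3 * n) / 2 + (n & 1): e.g. n = 1 -> 2 flits, n = 2 -> 3,
 * n = 3 -> 5, n = 4 -> 6.
 */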
472 
473 /**
474  *	get_imm_packet - return the next ingress packet buffer from a response
475  *	@resp: the response descriptor containing the packet data
476  *
477  *	Return a packet containing the immediate data of the given response.
478  */
479 static int
480 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
481 {
482 
483 	if (resp->rss_hdr.opcode == CPL_RX_DATA) {
484 		const struct cpl_rx_data *cpl = (const void *)&resp->imm_data[0];
485 		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
486 	} else if (resp->rss_hdr.opcode == CPL_RX_PKT) {
487 		const struct cpl_rx_pkt *cpl = (const void *)&resp->imm_data[0];
488 		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
489 	} else
490 		m->m_len = IMMED_PKT_SIZE;
491 	m->m_ext.ext_buf = NULL;
492 	m->m_ext.ext_type = 0;
493 	memcpy(mtod(m, uint8_t *), resp->imm_data, m->m_len);
494 	return (0);
495 }
496 
497 static __inline u_int
498 flits_to_desc(u_int n)
499 {
500 	return (flit_desc_map[n]);
501 }
502 
503 #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
504 		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
505 		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
506 		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
507 		    F_HIRCQPARITYERROR)
508 #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
509 #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
510 		      F_RSPQDISABLED)
511 
512 /**
513  *	t3_sge_err_intr_handler - SGE async event interrupt handler
514  *	@adapter: the adapter
515  *
516  *	Interrupt handler for SGE asynchronous (non-data) events.
517  */
518 void
519 t3_sge_err_intr_handler(adapter_t *adapter)
520 {
521 	unsigned int v, status;
522 
523 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
524 	if (status & SGE_PARERR)
525 		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
526 			 status & SGE_PARERR);
527 	if (status & SGE_FRAMINGERR)
528 		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
529 			 status & SGE_FRAMINGERR);
530 	if (status & F_RSPQCREDITOVERFOW)
531 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
532 
533 	if (status & F_RSPQDISABLED) {
534 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
535 
536 		CH_ALERT(adapter,
537 			 "packet delivered to disabled response queue (0x%x)\n",
538 			 (v >> S_RSPQ0DISABLED) & 0xff);
539 	}
540 
541 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
542 	if (status & SGE_FATALERR)
543 		t3_fatal_err(adapter);
544 }
545 
546 void
547 t3_sge_prep(adapter_t *adap, struct sge_params *p)
548 {
549 	int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size;
550 
551 	nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus);
552 	nqsets *= adap->params.nports;
553 
554 	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
555 
556 	while (!powerof2(fl_q_size))
557 		fl_q_size--;
558 
559 	use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters :
560 	    is_offload(adap);
561 
562 #if __FreeBSD_version >= 700111
563 	if (use_16k) {
564 		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
565 		jumbo_buf_size = MJUM16BYTES;
566 	} else {
567 		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
568 		jumbo_buf_size = MJUM9BYTES;
569 	}
570 #else
571 	jumbo_q_size = min(nmbjumbop/(3*nqsets), JUMBO_Q_SIZE);
572 	jumbo_buf_size = MJUMPAGESIZE;
573 #endif
574 	while (!powerof2(jumbo_q_size))
575 		jumbo_q_size--;
576 
577 	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
578 		device_printf(adap->dev,
579 		    "Insufficient clusters and/or jumbo buffers.\n");
580 
581 	p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data);
582 
583 	for (i = 0; i < SGE_QSETS; ++i) {
584 		struct qset_params *q = p->qset + i;
585 
586 		if (adap->params.nports > 2) {
587 			q->coalesce_usecs = 50;
588 		} else {
589 #ifdef INVARIANTS
590 			q->coalesce_usecs = 10;
591 #else
592 			q->coalesce_usecs = 5;
593 #endif
594 		}
595 		q->polling = 0;
596 		q->rspq_size = RSPQ_Q_SIZE;
597 		q->fl_size = fl_q_size;
598 		q->jumbo_size = jumbo_q_size;
599 		q->jumbo_buf_size = jumbo_buf_size;
600 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
601 		q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16;
602 		q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE;
603 		q->cong_thres = 0;
604 	}
605 }
606 
607 int
608 t3_sge_alloc(adapter_t *sc)
609 {
610 
611 	/* The parent tag. */
612 	if (bus_dma_tag_create( bus_get_dma_tag(sc->dev),/* PCI parent */
613 				1, 0,			/* algnmnt, boundary */
614 				BUS_SPACE_MAXADDR,	/* lowaddr */
615 				BUS_SPACE_MAXADDR,	/* highaddr */
616 				NULL, NULL,		/* filter, filterarg */
617 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
618 				BUS_SPACE_UNRESTRICTED, /* nsegments */
619 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
620 				0,			/* flags */
621 				NULL, NULL,		/* lock, lockarg */
622 				&sc->parent_dmat)) {
623 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
624 		return (ENOMEM);
625 	}
626 
627 	/*
628 	 * DMA tag for normal sized RX frames
629 	 */
630 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
631 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
632 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
633 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
634 		return (ENOMEM);
635 	}
636 
637 	/*
638 	 * DMA tag for jumbo sized RX frames.
639 	 */
640 	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
641 		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
642 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
643 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
644 		return (ENOMEM);
645 	}
646 
647 	/*
648 	 * DMA tag for TX frames.
649 	 */
650 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
651 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
652 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
653 		NULL, NULL, &sc->tx_dmat)) {
654 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
655 		return (ENOMEM);
656 	}
657 
658 	return (0);
659 }
660 
661 int
662 t3_sge_free(struct adapter * sc)
663 {
664 
665 	if (sc->tx_dmat != NULL)
666 		bus_dma_tag_destroy(sc->tx_dmat);
667 
668 	if (sc->rx_jumbo_dmat != NULL)
669 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
670 
671 	if (sc->rx_dmat != NULL)
672 		bus_dma_tag_destroy(sc->rx_dmat);
673 
674 	if (sc->parent_dmat != NULL)
675 		bus_dma_tag_destroy(sc->parent_dmat);
676 
677 	return (0);
678 }
679 
680 void
681 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
682 {
683 
684 	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
685 	qs->rspq.polling = 0 /* p->polling */;
686 }
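
/*
 * The factor of 10 converts microseconds to SGE timer ticks: t3_sge_init()
 * programs A_SG_TIMER_TICK to core_ticks_per_usec(adap) / 10, i.e. one tick
 * every 100ns, so a coalesce_usecs of 5 becomes a holdoff value of 50.
 */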
687 
688 #if !defined(__i386__) && !defined(__amd64__)
689 static void
690 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
691 {
692 	struct refill_fl_cb_arg *cb_arg = arg;
693 
694 	cb_arg->error = error;
695 	cb_arg->seg = segs[0];
696 	cb_arg->nseg = nseg;
697 
698 }
699 #endif
700 /**
701  *	refill_fl - refill an SGE free-buffer list
702  *	@sc: the controller softc
703  *	@q: the free-list to refill
704  *	@n: the number of new buffers to allocate
705  *
706  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
707  *	The caller must ensure that @n does not exceed the queue's capacity.
708  */
709 static void
710 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
711 {
712 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
713 	struct rx_desc *d = &q->desc[q->pidx];
714 	struct refill_fl_cb_arg cb_arg;
715 	struct mbuf *m;
716 	caddr_t cl;
717 	int err;
718 
719 	cb_arg.error = 0;
720 	while (n--) {
721 		/*
722 		 * We allocate an uninitialized mbuf + cluster; the mbuf is
723 		 * initialized after rx.
724 		 */
725 		if (q->zone == zone_pack) {
726 			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
727 				break;
728 			cl = m->m_ext.ext_buf;
729 		} else {
730 			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
731 				break;
732 			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
733 				uma_zfree(q->zone, cl);
734 				break;
735 			}
736 		}
737 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
738 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
739 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
740 				uma_zfree(q->zone, cl);
741 				goto done;
742 			}
743 			sd->flags |= RX_SW_DESC_MAP_CREATED;
744 		}
745 #if !defined(__i386__) && !defined(__amd64__)
746 		err = bus_dmamap_load(q->entry_tag, sd->map,
747 		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
748 
749 		if (err != 0 || cb_arg.error) {
750 			if (q->zone != zone_pack)
751 				uma_zfree(q->zone, cl);
752 			m_free(m);
753 			goto done;
754 		}
755 #else
756 		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
757 #endif
758 		sd->flags |= RX_SW_DESC_INUSE;
759 		sd->rxsd_cl = cl;
760 		sd->m = m;
761 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
762 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
763 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
764 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
765 
766 		d++;
767 		sd++;
768 
769 		if (++q->pidx == q->size) {
770 			q->pidx = 0;
771 			q->gen ^= 1;
772 			sd = q->sdesc;
773 			d = q->desc;
774 		}
775 		q->credits++;
776 		q->db_pending++;
777 	}
778 
779 done:
780 	if (q->db_pending >= 32) {
781 		q->db_pending = 0;
782 		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
783 	}
784 }
785 
786 
787 /**
788  *	free_rx_bufs - free the Rx buffers on an SGE free list
789  *	@sc: the controller softc
790  *	@q: the SGE free list to clean up
791  *
792  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
793  *	this queue should be stopped before calling this function.
794  */
795 static void
796 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
797 {
798 	u_int cidx = q->cidx;
799 
800 	while (q->credits--) {
801 		struct rx_sw_desc *d = &q->sdesc[cidx];
802 
803 		if (d->flags & RX_SW_DESC_INUSE) {
804 			bus_dmamap_unload(q->entry_tag, d->map);
805 			bus_dmamap_destroy(q->entry_tag, d->map);
806 			if (q->zone == zone_pack) {
807 				m_init(d->m, M_NOWAIT, MT_DATA, M_EXT);
808 				uma_zfree(zone_pack, d->m);
809 			} else {
810 				m_init(d->m, M_NOWAIT, MT_DATA, 0);
811 				uma_zfree(zone_mbuf, d->m);
812 				uma_zfree(q->zone, d->rxsd_cl);
813 			}
814 		}
815 
816 		d->rxsd_cl = NULL;
817 		d->m = NULL;
818 		if (++cidx == q->size)
819 			cidx = 0;
820 	}
821 }
822 
823 static __inline void
824 __refill_fl(adapter_t *adap, struct sge_fl *fl)
825 {
826 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
827 }
828 
829 static __inline void
830 __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
831 {
832 	uint32_t reclaimable = fl->size - fl->credits;
833 
834 	if (reclaimable > 0)
835 		refill_fl(adap, fl, min(max, reclaimable));
836 }
837 
838 /**
839  *	recycle_rx_buf - recycle a receive buffer
840  *	@adapter: the adapter
841  *	@q: the SGE free list
842  *	@idx: index of buffer to recycle
843  *
844  *	Recycles the specified buffer on the given free list by adding it at
845  *	the next available slot on the list.
846  */
847 static void
848 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
849 {
850 	struct rx_desc *from = &q->desc[idx];
851 	struct rx_desc *to   = &q->desc[q->pidx];
852 
853 	q->sdesc[q->pidx] = q->sdesc[idx];
854 	to->addr_lo = from->addr_lo;        // already big endian
855 	to->addr_hi = from->addr_hi;        // likewise
856 	wmb();	/* necessary ? */
857 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
858 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
859 	q->credits++;
860 
861 	if (++q->pidx == q->size) {
862 		q->pidx = 0;
863 		q->gen ^= 1;
864 	}
865 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
866 }
867 
868 static void
869 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
870 {
871 	uint32_t *addr;
872 
873 	addr = arg;
874 	*addr = segs[0].ds_addr;
875 }
876 
877 static int
878 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
879     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
880     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
881 {
882 	size_t len = nelem * elem_size;
883 	void *s = NULL;
884 	void *p = NULL;
885 	int err;
886 
887 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
888 				      BUS_SPACE_MAXADDR_32BIT,
889 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
890 				      len, 0, NULL, NULL, tag)) != 0) {
891 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
892 		return (ENOMEM);
893 	}
894 
895 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
896 				    map)) != 0) {
897 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
898 		return (ENOMEM);
899 	}
900 
901 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
902 	bzero(p, len);
903 	*(void **)desc = p;
904 
905 	if (sw_size) {
906 		len = nelem * sw_size;
907 		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
908 		*(void **)sdesc = s;
909 	}
910 	if (parent_entry_tag == NULL)
911 		return (0);
912 
913 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
914 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
915 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
916 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
917 		                      NULL, NULL, entry_tag)) != 0) {
918 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
919 		return (ENOMEM);
920 	}
921 	return (0);
922 }
923 
924 static void
925 sge_slow_intr_handler(void *arg, int ncount)
926 {
927 	adapter_t *sc = arg;
928 
929 	t3_slow_intr_handler(sc);
930 	t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask);
931 	(void) t3_read_reg(sc, A_PL_INT_ENABLE0);
932 }
933 
934 /**
935  *	sge_timer_cb - perform periodic maintenance of the SGE queues
936  *	@arg: the adapter whose queues are to be maintained
937  *
938  *	Runs periodically from a timer to perform maintenance of an SGE queue
939  *	set.  It performs two tasks:
940  *	set.  It performs the following tasks:
941  *	a) Cleans up any completed Tx descriptors that may still be pending.
942  *	Normal descriptor cleanup happens when new packets are added to a Tx
943  *	queue so this timer is relatively infrequent and does any cleanup only
944  *	if the Tx queue has not seen any new packets in a while.  We make a
945  *	best effort attempt to reclaim descriptors, in that we don't wait
946  *	around if we cannot get a queue's lock (which most likely is because
947  *	someone else is queueing new packets and so will also handle the clean
948  *	up).  Since control queues use immediate data exclusively we don't
949  *	bother cleaning them up here.
950  *
951  *	b) Replenishes Rx queues that have run out due to memory shortage.
952  *	Normally new Rx buffers are added when existing ones are consumed but
953  *	when out of memory a queue can become empty.  We try to add only a few
954  *	buffers here, the queue will be replenished fully as these new buffers
955  *	are used up if memory shortage has subsided.
956  *
957  *	c) Return coalesced response queue credits in case a response queue is
958  *	starved.
959  *
960  *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
961  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
962  */
963 static void
964 sge_timer_cb(void *arg)
965 {
966 	adapter_t *sc = arg;
967 	if ((sc->flags & USING_MSIX) == 0) {
968 
969 		struct port_info *pi;
970 		struct sge_qset *qs;
971 		struct sge_txq  *txq;
972 		int i, j;
973 		int reclaim_ofl, refill_rx;
974 
975 		if (sc->open_device_map == 0)
976 			return;
977 
978 		for (i = 0; i < sc->params.nports; i++) {
979 			pi = &sc->port[i];
980 			for (j = 0; j < pi->nqsets; j++) {
981 				qs = &sc->sge.qs[pi->first_qset + j];
982 				txq = &qs->txq[0];
983 				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
984 				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
985 				    (qs->fl[1].credits < qs->fl[1].size));
986 				if (reclaim_ofl || refill_rx) {
987 					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
988 					break;
989 				}
990 			}
991 		}
992 	}
993 
994 	if (sc->params.nports > 2) {
995 		int i;
996 
997 		for_each_port(sc, i) {
998 			struct port_info *pi = &sc->port[i];
999 
1000 			t3_write_reg(sc, A_SG_KDOORBELL,
1001 				     F_SELEGRCNTX |
1002 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
1003 		}
1004 	}
1005 	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
1006 	    sc->open_device_map != 0)
1007 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1008 }
1009 
1010 /*
1011  * This is meant to be a catch-all function to keep sge state private
1012  * to sge.c
1013  *
1014  */
1015 int
1016 t3_sge_init_adapter(adapter_t *sc)
1017 {
1018 	callout_init(&sc->sge_timer_ch, 1);
1019 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1020 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
1021 	return (0);
1022 }
1023 
1024 int
1025 t3_sge_reset_adapter(adapter_t *sc)
1026 {
1027 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1028 	return (0);
1029 }
1030 
1031 int
1032 t3_sge_init_port(struct port_info *pi)
1033 {
1034 	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
1035 	return (0);
1036 }
1037 
1038 /**
1039  *	refill_rspq - replenish an SGE response queue
1040  *	@adapter: the adapter
1041  *	@q: the response queue to replenish
1042  *	@credits: how many new responses to make available
1043  *
1044  *	Replenishes a response queue by making the supplied number of responses
1045  *	available to HW.
1046  */
1047 static __inline void
1048 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1049 {
1050 
1051 	/* mbufs are allocated on demand when a rspq entry is processed. */
1052 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1053 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1054 }
1055 
1056 static void
1057 sge_txq_reclaim_handler(void *arg, int ncount)
1058 {
1059 	struct sge_qset *qs = arg;
1060 	int i;
1061 
1062 	for (i = 0; i < 3; i++)
1063 		reclaim_completed_tx(qs, 16, i);
1064 }
1065 
1066 static void
1067 sge_timer_reclaim(void *arg, int ncount)
1068 {
1069 	struct port_info *pi = arg;
1070 	int i, nqsets = pi->nqsets;
1071 	adapter_t *sc = pi->adapter;
1072 	struct sge_qset *qs;
1073 	struct mtx *lock;
1074 
1075 	KASSERT((sc->flags & USING_MSIX) == 0,
1076 	    ("can't call timer reclaim for msi-x"));
1077 
1078 	for (i = 0; i < nqsets; i++) {
1079 		qs = &sc->sge.qs[pi->first_qset + i];
1080 
1081 		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1082 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1083 			    &sc->sge.qs[0].rspq.lock;
1084 
1085 		if (mtx_trylock(lock)) {
1086 			/* XXX currently assume that we are *NOT* polling */
1087 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1088 
1089 			if (qs->fl[0].credits < qs->fl[0].size - 16)
1090 				__refill_fl(sc, &qs->fl[0]);
1091 			if (qs->fl[1].credits < qs->fl[1].size - 16)
1092 				__refill_fl(sc, &qs->fl[1]);
1093 
1094 			if (status & (1 << qs->rspq.cntxt_id)) {
1095 				if (qs->rspq.credits) {
1096 					refill_rspq(sc, &qs->rspq, 1);
1097 					qs->rspq.credits--;
1098 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1099 					    1 << qs->rspq.cntxt_id);
1100 				}
1101 			}
1102 			mtx_unlock(lock);
1103 		}
1104 	}
1105 }
1106 
1107 /**
1108  *	init_qset_cntxt - initialize an SGE queue set context info
1109  *	@qs: the queue set
1110  *	@id: the queue set id
1111  *
1112  *	Initializes the TIDs and context ids for the queues of a queue set.
1113  */
1114 static void
1115 init_qset_cntxt(struct sge_qset *qs, u_int id)
1116 {
1117 
1118 	qs->rspq.cntxt_id = id;
1119 	qs->fl[0].cntxt_id = 2 * id;
1120 	qs->fl[1].cntxt_id = 2 * id + 1;
1121 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1122 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1123 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1124 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1125 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1126 
1127 	/* XXX: a sane limit is needed instead of INT_MAX */
1128 	mbufq_init(&qs->txq[TXQ_ETH].sendq, INT_MAX);
1129 	mbufq_init(&qs->txq[TXQ_OFLD].sendq, INT_MAX);
1130 	mbufq_init(&qs->txq[TXQ_CTRL].sendq, INT_MAX);
1131 }
1132 
1133 
1134 static void
1135 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1136 {
1137 	txq->in_use += ndesc;
1138 	/*
1139 	 * XXX we don't handle stopping of the queue;
1140 	 * presumably start handles this when we bump against the end.
1141 	 */
1142 	txqs->gen = txq->gen;
1143 	txq->unacked += ndesc;
1144 	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1145 	txq->unacked &= 31;
1146 	txqs->pidx = txq->pidx;
1147 	txq->pidx += ndesc;
1148 #ifdef INVARIANTS
1149 	if (((txqs->pidx > txq->cidx) &&
1150 		(txq->pidx < txqs->pidx) &&
1151 		(txq->pidx >= txq->cidx)) ||
1152 	    ((txqs->pidx < txq->cidx) &&
1153 		(txq->pidx >= txq->cidx)) ||
1154 	    ((txqs->pidx < txq->cidx) &&
1155 		(txq->cidx < txqs->pidx)))
1156 		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1157 		    txqs->pidx, txq->pidx, txq->cidx);
1158 #endif
1159 	if (txq->pidx >= txq->size) {
1160 		txq->pidx -= txq->size;
1161 		txq->gen ^= 1;
1162 	}
1163 
1164 }
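
/*
 * txq->unacked accumulates descriptors and its bit 5 (value 32) is shifted
 * into the WR_COMPL position, so roughly one work request in every 32
 * descriptors asks the SGE for an explicit completion before the counter
 * is masked back below 32.
 */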
1165 
1166 /**
1167  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1168  *	@m: the packet mbufs
1169  *      @nsegs: the number of segments
1170  *
1171  * 	Returns the number of Tx descriptors needed for the given Ethernet
1172  * 	packet.  Ethernet packets require addition of WR and CPL headers.
1173  */
1174 static __inline unsigned int
1175 calc_tx_descs(const struct mbuf *m, int nsegs)
1176 {
1177 	unsigned int flits;
1178 
1179 	if (m->m_pkthdr.len <= PIO_LEN)
1180 		return 1;
1181 
1182 	flits = sgl_len(nsegs) + 2;
1183 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1184 		flits++;
1185 
1186 	return flits_to_desc(flits);
1187 }
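
/*
 * Worked example: a TSO packet mapped to 4 DMA segments needs
 * sgl_len(4) + 2 + 1 = 9 flits, which flits_to_desc() maps to a single
 * descriptor; a packet of at most PIO_LEN bytes is sent as immediate data
 * and always takes exactly one descriptor.
 */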
1188 
1189 /**
1190  *	make_sgl - populate a scatter/gather list for a packet
1191  *	@sgp: the SGL to populate
1192  *	@segs: the packet dma segments
1193  *	@nsegs: the number of segments
1194  *
1195  *	Generates a scatter/gather list for the buffers that make up a packet
1196  *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1197  *	appropriately.
1198  */
1199 static __inline void
1200 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1201 {
1202 	int i, idx;
1203 
1204 	for (idx = 0, i = 0; i < nsegs; i++) {
1205 		/*
1206 		 * firmware doesn't like empty segments
1207 		 */
1208 		if (segs[i].ds_len == 0)
1209 			continue;
1210 		if (i && idx == 0)
1211 			++sgp;
1212 
1213 		sgp->len[idx] = htobe32(segs[i].ds_len);
1214 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1215 		idx ^= 1;
1216 	}
1217 
1218 	if (idx) {
1219 		sgp->len[idx] = 0;
1220 		sgp->addr[idx] = 0;
1221 	}
1222 }
1223 
1224 /**
1225  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1226  *	@adap: the adapter
1227  *	@q: the Tx queue
1228  *
1229  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
1230  *	where the HW goes to sleep just after we check; in that case the
1231  *	interrupt handler will detect the outstanding TX packet
1232  *	and ring the doorbell for us.
1233  *
1234  *	When GTS is disabled we unconditionally ring the doorbell.
1235  */
1236 static __inline void
1237 check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring)
1238 {
1239 #if USE_GTS
1240 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1241 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1242 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1243 #ifdef T3_TRACE
1244 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1245 			  q->cntxt_id);
1246 #endif
1247 		t3_write_reg(adap, A_SG_KDOORBELL,
1248 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1249 	}
1250 #else
1251 	if (mustring || ++q->db_pending >= 32) {
1252 		wmb();            /* write descriptors before telling HW */
1253 		t3_write_reg(adap, A_SG_KDOORBELL,
1254 		    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1255 		q->db_pending = 0;
1256 	}
1257 #endif
1258 }
1259 
1260 static __inline void
1261 wr_gen2(struct tx_desc *d, unsigned int gen)
1262 {
1263 #if SGE_NUM_GENBITS == 2
1264 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1265 #endif
1266 }
1267 
1268 /**
1269  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1270  *	@ndesc: number of Tx descriptors spanned by the SGL
1271  *	@txd: first Tx descriptor to be written
1272  *	@txqs: txq state (generation and producer index)
1273  *	@txq: the SGE Tx queue
1274  *	@sgl: the SGL
1275  *	@flits: number of flits to the start of the SGL in the first descriptor
1276  *	@sgl_flits: the SGL size in flits
1277  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1278  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1279  *
1280  *	Write a work request header and an associated SGL.  If the SGL is
1281  *	small enough to fit into one Tx descriptor it has already been written
1282  *	and we just need to write the WR header.  Otherwise we distribute the
1283  *	SGL across the number of descriptors it spans.
1284  */
1285 static void
1286 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1287     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1288     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1289 {
1290 
1291 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1292 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1293 
1294 	if (__predict_true(ndesc == 1)) {
1295 		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1296 		    V_WR_SGLSFLT(flits)) | wr_hi,
1297 		    htonl(V_WR_LEN(flits + sgl_flits) | V_WR_GEN(txqs->gen)) |
1298 		    wr_lo);
1299 
1300 		wr_gen2(txd, txqs->gen);
1301 
1302 	} else {
1303 		unsigned int ogen = txqs->gen;
1304 		const uint64_t *fp = (const uint64_t *)sgl;
1305 		struct work_request_hdr *wp = wrp;
1306 
1307 		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1308 		    V_WR_SGLSFLT(flits)) | wr_hi;
1309 
1310 		while (sgl_flits) {
1311 			unsigned int avail = WR_FLITS - flits;
1312 
1313 			if (avail > sgl_flits)
1314 				avail = sgl_flits;
1315 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1316 			sgl_flits -= avail;
1317 			ndesc--;
1318 			if (!sgl_flits)
1319 				break;
1320 
1321 			fp += avail;
1322 			txd++;
1323 			txsd++;
1324 			if (++txqs->pidx == txq->size) {
1325 				txqs->pidx = 0;
1326 				txqs->gen ^= 1;
1327 				txd = txq->desc;
1328 				txsd = txq->sdesc;
1329 			}
1330 
1331 			/*
1332 			 * when the head of the mbuf chain
1333 			 * is freed all clusters will be freed
1334 			 * with it
1335 			 */
1336 			wrp = (struct work_request_hdr *)txd;
1337 			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1338 			    V_WR_SGLSFLT(1)) | wr_hi;
1339 			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1340 				    sgl_flits + 1)) |
1341 			    V_WR_GEN(txqs->gen)) | wr_lo;
1342 			wr_gen2(txd, txqs->gen);
1343 			flits = 1;
1344 		}
1345 		wrp->wrh_hi |= htonl(F_WR_EOP);
1346 		wmb();
1347 		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1348 		wr_gen2((struct tx_desc *)wp, ogen);
1349 	}
1350 }
1351 
1352 /* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */
1353 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20)
1354 
1355 #define GET_VTAG(cntrl, m) \
1356 do { \
1357 	if ((m)->m_flags & M_VLANTAG)					            \
1358 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1359 } while (0)
1360 
1361 static int
1362 t3_encap(struct sge_qset *qs, struct mbuf **m)
1363 {
1364 	adapter_t *sc;
1365 	struct mbuf *m0;
1366 	struct sge_txq *txq;
1367 	struct txq_state txqs;
1368 	struct port_info *pi;
1369 	unsigned int ndesc, flits, cntrl, mlen;
1370 	int err, nsegs, tso_info = 0;
1371 
1372 	struct work_request_hdr *wrp;
1373 	struct tx_sw_desc *txsd;
1374 	struct sg_ent *sgp, *sgl;
1375 	uint32_t wr_hi, wr_lo, sgl_flits;
1376 	bus_dma_segment_t segs[TX_MAX_SEGS];
1377 
1378 	struct tx_desc *txd;
1379 
1380 	pi = qs->port;
1381 	sc = pi->adapter;
1382 	txq = &qs->txq[TXQ_ETH];
1383 	txd = &txq->desc[txq->pidx];
1384 	txsd = &txq->sdesc[txq->pidx];
1385 	sgl = txq->txq_sgl;
1386 
1387 	prefetch(txd);
1388 	m0 = *m;
1389 
1390 	mtx_assert(&qs->lock, MA_OWNED);
1391 	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1392 	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1393 
1394 	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1395 	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1396 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1397 
1398 	if (m0->m_nextpkt != NULL) {
1399 		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1400 		ndesc = 1;
1401 		mlen = 0;
1402 	} else {
1403 		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1404 		    &m0, segs, &nsegs))) {
1405 			if (cxgb_debug)
1406 				printf("failed ... err=%d\n", err);
1407 			return (err);
1408 		}
1409 		mlen = m0->m_pkthdr.len;
1410 		ndesc = calc_tx_descs(m0, nsegs);
1411 	}
1412 	txq_prod(txq, ndesc, &txqs);
1413 
1414 	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1415 	txsd->m = m0;
1416 
1417 	if (m0->m_nextpkt != NULL) {
1418 		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1419 		int i, fidx;
1420 
1421 		if (nsegs > 7)
1422 			panic("trying to coalesce %d packets into one WR", nsegs);
1423 		txq->txq_coalesced += nsegs;
1424 		wrp = (struct work_request_hdr *)txd;
1425 		flits = nsegs*2 + 1;
1426 
1427 		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1428 			struct cpl_tx_pkt_batch_entry *cbe;
1429 			uint64_t flit;
1430 			uint32_t *hflit = (uint32_t *)&flit;
1431 			int cflags = m0->m_pkthdr.csum_flags;
1432 
1433 			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1434 			GET_VTAG(cntrl, m0);
1435 			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1436 			if (__predict_false(!(cflags & CSUM_IP)))
1437 				cntrl |= F_TXPKT_IPCSUM_DIS;
1438 			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP |
1439 			    CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
1440 				cntrl |= F_TXPKT_L4CSUM_DIS;
1441 
1442 			hflit[0] = htonl(cntrl);
1443 			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1444 			flit |= htobe64(1 << 24);
1445 			cbe = &cpl_batch->pkt_entry[i];
1446 			cbe->cntrl = hflit[0];
1447 			cbe->len = hflit[1];
1448 			cbe->addr = htobe64(segs[i].ds_addr);
1449 		}
1450 
1451 		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1452 		    V_WR_SGLSFLT(flits)) |
1453 		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1454 		wr_lo = htonl(V_WR_LEN(flits) |
1455 		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1456 		set_wr_hdr(wrp, wr_hi, wr_lo);
1457 		wmb();
1458 		ETHER_BPF_MTAP(pi->ifp, m0);
1459 		wr_gen2(txd, txqs.gen);
1460 		check_ring_tx_db(sc, txq, 0);
1461 		return (0);
1462 	} else if (tso_info) {
1463 		uint16_t eth_type;
1464 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1465 		struct ether_header *eh;
1466 		void *l3hdr;
1467 		struct tcphdr *tcp;
1468 
1469 		txd->flit[2] = 0;
1470 		GET_VTAG(cntrl, m0);
1471 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1472 		hdr->cntrl = htonl(cntrl);
1473 		hdr->len = htonl(mlen | 0x80000000);
1474 
1475 		if (__predict_false(mlen < TCPPKTHDRSIZE)) {
1476 			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%b,flags=%#x",
1477 			    m0, mlen, m0->m_pkthdr.tso_segsz,
1478 			    (int)m0->m_pkthdr.csum_flags, CSUM_BITS, m0->m_flags);
1479 			panic("tx tso packet too small");
1480 		}
1481 
1482 		/* Make sure that ether, ip, tcp headers are all in m0 */
1483 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1484 			m0 = m_pullup(m0, TCPPKTHDRSIZE);
1485 			if (__predict_false(m0 == NULL)) {
1486 				/* XXX panic probably an overreaction */
1487 				panic("couldn't fit header into mbuf");
1488 			}
1489 		}
1490 
1491 		eh = mtod(m0, struct ether_header *);
1492 		eth_type = eh->ether_type;
1493 		if (eth_type == htons(ETHERTYPE_VLAN)) {
1494 			struct ether_vlan_header *evh = (void *)eh;
1495 
1496 			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II_VLAN);
1497 			l3hdr = evh + 1;
1498 			eth_type = evh->evl_proto;
1499 		} else {
1500 			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II);
1501 			l3hdr = eh + 1;
1502 		}
1503 
1504 		if (eth_type == htons(ETHERTYPE_IP)) {
1505 			struct ip *ip = l3hdr;
1506 
1507 			tso_info |= V_LSO_IPHDR_WORDS(ip->ip_hl);
1508 			tcp = (struct tcphdr *)(ip + 1);
1509 		} else if (eth_type == htons(ETHERTYPE_IPV6)) {
1510 			struct ip6_hdr *ip6 = l3hdr;
1511 
1512 			KASSERT(ip6->ip6_nxt == IPPROTO_TCP,
1513 			    ("%s: CSUM_TSO with ip6_nxt %d",
1514 			    __func__, ip6->ip6_nxt));
1515 
1516 			tso_info |= F_LSO_IPV6;
1517 			tso_info |= V_LSO_IPHDR_WORDS(sizeof(*ip6) >> 2);
1518 			tcp = (struct tcphdr *)(ip6 + 1);
1519 		} else
1520 			panic("%s: CSUM_TSO but neither ip nor ip6", __func__);
1521 
1522 		tso_info |= V_LSO_TCPHDR_WORDS(tcp->th_off);
1523 		hdr->lso_info = htonl(tso_info);
1524 
1525 		if (__predict_false(mlen <= PIO_LEN)) {
1526 			/*
1527 			 * Packet is not undersized but still fits in PIO_LEN;
1528 			 * this indicates a TSO bug at the higher levels.
1529 			 */
1530 			txsd->m = NULL;
1531 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1532 			flits = (mlen + 7) / 8 + 3;
1533 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1534 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1535 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1536 			wr_lo = htonl(V_WR_LEN(flits) |
1537 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1538 			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1539 			wmb();
1540 			ETHER_BPF_MTAP(pi->ifp, m0);
1541 			wr_gen2(txd, txqs.gen);
1542 			check_ring_tx_db(sc, txq, 0);
1543 			m_freem(m0);
1544 			return (0);
1545 		}
1546 		flits = 3;
1547 	} else {
1548 		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1549 
1550 		GET_VTAG(cntrl, m0);
1551 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1552 		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1553 			cntrl |= F_TXPKT_IPCSUM_DIS;
1554 		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP |
1555 		    CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
1556 			cntrl |= F_TXPKT_L4CSUM_DIS;
1557 		cpl->cntrl = htonl(cntrl);
1558 		cpl->len = htonl(mlen | 0x80000000);
1559 
1560 		if (mlen <= PIO_LEN) {
1561 			txsd->m = NULL;
1562 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1563 			flits = (mlen + 7) / 8 + 2;
1564 
1565 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1566 			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1567 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1568 			wr_lo = htonl(V_WR_LEN(flits) |
1569 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1570 			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1571 			wmb();
1572 			ETHER_BPF_MTAP(pi->ifp, m0);
1573 			wr_gen2(txd, txqs.gen);
1574 			check_ring_tx_db(sc, txq, 0);
1575 			m_freem(m0);
1576 			return (0);
1577 		}
1578 		flits = 2;
1579 	}
1580 	wrp = (struct work_request_hdr *)txd;
1581 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1582 	make_sgl(sgp, segs, nsegs);
1583 
1584 	sgl_flits = sgl_len(nsegs);
1585 
1586 	ETHER_BPF_MTAP(pi->ifp, m0);
1587 
1588 	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1589 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1590 	wr_lo = htonl(V_WR_TID(txq->token));
1591 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1592 	    sgl_flits, wr_hi, wr_lo);
1593 	check_ring_tx_db(sc, txq, 0);
1594 
1595 	return (0);
1596 }
1597 
1598 #ifdef DEBUGNET
1599 int
1600 cxgb_debugnet_encap(struct sge_qset *qs, struct mbuf **m)
1601 {
1602 	int error;
1603 
1604 	error = t3_encap(qs, m);
1605 	if (error == 0)
1606 		check_ring_tx_db(qs->port->adapter, &qs->txq[TXQ_ETH], 1);
1607 	else if (*m != NULL) {
1608 		m_freem(*m);
1609 		*m = NULL;
1610 	}
1611 	return (error);
1612 }
1613 #endif
1614 
1615 void
1616 cxgb_tx_watchdog(void *arg)
1617 {
1618 	struct sge_qset *qs = arg;
1619 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1620 
1621         if (qs->coalescing != 0 &&
1622 	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1623 	    TXQ_RING_EMPTY(qs))
1624                 qs->coalescing = 0;
1625         else if (qs->coalescing == 0 &&
1626 	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1627                 qs->coalescing = 1;
1628 	if (TXQ_TRYLOCK(qs)) {
1629 		qs->qs_flags |= QS_FLUSHING;
1630 		cxgb_start_locked(qs);
1631 		qs->qs_flags &= ~QS_FLUSHING;
1632 		TXQ_UNLOCK(qs);
1633 	}
1634 	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1635 		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1636 		    qs, txq->txq_watchdog.c_cpu);
1637 }
1638 
1639 static void
1640 cxgb_tx_timeout(void *arg)
1641 {
1642 	struct sge_qset *qs = arg;
1643 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1644 
1645 	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1646                 qs->coalescing = 1;
1647 	if (TXQ_TRYLOCK(qs)) {
1648 		qs->qs_flags |= QS_TIMEOUT;
1649 		cxgb_start_locked(qs);
1650 		qs->qs_flags &= ~QS_TIMEOUT;
1651 		TXQ_UNLOCK(qs);
1652 	}
1653 }
1654 
1655 static void
1656 cxgb_start_locked(struct sge_qset *qs)
1657 {
1658 	struct mbuf *m_head = NULL;
1659 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1660 	struct port_info *pi = qs->port;
1661 	struct ifnet *ifp = pi->ifp;
1662 
1663 	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1664 		reclaim_completed_tx(qs, 0, TXQ_ETH);
1665 
1666 	if (!pi->link_config.link_ok) {
1667 		TXQ_RING_FLUSH(qs);
1668 		return;
1669 	}
1670 	TXQ_LOCK_ASSERT(qs);
1671 	while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1672 	    pi->link_config.link_ok) {
1673 		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1674 
1675 		if (txq->size - txq->in_use <= TX_MAX_DESC)
1676 			break;
1677 
1678 		if ((m_head = cxgb_dequeue(qs)) == NULL)
1679 			break;
1680 		/*
1681 		 *  Encapsulation can modify our pointer, and/or make it
1682 		 *  NULL on failure.  In that event, we can't requeue.
1683 		 */
1684 		if (t3_encap(qs, &m_head) || m_head == NULL)
1685 			break;
1686 
1687 		m_head = NULL;
1688 	}
1689 
1690 	if (txq->db_pending)
1691 		check_ring_tx_db(pi->adapter, txq, 1);
1692 
1693 	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1694 	    pi->link_config.link_ok)
1695 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1696 		    qs, txq->txq_timer.c_cpu);
1697 	if (m_head != NULL)
1698 		m_freem(m_head);
1699 }
1700 
1701 static int
1702 cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1703 {
1704 	struct port_info *pi = qs->port;
1705 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1706 	struct buf_ring *br = txq->txq_mr;
1707 	int error, avail;
1708 
1709 	avail = txq->size - txq->in_use;
1710 	TXQ_LOCK_ASSERT(qs);
1711 
1712 	/*
1713 	 * We can only do a direct transmit if the following are true:
1714 	 * - we aren't coalescing (ring < 3/4 full)
1715 	 * - the link is up -- checked in caller
1716 	 * - there are no packets enqueued already
1717 	 * - there is space in hardware transmit queue
1718 	 */
1719 	if (check_pkt_coalesce(qs) == 0 &&
1720 	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) {
1721 		if (t3_encap(qs, &m)) {
1722 			if (m != NULL &&
1723 			    (error = drbr_enqueue(ifp, br, m)) != 0)
1724 				return (error);
1725 		} else {
1726 			if (txq->db_pending)
1727 				check_ring_tx_db(pi->adapter, txq, 1);
1728 
1729 			/*
1730 			 * We've bypassed the buf ring so we need to update
1731 			 * the stats directly
1732 			 */
1733 			txq->txq_direct_packets++;
1734 			txq->txq_direct_bytes += m->m_pkthdr.len;
1735 		}
1736 	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1737 		return (error);
1738 
1739 	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1740 	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1741 	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1742 		cxgb_start_locked(qs);
1743 	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1744 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1745 		    qs, txq->txq_timer.c_cpu);
1746 	return (0);
1747 }
1748 
1749 int
1750 cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1751 {
1752 	struct sge_qset *qs;
1753 	struct port_info *pi = ifp->if_softc;
1754 	int error, qidx = pi->first_qset;
1755 
1756 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0
1757 	    ||(!pi->link_config.link_ok)) {
1758 		m_freem(m);
1759 		return (0);
1760 	}
1761 
1762 	/* check if flowid is set */
1763 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
1764 		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1765 
1766 	qs = &pi->adapter->sge.qs[qidx];
1767 
1768 	if (TXQ_TRYLOCK(qs)) {
1769 		/* XXX running */
1770 		error = cxgb_transmit_locked(ifp, qs, m);
1771 		TXQ_UNLOCK(qs);
1772 	} else
1773 		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1774 	return (error);
1775 }
1776 
1777 void
1778 cxgb_qflush(struct ifnet *ifp)
1779 {
1780 	/*
1781 	 * Flush any enqueued mbufs in the buf_rings
1782 	 * and in the transmit queues.
1783 	 * This is a no-op for now.
1784 	 */
1785 	return;
1786 }
1787 
1788 /**
1789  *	write_imm - write a packet into a Tx descriptor as immediate data
1790  *	@d: the Tx descriptor to write
1791  *	@src: the packet data, beginning with a work request header
1792  *	@len: the length of packet data to write as immediate data
1793  *	@gen: the generation bit value to write
1794  *
1795  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1796  *	contains a work request at its beginning.  We must write the packet
1797  *	carefully so the SGE doesn't read accidentally before it's written in
1798  *	carefully so the SGE doesn't accidentally read it before it has been
1799  *	written in its entirety.
1800 static __inline void
1801 write_imm(struct tx_desc *d, caddr_t src,
1802 	  unsigned int len, unsigned int gen)
1803 {
1804 	struct work_request_hdr *from = (struct work_request_hdr *)src;
1805 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1806 	uint32_t wr_hi, wr_lo;
1807 
1808 	KASSERT(len <= WR_LEN && len >= sizeof(*from),
1809 	    ("%s: invalid len %d", __func__, len));
1810 
1811 	memcpy(&to[1], &from[1], len - sizeof(*from));
1812 	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1813 	    V_WR_BCNTLFLT(len & 7));
1814 	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) | V_WR_LEN((len + 7) / 8));
1815 	set_wr_hdr(to, wr_hi, wr_lo);
1816 	wmb();
1817 	wr_gen2(d, gen);
1818 }
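/*
 * Editorial arithmetic sketch for the encoding above (not driver code): a
 * flit is 8 bytes, so an immediate WR of, say, len = 44 bytes is advertised
 * as V_WR_LEN((44 + 7) / 8) = 6 flits, with V_WR_BCNTLFLT(44 & 7) = 4
 * marking that only 4 bytes of the last flit are valid (a value of 0
 * presumably indicates a completely full last flit).
 */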
1819 
1820 /**
1821  *	check_desc_avail - check descriptor availability on a send queue
1822  *	@adap: the adapter
1823  *	@q: the TX queue
1824  *	@m: the packet needing the descriptors
1825  *	@ndesc: the number of Tx descriptors needed
1826  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1827  *
1828  *	Checks if the requested number of Tx descriptors is available on an
1829  *	SGE send queue.  If the queue is already suspended or not enough
1830  *	descriptors are available the packet is queued for later transmission.
1831  *	Must be called with the Tx queue locked.
1832  *
1833  *	Returns 0 if enough descriptors are available, 1 if there aren't
1834  *	enough descriptors and the packet has been queued, and 2 if the caller
1835  *	needs to retry because there weren't enough descriptors at the
1836  *	beginning of the call but some freed up in the mean time.
1837  */
1838 static __inline int
1839 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1840 		 struct mbuf *m, unsigned int ndesc,
1841 		 unsigned int qid)
1842 {
1843 	/*
1844 	 * XXX We currently only use this for checking the control queue;
1845 	 * the control queue is only used for binding qsets, which happens
1846 	 * at init time, so we are guaranteed enough descriptors.
1847 	 */
1848 	if (__predict_false(mbufq_len(&q->sendq))) {
1849 addq_exit:	(void)mbufq_enqueue(&q->sendq, m);
1850 		return 1;
1851 	}
1852 	if (__predict_false(q->size - q->in_use < ndesc)) {
1853 
1854 		struct sge_qset *qs = txq_to_qset(q, qid);
1855 
1856 		setbit(&qs->txq_stopped, qid);
1857 		if (should_restart_tx(q) &&
1858 		    test_and_clear_bit(qid, &qs->txq_stopped))
1859 			return 2;
1860 
1861 		q->stops++;
1862 		goto addq_exit;
1863 	}
1864 	return 0;
1865 }
1866 
1867 
1868 /**
1869  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1870  *	@q: the SGE control Tx queue
1871  *
1872  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1873  *	that send only immediate data (presently just the control queues) and
1874  *	thus do not have any mbufs
1875  */
1876 static __inline void
1877 reclaim_completed_tx_imm(struct sge_txq *q)
1878 {
1879 	unsigned int reclaim = q->processed - q->cleaned;
1880 
1881 	q->in_use -= reclaim;
1882 	q->cleaned += reclaim;
1883 }
1884 
1885 /**
1886  *	ctrl_xmit - send a packet through an SGE control Tx queue
1887  *	@adap: the adapter
1888  *	@q: the control queue
1889  *	@m: the packet
1890  *
1891  *	Send a packet through an SGE control Tx queue.  Packets sent through
1892  *	a control queue must fit entirely as immediate data in a single Tx
1893  *	descriptor and have no page fragments.
1894  */
1895 static int
1896 ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1897 {
1898 	int ret;
1899 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1900 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1901 
1902 	KASSERT(m->m_len <= WR_LEN, ("%s: bad tx data", __func__));
1903 
1904 	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1905 	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1906 
1907 	TXQ_LOCK(qs);
1908 again:	reclaim_completed_tx_imm(q);
1909 
1910 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1911 	if (__predict_false(ret)) {
1912 		if (ret == 1) {
1913 			TXQ_UNLOCK(qs);
1914 			return (ENOSPC);
1915 		}
1916 		goto again;
1917 	}
1918 	write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1919 
1920 	q->in_use++;
1921 	if (++q->pidx >= q->size) {
1922 		q->pidx = 0;
1923 		q->gen ^= 1;
1924 	}
1925 	TXQ_UNLOCK(qs);
1926 	wmb();
1927 	t3_write_reg(adap, A_SG_KDOORBELL,
1928 	    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1929 
1930 	m_free(m);
1931 	return (0);
1932 }
1933 
1934 
1935 /**
1936  *	restart_ctrlq - restart a suspended control queue
1937  *	@qs: the queue set containing the control queue
1938  *
1939  *	Resumes transmission on a suspended Tx control queue.
1940  */
1941 static void
1942 restart_ctrlq(void *data, int npending)
1943 {
1944 	struct mbuf *m;
1945 	struct sge_qset *qs = (struct sge_qset *)data;
1946 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1947 	adapter_t *adap = qs->port->adapter;
1948 
1949 	TXQ_LOCK(qs);
1950 again:	reclaim_completed_tx_imm(q);
1951 
1952 	while (q->in_use < q->size &&
1953 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1954 
1955 		write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1956 		m_free(m);
1957 
1958 		if (++q->pidx >= q->size) {
1959 			q->pidx = 0;
1960 			q->gen ^= 1;
1961 		}
1962 		q->in_use++;
1963 	}
1964 	if (mbufq_len(&q->sendq)) {
1965 		setbit(&qs->txq_stopped, TXQ_CTRL);
1966 
1967 		if (should_restart_tx(q) &&
1968 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1969 			goto again;
1970 		q->stops++;
1971 	}
1972 	TXQ_UNLOCK(qs);
1973 	t3_write_reg(adap, A_SG_KDOORBELL,
1974 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1975 }
1976 
1977 
1978 /*
1979  * Send a management message through control queue 0
1980  */
1981 int
1982 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1983 {
1984 	return ctrl_xmit(adap, &adap->sge.qs[0], m);
1985 }
1986 
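/*
 * Editorial sketch (not driver code) of a hypothetical t3_mgmt_tx() caller,
 * kept out of the build with "#ifdef notdef".  The WR body is firmware
 * defined; only the framing shown here -- a work_request_hdr at the front
 * of a single mbuf no larger than WR_LEN -- is required by ctrl_xmit(),
 * which fills in SOP/EOP and the queue token itself.
 */
#ifdef notdef
static int
example_mgmt_tx(adapter_t *sc)
{
	struct mbuf *m;
	struct work_request_hdr *wrp;

	m = m_gethdr(M_NOWAIT, MT_DATA);
	if (m == NULL)
		return (ENOMEM);

	/* Reserve room for the (firmware-specific) work request. */
	m->m_len = m->m_pkthdr.len = sizeof(*wrp);
	wrp = mtod(m, struct work_request_hdr *);
	bzero(wrp, sizeof(*wrp));
	/* ... fill in the opcode and WR-specific fields here ... */

	return (t3_mgmt_tx(sc, m));
}
#endif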
1987 /**
1988  *	free_qset - free the resources of an SGE queue set
1989  *	@sc: the controller owning the queue set
1990  *	@q: the queue set
1991  *
1992  *	Release the HW and SW resources associated with an SGE queue set, such
1993  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1994  *	queue set must be quiesced prior to calling this.
1995  */
1996 static void
1997 t3_free_qset(adapter_t *sc, struct sge_qset *q)
1998 {
1999 	int i;
2000 
2001 	reclaim_completed_tx(q, 0, TXQ_ETH);
2002 	if (q->txq[TXQ_ETH].txq_mr != NULL)
2003 		buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF);
2004 	if (q->txq[TXQ_ETH].txq_ifq != NULL) {
2005 		ifq_delete(q->txq[TXQ_ETH].txq_ifq);
2006 		free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF);
2007 	}
2008 
2009 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2010 		if (q->fl[i].desc) {
2011 			mtx_lock_spin(&sc->sge.reg_lock);
2012 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
2013 			mtx_unlock_spin(&sc->sge.reg_lock);
2014 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
2015 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
2016 					q->fl[i].desc_map);
2017 			bus_dma_tag_destroy(q->fl[i].desc_tag);
2018 			bus_dma_tag_destroy(q->fl[i].entry_tag);
2019 		}
2020 		if (q->fl[i].sdesc) {
2021 			free_rx_bufs(sc, &q->fl[i]);
2022 			free(q->fl[i].sdesc, M_DEVBUF);
2023 		}
2024 	}
2025 
2026 	mtx_unlock(&q->lock);
2027 	MTX_DESTROY(&q->lock);
2028 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2029 		if (q->txq[i].desc) {
2030 			mtx_lock_spin(&sc->sge.reg_lock);
2031 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2032 			mtx_unlock_spin(&sc->sge.reg_lock);
2033 			bus_dmamap_unload(q->txq[i].desc_tag,
2034 					q->txq[i].desc_map);
2035 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2036 					q->txq[i].desc_map);
2037 			bus_dma_tag_destroy(q->txq[i].desc_tag);
2038 			bus_dma_tag_destroy(q->txq[i].entry_tag);
2039 		}
2040 		if (q->txq[i].sdesc) {
2041 			free(q->txq[i].sdesc, M_DEVBUF);
2042 		}
2043 	}
2044 
2045 	if (q->rspq.desc) {
2046 		mtx_lock_spin(&sc->sge.reg_lock);
2047 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2048 		mtx_unlock_spin(&sc->sge.reg_lock);
2049 
2050 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2051 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2052 			        q->rspq.desc_map);
2053 		bus_dma_tag_destroy(q->rspq.desc_tag);
2054 		MTX_DESTROY(&q->rspq.lock);
2055 	}
2056 
2057 #if defined(INET6) || defined(INET)
2058 	tcp_lro_free(&q->lro.ctrl);
2059 #endif
2060 
2061 	bzero(q, sizeof(*q));
2062 }
2063 
2064 /**
2065  *	t3_free_sge_resources - free SGE resources
2066  *	@sc: the adapter softc
2067  *
2068  *	Frees resources used by the SGE queue sets.
2069  */
2070 void
2071 t3_free_sge_resources(adapter_t *sc, int nqsets)
2072 {
2073 	int i;
2074 
2075 	for (i = 0; i < nqsets; ++i) {
2076 		TXQ_LOCK(&sc->sge.qs[i]);
2077 		t3_free_qset(sc, &sc->sge.qs[i]);
2078 	}
2079 }
2080 
2081 /**
2082  *	t3_sge_start - enable SGE
2083  *	@sc: the controller softc
2084  *
2085  *	Enables the SGE for DMAs.  This is the last step in starting packet
2086  *	transfers.
2087  */
2088 void
2089 t3_sge_start(adapter_t *sc)
2090 {
2091 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2092 }
2093 
2094 /**
2095  *	t3_sge_stop - disable SGE operation
2096  *	@sc: the adapter
2097  *
2098  *	Disables the DMA engine.  This can be called in emergencies (e.g.,
2099  *	from error interrupts) or from normal process context.  In the latter
2100  *	case it also disables any pending queue restart tasklets.  Note that
2101  *	if it is called in interrupt context it cannot disable the restart
2102  *	tasklets as it cannot wait, however the tasklets will have no effect
2103  *	since the doorbells are disabled and the driver will call this again
2104  *	later from process context, at which time the tasklets will be stopped
2105  *	if they are still running.
2106  */
2107 void
2108 t3_sge_stop(adapter_t *sc)
2109 {
2110 	int i, nqsets;
2111 
2112 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2113 
2114 	if (sc->tq == NULL)
2115 		return;
2116 
2117 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2118 		nqsets += sc->port[i].nqsets;
2119 #ifdef notyet
2120 	/*
2121 	 *
2122 	 * XXX
2123 	 */
2124 	for (i = 0; i < nqsets; ++i) {
2125 		struct sge_qset *qs = &sc->sge.qs[i];
2126 
2127 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2128 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2129 	}
2130 #endif
2131 }
2132 
2133 /**
2134  *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2135  *	@qs: the queue set that owns the Tx queue
2136  *	@reclaimable: the number of descriptors to reclaim
2137  *	@queue: the Tx queue within the set to reclaim from (e.g. TXQ_ETH)
2138  *
2139  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
2140  *	Tx buffers.  Descriptors whose software state has no mbuf attached
2141  *	are counted in txq_skipped.  Called with the Tx queue lock held.
2145  */
2146 void
2147 t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2148 {
2149 	struct tx_sw_desc *txsd;
2150 	unsigned int cidx, mask;
2151 	struct sge_txq *q = &qs->txq[queue];
2152 
2153 #ifdef T3_TRACE
2154 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2155 		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2156 #endif
2157 	cidx = q->cidx;
2158 	mask = q->size - 1;
2159 	txsd = &q->sdesc[cidx];
2160 
2161 	mtx_assert(&qs->lock, MA_OWNED);
2162 	while (reclaimable--) {
2163 		prefetch(q->sdesc[(cidx + 1) & mask].m);
2164 		prefetch(q->sdesc[(cidx + 2) & mask].m);
2165 
2166 		if (txsd->m != NULL) {
2167 			if (txsd->flags & TX_SW_DESC_MAPPED) {
2168 				bus_dmamap_unload(q->entry_tag, txsd->map);
2169 				txsd->flags &= ~TX_SW_DESC_MAPPED;
2170 			}
2171 			m_freem_list(txsd->m);
2172 			txsd->m = NULL;
2173 		} else
2174 			q->txq_skipped++;
2175 
2176 		++txsd;
2177 		if (++cidx == q->size) {
2178 			cidx = 0;
2179 			txsd = q->sdesc;
2180 		}
2181 	}
2182 	q->cidx = cidx;
2183 
2184 }
2185 
2186 /**
2187  *	is_new_response - check if a response is newly written
2188  *	@r: the response descriptor
2189  *	@q: the response queue
2190  *
2191  *	Returns true if a response descriptor contains a yet unprocessed
2192  *	response.
2193  */
2194 static __inline int
2195 is_new_response(const struct rsp_desc *r,
2196     const struct sge_rspq *q)
2197 {
2198 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2199 }
2200 
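/*
 * Editorial note on the generation check above (illustrative): the response
 * ring uses the usual ring/generation scheme rather than a producer index
 * visible to software.  The hardware flips the generation value it writes
 * into each descriptor every time it wraps the ring, and software keeps its
 * own copy in q->gen, toggling it when its cidx wraps (see "rspq->gen ^= 1"
 * in process_responses()).  A descriptor whose F_RSPD_GEN2 bit matches
 * q->gen was therefore written during the current pass and is new.
 */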
2201 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2202 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2203 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2204 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2205 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2206 
2207 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2208 #define NOMEM_INTR_DELAY 2500
2209 
2210 #ifdef TCP_OFFLOAD
2211 /**
2212  *	write_ofld_wr - write an offload work request
2213  *	@adap: the adapter
2214  *	@m: the packet to send
2215  *	@q: the Tx queue
2216  *	@pidx: index of the first Tx descriptor to write
2217  *	@gen: the generation value to use
2218  *	@ndesc: number of descriptors the packet will occupy
2219  *
2220  *	Write an offload work request to send the supplied packet.  The packet
2221  *	data already carry the work request with most fields populated.
2222  */
2223 static void
2224 write_ofld_wr(adapter_t *adap, struct mbuf *m, struct sge_txq *q,
2225     unsigned int pidx, unsigned int gen, unsigned int ndesc)
2226 {
2227 	unsigned int sgl_flits, flits;
2228 	int i, idx, nsegs, wrlen;
2229 	struct work_request_hdr *from;
2230 	struct sg_ent *sgp, t3sgl[TX_MAX_SEGS / 2 + 1];
2231 	struct tx_desc *d = &q->desc[pidx];
2232 	struct txq_state txqs;
2233 	struct sglist_seg *segs;
2234 	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2235 	struct sglist *sgl;
2236 
2237 	from = (void *)(oh + 1);	/* Start of WR within mbuf */
2238 	wrlen = m->m_len - sizeof(*oh);
2239 
2240 	if (!(oh->flags & F_HDR_SGL)) {
2241 		write_imm(d, (caddr_t)from, wrlen, gen);
2242 
2243 		/*
2244 		 * mbuf with "real" immediate tx data will be enqueue_wr'd by
2245 		 * t3_push_frames and freed in wr_ack.  Others, like those sent
2246 		 * down by close_conn, t3_send_reset, etc. should be freed here.
2247 		 */
2248 		if (!(oh->flags & F_HDR_DF))
2249 			m_free(m);
2250 		return;
2251 	}
2252 
2253 	memcpy(&d->flit[1], &from[1], wrlen - sizeof(*from));
2254 
2255 	sgl = oh->sgl;
2256 	flits = wrlen / 8;
2257 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : t3sgl;
2258 
2259 	nsegs = sgl->sg_nseg;
2260 	segs = sgl->sg_segs;
2261 	for (idx = 0, i = 0; i < nsegs; i++) {
2262 		KASSERT(segs[i].ss_len, ("%s: 0 len in sgl", __func__));
2263 		if (i && idx == 0)
2264 			++sgp;
2265 		sgp->len[idx] = htobe32(segs[i].ss_len);
2266 		sgp->addr[idx] = htobe64(segs[i].ss_paddr);
2267 		idx ^= 1;
2268 	}
2269 	if (idx) {
2270 		sgp->len[idx] = 0;
2271 		sgp->addr[idx] = 0;
2272 	}
2273 
2274 	sgl_flits = sgl_len(nsegs);
2275 	txqs.gen = gen;
2276 	txqs.pidx = pidx;
2277 	txqs.compl = 0;
2278 
2279 	write_wr_hdr_sgl(ndesc, d, &txqs, q, t3sgl, flits, sgl_flits,
2280 	    from->wrh_hi, from->wrh_lo);
2281 }
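/*
 * Editorial note (illustrative): each struct sg_ent packs two length/address
 * pairs, so the loop above toggles idx between 0 and 1 and only advances sgp
 * once both slots of the current entry are filled; an odd segment count
 * leaves the second slot zeroed as an end marker.
 */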
2282 
2283 /**
2284  *	ofld_xmit - send a packet through an offload queue
2285  *	@adap: the adapter
2286  *	@q: the Tx offload queue
2287  *	@m: the packet
2288  *
2289  *	Send an offload packet through an SGE offload queue.
2290  */
2291 static int
2292 ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2293 {
2294 	int ret;
2295 	unsigned int ndesc;
2296 	unsigned int pidx, gen;
2297 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2298 	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2299 
2300 	ndesc = G_HDR_NDESC(oh->flags);
2301 
2302 	TXQ_LOCK(qs);
2303 again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2304 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2305 	if (__predict_false(ret)) {
2306 		if (ret == 1) {
2307 			TXQ_UNLOCK(qs);
2308 			return (EINTR);
2309 		}
2310 		goto again;
2311 	}
2312 
2313 	gen = q->gen;
2314 	q->in_use += ndesc;
2315 	pidx = q->pidx;
2316 	q->pidx += ndesc;
2317 	if (q->pidx >= q->size) {
2318 		q->pidx -= q->size;
2319 		q->gen ^= 1;
2320 	}
2321 
2322 	write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2323 	check_ring_tx_db(adap, q, 1);
2324 	TXQ_UNLOCK(qs);
2325 
2326 	return (0);
2327 }
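/*
 * Editorial arithmetic note (illustrative): with a queue of size 1024,
 * pidx = 1022 and a request needing ndesc = 4, the producer index above
 * wraps to 1022 + 4 - 1024 = 2 and q->gen is toggled, so descriptors
 * written on the new pass over the ring can be told apart from stale
 * entries left over from the previous pass.
 */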
2328 
2329 /**
2330  *	restart_offloadq - restart a suspended offload queue
2331  *	@qs: the queue set containing the offload queue
2332  *
2333  *	Resumes transmission on a suspended Tx offload queue.
2334  */
2335 static void
2336 restart_offloadq(void *data, int npending)
2337 {
2338 	struct mbuf *m;
2339 	struct sge_qset *qs = data;
2340 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2341 	adapter_t *adap = qs->port->adapter;
2342 	int cleaned;
2343 
2344 	TXQ_LOCK(qs);
2345 again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2346 
2347 	while ((m = mbufq_first(&q->sendq)) != NULL) {
2348 		unsigned int gen, pidx;
2349 		struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2350 		unsigned int ndesc = G_HDR_NDESC(oh->flags);
2351 
2352 		if (__predict_false(q->size - q->in_use < ndesc)) {
2353 			setbit(&qs->txq_stopped, TXQ_OFLD);
2354 			if (should_restart_tx(q) &&
2355 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2356 				goto again;
2357 			q->stops++;
2358 			break;
2359 		}
2360 
2361 		gen = q->gen;
2362 		q->in_use += ndesc;
2363 		pidx = q->pidx;
2364 		q->pidx += ndesc;
2365 		if (q->pidx >= q->size) {
2366 			q->pidx -= q->size;
2367 			q->gen ^= 1;
2368 		}
2369 
2370 		(void)mbufq_dequeue(&q->sendq);
2371 		TXQ_UNLOCK(qs);
2372 		write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2373 		TXQ_LOCK(qs);
2374 	}
2375 #if USE_GTS
2376 	set_bit(TXQ_RUNNING, &q->flags);
2377 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2378 #endif
2379 	TXQ_UNLOCK(qs);
2380 	wmb();
2381 	t3_write_reg(adap, A_SG_KDOORBELL,
2382 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2383 }
2384 
2385 /**
2386  *	t3_offload_tx - send an offload packet
2387  *	@m: the packet
2388  *
2389  *	Sends an offload packet.  The ofld_hdr flags select the target:
2390  *	G_HDR_QSET picks the queue set, and F_HDR_CTRL determines whether the
2391  *	packet goes to that set's control queue (after the ofld_hdr is trimmed
2392  *	off) or to its offload queue.
2392  */
2393 int
2394 t3_offload_tx(struct adapter *sc, struct mbuf *m)
2395 {
2396 	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2397 	struct sge_qset *qs = &sc->sge.qs[G_HDR_QSET(oh->flags)];
2398 
2399 	if (oh->flags & F_HDR_CTRL) {
2400 		m_adj(m, sizeof (*oh));	/* trim ofld_hdr off */
2401 		return (ctrl_xmit(sc, qs, m));
2402 	} else
2403 		return (ofld_xmit(sc, qs, m));
2404 }
2405 #endif
2406 
2407 static void
2408 restart_tx(struct sge_qset *qs)
2409 {
2410 	struct adapter *sc = qs->port->adapter;
2411 
2412 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2413 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2414 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2415 		qs->txq[TXQ_OFLD].restarts++;
2416 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2417 	}
2418 
2419 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2420 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2421 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2422 		qs->txq[TXQ_CTRL].restarts++;
2423 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2424 	}
2425 }
2426 
2427 /**
2428  *	t3_sge_alloc_qset - initialize an SGE queue set
2429  *	@sc: the controller softc
2430  *	@id: the queue set id
2431  *	@nports: how many Ethernet ports will be using this queue set
2432  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2433  *	@p: configuration parameters for this queue set
2434  *	@ntxq: number of Tx queues for the queue set
2435  *	@pi: port info for queue set
2436  *
2437  *	Allocate resources and initialize an SGE queue set.  A queue set
2438  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2439  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2440  *	queue, offload queue, and control queue.
2441  */
2442 int
2443 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2444 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2445 {
2446 	struct sge_qset *q = &sc->sge.qs[id];
2447 	int i, ret = 0;
2448 
2449 	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2450 	q->port = pi;
2451 	q->adap = sc;
2452 
2453 	if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2454 	    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2455 		device_printf(sc->dev, "failed to allocate mbuf ring\n");
2456 		goto err;
2457 	}
2458 	if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF,
2459 	    M_NOWAIT | M_ZERO)) == NULL) {
2460 		device_printf(sc->dev, "failed to allocate ifq\n");
2461 		goto err;
2462 	}
2463 	ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp);
2464 	callout_init(&q->txq[TXQ_ETH].txq_timer, 1);
2465 	callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1);
2466 	q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus;
2467 	q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus;
2468 
2469 	init_qset_cntxt(q, id);
2470 	q->idx = id;
2471 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2472 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2473 		    &q->fl[0].desc, &q->fl[0].sdesc,
2474 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2475 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2476 		printf("error %d from alloc ring fl0\n", ret);
2477 		goto err;
2478 	}
2479 
2480 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2481 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2482 		    &q->fl[1].desc, &q->fl[1].sdesc,
2483 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2484 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2485 		printf("error %d from alloc ring fl1\n", ret);
2486 		goto err;
2487 	}
2488 
2489 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2490 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2491 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2492 		    NULL, NULL)) != 0) {
2493 		printf("error %d from alloc ring rspq\n", ret);
2494 		goto err;
2495 	}
2496 
2497 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2498 	    device_get_unit(sc->dev), irq_vec_idx);
2499 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2500 
2501 	for (i = 0; i < ntxq; ++i) {
2502 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2503 
2504 		if ((ret = alloc_ring(sc, p->txq_size[i],
2505 			    sizeof(struct tx_desc), sz,
2506 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2507 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2508 			    &q->txq[i].desc_map,
2509 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2510 			printf("error %d from alloc ring tx %i\n", ret, i);
2511 			goto err;
2512 		}
2513 		mbufq_init(&q->txq[i].sendq, INT_MAX);
2514 		q->txq[i].gen = 1;
2515 		q->txq[i].size = p->txq_size[i];
2516 	}
2517 
2518 #ifdef TCP_OFFLOAD
2519 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2520 #endif
2521 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2522 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2523 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2524 
2525 	q->fl[0].gen = q->fl[1].gen = 1;
2526 	q->fl[0].size = p->fl_size;
2527 	q->fl[1].size = p->jumbo_size;
2528 
2529 	q->rspq.gen = 1;
2530 	q->rspq.cidx = 0;
2531 	q->rspq.size = p->rspq_size;
2532 
2533 	q->txq[TXQ_ETH].stop_thres = nports *
2534 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2535 
2536 	q->fl[0].buf_size = MCLBYTES;
2537 	q->fl[0].zone = zone_pack;
2538 	q->fl[0].type = EXT_PACKET;
2539 
2540 	if (p->jumbo_buf_size ==  MJUM16BYTES) {
2541 		q->fl[1].zone = zone_jumbo16;
2542 		q->fl[1].type = EXT_JUMBO16;
2543 	} else if (p->jumbo_buf_size ==  MJUM9BYTES) {
2544 		q->fl[1].zone = zone_jumbo9;
2545 		q->fl[1].type = EXT_JUMBO9;
2546 	} else if (p->jumbo_buf_size ==  MJUMPAGESIZE) {
2547 		q->fl[1].zone = zone_jumbop;
2548 		q->fl[1].type = EXT_JUMBOP;
2549 	} else {
2550 		KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size));
2551 		ret = EDOOFUS;
2552 		goto err;
2553 	}
2554 	q->fl[1].buf_size = p->jumbo_buf_size;
2555 
2556 	/* Allocate and setup the lro_ctrl structure */
2557 	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2558 #if defined(INET6) || defined(INET)
2559 	ret = tcp_lro_init(&q->lro.ctrl);
2560 	if (ret) {
2561 		printf("error %d from tcp_lro_init\n", ret);
2562 		goto err;
2563 	}
2564 #endif
2565 	q->lro.ctrl.ifp = pi->ifp;
2566 
2567 	mtx_lock_spin(&sc->sge.reg_lock);
2568 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2569 				   q->rspq.phys_addr, q->rspq.size,
2570 				   q->fl[0].buf_size, 1, 0);
2571 	if (ret) {
2572 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2573 		goto err_unlock;
2574 	}
2575 
2576 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2577 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2578 					  q->fl[i].phys_addr, q->fl[i].size,
2579 					  q->fl[i].buf_size, p->cong_thres, 1,
2580 					  0);
2581 		if (ret) {
2582 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2583 			goto err_unlock;
2584 		}
2585 	}
2586 
2587 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2588 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2589 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2590 				 1, 0);
2591 	if (ret) {
2592 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2593 		goto err_unlock;
2594 	}
2595 
2596 	if (ntxq > 1) {
2597 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2598 					 USE_GTS, SGE_CNTXT_OFLD, id,
2599 					 q->txq[TXQ_OFLD].phys_addr,
2600 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2601 		if (ret) {
2602 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2603 			goto err_unlock;
2604 		}
2605 	}
2606 
2607 	if (ntxq > 2) {
2608 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2609 					 SGE_CNTXT_CTRL, id,
2610 					 q->txq[TXQ_CTRL].phys_addr,
2611 					 q->txq[TXQ_CTRL].size,
2612 					 q->txq[TXQ_CTRL].token, 1, 0);
2613 		if (ret) {
2614 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2615 			goto err_unlock;
2616 		}
2617 	}
2618 
2619 	mtx_unlock_spin(&sc->sge.reg_lock);
2620 	t3_update_qset_coalesce(q, p);
2621 
2622 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2623 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2624 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2625 
2626 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2627 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2628 
2629 	return (0);
2630 
2631 err_unlock:
2632 	mtx_unlock_spin(&sc->sge.reg_lock);
2633 err:
2634 	TXQ_LOCK(q);
2635 	t3_free_qset(sc, q);
2636 
2637 	return (ret);
2638 }
2639 
2640 /*
2641  * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2642  * ethernet data.  Hardware assistance with various checksums and any vlan tag
2643  * will also be taken into account here.
2644  */
2645 void
2646 t3_rx_eth(struct adapter *adap, struct mbuf *m, int ethpad)
2647 {
2648 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2649 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2650 	struct ifnet *ifp = pi->ifp;
2651 
2652 	if (cpl->vlan_valid) {
2653 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2654 		m->m_flags |= M_VLANTAG;
2655 	}
2656 
2657 	m->m_pkthdr.rcvif = ifp;
2658 	/*
2659 	 * adjust after conversion to mbuf chain
2660 	 */
2661 	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2662 	m->m_len -= (sizeof(*cpl) + ethpad);
2663 	m->m_data += (sizeof(*cpl) + ethpad);
2664 
2665 	if (!cpl->fragment && cpl->csum_valid && cpl->csum == 0xffff) {
2666 		struct ether_header *eh = mtod(m, void *);
2667 		uint16_t eh_type;
2668 
2669 		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2670 			struct ether_vlan_header *evh = mtod(m, void *);
2671 
2672 			eh_type = evh->evl_proto;
2673 		} else
2674 			eh_type = eh->ether_type;
2675 
2676 		if (ifp->if_capenable & IFCAP_RXCSUM &&
2677 		    eh_type == htons(ETHERTYPE_IP)) {
2678 			m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
2679 			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2680 			m->m_pkthdr.csum_data = 0xffff;
2681 		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
2682 		    eh_type == htons(ETHERTYPE_IPV6)) {
2683 			m->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
2684 			    CSUM_PSEUDO_HDR);
2685 			m->m_pkthdr.csum_data = 0xffff;
2686 		}
2687 	}
2688 }
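/*
 * Editorial note (illustrative): setting CSUM_DATA_VALID | CSUM_PSEUDO_HDR
 * together with csum_data = 0xffff, as above, is the usual FreeBSD idiom for
 * "hardware verified the L4 checksum"; the stack then accepts the checksum
 * instead of folding csum_data into a partial sum of its own.
 */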
2689 
2690 /**
2691  *	get_packet - return the next ingress packet buffer from a free list
2692  *	@adap: the adapter that received the packet
2693  *	@drop_thres: # of remaining buffers before we start dropping packets
2694  *	@qs: the qset that the SGE free list holding the packet belongs to
2695  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2696  *      @r: response descriptor
2697  *
2698  *	Get the next packet from a free list and complete setup of the
2699  *	sk_buff.  If the packet is small we make a copy and recycle the
2700  *	mbuf.  If the packet is small we make a copy and recycle the
2701  *	positive drop threshold is supplied packets are dropped and their
2702  *	buffers recycled if (a) the number of remaining buffers is under the
2703  *	threshold and the packet is too big to copy, or (b) the packet should
2704  *	be copied but there is no memory for the copy.
2705  */
2706 static int
2707 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2708     struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2709 {
2710 
2711 	unsigned int len_cq =  ntohl(r->len_cq);
2712 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2713 	int mask, cidx = fl->cidx;
2714 	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2715 	uint32_t len = G_RSPD_LEN(len_cq);
2716 	uint32_t flags = M_EXT;
2717 	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2718 	caddr_t cl;
2719 	struct mbuf *m;
2720 	int ret = 0;
2721 
2722 	mask = fl->size - 1;
2723 	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2724 	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2725 	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2726 	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2727 
2728 	fl->credits--;
2729 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2730 
2731 	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2732 	    sopeop == RSPQ_SOP_EOP) {
2733 		if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
2734 			goto skip_recycle;
2735 		cl = mtod(m, void *);
2736 		memcpy(cl, sd->rxsd_cl, len);
2737 		recycle_rx_buf(adap, fl, fl->cidx);
2738 		m->m_pkthdr.len = m->m_len = len;
2739 		m->m_flags = 0;
2740 		mh->mh_head = mh->mh_tail = m;
2741 		ret = 1;
2742 		goto done;
2743 	} else {
2744 	skip_recycle:
2745 		bus_dmamap_unload(fl->entry_tag, sd->map);
2746 		cl = sd->rxsd_cl;
2747 		m = sd->m;
2748 
2749 		if ((sopeop == RSPQ_SOP_EOP) ||
2750 		    (sopeop == RSPQ_SOP))
2751 			flags |= M_PKTHDR;
2752 		m_init(m, M_NOWAIT, MT_DATA, flags);
2753 		if (fl->zone == zone_pack) {
2754 			/*
2755 			 * restore clobbered data pointer
2756 			 */
2757 			m->m_data = m->m_ext.ext_buf;
2758 		} else {
2759 			m_cljset(m, cl, fl->type);
2760 		}
2761 		m->m_len = len;
2762 	}
2763 	switch(sopeop) {
2764 	case RSPQ_SOP_EOP:
2765 		ret = 1;
2766 		/* FALLTHROUGH */
2767 	case RSPQ_SOP:
2768 		mh->mh_head = mh->mh_tail = m;
2769 		m->m_pkthdr.len = len;
2770 		break;
2771 	case RSPQ_EOP:
2772 		ret = 1;
2773 		/* FALLTHROUGH */
2774 	case RSPQ_NSOP_NEOP:
2775 		if (mh->mh_tail == NULL) {
2776 			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2777 			m_freem(m);
2778 			break;
2779 		}
2780 		mh->mh_tail->m_next = m;
2781 		mh->mh_tail = m;
2782 		mh->mh_head->m_pkthdr.len += len;
2783 		break;
2784 	}
2785 	if (cxgb_debug)
2786 		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2787 done:
2788 	if (++fl->cidx == fl->size)
2789 		fl->cidx = 0;
2790 
2791 	return (ret);
2792 }
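/*
 * Editorial note (illustrative): a frame spanning three free-list buffers
 * arrives as RSPQ_SOP, RSPQ_NSOP_NEOP and RSPQ_EOP responses.  The code
 * above starts the chain at mh->mh_head on SOP, links the middle buffer via
 * mh->mh_tail->m_next, and only reports a complete packet (return value 1)
 * once the EOP buffer has been appended and m_pkthdr.len covers all three
 * segments.
 */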
2793 
2794 /**
2795  *	handle_rsp_cntrl_info - handles control information in a response
2796  *	@qs: the queue set corresponding to the response
2797  *	@flags: the response control flags
2798  *
2799  *	Handles the control information of an SGE response, such as GTS
2800  *	indications and completion credits for the queue set's Tx queues.
2801  *	HW coalesces credits, we don't do any extra SW coalescing.
2802  */
2803 static __inline void
2804 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2805 {
2806 	unsigned int credits;
2807 
2808 #if USE_GTS
2809 	if (flags & F_RSPD_TXQ0_GTS)
2810 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2811 #endif
2812 	credits = G_RSPD_TXQ0_CR(flags);
2813 	if (credits)
2814 		qs->txq[TXQ_ETH].processed += credits;
2815 
2816 	credits = G_RSPD_TXQ2_CR(flags);
2817 	if (credits)
2818 		qs->txq[TXQ_CTRL].processed += credits;
2819 
2820 # if USE_GTS
2821 	if (flags & F_RSPD_TXQ1_GTS)
2822 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2823 # endif
2824 	credits = G_RSPD_TXQ1_CR(flags);
2825 	if (credits)
2826 		qs->txq[TXQ_OFLD].processed += credits;
2827 
2828 }
2829 
2830 static void
2831 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2832     unsigned int sleeping)
2833 {
2834 	;
2835 }
2836 
2837 /**
2838  *	process_responses - process responses from an SGE response queue
2839  *	@adap: the adapter
2840  *	@qs: the queue set to which the response queue belongs
2841  *	@budget: how many responses can be processed in this round
2842  *
2843  *	Process responses from an SGE response queue up to the supplied budget.
2844  *	Responses include received packets as well as credits and other events
2845  *	for the queues that belong to the response queue's queue set.
2846  *	A negative budget is effectively unlimited.
2847  *
2848  *	Additionally choose the interrupt holdoff time for the next interrupt
2849  *	on this queue.  If the system is under memory shortage use a fairly
2850  *	long delay to help recovery.
2851  */
2852 static int
2853 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2854 {
2855 	struct sge_rspq *rspq = &qs->rspq;
2856 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2857 	int budget_left = budget;
2858 	unsigned int sleeping = 0;
2859 #if defined(INET6) || defined(INET)
2860 	int lro_enabled = qs->lro.enabled;
2861 	int skip_lro;
2862 	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2863 #endif
2864 	struct t3_mbuf_hdr *mh = &rspq->rspq_mh;
2865 #ifdef DEBUG
2866 	static int last_holdoff = 0;
2867 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2868 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2869 		last_holdoff = rspq->holdoff_tmr;
2870 	}
2871 #endif
2872 	rspq->next_holdoff = rspq->holdoff_tmr;
2873 
2874 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2875 		int eth, eop = 0, ethpad = 0;
2876 		uint32_t flags = ntohl(r->flags);
2877 		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2878 		uint8_t opcode = r->rss_hdr.opcode;
2879 
2880 		eth = (opcode == CPL_RX_PKT);
2881 
2882 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2883 			struct mbuf *m;
2884 
2885 			if (cxgb_debug)
2886 				printf("async notification\n");
2887 
2888 			if (mh->mh_head == NULL) {
2889 				mh->mh_head = m_gethdr(M_NOWAIT, MT_DATA);
2890 				m = mh->mh_head;
2891 			} else {
2892 				m = m_gethdr(M_NOWAIT, MT_DATA);
2893 			}
2894 			if (m == NULL)
2895 				goto no_mem;
2896 
2897 			memcpy(mtod(m, char *), r, AN_PKT_SIZE);
2898 			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
2899 			*mtod(m, uint8_t *) = CPL_ASYNC_NOTIF;
2900 			opcode = CPL_ASYNC_NOTIF;
2901 			eop = 1;
2902 			rspq->async_notif++;
2903 			goto skip;
2904 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2905 			struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
2906 
2907 			if (m == NULL) {
2908 		no_mem:
2909 				rspq->next_holdoff = NOMEM_INTR_DELAY;
2910 				budget_left--;
2911 				break;
2912 			}
2913 			if (mh->mh_head == NULL)
2914 				mh->mh_head = m;
2915 			else
2916 				mh->mh_tail->m_next = m;
2917 			mh->mh_tail = m;
2918 
2919 			get_imm_packet(adap, r, m);
2920 			mh->mh_head->m_pkthdr.len += m->m_len;
2921 			eop = 1;
2922 			rspq->imm_data++;
2923 		} else if (r->len_cq) {
2924 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2925 
2926 			eop = get_packet(adap, drop_thresh, qs, mh, r);
2927 			if (eop) {
2928 				if (r->rss_hdr.hash_type && !adap->timestamp) {
2929 					M_HASHTYPE_SET(mh->mh_head,
2930 					    M_HASHTYPE_OPAQUE_HASH);
2931 					mh->mh_head->m_pkthdr.flowid = rss_hash;
2932 				}
2933 			}
2934 
2935 			ethpad = 2;
2936 		} else {
2937 			rspq->pure_rsps++;
2938 		}
2939 	skip:
2940 		if (flags & RSPD_CTRL_MASK) {
2941 			sleeping |= flags & RSPD_GTS_MASK;
2942 			handle_rsp_cntrl_info(qs, flags);
2943 		}
2944 
2945 		if (!eth && eop) {
2946 			rspq->offload_pkts++;
2947 #ifdef TCP_OFFLOAD
2948 			adap->cpl_handler[opcode](qs, r, mh->mh_head);
2949 #else
2950 			m_freem(mh->mh_head);
2951 #endif
2952 			mh->mh_head = NULL;
2953 		} else if (eth && eop) {
2954 			struct mbuf *m = mh->mh_head;
2955 
2956 			t3_rx_eth(adap, m, ethpad);
2957 
2958 			/*
2959 			 * The T304 sends incoming packets on any qset.  If LRO
2960 			 * is also enabled, we could end up sending the packet up
2961 			 * lro_ctrl->ifp's input, which would be incorrect.
2962 			 *
2963 			 * The mbuf's rcvif was derived from the cpl header and
2964 			 * is accurate.  Skip LRO and just use that.
2965 			 */
2966 #if defined(INET6) || defined(INET)
2967 			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
2968 
2969 			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro
2970 			    && (tcp_lro_rx(lro_ctrl, m, 0) == 0)
2971 			    ) {
2972 				/* successfully queued for LRO */
2973 			} else
2974 #endif
2975 			{
2976 				/*
2977 				 * LRO not enabled, packet unsuitable for LRO,
2978 				 * or unable to queue.  Pass it up right now in
2979 				 * either case.
2980 				 */
2981 				struct ifnet *ifp = m->m_pkthdr.rcvif;
2982 				(*ifp->if_input)(ifp, m);
2983 			}
2984 			mh->mh_head = NULL;
2985 
2986 		}
2987 
2988 		r++;
2989 		if (__predict_false(++rspq->cidx == rspq->size)) {
2990 			rspq->cidx = 0;
2991 			rspq->gen ^= 1;
2992 			r = rspq->desc;
2993 		}
2994 
2995 		if (++rspq->credits >= 64) {
2996 			refill_rspq(adap, rspq, rspq->credits);
2997 			rspq->credits = 0;
2998 		}
2999 		__refill_fl_lt(adap, &qs->fl[0], 32);
3000 		__refill_fl_lt(adap, &qs->fl[1], 32);
3001 		--budget_left;
3002 	}
3003 
3004 #if defined(INET6) || defined(INET)
3005 	/* Flush LRO */
3006 	tcp_lro_flush_all(lro_ctrl);
3007 #endif
3008 
3009 	if (sleeping)
3010 		check_ring_db(adap, qs, sleeping);
3011 
3012 	mb();  /* commit Tx queue processed updates */
3013 	if (__predict_false(qs->txq_stopped > 1))
3014 		restart_tx(qs);
3015 
3016 	__refill_fl_lt(adap, &qs->fl[0], 512);
3017 	__refill_fl_lt(adap, &qs->fl[1], 512);
3018 	budget -= budget_left;
3019 	return (budget);
3020 }
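/*
 * Editorial note (illustrative): the value returned is budget - budget_left,
 * i.e. the number of responses actually handled.  With the usual call of
 * budget = -1, budget_left starts negative and can never reach zero, so the
 * loop runs until the queue is drained while the function still returns the
 * count of responses processed.
 */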
3021 
3022 /*
3023  * A helper function that processes responses and issues GTS.
3024  */
3025 static __inline int
3026 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3027 {
3028 	int work;
3029 	static int last_holdoff = 0;
3030 
3031 	work = process_responses(adap, rspq_to_qset(rq), -1);
3032 
3033 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3034 		printf("next_holdoff=%d\n", rq->next_holdoff);
3035 		last_holdoff = rq->next_holdoff;
3036 	}
3037 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3038 	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3039 
3040 	return (work);
3041 }
3042 
3043 #ifdef DEBUGNET
3044 int
3045 cxgb_debugnet_poll_rx(adapter_t *adap, struct sge_qset *qs)
3046 {
3047 
3048 	return (process_responses_gts(adap, &qs->rspq));
3049 }
3050 #endif
3051 
3052 /*
3053  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3054  * Handles data events from SGE response queues as well as error and other
3055  * async events as they all use the same interrupt pin.  We use one SGE
3056  * response queue per port in this mode and protect all response queues with
3057  * queue 0's lock.
3058  */
3059 void
3060 t3b_intr(void *data)
3061 {
3062 	uint32_t i, map;
3063 	adapter_t *adap = data;
3064 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3065 
3066 	t3_write_reg(adap, A_PL_CLI, 0);
3067 	map = t3_read_reg(adap, A_SG_DATA_INTR);
3068 
3069 	if (!map)
3070 		return;
3071 
3072 	if (__predict_false(map & F_ERRINTR)) {
3073 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3074 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3075 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3076 	}
3077 
3078 	mtx_lock(&q0->lock);
3079 	for_each_port(adap, i)
3080 	    if (map & (1 << i))
3081 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3082 	mtx_unlock(&q0->lock);
3083 }
3084 
3085 /*
3086  * The MSI interrupt handler.  This needs to handle data events from SGE
3087  * response queues as well as error and other async events as they all use
3088  * the same MSI vector.  We use one SGE response queue per port in this mode
3089  * and protect all response queues with queue 0's lock.
3090  */
3091 void
3092 t3_intr_msi(void *data)
3093 {
3094 	adapter_t *adap = data;
3095 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3096 	int i, new_packets = 0;
3097 
3098 	mtx_lock(&q0->lock);
3099 
3100 	for_each_port(adap, i)
3101 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3102 		    new_packets = 1;
3103 	mtx_unlock(&q0->lock);
3104 	if (new_packets == 0) {
3105 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3106 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3107 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3108 	}
3109 }
3110 
3111 void
3112 t3_intr_msix(void *data)
3113 {
3114 	struct sge_qset *qs = data;
3115 	adapter_t *adap = qs->port->adapter;
3116 	struct sge_rspq *rspq = &qs->rspq;
3117 
3118 	if (process_responses_gts(adap, rspq) == 0)
3119 		rspq->unhandled_irqs++;
3120 }
3121 
3122 #define QDUMP_SBUF_SIZE		32 * 400
3123 static int
3124 t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3125 {
3126 	struct sge_rspq *rspq;
3127 	struct sge_qset *qs;
3128 	int i, err, dump_end, idx;
3129 	struct sbuf *sb;
3130 	struct rsp_desc *rspd;
3131 	uint32_t data[4];
3132 
3133 	rspq = arg1;
3134 	qs = rspq_to_qset(rspq);
3135 	if (rspq->rspq_dump_count == 0)
3136 		return (0);
3137 	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3138 		log(LOG_WARNING,
3139 		    "dump count is too large %d\n", rspq->rspq_dump_count);
3140 		rspq->rspq_dump_count = 0;
3141 		return (EINVAL);
3142 	}
3143 	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3144 		log(LOG_WARNING,
3145 		    "dump start of %d is greater than queue size\n",
3146 		    rspq->rspq_dump_start);
3147 		rspq->rspq_dump_start = 0;
3148 		return (EINVAL);
3149 	}
3150 	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3151 	if (err)
3152 		return (err);
3153 	err = sysctl_wire_old_buffer(req, 0);
3154 	if (err)
3155 		return (err);
3156 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3157 
3158 	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3159 	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3160 	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3161 	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3162 	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3163 
3164 	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3165 	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3166 
3167 	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3168 	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3169 		idx = i & (RSPQ_Q_SIZE-1);
3170 
3171 		rspd = &rspq->desc[idx];
3172 		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3173 		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3174 		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3175 		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3176 		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3177 		    be32toh(rspd->len_cq), rspd->intr_gen);
3178 	}
3179 
3180 	err = sbuf_finish(sb);
3181 	sbuf_delete(sb);
3182 	return (err);
3183 }
3184 
3185 static int
3186 t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3187 {
3188 	struct sge_txq *txq;
3189 	struct sge_qset *qs;
3190 	int i, j, err, dump_end;
3191 	struct sbuf *sb;
3192 	struct tx_desc *txd;
3193 	uint32_t *WR, wr_hi, wr_lo, gen;
3194 	uint32_t data[4];
3195 
3196 	txq = arg1;
3197 	qs = txq_to_qset(txq, TXQ_ETH);
3198 	if (txq->txq_dump_count == 0) {
3199 		return (0);
3200 	}
3201 	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3202 		log(LOG_WARNING,
3203 		    "dump count is too large %d\n", txq->txq_dump_count);
3204 		txq->txq_dump_count = 1;
3205 		return (EINVAL);
3206 	}
3207 	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3208 		log(LOG_WARNING,
3209 		    "dump start of %d is greater than queue size\n",
3210 		    txq->txq_dump_start);
3211 		txq->txq_dump_start = 0;
3212 		return (EINVAL);
3213 	}
3214 	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3215 	if (err)
3216 		return (err);
3217 	err = sysctl_wire_old_buffer(req, 0);
3218 	if (err)
3219 		return (err);
3220 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3221 
3222 	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3223 	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3224 	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3225 	sbuf_printf(sb, " TUN=%u TOE=%u generation=%u uP token=%u valid=%u\n",
3226 	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3227 	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3228 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3229 	    txq->txq_dump_start,
3230 	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3231 
3232 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3233 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3234 		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3235 		WR = (uint32_t *)txd->flit;
3236 		wr_hi = ntohl(WR[0]);
3237 		wr_lo = ntohl(WR[1]);
3238 		gen = G_WR_GEN(wr_lo);
3239 
3240 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3241 		    wr_hi, wr_lo, gen);
3242 		for (j = 2; j < 30; j += 4)
3243 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3244 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3245 
3246 	}
3247 	err = sbuf_finish(sb);
3248 	sbuf_delete(sb);
3249 	return (err);
3250 }
3251 
3252 static int
3253 t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3254 {
3255 	struct sge_txq *txq;
3256 	struct sge_qset *qs;
3257 	int i, j, err, dump_end;
3258 	struct sbuf *sb;
3259 	struct tx_desc *txd;
3260 	uint32_t *WR, wr_hi, wr_lo, gen;
3261 
3262 	txq = arg1;
3263 	qs = txq_to_qset(txq, TXQ_CTRL);
3264 	if (txq->txq_dump_count == 0) {
3265 		return (0);
3266 	}
3267 	if (txq->txq_dump_count > 256) {
3268 		log(LOG_WARNING,
3269 		    "dump count is too large %d\n", txq->txq_dump_count);
3270 		txq->txq_dump_count = 1;
3271 		return (EINVAL);
3272 	}
3273 	if (txq->txq_dump_start > 255) {
3274 		log(LOG_WARNING,
3275 		    "dump start of %d is greater than queue size\n",
3276 		    txq->txq_dump_start);
3277 		txq->txq_dump_start = 0;
3278 		return (EINVAL);
3279 	}
3280 
3281 	err = sysctl_wire_old_buffer(req, 0);
3282 	if (err != 0)
3283 		return (err);
3284 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3285 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3286 	    txq->txq_dump_start,
3287 	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3288 
3289 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3290 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3291 		txd = &txq->desc[i & (255)];
3292 		WR = (uint32_t *)txd->flit;
3293 		wr_hi = ntohl(WR[0]);
3294 		wr_lo = ntohl(WR[1]);
3295 		gen = G_WR_GEN(wr_lo);
3296 
3297 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3298 		    wr_hi, wr_lo, gen);
3299 		for (j = 2; j < 30; j += 4)
3300 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3301 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3302 
3303 	}
3304 	err = sbuf_finish(sb);
3305 	sbuf_delete(sb);
3306 	return (err);
3307 }
3308 
3309 static int
3310 t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3311 {
3312 	adapter_t *sc = arg1;
3313 	struct qset_params *qsp = &sc->params.sge.qset[0];
3314 	int coalesce_usecs;
3315 	struct sge_qset *qs;
3316 	int i, j, err, nqsets = 0;
3317 	struct mtx *lock;
3318 
3319 	if ((sc->flags & FULL_INIT_DONE) == 0)
3320 		return (ENXIO);
3321 
3322 	coalesce_usecs = qsp->coalesce_usecs;
3323 	err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3324 
3325 	if (err != 0) {
3326 		return (err);
3327 	}
3328 	if (coalesce_usecs == qsp->coalesce_usecs)
3329 		return (0);
3330 
3331 	for (i = 0; i < sc->params.nports; i++)
3332 		for (j = 0; j < sc->port[i].nqsets; j++)
3333 			nqsets++;
3334 
3335 	coalesce_usecs = max(1, coalesce_usecs);
3336 
3337 	for (i = 0; i < nqsets; i++) {
3338 		qs = &sc->sge.qs[i];
3339 		qsp = &sc->params.sge.qset[i];
3340 		qsp->coalesce_usecs = coalesce_usecs;
3341 
3342 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3343 			    &sc->sge.qs[0].rspq.lock;
3344 
3345 		mtx_lock(lock);
3346 		t3_update_qset_coalesce(qs, qsp);
3347 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3348 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3349 		mtx_unlock(lock);
3350 	}
3351 
3352 	return (0);
3353 }
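/*
 * Editorial usage note (illustrative; the exact sysctl node depends on how
 * the controller device attaches, e.g. dev.cxgbc.0 on a typical system):
 *
 *	# sysctl dev.cxgbc.0.intr_coal=50
 *
 * would request a 50us holdoff on every queue set; the handler above clamps
 * the value to at least 1 and rewrites A_SG_GTS for each response queue.
 */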
3354 
3355 static int
3356 t3_pkt_timestamp(SYSCTL_HANDLER_ARGS)
3357 {
3358 	adapter_t *sc = arg1;
3359 	int rc, timestamp;
3360 
3361 	if ((sc->flags & FULL_INIT_DONE) == 0)
3362 		return (ENXIO);
3363 
3364 	timestamp = sc->timestamp;
3365 	rc = sysctl_handle_int(oidp, &timestamp, arg2, req);
3366 
3367 	if (rc != 0)
3368 		return (rc);
3369 
3370 	if (timestamp != sc->timestamp) {
3371 		t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS,
3372 		    timestamp ? F_ENABLERXPKTTMSTPRSS : 0);
3373 		sc->timestamp = timestamp;
3374 	}
3375 
3376 	return (0);
3377 }
3378 
3379 void
3380 t3_add_attach_sysctls(adapter_t *sc)
3381 {
3382 	struct sysctl_ctx_list *ctx;
3383 	struct sysctl_oid_list *children;
3384 
3385 	ctx = device_get_sysctl_ctx(sc->dev);
3386 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3387 
3388 	/* random information */
3389 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3390 	    "firmware_version",
3391 	    CTLFLAG_RD, sc->fw_version,
3392 	    0, "firmware version");
3393 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3394 	    "hw_revision",
3395 	    CTLFLAG_RD, &sc->params.rev,
3396 	    0, "chip model");
3397 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3398 	    "port_types",
3399 	    CTLFLAG_RD, sc->port_types,
3400 	    0, "type of ports");
3401 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3402 	    "enable_debug",
3403 	    CTLFLAG_RW, &cxgb_debug,
3404 	    0, "enable verbose debugging output");
3405 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3406 	    CTLFLAG_RD, &sc->tunq_coalesce,
3407 	    "#tunneled packets freed");
3408 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3409 	    "txq_overrun",
3410 	    CTLFLAG_RD, &txq_fills,
3411 	    0, "#times txq overrun");
3412 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3413 	    "core_clock",
3414 	    CTLFLAG_RD, &sc->params.vpd.cclk,
3415 	    0, "core clock frequency (in kHz)");
3416 }
3417 
3418 
3419 static const char *rspq_name = "rspq";
3420 static const char *txq_names[] =
3421 {
3422 	"txq_eth",
3423 	"txq_ofld",
3424 	"txq_ctrl"
3425 };
3426 
3427 static int
3428 sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3429 {
3430 	struct port_info *p = arg1;
3431 	uint64_t *parg;
3432 
3433 	if (!p)
3434 		return (EINVAL);
3435 
3436 	cxgb_refresh_stats(p);
3437 	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3438 
3439 	return (sysctl_handle_64(oidp, parg, 0, req));
3440 }
3441 
3442 void
3443 t3_add_configured_sysctls(adapter_t *sc)
3444 {
3445 	struct sysctl_ctx_list *ctx;
3446 	struct sysctl_oid_list *children;
3447 	int i, j;
3448 
3449 	ctx = device_get_sysctl_ctx(sc->dev);
3450 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3451 
3452 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3453 	    "intr_coal",
3454 	    CTLTYPE_INT|CTLFLAG_RW, sc,
3455 	    0, t3_set_coalesce_usecs,
3456 	    "I", "interrupt coalescing timer (us)");
3457 
3458 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3459 	    "pkt_timestamp",
3460 	    CTLTYPE_INT | CTLFLAG_RW, sc,
3461 	    0, t3_pkt_timestamp,
3462 	    "I", "provide packet timestamp instead of connection hash");
3463 
3464 	for (i = 0; i < sc->params.nports; i++) {
3465 		struct port_info *pi = &sc->port[i];
3466 		struct sysctl_oid *poid;
3467 		struct sysctl_oid_list *poidlist;
3468 		struct mac_stats *mstats = &pi->mac.stats;
3469 
3470 		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3471 		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3472 		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3473 		poidlist = SYSCTL_CHILDREN(poid);
3474 		SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO,
3475 		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3476 		    0, "#queue sets");
3477 
3478 		for (j = 0; j < pi->nqsets; j++) {
3479 			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3480 			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3481 					  *ctrlqpoid, *lropoid;
3482 			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3483 					       *txqpoidlist, *ctrlqpoidlist,
3484 					       *lropoidlist;
3485 			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3486 
3487 			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3488 
3489 			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3490 			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3491 			qspoidlist = SYSCTL_CHILDREN(qspoid);
3492 
3493 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3494 					CTLFLAG_RD, &qs->fl[0].empty, 0,
3495 					"freelist #0 empty");
3496 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3497 					CTLFLAG_RD, &qs->fl[1].empty, 0,
3498 					"freelist #1 empty");
3499 
3500 			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3501 			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3502 			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3503 
3504 			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3505 			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3506 			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3507 
3508 			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3509 			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3510 			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3511 
3512 			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3513 			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3514 			lropoidlist = SYSCTL_CHILDREN(lropoid);
3515 
3516 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3517 			    CTLFLAG_RD, &qs->rspq.size,
3518 			    0, "#entries in response queue");
3519 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3520 			    CTLFLAG_RD, &qs->rspq.cidx,
3521 			    0, "consumer index");
3522 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3523 			    CTLFLAG_RD, &qs->rspq.credits,
3524 			    0, "#credits");
3525 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved",
3526 			    CTLFLAG_RD, &qs->rspq.starved,
3527 			    0, "#times starved");
3528 			SYSCTL_ADD_UAUTO(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3529 			    CTLFLAG_RD, &qs->rspq.phys_addr,
3530 	    "physical address of the queue");
3531 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3532 			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3533 			    0, "start rspq dump entry");
3534 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3535 			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3536 			    0, "#rspq entries to dump");
3537 			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3538 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3539 			    0, t3_dump_rspq, "A", "dump of the response queue");
3540 
3541 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped",
3542 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops,
3543 			    "#tunneled packets dropped");
3544 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3545 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.mq_len,
3546 			    0, "#tunneled packets waiting to be sent");
3547 #if 0
3548 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3549 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3550 			    0, "#tunneled packets queue producer index");
3551 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3552 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3553 			    0, "#tunneled packets queue consumer index");
3554 #endif
3555 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed",
3556 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3557 			    0, "#tunneled packets processed by the card");
3558 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3559 			    CTLFLAG_RD, &txq->cleaned,
3560 			    0, "#tunneled packets cleaned");
3561 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3562 			    CTLFLAG_RD, &txq->in_use,
3563 			    0, "#tunneled packet slots in use");
3564 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "frees",
3565 			    CTLFLAG_RD, &txq->txq_frees,
3566 			    "#tunneled packets freed");
3567 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3568 			    CTLFLAG_RD, &txq->txq_skipped,
3569 			    0, "#tunneled packet descriptors skipped");
3570 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3571 			    CTLFLAG_RD, &txq->txq_coalesced,
3572 			    "#tunneled packets coalesced");
3573 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3574 			    CTLFLAG_RD, &txq->txq_enqueued,
3575 			    0, "#tunneled packets enqueued to hardware");
3576 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3577 			    CTLFLAG_RD, &qs->txq_stopped,
3578 			    0, "bitmap of stopped tx queues");
3579 			SYSCTL_ADD_UAUTO(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3580 			    CTLFLAG_RD, &txq->phys_addr,
3581 			    "physical address of the queue");
3582 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3583 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3584 			    0, "txq generation");
3585 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3586 			    CTLFLAG_RD, &txq->cidx,
3587 			    0, "hardware queue cidx");
3588 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3589 			    CTLFLAG_RD, &txq->pidx,
3590 			    0, "hardware queue pidx");
3591 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3592 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3593 			    0, "txq start idx for dump");
3594 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3595 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3596 			    0, "txq #entries to dump");
3597 			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3598 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3599 			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3600 
3601 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3602 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3603 			    0, "ctrlq start idx for dump");
3604 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3605 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3606 			    0, "ctrlq #entries to dump");
3607 			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3608 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3609 			    0, t3_dump_txq_ctrl, "A", "dump of the control queue");
3610 
3611 			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_queued",
3612 			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, "#packets queued by LRO");
3613 			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3614 			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, "#packets flushed by LRO");
3615 			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3616 			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, "#LRO bad checksums");
3617 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3618 			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, "#LRO entries");
3619 		}
3620 
3621 		/* Now add a node for mac stats. */
3622 		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3623 		    CTLFLAG_RD, NULL, "MAC statistics");
3624 		poidlist = SYSCTL_CHILDREN(poid);
3625 
3626 		/*
3627 		 * We (ab)use the length argument (arg2) to pass on the offset
3628 		 * of the data that we are interested in.  This is only required
3629 		 * for the quad counters that are updated from the hardware (we
3630 		 * make sure that we return the latest value).
3631 		 * sysctl_handle_macstat first updates *all* the counters from
3632 		 * the hardware, and then returns the latest value of the
3633 		 * requested counter.  Best would be to update only the
3634 		 * requested counter from hardware, but t3_mac_update_stats()
3635 		 * hides all the register details and we don't want to dive into
3636 		 * all that here.
3637 		 */
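		/*
		 * Illustrative sketch only (compiled out, not the driver's
		 * code): the general shape an offset-based handler like the
		 * one described above can take.  The name
		 * example_handle_macstat and the locking shown here are
		 * assumptions for illustration; the real
		 * sysctl_handle_macstat is defined earlier in this file.
		 */
#if 0
static int
example_handle_macstat(SYSCTL_HANDLER_ARGS)
{
	struct port_info *p = arg1;
	uint64_t *val;

	/*
	 * arg2 carries offsetof(struct mac_stats, <counter>), so a single
	 * handler can serve every quad counter: refresh all counters from
	 * the hardware, then hand the requested one to sysctl_handle_64().
	 */
	PORT_LOCK(p);
	t3_mac_update_stats(&p->mac);
	PORT_UNLOCK(p);
	val = (uint64_t *)((uintptr_t)&p->mac.stats + arg2);

	return (sysctl_handle_64(oidp, val, 0, req));
}
#endif
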
3638 #define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3639     (CTLTYPE_U64 | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
3640     sysctl_handle_macstat, "QU", 0)
3641 		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3642 		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3643 		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3644 		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3645 		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3646 		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3647 		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3648 		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3649 		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3650 		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3651 		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3652 		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3653 		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3654 		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3655 		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3656 		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3657 		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3658 		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3659 		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3660 		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3661 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3662 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3663 		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3664 		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3665 		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3666 		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3667 		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3668 		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3669 		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3670 		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3671 		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3672 		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3673 		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3674 		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3675 		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3676 		CXGB_SYSCTL_ADD_QUAD(rx_short);
3677 		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3678 		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3679 		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3680 		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3681 		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3682 		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3683 		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3684 		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3685 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3686 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3687 #undef CXGB_SYSCTL_ADD_QUAD
3688 
3689 #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3690     CTLFLAG_RD, &mstats->a, 0)
3691 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3692 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3693 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3694 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3695 		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3696 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3697 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3698 		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3699 		CXGB_SYSCTL_ADD_ULONG(num_resets);
3700 		CXGB_SYSCTL_ADD_ULONG(link_faults);
3701 #undef CXGB_SYSCTL_ADD_ULONG
3702 	}
3703 }
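
/*
 * Userland illustration only (compiled out, not part of the driver): one way
 * to read a per-qset counter registered above, using sysctlbyname(3).  The
 * OID string below is an assumption -- the exact prefix depends on the
 * device unit and on the node names ("qs%d", txq_names[0], ...) used above.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t processed;
	size_t len = sizeof(processed);

	if (sysctlbyname("dev.cxgb.0.qs0.txq_eth.processed", &processed,
	    &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("tunneled packets processed: %u\n", processed);
	return (0);
}
#endif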
3704 
3705 /**
3706  *	t3_get_desc - dump an SGE descriptor for debugging purposes
3707  *	@qs: the queue set
3708  *	@qnum: identifies the specific queue (0..2: Tx, 3: response, 4..5: Rx)
3709  *	@idx: the descriptor index in the queue
3710  *	@data: where to dump the descriptor contents
3711  *
3712  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3713  *	size of the descriptor.
3714  */
3715 int
3716 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3717 		unsigned char *data)
3718 {
3719 	if (qnum >= 6)
3720 		return (EINVAL);
3721 
3722 	if (qnum < 3) {
3723 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3724 			return (EINVAL);
3725 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3726 		return (sizeof(struct tx_desc));
3727 	}
3728 
3729 	if (qnum == 3) {
3730 		if (!qs->rspq.desc || idx >= qs->rspq.size)
3731 			return (EINVAL);
3732 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3733 		return (sizeof(struct rsp_desc));
3734 	}
3735 
3736 	qnum -= 4;
3737 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3738 		return (EINVAL);
3739 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3740 	return (sizeof(struct rx_desc));
3741 }
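
/*
 * Illustrative sketch only (compiled out): a hypothetical helper showing how
 * t3_get_desc() is intended to be called -- select the queue with qnum
 * (0..2: Tx, 3: response, 4..5: free lists), supply a buffer large enough
 * for the descriptor, and treat anything other than the descriptor size as
 * failure.  The name example_dump_eth_txd is not part of the driver.
 */
#if 0
static void
example_dump_eth_txd(const struct sge_qset *qs, unsigned int idx)
{
	unsigned char buf[sizeof(struct tx_desc)];
	int i, len;

	len = t3_get_desc(qs, TXQ_ETH, idx, buf);
	if (len != (int)sizeof(struct tx_desc))
		return;
	for (i = 0; i < len; i++)
		printf("%02x%c", buf[i], ((i + 1) % 16) ? ' ' : '\n');
	printf("\n");
}
#endif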
3742