xref: /freebsd/sys/dev/cxgbe/tom/t4_listen.c (revision a0ee8cc6)
1 /*-
2  * Copyright (c) 2012 Chelsio Communications, Inc.
3  * All rights reserved.
4  * Written by: Navdeep Parhar <np@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 
34 #ifdef TCP_OFFLOAD
35 #include <sys/param.h>
36 #include <sys/types.h>
37 #include <sys/kernel.h>
38 #include <sys/ktr.h>
39 #include <sys/module.h>
40 #include <sys/protosw.h>
41 #include <sys/refcount.h>
42 #include <sys/domain.h>
43 #include <sys/fnv_hash.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <net/ethernet.h>
47 #include <net/if.h>
48 #include <net/if_types.h>
49 #include <net/if_vlan_var.h>
50 #include <net/route.h>
51 #include <netinet/in.h>
52 #include <netinet/in_fib.h>
53 #include <netinet/in_pcb.h>
54 #include <netinet/ip.h>
55 #include <netinet/ip6.h>
56 #include <netinet6/in6_fib.h>
57 #include <netinet6/scope6_var.h>
58 #include <netinet/tcp_timer.h>
59 #include <netinet/tcp_var.h>
60 #define TCPSTATES
61 #include <netinet/tcp_fsm.h>
62 #include <netinet/toecore.h>
63 
64 #include "common/common.h"
65 #include "common/t4_msg.h"
66 #include "common/t4_regs.h"
67 #include "tom/t4_tom_l2t.h"
68 #include "tom/t4_tom.h"
69 
70 /* stid services */
71 static int alloc_stid(struct adapter *, struct listen_ctx *, int);
72 static struct listen_ctx *lookup_stid(struct adapter *, int);
73 static void free_stid(struct adapter *, struct listen_ctx *);
74 
75 /* lctx services */
76 static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
77     struct vi_info *);
78 static int free_lctx(struct adapter *, struct listen_ctx *);
79 static void hold_lctx(struct listen_ctx *);
80 static void listen_hash_add(struct adapter *, struct listen_ctx *);
81 static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
82 static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
83 static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
84 
85 static inline void save_qids_in_mbuf(struct mbuf *, struct vi_info *);
86 static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *);
87 static void send_reset_synqe(struct toedev *, struct synq_entry *);
88 
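/*
 * Server tid (stid) allocator.  A hardware listener needs 1 stid for IPv4 or
 * 2 naturally aligned stids for IPv6.  Free space is tracked as a single
 * region at the front of the range (t->nstids_free_head) plus the free space
 * that trails each allocated stid_region on the t->stids TAILQ.  For example,
 * with nstids_free_head = 5 an IPv6 allocation (n = 2) burns one stid for
 * alignment and returns stid 2 (plus stid_base), leaving 2 stids free at the
 * front.  Returns stid_base + stid on success, -1 if there isn't enough room.
 */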
89 static int
90 alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
91 {
92 	struct tid_info *t = &sc->tids;
93 	u_int stid, n, f, mask;
94 	struct stid_region *sr = &lctx->stid_region;
95 
96 	/*
97 	 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
98 	 * the TCAM.  The start of the stid region is properly aligned (the chip
99 	 * requires each region to be 128-cell aligned).
100 	 */
101 	n = isipv6 ? 2 : 1;
102 	mask = n - 1;
103 	KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
104 	    ("%s: stid region (%u, %u) not properly aligned.  n = %u",
105 	    __func__, t->stid_base, t->nstids, n));
106 
107 	mtx_lock(&t->stid_lock);
108 	if (n > t->nstids - t->stids_in_use) {
109 		mtx_unlock(&t->stid_lock);
110 		return (-1);
111 	}
112 
113 	if (t->nstids_free_head >= n) {
114 		/*
115 		 * This allocation will definitely succeed because the region
116 		 * starts at a good alignment and we just checked we have enough
117 		 * stids free.
118 		 */
119 		f = t->nstids_free_head & mask;
120 		t->nstids_free_head -= n + f;
121 		stid = t->nstids_free_head;
122 		TAILQ_INSERT_HEAD(&t->stids, sr, link);
123 	} else {
124 		struct stid_region *s;
125 
126 		stid = t->nstids_free_head;
127 		TAILQ_FOREACH(s, &t->stids, link) {
128 			stid += s->used + s->free;
129 			f = stid & mask;
130 			if (s->free >= n + f) {
131 				stid -= n + f;
132 				s->free -= n + f;
133 				TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
134 				goto allocated;
135 			}
136 		}
137 
138 		if (__predict_false(stid != t->nstids)) {
139 			panic("%s: stids TAILQ (%p) corrupt."
140 			    "  At %d instead of %d at the end of the queue.",
141 			    __func__, &t->stids, stid, t->nstids);
142 		}
143 
144 		mtx_unlock(&t->stid_lock);
145 		return (-1);
146 	}
147 
148 allocated:
149 	sr->used = n;
150 	sr->free = f;
151 	t->stids_in_use += n;
152 	t->stid_tab[stid] = lctx;
153 	mtx_unlock(&t->stid_lock);
154 
155 	KASSERT(((stid + t->stid_base) & mask) == 0,
156 	    ("%s: EDOOFUS.", __func__));
157 	return (stid + t->stid_base);
158 }
159 
160 static struct listen_ctx *
161 lookup_stid(struct adapter *sc, int stid)
162 {
163 	struct tid_info *t = &sc->tids;
164 
165 	return (t->stid_tab[stid - t->stid_base]);
166 }
167 
168 static void
169 free_stid(struct adapter *sc, struct listen_ctx *lctx)
170 {
171 	struct tid_info *t = &sc->tids;
172 	struct stid_region *sr = &lctx->stid_region;
173 	struct stid_region *s;
174 
175 	KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));
176 
177 	mtx_lock(&t->stid_lock);
178 	s = TAILQ_PREV(sr, stid_head, link);
179 	if (s != NULL)
180 		s->free += sr->used + sr->free;
181 	else
182 		t->nstids_free_head += sr->used + sr->free;
183 	KASSERT(t->stids_in_use >= sr->used,
184 	    ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
185 	    t->stids_in_use, sr->used));
186 	t->stids_in_use -= sr->used;
187 	TAILQ_REMOVE(&t->stids, sr, link);
188 	mtx_unlock(&t->stid_lock);
189 }
190 
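/*
 * Allocate and initialize a listen_ctx for the listening inp: reserve an
 * stid, take a hold on the local IPv6 address (hold_lip) if the socket is
 * bound to a specific IPv6 address, record the control queue and offload rx
 * queue to use for this listener, and acquire a reference on the inp.
 */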
191 static struct listen_ctx *
192 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
193 {
194 	struct listen_ctx *lctx;
195 
196 	INP_WLOCK_ASSERT(inp);
197 
198 	lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
199 	if (lctx == NULL)
200 		return (NULL);
201 
202 	lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
203 	if (lctx->stid < 0) {
204 		free(lctx, M_CXGBE);
205 		return (NULL);
206 	}
207 
208 	if (inp->inp_vflag & INP_IPV6 &&
209 	    !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
210 		struct tom_data *td = sc->tom_softc;
211 
212 		lctx->ce = hold_lip(td, &inp->in6p_laddr);
213 		if (lctx->ce == NULL) {
214 			free(lctx, M_CXGBE);
215 			return (NULL);
216 		}
217 	}
218 
219 	lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
220 	lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
221 	refcount_init(&lctx->refcount, 1);
222 	TAILQ_INIT(&lctx->synq);
223 
224 	lctx->inp = inp;
225 	in_pcbref(inp);
226 
227 	return (lctx);
228 }
229 
230 /* Don't call this directly, use release_lctx instead */
231 static int
232 free_lctx(struct adapter *sc, struct listen_ctx *lctx)
233 {
234 	struct inpcb *inp = lctx->inp;
235 	struct tom_data *td = sc->tom_softc;
236 
237 	INP_WLOCK_ASSERT(inp);
238 	KASSERT(lctx->refcount == 0,
239 	    ("%s: refcount %d", __func__, lctx->refcount));
240 	KASSERT(TAILQ_EMPTY(&lctx->synq),
241 	    ("%s: synq not empty.", __func__));
242 	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
243 
244 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
245 	    __func__, lctx->stid, lctx, lctx->inp);
246 
247 	if (lctx->ce)
248 		release_lip(td, lctx->ce);
249 	free_stid(sc, lctx);
250 	free(lctx, M_CXGBE);
251 
252 	return (in_pcbrele_wlocked(inp));
253 }
254 
255 static void
256 hold_lctx(struct listen_ctx *lctx)
257 {
258 
259 	refcount_acquire(&lctx->refcount);
260 }
261 
262 static inline uint32_t
263 listen_hashfn(void *key, u_long mask)
264 {
265 
266 	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
267 }
268 
269 /*
270  * Add a listen_ctx entry to the listen hash table.
271  */
272 static void
273 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
274 {
275 	struct tom_data *td = sc->tom_softc;
276 	int bucket = listen_hashfn(lctx->inp, td->listen_mask);
277 
278 	mtx_lock(&td->lctx_hash_lock);
279 	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
280 	td->lctx_count++;
281 	mtx_unlock(&td->lctx_hash_lock);
282 }
283 
284 /*
285  * Look for the listening socket's context entry in the hash and return it.
286  */
287 static struct listen_ctx *
288 listen_hash_find(struct adapter *sc, struct inpcb *inp)
289 {
290 	struct tom_data *td = sc->tom_softc;
291 	int bucket = listen_hashfn(inp, td->listen_mask);
292 	struct listen_ctx *lctx;
293 
294 	mtx_lock(&td->lctx_hash_lock);
295 	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
296 		if (lctx->inp == inp)
297 			break;
298 	}
299 	mtx_unlock(&td->lctx_hash_lock);
300 
301 	return (lctx);
302 }
303 
304 /*
305  * Removes the listen_ctx structure for inp from the hash and returns it.
306  */
307 static struct listen_ctx *
308 listen_hash_del(struct adapter *sc, struct inpcb *inp)
309 {
310 	struct tom_data *td = sc->tom_softc;
311 	int bucket = listen_hashfn(inp, td->listen_mask);
312 	struct listen_ctx *lctx, *l;
313 
314 	mtx_lock(&td->lctx_hash_lock);
315 	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
316 		if (lctx->inp == inp) {
317 			LIST_REMOVE(lctx, link);
318 			td->lctx_count--;
319 			break;
320 		}
321 	}
322 	mtx_unlock(&td->lctx_hash_lock);
323 
324 	return (lctx);
325 }
326 
327 /*
328  * Releases a hold on the lctx.  Must be called with the listening socket's inp
329  * locked.  The inp may be freed by this function and it returns NULL to
330  * indicate this.
331  */
332 static struct inpcb *
333 release_lctx(struct adapter *sc, struct listen_ctx *lctx)
334 {
335 	struct inpcb *inp = lctx->inp;
336 	int inp_freed = 0;
337 
338 	INP_WLOCK_ASSERT(inp);
339 	if (refcount_release(&lctx->refcount))
340 		inp_freed = free_lctx(sc, lctx);
341 
342 	return (inp_freed ? NULL : inp);
343 }
344 
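/*
 * Reset an embryonic connection that is still on the listener's synq.  A
 * single wrqe carrying a minimal FLOWC followed by a CPL_ABORT_REQ is sent
 * via the connection's L2T entry; the reply to the abort performs the final
 * cleanup (see do_abort_rpl_synqe).
 */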
345 static void
346 send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
347 {
348 	struct adapter *sc = tod->tod_softc;
349 	struct mbuf *m = synqe->syn;
350 	struct ifnet *ifp = m->m_pkthdr.rcvif;
351 	struct vi_info *vi = ifp->if_softc;
352 	struct port_info *pi = vi->pi;
353 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
354 	struct wrqe *wr;
355 	struct fw_flowc_wr *flowc;
356 	struct cpl_abort_req *req;
357 	int txqid, rxqid, flowclen;
358 	struct sge_wrq *ofld_txq;
359 	struct sge_ofld_rxq *ofld_rxq;
360 	const int nparams = 6;
361 	unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN;
362 
363 	INP_WLOCK_ASSERT(synqe->lctx->inp);
364 
365 	CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
366 	    __func__, synqe, synqe->flags, synqe->tid,
367 	    synqe->flags & TPF_ABORT_SHUTDOWN ?
368 	    " (abort already in progress)" : "");
369 	if (synqe->flags & TPF_ABORT_SHUTDOWN)
370 		return;	/* abort already in progress */
371 	synqe->flags |= TPF_ABORT_SHUTDOWN;
372 
373 	get_qids_from_mbuf(m, &txqid, &rxqid);
374 	ofld_txq = &sc->sge.ofld_txq[txqid];
375 	ofld_rxq = &sc->sge.ofld_rxq[rxqid];
376 
377 	/* The wrqe will have two WRs - a flowc followed by an abort_req */
378 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
379 
380 	wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
381 	if (wr == NULL) {
382 		/* XXX */
383 		panic("%s: allocation failure.", __func__);
384 	}
385 	flowc = wrtod(wr);
386 	req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE));
387 
388 	/* First the flowc ... */
389 	memset(flowc, 0, wr->wr_len);
390 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
391 	    V_FW_FLOWC_WR_NPARAMS(nparams));
392 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
393 	    V_FW_WR_FLOWID(synqe->tid));
394 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
395 	flowc->mnemval[0].val = htobe32(pfvf);
396 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
397 	flowc->mnemval[1].val = htobe32(pi->tx_chan);
398 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
399 	flowc->mnemval[2].val = htobe32(pi->tx_chan);
400 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
401 	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
402 	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
403 	flowc->mnemval[4].val = htobe32(512);
404 	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
405 	flowc->mnemval[5].val = htobe32(512);
406 	synqe->flags |= TPF_FLOWC_WR_SENT;
407 
408 	/* ... then ABORT request */
409 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
410 	req->rsvd0 = 0;	/* don't have a snd_nxt */
411 	req->rsvd1 = 1;	/* no data sent yet */
412 	req->cmd = CPL_ABORT_SEND_RST;
413 
414 	t4_l2t_send(sc, wr, e);
415 }
416 
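/*
 * Program a hardware listener.  CPL_PASS_OPEN_REQ (and the IPv6 variant
 * below) asks the chip to match SYNs for the listening address/port and to
 * deliver them to this listener's offload rx queue; CPL_CLOSE_LISTSRV_REQ
 * (destroy_server) removes the hardware listener.  All of these requests go
 * out on the port's control queue.
 */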
417 static int
418 create_server(struct adapter *sc, struct listen_ctx *lctx)
419 {
420 	struct wrqe *wr;
421 	struct cpl_pass_open_req *req;
422 	struct inpcb *inp = lctx->inp;
423 
424 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
425 	if (wr == NULL) {
426 		log(LOG_ERR, "%s: allocation failure", __func__);
427 		return (ENOMEM);
428 	}
429 	req = wrtod(wr);
430 
431 	INIT_TP_WR(req, 0);
432 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
433 	req->local_port = inp->inp_lport;
434 	req->peer_port = 0;
435 	req->local_ip = inp->inp_laddr.s_addr;
436 	req->peer_ip = 0;
437 	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
438 	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
439 	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
440 
441 	t4_wrq_tx(sc, wr);
442 	return (0);
443 }
444 
445 static int
446 create_server6(struct adapter *sc, struct listen_ctx *lctx)
447 {
448 	struct wrqe *wr;
449 	struct cpl_pass_open_req6 *req;
450 	struct inpcb *inp = lctx->inp;
451 
452 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
453 	if (wr == NULL) {
454 		log(LOG_ERR, "%s: allocation failure", __func__);
455 		return (ENOMEM);
456 	}
457 	req = wrtod(wr);
458 
459 	INIT_TP_WR(req, 0);
460 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
461 	req->local_port = inp->inp_lport;
462 	req->peer_port = 0;
463 	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
464 	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
465 	req->peer_ip_hi = 0;
466 	req->peer_ip_lo = 0;
467 	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
468 	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
469 	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
470 
471 	t4_wrq_tx(sc, wr);
472 	return (0);
473 }
474 
475 static int
476 destroy_server(struct adapter *sc, struct listen_ctx *lctx)
477 {
478 	struct wrqe *wr;
479 	struct cpl_close_listsvr_req *req;
480 
481 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
482 	if (wr == NULL) {
483 		/* XXX */
484 		panic("%s: allocation failure.", __func__);
485 	}
486 	req = wrtod(wr);
487 
488 	INIT_TP_WR(req, 0);
489 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
490 	    lctx->stid));
491 	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
492 	req->rsvd = htobe16(0);
493 
494 	t4_wrq_tx(sc, wr);
495 	return (0);
496 }
497 
498 /*
499  * Start a listening server by sending a passive open request to HW.
500  *
501  * We can't take the adapter lock here, so accesses to sc->flags,
502  * sc->offload_map, and if_capenable are all race prone.
503  */
504 int
505 t4_listen_start(struct toedev *tod, struct tcpcb *tp)
506 {
507 	struct adapter *sc = tod->tod_softc;
508 	struct vi_info *vi;
509 	struct port_info *pi;
510 	struct inpcb *inp = tp->t_inpcb;
511 	struct listen_ctx *lctx;
512 	int i, rc, v;
513 
514 	INP_WLOCK_ASSERT(inp);
515 
516 	/* Don't start a hardware listener for any loopback address. */
517 	if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
518 		return (0);
519 	if (!(inp->inp_vflag & INP_IPV6) &&
520 	    IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
521 		return (0);
522 #if 0
523 	ADAPTER_LOCK(sc);
524 	if (IS_BUSY(sc)) {
525 		log(LOG_ERR, "%s: listen request ignored, %s is busy",
526 		    __func__, device_get_nameunit(sc->dev));
527 		goto done;
528 	}
529 
530 	KASSERT(uld_active(sc, ULD_TOM),
531 	    ("%s: TOM not initialized", __func__));
532 #endif
533 
534 	/*
535 	 * Find a running VI with IFCAP_TOE (4 or 6).  We'll use the first
536 	 * such VI's queues to send the passive open and receive the reply to
537 	 * it.
538 	 *
539 	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
540 	 * then reject any attempt to bring down such a port (and maybe reject
541 	 * attempts to disable IFCAP_TOE on that port too?).
542 	 */
543 	for_each_port(sc, i) {
544 		pi = sc->port[i];
545 		for_each_vi(pi, v, vi) {
546 			if (vi->ifp->if_drv_flags & IFF_DRV_RUNNING &&
547 			    vi->ifp->if_capenable & IFCAP_TOE)
548 				goto found;
549 		}
550 	}
551 	goto done;	/* no port that's UP with IFCAP_TOE enabled */
552 found:
553 
554 	if (listen_hash_find(sc, inp) != NULL)
555 		goto done;	/* already setup */
556 
557 	lctx = alloc_lctx(sc, inp, vi);
558 	if (lctx == NULL) {
559 		log(LOG_ERR,
560 		    "%s: listen request ignored, %s couldn't allocate lctx\n",
561 		    __func__, device_get_nameunit(sc->dev));
562 		goto done;
563 	}
564 	listen_hash_add(sc, lctx);
565 
566 	CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
567 	    __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
568 	    inp->inp_vflag);
569 
570 	if (inp->inp_vflag & INP_IPV6)
571 		rc = create_server6(sc, lctx);
572 	else
573 		rc = create_server(sc, lctx);
574 	if (rc != 0) {
575 		log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
576 		    __func__, device_get_nameunit(sc->dev), rc);
577 		(void) listen_hash_del(sc, inp);
578 		inp = release_lctx(sc, lctx);
579 		/* can't be freed, host stack has a reference */
580 		KASSERT(inp != NULL, ("%s: inp freed", __func__));
581 		goto done;
582 	}
583 	lctx->flags |= LCTX_RPL_PENDING;
584 done:
585 #if 0
586 	ADAPTER_UNLOCK(sc);
587 #endif
588 	return (0);
589 }
590 
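/*
 * Stop the hardware listener for this inp, if there is one.  If the reply to
 * the PASS_OPEN is still pending then cleanup is deferred to
 * do_pass_open_rpl; otherwise any embryonic connections still on the synq
 * are reset and a close request is sent to the chip.
 */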
591 int
592 t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
593 {
594 	struct listen_ctx *lctx;
595 	struct adapter *sc = tod->tod_softc;
596 	struct inpcb *inp = tp->t_inpcb;
597 	struct synq_entry *synqe;
598 
599 	INP_WLOCK_ASSERT(inp);
600 
601 	lctx = listen_hash_del(sc, inp);
602 	if (lctx == NULL)
603 		return (ENOENT);	/* no hardware listener for this inp */
604 
605 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
606 	    lctx, lctx->flags);
607 
608 	/*
609 	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
610 	 * arrive and clean up when it does.
611 	 */
612 	if (lctx->flags & LCTX_RPL_PENDING) {
613 		KASSERT(TAILQ_EMPTY(&lctx->synq),
614 		    ("%s: synq not empty.", __func__));
615 		return (EINPROGRESS);
616 	}
617 
618 	/*
619 	 * The host stack will abort all the connections on the listening
620 	 * socket's so_comp.  It doesn't know about the connections on the synq
621 	 * so we need to take care of those.
622 	 */
623 	TAILQ_FOREACH(synqe, &lctx->synq, link) {
624 		if (synqe->flags & TPF_SYNQE_HAS_L2TE)
625 			send_reset_synqe(tod, synqe);
626 	}
627 
628 	destroy_server(sc, lctx);
629 	return (0);
630 }
631 
632 static inline void
633 hold_synqe(struct synq_entry *synqe)
634 {
635 
636 	refcount_acquire(&synqe->refcnt);
637 }
638 
639 static inline void
640 release_synqe(struct synq_entry *synqe)
641 {
642 
643 	if (refcount_release(&synqe->refcnt)) {
644 		int needfree = synqe->flags & TPF_SYNQE_NEEDFREE;
645 
646 		m_freem(synqe->syn);
647 		if (needfree)
648 			free(synqe, M_CXGBE);
649 	}
650 }
651 
652 void
653 t4_syncache_added(struct toedev *tod __unused, void *arg)
654 {
655 	struct synq_entry *synqe = arg;
656 
657 	hold_synqe(synqe);
658 }
659 
660 void
661 t4_syncache_removed(struct toedev *tod __unused, void *arg)
662 {
663 	struct synq_entry *synqe = arg;
664 
665 	release_synqe(synqe);
666 }
667 
668 /* XXX */
669 extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);
670 
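/*
 * tod_syncache_respond method: instead of a software SYN|ACK, transmit the
 * prebuilt CPL_PASS_ACCEPT_RPL stashed in synqe->wr.  The reply is consumed
 * at most once; later calls find a NULL wr and return EALREADY.  The peer's
 * ISS and timestamp are saved here for synqe_to_protohdrs to use later.
 */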
671 int
672 t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
673 {
674 	struct adapter *sc = tod->tod_softc;
675 	struct synq_entry *synqe = arg;
676 	struct wrqe *wr;
677 	struct l2t_entry *e;
678 	struct tcpopt to;
679 	struct ip *ip = mtod(m, struct ip *);
680 	struct tcphdr *th;
681 
682 	wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr);
683 	if (wr == NULL) {
684 		m_freem(m);
685 		return (EALREADY);
686 	}
687 
688 	if (ip->ip_v == IPVERSION)
689 		th = (void *)(ip + 1);
690 	else
691 		th = (void *)((struct ip6_hdr *)ip + 1);
692 	bzero(&to, sizeof(to));
693 	tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
694 	    TO_SYN);
695 
696 	/* save these for later */
697 	synqe->iss = be32toh(th->th_seq);
698 	synqe->ts = to.to_tsval;
699 
700 	if (is_t5(sc)) {
701 		struct cpl_t5_pass_accept_rpl *rpl5 = wrtod(wr);
702 
703 		rpl5->iss = th->th_seq;
704 	}
705 
706 	e = &sc->l2t->l2tab[synqe->l2e_idx];
707 	t4_l2t_send(sc, wr, e);
708 
709 	m_freem(m);	/* don't need this any more */
710 	return (0);
711 }
712 
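/*
 * Handler for CPL_PASS_OPEN_RPL, the chip's reply to create_server(6).
 * Clears LCTX_RPL_PENDING and handles the cases where the hardware listener
 * failed to start and/or the listening socket has been closed in the
 * meantime.
 */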
713 static int
714 do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
715     struct mbuf *m)
716 {
717 	struct adapter *sc = iq->adapter;
718 	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
719 	int stid = GET_TID(cpl);
720 	unsigned int status = cpl->status;
721 	struct listen_ctx *lctx = lookup_stid(sc, stid);
722 	struct inpcb *inp = lctx->inp;
723 #ifdef INVARIANTS
724 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
725 #endif
726 
727 	KASSERT(opcode == CPL_PASS_OPEN_RPL,
728 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
729 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
730 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
731 
732 	INP_WLOCK(inp);
733 
734 	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
735 	    __func__, stid, status, lctx->flags);
736 
737 	lctx->flags &= ~LCTX_RPL_PENDING;
738 
739 	if (status != CPL_ERR_NONE)
740 		log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);
741 
742 #ifdef INVARIANTS
743 	/*
744 	 * If the inp has been dropped (listening socket closed) then
745 	 * listen_stop must have run and taken the inp out of the hash.
746 	 */
747 	if (inp->inp_flags & INP_DROPPED) {
748 		KASSERT(listen_hash_del(sc, inp) == NULL,
749 		    ("%s: inp %p still in listen hash", __func__, inp));
750 	}
751 #endif
752 
753 	if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
754 		if (release_lctx(sc, lctx) != NULL)
755 			INP_WUNLOCK(inp);
756 		return (status);
757 	}
758 
759 	/*
760 	 * Listening socket stopped listening earlier and now the chip tells us
761 	 * it has started the hardware listener.  Stop it; the lctx will be
762 	 * released in do_close_server_rpl.
763 	 */
764 	if (inp->inp_flags & INP_DROPPED) {
765 		destroy_server(sc, lctx);
766 		INP_WUNLOCK(inp);
767 		return (status);
768 	}
769 
770 	/*
771 	 * Failed to start hardware listener.  Take inp out of the hash and
772 	 * release our reference on it.  An error message has been logged
773 	 * already.
774 	 */
775 	if (status != CPL_ERR_NONE) {
776 		listen_hash_del(sc, inp);
777 		if (release_lctx(sc, lctx) != NULL)
778 			INP_WUNLOCK(inp);
779 		return (status);
780 	}
781 
782 	/* hardware listener open for business */
783 
784 	INP_WUNLOCK(inp);
785 	return (status);
786 }
787 
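/*
 * Handler for CPL_CLOSE_LISTSRV_RPL, the reply to destroy_server.  On
 * success the lctx is released here, which may also drop the last reference
 * on the listening inp.
 */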
788 static int
789 do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
790     struct mbuf *m)
791 {
792 	struct adapter *sc = iq->adapter;
793 	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
794 	int stid = GET_TID(cpl);
795 	unsigned int status = cpl->status;
796 	struct listen_ctx *lctx = lookup_stid(sc, stid);
797 	struct inpcb *inp = lctx->inp;
798 #ifdef INVARIANTS
799 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
800 #endif
801 
802 	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
803 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
804 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
805 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
806 
807 	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
808 
809 	if (status != CPL_ERR_NONE) {
810 		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
811 		    __func__, status, stid);
812 		return (status);
813 	}
814 
815 	INP_WLOCK(inp);
816 	inp = release_lctx(sc, lctx);
817 	if (inp != NULL)
818 		INP_WUNLOCK(inp);
819 
820 	return (status);
821 }
822 
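/*
 * Final teardown of an embryonic connection: unlink the synqe from the
 * lctx's synq, release the tid and the L2T entry, and drop the synq's
 * references on the synqe and the lctx.  Called with the listening inp's
 * lock held; the lock is released here.
 */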
823 static void
824 done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
825 {
826 	struct listen_ctx *lctx = synqe->lctx;
827 	struct inpcb *inp = lctx->inp;
828 	struct vi_info *vi = synqe->syn->m_pkthdr.rcvif->if_softc;
829 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
830 
831 	INP_WLOCK_ASSERT(inp);
832 
833 	TAILQ_REMOVE(&lctx->synq, synqe, link);
834 	inp = release_lctx(sc, lctx);
835 	if (inp)
836 		INP_WUNLOCK(inp);
837 	remove_tid(sc, synqe->tid);
838 	release_tid(sc, synqe->tid, &sc->sge.ctrlq[vi->pi->port_id]);
839 	t4_l2t_release(e);
840 	release_synqe(synqe);	/* removed from synq list */
841 }
842 
843 int
844 do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
845     struct mbuf *m)
846 {
847 	struct adapter *sc = iq->adapter;
848 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
849 	unsigned int tid = GET_TID(cpl);
850 	struct synq_entry *synqe = lookup_tid(sc, tid);
851 	struct listen_ctx *lctx = synqe->lctx;
852 	struct inpcb *inp = lctx->inp;
853 	int txqid;
854 	struct sge_wrq *ofld_txq;
855 #ifdef INVARIANTS
856 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
857 #endif
858 
859 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
860 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
861 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
862 	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
863 
864 	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
865 	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
866 
867 	if (negative_advice(cpl->status))
868 		return (0);	/* Ignore negative advice */
869 
870 	INP_WLOCK(inp);
871 
872 	get_qids_from_mbuf(synqe->syn, &txqid, NULL);
873 	ofld_txq = &sc->sge.ofld_txq[txqid];
874 
875 	/*
876 	 * If we'd initiated an abort earlier, the reply to it is responsible for
877 	 * cleaning up resources.  Otherwise we tear everything down right here
878 	 * and now.  We owe the T4 a CPL_ABORT_RPL no matter what.
879 	 */
880 	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
881 		INP_WUNLOCK(inp);
882 		goto done;
883 	}
884 
885 	done_with_synqe(sc, synqe);
886 	/* inp lock released by done_with_synqe */
887 done:
888 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
889 	return (0);
890 }
891 
892 int
893 do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
894     struct mbuf *m)
895 {
896 	struct adapter *sc = iq->adapter;
897 	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
898 	unsigned int tid = GET_TID(cpl);
899 	struct synq_entry *synqe = lookup_tid(sc, tid);
900 	struct listen_ctx *lctx = synqe->lctx;
901 	struct inpcb *inp = lctx->inp;
902 #ifdef INVARIANTS
903 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
904 #endif
905 
906 	KASSERT(opcode == CPL_ABORT_RPL_RSS,
907 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
908 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
909 	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
910 
911 	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
912 	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
913 
914 	INP_WLOCK(inp);
915 	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
916 	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
917 	    __func__, synqe, synqe->flags));
918 
919 	done_with_synqe(sc, synqe);
920 	/* inp lock released by done_with_synqe */
921 
922 	return (0);
923 }
924 
925 void
926 t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
927 {
928 	struct adapter *sc = tod->tod_softc;
929 	struct synq_entry *synqe = arg;
930 #ifdef INVARIANTS
931 	struct inpcb *inp = sotoinpcb(so);
932 #endif
933 	struct cpl_pass_establish *cpl = mtod(synqe->syn, void *);
934 	struct toepcb *toep = *(struct toepcb **)(cpl + 1);
935 
936 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
937 	INP_WLOCK_ASSERT(inp);
938 	KASSERT(synqe->flags & TPF_SYNQE,
939 	    ("%s: %p not a synq_entry?", __func__, arg));
940 
941 	offload_socket(so, toep);
942 	make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
943 	toep->flags |= TPF_CPL_PENDING;
944 	update_tid(sc, synqe->tid, toep);
945 	synqe->flags |= TPF_SYNQE_EXPANDED;
946 }
947 
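/*
 * The offload tx and rx queues for an embryonic connection are chosen when
 * the SYN arrives, before there is a toepcb to record them in, so they are
 * stashed in the SYN mbuf's flowid: txqid in the high 16 bits and rxqid in
 * the low 16 bits (e.g. flowid 0x00050002 means ofld_txq 5 and ofld_rxq 2).
 */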
948 static inline void
949 save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi)
950 {
951 	uint32_t txqid, rxqid;
952 
953 	txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq;
954 	rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq;
955 
956 	m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff);
957 }
958 
959 static inline void
960 get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid)
961 {
962 
963 	if (txqid)
964 		*txqid = m->m_pkthdr.flowid >> 16;
965 	if (rxqid)
966 		*rxqid = m->m_pkthdr.flowid & 0xffff;
967 }
968 
969 /*
970  * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
971  * store some state temporarily.
972  */
973 static struct synq_entry *
974 mbuf_to_synqe(struct mbuf *m)
975 {
976 	int len = roundup2(sizeof (struct synq_entry), 8);
977 	int tspace = M_TRAILINGSPACE(m);
978 	struct synq_entry *synqe = NULL;
979 
980 	if (tspace < len) {
981 		synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT);
982 		if (synqe == NULL)
983 			return (NULL);
984 		synqe->flags = TPF_SYNQE | TPF_SYNQE_NEEDFREE;
985 	} else {
986 		synqe = (void *)(m->m_data + m->m_len + tspace - len);
987 		synqe->flags = TPF_SYNQE;
988 	}
989 
990 	return (synqe);
991 }
992 
993 static void
994 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
995 {
996 	bzero(to, sizeof(*to));
997 
998 	if (t4opt->mss) {
999 		to->to_flags |= TOF_MSS;
1000 		to->to_mss = be16toh(t4opt->mss);
1001 	}
1002 
1003 	if (t4opt->wsf) {
1004 		to->to_flags |= TOF_SCALE;
1005 		to->to_wscale = t4opt->wsf;
1006 	}
1007 
1008 	if (t4opt->tstamp)
1009 		to->to_flags |= TOF_TS;
1010 
1011 	if (t4opt->sack)
1012 		to->to_flags |= TOF_SACKPERM;
1013 }
1014 
1015 /*
1016  * Options2 for passive open.
1017  */
1018 static uint32_t
1019 calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
1020     const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode)
1021 {
1022 	struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
1023 	uint32_t opt2;
1024 
1025 	opt2 = V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]) |
1026 	    F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id);
1027 
1028 	if (V_tcp_do_rfc1323) {
1029 		if (tcpopt->tstamp)
1030 			opt2 |= F_TSTAMPS_EN;
1031 		if (tcpopt->sack)
1032 			opt2 |= F_SACK_EN;
1033 		if (tcpopt->wsf <= 14)
1034 			opt2 |= F_WND_SCALE_EN;
1035 	}
1036 
1037 	if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR))
1038 		opt2 |= F_CCTRL_ECN;
1039 
1040 	/* RX_COALESCE is always a valid value (0 or M_RX_COALESCE). */
1041 	if (is_t4(sc))
1042 		opt2 |= F_RX_COALESCE_VALID;
1043 	else {
1044 		opt2 |= F_T5_OPT_2_VALID;
1045 		opt2 |= F_CONG_CNTRL_VALID; /* OPT_2_ISS really, for T5 */
1046 	}
1047 	if (sc->tt.rx_coalesce)
1048 		opt2 |= V_RX_COALESCE(M_RX_COALESCE);
1049 
1050 #ifdef USE_DDP_RX_FLOW_CONTROL
1051 	if (ulp_mode == ULP_MODE_TCPDDP)
1052 		opt2 |= F_RX_FC_VALID | F_RX_FC_DDP;
1053 #endif
1054 
1055 	return htobe32(opt2);
1056 }
1057 
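/*
 * Reconstruct an in_conninfo and a host-order tcphdr for the incoming SYN
 * from the Ethernet/IP(v6)/TCP headers that follow the CPL_PASS_ACCEPT_REQ
 * in the mbuf.  Either output pointer may be NULL if the caller doesn't
 * need it.
 */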
1058 static void
1059 pass_accept_req_to_protohdrs(const struct mbuf *m, struct in_conninfo *inc,
1060     struct tcphdr *th)
1061 {
1062 	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1063 	const struct ether_header *eh;
1064 	unsigned int hlen = be32toh(cpl->hdr_len);
1065 	uintptr_t l3hdr;
1066 	const struct tcphdr *tcp;
1067 
1068 	eh = (const void *)(cpl + 1);
1069 	l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
1070 	tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
1071 
1072 	if (inc) {
1073 		bzero(inc, sizeof(*inc));
1074 		inc->inc_fport = tcp->th_sport;
1075 		inc->inc_lport = tcp->th_dport;
1076 		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
1077 			const struct ip *ip = (const void *)l3hdr;
1078 
1079 			inc->inc_faddr = ip->ip_src;
1080 			inc->inc_laddr = ip->ip_dst;
1081 		} else {
1082 			const struct ip6_hdr *ip6 = (const void *)l3hdr;
1083 
1084 			inc->inc_flags |= INC_ISIPV6;
1085 			inc->inc6_faddr = ip6->ip6_src;
1086 			inc->inc6_laddr = ip6->ip6_dst;
1087 		}
1088 	}
1089 
1090 	if (th) {
1091 		bcopy(tcp, th, sizeof(*th));
1092 		tcp_fields_to_host(th);		/* just like tcp_input */
1093 	}
1094 }
1095 
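/*
 * Get an L2T entry for the next hop toward the peer.  The route to the peer
 * must point back out the ifnet the SYN arrived on, or NULL is returned.
 * IPv6 listeners bound to a link-local address skip the route lookup.
 */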
1096 static struct l2t_entry *
1097 get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp,
1098     struct in_conninfo *inc)
1099 {
1100 	struct l2t_entry *e;
1101 	struct sockaddr_in6 sin6;
1102 	struct sockaddr *dst = (void *)&sin6;
1103 
1104 	if (inc->inc_flags & INC_ISIPV6) {
1105 		struct nhop6_basic nh6;
1106 
1107 		bzero(dst, sizeof(struct sockaddr_in6));
1108 		dst->sa_len = sizeof(struct sockaddr_in6);
1109 		dst->sa_family = AF_INET6;
1110 
1111 		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
1112 			/* no need for route lookup */
1113 			e = t4_l2t_get(pi, ifp, dst);
1114 			return (e);
1115 		}
1116 
1117 		if (fib6_lookup_nh_basic(RT_DEFAULT_FIB, &inc->inc6_faddr,
1118 		    0, 0, 0, &nh6) != 0)
1119 			return (NULL);
1120 		if (nh6.nh_ifp != ifp)
1121 			return (NULL);
1122 		((struct sockaddr_in6 *)dst)->sin6_addr = nh6.nh_addr;
1123 	} else {
1124 		struct nhop4_basic nh4;
1125 
1126 		dst->sa_len = sizeof(struct sockaddr_in);
1127 		dst->sa_family = AF_INET;
1128 
1129 		if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, inc->inc_faddr, 0, 0,
1130 		    &nh4) != 0)
1131 			return (NULL);
1132 		if (nh4.nh_ifp != ifp)
1133 			return (NULL);
1134 		((struct sockaddr_in *)dst)->sin_addr = nh4.nh_addr;
1135 	}
1136 
1137 	e = t4_l2t_get(pi, ifp, dst);
1138 	return (e);
1139 }
1140 
1141 #define REJECT_PASS_ACCEPT()	do { \
1142 	reject_reason = __LINE__; \
1143 	goto reject; \
1144 } while (0)
1145 
1146 /*
1147  * The context associated with a tid entry via insert_tid could be a synq_entry
1148  * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
1149  */
1150 CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
1151 
1152 /*
1153  * Incoming SYN on a listening socket.
1154  *
1155  * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
1156  * etc.
1157  */
1158 static int
1159 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
1160     struct mbuf *m)
1161 {
1162 	struct adapter *sc = iq->adapter;
1163 	struct toedev *tod;
1164 	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1165 	struct cpl_pass_accept_rpl *rpl;
1166 	struct wrqe *wr;
1167 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1168 	unsigned int tid = GET_TID(cpl);
1169 	struct listen_ctx *lctx = lookup_stid(sc, stid);
1170 	struct inpcb *inp;
1171 	struct socket *so;
1172 	struct in_conninfo inc;
1173 	struct tcphdr th;
1174 	struct tcpopt to;
1175 	struct port_info *pi;
1176 	struct vi_info *vi;
1177 	struct ifnet *hw_ifp, *ifp;
1178 	struct l2t_entry *e = NULL;
1179 	int rscale, mtu_idx, rx_credits, rxqid, ulp_mode;
1180 	struct synq_entry *synqe = NULL;
1181 	int reject_reason, v;
1182 	uint16_t vid;
1183 #ifdef INVARIANTS
1184 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1185 #endif
1186 
1187 	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
1188 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1189 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1190 
1191 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
1192 	    lctx);
1193 
1194 	pass_accept_req_to_protohdrs(m, &inc, &th);
1195 	t4opt_to_tcpopt(&cpl->tcpopt, &to);
1196 
1197 	pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];
1198 
1199 	/*
1200 	 * Use the MAC index to look up the associated VI.  If this SYN
1201 	 * didn't match a perfect MAC filter, punt.
1202 	 */
1203 	if (!(be16toh(cpl->l2info) & F_SYN_XACT_MATCH)) {
1204 		m_freem(m);
1205 		m = NULL;
1206 		REJECT_PASS_ACCEPT();
1207 	}
1208 	for_each_vi(pi, v, vi) {
1209 		if (vi->xact_addr_filt == G_SYN_MAC_IDX(be16toh(cpl->l2info)))
1210 			goto found;
1211 	}
1212 	m_freem(m);
1213 	m = NULL;
1214 	REJECT_PASS_ACCEPT();
1215 
1216 found:
1217 	hw_ifp = vi->ifp;	/* the (v)cxgbeX ifnet */
1218 	m->m_pkthdr.rcvif = hw_ifp;
1219 	tod = TOEDEV(hw_ifp);
1220 
1221 	/*
1222 	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
1223 	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
1224 	 * doesn't match anything on this interface.
1225 	 *
1226 	 * XXX: lagg support, lagg + vlan support.
1227 	 */
1228 	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
1229 	if (vid != 0xfff) {
1230 		ifp = VLAN_DEVAT(hw_ifp, vid);
1231 		if (ifp == NULL)
1232 			REJECT_PASS_ACCEPT();
1233 	} else
1234 		ifp = hw_ifp;
1235 
1236 	/*
1237 	 * Don't offload if the peer requested a TCP option that's not known to
1238 	 * the silicon.
1239 	 */
1240 	if (cpl->tcpopt.unknown)
1241 		REJECT_PASS_ACCEPT();
1242 
1243 	if (inc.inc_flags & INC_ISIPV6) {
1244 
1245 		/* Don't offload if the ifcap isn't enabled */
1246 		if ((ifp->if_capenable & IFCAP_TOE6) == 0)
1247 			REJECT_PASS_ACCEPT();
1248 
1249 		/*
1250 		 * SYN must be directed to an IP6 address on this ifnet.  This
1251 		 * is more restrictive than in6_localip.
1252 		 */
1253 		if (!in6_ifhasaddr(ifp, &inc.inc6_laddr))
1254 			REJECT_PASS_ACCEPT();
1255 	} else {
1256 
1257 		/* Don't offload if the ifcap isn't enabled */
1258 		if ((ifp->if_capenable & IFCAP_TOE4) == 0)
1259 			REJECT_PASS_ACCEPT();
1260 
1261 		/*
1262 		 * SYN must be directed to an IP address on this ifnet.  This
1263 		 * is more restrictive than in_localip.
1264 		 */
1265 		if (!in_ifhasaddr(ifp, inc.inc_laddr))
1266 			REJECT_PASS_ACCEPT();
1267 	}
1268 
1269 	e = get_l2te_for_nexthop(pi, ifp, &inc);
1270 	if (e == NULL)
1271 		REJECT_PASS_ACCEPT();
1272 
1273 	synqe = mbuf_to_synqe(m);
1274 	if (synqe == NULL)
1275 		REJECT_PASS_ACCEPT();
1276 
1277 	wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
1278 	    sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[pi->port_id]);
1279 	if (wr == NULL)
1280 		REJECT_PASS_ACCEPT();
1281 	rpl = wrtod(wr);
1282 
1283 	INP_INFO_RLOCK(&V_tcbinfo);	/* for 4-tuple check */
1284 
1285 	/* Don't offload if the 4-tuple is already in use */
1286 	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
1287 		INP_INFO_RUNLOCK(&V_tcbinfo);
1288 		free(wr, M_CXGBE);
1289 		REJECT_PASS_ACCEPT();
1290 	}
1291 	INP_INFO_RUNLOCK(&V_tcbinfo);
1292 
1293 	inp = lctx->inp;		/* listening socket, not owned by TOE */
1294 	INP_WLOCK(inp);
1295 
1296 	/* Don't offload if the listening socket has closed */
1297 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1298 		/*
1299 		 * The listening socket has closed.  The reply from the TOE to
1300 		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
1301 		 * resources tied to this listen context.
1302 		 */
1303 		INP_WUNLOCK(inp);
1304 		free(wr, M_CXGBE);
1305 		REJECT_PASS_ACCEPT();
1306 	}
1307 	so = inp->inp_socket;
1308 
1309 	mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss));
1310 	rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
1311 	SOCKBUF_LOCK(&so->so_rcv);
1312 	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
1313 	rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
1314 	SOCKBUF_UNLOCK(&so->so_rcv);
1315 
1316 	save_qids_in_mbuf(m, vi);
1317 	get_qids_from_mbuf(m, NULL, &rxqid);
1318 
1319 	if (is_t4(sc))
1320 		INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
1321 	else {
1322 		struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;
1323 
1324 		INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
1325 	}
1326 	if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0) {
1327 		ulp_mode = ULP_MODE_TCPDDP;
1328 		synqe->flags |= TPF_SYNQE_TCPDDP;
1329 	} else
1330 		ulp_mode = ULP_MODE_NONE;
1331 	rpl->opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, ulp_mode);
1332 	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode);
1333 
1334 	synqe->tid = tid;
1335 	synqe->lctx = lctx;
1336 	synqe->syn = m;
1337 	m = NULL;
1338 	refcount_init(&synqe->refcnt, 1);	/* 1 means extra hold */
1339 	synqe->l2e_idx = e->idx;
1340 	synqe->rcv_bufsize = rx_credits;
1341 	atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);
1342 
1343 	insert_tid(sc, tid, synqe);
1344 	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
1345 	hold_synqe(synqe);	/* hold for the duration it's in the synq */
1346 	hold_lctx(lctx);	/* A synqe on the list has a ref on its lctx */
1347 
1348 	/*
1349 	 * If all goes well t4_syncache_respond will get called during
1350 	 * syncache_add.  Note that syncache_add releases the pcb lock.
1351 	 */
1352 	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
1353 	INP_UNLOCK_ASSERT(inp);	/* ok to assert, we have a ref on the inp */
1354 
1355 	/*
1356 	 * If we replied during syncache_add (synqe->wr has been consumed),
1357 	 * good.  Otherwise, set it to 0 so that further syncache_respond
1358 	 * attempts by the kernel will be ignored.
1359 	 */
1360 	if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {
1361 
1362 		/*
1363 		 * syncache may or may not have a hold on the synqe, which may
1364 		 * or may not be stashed in the original SYN mbuf passed to us.
1365 		 * Just copy it over instead of dealing with all possibilities.
1366 		 */
1367 		m = m_dup(synqe->syn, M_NOWAIT);
1368 		if (m)
1369 			m->m_pkthdr.rcvif = hw_ifp;
1370 
1371 		remove_tid(sc, synqe->tid);
1372 		free(wr, M_CXGBE);
1373 
1374 		/* Yank the synqe out of the lctx synq. */
1375 		INP_WLOCK(inp);
1376 		TAILQ_REMOVE(&lctx->synq, synqe, link);
1377 		release_synqe(synqe);	/* removed from synq list */
1378 		inp = release_lctx(sc, lctx);
1379 		if (inp)
1380 			INP_WUNLOCK(inp);
1381 
1382 		release_synqe(synqe);	/* extra hold */
1383 		REJECT_PASS_ACCEPT();
1384 	}
1385 
1386 	CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK",
1387 	    __func__, stid, tid, lctx, synqe);
1388 
1389 	INP_WLOCK(inp);
1390 	synqe->flags |= TPF_SYNQE_HAS_L2TE;
1391 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1392 		/*
1393 		 * Listening socket closed but tod_listen_stop did not abort
1394 		 * this tid because there was no L2T entry for the tid at that
1395 		 * time.  Abort it now.  The reply to the abort will clean up.
1396 		 */
1397 		CTR6(KTR_CXGBE,
1398 		    "%s: stid %u, tid %u, lctx %p, synqe %p (0x%x), ABORT",
1399 		    __func__, stid, tid, lctx, synqe, synqe->flags);
1400 		if (!(synqe->flags & TPF_SYNQE_EXPANDED))
1401 			send_reset_synqe(tod, synqe);
1402 		INP_WUNLOCK(inp);
1403 
1404 		release_synqe(synqe);	/* extra hold */
1405 		return (__LINE__);
1406 	}
1407 	INP_WUNLOCK(inp);
1408 
1409 	release_synqe(synqe);	/* extra hold */
1410 	return (0);
1411 reject:
1412 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
1413 	    reject_reason);
1414 
1415 	if (e)
1416 		t4_l2t_release(e);
1417 	release_tid(sc, tid, lctx->ctrlq);
1418 
1419 	if (__predict_true(m != NULL)) {
1420 		m_adj(m, sizeof(*cpl));
1421 		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
1422 		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1423 		m->m_pkthdr.csum_data = 0xffff;
1424 		hw_ifp->if_input(hw_ifp, m);
1425 	}
1426 
1427 	return (reject_reason);
1428 }
1429 
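/*
 * Construct the protocol headers (in_conninfo, tcphdr, tcpopt) that the
 * final ACK of the three-way handshake would have carried, using the saved
 * SYN and the contents of the CPL_PASS_ESTABLISH, so that the stock
 * syncache_expand machinery can be used unmodified.
 */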
1430 static void
1431 synqe_to_protohdrs(struct synq_entry *synqe,
1432     const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
1433     struct tcphdr *th, struct tcpopt *to)
1434 {
1435 	uint16_t tcp_opt = be16toh(cpl->tcp_opt);
1436 
1437 	/* start off with the original SYN */
1438 	pass_accept_req_to_protohdrs(synqe->syn, inc, th);
1439 
1440 	/* modify parts to make it look like the ACK to our SYN|ACK */
1441 	th->th_flags = TH_ACK;
1442 	th->th_ack = synqe->iss + 1;
1443 	th->th_seq = be32toh(cpl->rcv_isn);
1444 	bzero(to, sizeof(*to));
1445 	if (G_TCPOPT_TSTAMP(tcp_opt)) {
1446 		to->to_flags |= TOF_TS;
1447 		to->to_tsecr = synqe->ts;
1448 	}
1449 }
1450 
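/*
 * Handler for CPL_PASS_ESTABLISH: the chip has completed the three-way
 * handshake for an embryonic connection.  Allocate a toepcb, expand the
 * syncache entry into a full connection, and hand the new socket over to
 * the TOE (t4_offload_socket), either during syncache_expand or directly
 * here if the syncache entry had been evicted.
 */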
1451 static int
1452 do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
1453     struct mbuf *m)
1454 {
1455 	struct adapter *sc = iq->adapter;
1456 	struct vi_info *vi;
1457 	struct ifnet *ifp;
1458 	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
1459 #if defined(KTR) || defined(INVARIANTS)
1460 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1461 #endif
1462 	unsigned int tid = GET_TID(cpl);
1463 	struct synq_entry *synqe = lookup_tid(sc, tid);
1464 	struct listen_ctx *lctx = synqe->lctx;
1465 	struct inpcb *inp = lctx->inp, *new_inp;
1466 	struct socket *so;
1467 	struct tcphdr th;
1468 	struct tcpopt to;
1469 	struct in_conninfo inc;
1470 	struct toepcb *toep;
1471 	u_int txqid, rxqid;
1472 #ifdef INVARIANTS
1473 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1474 #endif
1475 
1476 	KASSERT(opcode == CPL_PASS_ESTABLISH,
1477 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1478 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1479 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1480 	KASSERT(synqe->flags & TPF_SYNQE,
1481 	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
1482 
1483 	INP_INFO_RLOCK(&V_tcbinfo);	/* for syncache_expand */
1484 	INP_WLOCK(inp);
1485 
1486 	CTR6(KTR_CXGBE,
1487 	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
1488 	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);
1489 
1490 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1491 
1492 		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
1493 			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
1494 			    ("%s: listen socket closed but tid %u not aborted.",
1495 			    __func__, tid));
1496 		}
1497 
1498 		INP_WUNLOCK(inp);
1499 		INP_INFO_RUNLOCK(&V_tcbinfo);
1500 		return (0);
1501 	}
1502 
1503 	ifp = synqe->syn->m_pkthdr.rcvif;
1504 	vi = ifp->if_softc;
1505 	KASSERT(vi->pi->adapter == sc,
1506 	    ("%s: vi %p, sc %p mismatch", __func__, vi, sc));
1507 
1508 	get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
1509 	KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
1510 	    ("%s: CPL arrived on unexpected rxq.  %d %d", __func__, rxqid,
1511 	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
1512 
1513 	toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT);
1514 	if (toep == NULL) {
1515 reset:
1516 		/*
1517 		 * The reply to this abort will perform final cleanup.  There is
1518 		 * no need to check for HAS_L2TE here.  We can be here only if
1519 		 * we responded to the PASS_ACCEPT_REQ, and our response had the
1520 		 * L2T idx.
1521 		 */
1522 		send_reset_synqe(TOEDEV(ifp), synqe);
1523 		INP_WUNLOCK(inp);
1524 		INP_INFO_RUNLOCK(&V_tcbinfo);
1525 		return (0);
1526 	}
1527 	toep->tid = tid;
1528 	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
1529 	if (synqe->flags & TPF_SYNQE_TCPDDP)
1530 		set_tcpddp_ulp_mode(toep);
1531 	else
1532 		toep->ulp_mode = ULP_MODE_NONE;
1533 	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
1534 	toep->rx_credits = synqe->rcv_bufsize;
1535 
1536 	so = inp->inp_socket;
1537 	KASSERT(so != NULL, ("%s: socket is NULL", __func__));
1538 
1539 	/* Come up with something that syncache_expand should be ok with. */
1540 	synqe_to_protohdrs(synqe, cpl, &inc, &th, &to);
1541 
1542 	/*
1543 	 * No more need for anything in the mbuf that carried the
1544 	 * CPL_PASS_ACCEPT_REQ.  Drop the CPL_PASS_ESTABLISH and toep pointer
1545 	 * there.  XXX: bad form but I don't want to increase the size of synqe.
1546 	 */
1547 	m = synqe->syn;
1548 	KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
1549 	    ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
1550 	bcopy(cpl, mtod(m, void *), sizeof(*cpl));
1551 	*(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;
1552 
1553 	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
1554 		free_toepcb(toep);
1555 		goto reset;
1556 	}
1557 
1558 	/* New connection inpcb is already locked by syncache_expand(). */
1559 	new_inp = sotoinpcb(so);
1560 	INP_WLOCK_ASSERT(new_inp);
1561 
1562 	/*
1563 	 * This is for the unlikely case where the syncache entry that we added
1564 	 * has been evicted from the syncache, but the syncache_expand above
1565 	 * works because of syncookies.
1566 	 *
1567 	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
1568 	 * anyone accept'ing a connection before we've installed our hooks, but
1569 	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
1570 	 */
1571 	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
1572 		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
1573 		t4_offload_socket(TOEDEV(ifp), synqe, so);
1574 	}
1575 
1576 	INP_WUNLOCK(new_inp);
1577 
1578 	/* Done with the synqe */
1579 	TAILQ_REMOVE(&lctx->synq, synqe, link);
1580 	inp = release_lctx(sc, lctx);
1581 	if (inp != NULL)
1582 		INP_WUNLOCK(inp);
1583 	INP_INFO_RUNLOCK(&V_tcbinfo);
1584 	release_synqe(synqe);
1585 
1586 	return (0);
1587 }
1588 
1589 void
1590 t4_init_listen_cpl_handlers(struct adapter *sc)
1591 {
1592 
1593 	t4_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
1594 	t4_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
1595 	t4_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
1596 	t4_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
1597 }
1598 #endif
1599