xref: /freebsd/sys/dev/cxgbe/tom/t4_listen.c (revision 535af610)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2012 Chelsio Communications, Inc.
5  * All rights reserved.
6  * Written by: Navdeep Parhar <np@FreeBSD.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 
36 #ifdef TCP_OFFLOAD
37 #include <sys/param.h>
38 #include <sys/types.h>
39 #include <sys/kernel.h>
40 #include <sys/ktr.h>
41 #include <sys/module.h>
42 #include <sys/protosw.h>
43 #include <sys/refcount.h>
44 #include <sys/domain.h>
45 #include <sys/fnv_hash.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <net/ethernet.h>
50 #include <net/if.h>
51 #include <net/if_types.h>
52 #include <net/if_vlan_var.h>
53 #include <net/route.h>
54 #include <net/route/nhop.h>
55 #include <netinet/in.h>
56 #include <netinet/in_fib.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/ip.h>
59 #include <netinet/ip6.h>
60 #include <netinet6/in6_fib.h>
61 #include <netinet6/scope6_var.h>
62 #include <netinet/tcp_timer.h>
63 #define TCPSTATES
64 #include <netinet/tcp_fsm.h>
65 #include <netinet/tcp_var.h>
66 #include <netinet/toecore.h>
67 #include <netinet/cc/cc.h>
68 
69 #include "common/common.h"
70 #include "common/t4_msg.h"
71 #include "common/t4_regs.h"
72 #include "t4_clip.h"
73 #include "tom/t4_tom_l2t.h"
74 #include "tom/t4_tom.h"
75 
76 /* stid services */
77 static int alloc_stid(struct adapter *, struct listen_ctx *, int);
78 static struct listen_ctx *lookup_stid(struct adapter *, int);
79 static void free_stid(struct adapter *, struct listen_ctx *);
80 
81 /* lctx services */
82 static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
83     struct vi_info *);
84 static int free_lctx(struct adapter *, struct listen_ctx *);
85 static void hold_lctx(struct listen_ctx *);
86 static void listen_hash_add(struct adapter *, struct listen_ctx *);
87 static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
88 static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
89 static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
90 
91 static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int);
92 
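/*
 * Reserve a server tid for the listener.  An IPv6 listener needs two
 * consecutive, naturally aligned stids; an IPv4 listener needs one.  Returns
 * the absolute stid on success or -1 if no suitable range is available.
 */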
93 static int
94 alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
95 {
96 	struct tid_info *t = &sc->tids;
97 	u_int stid, n, f, mask;
98 	struct stid_region *sr = &lctx->stid_region;
99 
100 	/*
101 	 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
102 	 * the TCAM.  The start of the stid region is properly aligned (the chip
103 	 * requires each region to be 128-cell aligned).
104 	 */
105 	n = isipv6 ? 2 : 1;
106 	mask = n - 1;
107 	KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
108 	    ("%s: stid region (%u, %u) not properly aligned.  n = %u",
109 	    __func__, t->stid_base, t->nstids, n));
110 
111 	mtx_lock(&t->stid_lock);
112 	if (n > t->nstids - t->stids_in_use) {
113 		mtx_unlock(&t->stid_lock);
114 		return (-1);
115 	}
116 
117 	if (t->nstids_free_head >= n) {
118 		/*
119 		 * This allocation will definitely succeed because the region
120 		 * starts at a good alignment and we just checked we have enough
121 		 * stids free.
122 		 */
123 		f = t->nstids_free_head & mask;
124 		t->nstids_free_head -= n + f;
125 		stid = t->nstids_free_head;
126 		TAILQ_INSERT_HEAD(&t->stids, sr, link);
127 	} else {
128 		struct stid_region *s;
129 
130 		stid = t->nstids_free_head;
131 		TAILQ_FOREACH(s, &t->stids, link) {
132 			stid += s->used + s->free;
133 			f = stid & mask;
134 			if (s->free >= n + f) {
135 				stid -= n + f;
136 				s->free -= n + f;
137 				TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
138 				goto allocated;
139 			}
140 		}
141 
142 		if (__predict_false(stid != t->nstids)) {
143 			panic("%s: stids TAILQ (%p) corrupt."
144 			    "  At %d instead of %d at the end of the queue.",
145 			    __func__, &t->stids, stid, t->nstids);
146 		}
147 
148 		mtx_unlock(&t->stid_lock);
149 		return (-1);
150 	}
151 
152 allocated:
153 	sr->used = n;
154 	sr->free = f;
155 	t->stids_in_use += n;
156 	t->stid_tab[stid] = lctx;
157 	mtx_unlock(&t->stid_lock);
158 
159 	KASSERT(((stid + t->stid_base) & mask) == 0,
160 	    ("%s: EDOOFUS.", __func__));
161 	return (stid + t->stid_base);
162 }
163 
164 static struct listen_ctx *
165 lookup_stid(struct adapter *sc, int stid)
166 {
167 	struct tid_info *t = &sc->tids;
168 
169 	return (t->stid_tab[stid - t->stid_base]);
170 }
171 
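/*
 * Return the listener's stids to the free pool.  The freed range is merged
 * into the previous region on the list, or into the free space at the head of
 * the stid range if this was the first region.
 */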
172 static void
173 free_stid(struct adapter *sc, struct listen_ctx *lctx)
174 {
175 	struct tid_info *t = &sc->tids;
176 	struct stid_region *sr = &lctx->stid_region;
177 	struct stid_region *s;
178 
179 	KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));
180 
181 	mtx_lock(&t->stid_lock);
182 	s = TAILQ_PREV(sr, stid_head, link);
183 	if (s != NULL)
184 		s->free += sr->used + sr->free;
185 	else
186 		t->nstids_free_head += sr->used + sr->free;
187 	KASSERT(t->stids_in_use >= sr->used,
188 	    ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
189 	    t->stids_in_use, sr->used));
190 	t->stids_in_use -= sr->used;
191 	TAILQ_REMOVE(&t->stids, sr, link);
192 	mtx_unlock(&t->stid_lock);
193 }
194 
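/*
 * Allocate and initialize the listen context for an offloaded listener:
 * reserve an stid, install a CLIP entry for a non-wildcard IPv6 address, pick
 * the control and offload rx queues to use, and take a reference on the inp.
 */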
195 static struct listen_ctx *
196 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
197 {
198 	struct listen_ctx *lctx;
199 
200 	INP_WLOCK_ASSERT(inp);
201 
202 	lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
203 	if (lctx == NULL)
204 		return (NULL);
205 
206 	lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
207 	if (lctx->stid < 0) {
208 		free(lctx, M_CXGBE);
209 		return (NULL);
210 	}
211 
212 	if (inp->inp_vflag & INP_IPV6 &&
213 	    !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
214 		lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true);
215 		if (lctx->ce == NULL) {
216 			free(lctx, M_CXGBE);
217 			return (NULL);
218 		}
219 	}
220 
221 	lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
222 	lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
223 	refcount_init(&lctx->refcount, 1);
224 
225 	lctx->inp = inp;
226 	lctx->vnet = inp->inp_socket->so_vnet;
227 	in_pcbref(inp);
228 
229 	return (lctx);
230 }
231 
232 /* Don't call this directly; use release_lctx instead. */
233 static int
234 free_lctx(struct adapter *sc, struct listen_ctx *lctx)
235 {
236 	struct inpcb *inp = lctx->inp;
237 
238 	INP_WLOCK_ASSERT(inp);
239 	KASSERT(lctx->refcount == 0,
240 	    ("%s: refcount %d", __func__, lctx->refcount));
241 	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
242 
243 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
244 	    __func__, lctx->stid, lctx, lctx->inp);
245 
246 	if (lctx->ce)
247 		t4_release_clip_entry(sc, lctx->ce);
248 	free_stid(sc, lctx);
249 	free(lctx, M_CXGBE);
250 
251 	return (in_pcbrele_wlocked(inp));
252 }
253 
254 static void
255 hold_lctx(struct listen_ctx *lctx)
256 {
257 
258 	refcount_acquire(&lctx->refcount);
259 }
260 
261 static inline uint32_t
262 listen_hashfn(void *key, u_long mask)
263 {
264 
265 	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
266 }
267 
268 /*
269  * Add a listen_ctx entry to the listen hash table.
270  */
271 static void
272 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
273 {
274 	struct tom_data *td = sc->tom_softc;
275 	int bucket = listen_hashfn(lctx->inp, td->listen_mask);
276 
277 	mtx_lock(&td->lctx_hash_lock);
278 	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
279 	td->lctx_count++;
280 	mtx_unlock(&td->lctx_hash_lock);
281 }
282 
283 /*
284  * Look for the listening socket's context entry in the hash and return it.
285  */
286 static struct listen_ctx *
287 listen_hash_find(struct adapter *sc, struct inpcb *inp)
288 {
289 	struct tom_data *td = sc->tom_softc;
290 	int bucket = listen_hashfn(inp, td->listen_mask);
291 	struct listen_ctx *lctx;
292 
293 	mtx_lock(&td->lctx_hash_lock);
294 	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
295 		if (lctx->inp == inp)
296 			break;
297 	}
298 	mtx_unlock(&td->lctx_hash_lock);
299 
300 	return (lctx);
301 }
302 
303 /*
304  * Removes the listen_ctx structure for inp from the hash and returns it.
305  */
306 static struct listen_ctx *
307 listen_hash_del(struct adapter *sc, struct inpcb *inp)
308 {
309 	struct tom_data *td = sc->tom_softc;
310 	int bucket = listen_hashfn(inp, td->listen_mask);
311 	struct listen_ctx *lctx, *l;
312 
313 	mtx_lock(&td->lctx_hash_lock);
314 	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
315 		if (lctx->inp == inp) {
316 			LIST_REMOVE(lctx, link);
317 			td->lctx_count--;
318 			break;
319 		}
320 	}
321 	mtx_unlock(&td->lctx_hash_lock);
322 
323 	return (lctx);
324 }
325 
326 /*
327  * Releases a hold on the lctx.  Must be called with the listening socket's inp
328  * locked.  The inp may be freed by this function and it returns NULL to
329  * indicate this.
330  */
331 static struct inpcb *
332 release_lctx(struct adapter *sc, struct listen_ctx *lctx)
333 {
334 	struct inpcb *inp = lctx->inp;
335 	int inp_freed = 0;
336 
337 	INP_WLOCK_ASSERT(inp);
338 	if (refcount_release(&lctx->refcount))
339 		inp_freed = free_lctx(sc, lctx);
340 
341 	return (inp_freed ? NULL : inp);
342 }
343 
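/*
 * Send a FW_FLOWC_WR for the synqe's tid so that the firmware has the flow's
 * parameters (PF/VF, channel, port, ingress queue, send buffer, MSS) before
 * other work requests are sent for it.
 */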
344 static void
345 send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe)
346 {
347 	struct mbuf *m = synqe->syn;
348 	if_t ifp = m->m_pkthdr.rcvif;
349 	struct vi_info *vi = if_getsoftc(ifp);
350 	struct port_info *pi = vi->pi;
351 	struct wrqe *wr;
352 	struct fw_flowc_wr *flowc;
353 	struct sge_ofld_txq *ofld_txq;
354 	struct sge_ofld_rxq *ofld_rxq;
355 	const int nparams = 6;
356 	const int flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
357 	const u_int pfvf = sc->pf << S_FW_VIID_PFN;
358 
359 	INP_WLOCK_ASSERT(synqe->lctx->inp);
360 	MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0);
361 
362 	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
363 	ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx];
364 
365 	wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq);
366 	if (wr == NULL) {
367 		/* XXX */
368 		panic("%s: allocation failure.", __func__);
369 	}
370 	flowc = wrtod(wr);
371 	memset(flowc, 0, wr->wr_len);
372 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
373 	    V_FW_FLOWC_WR_NPARAMS(nparams));
374 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
375 	    V_FW_WR_FLOWID(synqe->tid));
376 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
377 	flowc->mnemval[0].val = htobe32(pfvf);
378 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
379 	flowc->mnemval[1].val = htobe32(pi->tx_chan);
380 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
381 	flowc->mnemval[2].val = htobe32(pi->tx_chan);
382 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
383 	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
384 	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
385 	flowc->mnemval[4].val = htobe32(512);
386 	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
387 	flowc->mnemval[5].val = htobe32(512);
388 
389 	synqe->flags |= TPF_FLOWC_WR_SENT;
390 	t4_wrq_tx(sc, wr);
391 }
392 
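/*
 * Abort an embryonic connection: send the flowc WR if it hasn't gone out yet,
 * then a CPL_ABORT_REQ with the requested reset behavior.
 */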
393 static void
394 send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe,
395     int rst_status)
396 {
397 	struct adapter *sc = tod->tod_softc;
398 	struct wrqe *wr;
399 	struct cpl_abort_req *req;
400 
401 	INP_WLOCK_ASSERT(synqe->lctx->inp);
402 
403 	CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
404 	    __func__, synqe, synqe->flags, synqe->tid,
405 	    synqe->flags & TPF_ABORT_SHUTDOWN ?
406 	    " (abort already in progress)" : "");
407 	if (synqe->flags & TPF_ABORT_SHUTDOWN)
408 		return;	/* abort already in progress */
409 	synqe->flags |= TPF_ABORT_SHUTDOWN;
410 
411 	if (!(synqe->flags & TPF_FLOWC_WR_SENT))
412 		send_flowc_wr_synqe(sc, synqe);
413 
414 	wr = alloc_wrqe(sizeof(*req),
415 	    &sc->sge.ofld_txq[synqe->params.txq_idx].wrq);
416 	if (wr == NULL) {
417 		/* XXX */
418 		panic("%s: allocation failure.", __func__);
419 	}
420 	req = wrtod(wr);
421 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
422 	req->rsvd0 = 0;	/* don't have a snd_nxt */
423 	req->rsvd1 = 1;	/* no data sent yet */
424 	req->cmd = rst_status;
425 
426 	t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]);
427 }
428 
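/* Start an IPv4 hardware listener: CPL_PASS_OPEN_REQ on the listener's stid. */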
429 static int
430 create_server(struct adapter *sc, struct listen_ctx *lctx)
431 {
432 	struct wrqe *wr;
433 	struct cpl_pass_open_req *req;
434 	struct inpcb *inp = lctx->inp;
435 
436 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
437 	if (wr == NULL) {
438 		log(LOG_ERR, "%s: allocation failure", __func__);
439 		return (ENOMEM);
440 	}
441 	req = wrtod(wr);
442 
443 	INIT_TP_WR(req, 0);
444 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
445 	req->local_port = inp->inp_lport;
446 	req->peer_port = 0;
447 	req->local_ip = inp->inp_laddr.s_addr;
448 	req->peer_ip = 0;
449 	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
450 	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
451 	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
452 
453 	t4_wrq_tx(sc, wr);
454 	return (0);
455 }
456 
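/* Start an IPv6 hardware listener: CPL_PASS_OPEN_REQ6 on the listener's stid. */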
457 static int
458 create_server6(struct adapter *sc, struct listen_ctx *lctx)
459 {
460 	struct wrqe *wr;
461 	struct cpl_pass_open_req6 *req;
462 	struct inpcb *inp = lctx->inp;
463 
464 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
465 	if (wr == NULL) {
466 		log(LOG_ERR, "%s: allocation failure", __func__);
467 		return (ENOMEM);
468 	}
469 	req = wrtod(wr);
470 
471 	INIT_TP_WR(req, 0);
472 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
473 	req->local_port = inp->inp_lport;
474 	req->peer_port = 0;
475 	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
476 	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
477 	req->peer_ip_hi = 0;
478 	req->peer_ip_lo = 0;
479 	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
480 	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
481 	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
482 
483 	t4_wrq_tx(sc, wr);
484 	return (0);
485 }
486 
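/*
 * Ask the hardware to stop the listener.  The reply is handled by
 * do_close_server_rpl, which also releases the lctx.
 */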
487 static int
488 destroy_server(struct adapter *sc, struct listen_ctx *lctx)
489 {
490 	struct wrqe *wr;
491 	struct cpl_close_listsvr_req *req;
492 
493 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
494 	if (wr == NULL) {
495 		/* XXX */
496 		panic("%s: allocation failure.", __func__);
497 	}
498 	req = wrtod(wr);
499 
500 	INIT_TP_WR(req, 0);
501 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
502 	    lctx->stid));
503 	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
504 	req->rsvd = htobe16(0);
505 
506 	t4_wrq_tx(sc, wr);
507 	return (0);
508 }
509 
510 /*
511  * Start a listening server by sending a passive open request to HW.
512  *
513  * Can't take adapter lock here and access to sc->flags,
514  * sc->offload_map, if_capenable are all race prone.
515  */
516 int
517 t4_listen_start(struct toedev *tod, struct tcpcb *tp)
518 {
519 	struct adapter *sc = tod->tod_softc;
520 	struct vi_info *vi;
521 	struct port_info *pi;
522 	struct inpcb *inp = tptoinpcb(tp);
523 	struct listen_ctx *lctx;
524 	int i, rc, v;
525 	struct offload_settings settings;
526 
527 	INP_WLOCK_ASSERT(inp);
528 
529 	rw_rlock(&sc->policy_lock);
530 	settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL,
531 	    EVL_MAKETAG(0xfff, 0, 0), inp);
532 	rw_runlock(&sc->policy_lock);
533 	if (!settings.offload)
534 		return (0);
535 
536 	/* Don't start a hardware listener for any loopback address. */
537 	if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
538 		return (0);
539 	if (!(inp->inp_vflag & INP_IPV6) &&
540 	    IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
541 		return (0);
542 	if (sc->flags & KERN_TLS_ON)
543 		return (0);
544 #if 0
545 	ADAPTER_LOCK(sc);
546 	if (IS_BUSY(sc)) {
547 		log(LOG_ERR, "%s: listen request ignored, %s is busy",
548 		    __func__, device_get_nameunit(sc->dev));
549 		goto done;
550 	}
551 
552 	KASSERT(uld_active(sc, ULD_TOM),
553 	    ("%s: TOM not initialized", __func__));
554 #endif
555 
556 	/*
557 	 * Find an initialized VI with IFCAP_TOE (4 or 6).  We'll use the first
558 	 * such VI's queues to send the passive open and receive the reply to
559 	 * it.
560 	 *
561 	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
562 	 * then reject any attempt to bring down such a port (and maybe reject
563 	 * attempts to disable IFCAP_TOE on that port too?).
564 	 */
565 	for_each_port(sc, i) {
566 		pi = sc->port[i];
567 		for_each_vi(pi, v, vi) {
568 			if (vi->flags & VI_INIT_DONE &&
569 			    if_getcapenable(vi->ifp) & IFCAP_TOE)
570 				goto found;
571 		}
572 	}
573 	goto done;	/* no port that's UP with IFCAP_TOE enabled */
574 found:
575 
576 	if (listen_hash_find(sc, inp) != NULL)
577 		goto done;	/* already setup */
578 
579 	lctx = alloc_lctx(sc, inp, vi);
580 	if (lctx == NULL) {
581 		log(LOG_ERR,
582 		    "%s: listen request ignored, %s couldn't allocate lctx\n",
583 		    __func__, device_get_nameunit(sc->dev));
584 		goto done;
585 	}
586 	listen_hash_add(sc, lctx);
587 
588 	CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
589 	    __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
590 	    inp->inp_vflag);
591 
592 	if (inp->inp_vflag & INP_IPV6)
593 		rc = create_server6(sc, lctx);
594 	else
595 		rc = create_server(sc, lctx);
596 	if (rc != 0) {
597 		log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
598 		    __func__, device_get_nameunit(sc->dev), rc);
599 		(void) listen_hash_del(sc, inp);
600 		inp = release_lctx(sc, lctx);
601 		/* can't be freed, host stack has a reference */
602 		KASSERT(inp != NULL, ("%s: inp freed", __func__));
603 		goto done;
604 	}
605 	lctx->flags |= LCTX_RPL_PENDING;
606 done:
607 #if 0
608 	ADAPTER_UNLOCK(sc);
609 #endif
610 	return (0);
611 }
612 
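/*
 * Stop the hardware listener for this inp, if there is one.  If the reply to
 * the passive open is still outstanding the teardown is left to
 * do_pass_open_rpl.
 */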
613 int
614 t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
615 {
616 	struct listen_ctx *lctx;
617 	struct adapter *sc = tod->tod_softc;
618 	struct inpcb *inp = tptoinpcb(tp);
619 
620 	INP_WLOCK_ASSERT(inp);
621 
622 	lctx = listen_hash_del(sc, inp);
623 	if (lctx == NULL)
624 		return (ENOENT);	/* no hardware listener for this inp */
625 
626 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
627 	    lctx, lctx->flags);
628 
629 	/*
630 	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
631 	 * arrive and clean up when it does.
632 	 */
633 	if (lctx->flags & LCTX_RPL_PENDING) {
634 		return (EINPROGRESS);
635 	}
636 
637 	destroy_server(sc, lctx);
638 	return (0);
639 }
640 
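/*
 * Allocate a synq entry for an incoming SYN.  It starts with one reference
 * and holds a reference on its listen context.
 */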
641 static inline struct synq_entry *
642 alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags)
643 {
644 	struct synq_entry *synqe;
645 
646 	INP_RLOCK_ASSERT(lctx->inp);
647 	MPASS(flags == M_WAITOK || flags == M_NOWAIT);
648 
649 	synqe = malloc(sizeof(*synqe), M_CXGBE, flags);
650 	if (__predict_true(synqe != NULL)) {
651 		synqe->flags = TPF_SYNQE;
652 		refcount_init(&synqe->refcnt, 1);
653 		synqe->lctx = lctx;
654 		hold_lctx(lctx);	/* Every synqe has a ref on its lctx. */
655 		synqe->syn = NULL;
656 	}
657 
658 	return (synqe);
659 }
660 
661 static inline void
662 hold_synqe(struct synq_entry *synqe)
663 {
664 
665 	refcount_acquire(&synqe->refcnt);
666 }
667 
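/*
 * Drop a reference on the synqe.  The last release frees the synqe (and the
 * saved SYN) and drops its hold on the lctx.  Returns the listening inp, or
 * NULL if the inp was freed as a result.
 */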
668 static inline struct inpcb *
669 release_synqe(struct adapter *sc, struct synq_entry *synqe)
670 {
671 	struct inpcb *inp;
672 
673 	MPASS(synqe->flags & TPF_SYNQE);
674 	MPASS(synqe->lctx != NULL);
675 
676 	inp = synqe->lctx->inp;
677 	MPASS(inp != NULL);
678 	INP_WLOCK_ASSERT(inp);
679 
680 	if (refcount_release(&synqe->refcnt)) {
681 		inp = release_lctx(sc, synqe->lctx);
682 		m_freem(synqe->syn);
683 		free(synqe, M_CXGBE);
684 	}
685 
686 	return (inp);
687 }
688 
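/*
 * Syncache callbacks.  The synqe is the TOE context for the syncache entry;
 * it gains a reference when the entry is added and loses it when the entry is
 * removed.
 */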
689 void
690 t4_syncache_added(struct toedev *tod __unused, void *arg)
691 {
692 	struct synq_entry *synqe = arg;
693 
694 	hold_synqe(synqe);
695 }
696 
697 void
698 t4_syncache_removed(struct toedev *tod, void *arg)
699 {
700 	struct adapter *sc = tod->tod_softc;
701 	struct synq_entry *synqe = arg;
702 	struct inpcb *inp = synqe->lctx->inp;
703 
704 	/*
705 	 * XXX: this is a LOR but harmless when running from the softclock.
706 	 */
707 	INP_WLOCK(inp);
708 	inp = release_synqe(sc, synqe);
709 	if (inp != NULL)
710 		INP_WUNLOCK(inp);
711 }
712 
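/*
 * The syncache wants to transmit a SYN|ACK.  The hardware will send it, so
 * just record the ISS, IRS, and timestamp from the mbuf the stack built and
 * mark the synqe as ok to respond to.
 */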
713 int
714 t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
715 {
716 	struct synq_entry *synqe = arg;
717 
718 	if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) {
719 		struct tcpopt to;
720 		struct ip *ip = mtod(m, struct ip *);
721 		struct tcphdr *th;
722 
723 		if (ip->ip_v == IPVERSION)
724 			th = (void *)(ip + 1);
725 		else
726 			th = (void *)((struct ip6_hdr *)ip + 1);
727 		bzero(&to, sizeof(to));
728 		tcp_dooptions(&to, (void *)(th + 1),
729 		    (th->th_off << 2) - sizeof(*th), TO_SYN);
730 
731 		/* save these for later */
732 		synqe->iss = be32toh(th->th_seq);
733 		synqe->irs = be32toh(th->th_ack) - 1;
734 		synqe->ts = to.to_tsval;
735 	}
736 
737 	m_freem(m);	/* don't need this any more */
738 	return (0);
739 }
740 
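/*
 * Reply to the CPL_PASS_OPEN_REQ(6) that tried to start this hardware
 * listener.
 */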
741 static int
742 do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
743     struct mbuf *m)
744 {
745 	struct adapter *sc = iq->adapter;
746 	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
747 	int stid = GET_TID(cpl);
748 	unsigned int status = cpl->status;
749 	struct listen_ctx *lctx = lookup_stid(sc, stid);
750 	struct inpcb *inp = lctx->inp;
751 #ifdef INVARIANTS
752 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
753 #endif
754 
755 	KASSERT(opcode == CPL_PASS_OPEN_RPL,
756 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
757 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
758 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
759 
760 	INP_WLOCK(inp);
761 
762 	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
763 	    __func__, stid, status, lctx->flags);
764 
765 	lctx->flags &= ~LCTX_RPL_PENDING;
766 
767 	if (status != CPL_ERR_NONE)
768 		log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);
769 
770 #ifdef INVARIANTS
771 	/*
772 	 * If the inp has been dropped (listening socket closed) then
773 	 * listen_stop must have run and taken the inp out of the hash.
774 	 */
775 	if (inp->inp_flags & INP_DROPPED) {
776 		KASSERT(listen_hash_del(sc, inp) == NULL,
777 		    ("%s: inp %p still in listen hash", __func__, inp));
778 	}
779 #endif
780 
781 	if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
782 		if (release_lctx(sc, lctx) != NULL)
783 			INP_WUNLOCK(inp);
784 		return (status);
785 	}
786 
787 	/*
788 	 * Listening socket stopped listening earlier and now the chip tells us
789 	 * it has started the hardware listener.  Stop it; the lctx will be
790 	 * released in do_close_server_rpl.
791 	 */
792 	if (inp->inp_flags & INP_DROPPED) {
793 		destroy_server(sc, lctx);
794 		INP_WUNLOCK(inp);
795 		return (status);
796 	}
797 
798 	/*
799 	 * Failed to start hardware listener.  Take inp out of the hash and
800 	 * release our reference on it.  An error message has been logged
801 	 * already.
802 	 */
803 	if (status != CPL_ERR_NONE) {
804 		listen_hash_del(sc, inp);
805 		if (release_lctx(sc, lctx) != NULL)
806 			INP_WUNLOCK(inp);
807 		return (status);
808 	}
809 
810 	/* hardware listener open for business */
811 
812 	INP_WUNLOCK(inp);
813 	return (status);
814 }
815 
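/* Reply to the CPL_CLOSE_LISTSRV_REQ sent by destroy_server. */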
816 static int
817 do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
818     struct mbuf *m)
819 {
820 	struct adapter *sc = iq->adapter;
821 	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
822 	int stid = GET_TID(cpl);
823 	unsigned int status = cpl->status;
824 	struct listen_ctx *lctx = lookup_stid(sc, stid);
825 	struct inpcb *inp = lctx->inp;
826 #ifdef INVARIANTS
827 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
828 #endif
829 
830 	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
831 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
832 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
833 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
834 
835 	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
836 
837 	if (status != CPL_ERR_NONE) {
838 		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
839 		    __func__, status, stid);
840 		return (status);
841 	}
842 
843 	INP_WLOCK(inp);
844 	inp = release_lctx(sc, lctx);
845 	if (inp != NULL)
846 		INP_WUNLOCK(inp);
847 
848 	return (status);
849 }
850 
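/*
 * Tear down an embryonic connection: give back its tid and L2T entry and drop
 * the reference that the tid held on the synqe.  The inp lock is dropped
 * before returning.
 */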
851 static void
852 done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
853 {
854 	struct listen_ctx *lctx = synqe->lctx;
855 	struct inpcb *inp = lctx->inp;
856 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
857 	int ntids;
858 
859 	INP_WLOCK_ASSERT(inp);
860 	ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;
861 
862 	remove_tid(sc, synqe->tid, ntids);
863 	release_tid(sc, synqe->tid, lctx->ctrlq);
864 	t4_l2t_release(e);
865 	inp = release_synqe(sc, synqe);
866 	if (inp)
867 		INP_WUNLOCK(inp);
868 }
869 
870 void
871 synack_failure_cleanup(struct adapter *sc, int tid)
872 {
873 	struct synq_entry *synqe = lookup_tid(sc, tid);
874 
875 	INP_WLOCK(synqe->lctx->inp);
876 	done_with_synqe(sc, synqe);
877 }
878 
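/* Peer-initiated abort (CPL_ABORT_REQ_RSS) of a connection still on the synq. */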
879 int
880 do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
881     struct mbuf *m)
882 {
883 	struct adapter *sc = iq->adapter;
884 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
885 	unsigned int tid = GET_TID(cpl);
886 	struct synq_entry *synqe = lookup_tid(sc, tid);
887 	struct listen_ctx *lctx = synqe->lctx;
888 	struct inpcb *inp = lctx->inp;
889 	struct sge_ofld_txq *ofld_txq;
890 #ifdef INVARIANTS
891 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
892 #endif
893 
894 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
895 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
896 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
897 	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
898 
899 	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
900 	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
901 
902 	if (negative_advice(cpl->status))
903 		return (0);	/* Ignore negative advice */
904 
905 	INP_WLOCK(inp);
906 
907 	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
908 
909 	if (!(synqe->flags & TPF_FLOWC_WR_SENT))
910 		send_flowc_wr_synqe(sc, synqe);
911 
912 	/*
913 	 * If we'd initiated an abort earlier the reply to it is responsible for
914 	 * cleaning up resources.  Otherwise we tear everything down right here
915 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
916 	 */
917 	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
918 		INP_WUNLOCK(inp);
919 		goto done;
920 	}
921 
922 	done_with_synqe(sc, synqe);
923 	/* inp lock released by done_with_synqe */
924 done:
925 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
926 	return (0);
927 }
928 
929 int
930 do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
931     struct mbuf *m)
932 {
933 	struct adapter *sc = iq->adapter;
934 	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
935 	unsigned int tid = GET_TID(cpl);
936 	struct synq_entry *synqe = lookup_tid(sc, tid);
937 	struct listen_ctx *lctx = synqe->lctx;
938 	struct inpcb *inp = lctx->inp;
939 #ifdef INVARIANTS
940 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
941 #endif
942 
943 	KASSERT(opcode == CPL_ABORT_RPL_RSS,
944 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
945 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
946 	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
947 
948 	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
949 	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
950 
951 	INP_WLOCK(inp);
952 	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
953 	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
954 	    __func__, synqe, synqe->flags));
955 
956 	done_with_synqe(sc, synqe);
957 	/* inp lock released by done_with_synqe */
958 
959 	return (0);
960 }
961 
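/*
 * Install the TOE's state on the socket created by syncache_expand: attach
 * the toepcb, mark the connection established, and switch the tid's context
 * from the synqe to the toepcb.
 */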
962 void
963 t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
964 {
965 	struct adapter *sc = tod->tod_softc;
966 	struct synq_entry *synqe = arg;
967 	struct inpcb *inp = sotoinpcb(so);
968 	struct toepcb *toep = synqe->toep;
969 
970 	NET_EPOCH_ASSERT();	/* prevents bad race with accept() */
971 	INP_WLOCK_ASSERT(inp);
972 	KASSERT(synqe->flags & TPF_SYNQE,
973 	    ("%s: %p not a synq_entry?", __func__, arg));
974 	MPASS(toep->tid == synqe->tid);
975 
976 	offload_socket(so, toep);
977 	make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt);
978 	toep->flags |= TPF_CPL_PENDING;
979 	update_tid(sc, synqe->tid, toep);
980 	synqe->flags |= TPF_SYNQE_EXPANDED;
981 	inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ?
982 	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4;
983 	inp->inp_flowid = synqe->rss_hash;
984 }
985 
986 static void
987 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
988 {
989 	bzero(to, sizeof(*to));
990 
991 	if (t4opt->mss) {
992 		to->to_flags |= TOF_MSS;
993 		to->to_mss = be16toh(t4opt->mss);
994 	}
995 
996 	if (t4opt->wsf > 0 && t4opt->wsf < 15) {
997 		to->to_flags |= TOF_SCALE;
998 		to->to_wscale = t4opt->wsf;
999 	}
1000 
1001 	if (t4opt->tstamp)
1002 		to->to_flags |= TOF_TS;
1003 
1004 	if (t4opt->sack)
1005 		to->to_flags |= TOF_SACKPERM;
1006 }
1007 
1008 static bool
1009 encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl)
1010 {
1011 	u_int hlen = be32toh(cpl->hdr_len);
1012 
1013 	if (chip_id(sc) >= CHELSIO_T6)
1014 		return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header));
1015 	else
1016 		return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header));
1017 }
1018 
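/*
 * Reconstruct the connection info, TCP header, and IP TOS from the SYN that
 * is embedded in the CPL_PASS_ACCEPT_REQ.
 */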
1019 static void
1020 pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
1021     struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos)
1022 {
1023 	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1024 	const struct ether_header *eh;
1025 	unsigned int hlen = be32toh(cpl->hdr_len);
1026 	uintptr_t l3hdr;
1027 	const struct tcphdr *tcp;
1028 
1029 	eh = (const void *)(cpl + 1);
1030 	if (chip_id(sc) >= CHELSIO_T6) {
1031 		l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
1032 		tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
1033 	} else {
1034 		l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
1035 		tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
1036 	}
1037 
1038 	/* extract TOS (DiffServ + ECN) byte for AccECN */
1039 	if (iptos) {
1040 		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
1041 			const struct ip *ip = (const void *)l3hdr;
1042 			*iptos = ip->ip_tos;
1043 		}
1044 #ifdef INET6
1045 		else
1046 		if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) {
1047 			const struct ip6_hdr *ip6 = (const void *)l3hdr;
1048 			*iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
1049 		}
1050 #endif /* INET6 */
1051 	}
1052 
1053 	if (inc) {
1054 		bzero(inc, sizeof(*inc));
1055 		inc->inc_fport = tcp->th_sport;
1056 		inc->inc_lport = tcp->th_dport;
1057 		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
1058 			const struct ip *ip = (const void *)l3hdr;
1059 
1060 			inc->inc_faddr = ip->ip_src;
1061 			inc->inc_laddr = ip->ip_dst;
1062 		} else {
1063 			const struct ip6_hdr *ip6 = (const void *)l3hdr;
1064 
1065 			inc->inc_flags |= INC_ISIPV6;
1066 			inc->inc6_faddr = ip6->ip6_src;
1067 			inc->inc6_laddr = ip6->ip6_dst;
1068 		}
1069 	}
1070 
1071 	if (th) {
1072 		bcopy(tcp, th, sizeof(*th));
1073 		tcp_fields_to_host(th);		/* just like tcp_input */
1074 	}
1075 }
1076 
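/*
 * Resolve the L2 next hop for the peer and return an L2T entry for it, or
 * NULL if the connection's route doesn't go out of the given ifnet.
 */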
1077 static struct l2t_entry *
1078 get_l2te_for_nexthop(struct port_info *pi, if_t ifp,
1079     struct in_conninfo *inc)
1080 {
1081 	struct l2t_entry *e;
1082 	struct sockaddr_in6 sin6;
1083 	struct sockaddr *dst = (void *)&sin6;
1084 	struct nhop_object *nh;
1085 
1086 	if (inc->inc_flags & INC_ISIPV6) {
1087 		bzero(dst, sizeof(struct sockaddr_in6));
1088 		dst->sa_len = sizeof(struct sockaddr_in6);
1089 		dst->sa_family = AF_INET6;
1090 
1091 		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
1092 			/* no need for route lookup */
1093 			e = t4_l2t_get(pi, ifp, dst);
1094 			return (e);
1095 		}
1096 
1097 		nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0);
1098 		if (nh == NULL)
1099 			return (NULL);
1100 		if (nh->nh_ifp != ifp)
1101 			return (NULL);
1102 		if (nh->nh_flags & NHF_GATEWAY)
1103 			((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr;
1104 		else
1105 			((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr;
1106 	} else {
1107 		dst->sa_len = sizeof(struct sockaddr_in);
1108 		dst->sa_family = AF_INET;
1109 
1110 		nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0);
1111 		if (nh == NULL)
1112 			return (NULL);
1113 		if (nh->nh_ifp != ifp)
1114 			return (NULL);
1115 		if (nh->nh_flags & NHF_GATEWAY)
1116 			if (nh->gw_sa.sa_family == AF_INET)
1117 				((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr;
1118 			else
1119 				*((struct sockaddr_in6 *)dst) = nh->gw6_sa;
1120 		else
1121 			((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr;
1122 	}
1123 
1124 	e = t4_l2t_get(pi, ifp, dst);
1125 	return (e);
1126 }
1127 
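/*
 * Hand the SYN|ACK to the hardware: a CPL_PASS_ACCEPT_RPL carrying the
 * connection's opt0/opt2 settings, sent via the L2T entry for the peer.
 */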
1128 static int
1129 send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0,
1130     uint32_t opt2, int tid)
1131 {
1132 	struct wrqe *wr;
1133 	struct cpl_pass_accept_rpl *rpl;
1134 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
1135 
1136 	wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
1137 	    sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]);
1138 	if (wr == NULL)
1139 		return (ENOMEM);
1140 	rpl = wrtod(wr);
1141 
1142 	if (is_t4(sc))
1143 		INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
1144 	else {
1145 		struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;
1146 
1147 		INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
1148 		rpl5->iss = htobe32(synqe->iss);
1149 	}
1150 	rpl->opt0 = opt0;
1151 	rpl->opt2 = opt2;
1152 
1153 	return (t4_l2t_send(sc, wr, e));
1154 }
1155 
1156 #define REJECT_PASS_ACCEPT_REQ(tunnel)	do { \
1157 	if (!tunnel) { \
1158 		m_freem(m); \
1159 		m = NULL; \
1160 	} \
1161 	reject_reason = __LINE__; \
1162 	goto reject; \
1163 } while (0)
1164 
1165 /*
1166  * The context associated with a tid entry via insert_tid could be a synq_entry
1167  * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
1168  */
1169 CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
1170 
1171 /*
1172  * Incoming SYN on a listening socket.
1173  *
1174  * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
1175  * etc.
1176  */
1177 static int
1178 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
1179     struct mbuf *m)
1180 {
1181 	struct adapter *sc = iq->adapter;
1182 	struct toedev *tod;
1183 	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1184 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1185 	unsigned int tid = GET_TID(cpl);
1186 	struct listen_ctx *lctx = lookup_stid(sc, stid);
1187 	struct inpcb *inp;
1188 	struct socket *so;
1189 	struct in_conninfo inc;
1190 	struct tcphdr th;
1191 	struct tcpopt to;
1192 	struct port_info *pi;
1193 	struct vi_info *vi;
1194 	if_t hw_ifp, ifp;
1195 	struct l2t_entry *e = NULL;
1196 	struct synq_entry *synqe = NULL;
1197 	int reject_reason, v, ntids;
1198 	uint16_t vid, l2info;
1199 	struct epoch_tracker et;
1200 #ifdef INVARIANTS
1201 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1202 #endif
1203 	struct offload_settings settings;
1204 	uint8_t iptos;
1205 
1206 	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
1207 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1208 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1209 
1210 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
1211 	    lctx);
1212 
1213 	/*
1214 	 * Figure out the port the SYN arrived on.  We'll look for an exact VI
1215 	 * match in a bit but in case we don't find any we'll use the main VI as
1216 	 * the incoming ifnet.
1217 	 */
1218 	l2info = be16toh(cpl->l2info);
1219 	pi = sc->port[G_SYN_INTF(l2info)];
1220 	hw_ifp = pi->vi[0].ifp;
1221 	m->m_pkthdr.rcvif = hw_ifp;
1222 
1223 	CURVNET_SET(lctx->vnet);	/* before any potential REJECT */
1224 
1225 	/*
1226 	 * If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will
1227 	 * also hit the listener.  We don't want to offload those.
1228 	 */
1229 	if (encapsulated_syn(sc, cpl)) {
1230 		REJECT_PASS_ACCEPT_REQ(true);
1231 	}
1232 
1233 	/*
1234 	 * Use the MAC index to lookup the associated VI.  If this SYN didn't
1235 	 * match a perfect MAC filter, punt.
1236 	 */
1237 	if (!(l2info & F_SYN_XACT_MATCH)) {
1238 		REJECT_PASS_ACCEPT_REQ(true);
1239 	}
1240 	for_each_vi(pi, v, vi) {
1241 		if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info))
1242 			goto found;
1243 	}
1244 	REJECT_PASS_ACCEPT_REQ(true);
1245 found:
1246 	hw_ifp = vi->ifp;	/* the cxgbe ifnet */
1247 	m->m_pkthdr.rcvif = hw_ifp;
1248 	tod = TOEDEV(hw_ifp);
1249 
1250 	/*
1251 	 * Don't offload if the peer requested a TCP option that's not known to
1252 	 * the silicon.  Send the SYN to the kernel instead.
1253 	 */
1254 	if (__predict_false(cpl->tcpopt.unknown))
1255 		REJECT_PASS_ACCEPT_REQ(true);
1256 
1257 	/*
1258 	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
1259 	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
1260 	 * doesn't match anything on this interface.
1261 	 *
1262 	 * XXX: lagg support, lagg + vlan support.
1263 	 */
1264 	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
1265 	if (vid != 0xfff && vid != 0) {
1266 		ifp = VLAN_DEVAT(hw_ifp, vid);
1267 		if (ifp == NULL)
1268 			REJECT_PASS_ACCEPT_REQ(true);
1269 	} else
1270 		ifp = hw_ifp;
1271 
1272 	/*
1273 	 * Don't offload if the ifnet that the SYN came in on is not in the same
1274 	 * vnet as the listening socket.
1275 	 */
1276 	if (lctx->vnet != if_getvnet(ifp))
1277 		REJECT_PASS_ACCEPT_REQ(true);
1278 
1279 	pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos);
1280 	if (inc.inc_flags & INC_ISIPV6) {
1281 
1282 		/* Don't offload if the ifcap isn't enabled */
1283 		if ((if_getcapenable(ifp) & IFCAP_TOE6) == 0)
1284 			REJECT_PASS_ACCEPT_REQ(true);
1285 
1286 		/*
1287 		 * SYN must be directed to an IP6 address on this ifnet.  This
1288 		 * is more restrictive than in6_localip.
1289 		 */
1290 		NET_EPOCH_ENTER(et);
1291 		if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) {
1292 			NET_EPOCH_EXIT(et);
1293 			REJECT_PASS_ACCEPT_REQ(true);
1294 		}
1295 
1296 		ntids = 2;
1297 	} else {
1298 
1299 		/* Don't offload if the ifcap isn't enabled */
1300 		if ((if_getcapenable(ifp) & IFCAP_TOE4) == 0)
1301 			REJECT_PASS_ACCEPT_REQ(true);
1302 
1303 		/*
1304 		 * SYN must be directed to an IP address on this ifnet.  This
1305 		 * is more restrictive than in_localip.
1306 		 */
1307 		NET_EPOCH_ENTER(et);
1308 		if (!in_ifhasaddr(ifp, inc.inc_laddr)) {
1309 			NET_EPOCH_EXIT(et);
1310 			REJECT_PASS_ACCEPT_REQ(true);
1311 		}
1312 
1313 		ntids = 1;
1314 	}
1315 
1316 	e = get_l2te_for_nexthop(pi, ifp, &inc);
1317 	if (e == NULL) {
1318 		NET_EPOCH_EXIT(et);
1319 		REJECT_PASS_ACCEPT_REQ(true);
1320 	}
1321 
1322 	/* Don't offload if the 4-tuple is already in use */
1323 	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
1324 		NET_EPOCH_EXIT(et);
1325 		REJECT_PASS_ACCEPT_REQ(false);
1326 	}
1327 
1328 	inp = lctx->inp;		/* listening socket, not owned by TOE */
1329 	INP_RLOCK(inp);
1330 
1331 	/* Don't offload if the listening socket has closed */
1332 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1333 		INP_RUNLOCK(inp);
1334 		NET_EPOCH_EXIT(et);
1335 		REJECT_PASS_ACCEPT_REQ(false);
1336 	}
1337 	so = inp->inp_socket;
1338 	rw_rlock(&sc->policy_lock);
1339 	settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m,
1340 	    EVL_MAKETAG(0xfff, 0, 0), inp);
1341 	rw_runlock(&sc->policy_lock);
1342 	if (!settings.offload) {
1343 		INP_RUNLOCK(inp);
1344 		NET_EPOCH_EXIT(et);
1345 		REJECT_PASS_ACCEPT_REQ(true);	/* Rejected by COP. */
1346 	}
1347 
1348 	synqe = alloc_synqe(sc, lctx, M_NOWAIT);
1349 	if (synqe == NULL) {
1350 		INP_RUNLOCK(inp);
1351 		NET_EPOCH_EXIT(et);
1352 		REJECT_PASS_ACCEPT_REQ(true);
1353 	}
1354 	MPASS(rss->hash_type == RSS_HASH_TCP);
1355 	synqe->rss_hash = be32toh(rss->hash_val);
1356 	atomic_store_int(&synqe->ok_to_respond, 0);
1357 
1358 	init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx,
1359 	    &synqe->params);
1360 
1361 	/*
1362 	 * If all goes well t4_syncache_respond will get called during
1363 	 * syncache_add.  Note that syncache_add releases the pcb lock.
1364 	 */
1365 	t4opt_to_tcpopt(&cpl->tcpopt, &to);
1366 	toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos);
1367 
1368 	if (atomic_load_int(&synqe->ok_to_respond) > 0) {
1369 		uint64_t opt0;
1370 		uint32_t opt2;
1371 
1372 		opt0 = calc_options0(vi, &synqe->params);
1373 		opt2 = calc_options2(vi, &synqe->params);
1374 
1375 		insert_tid(sc, tid, synqe, ntids);
1376 		synqe->tid = tid;
1377 		synqe->syn = m;
1378 		m = NULL;
1379 
1380 		if (send_synack(sc, synqe, opt0, opt2, tid) != 0) {
1381 			remove_tid(sc, tid, ntids);
1382 			m = synqe->syn;
1383 			synqe->syn = NULL;
1384 			NET_EPOCH_EXIT(et);
1385 			REJECT_PASS_ACCEPT_REQ(true);
1386 		}
1387 
1388 		CTR6(KTR_CXGBE,
1389 		    "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x",
1390 		    __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2));
1391 	} else {
1392 		NET_EPOCH_EXIT(et);
1393 		REJECT_PASS_ACCEPT_REQ(false);
1394 	}
1395 
1396 	NET_EPOCH_EXIT(et);
1397 	CURVNET_RESTORE();
1398 	return (0);
1399 reject:
1400 	CURVNET_RESTORE();
1401 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
1402 	    reject_reason);
1403 
1404 	if (e)
1405 		t4_l2t_release(e);
1406 	release_tid(sc, tid, lctx->ctrlq);
1407 	if (synqe) {
1408 		inp = synqe->lctx->inp;
1409 		INP_WLOCK(inp);
1410 		inp = release_synqe(sc, synqe);
1411 		if (inp)
1412 			INP_WUNLOCK(inp);
1413 	}
1414 
1415 	if (m) {
1416 		/*
1417 		 * The connection request hit a TOE listener but is being passed
1418 		 * on to the kernel sw stack instead of getting offloaded.
1419 		 */
1420 		m_adj(m, sizeof(*cpl));
1421 		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
1422 		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1423 		m->m_pkthdr.csum_data = 0xffff;
1424 		if_input(hw_ifp, m);
1425 	}
1426 
1427 	return (reject_reason);
1428 }
1429 
1430 static void
1431 synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
1432     const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
1433     struct tcphdr *th, struct tcpopt *to)
1434 {
1435 	uint16_t tcp_opt = be16toh(cpl->tcp_opt);
1436 	uint8_t iptos;
1437 
1438 	/* start off with the original SYN */
1439 	pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos);
1440 
1441 	/* modify parts to make it look like the ACK to our SYN|ACK */
1442 	th->th_flags = TH_ACK;
1443 	th->th_ack = synqe->iss + 1;
1444 	th->th_seq = be32toh(cpl->rcv_isn);
1445 	bzero(to, sizeof(*to));
1446 	if (G_TCPOPT_TSTAMP(tcp_opt)) {
1447 		to->to_flags |= TOF_TS;
1448 		to->to_tsecr = synqe->ts;
1449 	}
1450 }
1451 
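/*
 * The hardware has completed the 3-way handshake.  Expand the syncache entry
 * into a full connection, attach a toepcb to the new socket, and let the
 * synqe go.
 */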
1452 static int
1453 do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
1454     struct mbuf *m)
1455 {
1456 	struct adapter *sc = iq->adapter;
1457 	struct vi_info *vi;
1458 	if_t ifp;
1459 	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
1460 #if defined(KTR) || defined(INVARIANTS)
1461 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1462 #endif
1463 	unsigned int tid = GET_TID(cpl);
1464 	struct synq_entry *synqe = lookup_tid(sc, tid);
1465 	struct listen_ctx *lctx = synqe->lctx;
1466 	struct inpcb *inp = lctx->inp, *new_inp;
1467 	struct socket *so;
1468 	struct tcphdr th;
1469 	struct tcpopt to;
1470 	struct in_conninfo inc;
1471 	struct toepcb *toep;
1472 	struct epoch_tracker et;
1473 	int rstreason;
1474 #ifdef INVARIANTS
1475 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1476 #endif
1477 
1478 	KASSERT(opcode == CPL_PASS_ESTABLISH,
1479 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1480 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1481 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1482 	KASSERT(synqe->flags & TPF_SYNQE,
1483 	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
1484 
1485 	CURVNET_SET(lctx->vnet);
1486 	NET_EPOCH_ENTER(et);	/* for syncache_expand */
1487 	INP_WLOCK(inp);
1488 
1489 	CTR6(KTR_CXGBE,
1490 	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
1491 	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);
1492 
1493 	ifp = synqe->syn->m_pkthdr.rcvif;
1494 	vi = if_getsoftc(ifp);
1495 	KASSERT(vi->adapter == sc,
1496 	    ("%s: vi %p, sc %p mismatch", __func__, vi, sc));
1497 
1498 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1499 reset:
1500 		send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_SEND_RST);
1501 		INP_WUNLOCK(inp);
1502 		NET_EPOCH_EXIT(et);
1503 		CURVNET_RESTORE();
1504 		return (0);
1505 	}
1506 
1507 	KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
1508 	    ("%s: CPL arrived on unexpected rxq.  %d %d", __func__,
1509 	    synqe->params.rxq_idx,
1510 	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
1511 
1512 	toep = alloc_toepcb(vi, M_NOWAIT);
1513 	if (toep == NULL)
1514 		goto reset;
1515 	toep->tid = tid;
1516 	toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx];
1517 	toep->vnet = lctx->vnet;
1518 	bcopy(&synqe->params, &toep->params, sizeof(toep->params));
1519 	init_toepcb(vi, toep);
1520 
1521 	MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
1522 	MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);
1523 	synqe->tcp_opt = cpl->tcp_opt;
1524 	synqe->toep = toep;
1525 
1526 	/* Come up with something that syncache_expand should be ok with. */
1527 	synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);
1528 	if (inc.inc_flags & INC_ISIPV6) {
1529 		if (lctx->ce == NULL) {
1530 			toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true);
1531 			if (toep->ce == NULL) {
1532 				free_toepcb(toep);
1533 				goto reset;	/* RST without a CLIP entry? */
1534 			}
1535 		} else {
1536 			t4_hold_clip_entry(sc, lctx->ce);
1537 			toep->ce = lctx->ce;
1538 		}
1539 	}
1540 	so = inp->inp_socket;
1541 	KASSERT(so != NULL, ("%s: socket is NULL", __func__));
1542 
1543 	rstreason = toe_syncache_expand(&inc, &to, &th, &so);
1544 	if (rstreason < 0) {
1545 		free_toepcb(toep);
1546 		send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST);
1547 		INP_WUNLOCK(inp);
1548 		NET_EPOCH_EXIT(et);
1549 		CURVNET_RESTORE();
1550 		return (0);
1551 	} else if (rstreason == 0 || so == NULL) {
1552 		free_toepcb(toep);
1553 		goto reset;
1554 	}
1555 
1556 	/* New connection inpcb is already locked by syncache_expand(). */
1557 	new_inp = sotoinpcb(so);
1558 	INP_WLOCK_ASSERT(new_inp);
1559 	MPASS(so->so_vnet == lctx->vnet);
1560 
1561 	/*
1562 	 * This is for expansion from syncookies.
1563 	 *
1564 	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
1565 	 * anyone accept'ing a connection before we've installed our hooks, but
1566 	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
1567 	 */
1568 	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
1569 		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
1570 		t4_offload_socket(TOEDEV(ifp), synqe, so);
1571 	}
1572 
1573 	INP_WUNLOCK(new_inp);
1574 
1575 	/* Done with the synqe */
1576 	inp = release_synqe(sc, synqe);
1577 	if (inp != NULL)
1578 		INP_WUNLOCK(inp);
1579 	NET_EPOCH_EXIT(et);
1580 	CURVNET_RESTORE();
1581 
1582 	return (0);
1583 }
1584 
1585 void
1586 t4_init_listen_cpl_handlers(void)
1587 {
1588 
1589 	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
1590 	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
1591 	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
1592 	t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
1593 }
1594 
1595 void
1596 t4_uninit_listen_cpl_handlers(void)
1597 {
1598 
1599 	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
1600 	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
1601 	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
1602 	t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);
1603 }
1604 #endif
1605