xref: /freebsd/sys/dev/cxgbe/tom/t4_listen.c (revision 6419bb52)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012 Chelsio Communications, Inc.
5  * All rights reserved.
6  * Written by: Navdeep Parhar <np@FreeBSD.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 
36 #ifdef TCP_OFFLOAD
37 #include <sys/param.h>
38 #include <sys/types.h>
39 #include <sys/kernel.h>
40 #include <sys/ktr.h>
41 #include <sys/module.h>
42 #include <sys/protosw.h>
43 #include <sys/refcount.h>
44 #include <sys/domain.h>
45 #include <sys/fnv_hash.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <net/ethernet.h>
50 #include <net/if.h>
51 #include <net/if_types.h>
52 #include <net/if_vlan_var.h>
53 #include <net/route.h>
54 #include <netinet/in.h>
55 #include <netinet/in_fib.h>
56 #include <netinet/in_pcb.h>
57 #include <netinet/ip.h>
58 #include <netinet/ip6.h>
59 #include <netinet6/in6_fib.h>
60 #include <netinet6/scope6_var.h>
61 #include <netinet/tcp_timer.h>
62 #define TCPSTATES
63 #include <netinet/tcp_fsm.h>
64 #include <netinet/tcp_var.h>
65 #include <netinet/toecore.h>
66 #include <netinet/cc/cc.h>
67 
68 #include "common/common.h"
69 #include "common/t4_msg.h"
70 #include "common/t4_regs.h"
71 #include "t4_clip.h"
72 #include "tom/t4_tom_l2t.h"
73 #include "tom/t4_tom.h"
74 
/*
 * stid services: reserve, look up, and release the server tids that
 * identify hardware listeners.
 */
static int alloc_stid(struct adapter *, struct listen_ctx *, int);
static struct listen_ctx *lookup_stid(struct adapter *, int);
static void free_stid(struct adapter *, struct listen_ctx *);

/*
 * lctx services: lifetime management of listen contexts and the hash that
 * maps a listening inpcb to its listen context.
 */
static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
    struct vi_info *);
static int free_lctx(struct adapter *, struct listen_ctx *);
static void hold_lctx(struct listen_ctx *);
static void listen_hash_add(struct adapter *, struct listen_ctx *);
static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);

/* Sends a FLOWC + ABORT_REQ pair to the chip for a synq entry. */
static void send_reset_synqe(struct toedev *, struct synq_entry *);
90 static void send_reset_synqe(struct toedev *, struct synq_entry *);
91 
/*
 * Reserve stids for a listener and record lctx as their owner.
 *
 * An IPv4 listener takes one stid; an IPv6 listener takes two, aligned on a
 * multiple of two.  The stid_region embedded in lctx is linked into t->stids
 * and lctx is stored in t->stid_tab, all under t->stid_lock.
 *
 * Returns the absolute stid (table index + t->stid_base) on success, or -1 if
 * no suitably aligned run of free stids exists.
 */
static int
alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
{
	struct tid_info *t = &sc->tids;
	u_int stid, n, f, mask;
	struct stid_region *sr = &lctx->stid_region;

	/*
	 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
	 * the TCAM.  The start of the stid region is properly aligned (the chip
	 * requires each region to be 128-cell aligned).
	 */
	n = isipv6 ? 2 : 1;
	mask = n - 1;
	KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
	    ("%s: stid region (%u, %u) not properly aligned.  n = %u",
	    __func__, t->stid_base, t->nstids, n));

	mtx_lock(&t->stid_lock);
	/* Quick rejection: fewer than n stids are free in total. */
	if (n > t->nstids - t->stids_in_use) {
		mtx_unlock(&t->stid_lock);
		return (-1);
	}

	if (t->nstids_free_head >= n) {
		/*
		 * This allocation will definitely succeed because the region
		 * starts at a good alignment and we just checked we have enough
		 * stids free.
		 */
		/* f is the alignment padding left free after the new region. */
		f = t->nstids_free_head & mask;
		t->nstids_free_head -= n + f;
		stid = t->nstids_free_head;
		TAILQ_INSERT_HEAD(&t->stids, sr, link);
	} else {
		struct stid_region *s;

		/*
		 * Walk the regions in order.  After each step stid is the index
		 * just past region s and its trailing free space; carve the new
		 * region out of the end of that free space if it is big enough
		 * to hold n stids plus the alignment padding f.
		 */
		stid = t->nstids_free_head;
		TAILQ_FOREACH(s, &t->stids, link) {
			stid += s->used + s->free;
			f = stid & mask;
			if (s->free >= n + f) {
				stid -= n + f;
				s->free -= n + f;
				TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
				goto allocated;
			}
		}

		/* The walk must have covered exactly nstids entries. */
		if (__predict_false(stid != t->nstids)) {
			panic("%s: stids TAILQ (%p) corrupt."
			    "  At %d instead of %d at the end of the queue.",
			    __func__, &t->stids, stid, t->nstids);
		}

		mtx_unlock(&t->stid_lock);
		return (-1);
	}

allocated:
	/* Record the size of the region and the free stids that follow it. */
	sr->used = n;
	sr->free = f;
	t->stids_in_use += n;
	t->stid_tab[stid] = lctx;
	mtx_unlock(&t->stid_lock);

	KASSERT(((stid + t->stid_base) & mask) == 0,
	    ("%s: EDOOFUS.", __func__));
	return (stid + t->stid_base);
}
162 
163 static struct listen_ctx *
164 lookup_stid(struct adapter *sc, int stid)
165 {
166 	struct tid_info *t = &sc->tids;
167 
168 	return (t->stid_tab[stid - t->stid_base]);
169 }
170 
171 static void
172 free_stid(struct adapter *sc, struct listen_ctx *lctx)
173 {
174 	struct tid_info *t = &sc->tids;
175 	struct stid_region *sr = &lctx->stid_region;
176 	struct stid_region *s;
177 
178 	KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));
179 
180 	mtx_lock(&t->stid_lock);
181 	s = TAILQ_PREV(sr, stid_head, link);
182 	if (s != NULL)
183 		s->free += sr->used + sr->free;
184 	else
185 		t->nstids_free_head += sr->used + sr->free;
186 	KASSERT(t->stids_in_use >= sr->used,
187 	    ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
188 	    t->stids_in_use, sr->used));
189 	t->stids_in_use -= sr->used;
190 	TAILQ_REMOVE(&t->stids, sr, link);
191 	mtx_unlock(&t->stid_lock);
192 }
193 
194 static struct listen_ctx *
195 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
196 {
197 	struct listen_ctx *lctx;
198 
199 	INP_WLOCK_ASSERT(inp);
200 
201 	lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
202 	if (lctx == NULL)
203 		return (NULL);
204 
205 	lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
206 	if (lctx->stid < 0) {
207 		free(lctx, M_CXGBE);
208 		return (NULL);
209 	}
210 
211 	if (inp->inp_vflag & INP_IPV6 &&
212 	    !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
213 		lctx->ce = t4_hold_lip(sc, &inp->in6p_laddr, NULL);
214 		if (lctx->ce == NULL) {
215 			free(lctx, M_CXGBE);
216 			return (NULL);
217 		}
218 	}
219 
220 	lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
221 	lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
222 	refcount_init(&lctx->refcount, 1);
223 
224 	lctx->inp = inp;
225 	lctx->vnet = inp->inp_socket->so_vnet;
226 	in_pcbref(inp);
227 
228 	return (lctx);
229 }
230 
231 /* Don't call this directly, use release_lctx instead */
232 static int
233 free_lctx(struct adapter *sc, struct listen_ctx *lctx)
234 {
235 	struct inpcb *inp = lctx->inp;
236 
237 	INP_WLOCK_ASSERT(inp);
238 	KASSERT(lctx->refcount == 0,
239 	    ("%s: refcount %d", __func__, lctx->refcount));
240 	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
241 
242 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
243 	    __func__, lctx->stid, lctx, lctx->inp);
244 
245 	if (lctx->ce)
246 		t4_release_lip(sc, lctx->ce);
247 	free_stid(sc, lctx);
248 	free(lctx, M_CXGBE);
249 
250 	return (in_pcbrele_wlocked(inp));
251 }
252 
253 static void
254 hold_lctx(struct listen_ctx *lctx)
255 {
256 
257 	refcount_acquire(&lctx->refcount);
258 }
259 
260 static inline uint32_t
261 listen_hashfn(void *key, u_long mask)
262 {
263 
264 	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
265 }
266 
267 /*
268  * Add a listen_ctx entry to the listen hash table.
269  */
270 static void
271 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
272 {
273 	struct tom_data *td = sc->tom_softc;
274 	int bucket = listen_hashfn(lctx->inp, td->listen_mask);
275 
276 	mtx_lock(&td->lctx_hash_lock);
277 	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
278 	td->lctx_count++;
279 	mtx_unlock(&td->lctx_hash_lock);
280 }
281 
282 /*
283  * Look for the listening socket's context entry in the hash and return it.
284  */
285 static struct listen_ctx *
286 listen_hash_find(struct adapter *sc, struct inpcb *inp)
287 {
288 	struct tom_data *td = sc->tom_softc;
289 	int bucket = listen_hashfn(inp, td->listen_mask);
290 	struct listen_ctx *lctx;
291 
292 	mtx_lock(&td->lctx_hash_lock);
293 	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
294 		if (lctx->inp == inp)
295 			break;
296 	}
297 	mtx_unlock(&td->lctx_hash_lock);
298 
299 	return (lctx);
300 }
301 
302 /*
303  * Removes the listen_ctx structure for inp from the hash and returns it.
304  */
305 static struct listen_ctx *
306 listen_hash_del(struct adapter *sc, struct inpcb *inp)
307 {
308 	struct tom_data *td = sc->tom_softc;
309 	int bucket = listen_hashfn(inp, td->listen_mask);
310 	struct listen_ctx *lctx, *l;
311 
312 	mtx_lock(&td->lctx_hash_lock);
313 	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
314 		if (lctx->inp == inp) {
315 			LIST_REMOVE(lctx, link);
316 			td->lctx_count--;
317 			break;
318 		}
319 	}
320 	mtx_unlock(&td->lctx_hash_lock);
321 
322 	return (lctx);
323 }
324 
325 /*
326  * Releases a hold on the lctx.  Must be called with the listening socket's inp
327  * locked.  The inp may be freed by this function and it returns NULL to
328  * indicate this.
329  */
330 static struct inpcb *
331 release_lctx(struct adapter *sc, struct listen_ctx *lctx)
332 {
333 	struct inpcb *inp = lctx->inp;
334 	int inp_freed = 0;
335 
336 	INP_WLOCK_ASSERT(inp);
337 	if (refcount_release(&lctx->refcount))
338 		inp_freed = free_lctx(sc, lctx);
339 
340 	return (inp_freed ? NULL : inp);
341 }
342 
/*
 * Asks the chip to abort an embryonic connection (synq entry) with a RST.
 *
 * A single wrqe carrying two work requests is built: a FLOWC WR describing the
 * flow, followed by a CPL_ABORT_REQ with CPL_ABORT_SEND_RST.  It is handed to
 * t4_l2t_send along with the synqe's L2T entry.
 *
 * Does nothing if TPF_ABORT_SHUTDOWN is already set on the synqe; otherwise
 * sets TPF_ABORT_SHUTDOWN and TPF_FLOWC_WR_SENT.  The listener's inp must be
 * write locked.  Panics if the wrqe cannot be allocated.
 */
static void
send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
{
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m = synqe->syn;
	struct ifnet *ifp = m->m_pkthdr.rcvif;
	struct vi_info *vi = ifp->if_softc;
	struct port_info *pi = vi->pi;
	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	struct cpl_abort_req *req;
	int flowclen;
	struct sge_wrq *ofld_txq;
	struct sge_ofld_rxq *ofld_rxq;
	const int nparams = 6;	/* number of mnemval entries filled in below */
	const u_int pfvf = sc->pf << S_FW_VIID_PFN;

	INP_WLOCK_ASSERT(synqe->lctx->inp);

	CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
	    __func__, synqe, synqe->flags, synqe->tid,
	    synqe->flags & TPF_ABORT_SHUTDOWN ?
	    " (abort already in progress)" : "");
	if (synqe->flags & TPF_ABORT_SHUTDOWN)
		return;	/* abort already in progress */
	synqe->flags |= TPF_ABORT_SHUTDOWN;

	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
	ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx];

	/* The wrqe will have two WRs - a flowc followed by an abort_req */
	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	/* The abort request starts at the next EQ_ESIZE boundary. */
	req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE));

	/* First the flowc ... */
	memset(flowc, 0, wr->wr_len);
	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(synqe->tid));
	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
	flowc->mnemval[0].val = htobe32(pfvf);
	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
	flowc->mnemval[1].val = htobe32(pi->tx_chan);
	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
	flowc->mnemval[2].val = htobe32(pi->tx_chan);
	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
	flowc->mnemval[4].val = htobe32(512);
	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
	flowc->mnemval[5].val = htobe32(512);
	synqe->flags |= TPF_FLOWC_WR_SENT;

	/* ... then ABORT request */
	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
	req->rsvd0 = 0;	/* don't have a snd_nxt */
	req->rsvd1 = 1;	/* no data sent yet */
	req->cmd = CPL_ABORT_SEND_RST;

	t4_l2t_send(sc, wr, e);
}
413 
414 static int
415 create_server(struct adapter *sc, struct listen_ctx *lctx)
416 {
417 	struct wrqe *wr;
418 	struct cpl_pass_open_req *req;
419 	struct inpcb *inp = lctx->inp;
420 
421 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
422 	if (wr == NULL) {
423 		log(LOG_ERR, "%s: allocation failure", __func__);
424 		return (ENOMEM);
425 	}
426 	req = wrtod(wr);
427 
428 	INIT_TP_WR(req, 0);
429 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
430 	req->local_port = inp->inp_lport;
431 	req->peer_port = 0;
432 	req->local_ip = inp->inp_laddr.s_addr;
433 	req->peer_ip = 0;
434 	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
435 	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
436 	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
437 
438 	t4_wrq_tx(sc, wr);
439 	return (0);
440 }
441 
442 static int
443 create_server6(struct adapter *sc, struct listen_ctx *lctx)
444 {
445 	struct wrqe *wr;
446 	struct cpl_pass_open_req6 *req;
447 	struct inpcb *inp = lctx->inp;
448 
449 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
450 	if (wr == NULL) {
451 		log(LOG_ERR, "%s: allocation failure", __func__);
452 		return (ENOMEM);
453 	}
454 	req = wrtod(wr);
455 
456 	INIT_TP_WR(req, 0);
457 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
458 	req->local_port = inp->inp_lport;
459 	req->peer_port = 0;
460 	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
461 	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
462 	req->peer_ip_hi = 0;
463 	req->peer_ip_lo = 0;
464 	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
465 	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
466 	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
467 
468 	t4_wrq_tx(sc, wr);
469 	return (0);
470 }
471 
472 static int
473 destroy_server(struct adapter *sc, struct listen_ctx *lctx)
474 {
475 	struct wrqe *wr;
476 	struct cpl_close_listsvr_req *req;
477 
478 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
479 	if (wr == NULL) {
480 		/* XXX */
481 		panic("%s: allocation failure.", __func__);
482 	}
483 	req = wrtod(wr);
484 
485 	INIT_TP_WR(req, 0);
486 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
487 	    lctx->stid));
488 	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
489 	req->rsvd = htobe16(0);
490 
491 	t4_wrq_tx(sc, wr);
492 	return (0);
493 }
494 
/*
 * Start a listening server by sending a passive open request to HW.
 *
 * Nothing is done (and 0 is returned) when the offload policy says not to
 * offload this listener, when the local address is a loopback address, when
 * KERN_TLS_OK is set on the adapter, when no initialized VI has IFCAP_TOE
 * enabled, or when the inp already has a hardware listener.  Failures to
 * allocate the listen context or to queue the request are logged and
 * otherwise ignored: this function always returns 0.
 *
 * The inp must be write locked by the caller.
 *
 * Can't take adapter lock here and access to sc->flags,
 * sc->offload_map, if_capenable are all race prone.
 */
int
t4_listen_start(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct vi_info *vi;
	struct port_info *pi;
	struct inpcb *inp = tp->t_inpcb;
	struct listen_ctx *lctx;
	int i, rc, v;
	struct offload_settings settings;

	INP_WLOCK_ASSERT(inp);

	/* Take a private copy of the settings while the policy is locked. */
	rw_rlock(&sc->policy_lock);
	settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL,
	    EVL_MAKETAG(0xfff, 0, 0), inp);
	rw_runlock(&sc->policy_lock);
	if (!settings.offload)
		return (0);

	/* Don't start a hardware listener for any loopback address. */
	if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
		return (0);
	if (!(inp->inp_vflag & INP_IPV6) &&
	    IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
		return (0);
	if (sc->flags & KERN_TLS_OK)
		return (0);
#if 0
	ADAPTER_LOCK(sc);
	if (IS_BUSY(sc)) {
		log(LOG_ERR, "%s: listen request ignored, %s is busy",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}

	KASSERT(uld_active(sc, ULD_TOM),
	    ("%s: TOM not initialized", __func__));
#endif

	/*
	 * Find an initialized VI with IFCAP_TOE (4 or 6).  We'll use the first
	 * such VI's queues to send the passive open and receive the reply to
	 * it.
	 *
	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
	 * then reject any attempt to bring down such a port (and maybe reject
	 * attempts to disable IFCAP_TOE on that port too?).
	 */
	for_each_port(sc, i) {
		pi = sc->port[i];
		for_each_vi(pi, v, vi) {
			if (vi->flags & VI_INIT_DONE &&
			    vi->ifp->if_capenable & IFCAP_TOE)
				goto found;
		}
	}
	goto done;	/* no port that's UP with IFCAP_TOE enabled */
found:

	if (listen_hash_find(sc, inp) != NULL)
		goto done;	/* already setup */

	lctx = alloc_lctx(sc, inp, vi);
	if (lctx == NULL) {
		log(LOG_ERR,
		    "%s: listen request ignored, %s couldn't allocate lctx\n",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}
	/* Hash the lctx before the request goes out to the chip. */
	listen_hash_add(sc, lctx);

	CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
	    __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
	    inp->inp_vflag);

	if (inp->inp_vflag & INP_IPV6)
		rc = create_server6(sc, lctx);
	else
		rc = create_server(sc, lctx);
	if (rc != 0) {
		/* Undo the hash insertion and drop the lctx's only reference. */
		log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
		    __func__, device_get_nameunit(sc->dev), rc);
		(void) listen_hash_del(sc, inp);
		inp = release_lctx(sc, lctx);
		/* can't be freed, host stack has a reference */
		KASSERT(inp != NULL, ("%s: inp freed", __func__));
		goto done;
	}
	/* Cleared by do_pass_open_rpl when the chip's reply arrives. */
	lctx->flags |= LCTX_RPL_PENDING;
done:
#if 0
	ADAPTER_UNLOCK(sc);
#endif
	return (0);
}
597 
598 int
599 t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
600 {
601 	struct listen_ctx *lctx;
602 	struct adapter *sc = tod->tod_softc;
603 	struct inpcb *inp = tp->t_inpcb;
604 
605 	INP_WLOCK_ASSERT(inp);
606 
607 	lctx = listen_hash_del(sc, inp);
608 	if (lctx == NULL)
609 		return (ENOENT);	/* no hardware listener for this inp */
610 
611 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
612 	    lctx, lctx->flags);
613 
614 	/*
615 	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
616 	 * arrive and clean up when it does.
617 	 */
618 	if (lctx->flags & LCTX_RPL_PENDING) {
619 		return (EINPROGRESS);
620 	}
621 
622 	destroy_server(sc, lctx);
623 	return (0);
624 }
625 
626 static inline struct synq_entry *
627 alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags)
628 {
629 	struct synq_entry *synqe;
630 
631 	INP_WLOCK_ASSERT(lctx->inp);
632 	MPASS(flags == M_WAITOK || flags == M_NOWAIT);
633 
634 	synqe = malloc(sizeof(*synqe), M_CXGBE, flags);
635 	if (__predict_true(synqe != NULL)) {
636 		synqe->flags = TPF_SYNQE;
637 		refcount_init(&synqe->refcnt, 1);
638 		synqe->lctx = lctx;
639 		hold_lctx(lctx);	/* Every synqe has a ref on its lctx. */
640 		synqe->syn = NULL;
641 	}
642 
643 	return (synqe);
644 }
645 
646 static inline void
647 hold_synqe(struct synq_entry *synqe)
648 {
649 
650 	refcount_acquire(&synqe->refcnt);
651 }
652 
653 static inline struct inpcb *
654 release_synqe(struct adapter *sc, struct synq_entry *synqe)
655 {
656 	struct inpcb *inp;
657 
658 	MPASS(synqe->flags & TPF_SYNQE);
659 	MPASS(synqe->lctx != NULL);
660 
661 	inp = synqe->lctx->inp;
662 	MPASS(inp != NULL);
663 	INP_WLOCK_ASSERT(inp);
664 
665 	if (refcount_release(&synqe->refcnt)) {
666 		inp = release_lctx(sc, synqe->lctx);
667 		m_freem(synqe->syn);
668 		free(synqe, M_CXGBE);
669 	}
670 
671 	return (inp);
672 }
673 
674 void
675 t4_syncache_added(struct toedev *tod __unused, void *arg)
676 {
677 	struct synq_entry *synqe = arg;
678 
679 	hold_synqe(synqe);
680 }
681 
682 void
683 t4_syncache_removed(struct toedev *tod, void *arg)
684 {
685 	struct adapter *sc = tod->tod_softc;
686 	struct synq_entry *synqe = arg;
687 	struct inpcb *inp = synqe->lctx->inp;
688 
689 	/*
690 	 * XXX: this is a LOR but harmless when running from the softclock.
691 	 */
692 	INP_WLOCK(inp);
693 	inp = release_synqe(sc, synqe);
694 	if (inp != NULL)
695 		INP_WUNLOCK(inp);
696 }
697 
698 int
699 t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
700 {
701 	struct synq_entry *synqe = arg;
702 
703 	if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) {
704 		struct tcpopt to;
705 		struct ip *ip = mtod(m, struct ip *);
706 		struct tcphdr *th;
707 
708 		if (ip->ip_v == IPVERSION)
709 			th = (void *)(ip + 1);
710 		else
711 			th = (void *)((struct ip6_hdr *)ip + 1);
712 		bzero(&to, sizeof(to));
713 		tcp_dooptions(&to, (void *)(th + 1),
714 		    (th->th_off << 2) - sizeof(*th), TO_SYN);
715 
716 		/* save these for later */
717 		synqe->iss = be32toh(th->th_seq);
718 		synqe->irs = be32toh(th->th_ack) - 1;
719 		synqe->ts = to.to_tsval;
720 	}
721 
722 	m_freem(m);	/* don't need this any more */
723 	return (0);
724 }
725 
/*
 * Handler for CPL_PASS_OPEN_RPL, the chip's reply to a passive open request.
 *
 * Clears LCTX_RPL_PENDING on the listen context found via the stid in the
 * reply, then acts on the combination of the reply status and whether the
 * listening inp has been dropped in the meantime:
 *
 *   dropped, failed:	release the lctx.
 *   dropped, ok:	ask the chip to close the listener.
 *   alive, failed:	remove the lctx from the hash and release it.
 *   alive, ok:		nothing more to do.
 *
 * The inp is write locked for the duration and is not unlocked if releasing
 * the lctx freed it.  Returns the status from the reply.
 */
static int
do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
	int stid = GET_TID(cpl);
	unsigned int status = cpl->status;
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_OPEN_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	INP_WLOCK(inp);

	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
	    __func__, stid, status, lctx->flags);

	lctx->flags &= ~LCTX_RPL_PENDING;

	if (status != CPL_ERR_NONE)
		log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);

#ifdef INVARIANTS
	/*
	 * If the inp has been dropped (listening socket closed) then
	 * listen_stop must have run and taken the inp out of the hash.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		KASSERT(listen_hash_del(sc, inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));
	}
#endif

	/*
	 * Listening socket is gone and the hardware listener never started.
	 * Only the lctx is left to clean up.
	 */
	if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
		if (release_lctx(sc, lctx) != NULL)
			INP_WUNLOCK(inp);
		return (status);
	}

	/*
	 * Listening socket stopped listening earlier and now the chip tells us
	 * it has started the hardware listener.  Stop it; the lctx will be
	 * released in do_close_server_rpl.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		destroy_server(sc, lctx);
		INP_WUNLOCK(inp);
		return (status);
	}

	/*
	 * Failed to start hardware listener.  Take inp out of the hash and
	 * release our reference on it.  An error message has been logged
	 * already.
	 */
	if (status != CPL_ERR_NONE) {
		listen_hash_del(sc, inp);
		if (release_lctx(sc, lctx) != NULL)
			INP_WUNLOCK(inp);
		return (status);
	}

	/* hardware listener open for business */

	INP_WUNLOCK(inp);
	return (status);
}
800 
/*
 * Handler for CPL_CLOSE_LISTSRV_RPL, the chip's reply to a request to close a
 * hardware listener.
 *
 * On success a reference on the listen context found via the stid in the
 * reply is released, with the inp write locked around the release.  On
 * failure the error is logged and the lctx is left alone.  Returns the status
 * from the reply.
 */
static int
do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
	int stid = GET_TID(cpl);
	unsigned int status = cpl->status;
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);

	if (status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
		    __func__, status, stid);
		return (status);
	}

	INP_WLOCK(inp);
	inp = release_lctx(sc, lctx);
	/* A NULL return means the inp was freed along with the lctx. */
	if (inp != NULL)
		INP_WUNLOCK(inp);

	return (status);
}
835 
836 static void
837 done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
838 {
839 	struct listen_ctx *lctx = synqe->lctx;
840 	struct inpcb *inp = lctx->inp;
841 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
842 	int ntids;
843 
844 	INP_WLOCK_ASSERT(inp);
845 	ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;
846 
847 	remove_tid(sc, synqe->tid, ntids);
848 	release_tid(sc, synqe->tid, lctx->ctrlq);
849 	t4_l2t_release(e);
850 	inp = release_synqe(sc, synqe);
851 	if (inp)
852 		INP_WUNLOCK(inp);
853 }
854 
855 void
856 synack_failure_cleanup(struct adapter *sc, int tid)
857 {
858 	struct synq_entry *synqe = lookup_tid(sc, tid);
859 
860 	INP_WLOCK(synqe->lctx->inp);
861 	done_with_synqe(sc, synqe);
862 }
863 
/*
 * Handler for a CPL_ABORT_REQ_RSS that refers to a synq entry.
 *
 * Negative advice is ignored.  Otherwise a CPL_ABORT_RPL is always sent back
 * on the synqe's offload tx queue, and the synqe is cleaned up right away
 * unless an abort initiated by the driver is already in progress
 * (TPF_ABORT_SHUTDOWN), in which case cleanup is left to the reply to that
 * abort.  Always returns 0.
 */
int
do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
	struct sge_wrq *ofld_txq;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_REQ_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);

	if (negative_advice(cpl->status))
		return (0);	/* Ignore negative advice */

	INP_WLOCK(inp);

	/*
	 * Look up the tx queue now; the synqe may be gone by the time the
	 * reply is sent.
	 */
	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];

	/*
	 * If we'd initiated an abort earlier the reply to it is responsible for
	 * cleaning up resources.  Otherwise we tear everything down right here
	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
	 */
	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
		INP_WUNLOCK(inp);
		goto done;
	}

	done_with_synqe(sc, synqe);
	/* inp lock released by done_with_synqe */
done:
	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
	return (0);
}
910 
/*
 * Handler for a CPL_ABORT_RPL_RSS that refers to a synq entry: the chip's
 * reply to an abort that the driver initiated (TPF_ABORT_SHUTDOWN is expected
 * to be set).  Cleans up the synqe and always returns 0.
 */
int
do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_RPL_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);

	INP_WLOCK(inp);
	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
	    __func__, synqe, synqe->flags));

	done_with_synqe(sc, synqe);
	/* inp lock released by done_with_synqe */

	return (0);
}
943 
944 void
945 t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
946 {
947 	struct adapter *sc = tod->tod_softc;
948 	struct synq_entry *synqe = arg;
949 	struct inpcb *inp = sotoinpcb(so);
950 	struct toepcb *toep = synqe->toep;
951 
952 	NET_EPOCH_ASSERT();	/* prevents bad race with accept() */
953 	INP_WLOCK_ASSERT(inp);
954 	KASSERT(synqe->flags & TPF_SYNQE,
955 	    ("%s: %p not a synq_entry?", __func__, arg));
956 	MPASS(toep->tid == synqe->tid);
957 
958 	offload_socket(so, toep);
959 	make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt);
960 	toep->flags |= TPF_CPL_PENDING;
961 	update_tid(sc, synqe->tid, toep);
962 	synqe->flags |= TPF_SYNQE_EXPANDED;
963 	inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ?
964 	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4;
965 	inp->inp_flowid = synqe->rss_hash;
966 }
967 
968 static void
969 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
970 {
971 	bzero(to, sizeof(*to));
972 
973 	if (t4opt->mss) {
974 		to->to_flags |= TOF_MSS;
975 		to->to_mss = be16toh(t4opt->mss);
976 	}
977 
978 	if (t4opt->wsf > 0 && t4opt->wsf < 15) {
979 		to->to_flags |= TOF_SCALE;
980 		to->to_wscale = t4opt->wsf;
981 	}
982 
983 	if (t4opt->tstamp)
984 		to->to_flags |= TOF_TS;
985 
986 	if (t4opt->sack)
987 		to->to_flags |= TOF_SACKPERM;
988 }
989 
/*
 * Extracts protocol header information from the packet headers that follow
 * the CPL_PASS_ACCEPT_REQ at the start of mbuf m.
 *
 * The offsets of the L3 and TCP headers come from the header lengths encoded
 * in cpl->hdr_len, which is decoded differently for T6 and later chips.
 *
 * Each of the output pointers may be NULL to skip that output:
 *   inc	zeroed, then filled with the connection's addresses and ports.
 *   th		copy of the TCP header with fields converted to host order.
 *   iptos	IPv4 TOS byte or IPv6 traffic class.  Left untouched if the
 *		IP version is neither of those.
 */
static void
pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
    struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos)
{
	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
	const struct ether_header *eh;
	unsigned int hlen = be32toh(cpl->hdr_len);
	uintptr_t l3hdr;
	const struct tcphdr *tcp;

	/* The Ethernet header immediately follows the CPL. */
	eh = (const void *)(cpl + 1);
	if (chip_id(sc) >= CHELSIO_T6) {
		l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
		tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
	} else {
		l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
		tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
	}

	/* extract TOS (DiffServ + ECN) byte for AccECN */
	if (iptos) {
		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
			const struct ip *ip = (const void *)l3hdr;
			*iptos = ip->ip_tos;
		}
#ifdef INET6
		else
		if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) {
			const struct ip6_hdr *ip6 = (const void *)l3hdr;
			*iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
		}
#endif /* INET6 */
	}

	if (inc) {
		bzero(inc, sizeof(*inc));
		/* Ports are copied as found in the packet, without swapping. */
		inc->inc_fport = tcp->th_sport;
		inc->inc_lport = tcp->th_dport;
		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
			const struct ip *ip = (const void *)l3hdr;

			inc->inc_faddr = ip->ip_src;
			inc->inc_laddr = ip->ip_dst;
		} else {
			/* Anything that isn't IPv4 is treated as IPv6 here. */
			const struct ip6_hdr *ip6 = (const void *)l3hdr;

			inc->inc_flags |= INC_ISIPV6;
			inc->inc6_faddr = ip6->ip6_src;
			inc->inc6_laddr = ip6->ip6_dst;
		}
	}

	if (th) {
		bcopy(tcp, th, sizeof(*th));
		tcp_fields_to_host(th);		/* just like tcp_input */
	}
}
1047 
1048 static struct l2t_entry *
1049 get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp,
1050     struct in_conninfo *inc)
1051 {
1052 	struct l2t_entry *e;
1053 	struct sockaddr_in6 sin6;
1054 	struct sockaddr *dst = (void *)&sin6;
1055 
1056 	if (inc->inc_flags & INC_ISIPV6) {
1057 		struct nhop6_basic nh6;
1058 
1059 		bzero(dst, sizeof(struct sockaddr_in6));
1060 		dst->sa_len = sizeof(struct sockaddr_in6);
1061 		dst->sa_family = AF_INET6;
1062 
1063 		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
1064 			/* no need for route lookup */
1065 			e = t4_l2t_get(pi, ifp, dst);
1066 			return (e);
1067 		}
1068 
1069 		if (fib6_lookup_nh_basic(RT_DEFAULT_FIB, &inc->inc6_faddr,
1070 		    0, 0, 0, &nh6) != 0)
1071 			return (NULL);
1072 		if (nh6.nh_ifp != ifp)
1073 			return (NULL);
1074 		((struct sockaddr_in6 *)dst)->sin6_addr = nh6.nh_addr;
1075 	} else {
1076 		struct nhop4_basic nh4;
1077 
1078 		dst->sa_len = sizeof(struct sockaddr_in);
1079 		dst->sa_family = AF_INET;
1080 
1081 		if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, inc->inc_faddr, 0, 0,
1082 		    &nh4) != 0)
1083 			return (NULL);
1084 		if (nh4.nh_ifp != ifp)
1085 			return (NULL);
1086 		((struct sockaddr_in *)dst)->sin_addr = nh4.nh_addr;
1087 	}
1088 
1089 	e = t4_l2t_get(pi, ifp, dst);
1090 	return (e);
1091 }
1092 
1093 static int
1094 send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0,
1095     uint32_t opt2, int tid)
1096 {
1097 	struct wrqe *wr;
1098 	struct cpl_pass_accept_rpl *rpl;
1099 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
1100 
1101 	wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
1102 	    sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]);
1103 	if (wr == NULL)
1104 		return (ENOMEM);
1105 	rpl = wrtod(wr);
1106 
1107 	if (is_t4(sc))
1108 		INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
1109 	else {
1110 		struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;
1111 
1112 		INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
1113 		rpl5->iss = htobe32(synqe->iss);
1114 	}
1115 	rpl->opt0 = opt0;
1116 	rpl->opt2 = opt2;
1117 
1118 	return (t4_l2t_send(sc, wr, e));
1119 }
1120 
/*
 * Give up on offloading the SYN being processed.  Expects the locals 'm' and
 * 'reject_reason' and the label 'reject' to exist in the enclosing function.
 *
 * tunnel: if false, the mbuf 'm' is freed and set to NULL before jumping;
 *         if true, 'm' is left alone for the code at 'reject' to deal with.
 *
 * reject_reason is set to the source line of the invocation.
 */
#define REJECT_PASS_ACCEPT_REQ(tunnel)	do { \
	if (!(tunnel)) { \
		m_freem(m); \
		m = NULL; \
	} \
	reject_reason = __LINE__; \
	goto reject; \
} while (0)
1129 
1130 /*
1131  * The context associated with a tid entry via insert_tid could be a synq_entry
1132  * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
1133  */
1134 CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
1135 
1136 /*
1137  * Incoming SYN on a listening socket.
1138  *
1139  * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
1140  * etc.
1141  */
1142 static int
1143 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
1144     struct mbuf *m)
1145 {
1146 	struct adapter *sc = iq->adapter;
1147 	struct toedev *tod;
1148 	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1149 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1150 	unsigned int tid = GET_TID(cpl);
1151 	struct listen_ctx *lctx = lookup_stid(sc, stid);
1152 	struct inpcb *inp;
1153 	struct socket *so;
1154 	struct in_conninfo inc;
1155 	struct tcphdr th;
1156 	struct tcpopt to;
1157 	struct port_info *pi;
1158 	struct vi_info *vi;
1159 	struct ifnet *hw_ifp, *ifp;
1160 	struct l2t_entry *e = NULL;
1161 	struct synq_entry *synqe = NULL;
1162 	int reject_reason, v, ntids;
1163 	uint16_t vid, l2info;
1164 	struct epoch_tracker et;
1165 #ifdef INVARIANTS
1166 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1167 #endif
1168 	struct offload_settings settings;
1169 	uint8_t iptos;
1170 
1171 	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
1172 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1173 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1174 
1175 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
1176 	    lctx);
1177 
1178 	CURVNET_SET(lctx->vnet);	/* before any potential REJECT */
1179 
1180 	/*
1181 	 * Use the MAC index to lookup the associated VI.  If this SYN didn't
1182 	 * match a perfect MAC filter, punt.
1183 	 */
1184 	l2info = be16toh(cpl->l2info);
1185 	pi = sc->port[G_SYN_INTF(l2info)];
1186 	if (!(l2info & F_SYN_XACT_MATCH)) {
1187 		REJECT_PASS_ACCEPT_REQ(false);
1188 	}
1189 	for_each_vi(pi, v, vi) {
1190 		if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info))
1191 			goto found;
1192 	}
1193 	REJECT_PASS_ACCEPT_REQ(false);
1194 found:
1195 	hw_ifp = vi->ifp;	/* the cxgbe ifnet */
1196 	m->m_pkthdr.rcvif = hw_ifp;
1197 	tod = TOEDEV(hw_ifp);
1198 
1199 	/*
1200 	 * Don't offload if the peer requested a TCP option that's not known to
1201 	 * the silicon.  Send the SYN to the kernel instead.
1202 	 */
1203 	if (__predict_false(cpl->tcpopt.unknown))
1204 		REJECT_PASS_ACCEPT_REQ(true);
1205 
1206 	/*
1207 	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
1208 	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
1209 	 * doesn't match anything on this interface.
1210 	 *
1211 	 * XXX: lagg support, lagg + vlan support.
1212 	 */
1213 	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
1214 	if (vid != 0xfff && vid != 0) {
1215 		ifp = VLAN_DEVAT(hw_ifp, vid);
1216 		if (ifp == NULL)
1217 			REJECT_PASS_ACCEPT_REQ(true);
1218 	} else
1219 		ifp = hw_ifp;
1220 
1221 	/*
1222 	 * Don't offload if the ifnet that the SYN came in on is not in the same
1223 	 * vnet as the listening socket.
1224 	 */
1225 	if (lctx->vnet != ifp->if_vnet)
1226 		REJECT_PASS_ACCEPT_REQ(true);
1227 
1228 	pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos);
1229 	if (inc.inc_flags & INC_ISIPV6) {
1230 
1231 		/* Don't offload if the ifcap isn't enabled */
1232 		if ((ifp->if_capenable & IFCAP_TOE6) == 0)
1233 			REJECT_PASS_ACCEPT_REQ(true);
1234 
1235 		/*
1236 		 * SYN must be directed to an IP6 address on this ifnet.  This
1237 		 * is more restrictive than in6_localip.
1238 		 */
1239 		NET_EPOCH_ENTER(et);
1240 		if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) {
1241 			NET_EPOCH_EXIT(et);
1242 			REJECT_PASS_ACCEPT_REQ(true);
1243 		}
1244 
1245 		ntids = 2;
1246 	} else {
1247 
1248 		/* Don't offload if the ifcap isn't enabled */
1249 		if ((ifp->if_capenable & IFCAP_TOE4) == 0)
1250 			REJECT_PASS_ACCEPT_REQ(true);
1251 
1252 		/*
1253 		 * SYN must be directed to an IP address on this ifnet.  This
1254 		 * is more restrictive than in_localip.
1255 		 */
1256 		NET_EPOCH_ENTER(et);
1257 		if (!in_ifhasaddr(ifp, inc.inc_laddr)) {
1258 			NET_EPOCH_EXIT(et);
1259 			REJECT_PASS_ACCEPT_REQ(true);
1260 		}
1261 
1262 		ntids = 1;
1263 	}
1264 
1265 	e = get_l2te_for_nexthop(pi, ifp, &inc);
1266 	if (e == NULL) {
1267 		NET_EPOCH_EXIT(et);
1268 		REJECT_PASS_ACCEPT_REQ(true);
1269 	}
1270 
1271 	/* Don't offload if the 4-tuple is already in use */
1272 	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
1273 		NET_EPOCH_EXIT(et);
1274 		REJECT_PASS_ACCEPT_REQ(false);
1275 	}
1276 
1277 	inp = lctx->inp;		/* listening socket, not owned by TOE */
1278 	INP_WLOCK(inp);
1279 
1280 	/* Don't offload if the listening socket has closed */
1281 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1282 		INP_WUNLOCK(inp);
1283 		NET_EPOCH_EXIT(et);
1284 		REJECT_PASS_ACCEPT_REQ(false);
1285 	}
1286 	so = inp->inp_socket;
1287 	rw_rlock(&sc->policy_lock);
1288 	settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m,
1289 	    EVL_MAKETAG(0xfff, 0, 0), inp);
1290 	rw_runlock(&sc->policy_lock);
1291 	if (!settings.offload) {
1292 		INP_WUNLOCK(inp);
1293 		NET_EPOCH_EXIT(et);
1294 		REJECT_PASS_ACCEPT_REQ(true);	/* Rejected by COP. */
1295 	}
1296 
1297 	synqe = alloc_synqe(sc, lctx, M_NOWAIT);
1298 	if (synqe == NULL) {
1299 		INP_WUNLOCK(inp);
1300 		NET_EPOCH_EXIT(et);
1301 		REJECT_PASS_ACCEPT_REQ(true);
1302 	}
1303 	MPASS(rss->hash_type == RSS_HASH_TCP);
1304 	synqe->rss_hash = be32toh(rss->hash_val);
1305 	atomic_store_int(&synqe->ok_to_respond, 0);
1306 
1307 	init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx,
1308 	    &synqe->params);
1309 
1310 	/*
1311 	 * If all goes well t4_syncache_respond will get called during
1312 	 * syncache_add.  Note that syncache_add releases the pcb lock.
1313 	 */
1314 	t4opt_to_tcpopt(&cpl->tcpopt, &to);
1315 	toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos);
1316 
1317 	if (atomic_load_int(&synqe->ok_to_respond) > 0) {
1318 		uint64_t opt0;
1319 		uint32_t opt2;
1320 
1321 		opt0 = calc_options0(vi, &synqe->params);
1322 		opt2 = calc_options2(vi, &synqe->params);
1323 
1324 		insert_tid(sc, tid, synqe, ntids);
1325 		synqe->tid = tid;
1326 		synqe->syn = m;
1327 		m = NULL;
1328 
1329 		if (send_synack(sc, synqe, opt0, opt2, tid) != 0) {
1330 			remove_tid(sc, tid, ntids);
1331 			m = synqe->syn;
1332 			synqe->syn = NULL;
1333 			NET_EPOCH_EXIT(et);
1334 			REJECT_PASS_ACCEPT_REQ(true);
1335 		}
1336 
1337 		CTR6(KTR_CXGBE,
1338 		    "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x",
1339 		    __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2));
1340 	} else {
1341 		NET_EPOCH_EXIT(et);
1342 		REJECT_PASS_ACCEPT_REQ(false);
1343 	}
1344 
1345 	NET_EPOCH_EXIT(et);
1346 	CURVNET_RESTORE();
1347 	return (0);
1348 reject:
1349 	CURVNET_RESTORE();
1350 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
1351 	    reject_reason);
1352 
1353 	if (e)
1354 		t4_l2t_release(e);
1355 	release_tid(sc, tid, lctx->ctrlq);
1356 	if (synqe) {
1357 		inp = synqe->lctx->inp;
1358 		INP_WLOCK(inp);
1359 		inp = release_synqe(sc, synqe);
1360 		if (inp)
1361 			INP_WUNLOCK(inp);
1362 	}
1363 
1364 	if (m) {
1365 		/*
1366 		 * The connection request hit a TOE listener but is being passed
1367 		 * on to the kernel sw stack instead of getting offloaded.
1368 		 */
1369 		m_adj(m, sizeof(*cpl));
1370 		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
1371 		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1372 		m->m_pkthdr.csum_data = 0xffff;
1373 		hw_ifp->if_input(hw_ifp, m);
1374 	}
1375 
1376 	return (reject_reason);
1377 }
1378 
1379 static void
1380 synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
1381     const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
1382     struct tcphdr *th, struct tcpopt *to)
1383 {
1384 	uint16_t tcp_opt = be16toh(cpl->tcp_opt);
1385 	uint8_t iptos;
1386 
1387 	/* start off with the original SYN */
1388 	pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos);
1389 
1390 	/* modify parts to make it look like the ACK to our SYN|ACK */
1391 	th->th_flags = TH_ACK;
1392 	th->th_ack = synqe->iss + 1;
1393 	th->th_seq = be32toh(cpl->rcv_isn);
1394 	bzero(to, sizeof(*to));
1395 	if (G_TCPOPT_TSTAMP(tcp_opt)) {
1396 		to->to_flags |= TOF_TS;
1397 		to->to_tsecr = synqe->ts;
1398 	}
1399 }
1400 
1401 static int
1402 do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
1403     struct mbuf *m)
1404 {
1405 	struct adapter *sc = iq->adapter;
1406 	struct vi_info *vi;
1407 	struct ifnet *ifp;
1408 	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
1409 #if defined(KTR) || defined(INVARIANTS)
1410 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1411 #endif
1412 	unsigned int tid = GET_TID(cpl);
1413 	struct synq_entry *synqe = lookup_tid(sc, tid);
1414 	struct listen_ctx *lctx = synqe->lctx;
1415 	struct inpcb *inp = lctx->inp, *new_inp;
1416 	struct socket *so;
1417 	struct tcphdr th;
1418 	struct tcpopt to;
1419 	struct in_conninfo inc;
1420 	struct toepcb *toep;
1421 	struct epoch_tracker et;
1422 #ifdef INVARIANTS
1423 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1424 #endif
1425 
1426 	KASSERT(opcode == CPL_PASS_ESTABLISH,
1427 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1428 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1429 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1430 	KASSERT(synqe->flags & TPF_SYNQE,
1431 	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
1432 
1433 	CURVNET_SET(lctx->vnet);
1434 	NET_EPOCH_ENTER(et);	/* for syncache_expand */
1435 	INP_WLOCK(inp);
1436 
1437 	CTR6(KTR_CXGBE,
1438 	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
1439 	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);
1440 
1441 	ifp = synqe->syn->m_pkthdr.rcvif;
1442 	vi = ifp->if_softc;
1443 	KASSERT(vi->pi->adapter == sc,
1444 	    ("%s: vi %p, sc %p mismatch", __func__, vi, sc));
1445 
1446 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1447 reset:
1448 		send_reset_synqe(TOEDEV(ifp), synqe);
1449 		INP_WUNLOCK(inp);
1450 		NET_EPOCH_EXIT(et);
1451 		CURVNET_RESTORE();
1452 		return (0);
1453 	}
1454 
1455 	KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
1456 	    ("%s: CPL arrived on unexpected rxq.  %d %d", __func__,
1457 	    synqe->params.rxq_idx,
1458 	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
1459 
1460 	toep = alloc_toepcb(vi, M_NOWAIT);
1461 	if (toep == NULL)
1462 		goto reset;
1463 	toep->tid = tid;
1464 	toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx];
1465 	toep->vnet = lctx->vnet;
1466 	bcopy(&synqe->params, &toep->params, sizeof(toep->params));
1467 	init_toepcb(vi, toep);
1468 
1469 	MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
1470 	MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);
1471 	synqe->tcp_opt = cpl->tcp_opt;
1472 	synqe->toep = toep;
1473 
1474 	/* Come up with something that syncache_expand should be ok with. */
1475 	synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);
1476 	if (inc.inc_flags & INC_ISIPV6)
1477 		toep->ce = t4_hold_lip(sc, &inc.inc6_laddr, lctx->ce);
1478 	so = inp->inp_socket;
1479 	KASSERT(so != NULL, ("%s: socket is NULL", __func__));
1480 
1481 	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
1482 		free_toepcb(toep);
1483 		goto reset;
1484 	}
1485 
1486 	/* New connection inpcb is already locked by syncache_expand(). */
1487 	new_inp = sotoinpcb(so);
1488 	INP_WLOCK_ASSERT(new_inp);
1489 	MPASS(so->so_vnet == lctx->vnet);
1490 
1491 	/*
1492 	 * This is for expansion from syncookies.
1493 	 *
1494 	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
1495 	 * anyone accept'ing a connection before we've installed our hooks, but
1496 	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
1497 	 */
1498 	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
1499 		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
1500 		t4_offload_socket(TOEDEV(ifp), synqe, so);
1501 	}
1502 
1503 	INP_WUNLOCK(new_inp);
1504 
1505 	/* Done with the synqe */
1506 	inp = release_synqe(sc, synqe);
1507 	if (inp != NULL)
1508 		INP_WUNLOCK(inp);
1509 	NET_EPOCH_EXIT(et);
1510 	CURVNET_RESTORE();
1511 
1512 	return (0);
1513 }
1514 
1515 void
1516 t4_init_listen_cpl_handlers(void)
1517 {
1518 
1519 	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
1520 	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
1521 	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
1522 	t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
1523 }
1524 
1525 void
1526 t4_uninit_listen_cpl_handlers(void)
1527 {
1528 
1529 	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
1530 	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
1531 	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
1532 	t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);
1533 }
1534 #endif
1535