xref: /freebsd/sys/dev/cxgbe/tom/t4_listen.c (revision 685dc743)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2012 Chelsio Communications, Inc.
5  * All rights reserved.
6  * Written by: Navdeep Parhar <np@FreeBSD.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 
34 #ifdef TCP_OFFLOAD
35 #include <sys/param.h>
36 #include <sys/types.h>
37 #include <sys/kernel.h>
38 #include <sys/ktr.h>
39 #include <sys/module.h>
40 #include <sys/protosw.h>
41 #include <sys/refcount.h>
42 #include <sys/domain.h>
43 #include <sys/fnv_hash.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <net/ethernet.h>
48 #include <net/if.h>
49 #include <net/if_types.h>
50 #include <net/if_vlan_var.h>
51 #include <net/route.h>
52 #include <net/route/nhop.h>
53 #include <netinet/in.h>
54 #include <netinet/in_fib.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/ip.h>
57 #include <netinet/ip6.h>
58 #include <netinet6/in6_fib.h>
59 #include <netinet6/scope6_var.h>
60 #include <netinet/tcp_timer.h>
61 #define TCPSTATES
62 #include <netinet/tcp_fsm.h>
63 #include <netinet/tcp_var.h>
64 #include <netinet/toecore.h>
65 #include <netinet/cc/cc.h>
66 
67 #include "common/common.h"
68 #include "common/t4_msg.h"
69 #include "common/t4_regs.h"
70 #include "t4_clip.h"
71 #include "tom/t4_tom_l2t.h"
72 #include "tom/t4_tom.h"
73 
74 /* stid services */
75 static int alloc_stid(struct adapter *, struct listen_ctx *, int);
76 static struct listen_ctx *lookup_stid(struct adapter *, int);
77 static void free_stid(struct adapter *, struct listen_ctx *);
78 
79 /* lctx services */
80 static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
81     struct vi_info *);
82 static int free_lctx(struct adapter *, struct listen_ctx *);
83 static void hold_lctx(struct listen_ctx *);
84 static void listen_hash_add(struct adapter *, struct listen_ctx *);
85 static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
86 static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
87 static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
88 
89 static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int);
90 
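/*
 * Allocate a server tid (stid) for a listening context.  An IPv6 listener
 * needs two consecutive, naturally aligned stids.  Returns the absolute stid
 * or -1 if none are available.
 */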
91 static int
92 alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
93 {
94 	struct tid_info *t = &sc->tids;
95 	u_int stid, n, f, mask;
96 	struct stid_region *sr = &lctx->stid_region;
97 
98 	/*
99 	 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
100 	 * the TCAM.  The start of the stid region is properly aligned (the chip
101 	 * requires each region to be 128-cell aligned).
102 	 */
103 	n = isipv6 ? 2 : 1;
104 	mask = n - 1;
105 	KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
106 	    ("%s: stid region (%u, %u) not properly aligned.  n = %u",
107 	    __func__, t->stid_base, t->nstids, n));
108 
109 	mtx_lock(&t->stid_lock);
110 	if (n > t->nstids - t->stids_in_use) {
111 		mtx_unlock(&t->stid_lock);
112 		return (-1);
113 	}
114 
115 	if (t->nstids_free_head >= n) {
116 		/*
117 		 * This allocation will definitely succeed because the region
118 		 * starts at a good alignment and we just checked we have enough
119 		 * stids free.
120 		 */
121 		f = t->nstids_free_head & mask;
122 		t->nstids_free_head -= n + f;
123 		stid = t->nstids_free_head;
124 		TAILQ_INSERT_HEAD(&t->stids, sr, link);
125 	} else {
126 		struct stid_region *s;
127 
128 		stid = t->nstids_free_head;
129 		TAILQ_FOREACH(s, &t->stids, link) {
130 			stid += s->used + s->free;
131 			f = stid & mask;
132 			if (s->free >= n + f) {
133 				stid -= n + f;
134 				s->free -= n + f;
135 				TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
136 				goto allocated;
137 			}
138 		}
139 
140 		if (__predict_false(stid != t->nstids)) {
141 			panic("%s: stids TAILQ (%p) corrupt."
142 			    "  At %d instead of %d at the end of the queue.",
143 			    __func__, &t->stids, stid, t->nstids);
144 		}
145 
146 		mtx_unlock(&t->stid_lock);
147 		return (-1);
148 	}
149 
150 allocated:
151 	sr->used = n;
152 	sr->free = f;
153 	t->stids_in_use += n;
154 	t->stid_tab[stid] = lctx;
155 	mtx_unlock(&t->stid_lock);
156 
157 	KASSERT(((stid + t->stid_base) & mask) == 0,
158 	    ("%s: EDOOFUS.", __func__));
159 	return (stid + t->stid_base);
160 }
161 
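/* Return the listen_ctx registered for the given absolute stid. */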
162 static struct listen_ctx *
163 lookup_stid(struct adapter *sc, int stid)
164 {
165 	struct tid_info *t = &sc->tids;
166 
167 	return (t->stid_tab[stid - t->stid_base]);
168 }
169 
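/*
 * Return an lctx's stids to the free pool, merging them into the previous
 * region's free space (or into the free head if this was the first region).
 */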
170 static void
171 free_stid(struct adapter *sc, struct listen_ctx *lctx)
172 {
173 	struct tid_info *t = &sc->tids;
174 	struct stid_region *sr = &lctx->stid_region;
175 	struct stid_region *s;
176 
177 	KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));
178 
179 	mtx_lock(&t->stid_lock);
180 	s = TAILQ_PREV(sr, stid_head, link);
181 	if (s != NULL)
182 		s->free += sr->used + sr->free;
183 	else
184 		t->nstids_free_head += sr->used + sr->free;
185 	KASSERT(t->stids_in_use >= sr->used,
186 	    ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
187 	    t->stids_in_use, sr->used));
188 	t->stids_in_use -= sr->used;
189 	TAILQ_REMOVE(&t->stids, sr, link);
190 	mtx_unlock(&t->stid_lock);
191 }
192 
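/*
 * Allocate and initialize a listening context: reserve an stid, install a
 * CLIP entry for a non-wildcard IPv6 listen address, and take a reference on
 * the inp.
 */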
193 static struct listen_ctx *
194 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
195 {
196 	struct listen_ctx *lctx;
197 
198 	INP_WLOCK_ASSERT(inp);
199 
200 	lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
201 	if (lctx == NULL)
202 		return (NULL);
203 
204 	lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
205 	if (lctx->stid < 0) {
206 		free(lctx, M_CXGBE);
207 		return (NULL);
208 	}
209 
210 	if (inp->inp_vflag & INP_IPV6 &&
211 	    !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
212 		lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true);
213 		if (lctx->ce == NULL) {
214 			free(lctx, M_CXGBE);
215 			return (NULL);
216 		}
217 	}
218 
219 	lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
220 	lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
221 	refcount_init(&lctx->refcount, 1);
222 
223 	lctx->inp = inp;
224 	lctx->vnet = inp->inp_socket->so_vnet;
225 	in_pcbref(inp);
226 
227 	return (lctx);
228 }
229 
230 /* Don't call this directly, use release_lctx instead */
231 static int
232 free_lctx(struct adapter *sc, struct listen_ctx *lctx)
233 {
234 	struct inpcb *inp = lctx->inp;
235 
236 	INP_WLOCK_ASSERT(inp);
237 	KASSERT(lctx->refcount == 0,
238 	    ("%s: refcount %d", __func__, lctx->refcount));
239 	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
240 
241 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
242 	    __func__, lctx->stid, lctx, lctx->inp);
243 
244 	if (lctx->ce)
245 		t4_release_clip_entry(sc, lctx->ce);
246 	free_stid(sc, lctx);
247 	free(lctx, M_CXGBE);
248 
249 	return (in_pcbrele_wlocked(inp));
250 }
251 
252 static void
253 hold_lctx(struct listen_ctx *lctx)
254 {
255 
256 	refcount_acquire(&lctx->refcount);
257 }
258 
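/* Hash the inp pointer to select a bucket in the listen hash table. */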
259 static inline uint32_t
260 listen_hashfn(void *key, u_long mask)
261 {
262 
263 	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
264 }
265 
266 /*
267  * Add a listen_ctx entry to the listen hash table.
268  */
269 static void
270 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
271 {
272 	struct tom_data *td = sc->tom_softc;
273 	int bucket = listen_hashfn(lctx->inp, td->listen_mask);
274 
275 	mtx_lock(&td->lctx_hash_lock);
276 	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
277 	td->lctx_count++;
278 	mtx_unlock(&td->lctx_hash_lock);
279 }
280 
281 /*
282  * Look for the listening socket's context entry in the hash and return it.
283  */
284 static struct listen_ctx *
285 listen_hash_find(struct adapter *sc, struct inpcb *inp)
286 {
287 	struct tom_data *td = sc->tom_softc;
288 	int bucket = listen_hashfn(inp, td->listen_mask);
289 	struct listen_ctx *lctx;
290 
291 	mtx_lock(&td->lctx_hash_lock);
292 	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
293 		if (lctx->inp == inp)
294 			break;
295 	}
296 	mtx_unlock(&td->lctx_hash_lock);
297 
298 	return (lctx);
299 }
300 
301 /*
302  * Removes the listen_ctx structure for inp from the hash and returns it.
303  */
304 static struct listen_ctx *
305 listen_hash_del(struct adapter *sc, struct inpcb *inp)
306 {
307 	struct tom_data *td = sc->tom_softc;
308 	int bucket = listen_hashfn(inp, td->listen_mask);
309 	struct listen_ctx *lctx, *l;
310 
311 	mtx_lock(&td->lctx_hash_lock);
312 	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
313 		if (lctx->inp == inp) {
314 			LIST_REMOVE(lctx, link);
315 			td->lctx_count--;
316 			break;
317 		}
318 	}
319 	mtx_unlock(&td->lctx_hash_lock);
320 
321 	return (lctx);
322 }
323 
324 /*
325  * Releases a hold on the lctx.  Must be called with the listening socket's inp
326  * locked.  The inp may be freed by this function and it returns NULL to
327  * indicate this.
328  */
329 static struct inpcb *
330 release_lctx(struct adapter *sc, struct listen_ctx *lctx)
331 {
332 	struct inpcb *inp = lctx->inp;
333 	int inp_freed = 0;
334 
335 	INP_WLOCK_ASSERT(inp);
336 	if (refcount_release(&lctx->refcount))
337 		inp_freed = free_lctx(sc, lctx);
338 
339 	return (inp_freed ? NULL : inp);
340 }
341 
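/*
 * Send a FLOWC work request for an embryonic connection's tid.  This must be
 * done before any other work request (such as an abort) is sent on the tid.
 */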
342 static void
343 send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe)
344 {
345 	struct mbuf *m = synqe->syn;
346 	if_t ifp = m->m_pkthdr.rcvif;
347 	struct vi_info *vi = if_getsoftc(ifp);
348 	struct port_info *pi = vi->pi;
349 	struct wrqe *wr;
350 	struct fw_flowc_wr *flowc;
351 	struct sge_ofld_txq *ofld_txq;
352 	struct sge_ofld_rxq *ofld_rxq;
353 	const int nparams = 6;
354 	const int flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
355 	const u_int pfvf = sc->pf << S_FW_VIID_PFN;
356 
357 	INP_WLOCK_ASSERT(synqe->lctx->inp);
358 	MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0);
359 
360 	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
361 	ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx];
362 
363 	wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq);
364 	if (wr == NULL) {
365 		/* XXX */
366 		panic("%s: allocation failure.", __func__);
367 	}
368 	flowc = wrtod(wr);
369 	memset(flowc, 0, wr->wr_len);
370 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
371 	    V_FW_FLOWC_WR_NPARAMS(nparams));
372 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
373 	    V_FW_WR_FLOWID(synqe->tid));
374 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
375 	flowc->mnemval[0].val = htobe32(pfvf);
376 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
377 	flowc->mnemval[1].val = htobe32(pi->tx_chan);
378 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
379 	flowc->mnemval[2].val = htobe32(pi->tx_chan);
380 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
381 	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
382 	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
383 	flowc->mnemval[4].val = htobe32(512);
384 	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
385 	flowc->mnemval[5].val = htobe32(512);
386 
387 	synqe->flags |= TPF_FLOWC_WR_SENT;
388 	t4_wrq_tx(sc, wr);
389 }
390 
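/*
 * Abort an embryonic connection: send a CPL_ABORT_REQ for the synqe's tid,
 * preceded by a FLOWC work request if one hasn't been sent yet.
 */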
391 static void
392 send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe,
393     int rst_status)
394 {
395 	struct adapter *sc = tod->tod_softc;
396 	struct wrqe *wr;
397 	struct cpl_abort_req *req;
398 
399 	INP_WLOCK_ASSERT(synqe->lctx->inp);
400 
401 	CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
402 	    __func__, synqe, synqe->flags, synqe->tid,
403 	    synqe->flags & TPF_ABORT_SHUTDOWN ?
404 	    " (abort already in progress)" : "");
405 	if (synqe->flags & TPF_ABORT_SHUTDOWN)
406 		return;	/* abort already in progress */
407 	synqe->flags |= TPF_ABORT_SHUTDOWN;
408 
409 	if (!(synqe->flags & TPF_FLOWC_WR_SENT))
410 		send_flowc_wr_synqe(sc, synqe);
411 
412 	wr = alloc_wrqe(sizeof(*req),
413 	    &sc->sge.ofld_txq[synqe->params.txq_idx].wrq);
414 	if (wr == NULL) {
415 		/* XXX */
416 		panic("%s: allocation failure.", __func__);
417 	}
418 	req = wrtod(wr);
419 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
420 	req->rsvd0 = 0;	/* don't have a snd_nxt */
421 	req->rsvd1 = 1;	/* no data sent yet */
422 	req->cmd = rst_status;
423 
424 	t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]);
425 }
426 
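/* Send a CPL_PASS_OPEN_REQ to start an IPv4 hardware listener on the stid. */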
427 static int
428 create_server(struct adapter *sc, struct listen_ctx *lctx)
429 {
430 	struct wrqe *wr;
431 	struct cpl_pass_open_req *req;
432 	struct inpcb *inp = lctx->inp;
433 
434 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
435 	if (wr == NULL) {
436 		log(LOG_ERR, "%s: allocation failure", __func__);
437 		return (ENOMEM);
438 	}
439 	req = wrtod(wr);
440 
441 	INIT_TP_WR(req, 0);
442 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
443 	req->local_port = inp->inp_lport;
444 	req->peer_port = 0;
445 	req->local_ip = inp->inp_laddr.s_addr;
446 	req->peer_ip = 0;
447 	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
448 	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
449 	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
450 
451 	t4_wrq_tx(sc, wr);
452 	return (0);
453 }
454 
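/* Send a CPL_PASS_OPEN_REQ6 to start an IPv6 hardware listener on the stid. */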
455 static int
456 create_server6(struct adapter *sc, struct listen_ctx *lctx)
457 {
458 	struct wrqe *wr;
459 	struct cpl_pass_open_req6 *req;
460 	struct inpcb *inp = lctx->inp;
461 
462 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
463 	if (wr == NULL) {
464 		log(LOG_ERR, "%s: allocation failure", __func__);
465 		return (ENOMEM);
466 	}
467 	req = wrtod(wr);
468 
469 	INIT_TP_WR(req, 0);
470 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
471 	req->local_port = inp->inp_lport;
472 	req->peer_port = 0;
473 	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
474 	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
475 	req->peer_ip_hi = 0;
476 	req->peer_ip_lo = 0;
477 	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
478 	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
479 	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
480 
481 	t4_wrq_tx(sc, wr);
482 	return (0);
483 }
484 
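/* Send a CPL_CLOSE_LISTSRV_REQ to stop the hardware listener for the stid. */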
485 static int
486 destroy_server(struct adapter *sc, struct listen_ctx *lctx)
487 {
488 	struct wrqe *wr;
489 	struct cpl_close_listsvr_req *req;
490 
491 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
492 	if (wr == NULL) {
493 		/* XXX */
494 		panic("%s: allocation failure.", __func__);
495 	}
496 	req = wrtod(wr);
497 
498 	INIT_TP_WR(req, 0);
499 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
500 	    lctx->stid));
501 	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
502 	req->rsvd = htobe16(0);
503 
504 	t4_wrq_tx(sc, wr);
505 	return (0);
506 }
507 
508 /*
509  * Start a listening server by sending a passive open request to HW.
510  *
511  * Can't take adapter lock here and access to sc->flags,
512  * sc->offload_map, if_capenable are all race prone.
513  */
514 int
515 t4_listen_start(struct toedev *tod, struct tcpcb *tp)
516 {
517 	struct adapter *sc = tod->tod_softc;
518 	struct vi_info *vi;
519 	struct port_info *pi;
520 	struct inpcb *inp = tptoinpcb(tp);
521 	struct listen_ctx *lctx;
522 	int i, rc, v;
523 	struct offload_settings settings;
524 
525 	INP_WLOCK_ASSERT(inp);
526 
527 	rw_rlock(&sc->policy_lock);
528 	settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL,
529 	    EVL_MAKETAG(0xfff, 0, 0), inp);
530 	rw_runlock(&sc->policy_lock);
531 	if (!settings.offload)
532 		return (0);
533 
534 	/* Don't start a hardware listener for any loopback address. */
535 	if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
536 		return (0);
537 	if (!(inp->inp_vflag & INP_IPV6) &&
538 	    IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
539 		return (0);
540 	if (sc->flags & KERN_TLS_ON)
541 		return (0);
542 #if 0
543 	ADAPTER_LOCK(sc);
544 	if (IS_BUSY(sc)) {
545 		log(LOG_ERR, "%s: listen request ignored, %s is busy",
546 		    __func__, device_get_nameunit(sc->dev));
547 		goto done;
548 	}
549 
550 	KASSERT(uld_active(sc, ULD_TOM),
551 	    ("%s: TOM not initialized", __func__));
552 #endif
553 
554 	/*
555 	 * Find an initialized VI with IFCAP_TOE (4 or 6).  We'll use the first
556 	 * such VI's queues to send the passive open and receive the reply to
557 	 * it.
558 	 *
559 	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
560 	 * then reject any attempt to bring down such a port (and maybe reject
561 	 * attempts to disable IFCAP_TOE on that port too?).
562 	 */
563 	for_each_port(sc, i) {
564 		pi = sc->port[i];
565 		for_each_vi(pi, v, vi) {
566 			if (vi->flags & VI_INIT_DONE &&
567 			    if_getcapenable(vi->ifp) & IFCAP_TOE)
568 				goto found;
569 		}
570 	}
571 	goto done;	/* no port that's UP with IFCAP_TOE enabled */
572 found:
573 
574 	if (listen_hash_find(sc, inp) != NULL)
575 		goto done;	/* already setup */
576 
577 	lctx = alloc_lctx(sc, inp, vi);
578 	if (lctx == NULL) {
579 		log(LOG_ERR,
580 		    "%s: listen request ignored, %s couldn't allocate lctx\n",
581 		    __func__, device_get_nameunit(sc->dev));
582 		goto done;
583 	}
584 	listen_hash_add(sc, lctx);
585 
586 	CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
587 	    __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
588 	    inp->inp_vflag);
589 
590 	if (inp->inp_vflag & INP_IPV6)
591 		rc = create_server6(sc, lctx);
592 	else
593 		rc = create_server(sc, lctx);
594 	if (rc != 0) {
595 		log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
596 		    __func__, device_get_nameunit(sc->dev), rc);
597 		(void) listen_hash_del(sc, inp);
598 		inp = release_lctx(sc, lctx);
599 		/* can't be freed, host stack has a reference */
600 		KASSERT(inp != NULL, ("%s: inp freed", __func__));
601 		goto done;
602 	}
603 	lctx->flags |= LCTX_RPL_PENDING;
604 done:
605 #if 0
606 	ADAPTER_UNLOCK(sc);
607 #endif
608 	return (0);
609 }
610 
611 int
612 t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
613 {
614 	struct listen_ctx *lctx;
615 	struct adapter *sc = tod->tod_softc;
616 	struct inpcb *inp = tptoinpcb(tp);
617 
618 	INP_WLOCK_ASSERT(inp);
619 
620 	lctx = listen_hash_del(sc, inp);
621 	if (lctx == NULL)
622 		return (ENOENT);	/* no hardware listener for this inp */
623 
624 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
625 	    lctx, lctx->flags);
626 
627 	/*
628 	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
629 	 * arrive and clean up when it does.
630 	 */
631 	if (lctx->flags & LCTX_RPL_PENDING) {
632 		return (EINPROGRESS);
633 	}
634 
635 	destroy_server(sc, lctx);
636 	return (0);
637 }
638 
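/*
 * Allocate a synq entry for an embryonic connection.  The entry starts with
 * one reference and holds a reference on its lctx.
 */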
639 static inline struct synq_entry *
640 alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags)
641 {
642 	struct synq_entry *synqe;
643 
644 	INP_RLOCK_ASSERT(lctx->inp);
645 	MPASS(flags == M_WAITOK || flags == M_NOWAIT);
646 
647 	synqe = malloc(sizeof(*synqe), M_CXGBE, flags);
648 	if (__predict_true(synqe != NULL)) {
649 		synqe->flags = TPF_SYNQE;
650 		refcount_init(&synqe->refcnt, 1);
651 		synqe->lctx = lctx;
652 		hold_lctx(lctx);	/* Every synqe has a ref on its lctx. */
653 		synqe->syn = NULL;
654 	}
655 
656 	return (synqe);
657 }
658 
659 static inline void
660 hold_synqe(struct synq_entry *synqe)
661 {
662 
663 	refcount_acquire(&synqe->refcnt);
664 }
665 
666 static inline struct inpcb *
667 release_synqe(struct adapter *sc, struct synq_entry *synqe)
668 {
669 	struct inpcb *inp;
670 
671 	MPASS(synqe->flags & TPF_SYNQE);
672 	MPASS(synqe->lctx != NULL);
673 
674 	inp = synqe->lctx->inp;
675 	MPASS(inp != NULL);
676 	INP_WLOCK_ASSERT(inp);
677 
678 	if (refcount_release(&synqe->refcnt)) {
679 		inp = release_lctx(sc, synqe->lctx);
680 		m_freem(synqe->syn);
681 		free(synqe, M_CXGBE);
682 	}
683 
684 	return (inp);
685 }
686 
687 void
688 t4_syncache_added(struct toedev *tod __unused, void *arg)
689 {
690 	struct synq_entry *synqe = arg;
691 
692 	hold_synqe(synqe);
693 }
694 
695 void
696 t4_syncache_removed(struct toedev *tod, void *arg)
697 {
698 	struct adapter *sc = tod->tod_softc;
699 	struct synq_entry *synqe = arg;
700 	struct inpcb *inp = synqe->lctx->inp;
701 
702 	/*
703 	 * XXX: this is a LOR but harmless when running from the softclock.
704 	 */
705 	INP_WLOCK(inp);
706 	inp = release_synqe(sc, synqe);
707 	if (inp != NULL)
708 		INP_WUNLOCK(inp);
709 }
710 
711 int
712 t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
713 {
714 	struct synq_entry *synqe = arg;
715 
716 	if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) {
717 		struct tcpopt to;
718 		struct ip *ip = mtod(m, struct ip *);
719 		struct tcphdr *th;
720 
721 		if (ip->ip_v == IPVERSION)
722 			th = (void *)(ip + 1);
723 		else
724 			th = (void *)((struct ip6_hdr *)ip + 1);
725 		bzero(&to, sizeof(to));
726 		tcp_dooptions(&to, (void *)(th + 1),
727 		    (th->th_off << 2) - sizeof(*th), TO_SYN);
728 
729 		/* save these for later */
730 		synqe->iss = be32toh(th->th_seq);
731 		synqe->irs = be32toh(th->th_ack) - 1;
732 		synqe->ts = to.to_tsval;
733 	}
734 
735 	m_freem(m);	/* don't need this any more */
736 	return (0);
737 }
738 
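/*
 * Handler for CPL_PASS_OPEN_RPL, the chip's reply to the passive open request
 * sent by create_server/create_server6.
 */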
739 static int
740 do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
741     struct mbuf *m)
742 {
743 	struct adapter *sc = iq->adapter;
744 	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
745 	int stid = GET_TID(cpl);
746 	unsigned int status = cpl->status;
747 	struct listen_ctx *lctx = lookup_stid(sc, stid);
748 	struct inpcb *inp = lctx->inp;
749 #ifdef INVARIANTS
750 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
751 #endif
752 
753 	KASSERT(opcode == CPL_PASS_OPEN_RPL,
754 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
755 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
756 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
757 
758 	INP_WLOCK(inp);
759 
760 	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
761 	    __func__, stid, status, lctx->flags);
762 
763 	lctx->flags &= ~LCTX_RPL_PENDING;
764 
765 	if (status != CPL_ERR_NONE)
766 		log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);
767 
768 #ifdef INVARIANTS
769 	/*
770 	 * If the inp has been dropped (listening socket closed) then
771 	 * listen_stop must have run and taken the inp out of the hash.
772 	 */
773 	if (inp->inp_flags & INP_DROPPED) {
774 		KASSERT(listen_hash_del(sc, inp) == NULL,
775 		    ("%s: inp %p still in listen hash", __func__, inp));
776 	}
777 #endif
778 
779 	if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
780 		if (release_lctx(sc, lctx) != NULL)
781 			INP_WUNLOCK(inp);
782 		return (status);
783 	}
784 
785 	/*
786 	 * Listening socket stopped listening earlier and now the chip tells us
787 	 * it has started the hardware listener.  Stop it; the lctx will be
788 	 * released in do_close_server_rpl.
789 	 */
790 	if (inp->inp_flags & INP_DROPPED) {
791 		destroy_server(sc, lctx);
792 		INP_WUNLOCK(inp);
793 		return (status);
794 	}
795 
796 	/*
797 	 * Failed to start hardware listener.  Take inp out of the hash and
798 	 * release our reference on it.  An error message has been logged
799 	 * already.
800 	 */
801 	if (status != CPL_ERR_NONE) {
802 		listen_hash_del(sc, inp);
803 		if (release_lctx(sc, lctx) != NULL)
804 			INP_WUNLOCK(inp);
805 		return (status);
806 	}
807 
808 	/* hardware listener open for business */
809 
810 	INP_WUNLOCK(inp);
811 	return (status);
812 }
813 
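/*
 * Handler for CPL_CLOSE_LISTSRV_RPL, the chip's reply to the close request
 * sent by destroy_server.  Releases the lctx on success.
 */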
814 static int
815 do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
816     struct mbuf *m)
817 {
818 	struct adapter *sc = iq->adapter;
819 	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
820 	int stid = GET_TID(cpl);
821 	unsigned int status = cpl->status;
822 	struct listen_ctx *lctx = lookup_stid(sc, stid);
823 	struct inpcb *inp = lctx->inp;
824 #ifdef INVARIANTS
825 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
826 #endif
827 
828 	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
829 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
830 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
831 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
832 
833 	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
834 
835 	if (status != CPL_ERR_NONE) {
836 		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
837 		    __func__, status, stid);
838 		return (status);
839 	}
840 
841 	INP_WLOCK(inp);
842 	inp = release_lctx(sc, lctx);
843 	if (inp != NULL)
844 		INP_WUNLOCK(inp);
845 
846 	return (status);
847 }
848 
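/*
 * Tear down an embryonic connection: release its tid, its L2T entry, and the
 * synqe itself.  Called with the listening socket's inp locked; the lock is
 * released before returning.
 */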
849 static void
850 done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
851 {
852 	struct listen_ctx *lctx = synqe->lctx;
853 	struct inpcb *inp = lctx->inp;
854 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
855 	int ntids;
856 
857 	INP_WLOCK_ASSERT(inp);
858 	ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;
859 
860 	remove_tid(sc, synqe->tid, ntids);
861 	release_tid(sc, synqe->tid, lctx->ctrlq);
862 	t4_l2t_release(e);
863 	inp = release_synqe(sc, synqe);
864 	if (inp)
865 		INP_WUNLOCK(inp);
866 }
867 
868 void
869 synack_failure_cleanup(struct adapter *sc, int tid)
870 {
871 	struct synq_entry *synqe = lookup_tid(sc, tid);
872 
873 	INP_WLOCK(synqe->lctx->inp);
874 	done_with_synqe(sc, synqe);
875 }
876 
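/*
 * Handler for a CPL_ABORT_REQ_RSS on an embryonic connection.  Tears the
 * connection down unless an abort was already in progress, and always owes
 * the chip a CPL_ABORT_RPL.
 */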
877 int
878 do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
879     struct mbuf *m)
880 {
881 	struct adapter *sc = iq->adapter;
882 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
883 	unsigned int tid = GET_TID(cpl);
884 	struct synq_entry *synqe = lookup_tid(sc, tid);
885 	struct listen_ctx *lctx = synqe->lctx;
886 	struct inpcb *inp = lctx->inp;
887 	struct sge_ofld_txq *ofld_txq;
888 #ifdef INVARIANTS
889 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
890 #endif
891 
892 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
893 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
894 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
895 	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
896 
897 	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
898 	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
899 
900 	if (negative_advice(cpl->status))
901 		return (0);	/* Ignore negative advice */
902 
903 	INP_WLOCK(inp);
904 
905 	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
906 
907 	if (!(synqe->flags & TPF_FLOWC_WR_SENT))
908 		send_flowc_wr_synqe(sc, synqe);
909 
910 	/*
911 	 * If we'd initiated an abort earlier the reply to it is responsible for
912 	 * cleaning up resources.  Otherwise we tear everything down right here
913 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
914 	 */
915 	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
916 		INP_WUNLOCK(inp);
917 		goto done;
918 	}
919 
920 	done_with_synqe(sc, synqe);
921 	/* inp lock released by done_with_synqe */
922 done:
923 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
924 	return (0);
925 }
926 
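/*
 * Handler for the CPL_ABORT_RPL_RSS that acknowledges an abort we initiated
 * on an embryonic connection; completes the teardown.
 */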
927 int
928 do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
929     struct mbuf *m)
930 {
931 	struct adapter *sc = iq->adapter;
932 	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
933 	unsigned int tid = GET_TID(cpl);
934 	struct synq_entry *synqe = lookup_tid(sc, tid);
935 	struct listen_ctx *lctx = synqe->lctx;
936 	struct inpcb *inp = lctx->inp;
937 #ifdef INVARIANTS
938 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
939 #endif
940 
941 	KASSERT(opcode == CPL_ABORT_RPL_RSS,
942 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
943 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
944 	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
945 
946 	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
947 	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
948 
949 	INP_WLOCK(inp);
950 	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
951 	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
952 	    __func__, synqe, synqe->flags));
953 
954 	done_with_synqe(sc, synqe);
955 	/* inp lock released by done_with_synqe */
956 
957 	return (0);
958 }
959 
960 void
961 t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
962 {
963 	struct adapter *sc = tod->tod_softc;
964 	struct synq_entry *synqe = arg;
965 	struct inpcb *inp = sotoinpcb(so);
966 	struct toepcb *toep = synqe->toep;
967 
968 	NET_EPOCH_ASSERT();	/* prevents bad race with accept() */
969 	INP_WLOCK_ASSERT(inp);
970 	KASSERT(synqe->flags & TPF_SYNQE,
971 	    ("%s: %p not a synq_entry?", __func__, arg));
972 	MPASS(toep->tid == synqe->tid);
973 
974 	offload_socket(so, toep);
975 	make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt);
976 	toep->flags |= TPF_CPL_PENDING;
977 	update_tid(sc, synqe->tid, toep);
978 	synqe->flags |= TPF_SYNQE_EXPANDED;
979 	inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ?
980 	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4;
981 	inp->inp_flowid = synqe->rss_hash;
982 }
983 
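/* Convert the TCP options reported by the hardware into a struct tcpopt. */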
984 static void
985 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
986 {
987 	bzero(to, sizeof(*to));
988 
989 	if (t4opt->mss) {
990 		to->to_flags |= TOF_MSS;
991 		to->to_mss = be16toh(t4opt->mss);
992 	}
993 
994 	if (t4opt->wsf > 0 && t4opt->wsf < 15) {
995 		to->to_flags |= TOF_SCALE;
996 		to->to_wscale = t4opt->wsf;
997 	}
998 
999 	if (t4opt->tstamp)
1000 		to->to_flags |= TOF_TS;
1001 
1002 	if (t4opt->sack)
1003 		to->to_flags |= TOF_SACKPERM;
1004 }
1005 
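/*
 * Does the hardware-reported Ethernet header length indicate that this SYN
 * arrived encapsulated (e.g. inside a VXLAN/NVGRE tunnel)?
 */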
1006 static bool
1007 encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl)
1008 {
1009 	u_int hlen = be32toh(cpl->hdr_len);
1010 
1011 	if (chip_id(sc) >= CHELSIO_T6)
1012 		return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header));
1013 	else
1014 		return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header));
1015 }
1016 
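/*
 * Parse the Ethernet/IP/TCP headers that follow a CPL_PASS_ACCEPT_REQ and
 * fill in the connection info, TCP header, and IP TOS for the caller.
 */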
1017 static void
1018 pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
1019     struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos)
1020 {
1021 	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1022 	const struct ether_header *eh;
1023 	unsigned int hlen = be32toh(cpl->hdr_len);
1024 	uintptr_t l3hdr;
1025 	const struct tcphdr *tcp;
1026 
1027 	eh = (const void *)(cpl + 1);
1028 	if (chip_id(sc) >= CHELSIO_T6) {
1029 		l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
1030 		tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
1031 	} else {
1032 		l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
1033 		tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
1034 	}
1035 
1036 	/* extract TOS (DiffServ + ECN) byte for AccECN */
1037 	if (iptos) {
1038 		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
1039 			const struct ip *ip = (const void *)l3hdr;
1040 			*iptos = ip->ip_tos;
1041 		}
1042 #ifdef INET6
1043 		else
1044 		if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) {
1045 			const struct ip6_hdr *ip6 = (const void *)l3hdr;
1046 			*iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
1047 		}
1048 #endif /* INET6 */
1049 	}
1050 
1051 	if (inc) {
1052 		bzero(inc, sizeof(*inc));
1053 		inc->inc_fport = tcp->th_sport;
1054 		inc->inc_lport = tcp->th_dport;
1055 		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
1056 			const struct ip *ip = (const void *)l3hdr;
1057 
1058 			inc->inc_faddr = ip->ip_src;
1059 			inc->inc_laddr = ip->ip_dst;
1060 		} else {
1061 			const struct ip6_hdr *ip6 = (const void *)l3hdr;
1062 
1063 			inc->inc_flags |= INC_ISIPV6;
1064 			inc->inc6_faddr = ip6->ip6_src;
1065 			inc->inc6_laddr = ip6->ip6_dst;
1066 		}
1067 	}
1068 
1069 	if (th) {
1070 		bcopy(tcp, th, sizeof(*th));
1071 		tcp_fields_to_host(th);		/* just like tcp_input */
1072 	}
1073 }
1074 
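/*
 * Look up the L2 entry for the next hop to the peer: the peer itself for a
 * directly reachable (or link-local) address, or the gateway returned by a
 * route lookup on the given ifnet.
 */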
1075 static struct l2t_entry *
1076 get_l2te_for_nexthop(struct port_info *pi, if_t ifp,
1077     struct in_conninfo *inc)
1078 {
1079 	struct l2t_entry *e;
1080 	struct sockaddr_in6 sin6;
1081 	struct sockaddr *dst = (void *)&sin6;
1082 	struct nhop_object *nh;
1083 
1084 	if (inc->inc_flags & INC_ISIPV6) {
1085 		bzero(dst, sizeof(struct sockaddr_in6));
1086 		dst->sa_len = sizeof(struct sockaddr_in6);
1087 		dst->sa_family = AF_INET6;
1088 
1089 		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
1090 			/* no need for route lookup */
1091 			e = t4_l2t_get(pi, ifp, dst);
1092 			return (e);
1093 		}
1094 
1095 		nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0);
1096 		if (nh == NULL)
1097 			return (NULL);
1098 		if (nh->nh_ifp != ifp)
1099 			return (NULL);
1100 		if (nh->nh_flags & NHF_GATEWAY)
1101 			((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr;
1102 		else
1103 			((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr;
1104 	} else {
1105 		dst->sa_len = sizeof(struct sockaddr_in);
1106 		dst->sa_family = AF_INET;
1107 
1108 		nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0);
1109 		if (nh == NULL)
1110 			return (NULL);
1111 		if (nh->nh_ifp != ifp)
1112 			return (NULL);
1113 		if (nh->nh_flags & NHF_GATEWAY)
1114 			if (nh->gw_sa.sa_family == AF_INET)
1115 				((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr;
1116 			else
1117 				*((struct sockaddr_in6 *)dst) = nh->gw6_sa;
1118 		else
1119 			((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr;
1120 	}
1121 
1122 	e = t4_l2t_get(pi, ifp, dst);
1123 	return (e);
1124 }
1125 
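/*
 * Send the CPL_PASS_ACCEPT_RPL (with opt0/opt2) that accepts the embryonic
 * connection in hardware and results in the SYN|ACK to the peer.
 */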
1126 static int
1127 send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0,
1128     uint32_t opt2, int tid)
1129 {
1130 	struct wrqe *wr;
1131 	struct cpl_pass_accept_rpl *rpl;
1132 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
1133 
1134 	wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
1135 	    sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]);
1136 	if (wr == NULL)
1137 		return (ENOMEM);
1138 	rpl = wrtod(wr);
1139 
1140 	if (is_t4(sc))
1141 		INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
1142 	else {
1143 		struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;
1144 
1145 		INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
1146 		rpl5->iss = htobe32(synqe->iss);
1147 	}
1148 	rpl->opt0 = opt0;
1149 	rpl->opt2 = opt2;
1150 
1151 	return (t4_l2t_send(sc, wr, e));
1152 }
1153 
1154 #define REJECT_PASS_ACCEPT_REQ(tunnel)	do { \
1155 	if (!tunnel) { \
1156 		m_freem(m); \
1157 		m = NULL; \
1158 	} \
1159 	reject_reason = __LINE__; \
1160 	goto reject; \
1161 } while (0)
1162 
1163 /*
1164  * The context associated with a tid entry via insert_tid could be a synq_entry
1165  * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
1166  */
1167 CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
1168 
1169 /*
1170  * Incoming SYN on a listening socket.
1171  *
1172  * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
1173  * etc.
1174  */
1175 static int
1176 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
1177     struct mbuf *m)
1178 {
1179 	struct adapter *sc = iq->adapter;
1180 	struct toedev *tod;
1181 	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1182 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1183 	unsigned int tid = GET_TID(cpl);
1184 	struct listen_ctx *lctx = lookup_stid(sc, stid);
1185 	struct inpcb *inp;
1186 	struct socket *so;
1187 	struct in_conninfo inc;
1188 	struct tcphdr th;
1189 	struct tcpopt to;
1190 	struct port_info *pi;
1191 	struct vi_info *vi;
1192 	if_t hw_ifp, ifp;
1193 	struct l2t_entry *e = NULL;
1194 	struct synq_entry *synqe = NULL;
1195 	int reject_reason, v, ntids;
1196 	uint16_t vid, l2info;
1197 	struct epoch_tracker et;
1198 #ifdef INVARIANTS
1199 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1200 #endif
1201 	struct offload_settings settings;
1202 	uint8_t iptos;
1203 
1204 	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
1205 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1206 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1207 
1208 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
1209 	    lctx);
1210 
1211 	/*
1212 	 * Figure out the port the SYN arrived on.  We'll look for an exact VI
1213 	 * match in a bit but in case we don't find any we'll use the main VI as
1214 	 * the incoming ifnet.
1215 	 */
1216 	l2info = be16toh(cpl->l2info);
1217 	pi = sc->port[G_SYN_INTF(l2info)];
1218 	hw_ifp = pi->vi[0].ifp;
1219 	m->m_pkthdr.rcvif = hw_ifp;
1220 
1221 	CURVNET_SET(lctx->vnet);	/* before any potential REJECT */
1222 
1223 	/*
1224 	 * If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will
1225 	 * also hit the listener.  We don't want to offload those.
1226 	 */
1227 	if (encapsulated_syn(sc, cpl)) {
1228 		REJECT_PASS_ACCEPT_REQ(true);
1229 	}
1230 
1231 	/*
1232 	 * Use the MAC index to lookup the associated VI.  If this SYN didn't
1233 	 * match a perfect MAC filter, punt.
1234 	 */
1235 	if (!(l2info & F_SYN_XACT_MATCH)) {
1236 		REJECT_PASS_ACCEPT_REQ(true);
1237 	}
1238 	for_each_vi(pi, v, vi) {
1239 		if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info))
1240 			goto found;
1241 	}
1242 	REJECT_PASS_ACCEPT_REQ(true);
1243 found:
1244 	hw_ifp = vi->ifp;	/* the cxgbe ifnet */
1245 	m->m_pkthdr.rcvif = hw_ifp;
1246 	tod = TOEDEV(hw_ifp);
1247 
1248 	/*
1249 	 * Don't offload if the peer requested a TCP option that's not known to
1250 	 * the silicon.  Send the SYN to the kernel instead.
1251 	 */
1252 	if (__predict_false(cpl->tcpopt.unknown))
1253 		REJECT_PASS_ACCEPT_REQ(true);
1254 
1255 	/*
1256 	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
1257 	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
1258 	 * doesn't match anything on this interface.
1259 	 *
1260 	 * XXX: lagg support, lagg + vlan support.
1261 	 */
1262 	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
1263 	if (vid != 0xfff && vid != 0) {
1264 		ifp = VLAN_DEVAT(hw_ifp, vid);
1265 		if (ifp == NULL)
1266 			REJECT_PASS_ACCEPT_REQ(true);
1267 	} else
1268 		ifp = hw_ifp;
1269 
1270 	/*
1271 	 * Don't offload if the ifnet that the SYN came in on is not in the same
1272 	 * vnet as the listening socket.
1273 	 */
1274 	if (lctx->vnet != if_getvnet(ifp))
1275 		REJECT_PASS_ACCEPT_REQ(true);
1276 
1277 	pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos);
1278 	if (inc.inc_flags & INC_ISIPV6) {
1279 
1280 		/* Don't offload if the ifcap isn't enabled */
1281 		if ((if_getcapenable(ifp) & IFCAP_TOE6) == 0)
1282 			REJECT_PASS_ACCEPT_REQ(true);
1283 
1284 		/*
1285 		 * SYN must be directed to an IP6 address on this ifnet.  This
1286 		 * is more restrictive than in6_localip.
1287 		 */
1288 		NET_EPOCH_ENTER(et);
1289 		if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) {
1290 			NET_EPOCH_EXIT(et);
1291 			REJECT_PASS_ACCEPT_REQ(true);
1292 		}
1293 
1294 		ntids = 2;
1295 	} else {
1296 
1297 		/* Don't offload if the ifcap isn't enabled */
1298 		if ((if_getcapenable(ifp) & IFCAP_TOE4) == 0)
1299 			REJECT_PASS_ACCEPT_REQ(true);
1300 
1301 		/*
1302 		 * SYN must be directed to an IP address on this ifnet.  This
1303 		 * is more restrictive than in_localip.
1304 		 */
1305 		NET_EPOCH_ENTER(et);
1306 		if (!in_ifhasaddr(ifp, inc.inc_laddr)) {
1307 			NET_EPOCH_EXIT(et);
1308 			REJECT_PASS_ACCEPT_REQ(true);
1309 		}
1310 
1311 		ntids = 1;
1312 	}
1313 
1314 	e = get_l2te_for_nexthop(pi, ifp, &inc);
1315 	if (e == NULL) {
1316 		NET_EPOCH_EXIT(et);
1317 		REJECT_PASS_ACCEPT_REQ(true);
1318 	}
1319 
1320 	/* Don't offload if the 4-tuple is already in use */
1321 	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
1322 		NET_EPOCH_EXIT(et);
1323 		REJECT_PASS_ACCEPT_REQ(false);
1324 	}
1325 
1326 	inp = lctx->inp;		/* listening socket, not owned by TOE */
1327 	INP_RLOCK(inp);
1328 
1329 	/* Don't offload if the listening socket has closed */
1330 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1331 		INP_RUNLOCK(inp);
1332 		NET_EPOCH_EXIT(et);
1333 		REJECT_PASS_ACCEPT_REQ(false);
1334 	}
1335 	so = inp->inp_socket;
1336 	rw_rlock(&sc->policy_lock);
1337 	settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m,
1338 	    EVL_MAKETAG(0xfff, 0, 0), inp);
1339 	rw_runlock(&sc->policy_lock);
1340 	if (!settings.offload) {
1341 		INP_RUNLOCK(inp);
1342 		NET_EPOCH_EXIT(et);
1343 		REJECT_PASS_ACCEPT_REQ(true);	/* Rejected by COP. */
1344 	}
1345 
1346 	synqe = alloc_synqe(sc, lctx, M_NOWAIT);
1347 	if (synqe == NULL) {
1348 		INP_RUNLOCK(inp);
1349 		NET_EPOCH_EXIT(et);
1350 		REJECT_PASS_ACCEPT_REQ(true);
1351 	}
1352 	MPASS(rss->hash_type == RSS_HASH_TCP);
1353 	synqe->rss_hash = be32toh(rss->hash_val);
1354 	atomic_store_int(&synqe->ok_to_respond, 0);
1355 
1356 	init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx,
1357 	    &synqe->params);
1358 
1359 	/*
1360 	 * If all goes well t4_syncache_respond will get called during
1361 	 * syncache_add.  Note that syncache_add releases the pcb lock.
1362 	 */
1363 	t4opt_to_tcpopt(&cpl->tcpopt, &to);
1364 	toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos);
1365 
1366 	if (atomic_load_int(&synqe->ok_to_respond) > 0) {
1367 		uint64_t opt0;
1368 		uint32_t opt2;
1369 
1370 		opt0 = calc_options0(vi, &synqe->params);
1371 		opt2 = calc_options2(vi, &synqe->params);
1372 
1373 		insert_tid(sc, tid, synqe, ntids);
1374 		synqe->tid = tid;
1375 		synqe->syn = m;
1376 		m = NULL;
1377 
1378 		if (send_synack(sc, synqe, opt0, opt2, tid) != 0) {
1379 			remove_tid(sc, tid, ntids);
1380 			m = synqe->syn;
1381 			synqe->syn = NULL;
1382 			NET_EPOCH_EXIT(et);
1383 			REJECT_PASS_ACCEPT_REQ(true);
1384 		}
1385 
1386 		CTR6(KTR_CXGBE,
1387 		    "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x",
1388 		    __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2));
1389 	} else {
1390 		NET_EPOCH_EXIT(et);
1391 		REJECT_PASS_ACCEPT_REQ(false);
1392 	}
1393 
1394 	NET_EPOCH_EXIT(et);
1395 	CURVNET_RESTORE();
1396 	return (0);
1397 reject:
1398 	CURVNET_RESTORE();
1399 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
1400 	    reject_reason);
1401 
1402 	if (e)
1403 		t4_l2t_release(e);
1404 	release_tid(sc, tid, lctx->ctrlq);
1405 	if (synqe) {
1406 		inp = synqe->lctx->inp;
1407 		INP_WLOCK(inp);
1408 		inp = release_synqe(sc, synqe);
1409 		if (inp)
1410 			INP_WUNLOCK(inp);
1411 	}
1412 
1413 	if (m) {
1414 		/*
1415 		 * The connection request hit a TOE listener but is being passed
1416 		 * on to the kernel sw stack instead of getting offloaded.
1417 		 */
1418 		m_adj(m, sizeof(*cpl));
1419 		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
1420 		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1421 		m->m_pkthdr.csum_data = 0xffff;
1422 		if_input(hw_ifp, m);
1423 	}
1424 
1425 	return (reject_reason);
1426 }
1427 
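/*
 * Reconstruct protocol headers for syncache_expand: start from the saved SYN
 * and rewrite the relevant fields so it looks like the ACK of our SYN|ACK.
 */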
1428 static void
1429 synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
1430     const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
1431     struct tcphdr *th, struct tcpopt *to)
1432 {
1433 	uint16_t tcp_opt = be16toh(cpl->tcp_opt);
1434 	uint8_t iptos;
1435 
1436 	/* start off with the original SYN */
1437 	pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos);
1438 
1439 	/* modify parts to make it look like the ACK to our SYN|ACK */
1440 	th->th_flags = TH_ACK;
1441 	th->th_ack = synqe->iss + 1;
1442 	th->th_seq = be32toh(cpl->rcv_isn);
1443 	bzero(to, sizeof(*to));
1444 	if (G_TCPOPT_TSTAMP(tcp_opt)) {
1445 		to->to_flags |= TOF_TS;
1446 		to->to_tsecr = synqe->ts;
1447 	}
1448 }
1449 
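/*
 * Handler for CPL_PASS_ESTABLISH: the chip has completed the handshake for an
 * embryonic connection.  Allocate a toepcb, expand the syncache entry, and
 * hand the established connection to the host stack.
 */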
1450 static int
1451 do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
1452     struct mbuf *m)
1453 {
1454 	struct adapter *sc = iq->adapter;
1455 	struct vi_info *vi;
1456 	if_t ifp;
1457 	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
1458 #if defined(KTR) || defined(INVARIANTS)
1459 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1460 #endif
1461 	unsigned int tid = GET_TID(cpl);
1462 	struct synq_entry *synqe = lookup_tid(sc, tid);
1463 	struct listen_ctx *lctx = synqe->lctx;
1464 	struct inpcb *inp = lctx->inp, *new_inp;
1465 	struct socket *so;
1466 	struct tcphdr th;
1467 	struct tcpopt to;
1468 	struct in_conninfo inc;
1469 	struct toepcb *toep;
1470 	struct epoch_tracker et;
1471 	int rstreason;
1472 #ifdef INVARIANTS
1473 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1474 #endif
1475 
1476 	KASSERT(opcode == CPL_PASS_ESTABLISH,
1477 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1478 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1479 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1480 	KASSERT(synqe->flags & TPF_SYNQE,
1481 	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
1482 
1483 	CURVNET_SET(lctx->vnet);
1484 	NET_EPOCH_ENTER(et);	/* for syncache_expand */
1485 	INP_WLOCK(inp);
1486 
1487 	CTR6(KTR_CXGBE,
1488 	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
1489 	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);
1490 
1491 	ifp = synqe->syn->m_pkthdr.rcvif;
1492 	vi = if_getsoftc(ifp);
1493 	KASSERT(vi->adapter == sc,
1494 	    ("%s: vi %p, sc %p mismatch", __func__, vi, sc));
1495 
1496 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1497 reset:
1498 		send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_SEND_RST);
1499 		INP_WUNLOCK(inp);
1500 		NET_EPOCH_EXIT(et);
1501 		CURVNET_RESTORE();
1502 		return (0);
1503 	}
1504 
1505 	KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
1506 	    ("%s: CPL arrived on unexpected rxq.  %d %d", __func__,
1507 	    synqe->params.rxq_idx,
1508 	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
1509 
1510 	toep = alloc_toepcb(vi, M_NOWAIT);
1511 	if (toep == NULL)
1512 		goto reset;
1513 	toep->tid = tid;
1514 	toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx];
1515 	toep->vnet = lctx->vnet;
1516 	bcopy(&synqe->params, &toep->params, sizeof(toep->params));
1517 	init_toepcb(vi, toep);
1518 
1519 	MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
1520 	MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);
1521 	synqe->tcp_opt = cpl->tcp_opt;
1522 	synqe->toep = toep;
1523 
1524 	/* Come up with something that syncache_expand should be ok with. */
1525 	synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);
1526 	if (inc.inc_flags & INC_ISIPV6) {
1527 		if (lctx->ce == NULL) {
1528 			toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true);
1529 			if (toep->ce == NULL) {
1530 				free_toepcb(toep);
1531 				goto reset;	/* RST without a CLIP entry? */
1532 			}
1533 		} else {
1534 			t4_hold_clip_entry(sc, lctx->ce);
1535 			toep->ce = lctx->ce;
1536 		}
1537 	}
1538 	so = inp->inp_socket;
1539 	KASSERT(so != NULL, ("%s: socket is NULL", __func__));
1540 
1541 	rstreason = toe_syncache_expand(&inc, &to, &th, &so);
1542 	if (rstreason < 0) {
1543 		free_toepcb(toep);
1544 		send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST);
1545 		INP_WUNLOCK(inp);
1546 		NET_EPOCH_EXIT(et);
1547 		CURVNET_RESTORE();
1548 		return (0);
1549 	} else if (rstreason == 0 || so == NULL) {
1550 		free_toepcb(toep);
1551 		goto reset;
1552 	}
1553 
1554 	/* New connection inpcb is already locked by syncache_expand(). */
1555 	new_inp = sotoinpcb(so);
1556 	INP_WLOCK_ASSERT(new_inp);
1557 	MPASS(so->so_vnet == lctx->vnet);
1558 
1559 	/*
1560 	 * This is for expansion from syncookies.
1561 	 *
1562 	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
1563 	 * anyone accept'ing a connection before we've installed our hooks, but
1564 	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
1565 	 */
1566 	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
1567 		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
1568 		t4_offload_socket(TOEDEV(ifp), synqe, so);
1569 	}
1570 
1571 	INP_WUNLOCK(new_inp);
1572 
1573 	/* Done with the synqe */
1574 	inp = release_synqe(sc, synqe);
1575 	if (inp != NULL)
1576 		INP_WUNLOCK(inp);
1577 	NET_EPOCH_EXIT(et);
1578 	CURVNET_RESTORE();
1579 
1580 	return (0);
1581 }
1582 
1583 void
1584 t4_init_listen_cpl_handlers(void)
1585 {
1586 
1587 	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
1588 	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
1589 	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
1590 	t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
1591 }
1592 
1593 void
1594 t4_uninit_listen_cpl_handlers(void)
1595 {
1596 
1597 	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
1598 	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
1599 	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
1600 	t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);
1601 }
1602 #endif
1603