1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2012 Chelsio Communications, Inc.
5 * All rights reserved.
6 * Written by: Navdeep Parhar <np@FreeBSD.org>
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include <sys/cdefs.h>
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33
34 #ifdef TCP_OFFLOAD
35 #include <sys/param.h>
36 #include <sys/types.h>
37 #include <sys/kernel.h>
38 #include <sys/ktr.h>
39 #include <sys/module.h>
40 #include <sys/protosw.h>
41 #include <sys/refcount.h>
42 #include <sys/domain.h>
43 #include <sys/fnv_hash.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <net/ethernet.h>
48 #include <net/if.h>
49 #include <net/if_types.h>
50 #include <net/if_vlan_var.h>
51 #include <net/route.h>
52 #include <net/route/nhop.h>
53 #include <netinet/in.h>
54 #include <netinet/in_fib.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/ip.h>
57 #include <netinet/ip6.h>
58 #include <netinet6/in6_fib.h>
59 #include <netinet6/scope6_var.h>
60 #include <netinet/tcp_timer.h>
61 #define TCPSTATES
62 #include <netinet/tcp_fsm.h>
63 #include <netinet/tcp_var.h>
64 #include <netinet/toecore.h>
65 #include <netinet/cc/cc.h>
66
67 #include "common/common.h"
68 #include "common/t4_msg.h"
69 #include "common/t4_regs.h"
70 #include "t4_clip.h"
71 #include "tom/t4_tom_l2t.h"
72 #include "tom/t4_tom.h"
73
74 /* stid services */
75 static int alloc_stid(struct adapter *, struct listen_ctx *, int);
76 static struct listen_ctx *lookup_stid(struct adapter *, int);
77 static void free_stid(struct adapter *, struct listen_ctx *);
78
79 /* lctx services */
80 static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
81 struct vi_info *);
82 static int free_lctx(struct adapter *, struct listen_ctx *);
83 static void hold_lctx(struct listen_ctx *);
84 static void listen_hash_add(struct adapter *, struct listen_ctx *);
85 static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
86 static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
87 static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
88
89 static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int);
90
/*
 * Reserve a contiguous, naturally aligned range of server tids (1 stid for an
 * IPv4 listener, 2 for IPv6) and record the reservation in lctx->stid_region,
 * which is linked into the adapter-wide list of in-use regions.  Returns the
 * absolute stid (region offset + stid_base) on success or -1 if no suitable
 * range is available.
 */
static int
alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
{
	struct tid_info *t = &sc->tids;
	u_int stid, n, f, mask;
	struct stid_region *sr = &lctx->stid_region;

	/*
	 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
	 * the TCAM. The start of the stid region is properly aligned (the chip
	 * requires each region to be 128-cell aligned).
	 */
	n = isipv6 ? 2 : 1;
	mask = n - 1;
	KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
	    ("%s: stid region (%u, %u) not properly aligned. n = %u",
	    __func__, t->stid_base, t->nstids, n));

	mtx_lock(&t->stid_lock);
	if (n > t->nstids - t->stids_in_use) {
		mtx_unlock(&t->stid_lock);
		return (-1);
	}

	if (t->nstids_free_head >= n) {
		/*
		 * This allocation will definitely succeed because the region
		 * starts at a good alignment and we just checked we have enough
		 * stids free.
		 */
		f = t->nstids_free_head & mask;	/* stids lost to alignment */
		t->nstids_free_head -= n + f;
		stid = t->nstids_free_head;
		TAILQ_INSERT_HEAD(&t->stids, sr, link);
	} else {
		struct stid_region *s;

		/*
		 * Walk the in-use regions; the free gap after each one is a
		 * candidate.  Allocate from the tail end of a gap so that the
		 * alignment padding (f) stays accounted to this region.
		 */
		stid = t->nstids_free_head;
		TAILQ_FOREACH(s, &t->stids, link) {
			stid += s->used + s->free;
			f = stid & mask;
			if (s->free >= n + f) {
				stid -= n + f;
				s->free -= n + f;
				TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
				goto allocated;
			}
		}

		/* The walk above must account for every stid in the range. */
		if (__predict_false(stid != t->nstids)) {
			panic("%s: stids TAILQ (%p) corrupt."
			    " At %d instead of %d at the end of the queue.",
			    __func__, &t->stids, stid, t->nstids);
		}

		mtx_unlock(&t->stid_lock);
		return (-1);
	}

allocated:
	sr->used = n;
	sr->free = f;
	t->stids_in_use += n;
	t->stid_tab[stid] = lctx;
	mtx_unlock(&t->stid_lock);

	KASSERT(((stid + t->stid_base) & mask) == 0,
	    ("%s: EDOOFUS.", __func__));
	return (stid + t->stid_base);
}
161
162 static struct listen_ctx *
lookup_stid(struct adapter * sc,int stid)163 lookup_stid(struct adapter *sc, int stid)
164 {
165 struct tid_info *t = &sc->tids;
166
167 return (t->stid_tab[stid - t->stid_base]);
168 }
169
/*
 * Return a listener's stid reservation (lctx->stid_region) to the pool.  The
 * freed range is coalesced into the free space of the previous region, or
 * into the free head of the stid range if this was the first region.
 */
static void
free_stid(struct adapter *sc, struct listen_ctx *lctx)
{
	struct tid_info *t = &sc->tids;
	struct stid_region *sr = &lctx->stid_region;
	struct stid_region *s;

	KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));

	mtx_lock(&t->stid_lock);
	s = TAILQ_PREV(sr, stid_head, link);
	if (s != NULL)
		s->free += sr->used + sr->free;	/* fold into previous gap */
	else
		t->nstids_free_head += sr->used + sr->free;
	KASSERT(t->stids_in_use >= sr->used,
	    ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
	    t->stids_in_use, sr->used));
	t->stids_in_use -= sr->used;
	TAILQ_REMOVE(&t->stids, sr, link);
	mtx_unlock(&t->stid_lock);
}
192
193 static struct listen_ctx *
alloc_lctx(struct adapter * sc,struct inpcb * inp,struct vi_info * vi)194 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
195 {
196 struct listen_ctx *lctx;
197
198 INP_WLOCK_ASSERT(inp);
199
200 lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
201 if (lctx == NULL)
202 return (NULL);
203
204 lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
205 if (lctx->stid < 0) {
206 free(lctx, M_CXGBE);
207 return (NULL);
208 }
209
210 if (inp->inp_vflag & INP_IPV6 &&
211 !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
212 lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true);
213 if (lctx->ce == NULL) {
214 free(lctx, M_CXGBE);
215 return (NULL);
216 }
217 }
218
219 lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
220 lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
221 refcount_init(&lctx->refcount, 1);
222
223 lctx->inp = inp;
224 lctx->vnet = inp->inp_socket->so_vnet;
225 in_pcbref(inp);
226
227 return (lctx);
228 }
229
/*
 * Final teardown of an lctx whose refcount has dropped to zero: release the
 * CLIP entry (if any), the stid reservation, the memory, and the inp
 * reference taken in alloc_lctx.  Returns the result of in_pcbrele_wlocked
 * (non-zero if the inp was freed).
 *
 * Don't call this directly, use release_lctx instead.
 */
static int
free_lctx(struct adapter *sc, struct listen_ctx *lctx)
{
	struct inpcb *inp = lctx->inp;

	INP_WLOCK_ASSERT(inp);
	KASSERT(lctx->refcount == 0,
	    ("%s: refcount %d", __func__, lctx->refcount));
	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));

	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
	    __func__, lctx->stid, lctx, lctx->inp);

	if (lctx->ce)
		t4_release_clip_entry(sc, lctx->ce);
	free_stid(sc, lctx);
	free(lctx, M_CXGBE);

	return (in_pcbrele_wlocked(inp));
}
251
/* Take an additional reference on the lctx. */
static void
hold_lctx(struct listen_ctx *lctx)
{

	refcount_acquire(&lctx->refcount);
}
258
259 static inline uint32_t
listen_hashfn(void * key,u_long mask)260 listen_hashfn(void *key, u_long mask)
261 {
262
263 return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
264 }
265
266 /*
267 * Add a listen_ctx entry to the listen hash table.
268 */
269 static void
listen_hash_add(struct adapter * sc,struct listen_ctx * lctx)270 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
271 {
272 struct tom_data *td = sc->tom_softc;
273 int bucket = listen_hashfn(lctx->inp, td->listen_mask);
274
275 mtx_lock(&td->lctx_hash_lock);
276 LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
277 td->lctx_count++;
278 mtx_unlock(&td->lctx_hash_lock);
279 }
280
281 /*
282 * Look for the listening socket's context entry in the hash and return it.
283 */
284 static struct listen_ctx *
listen_hash_find(struct adapter * sc,struct inpcb * inp)285 listen_hash_find(struct adapter *sc, struct inpcb *inp)
286 {
287 struct tom_data *td = sc->tom_softc;
288 int bucket = listen_hashfn(inp, td->listen_mask);
289 struct listen_ctx *lctx;
290
291 mtx_lock(&td->lctx_hash_lock);
292 LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
293 if (lctx->inp == inp)
294 break;
295 }
296 mtx_unlock(&td->lctx_hash_lock);
297
298 return (lctx);
299 }
300
301 /*
302 * Removes the listen_ctx structure for inp from the hash and returns it.
303 */
304 static struct listen_ctx *
listen_hash_del(struct adapter * sc,struct inpcb * inp)305 listen_hash_del(struct adapter *sc, struct inpcb *inp)
306 {
307 struct tom_data *td = sc->tom_softc;
308 int bucket = listen_hashfn(inp, td->listen_mask);
309 struct listen_ctx *lctx, *l;
310
311 mtx_lock(&td->lctx_hash_lock);
312 LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
313 if (lctx->inp == inp) {
314 LIST_REMOVE(lctx, link);
315 td->lctx_count--;
316 break;
317 }
318 }
319 mtx_unlock(&td->lctx_hash_lock);
320
321 return (lctx);
322 }
323
324 /*
325 * Releases a hold on the lctx. Must be called with the listening socket's inp
326 * locked. The inp may be freed by this function and it returns NULL to
327 * indicate this.
328 */
329 static struct inpcb *
release_lctx(struct adapter * sc,struct listen_ctx * lctx)330 release_lctx(struct adapter *sc, struct listen_ctx *lctx)
331 {
332 struct inpcb *inp = lctx->inp;
333 int inp_freed = 0;
334
335 INP_WLOCK_ASSERT(inp);
336 if (refcount_release(&lctx->refcount))
337 inp_freed = free_lctx(sc, lctx);
338
339 return (inp_freed ? NULL : inp);
340 }
341
/*
 * Send a FW_FLOWC_WR for an embryonic (synq) connection so that the firmware
 * knows the basic flow parameters of the tid before any other work request
 * (e.g. an abort) is submitted for it.  Caller must hold the listening inp
 * lock and the flowc must not already have been sent.
 */
static void
send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe)
{
	struct mbuf *m = synqe->syn;
	if_t ifp = m->m_pkthdr.rcvif;
	struct vi_info *vi = if_getsoftc(ifp);
	struct port_info *pi = vi->pi;
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	struct sge_ofld_txq *ofld_txq;
	struct sge_ofld_rxq *ofld_rxq;
	const int nparams = 6;
	const int flowclen = sizeof(*flowc) +
	    nparams * sizeof(struct fw_flowc_mnemval);
	const u_int pfvf = sc->pf << S_FW_VIID_PFN;

	INP_WLOCK_ASSERT(synqe->lctx->inp);
	MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0);

	/* Queues chosen when the synqe's connection parameters were set up. */
	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
	ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx];

	wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);
	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	/* htonl is equivalent to htobe32; kept for parity with the rest. */
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(synqe->tid));
	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
	flowc->mnemval[0].val = htobe32(pfvf);
	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
	flowc->mnemval[1].val = htobe32(pi->tx_chan);
	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
	flowc->mnemval[2].val = htobe32(pi->tx_chan);
	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
	/*
	 * NOTE(review): 512 for SNDBUF/MSS look like minimal placeholder
	 * values for a not-yet-established tid — confirm against firmware
	 * flowc requirements.
	 */
	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
	flowc->mnemval[4].val = htobe32(512);
	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
	flowc->mnemval[5].val = htobe32(512);

	synqe->flags |= TPF_FLOWC_WR_SENT;
	t4_wrq_tx(sc, wr);
}
390
/*
 * Initiate an abort for an embryonic (synq) connection by sending a
 * CPL_ABORT_REQ for its tid.  rst_status selects whether the peer gets an
 * RST.  No-op if an abort is already in progress; sends the mandatory flowc
 * first if it hasn't gone out yet.  Caller must hold the listening inp lock.
 */
static void
send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe,
    int rst_status)
{
	struct adapter *sc = tod->tod_softc;
	struct wrqe *wr;
	struct cpl_abort_req *req;

	INP_WLOCK_ASSERT(synqe->lctx->inp);

	CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
	    __func__, synqe, synqe->flags, synqe->tid,
	    synqe->flags & TPF_ABORT_SHUTDOWN ?
	    " (abort already in progress)" : "");
	if (synqe->flags & TPF_ABORT_SHUTDOWN)
		return;	/* abort already in progress */
	synqe->flags |= TPF_ABORT_SHUTDOWN;

	/* The tid must have a flowc before any other WR is sent for it. */
	if (!(synqe->flags & TPF_FLOWC_WR_SENT))
		send_flowc_wr_synqe(sc, synqe);

	wr = alloc_wrqe(sizeof(*req),
	    &sc->sge.ofld_txq[synqe->params.txq_idx].wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);
	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
	req->rsvd0 = 0;	/* don't have a snd_nxt */
	req->rsvd1 = 1;	/* no data sent yet */
	req->cmd = rst_status;

	/* Routed via the L2 entry so it is ordered with any pending ARP. */
	t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]);
}
426
/*
 * Send a CPL_PASS_OPEN_REQ to start a hardware IPv4 listener for lctx's
 * inpcb.  The firmware's reply is handled by do_pass_open_rpl.  Returns 0 on
 * successful submission or ENOMEM.
 */
static int
create_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct wrqe *wr;
	struct cpl_pass_open_req *req;
	struct inpcb *inp = lctx->inp;

	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
	if (wr == NULL) {
		log(LOG_ERR, "%s: allocation failure", __func__);
		return (ENOMEM);
	}
	req = wrtod(wr);

	INIT_TP_WR(req, 0);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
	/* inp_lport/inp_laddr are already in network byte order. */
	req->local_port = inp->inp_lport;
	req->peer_port = 0;	/* wildcard: match any peer */
	req->local_ip = inp->inp_laddr.s_addr;
	req->peer_ip = 0;
	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
	/* Ask mode: each SYN is delivered to lctx->ofld_rxq for a decision. */
	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));

	t4_wrq_tx(sc, wr);
	return (0);
}
454
/*
 * Send a CPL_PASS_OPEN_REQ6 to start a hardware IPv6 listener for lctx's
 * inpcb.  The firmware's reply is handled by do_pass_open_rpl.  Returns 0 on
 * successful submission or ENOMEM.
 */
static int
create_server6(struct adapter *sc, struct listen_ctx *lctx)
{
	struct wrqe *wr;
	struct cpl_pass_open_req6 *req;
	struct inpcb *inp = lctx->inp;

	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
	if (wr == NULL) {
		log(LOG_ERR, "%s: allocation failure", __func__);
		return (ENOMEM);
	}
	req = wrtod(wr);

	INIT_TP_WR(req, 0);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
	req->local_port = inp->inp_lport;	/* already network byte order */
	req->peer_port = 0;	/* wildcard: match any peer */
	/*
	 * Copy the 128-bit address as two 64-bit loads, preserving its
	 * network byte order.  NOTE(review): relies on in6p_laddr being
	 * 8-byte aligned and on -fno-strict-aliasing kernel builds.
	 */
	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
	req->peer_ip_hi = 0;
	req->peer_ip_lo = 0;
	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
	/* Ask mode: each SYN is delivered to lctx->ofld_rxq for a decision. */
	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));

	t4_wrq_tx(sc, wr);
	return (0);
}
484
/*
 * Send a CPL_CLOSE_LISTSRV_REQ to shut down the hardware listener for
 * lctx->stid.  The reply (handled by do_close_server_rpl) releases the lctx.
 * Always returns 0.
 */
static int
destroy_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct wrqe *wr;
	struct cpl_close_listsvr_req *req;

	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR(req, 0);
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
	    lctx->stid));
	/* Direct the reply to the listener's offload rx queue. */
	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
	req->rsvd = htobe16(0);

	t4_wrq_tx(sc, wr);
	return (0);
}
507
508 /*
509 * Start a listening server by sending a passive open request to HW.
510 *
511 * Can't take adapter lock here and access to sc->flags,
512 * sc->offload_map, if_capenable are all race prone.
513 */
514 int
t4_listen_start(struct toedev * tod,struct tcpcb * tp)515 t4_listen_start(struct toedev *tod, struct tcpcb *tp)
516 {
517 struct adapter *sc = tod->tod_softc;
518 struct vi_info *vi;
519 struct port_info *pi;
520 struct inpcb *inp = tptoinpcb(tp);
521 struct listen_ctx *lctx;
522 int i, rc, v;
523 struct offload_settings settings;
524
525 INP_WLOCK_ASSERT(inp);
526
527 rw_rlock(&sc->policy_lock);
528 settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL,
529 EVL_MAKETAG(0xfff, 0, 0), inp);
530 rw_runlock(&sc->policy_lock);
531 if (!settings.offload)
532 return (0);
533
534 /* Don't start a hardware listener for any loopback address. */
535 if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
536 return (0);
537 if (!(inp->inp_vflag & INP_IPV6) &&
538 IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
539 return (0);
540 if (sc->flags & KERN_TLS_ON)
541 return (0);
542 #if 0
543 ADAPTER_LOCK(sc);
544 if (IS_BUSY(sc)) {
545 log(LOG_ERR, "%s: listen request ignored, %s is busy",
546 __func__, device_get_nameunit(sc->dev));
547 goto done;
548 }
549
550 KASSERT(uld_active(sc, ULD_TOM),
551 ("%s: TOM not initialized", __func__));
552 #endif
553
554 /*
555 * Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first
556 * such VI's queues to send the passive open and receive the reply to
557 * it.
558 *
559 * XXX: need a way to mark a port in use by offload. if_cxgbe should
560 * then reject any attempt to bring down such a port (and maybe reject
561 * attempts to disable IFCAP_TOE on that port too?).
562 */
563 for_each_port(sc, i) {
564 pi = sc->port[i];
565 for_each_vi(pi, v, vi) {
566 if (vi->flags & VI_INIT_DONE &&
567 if_getcapenable(vi->ifp) & IFCAP_TOE)
568 goto found;
569 }
570 }
571 goto done; /* no port that's UP with IFCAP_TOE enabled */
572 found:
573
574 if (listen_hash_find(sc, inp) != NULL)
575 goto done; /* already setup */
576
577 lctx = alloc_lctx(sc, inp, vi);
578 if (lctx == NULL) {
579 log(LOG_ERR,
580 "%s: listen request ignored, %s couldn't allocate lctx\n",
581 __func__, device_get_nameunit(sc->dev));
582 goto done;
583 }
584 listen_hash_add(sc, lctx);
585
586 CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
587 __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
588 inp->inp_vflag);
589
590 if (inp->inp_vflag & INP_IPV6)
591 rc = create_server6(sc, lctx);
592 else
593 rc = create_server(sc, lctx);
594 if (rc != 0) {
595 log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
596 __func__, device_get_nameunit(sc->dev), rc);
597 (void) listen_hash_del(sc, inp);
598 inp = release_lctx(sc, lctx);
599 /* can't be freed, host stack has a reference */
600 KASSERT(inp != NULL, ("%s: inp freed", __func__));
601 goto done;
602 }
603 lctx->flags |= LCTX_RPL_PENDING;
604 done:
605 #if 0
606 ADAPTER_UNLOCK(sc);
607 #endif
608 return (0);
609 }
610
/*
 * Stop the hardware listener (if any) for the socket that is leaving the
 * LISTEN state.  Returns ENOENT if there was no hardware listener,
 * EINPROGRESS if teardown is deferred to the pending PASS_OPEN reply, and 0
 * when the close request has been sent.  Caller must hold the inp lock.
 */
int
t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
{
	struct listen_ctx *lctx;
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tptoinpcb(tp);

	INP_WLOCK_ASSERT(inp);

	lctx = listen_hash_del(sc, inp);
	if (lctx == NULL)
		return (ENOENT);	/* no hardware listener for this inp */

	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
	    lctx, lctx->flags);

	/*
	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
	 * arrive and clean up when it does.
	 */
	if (lctx->flags & LCTX_RPL_PENDING) {
		return (EINPROGRESS);
	}

	destroy_server(sc, lctx);
	return (0);
}
638
639 static inline struct synq_entry *
alloc_synqe(struct adapter * sc __unused,struct listen_ctx * lctx,int flags)640 alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags)
641 {
642 struct synq_entry *synqe;
643
644 INP_RLOCK_ASSERT(lctx->inp);
645 MPASS(flags == M_WAITOK || flags == M_NOWAIT);
646
647 synqe = malloc(sizeof(*synqe), M_CXGBE, flags);
648 if (__predict_true(synqe != NULL)) {
649 synqe->flags = TPF_SYNQE;
650 refcount_init(&synqe->refcnt, 1);
651 synqe->lctx = lctx;
652 hold_lctx(lctx); /* Every synqe has a ref on its lctx. */
653 synqe->syn = NULL;
654 }
655
656 return (synqe);
657 }
658
/* Take an additional reference on the synq entry. */
static inline void
hold_synqe(struct synq_entry *synqe)
{

	refcount_acquire(&synqe->refcnt);
}
665
666 static inline struct inpcb *
release_synqe(struct adapter * sc,struct synq_entry * synqe)667 release_synqe(struct adapter *sc, struct synq_entry *synqe)
668 {
669 struct inpcb *inp;
670
671 MPASS(synqe->flags & TPF_SYNQE);
672 MPASS(synqe->lctx != NULL);
673
674 inp = synqe->lctx->inp;
675 MPASS(inp != NULL);
676 INP_WLOCK_ASSERT(inp);
677
678 if (refcount_release(&synqe->refcnt)) {
679 inp = release_lctx(sc, synqe->lctx);
680 m_freem(synqe->syn);
681 free(synqe, M_CXGBE);
682 }
683
684 return (inp);
685 }
686
/*
 * TOE callback: the host syncache has taken a pointer to the synqe, so give
 * it its own reference.
 */
void
t4_syncache_added(struct toedev *tod __unused, void *arg)
{
	struct synq_entry *synqe = arg;

	hold_synqe(synqe);
}
694
/*
 * TOE callback: the host syncache dropped its pointer to the synqe; release
 * the reference taken in t4_syncache_added.
 */
void
t4_syncache_removed(struct toedev *tod, void *arg)
{
	struct adapter *sc = tod->tod_softc;
	struct synq_entry *synqe = arg;
	struct inpcb *inp = synqe->lctx->inp;

	/*
	 * XXX: this is a LOR but harmless when running from the softclock.
	 */
	INP_WLOCK(inp);
	inp = release_synqe(sc, synqe);
	if (inp != NULL)
		INP_WUNLOCK(inp);
}
710
/*
 * TOE callback: the host stack wants a SYN|ACK sent for this embryonic
 * connection.  The hardware sends the actual SYN|ACK; here we only record
 * (on the first call, gated by the atomic increment of ok_to_respond) the
 * ISS/IRS and the timestamp option parsed from the stack-built reply in m.
 * Always succeeds.
 */
int
t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
{
	struct synq_entry *synqe = arg;

	/* Only the first responder parses and saves the TCP fields. */
	if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) {
		struct tcpopt to;
		struct ip *ip = mtod(m, struct ip *);
		struct tcphdr *th;

		/* m starts at the IP header; step over v4 or v6 header. */
		if (ip->ip_v == IPVERSION)
			th = (void *)(ip + 1);
		else
			th = (void *)((struct ip6_hdr *)ip + 1);
		bzero(&to, sizeof(to));
		tcp_dooptions(&to, (void *)(th + 1),
		    (th->th_off << 2) - sizeof(*th), TO_SYN);

		/* save these for later */
		synqe->iss = be32toh(th->th_seq);
		synqe->irs = be32toh(th->th_ack) - 1;
		synqe->ts = to.to_tsval;
	}

	m_freem(m);	/* don't need this any more */
	return (0);
}
738
/*
 * Handler for CPL_PASS_OPEN_RPL, the firmware's answer to create_server or
 * create_server6.  Clears LCTX_RPL_PENDING and reconciles the result with
 * the current state of the listening inp (which may have been closed while
 * the reply was in flight).
 */
static int
do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
	int stid = GET_TID(cpl);
	unsigned int status = cpl->status;
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_OPEN_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	INP_WLOCK(inp);

	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
	    __func__, stid, status, lctx->flags);

	lctx->flags &= ~LCTX_RPL_PENDING;

	if (status != CPL_ERR_NONE)
		log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);

#ifdef INVARIANTS
	/*
	 * If the inp has been dropped (listening socket closed) then
	 * listen_stop must have run and taken the inp out of the hash.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		KASSERT(listen_hash_del(sc, inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));
	}
#endif

	/*
	 * Socket closed while the open was pending AND the open failed:
	 * nothing left to undo in hardware, just drop our hold on the lctx.
	 */
	if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
		if (release_lctx(sc, lctx) != NULL)
			INP_WUNLOCK(inp);
		return (status);
	}

	/*
	 * Listening socket stopped listening earlier and now the chip tells us
	 * it has started the hardware listener.  Stop it; the lctx will be
	 * released in do_close_server_rpl.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		destroy_server(sc, lctx);
		INP_WUNLOCK(inp);
		return (status);
	}

	/*
	 * Failed to start hardware listener.  Take inp out of the hash and
	 * release our reference on it.  An error message has been logged
	 * already.
	 */
	if (status != CPL_ERR_NONE) {
		listen_hash_del(sc, inp);
		if (release_lctx(sc, lctx) != NULL)
			INP_WUNLOCK(inp);
		return (status);
	}

	/* hardware listener open for business */

	INP_WUNLOCK(inp);
	return (status);
}
813
/*
 * Handler for CPL_CLOSE_LISTSRV_RPL, the firmware's answer to
 * destroy_server.  On success, drops the hold on the lctx that the hardware
 * listener represented.
 */
static int
do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
	int stid = GET_TID(cpl);
	unsigned int status = cpl->status;
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);

	/* On failure the lctx is kept; the hardware listener still exists. */
	if (status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
		    __func__, status, stid);
		return (status);
	}

	INP_WLOCK(inp);
	inp = release_lctx(sc, lctx);
	if (inp != NULL)
		INP_WUNLOCK(inp);

	return (status);
}
848
/*
 * Final teardown of a synq entry's hardware state: free its tid, return the
 * tid to the firmware, release its L2 entry, and drop a synqe reference.
 * Enter with the listening inp write-locked; the lock is released here (or
 * the inp may be freed entirely via release_synqe).
 */
static void
done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
{
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
	int ntids;

	INP_WLOCK_ASSERT(inp);
	/* An IPv6 tid consumed 2 tid-table entries, IPv4 just 1. */
	ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;

	remove_tid(sc, synqe->tid, ntids);
	release_tid(sc, synqe->tid, lctx->ctrlq);
	t4_l2t_release(e);
	inp = release_synqe(sc, synqe);
	if (inp)
		INP_WUNLOCK(inp);
}
867
/*
 * Clean up the synq entry for a tid whose SYN|ACK could not be sent.  The
 * inp lock taken here is released (or the inp freed) by done_with_synqe.
 */
void
synack_failure_cleanup(struct adapter *sc, int tid)
{
	struct synq_entry *synqe = lookup_tid(sc, tid);

	INP_WLOCK(synqe->lctx->inp);
	done_with_synqe(sc, synqe);
}
876
/*
 * Handler for a peer/firmware-initiated CPL_ABORT_REQ_RSS on an embryonic
 * (synq) tid.  Unless the abort is mere negative advice, tears the synqe
 * down (or defers to the reply of an abort we initiated earlier) and always
 * answers the chip with a CPL_ABORT_RPL.
 */
int
do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
	struct sge_ofld_txq *ofld_txq;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_REQ_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);

	if (negative_advice(cpl->status))
		return (0);	/* Ignore negative advice */

	INP_WLOCK(inp);

	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];

	/* The tid must have a flowc before the abort reply is sent for it. */
	if (!(synqe->flags & TPF_FLOWC_WR_SENT))
		send_flowc_wr_synqe(sc, synqe);

	/*
	 * If we'd initiated an abort earlier the reply to it is responsible for
	 * cleaning up resources.  Otherwise we tear everything down right here
	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
	 */
	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
		INP_WUNLOCK(inp);
		goto done;
	}

	done_with_synqe(sc, synqe);
	/* inp lock released by done_with_synqe */
done:
	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
	return (0);
}
926
/*
 * Handler for CPL_ABORT_RPL_RSS on an embryonic (synq) tid: the firmware has
 * acknowledged the abort we sent (see send_abort_rpl_synqe), so finish
 * tearing the synqe down.
 */
int
do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_RPL_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);

	INP_WLOCK(inp);
	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
	    __func__, synqe, synqe->flags));

	done_with_synqe(sc, synqe);
	/* inp lock released by done_with_synqe */

	return (0);
}
959
/*
 * TOE callback: the embryonic connection represented by the synqe has been
 * accepted into a full socket.  Attach the toepcb to the socket, move the
 * connection to ESTABLISHED, and replace the synqe with the toepcb in the
 * tid table.
 */
void
t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
{
	struct adapter *sc = tod->tod_softc;
	struct synq_entry *synqe = arg;
	struct inpcb *inp = sotoinpcb(so);
	struct toepcb *toep = synqe->toep;

	NET_EPOCH_ASSERT();	/* prevents bad race with accept() */
	INP_WLOCK_ASSERT(inp);
	KASSERT(synqe->flags & TPF_SYNQE,
	    ("%s: %p not a synq_entry?", __func__, arg));
	MPASS(toep->tid == synqe->tid);

	offload_socket(so, toep);
	make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt);
	toep->flags |= TPF_CPL_PENDING;
	update_tid(sc, synqe->tid, toep);	/* tid now maps to the toepcb */
	synqe->flags |= TPF_SYNQE_EXPANDED;
	/* Propagate the hardware RSS hash so the stack's flow id is valid. */
	inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ?
	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4;
	inp->inp_flowid = synqe->rss_hash;
}
983
984 static void
t4opt_to_tcpopt(const struct tcp_options * t4opt,struct tcpopt * to)985 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
986 {
987 bzero(to, sizeof(*to));
988
989 if (t4opt->mss) {
990 to->to_flags |= TOF_MSS;
991 to->to_mss = be16toh(t4opt->mss);
992 }
993
994 if (t4opt->wsf > 0 && t4opt->wsf < 15) {
995 to->to_flags |= TOF_SCALE;
996 to->to_wscale = t4opt->wsf;
997 }
998
999 if (t4opt->tstamp)
1000 to->to_flags |= TOF_TS;
1001
1002 if (t4opt->sack)
1003 to->to_flags |= TOF_SACKPERM;
1004 }
1005
1006 static bool
encapsulated_syn(struct adapter * sc,const struct cpl_pass_accept_req * cpl)1007 encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl)
1008 {
1009 u_int hlen = be32toh(cpl->hdr_len);
1010
1011 if (chip_id(sc) >= CHELSIO_T6)
1012 return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header));
1013 else
1014 return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header));
1015 }
1016
/*
 * Extract the protocol headers embedded in a CPL_PASS_ACCEPT_REQ (the
 * original SYN) into kernel representations.  Any of inc/th/iptos may be
 * NULL if the caller does not need that piece of information.
 */
static void
pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
    struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos)
{
	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
	const struct ether_header *eh;
	unsigned int hlen = be32toh(cpl->hdr_len);	/* encoded L2/L3 lengths */
	uintptr_t l3hdr;
	const struct tcphdr *tcp;

	/* The frame as received starts right after the CPL. */
	eh = (const void *)(cpl + 1);
	if (chip_id(sc) >= CHELSIO_T6) {
		/* T6+ uses different bitfields for the header lengths. */
		l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
		tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
	} else {
		l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
		tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
	}

	/* extract TOS (DiffServ + ECN) byte for AccECN */
	if (iptos) {
		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
			const struct ip *ip = (const void *)l3hdr;
			*iptos = ip->ip_tos;
		}
#ifdef INET6
		else
		if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) {
			const struct ip6_hdr *ip6 = (const void *)l3hdr;
			/* Traffic class is bits 20-27 of ip6_flow. */
			*iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
		}
#endif /* INET6 */
	}

	if (inc) {
		bzero(inc, sizeof(*inc));
		/* Ports are kept in network byte order in in_conninfo. */
		inc->inc_fport = tcp->th_sport;
		inc->inc_lport = tcp->th_dport;
		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
			const struct ip *ip = (const void *)l3hdr;

			inc->inc_faddr = ip->ip_src;
			inc->inc_laddr = ip->ip_dst;
		} else {
			const struct ip6_hdr *ip6 = (const void *)l3hdr;

			inc->inc_flags |= INC_ISIPV6;
			inc->inc6_faddr = ip6->ip6_src;
			inc->inc6_laddr = ip6->ip6_dst;
		}
	}

	if (th) {
		bcopy(tcp, th, sizeof(*th));
		tcp_fields_to_host(th);	/* just like tcp_input */
	}
}
1074
/*
 * Resolve the L2 next hop towards the peer in 'inc' and return an L2T
 * entry for it.  Returns NULL if there is no route or if the route does
 * not go out of 'ifp'.
 */
static struct l2t_entry *
get_l2te_for_nexthop(struct port_info *pi, if_t ifp,
    struct in_conninfo *inc)
{
	struct l2t_entry *e;
	struct sockaddr_in6 sin6;	/* big enough for v4 and v6 */
	struct sockaddr *dst = (void *)&sin6;
	struct nhop_object *nh;

	if (inc->inc_flags & INC_ISIPV6) {
		bzero(dst, sizeof(struct sockaddr_in6));
		dst->sa_len = sizeof(struct sockaddr_in6);
		dst->sa_family = AF_INET6;

		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
			/* no need for route lookup */
			e = t4_l2t_get(pi, ifp, dst);
			return (e);
		}

		nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0);
		if (nh == NULL)
			return (NULL);
		if (nh->nh_ifp != ifp)
			return (NULL);
		/* Resolve the gateway if there is one, else the peer itself. */
		if (nh->nh_flags & NHF_GATEWAY)
			((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr;
		else
			((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr;
	} else {
		/*
		 * NOTE(review): dst is not zeroed on this path; presumably
		 * t4_l2t_get only looks at sa_len/sa_family and the address
		 * that is filled in below -- confirm.
		 */
		dst->sa_len = sizeof(struct sockaddr_in);
		dst->sa_family = AF_INET;

		nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0);
		if (nh == NULL)
			return (NULL);
		if (nh->nh_ifp != ifp)
			return (NULL);
		/*
		 * Unbraced nesting: the first 'else' below pairs with the
		 * AF_INET check (an IPv4 route may have an IPv6 nexthop),
		 * the second with the NHF_GATEWAY check.
		 */
		if (nh->nh_flags & NHF_GATEWAY)
			if (nh->gw_sa.sa_family == AF_INET)
				((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr;
			else
				*((struct sockaddr_in6 *)dst) = nh->gw6_sa;
		else
			((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr;
	}

	e = t4_l2t_get(pi, ifp, dst);
	return (e);
}
1125
1126 static int
send_synack(struct adapter * sc,struct synq_entry * synqe,uint64_t opt0,uint32_t opt2,int tid)1127 send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0,
1128 uint32_t opt2, int tid)
1129 {
1130 struct wrqe *wr;
1131 struct cpl_pass_accept_rpl *rpl;
1132 struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
1133
1134 wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
1135 sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]);
1136 if (wr == NULL)
1137 return (ENOMEM);
1138 rpl = wrtod(wr);
1139
1140 if (is_t4(sc))
1141 INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
1142 else {
1143 struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;
1144
1145 INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
1146 rpl5->iss = htobe32(synqe->iss);
1147 }
1148 rpl->opt0 = opt0;
1149 rpl->opt2 = opt2;
1150
1151 return (t4_l2t_send(sc, wr, e));
1152 }
1153
/*
 * Abandon the attempt to offload this incoming SYN and jump to the
 * 'reject' label.  With tunnel == false the SYN mbuf is freed here;
 * with tunnel == true it is kept so the reject path can hand the SYN
 * to the kernel's software TCP stack instead.  reject_reason records
 * the source line for the KTR trace at the reject label.
 */
#define REJECT_PASS_ACCEPT_REQ(tunnel) do { \
	if (!tunnel) { \
		m_freem(m); \
		m = NULL; \
	} \
	reject_reason = __LINE__; \
	goto reject; \
} while (0)

/*
 * The context associated with a tid entry via insert_tid could be a synq_entry
 * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
 */
CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
1168
1169 /*
1170 * Incoming SYN on a listening socket.
1171 *
1172 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
1173 * etc.
1174 */
static int
do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct toedev *tod;
	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
	unsigned int tid = GET_TID(cpl);	/* tid assigned by the hw */
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp;
	struct socket *so;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct port_info *pi;
	struct vi_info *vi;
	if_t hw_ifp, ifp;
	struct l2t_entry *e = NULL;
	struct synq_entry *synqe = NULL;
	int reject_reason, v, ntids;
	uint16_t vid, l2info;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif
	struct offload_settings settings;
	uint8_t iptos;

	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	/*
	 * Figure out the port the SYN arrived on.  We'll look for an exact VI
	 * match in a bit but in case we don't find any we'll use the main VI as
	 * the incoming ifnet.
	 */
	l2info = be16toh(cpl->l2info);
	pi = sc->port[G_SYN_INTF(l2info)];
	hw_ifp = pi->vi[0].ifp;
	m->m_pkthdr.rcvif = hw_ifp;

	CURVNET_SET(lctx->vnet);	/* before any potential REJECT */

	/*
	 * If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will
	 * also hit the listener.  We don't want to offload those.
	 */
	if (encapsulated_syn(sc, cpl)) {
		REJECT_PASS_ACCEPT_REQ(true);
	}

	/*
	 * Use the MAC index to lookup the associated VI.  If this SYN didn't
	 * match a perfect MAC filter, punt.
	 */
	if (!(l2info & F_SYN_XACT_MATCH)) {
		REJECT_PASS_ACCEPT_REQ(true);
	}
	for_each_vi(pi, v, vi) {
		if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info))
			goto found;
	}
	REJECT_PASS_ACCEPT_REQ(true);
found:
	hw_ifp = vi->ifp;	/* the cxgbe ifnet */
	m->m_pkthdr.rcvif = hw_ifp;
	tod = TOEDEV(hw_ifp);

	/*
	 * Don't offload if the peer requested a TCP option that's not known to
	 * the silicon.  Send the SYN to the kernel instead.
	 */
	if (__predict_false(cpl->tcpopt.unknown))
		REJECT_PASS_ACCEPT_REQ(true);

	/*
	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
	 * doesn't match anything on this interface.
	 *
	 * XXX: lagg support, lagg + vlan support.
	 */
	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
	if (vid != 0xfff && vid != 0) {
		ifp = VLAN_DEVAT(hw_ifp, vid);
		if (ifp == NULL)
			REJECT_PASS_ACCEPT_REQ(true);
	} else
		ifp = hw_ifp;

	/*
	 * Don't offload if the ifnet that the SYN came in on is not in the same
	 * vnet as the listening socket.
	 */
	if (lctx->vnet != if_getvnet(ifp))
		REJECT_PASS_ACCEPT_REQ(true);

	pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos);
	if (inc.inc_flags & INC_ISIPV6) {

		/* Don't offload if the ifcap isn't enabled */
		if ((if_getcapenable(ifp) & IFCAP_TOE6) == 0)
			REJECT_PASS_ACCEPT_REQ(true);

		/*
		 * SYN must be directed to an IP6 address on this ifnet.  This
		 * is more restrictive than in6_localip.
		 */
		NET_EPOCH_ENTER(et);	/* held until success or reject */
		if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) {
			NET_EPOCH_EXIT(et);
			REJECT_PASS_ACCEPT_REQ(true);
		}

		ntids = 2;	/* an IPv6 connection occupies 2 hw tids */
	} else {

		/* Don't offload if the ifcap isn't enabled */
		if ((if_getcapenable(ifp) & IFCAP_TOE4) == 0)
			REJECT_PASS_ACCEPT_REQ(true);

		/*
		 * SYN must be directed to an IP address on this ifnet.  This
		 * is more restrictive than in_localip.
		 */
		NET_EPOCH_ENTER(et);	/* held until success or reject */
		if (!in_ifhasaddr(ifp, inc.inc_laddr)) {
			NET_EPOCH_EXIT(et);
			REJECT_PASS_ACCEPT_REQ(true);
		}

		ntids = 1;
	}

	e = get_l2te_for_nexthop(pi, ifp, &inc);
	if (e == NULL) {
		NET_EPOCH_EXIT(et);
		REJECT_PASS_ACCEPT_REQ(true);
	}

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		NET_EPOCH_EXIT(et);
		REJECT_PASS_ACCEPT_REQ(false);
	}

	inp = lctx->inp;	/* listening socket, not owned by TOE */
	INP_RLOCK(inp);

	/* Don't offload if the listening socket has closed */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		INP_RUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		REJECT_PASS_ACCEPT_REQ(false);
	}
	so = inp->inp_socket;
	/* Consult the connection offload policy (COP). */
	rw_rlock(&sc->policy_lock);
	settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m,
	    EVL_MAKETAG(0xfff, 0, 0), inp);
	rw_runlock(&sc->policy_lock);
	if (!settings.offload) {
		INP_RUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		REJECT_PASS_ACCEPT_REQ(true);	/* Rejected by COP. */
	}

	synqe = alloc_synqe(sc, lctx, M_NOWAIT);
	if (synqe == NULL) {
		INP_RUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		REJECT_PASS_ACCEPT_REQ(true);
	}
	MPASS(rss->hash_type == RSS_HASH_TCP);
	synqe->rss_hash = be32toh(rss->hash_val);
	/* Set by t4_syncache_respond if syncache_add accepts the entry. */
	atomic_store_int(&synqe->ok_to_respond, 0);

	init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx,
	    &synqe->params);

	/*
	 * If all goes well t4_syncache_respond will get called during
	 * syncache_add.  Note that syncache_add releases the pcb lock.
	 */
	t4opt_to_tcpopt(&cpl->tcpopt, &to);
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos);

	if (atomic_load_int(&synqe->ok_to_respond) > 0) {
		uint64_t opt0;
		uint32_t opt2;

		opt0 = calc_options0(vi, &synqe->params);
		opt2 = calc_options2(vi, &synqe->params);

		insert_tid(sc, tid, synqe, ntids);
		synqe->tid = tid;
		synqe->syn = m;	/* synqe takes ownership of the SYN mbuf */
		m = NULL;

		if (send_synack(sc, synqe, opt0, opt2, tid) != 0) {
			/* Undo: take the tid and the mbuf back. */
			remove_tid(sc, tid, ntids);
			m = synqe->syn;
			synqe->syn = NULL;
			NET_EPOCH_EXIT(et);
			REJECT_PASS_ACCEPT_REQ(true);
		}

		CTR6(KTR_CXGBE,
		    "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x",
		    __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2));
	} else {
		NET_EPOCH_EXIT(et);
		REJECT_PASS_ACCEPT_REQ(false);
	}

	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	return (0);
reject:
	CURVNET_RESTORE();
	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	if (e)
		t4_l2t_release(e);
	/* Tell the hw to release the unused tid it allocated for this SYN. */
	release_tid(sc, tid, lctx->ctrlq);
	if (synqe) {
		inp = synqe->lctx->inp;
		INP_WLOCK(inp);
		inp = release_synqe(sc, synqe);
		if (inp)
			INP_WUNLOCK(inp);
	}

	if (m) {
		/*
		 * The connection request hit a TOE listener but is being passed
		 * on to the kernel sw stack instead of getting offloaded.
		 */
		m_adj(m, sizeof(*cpl));
		/* The hw has already verified the checksums. */
		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
		m->m_pkthdr.csum_data = 0xffff;
		if_input(hw_ifp, m);
	}

	return (reject_reason);
}
1427
/*
 * Reconstruct protocol headers that look like the ACK which completed the
 * 3-way handshake, starting from the saved SYN and the CPL_PASS_ESTABLISH.
 * The result is fed to syncache_expand.
 */
static void
synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
    const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
    struct tcphdr *th, struct tcpopt *to)
{
	uint16_t tcp_opt = be16toh(cpl->tcp_opt);
	uint8_t iptos;

	/* start off with the original SYN */
	pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos);

	/* modify parts to make it look like the ACK to our SYN|ACK */
	th->th_flags = TH_ACK;
	th->th_ack = synqe->iss + 1;
	th->th_seq = be32toh(cpl->rcv_isn);
	bzero(to, sizeof(*to));
	if (G_TCPOPT_TSTAMP(tcp_opt)) {
		to->to_flags |= TOF_TS;
		/* presumably the ts value saved from the SYN -- confirm */
		to->to_tsecr = synqe->ts;
	}
}
1449
/*
 * CPL_PASS_ESTABLISH: the hw completed the 3-way handshake for an
 * embryonic connection.  Allocate a toepcb, expand the syncache entry
 * into a full socket, and hand the connection over to the TOE.
 */
static int
do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct vi_info *vi;
	if_t ifp;
	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
#if defined(KTR) || defined(INVARIANTS)
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
#endif
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp, *new_inp;
	struct socket *so;
	struct tcphdr th;
	struct tcpopt to;
	struct in_conninfo inc;
	struct toepcb *toep;
	struct epoch_tracker et;
	int rstreason;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ESTABLISH,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
	KASSERT(synqe->flags & TPF_SYNQE,
	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));

	CURVNET_SET(lctx->vnet);
	NET_EPOCH_ENTER(et);	/* for syncache_expand */
	INP_WLOCK(inp);

	CTR6(KTR_CXGBE,
	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);

	ifp = synqe->syn->m_pkthdr.rcvif;
	vi = if_getsoftc(ifp);
	KASSERT(vi->adapter == sc,
	    ("%s: vi %p, sc %p mismatch", __func__, vi, sc));

	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
reset:
		/* Listener is gone (or setup failed); RST the peer. */
		send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_SEND_RST);
		INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();
		return (0);
	}

	/* The CPL must arrive on the rxq picked at SYN time. */
	KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
	    ("%s: CPL arrived on unexpected rxq. %d %d", __func__,
	    synqe->params.rxq_idx,
	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));

	toep = alloc_toepcb(vi, M_NOWAIT);
	if (toep == NULL)
		goto reset;
	toep->tid = tid;
	toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx];
	toep->vnet = lctx->vnet;
	/* Connection parameters were decided when the SYN was processed. */
	bcopy(&synqe->params, &toep->params, sizeof(toep->params));
	init_toepcb(vi, toep);

	MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
	MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);
	synqe->tcp_opt = cpl->tcp_opt;
	synqe->toep = toep;

	/* Come up with something that syncache_expand should be ok with. */
	synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);
	if (inc.inc_flags & INC_ISIPV6) {
		/* An IPv6 connection needs a CLIP entry for the local IP. */
		if (lctx->ce == NULL) {
			toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true);
			if (toep->ce == NULL) {
				free_toepcb(toep);
				goto reset;	/* RST without a CLIP entry? */
			}
		} else {
			t4_hold_clip_entry(sc, lctx->ce);
			toep->ce = lctx->ce;
		}
	}
	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: socket is NULL", __func__));

	rstreason = toe_syncache_expand(&inc, &to, &th, &so);
	if (rstreason < 0) {
		/* Expansion failed; abort quietly (no RST to the peer). */
		free_toepcb(toep);
		send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST);
		INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();
		return (0);
	} else if (rstreason == 0 || so == NULL) {
		free_toepcb(toep);
		goto reset;
	}

	/* New connection inpcb is already locked by syncache_expand(). */
	new_inp = sotoinpcb(so);
	INP_WLOCK_ASSERT(new_inp);
	MPASS(so->so_vnet == lctx->vnet);

	/*
	 * This is for expansion from syncookies.
	 *
	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
	 * anyone accept'ing a connection before we've installed our hooks, but
	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
	 */
	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
		t4_offload_socket(TOEDEV(ifp), synqe, so);
	}

	INP_WUNLOCK(new_inp);

	/* Done with the synqe */
	inp = release_synqe(sc, synqe);
	if (inp != NULL)
		INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();

	return (0);
}
1582
1583 void
t4_init_listen_cpl_handlers(void)1584 t4_init_listen_cpl_handlers(void)
1585 {
1586
1587 t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
1588 t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
1589 t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
1590 t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
1591 }
1592
1593 void
t4_uninit_listen_cpl_handlers(void)1594 t4_uninit_listen_cpl_handlers(void)
1595 {
1596
1597 t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
1598 t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
1599 t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
1600 t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);
1601 }
1602 #endif
1603