1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2012 Chelsio Communications, Inc.
5 * All rights reserved.
6 * Written by: Navdeep Parhar <np@FreeBSD.org>
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include <sys/cdefs.h>
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 #include "opt_kern_tls.h"
34 #include "opt_ratelimit.h"
35
36 #include <sys/param.h>
37 #include <sys/types.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/ktr.h>
41 #include <sys/lock.h>
42 #include <sys/limits.h>
43 #include <sys/module.h>
44 #include <sys/protosw.h>
45 #include <sys/domain.h>
46 #include <sys/refcount.h>
47 #include <sys/rmlock.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/sysctl.h>
51 #include <sys/taskqueue.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <net/if_types.h>
55 #include <net/if_vlan_var.h>
56 #include <netinet/in.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/in_var.h>
59 #include <netinet/ip.h>
60 #include <netinet/ip6.h>
61 #include <netinet6/scope6_var.h>
62 #define TCPSTATES
63 #include <netinet/tcp_fsm.h>
64 #include <netinet/tcp_seq.h>
65 #include <netinet/tcp_timer.h>
66 #include <netinet/tcp_var.h>
67 #include <netinet/toecore.h>
68 #include <netinet/cc/cc.h>
69
70 #ifdef TCP_OFFLOAD
71 #include "common/common.h"
72 #include "common/t4_msg.h"
73 #include "common/t4_regs.h"
74 #include "common/t4_regs_values.h"
75 #include "common/t4_tcb.h"
76 #include "t4_clip.h"
77 #include "tom/t4_tom_l2t.h"
78 #include "tom/t4_tom.h"
79 #include "tom/t4_tls.h"
80
81 static struct protosw toe_protosw;
82 static struct protosw toe6_protosw;
83
84 /* Module ops */
85 static int t4_tom_mod_load(void);
86 static int t4_tom_mod_unload(void);
87 static int t4_tom_modevent(module_t, int, void *);
88
89 /* ULD ops and helpers */
90 static int t4_tom_activate(struct adapter *);
91 static int t4_tom_deactivate(struct adapter *);
92 static int t4_tom_stop(struct adapter *);
93 static int t4_tom_restart(struct adapter *);
94
95 static struct uld_info tom_uld_info = {
96 .uld_activate = t4_tom_activate,
97 .uld_deactivate = t4_tom_deactivate,
98 .uld_stop = t4_tom_stop,
99 .uld_restart = t4_tom_restart,
100 };
101
102 static void release_offload_resources(struct toepcb *);
103 static void done_with_toepcb(struct toepcb *);
104 static int alloc_tid_tabs(struct adapter *);
105 static void free_tid_tabs(struct adapter *);
106 static void free_tom_data(struct adapter *, struct tom_data *);
107 static void reclaim_wr_resources(void *, int);
108 static void cleanup_stranded_tids(void *, int);
109
110 struct toepcb *
111 alloc_toepcb(struct vi_info *vi, int flags)
112 {
113 struct port_info *pi = vi->pi;
114 struct adapter *sc = pi->adapter;
115 struct toepcb *toep;
116 int tx_credits, txsd_total, len;
117
118 /*
119 * The firmware counts tx work request credits in units of 16 bytes
120 * each. Reserve room for an ABORT_REQ so the driver never has to worry
121 * about tx credits if it wants to abort a connection.
122 */
123 tx_credits = sc->params.ofldq_wr_cred;
124 tx_credits -= howmany(sizeof(struct cpl_abort_req), 16);
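/*
 * Worked example (hypothetical sizes): with ofldq_wr_cred = 512 and a
 * cpl_abort_req of up to 32 bytes, howmany(32, 16) = 2 credits are set
 * aside, leaving 510 credits for regular tx work requests.
 */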
125
126 /*
127 * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte
128 * immediate payload, and firmware counts tx work request credits in
129 * units of 16 bytes. Calculate the maximum number of work requests possible.
130 */
131 txsd_total = tx_credits /
132 howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16);
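/*
 * Illustrative math (assumed header size): if fw_ofld_tx_data_wr plus one
 * immediate byte rounds up to 3 credits, then txsd_total = tx_credits / 3.
 */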
133
134 len = offsetof(struct toepcb, txsd) +
135 txsd_total * sizeof(struct ofld_tx_sdesc);
136
137 toep = malloc(len, M_CXGBE, M_ZERO | flags);
138 if (toep == NULL)
139 return (NULL);
140
141 refcount_init(&toep->refcount, 1);
142 toep->td = sc->tom_softc;
143 toep->incarnation = sc->incarnation;
144 toep->vi = vi;
145 toep->tid = -1;
146 toep->tx_total = tx_credits;
147 toep->tx_credits = tx_credits;
148 mbufq_init(&toep->ulp_pduq, INT_MAX);
149 mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX);
150 toep->txsd_total = txsd_total;
151 toep->txsd_avail = txsd_total;
152 toep->txsd_pidx = 0;
153 toep->txsd_cidx = 0;
154 aiotx_init_toep(toep);
155
156 return (toep);
157 }
158
159 /*
160 * Initialize a toepcb after its params have been filled out.
161 */
162 int
163 init_toepcb(struct vi_info *vi, struct toepcb *toep)
164 {
165 struct conn_params *cp = &toep->params;
166 struct port_info *pi = vi->pi;
167 struct adapter *sc = pi->adapter;
168 struct tx_cl_rl_params *tc;
169
170 if (cp->tc_idx >= 0 && cp->tc_idx < sc->params.nsched_cls) {
171 tc = &pi->sched_params->cl_rl[cp->tc_idx];
172 mtx_lock(&sc->tc_lock);
173 if (tc->state != CS_HW_CONFIGURED) {
174 CH_ERR(vi, "tid %d cannot be bound to traffic class %d "
175 "because it is not configured (its state is %d)\n",
176 toep->tid, cp->tc_idx, tc->state);
177 cp->tc_idx = -1;
178 } else {
179 tc->refcount++;
180 }
181 mtx_unlock(&sc->tc_lock);
182 }
183 toep->ofld_txq = &sc->sge.ofld_txq[cp->txq_idx];
184 toep->ofld_rxq = &sc->sge.ofld_rxq[cp->rxq_idx];
185 toep->ctrlq = &sc->sge.ctrlq[pi->port_id];
186
187 tls_init_toep(toep);
188 MPASS(ulp_mode(toep) != ULP_MODE_TCPDDP);
189
190 toep->flags |= TPF_INITIALIZED;
191
192 return (0);
193 }
194
195 struct toepcb *
196 hold_toepcb(struct toepcb *toep)
197 {
198
199 refcount_acquire(&toep->refcount);
200 return (toep);
201 }
202
203 void
204 free_toepcb(struct toepcb *toep)
205 {
206
207 if (refcount_release(&toep->refcount) == 0)
208 return;
209
210 KASSERT(!(toep->flags & TPF_ATTACHED),
211 ("%s: attached to an inpcb", __func__));
212 KASSERT(!(toep->flags & TPF_CPL_PENDING),
213 ("%s: CPL pending", __func__));
214
215 if (toep->flags & TPF_INITIALIZED) {
216 if (ulp_mode(toep) == ULP_MODE_TCPDDP)
217 ddp_uninit_toep(toep);
218 tls_uninit_toep(toep);
219 }
220 free(toep, M_CXGBE);
221 }
222
223 /*
224 * Set up the socket for TCP offload.
225 */
226 void
227 offload_socket(struct socket *so, struct toepcb *toep)
228 {
229 struct tom_data *td = toep->td;
230 struct inpcb *inp = sotoinpcb(so);
231 struct tcpcb *tp = intotcpcb(inp);
232 struct sockbuf *sb;
233
234 INP_WLOCK_ASSERT(inp);
235
236 /* Update socket */
237 sb = &so->so_snd;
238 SOCKBUF_LOCK(sb);
239 sb->sb_flags |= SB_NOCOALESCE;
240 SOCKBUF_UNLOCK(sb);
241 sb = &so->so_rcv;
242 SOCKBUF_LOCK(sb);
243 sb->sb_flags |= SB_NOCOALESCE;
244 if (inp->inp_vflag & INP_IPV6)
245 so->so_proto = &toe6_protosw;
246 else
247 so->so_proto = &toe_protosw;
248 SOCKBUF_UNLOCK(sb);
249
250 /* Update TCP PCB */
251 tp->tod = &td->tod;
252 tp->t_toe = toep;
253 tp->t_flags |= TF_TOE;
254
255 /* Install an extra hold on inp */
256 toep->inp = inp;
257 toep->flags |= TPF_ATTACHED;
258 in_pcbref(inp);
259
260 /* Add the TOE PCB to the active list */
261 mtx_lock(&td->toep_list_lock);
262 TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
263 toep->flags |= TPF_IN_TOEP_LIST;
264 mtx_unlock(&td->toep_list_lock);
265 }
266
267 void
268 restore_so_proto(struct socket *so, bool v6)
269 {
270 if (v6)
271 so->so_proto = &tcp6_protosw;
272 else
273 so->so_proto = &tcp_protosw;
274 }
275
276 /* This is _not_ the normal way to "unoffload" a socket. */
277 void
278 undo_offload_socket(struct socket *so)
279 {
280 struct inpcb *inp = sotoinpcb(so);
281 struct tcpcb *tp = intotcpcb(inp);
282 struct toepcb *toep = tp->t_toe;
283 struct tom_data *td = toep->td;
284 struct sockbuf *sb;
285
286 INP_WLOCK_ASSERT(inp);
287
288 sb = &so->so_snd;
289 SOCKBUF_LOCK(sb);
290 sb->sb_flags &= ~SB_NOCOALESCE;
291 SOCKBUF_UNLOCK(sb);
292 sb = &so->so_rcv;
293 SOCKBUF_LOCK(sb);
294 sb->sb_flags &= ~SB_NOCOALESCE;
295 restore_so_proto(so, inp->inp_vflag & INP_IPV6);
296 SOCKBUF_UNLOCK(sb);
297
298 tp->tod = NULL;
299 tp->t_toe = NULL;
300 tp->t_flags &= ~TF_TOE;
301
302 toep->inp = NULL;
303 toep->flags &= ~TPF_ATTACHED;
304 if (in_pcbrele_wlocked(inp))
305 panic("%s: inp freed.", __func__);
306
307 mtx_lock(&td->toep_list_lock);
308 toep->flags &= ~TPF_IN_TOEP_LIST;
309 TAILQ_REMOVE(&td->toep_list, toep, link);
310 mtx_unlock(&td->toep_list_lock);
311 }
312
313 static void
314 release_offload_resources(struct toepcb *toep)
315 {
316 struct tom_data *td = toep->td;
317 struct adapter *sc = td_adapter(td);
318 int tid = toep->tid;
319
320 KASSERT(!(toep->flags & TPF_CPL_PENDING),
321 ("%s: %p has CPL pending.", __func__, toep));
322
323 CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)",
324 __func__, toep, tid, toep->l2te, toep->ce);
325
326 if (toep->l2te) {
327 t4_l2t_release(toep->l2te);
328 toep->l2te = NULL;
329 }
330 if (tid >= 0) {
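/* Connections holding a CLIP entry (IPv6) account for two tids here. */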
331 remove_tid(sc, tid, toep->ce ? 2 : 1);
332 release_tid(sc, tid, toep->ctrlq);
333 toep->tid = -1;
334 }
335 if (toep->ce) {
336 t4_release_clip_entry(sc, toep->ce);
337 toep->ce = NULL;
338 }
339 if (toep->params.tc_idx != -1)
340 t4_release_cl_rl(sc, toep->vi->pi->port_id, toep->params.tc_idx);
341 }
342
343 /*
344 * Both the driver and kernel are done with the toepcb.
345 */
346 static void
347 done_with_toepcb(struct toepcb *toep)
348 {
349 struct tom_data *td = toep->td;
350
351 KASSERT(!(toep->flags & TPF_CPL_PENDING),
352 ("%s: %p has CPL pending.", __func__, toep));
353 KASSERT(!(toep->flags & TPF_ATTACHED),
354 ("%s: %p is still attached.", __func__, toep));
355
356 CTR(KTR_CXGBE, "%s: toep %p (0x%x)", __func__, toep, toep->flags);
357
358 /*
359 * These queues should have been emptied at approximately the same time
360 * that a normal connection's so_snd would have been purged or drained.
361 * Do _not_ clean up here.
362 */
363 MPASS(mbufq_empty(&toep->ulp_pduq));
364 MPASS(mbufq_empty(&toep->ulp_pdu_reclaimq));
365 #ifdef INVARIANTS
366 if (ulp_mode(toep) == ULP_MODE_TCPDDP)
367 ddp_assert_empty(toep);
368 #endif
369 MPASS(TAILQ_EMPTY(&toep->aiotx_jobq));
370 MPASS(toep->tid == -1);
371 MPASS(toep->l2te == NULL);
372 MPASS(toep->ce == NULL);
373
374 mtx_lock(&td->toep_list_lock);
375 if (toep->flags & TPF_IN_TOEP_LIST) {
376 toep->flags &= ~TPF_IN_TOEP_LIST;
377 TAILQ_REMOVE(&td->toep_list, toep, link);
378 }
379 mtx_unlock(&td->toep_list_lock);
380
381 free_toepcb(toep);
382 }
383
384 /*
385 * The kernel is done with the TCP PCB and this is our opportunity to unhook the
386 * toepcb hanging off of it. If the TOE driver is also done with the toepcb (no
387 * pending CPL) then it is time to release all resources tied to the toepcb.
388 *
389 * Also gets called when an offloaded active open fails and the TOM wants the
390 * kernel to take the TCP PCB back.
391 */
392 void
393 t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
394 {
395 #if defined(KTR) || defined(INVARIANTS)
396 struct inpcb *inp = tptoinpcb(tp);
397 #endif
398 struct toepcb *toep = tp->t_toe;
399
400 INP_WLOCK_ASSERT(inp);
401
402 KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
403 KASSERT(toep->flags & TPF_ATTACHED,
404 ("%s: not attached", __func__));
405
406 #ifdef KTR
407 if (tp->t_state == TCPS_SYN_SENT) {
408 CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)",
409 __func__, toep->tid, toep, toep->flags, inp,
410 inp->inp_flags);
411 } else {
412 CTR6(KTR_CXGBE,
413 "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)",
414 toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp,
415 inp->inp_flags);
416 }
417 #endif
418
419 tp->tod = NULL;
420 tp->t_toe = NULL;
421 tp->t_flags &= ~TF_TOE;
422 toep->flags &= ~TPF_ATTACHED;
423
424 if (!(toep->flags & TPF_CPL_PENDING))
425 done_with_toepcb(toep);
426 }
427
428 /*
429 * setsockopt handler.
430 */
431 static void
432 t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name)
433 {
434 struct adapter *sc = tod->tod_softc;
435 struct toepcb *toep = tp->t_toe;
436
437 if (dir == SOPT_GET)
438 return;
439
440 CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name);
441
442 switch (name) {
443 case TCP_NODELAY:
444 if (tp->t_state != TCPS_ESTABLISHED)
445 break;
446 toep->params.nagle = tp->t_flags & TF_NODELAY ? 0 : 1;
447 t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
448 V_TF_NAGLE(1), V_TF_NAGLE(toep->params.nagle), 0, 0);
449 break;
450 default:
451 break;
452 }
453 }
454
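/*
 * The 64-bit t_flags field straddles two flits of the returned TCB image
 * (the hardware hands the TCB back with its words in reverse order), so
 * the two halves are extracted and stitched back together here.
 */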
455 static inline uint64_t
456 get_tcb_tflags(const uint64_t *tcb)
457 {
458
459 return ((be64toh(tcb[14]) << 32) | (be64toh(tcb[15]) >> 32));
460 }
461
462 static inline uint32_t
463 get_tcb_field(const uint64_t *tcb, u_int word, uint32_t mask, u_int shift)
464 {
465 #define LAST_WORD ((TCB_SIZE / 4) - 1)
466 uint64_t t1, t2;
467 int flit_idx;
468
469 MPASS(mask != 0);
470 MPASS(word <= LAST_WORD);
471 MPASS(shift < 32);
472
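/*
 * The reply carries the TCB with its 32-bit words in reverse order, so
 * 32-bit word 'word' lives in 64-bit flit (LAST_WORD - word) / 2.
 */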
473 flit_idx = (LAST_WORD - word) / 2;
474 if (word & 0x1)
475 shift += 32;
476 t1 = be64toh(tcb[flit_idx]) >> shift;
477 t2 = 0;
478 if (fls(mask) > 64 - shift) {
479 /*
480 * Will spill over into the next logical flit, which is the flit
481 * before this one. The flit_idx before this one must be valid.
482 */
483 MPASS(flit_idx > 0);
484 t2 = be64toh(tcb[flit_idx - 1]) << (64 - shift);
485 }
486 return ((t2 | t1) & mask);
487 #undef LAST_WORD
488 }
489 #define GET_TCB_FIELD(tcb, F) \
490 get_tcb_field(tcb, W_TCB_##F, M_TCB_##F, S_TCB_##F)
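/*
 * E.g. GET_TCB_FIELD(tcb, T_STATE) expands to
 * get_tcb_field(tcb, W_TCB_T_STATE, M_TCB_T_STATE, S_TCB_T_STATE), with
 * the word/mask/shift constants coming from t4_tcb.h.
 */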
491
492 /*
493 * Issues a CPL_GET_TCB to read the entire TCB for the tid.
494 */
495 static int
496 send_get_tcb(struct adapter *sc, u_int tid)
497 {
498 struct cpl_get_tcb *cpl;
499 struct wrq_cookie cookie;
500
501 MPASS(tid >= sc->tids.tid_base);
502 MPASS(tid - sc->tids.tid_base < sc->tids.ntids);
503
504 cpl = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*cpl), 16),
505 &cookie);
506 if (__predict_false(cpl == NULL))
507 return (ENOMEM);
508 bzero(cpl, sizeof(*cpl));
509 INIT_TP_WR(cpl, tid);
510 OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_GET_TCB, tid));
511 cpl->reply_ctrl = htobe16(V_REPLY_CHAN(0) |
512 V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id));
513 cpl->cookie = 0xff;
514 commit_wrq_wr(&sc->sge.ctrlq[0], cpl, &cookie);
515
516 return (0);
517 }
518
519 static struct tcb_histent *
520 alloc_tcb_histent(struct adapter *sc, u_int tid, int flags)
521 {
522 struct tcb_histent *te;
523
524 MPASS(flags == M_NOWAIT || flags == M_WAITOK);
525
526 te = malloc(sizeof(*te), M_CXGBE, M_ZERO | flags);
527 if (te == NULL)
528 return (NULL);
529 mtx_init(&te->te_lock, "TCB entry", NULL, MTX_DEF);
530 callout_init_mtx(&te->te_callout, &te->te_lock, 0);
531 te->te_adapter = sc;
532 te->te_tid = tid;
533
534 return (te);
535 }
536
537 static void
538 free_tcb_histent(struct tcb_histent *te)
539 {
540
541 mtx_destroy(&te->te_lock);
542 free(te, M_CXGBE);
543 }
544
545 /*
546 * Start tracking the tid in the TCB history.
547 */
548 int
549 add_tid_to_history(struct adapter *sc, u_int tid)
550 {
551 struct tcb_histent *te = NULL;
552 struct tom_data *td = sc->tom_softc;
553 int rc;
554
555 MPASS(tid >= sc->tids.tid_base);
556 MPASS(tid - sc->tids.tid_base < sc->tids.ntids);
557
558 if (td->tcb_history == NULL)
559 return (ENXIO);
560
561 rw_wlock(&td->tcb_history_lock);
562 if (td->tcb_history[tid] != NULL) {
563 rc = EEXIST;
564 goto done;
565 }
566 te = alloc_tcb_histent(sc, tid, M_NOWAIT);
567 if (te == NULL) {
568 rc = ENOMEM;
569 goto done;
570 }
571 mtx_lock(&te->te_lock);
572 rc = send_get_tcb(sc, tid);
573 if (rc == 0) {
574 te->te_flags |= TE_RPL_PENDING;
575 td->tcb_history[tid] = te;
mtx_unlock(&te->te_lock);
} else {
/* Unlock before freeing; free_tcb_histent() also destroys te_lock. */
mtx_unlock(&te->te_lock);
free_tcb_histent(te);
}
580 done:
581 rw_wunlock(&td->tcb_history_lock);
582 return (rc);
583 }
584
585 static void
586 remove_tcb_histent(struct tcb_histent *te)
587 {
588 struct adapter *sc = te->te_adapter;
589 struct tom_data *td = sc->tom_softc;
590
591 rw_assert(&td->tcb_history_lock, RA_WLOCKED);
592 mtx_assert(&te->te_lock, MA_OWNED);
593 MPASS(td->tcb_history[te->te_tid] == te);
594
595 td->tcb_history[te->te_tid] = NULL;
596 free_tcb_histent(te);
597 rw_wunlock(&td->tcb_history_lock);
598 }
599
600 static inline struct tcb_histent *
601 lookup_tcb_histent(struct adapter *sc, u_int tid, bool addrem)
602 {
603 struct tcb_histent *te;
604 struct tom_data *td = sc->tom_softc;
605
606 MPASS(tid >= sc->tids.tid_base);
607 MPASS(tid - sc->tids.tid_base < sc->tids.ntids);
608
609 if (td->tcb_history == NULL)
610 return (NULL);
611
612 if (addrem)
613 rw_wlock(&td->tcb_history_lock);
614 else
615 rw_rlock(&td->tcb_history_lock);
616 te = td->tcb_history[tid];
617 if (te != NULL) {
618 mtx_lock(&te->te_lock);
619 return (te); /* with both locks held */
620 }
621 if (addrem)
622 rw_wunlock(&td->tcb_history_lock);
623 else
624 rw_runlock(&td->tcb_history_lock);
625
626 return (te);
627 }
628
629 static inline void
630 release_tcb_histent(struct tcb_histent *te)
631 {
632 struct adapter *sc = te->te_adapter;
633 struct tom_data *td = sc->tom_softc;
634
635 mtx_assert(&te->te_lock, MA_OWNED);
636 mtx_unlock(&te->te_lock);
637 rw_assert(&td->tcb_history_lock, RA_RLOCKED);
638 rw_runlock(&td->tcb_history_lock);
639 }
640
641 static void
642 request_tcb(void *arg)
643 {
644 struct tcb_histent *te = arg;
645
646 mtx_assert(&te->te_lock, MA_OWNED);
647
648 /* No one else is supposed to update the histent. */
649 MPASS(!(te->te_flags & TE_RPL_PENDING));
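/* Re-issue the GET_TCB; if the control queue is full, retry in ~10 ms. */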
650 if (send_get_tcb(te->te_adapter, te->te_tid) == 0)
651 te->te_flags |= TE_RPL_PENDING;
652 else
653 callout_schedule(&te->te_callout, hz / 100);
654 }
655
656 static void
657 update_tcb_histent(struct tcb_histent *te, const uint64_t *tcb)
658 {
659 struct tom_data *td = te->te_adapter->tom_softc;
660 uint64_t tflags = get_tcb_tflags(tcb);
661 uint8_t sample = 0;
662
663 if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != GET_TCB_FIELD(tcb, SND_UNA_RAW)) {
664 if (GET_TCB_FIELD(tcb, T_RXTSHIFT) != 0)
665 sample |= TS_RTO;
666 if (GET_TCB_FIELD(tcb, T_DUPACKS) != 0)
667 sample |= TS_DUPACKS;
668 if (GET_TCB_FIELD(tcb, T_DUPACKS) >= td->dupack_threshold)
669 sample |= TS_FASTREXMT;
670 }
671
672 if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != 0) {
673 uint32_t snd_wnd;
674
675 sample |= TS_SND_BACKLOGGED; /* for whatever reason. */
676
677 snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV);
678 if (tflags & V_TF_RECV_SCALE(1))
679 snd_wnd <<= GET_TCB_FIELD(tcb, RCV_SCALE);
680 if (GET_TCB_FIELD(tcb, SND_CWND) < snd_wnd)
681 sample |= TS_CWND_LIMITED; /* maybe due to CWND */
682 }
683
684 if (tflags & V_TF_CCTRL_ECN(1)) {
685
686 /*
687 * CE marker on incoming IP hdr, echoing ECE back in the TCP
688 * hdr. Indicates congestion somewhere on the way from the peer
689 * to this node.
690 */
691 if (tflags & V_TF_CCTRL_ECE(1))
692 sample |= TS_ECN_ECE;
693
694 /*
695 * ECE seen and CWR sent (or about to be sent). Might indicate
696 * congestion on the way to the peer. This node is reducing its
697 * congestion window in response.
698 */
699 if (tflags & (V_TF_CCTRL_CWR(1) | V_TF_CCTRL_RFR(1)))
700 sample |= TS_ECN_CWR;
701 }
702
703 te->te_sample[te->te_pidx] = sample;
704 if (++te->te_pidx == nitems(te->te_sample))
705 te->te_pidx = 0;
706 memcpy(te->te_tcb, tcb, TCB_SIZE);
707 te->te_flags |= TE_ACTIVE;
708 }
709
710 static int
711 do_get_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
712 {
713 struct adapter *sc = iq->adapter;
714 const struct cpl_get_tcb_rpl *cpl = mtod(m, const void *);
715 const uint64_t *tcb = (const uint64_t *)(const void *)(cpl + 1);
716 struct tcb_histent *te;
717 const u_int tid = GET_TID(cpl);
718 bool remove;
719
720 remove = GET_TCB_FIELD(tcb, T_STATE) == TCPS_CLOSED;
721 te = lookup_tcb_histent(sc, tid, remove);
722 if (te == NULL) {
723 /* Not in the history. Who issued the GET_TCB for this? */
724 device_printf(sc->dev, "tcb %u: flags 0x%016jx, state %u, "
725 "srtt %u, sscale %u, rscale %u, cookie 0x%x\n", tid,
726 (uintmax_t)get_tcb_tflags(tcb), GET_TCB_FIELD(tcb, T_STATE),
727 GET_TCB_FIELD(tcb, T_SRTT), GET_TCB_FIELD(tcb, SND_SCALE),
728 GET_TCB_FIELD(tcb, RCV_SCALE), cpl->cookie);
729 goto done;
730 }
731
732 MPASS(te->te_flags & TE_RPL_PENDING);
733 te->te_flags &= ~TE_RPL_PENDING;
734 if (remove) {
735 remove_tcb_histent(te);
736 } else {
737 update_tcb_histent(te, tcb);
738 callout_reset(&te->te_callout, hz / 10, request_tcb, te);
739 release_tcb_histent(te);
740 }
741 done:
742 m_freem(m);
743 return (0);
744 }
745
746 static void
747 fill_tcp_info_from_tcb(struct adapter *sc, uint64_t *tcb, struct tcp_info *ti)
748 {
749 uint32_t v;
750
751 ti->tcpi_state = GET_TCB_FIELD(tcb, T_STATE);
752
753 v = GET_TCB_FIELD(tcb, T_SRTT);
754 ti->tcpi_rtt = tcp_ticks_to_us(sc, v);
755
756 v = GET_TCB_FIELD(tcb, T_RTTVAR);
757 ti->tcpi_rttvar = tcp_ticks_to_us(sc, v);
758
759 ti->tcpi_snd_ssthresh = GET_TCB_FIELD(tcb, SND_SSTHRESH);
760 ti->tcpi_snd_cwnd = GET_TCB_FIELD(tcb, SND_CWND);
761 ti->tcpi_rcv_nxt = GET_TCB_FIELD(tcb, RCV_NXT);
762 ti->tcpi_rcv_adv = GET_TCB_FIELD(tcb, RCV_ADV);
763 ti->tcpi_dupacks = GET_TCB_FIELD(tcb, T_DUPACKS);
764
765 v = GET_TCB_FIELD(tcb, TX_MAX);
766 ti->tcpi_snd_nxt = v - GET_TCB_FIELD(tcb, SND_NXT_RAW);
767 ti->tcpi_snd_una = v - GET_TCB_FIELD(tcb, SND_UNA_RAW);
768 ti->tcpi_snd_max = v - GET_TCB_FIELD(tcb, SND_MAX_RAW);
769
770 /* Receive window being advertised by us. */
771 ti->tcpi_rcv_wscale = GET_TCB_FIELD(tcb, SND_SCALE); /* Yes, SND. */
772 ti->tcpi_rcv_space = GET_TCB_FIELD(tcb, RCV_WND);
773
774 /* Send window */
775 ti->tcpi_snd_wscale = GET_TCB_FIELD(tcb, RCV_SCALE); /* Yes, RCV. */
776 ti->tcpi_snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV);
777 if (get_tcb_tflags(tcb) & V_TF_RECV_SCALE(1))
778 ti->tcpi_snd_wnd <<= ti->tcpi_snd_wscale;
779 else
780 ti->tcpi_snd_wscale = 0;
781
782 }
783
784 static void
785 fill_tcp_info_from_history(struct adapter *sc, struct tcb_histent *te,
786 struct tcp_info *ti)
787 {
788
789 fill_tcp_info_from_tcb(sc, te->te_tcb, ti);
790 }
791
792 /*
793 * Reads the TCB for the given tid using a memory window and copies it to 'buf'
794 * in the same format as CPL_GET_TCB_RPL.
795 */
796 static void
797 read_tcb_using_memwin(struct adapter *sc, u_int tid, uint64_t *buf)
798 {
799 int i, j, k, rc;
800 uint32_t addr;
801 u_char *tcb, tmp;
802
803 MPASS(tid >= sc->tids.tid_base);
804 MPASS(tid - sc->tids.tid_base < sc->tids.ntids);
805
806 addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE;
807 rc = read_via_memwin(sc, 2, addr, (uint32_t *)buf, TCB_SIZE);
808 if (rc != 0)
809 return;
810
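/*
 * The memory window and CPL_GET_TCB_RPL present the TCB in opposite
 * orders; reverse the buffer 16 bytes at a time so that callers can
 * parse either form with the same code.
 */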
811 tcb = (u_char *)buf;
812 for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) {
813 for (k = 0; k < 16; k++) {
814 tmp = tcb[i + k];
815 tcb[i + k] = tcb[j + k];
816 tcb[j + k] = tmp;
817 }
818 }
819 }
820
821 static void
822 fill_tcp_info(struct adapter *sc, u_int tid, struct tcp_info *ti)
823 {
824 uint64_t tcb[TCB_SIZE / sizeof(uint64_t)];
825 struct tcb_histent *te;
826
827 ti->tcpi_toe_tid = tid;
828 te = lookup_tcb_histent(sc, tid, false);
829 if (te != NULL) {
830 fill_tcp_info_from_history(sc, te, ti);
831 release_tcb_histent(te);
832 } else {
833 if (!(sc->debug_flags & DF_DISABLE_TCB_CACHE)) {
834 /* XXX: tell firmware to flush TCB cache. */
835 }
836 read_tcb_using_memwin(sc, tid, tcb);
837 fill_tcp_info_from_tcb(sc, tcb, ti);
838 }
839 }
840
841 /*
842 * Called by the kernel to allow the TOE driver to "refine" values filled in
843 * the tcp_info for an offloaded connection.
844 */
845 static void
846 t4_tcp_info(struct toedev *tod, const struct tcpcb *tp, struct tcp_info *ti)
847 {
848 struct adapter *sc = tod->tod_softc;
849 struct toepcb *toep = tp->t_toe;
850
851 INP_LOCK_ASSERT(tptoinpcb(tp));
852 MPASS(ti != NULL);
853
854 fill_tcp_info(sc, toep->tid, ti);
855 }
856
857 #ifdef KERN_TLS
858 static int
859 t4_alloc_tls_session(struct toedev *tod, struct tcpcb *tp,
860 struct ktls_session *tls, int direction)
861 {
862 struct toepcb *toep = tp->t_toe;
863
864 INP_WLOCK_ASSERT(tptoinpcb(tp));
865 MPASS(tls != NULL);
866
867 return (tls_alloc_ktls(toep, tls, direction));
868 }
869 #endif
870
871 static void
872 send_mss_flowc_wr(struct adapter *sc, struct toepcb *toep)
873 {
874 struct wrq_cookie cookie;
875 struct fw_flowc_wr *flowc;
876 struct ofld_tx_sdesc *txsd;
877 const int flowclen = sizeof(*flowc) + sizeof(struct fw_flowc_mnemval);
878 const int flowclen16 = howmany(flowclen, 16);
879
880 if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0) {
881 CH_ERR(sc, "%s: tid %u out of tx credits (%d, %d).\n", __func__,
882 toep->tid, toep->tx_credits, toep->txsd_avail);
883 return;
884 }
885
886 flowc = start_wrq_wr(&toep->ofld_txq->wrq, flowclen16, &cookie);
887 if (__predict_false(flowc == NULL)) {
888 CH_ERR(sc, "ENOMEM in %s for tid %u.\n", __func__, toep->tid);
889 return;
890 }
891 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
892 V_FW_FLOWC_WR_NPARAMS(1));
893 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
894 V_FW_WR_FLOWID(toep->tid));
895 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_MSS;
896 flowc->mnemval[0].val = htobe32(toep->params.emss);
897
898 txsd = &toep->txsd[toep->txsd_pidx];
899 txsd->tx_credits = flowclen16;
900 txsd->plen = 0;
901 toep->tx_credits -= txsd->tx_credits;
902 if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
903 toep->txsd_pidx = 0;
904 toep->txsd_avail--;
905 commit_wrq_wr(&toep->ofld_txq->wrq, flowc, &cookie);
906 }
907
908 static void
909 t4_pmtu_update(struct toedev *tod, struct tcpcb *tp, tcp_seq seq, int mtu)
910 {
911 struct work_request_hdr *wrh;
912 struct ulp_txpkt *ulpmc;
913 int idx, len;
914 struct wrq_cookie cookie;
915 struct inpcb *inp = tptoinpcb(tp);
916 struct toepcb *toep = tp->t_toe;
917 struct adapter *sc = td_adapter(toep->td);
918 unsigned short *mtus = &sc->params.mtus[0];
919
920 INP_WLOCK_ASSERT(inp);
921 MPASS(mtu > 0); /* kernel is supposed to provide something usable. */
922
923 /* tp->snd_una and snd_max are in host byte order too. */
924 seq = be32toh(seq);
925
926 CTR6(KTR_CXGBE, "%s: tid %d, seq 0x%08x, mtu %u, mtu_idx %u (%d)",
927 __func__, toep->tid, seq, mtu, toep->params.mtu_idx,
928 mtus[toep->params.mtu_idx]);
929
930 if (ulp_mode(toep) == ULP_MODE_NONE && /* XXX: Read TCB otherwise? */
931 (SEQ_LT(seq, tp->snd_una) || SEQ_GEQ(seq, tp->snd_max))) {
932 CTR5(KTR_CXGBE,
933 "%s: tid %d, seq 0x%08x not in range [0x%08x, 0x%08x).",
934 __func__, toep->tid, seq, tp->snd_una, tp->snd_max);
935 return;
936 }
937
938 /* Find the best mtu_idx for the suggested MTU. */
939 for (idx = 0; idx < NMTUS - 1 && mtus[idx + 1] <= mtu; idx++)
940 continue;
941 if (idx >= toep->params.mtu_idx)
942 return; /* Never increase the PMTU (just like the kernel). */
943
944 /*
945 * We'll send a compound work request with 2 SET_TCB_FIELDs -- the first
946 * one updates the mtu_idx and the second one triggers a retransmit.
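* Each ULP_TX sub-command is padded out to a 16-byte boundary, hence the
* roundup2 in the length calculation below.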
947 */
948 len = sizeof(*wrh) + 2 * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
949 wrh = start_wrq_wr(toep->ctrlq, howmany(len, 16), &cookie);
950 if (wrh == NULL) {
951 CH_ERR(sc, "failed to change mtu_idx of tid %d (%u -> %u).\n",
952 toep->tid, toep->params.mtu_idx, idx);
953 return;
954 }
955 INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */
956 ulpmc = (struct ulp_txpkt *)(wrh + 1);
957 ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_T_MAXSEG,
958 V_TCB_T_MAXSEG(M_TCB_T_MAXSEG), V_TCB_T_MAXSEG(idx));
959 ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_TIMESTAMP,
960 V_TCB_TIMESTAMP(0x7FFFFULL << 11), 0);
961 commit_wrq_wr(toep->ctrlq, wrh, &cookie);
962
963 /* Update the software toepcb and tcpcb. */
964 toep->params.mtu_idx = idx;
965 tp->t_maxseg = mtus[toep->params.mtu_idx];
966 if (inp->inp_inc.inc_flags & INC_ISIPV6)
967 tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
968 else
969 tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);
970 toep->params.emss = tp->t_maxseg;
971 if (tp->t_flags & TF_RCVD_TSTMP)
972 toep->params.emss -= TCPOLEN_TSTAMP_APPA;
973
974 /* Update the firmware flowc. */
975 send_mss_flowc_wr(sc, toep);
976
977 /* Update the MTU in the kernel's hostcache. */
978 if (sc->tt.update_hc_on_pmtu_change != 0) {
979 struct in_conninfo inc = {0};
980
981 inc.inc_fibnum = inp->inp_inc.inc_fibnum;
982 if (inp->inp_inc.inc_flags & INC_ISIPV6) {
983 inc.inc_flags |= INC_ISIPV6;
984 inc.inc6_faddr = inp->inp_inc.inc6_faddr;
985 } else {
986 inc.inc_faddr = inp->inp_inc.inc_faddr;
987 }
988 tcp_hc_updatemtu(&inc, mtu);
989 }
990
991 CTR6(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), t_maxseg %u, emss %u",
992 __func__, toep->tid, toep->params.mtu_idx,
993 mtus[toep->params.mtu_idx], tp->t_maxseg, toep->params.emss);
994 }
995
996 /*
997 * The TOE driver will not receive any more CPLs for the tid associated with the
998 * toepcb; release the hold on the inpcb.
999 */
1000 void
1001 final_cpl_received(struct toepcb *toep)
1002 {
1003 struct inpcb *inp = toep->inp;
1004 bool need_wakeup;
1005
1006 KASSERT(inp != NULL, ("%s: inp is NULL", __func__));
1007 INP_WLOCK_ASSERT(inp);
1008 KASSERT(toep->flags & TPF_CPL_PENDING,
1009 ("%s: CPL not pending already?", __func__));
1010
1011 CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
1012 __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);
1013
1014 if (ulp_mode(toep) == ULP_MODE_TCPDDP)
1015 release_ddp_resources(toep);
1016 toep->inp = NULL;
1017 need_wakeup = (toep->flags & TPF_WAITING_FOR_FINAL) != 0;
1018 toep->flags &= ~(TPF_CPL_PENDING | TPF_WAITING_FOR_FINAL);
1019 mbufq_drain(&toep->ulp_pduq);
1020 mbufq_drain(&toep->ulp_pdu_reclaimq);
1021 release_offload_resources(toep);
1022 if (!(toep->flags & TPF_ATTACHED))
1023 done_with_toepcb(toep);
1024
1025 if (!in_pcbrele_wlocked(inp))
1026 INP_WUNLOCK(inp);
1027
1028 if (need_wakeup) {
1029 struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep);
1030
1031 mtx_lock(lock);
1032 wakeup(toep);
1033 mtx_unlock(lock);
1034 }
1035 }
1036
1037 void
1038 insert_tid(struct adapter *sc, int tid, void *ctx, int ntids)
1039 {
1040 struct tid_info *t = &sc->tids;
1041
1042 MPASS(tid >= t->tid_base);
1043 MPASS(tid - t->tid_base < t->ntids);
1044
1045 t->tid_tab[tid - t->tid_base] = ctx;
1046 atomic_add_int(&t->tids_in_use, ntids);
1047 }
1048
1049 void *
1050 lookup_tid(struct adapter *sc, int tid)
1051 {
1052 struct tid_info *t = &sc->tids;
1053
1054 return (t->tid_tab[tid - t->tid_base]);
1055 }
1056
1057 void
1058 update_tid(struct adapter *sc, int tid, void *ctx)
1059 {
1060 struct tid_info *t = &sc->tids;
1061
1062 t->tid_tab[tid - t->tid_base] = ctx;
1063 }
1064
1065 void
1066 remove_tid(struct adapter *sc, int tid, int ntids)
1067 {
1068 struct tid_info *t = &sc->tids;
1069
1070 t->tid_tab[tid - t->tid_base] = NULL;
1071 atomic_subtract_int(&t->tids_in_use, ntids);
1072 }
1073
1074 /*
1075 * What mtu_idx to use, given a 4-tuple. Note that both s->mss and tcp_mssopt()
1076 * provide the MSS that we should advertise in our SYN. The advertised MSS
1077 * doesn't account for any TCP options, so the effective MSS (payload only,
1078 * no headers or options) could be different.
1079 */
1080 static int
1081 find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc,
1082 struct offload_settings *s)
1083 {
1084 unsigned short *mtus = &sc->params.mtus[0];
1085 int i, mss, mtu;
1086
1087 MPASS(inc != NULL);
1088
1089 mss = s->mss > 0 ? s->mss : tcp_mssopt(inc);
1090 if (inc->inc_flags & INC_ISIPV6)
1091 mtu = mss + sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1092 else
1093 mtu = mss + sizeof(struct ip) + sizeof(struct tcphdr);
1094
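/*
 * Pick the largest table entry that fits.  E.g. an advertised MSS of
 * 1460 over IPv4 gives mtu = 1500, so the loop settles on the last
 * mtus[i] <= 1500.
 */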
1095 for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++)
1096 continue;
1097
1098 return (i);
1099 }
1100
1101 /*
1102 * Determine the receive window size for a socket.
1103 */
1104 u_long
1105 select_rcv_wnd(struct socket *so)
1106 {
1107 unsigned long wnd;
1108
1109 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1110
1111 wnd = sbspace(&so->so_rcv);
1112 if (wnd < MIN_RCV_WND)
1113 wnd = MIN_RCV_WND;
1114
1115 return min(wnd, MAX_RCV_WND);
1116 }
1117
1118 int
1119 select_rcv_wscale(void)
1120 {
1121 int wscale = 0;
1122 unsigned long space = sb_max;
1123
1124 if (space > MAX_RCV_WND)
1125 space = MAX_RCV_WND;
1126
1127 while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
1128 wscale++;
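/*
 * E.g. sb_max of 1 MB yields wscale 5: 65535 << 4 falls just short of
 * 1 MB while 65535 << 5 covers it.
 */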
1129
1130 return (wscale);
1131 }
1132
1133 __be64
1134 calc_options0(struct vi_info *vi, struct conn_params *cp)
1135 {
1136 uint64_t opt0 = 0;
1137
1138 opt0 |= F_TCAM_BYPASS;
1139
1140 MPASS(cp->wscale >= 0 && cp->wscale <= M_WND_SCALE);
1141 opt0 |= V_WND_SCALE(cp->wscale);
1142
1143 MPASS(cp->mtu_idx >= 0 && cp->mtu_idx < NMTUS);
1144 opt0 |= V_MSS_IDX(cp->mtu_idx);
1145
1146 MPASS(cp->ulp_mode >= 0 && cp->ulp_mode <= M_ULP_MODE);
1147 opt0 |= V_ULP_MODE(cp->ulp_mode);
1148
1149 MPASS(cp->opt0_bufsize >= 0 && cp->opt0_bufsize <= M_RCV_BUFSIZ);
1150 opt0 |= V_RCV_BUFSIZ(cp->opt0_bufsize);
1151
1152 MPASS(cp->l2t_idx >= 0 && cp->l2t_idx < vi->adapter->vres.l2t.size);
1153 opt0 |= V_L2T_IDX(cp->l2t_idx);
1154
1155 opt0 |= V_SMAC_SEL(vi->smt_idx);
1156 opt0 |= V_TX_CHAN(vi->pi->tx_chan);
1157
1158 MPASS(cp->keepalive == 0 || cp->keepalive == 1);
1159 opt0 |= V_KEEP_ALIVE(cp->keepalive);
1160
1161 MPASS(cp->nagle == 0 || cp->nagle == 1);
1162 opt0 |= V_NAGLE(cp->nagle);
1163
1164 return (htobe64(opt0));
1165 }
1166
1167 __be32
1168 calc_options2(struct vi_info *vi, struct conn_params *cp)
1169 {
1170 uint32_t opt2 = 0;
1171 struct port_info *pi = vi->pi;
1172 struct adapter *sc = pi->adapter;
1173
1174 /*
1175 * rx flow control, rx coalesce, congestion control, and tx pace are all
1176 * explicitly set by the driver. On T5+ the ISS is also set by the
1177 * driver to the value picked by the kernel.
1178 */
1179 if (is_t4(sc)) {
1180 opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID;
1181 opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID;
1182 } else {
1183 opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */
1184 opt2 |= F_T5_ISS; /* ISS provided in CPL */
1185 }
1186
1187 MPASS(cp->sack == 0 || cp->sack == 1);
1188 opt2 |= V_SACK_EN(cp->sack);
1189
1190 MPASS(cp->tstamp == 0 || cp->tstamp == 1);
1191 opt2 |= V_TSTAMPS_EN(cp->tstamp);
1192
1193 if (cp->wscale > 0)
1194 opt2 |= F_WND_SCALE_EN;
1195
1196 MPASS(cp->ecn == 0 || cp->ecn == 1);
1197 opt2 |= V_CCTRL_ECN(cp->ecn);
1198
1199 opt2 |= V_TX_QUEUE(TX_MODQ(pi->tx_chan));
1200 opt2 |= V_PACE(0);
1201 opt2 |= F_RSS_QUEUE_VALID;
1202 opt2 |= V_RSS_QUEUE(sc->sge.ofld_rxq[cp->rxq_idx].iq.abs_id);
1203 if (chip_id(sc) <= CHELSIO_T6) {
1204 MPASS(pi->rx_chan == 0 || pi->rx_chan == 1);
1205 opt2 |= V_RX_CHANNEL(pi->rx_chan);
1206 }
1207
1208 MPASS(cp->cong_algo >= 0 && cp->cong_algo <= M_CONG_CNTRL);
1209 opt2 |= V_CONG_CNTRL(cp->cong_algo);
1210
1211 MPASS(cp->rx_coalesce == 0 || cp->rx_coalesce == 1);
1212 if (cp->rx_coalesce == 1)
1213 opt2 |= V_RX_COALESCE(M_RX_COALESCE);
1214
1215 opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0);
1216 MPASS(cp->ulp_mode != ULP_MODE_TCPDDP);
1217
1218 return (htobe32(opt2));
1219 }
1220
1221 uint64_t
1222 select_ntuple(struct vi_info *vi, struct l2t_entry *e)
1223 {
1224 struct adapter *sc = vi->adapter;
1225 struct tp_params *tp = &sc->params.tp;
1226 uint64_t ntuple = 0;
1227
1228 /*
1229 * Initialize each of the fields which we care about which are present
1230 * in the Compressed Filter Tuple.
1231 */
1232 if (tp->vlan_shift >= 0 && EVL_VLANOFTAG(e->vlan) != CPL_L2T_VLAN_NONE)
1233 ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift;
1234
1235 if (tp->port_shift >= 0)
1236 ntuple |= (uint64_t)e->lport << tp->port_shift;
1237
1238 if (tp->protocol_shift >= 0)
1239 ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift;
1240
1241 if (tp->vnic_shift >= 0 && tp->vnic_mode == FW_VNIC_MODE_PF_VF) {
1242 ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vi->vin) |
1243 V_FT_VNID_ID_PF(sc->pf) | V_FT_VNID_ID_VLD(vi->vfvld)) <<
1244 tp->vnic_shift;
1245 }
1246
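/*
 * T4 takes the 32-bit compressed filter tuple as-is; T5 and later embed
 * it in the wider FILTER_TUPLE field of the open request.
 */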
1247 if (is_t4(sc))
1248 return (htobe32((uint32_t)ntuple));
1249 else
1250 return (htobe64(V_FILTER_TUPLE(ntuple)));
1251 }
1252
1253 /*
1254 * Initialize various connection parameters.
1255 */
1256 void
1257 init_conn_params(struct vi_info *vi, struct offload_settings *s,
1258 struct in_conninfo *inc, struct socket *so,
1259 const struct tcp_options *tcpopt, int16_t l2t_idx, struct conn_params *cp)
1260 {
1261 struct port_info *pi = vi->pi;
1262 struct adapter *sc = pi->adapter;
1263 struct tom_tunables *tt = &sc->tt;
1264 struct inpcb *inp = sotoinpcb(so);
1265 struct tcpcb *tp = intotcpcb(inp);
1266 u_long wnd;
1267 u_int q_idx;
1268
1269 MPASS(s->offload != 0);
1270
1271 /* Congestion control algorithm */
1272 if (s->cong_algo >= 0)
1273 cp->cong_algo = s->cong_algo & M_CONG_CNTRL;
1274 else if (sc->tt.cong_algorithm >= 0)
1275 cp->cong_algo = tt->cong_algorithm & M_CONG_CNTRL;
1276 else {
1277 struct cc_algo *cc = CC_ALGO(tp);
1278
1279 if (strcasecmp(cc->name, "reno") == 0)
1280 cp->cong_algo = CONG_ALG_RENO;
1281 else if (strcasecmp(cc->name, "tahoe") == 0)
1282 cp->cong_algo = CONG_ALG_TAHOE;
1283 else if (strcasecmp(cc->name, "newreno") == 0)
1284 cp->cong_algo = CONG_ALG_NEWRENO;
1285 else if (strcasecmp(cc->name, "highspeed") == 0)
1286 cp->cong_algo = CONG_ALG_HIGHSPEED;
1287 else {
1288 /*
1289 * Use newreno in case the algorithm selected by the
1290 * host stack is not supported by the hardware.
1291 */
1292 cp->cong_algo = CONG_ALG_NEWRENO;
1293 }
1294 }
1295
1296 /* Tx traffic scheduling class. */
1297 if (s->sched_class >= 0 && s->sched_class < sc->params.nsched_cls)
1298 cp->tc_idx = s->sched_class;
1299 else
1300 cp->tc_idx = -1;
1301
1302 /* Nagle's algorithm. */
1303 if (s->nagle >= 0)
1304 cp->nagle = s->nagle > 0 ? 1 : 0;
1305 else
1306 cp->nagle = tp->t_flags & TF_NODELAY ? 0 : 1;
1307
1308 /* TCP Keepalive. */
1309 if (V_tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE)
1310 cp->keepalive = 1;
1311 else
1312 cp->keepalive = 0;
1313
1314 /* Optimization that's specific to T5 @ 40G. */
1315 if (tt->tx_align >= 0)
1316 cp->tx_align = tt->tx_align > 0 ? 1 : 0;
1317 else if (chip_id(sc) == CHELSIO_T5 &&
1318 (port_top_speed(pi) > 10 || sc->params.nports > 2))
1319 cp->tx_align = 1;
1320 else
1321 cp->tx_align = 0;
1322
1323 /* ULP mode. */
1324 cp->ulp_mode = ULP_MODE_NONE;
1325
1326 /* Rx coalescing. */
1327 if (s->rx_coalesce >= 0)
1328 cp->rx_coalesce = s->rx_coalesce > 0 ? 1 : 0;
1329 else if (tt->rx_coalesce >= 0)
1330 cp->rx_coalesce = tt->rx_coalesce > 0 ? 1 : 0;
1331 else
1332 cp->rx_coalesce = 1; /* default */
1333
1334 /*
1335 * Index in the PMTU table. This controls the MSS that we announce in
1336 * our SYN initially, but after ESTABLISHED it controls the MSS that we
1337 * use to send data.
1338 */
1339 cp->mtu_idx = find_best_mtu_idx(sc, inc, s);
1340
1341 /* Tx queue for this connection. */
1342 if (s->txq == QUEUE_RANDOM)
1343 q_idx = arc4random();
1344 else if (s->txq == QUEUE_ROUNDROBIN)
1345 q_idx = atomic_fetchadd_int(&vi->txq_rr, 1);
1346 else
1347 q_idx = s->txq;
1348 cp->txq_idx = vi->first_ofld_txq + q_idx % vi->nofldtxq;
1349
1350 /* Rx queue for this connection. */
1351 if (s->rxq == QUEUE_RANDOM)
1352 q_idx = arc4random();
1353 else if (s->rxq == QUEUE_ROUNDROBIN)
1354 q_idx = atomic_fetchadd_int(&vi->rxq_rr, 1);
1355 else
1356 q_idx = s->rxq;
1357 cp->rxq_idx = vi->first_ofld_rxq + q_idx % vi->nofldrxq;
1358
1359 if (SOLISTENING(so)) {
1360 /* Passive open */
1361 MPASS(tcpopt != NULL);
1362
1363 /* TCP timestamp option */
1364 if (tcpopt->tstamp &&
1365 (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323)))
1366 cp->tstamp = 1;
1367 else
1368 cp->tstamp = 0;
1369
1370 /* SACK */
1371 if (tcpopt->sack &&
1372 (s->sack > 0 || (s->sack < 0 && V_tcp_do_sack)))
1373 cp->sack = 1;
1374 else
1375 cp->sack = 0;
1376
1377 /* Receive window scaling. */
1378 if (tcpopt->wsf > 0 && tcpopt->wsf < 15 && V_tcp_do_rfc1323)
1379 cp->wscale = select_rcv_wscale();
1380 else
1381 cp->wscale = 0;
1382
1383 /* ECN */
1384 if (tcpopt->ecn && /* XXX: review. */
1385 (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn)))
1386 cp->ecn = 1;
1387 else
1388 cp->ecn = 0;
1389
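/* opt0's RCV_BUFSIZ field is in 1 KB units, hence the >> 10 below. */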
1390 wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND);
1391 cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ);
1392
1393 if (tt->sndbuf > 0)
1394 cp->sndbuf = tt->sndbuf;
1395 else if (so->sol_sbsnd_flags & SB_AUTOSIZE &&
1396 V_tcp_do_autosndbuf)
1397 cp->sndbuf = 256 * 1024;
1398 else
1399 cp->sndbuf = so->sol_sbsnd_hiwat;
1400 } else {
1401 /* Active open */
1402
1403 /* TCP timestamp option */
1404 if (s->tstamp > 0 ||
1405 (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP)))
1406 cp->tstamp = 1;
1407 else
1408 cp->tstamp = 0;
1409
1410 /* SACK */
1411 if (s->sack > 0 ||
1412 (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT)))
1413 cp->sack = 1;
1414 else
1415 cp->sack = 0;
1416
1417 /* Receive window scaling */
1418 if (tp->t_flags & TF_REQ_SCALE)
1419 cp->wscale = select_rcv_wscale();
1420 else
1421 cp->wscale = 0;
1422
1423 /* ECN */
1424 if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1))
1425 cp->ecn = 1;
1426 else
1427 cp->ecn = 0;
1428
1429 SOCKBUF_LOCK(&so->so_rcv);
1430 wnd = max(select_rcv_wnd(so), MIN_RCV_WND);
1431 SOCKBUF_UNLOCK(&so->so_rcv);
1432 cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ);
1433
1434 if (tt->sndbuf > 0)
1435 cp->sndbuf = tt->sndbuf;
1436 else {
1437 SOCKBUF_LOCK(&so->so_snd);
1438 if (so->so_snd.sb_flags & SB_AUTOSIZE &&
1439 V_tcp_do_autosndbuf)
1440 cp->sndbuf = 256 * 1024;
1441 else
1442 cp->sndbuf = so->so_snd.sb_hiwat;
1443 SOCKBUF_UNLOCK(&so->so_snd);
1444 }
1445 }
1446
1447 cp->l2t_idx = l2t_idx;
1448
1449 /* This will be initialized on ESTABLISHED. */
1450 cp->emss = 0;
1451 }
1452
1453 int
1454 negative_advice(int status)
1455 {
1456
1457 return (status == CPL_ERR_RTX_NEG_ADVICE ||
1458 status == CPL_ERR_PERSIST_NEG_ADVICE ||
1459 status == CPL_ERR_KEEPALV_NEG_ADVICE);
1460 }
1461
1462 static int
1463 alloc_tid_tab(struct adapter *sc)
1464 {
1465 struct tid_info *t = &sc->tids;
1466
1467 MPASS(t->ntids > 0);
1468 MPASS(t->tid_tab == NULL);
1469
1470 t->tid_tab = malloc(t->ntids * sizeof(*t->tid_tab), M_CXGBE,
1471 M_ZERO | M_NOWAIT);
1472 if (t->tid_tab == NULL)
1473 return (ENOMEM);
1474 atomic_store_rel_int(&t->tids_in_use, 0);
1475
1476 return (0);
1477 }
1478
1479 static void
1480 free_tid_tab(struct adapter *sc)
1481 {
1482 struct tid_info *t = &sc->tids;
1483
1484 KASSERT(t->tids_in_use == 0,
1485 ("%s: %d tids still in use.", __func__, t->tids_in_use));
1486
1487 free(t->tid_tab, M_CXGBE);
1488 t->tid_tab = NULL;
1489 }
1490
1491 static void
1492 free_tid_tabs(struct adapter *sc)
1493 {
1494 free_tid_tab(sc);
1495 free_stid_tab(sc);
1496 }
1497
1498 static int
1499 alloc_tid_tabs(struct adapter *sc)
1500 {
1501 int rc;
1502
1503 rc = alloc_tid_tab(sc);
1504 if (rc != 0)
1505 goto failed;
1506
1507 rc = alloc_stid_tab(sc);
1508 if (rc != 0)
1509 goto failed;
1510
1511 return (0);
1512 failed:
1513 free_tid_tabs(sc);
1514 return (rc);
1515 }
1516
1517 static inline void
1518 alloc_tcb_history(struct adapter *sc, struct tom_data *td)
1519 {
1520
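/* Keep full-TCB history only for small tid ranges (1024 tids or fewer). */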
1521 if (sc->tids.ntids == 0 || sc->tids.ntids > 1024)
1522 return;
1523 rw_init(&td->tcb_history_lock, "TCB history");
1524 td->tcb_history = malloc(sc->tids.ntids * sizeof(*td->tcb_history),
1525 M_CXGBE, M_ZERO | M_NOWAIT);
1526 td->dupack_threshold = G_DUPACKTHRESH(t4_read_reg(sc, A_TP_PARA_REG0));
1527 }
1528
1529 static inline void
1530 free_tcb_history(struct adapter *sc, struct tom_data *td)
1531 {
1532 #ifdef INVARIANTS
1533 int i;
1534
1535 if (td->tcb_history != NULL) {
1536 for (i = 0; i < sc->tids.ntids; i++) {
1537 MPASS(td->tcb_history[i] == NULL);
1538 }
1539 }
1540 #endif
1541 free(td->tcb_history, M_CXGBE);
1542 if (rw_initialized(&td->tcb_history_lock))
1543 rw_destroy(&td->tcb_history_lock);
1544 }
1545
1546 static void
1547 free_tom_data(struct adapter *sc, struct tom_data *td)
1548 {
1549
1550 ASSERT_SYNCHRONIZED_OP(sc);
1551
1552 KASSERT(TAILQ_EMPTY(&td->toep_list),
1553 ("%s: TOE PCB list is not empty.", __func__));
1554 KASSERT(td->lctx_count == 0,
1555 ("%s: lctx hash table is not empty.", __func__));
1556
1557 t4_free_ppod_region(&td->pr);
1558
1559 if (td->listen_mask != 0)
1560 hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);
1561
1562 if (mtx_initialized(&td->unsent_wr_lock))
1563 mtx_destroy(&td->unsent_wr_lock);
1564 if (mtx_initialized(&td->lctx_hash_lock))
1565 mtx_destroy(&td->lctx_hash_lock);
1566 if (mtx_initialized(&td->toep_list_lock))
1567 mtx_destroy(&td->toep_list_lock);
1568
1569 free_tcb_history(sc, td);
1570 free_tid_tabs(sc);
1571 free(td, M_CXGBE);
1572 }
1573
1574 static char *
1575 prepare_pkt(int open_type, uint16_t vtag, struct inpcb *inp, int *pktlen,
1576 int *buflen)
1577 {
1578 char *pkt;
1579 struct tcphdr *th;
1580 int ipv6, len;
1581 const int maxlen =
1582 max(sizeof(struct ether_header), sizeof(struct ether_vlan_header)) +
1583 max(sizeof(struct ip), sizeof(struct ip6_hdr)) +
1584 sizeof(struct tcphdr);
1585
1586 MPASS(open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN);
1587
1588 pkt = malloc(maxlen, M_CXGBE, M_ZERO | M_NOWAIT);
1589 if (pkt == NULL)
1590 return (NULL);
1591
1592 ipv6 = inp->inp_vflag & INP_IPV6;
1593 len = 0;
1594
1595 if (EVL_VLANOFTAG(vtag) == 0xfff) {
1596 struct ether_header *eh = (void *)pkt;
1597
1598 if (ipv6)
1599 eh->ether_type = htons(ETHERTYPE_IPV6);
1600 else
1601 eh->ether_type = htons(ETHERTYPE_IP);
1602
1603 len += sizeof(*eh);
1604 } else {
1605 struct ether_vlan_header *evh = (void *)pkt;
1606
1607 evh->evl_encap_proto = htons(ETHERTYPE_VLAN);
1608 evh->evl_tag = htons(vtag);
1609 if (ipv6)
1610 evh->evl_proto = htons(ETHERTYPE_IPV6);
1611 else
1612 evh->evl_proto = htons(ETHERTYPE_IP);
1613
1614 len += sizeof(*evh);
1615 }
1616
1617 if (ipv6) {
1618 struct ip6_hdr *ip6 = (void *)&pkt[len];
1619
1620 ip6->ip6_vfc = IPV6_VERSION;
1621 ip6->ip6_plen = htons(sizeof(struct tcphdr));
1622 ip6->ip6_nxt = IPPROTO_TCP;
1623 if (open_type == OPEN_TYPE_ACTIVE) {
1624 ip6->ip6_src = inp->in6p_laddr;
1625 ip6->ip6_dst = inp->in6p_faddr;
1626 } else if (open_type == OPEN_TYPE_LISTEN) {
1627 ip6->ip6_src = inp->in6p_laddr;
1628 ip6->ip6_dst = ip6->ip6_src;
1629 }
1630
1631 len += sizeof(*ip6);
1632 } else {
1633 struct ip *ip = (void *)&pkt[len];
1634
1635 ip->ip_v = IPVERSION;
1636 ip->ip_hl = sizeof(*ip) >> 2;
1637 ip->ip_tos = inp->inp_ip_tos;
1638 ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr));
1639 ip->ip_ttl = inp->inp_ip_ttl;
1640 ip->ip_p = IPPROTO_TCP;
1641 if (open_type == OPEN_TYPE_ACTIVE) {
1642 ip->ip_src = inp->inp_laddr;
1643 ip->ip_dst = inp->inp_faddr;
1644 } else if (open_type == OPEN_TYPE_LISTEN) {
1645 ip->ip_src = inp->inp_laddr;
1646 ip->ip_dst = ip->ip_src;
1647 }
1648
1649 len += sizeof(*ip);
1650 }
1651
1652 th = (void *)&pkt[len];
1653 if (open_type == OPEN_TYPE_ACTIVE) {
1654 th->th_sport = inp->inp_lport; /* network byte order already */
1655 th->th_dport = inp->inp_fport; /* ditto */
1656 } else if (open_type == OPEN_TYPE_LISTEN) {
1657 th->th_sport = inp->inp_lport; /* network byte order already */
1658 th->th_dport = th->th_sport;
1659 }
1660 len += sizeof(*th);
1661
1662 *pktlen = *buflen = len;
1663 return (pkt);
1664 }
1665
1666 const struct offload_settings *
1667 lookup_offload_policy(struct adapter *sc, int open_type, struct mbuf *m,
1668 uint16_t vtag, struct inpcb *inp)
1669 {
1670 const struct t4_offload_policy *op;
1671 char *pkt;
1672 struct offload_rule *r;
1673 int i, matched, pktlen, buflen;
1674 static const struct offload_settings allow_offloading_settings = {
1675 .offload = 1,
1676 .rx_coalesce = -1,
1677 .cong_algo = -1,
1678 .sched_class = -1,
1679 .tstamp = -1,
1680 .sack = -1,
1681 .nagle = -1,
1682 .ecn = -1,
1683 .ddp = -1,
1684 .tls = -1,
1685 .txq = QUEUE_RANDOM,
1686 .rxq = QUEUE_RANDOM,
1687 .mss = -1,
1688 };
1689 static const struct offload_settings disallow_offloading_settings = {
1690 .offload = 0,
1691 /* rest is irrelevant when offload is off. */
1692 };
1693
1694 rw_assert(&sc->policy_lock, RA_LOCKED);
1695
1696 /*
1697 * If there's no Connection Offloading Policy attached to the device
1698 * then we need to return a default static policy. If
1699 * "cop_managed_offloading" is true, then we need to disallow
1700 * offloading until a COP is attached to the device. Otherwise we
1701 * allow offloading ...
1702 */
1703 op = sc->policy;
1704 if (op == NULL) {
1705 if (sc->tt.cop_managed_offloading)
1706 return (&disallow_offloading_settings);
1707 else
1708 return (&allow_offloading_settings);
1709 }
1710
1711 switch (open_type) {
1712 case OPEN_TYPE_ACTIVE:
1713 case OPEN_TYPE_LISTEN:
1714 pkt = prepare_pkt(open_type, vtag, inp, &pktlen, &buflen);
1715 break;
1716 case OPEN_TYPE_PASSIVE:
1717 MPASS(m != NULL);
1718 pkt = mtod(m, char *);
1719 MPASS(*pkt == CPL_PASS_ACCEPT_REQ);
1720 pkt += sizeof(struct cpl_pass_accept_req);
1721 pktlen = m->m_pkthdr.len - sizeof(struct cpl_pass_accept_req);
1722 buflen = m->m_len - sizeof(struct cpl_pass_accept_req);
1723 break;
1724 default:
1725 MPASS(0);
1726 return (&disallow_offloading_settings);
1727 }
1728
1729 if (pkt == NULL || pktlen == 0 || buflen == 0)
1730 return (&disallow_offloading_settings);
1731
1732 matched = 0;
1733 r = &op->rule[0];
1734 for (i = 0; i < op->nrules; i++, r++) {
1735 if (r->open_type != open_type &&
1736 r->open_type != OPEN_TYPE_DONTCARE) {
1737 continue;
1738 }
1739 matched = bpf_filter(r->bpf_prog.bf_insns, pkt, pktlen, buflen);
1740 if (matched)
1741 break;
1742 }
1743
1744 if (open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN)
1745 free(pkt, M_CXGBE);
1746
1747 return (matched ? &r->settings : &disallow_offloading_settings);
1748 }
1749
1750 static void
1751 reclaim_wr_resources(void *arg, int count)
1752 {
1753 struct tom_data *td = arg;
1754 STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list);
1755 struct cpl_act_open_req *cpl;
1756 u_int opcode, atid, tid;
1757 struct wrqe *wr;
1758 struct adapter *sc = td_adapter(td);
1759
1760 mtx_lock(&td->unsent_wr_lock);
1761 STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe);
1762 mtx_unlock(&td->unsent_wr_lock);
1763
1764 while ((wr = STAILQ_FIRST(&twr_list)) != NULL) {
1765 STAILQ_REMOVE_HEAD(&twr_list, link);
1766
1767 cpl = wrtod(wr);
1768 opcode = GET_OPCODE(cpl);
1769
1770 switch (opcode) {
1771 case CPL_ACT_OPEN_REQ:
1772 case CPL_ACT_OPEN_REQ6:
1773 atid = G_TID_TID(be32toh(OPCODE_TID(cpl)));
1774 CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid);
1775 act_open_failure_cleanup(sc, lookup_atid(sc, atid),
1776 EHOSTUNREACH);
1777 free(wr, M_CXGBE);
1778 break;
1779 case CPL_PASS_ACCEPT_RPL:
1780 tid = GET_TID(cpl);
1781 CTR2(KTR_CXGBE, "%s: tid %u ", __func__, tid);
1782 synack_failure_cleanup(sc, lookup_tid(sc, tid));
1783 free(wr, M_CXGBE);
1784 break;
1785 default:
1786 log(LOG_ERR, "%s: leaked work request %p, wr_len %d, "
1787 "opcode %x\n", __func__, wr, wr->wr_len, opcode);
1788 /* WR not freed here; go look at it with a debugger. */
1789 }
1790 }
1791 }
1792
1793 /*
1794 * Based on do_abort_req. We treat an abrupt hardware stop as a connection
1795 * abort from the hardware.
1796 */
1797 static void
1798 live_tid_failure_cleanup(struct adapter *sc, struct toepcb *toep, u_int status)
1799 {
1800 struct inpcb *inp;
1801 struct tcpcb *tp;
1802 struct epoch_tracker et;
1803
1804 MPASS(!(toep->flags & TPF_SYNQE));
1805
1806 inp = toep->inp;
1807 CURVNET_SET(toep->vnet);
1808 NET_EPOCH_ENTER(et); /* for tcp_close */
1809 INP_WLOCK(inp);
1810 tp = intotcpcb(inp);
1811 toep->flags |= TPF_ABORT_SHUTDOWN;
1812 if ((inp->inp_flags & INP_DROPPED) == 0) {
1813 struct socket *so = inp->inp_socket;
1814
1815 if (so != NULL)
1816 so_error_set(so, status);
1817 tp = tcp_close(tp);
1818 if (tp == NULL)
1819 INP_WLOCK(inp); /* re-acquire */
1820 }
1821 final_cpl_received(toep);
1822 NET_EPOCH_EXIT(et);
1823 CURVNET_RESTORE();
1824 }
1825
1826 static void
1827 cleanup_stranded_tids(void *arg, int count)
1828 {
1829 TAILQ_HEAD(, toepcb) tlist = TAILQ_HEAD_INITIALIZER(tlist);
1830 TAILQ_HEAD(, synq_entry) slist = TAILQ_HEAD_INITIALIZER(slist);
1831 struct tom_data *td = arg;
1832 struct adapter *sc = td_adapter(td);
1833 struct toepcb *toep;
1834 struct synq_entry *synqe;
1835
1836 /* Clean up synq entries. */
1837 mtx_lock(&td->toep_list_lock);
1838 TAILQ_SWAP(&td->stranded_synqe, &slist, synq_entry, link);
1839 mtx_unlock(&td->toep_list_lock);
1840 while ((synqe = TAILQ_FIRST(&slist)) != NULL) {
1841 TAILQ_REMOVE(&slist, synqe, link);
1842 MPASS(synqe->tid >= 0); /* stale, was kept around for debug */
1843 synqe->tid = -1;
1844 synack_failure_cleanup(sc, synqe);
1845 }
1846
1847 /* Clean up in-flight active opens. */
1848 mtx_lock(&td->toep_list_lock);
1849 TAILQ_SWAP(&td->stranded_atids, &tlist, toepcb, link);
1850 mtx_unlock(&td->toep_list_lock);
1851 while ((toep = TAILQ_FIRST(&tlist)) != NULL) {
1852 TAILQ_REMOVE(&tlist, toep, link);
1853 MPASS(toep->tid >= 0); /* stale, was kept around for debug */
1854 toep->tid = -1;
1855 act_open_failure_cleanup(sc, toep, EHOSTUNREACH);
1856 }
1857
1858 /* Clean up live connections. */
1859 mtx_lock(&td->toep_list_lock);
1860 TAILQ_SWAP(&td->stranded_tids, &tlist, toepcb, link);
1861 mtx_unlock(&td->toep_list_lock);
1862 while ((toep = TAILQ_FIRST(&tlist)) != NULL) {
1863 TAILQ_REMOVE(&tlist, toep, link);
1864 MPASS(toep->tid >= 0); /* stale, was kept around for debug */
1865 toep->tid = -1;
1866 live_tid_failure_cleanup(sc, toep, ECONNABORTED);
1867 }
1868 }
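
/*
 * cleanup_stranded_tids runs from taskqueue_thread; it is enqueued by
 * t4_tom_stop below, which keeps the actual teardown of stranded synq
 * entries, atids, and live tids out of the stop path itself.
 */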

/*
 * Ground control to Major TOM
 * Commencing countdown, engines on
 */
static int
t4_tom_activate(struct adapter *sc)
{
	struct tom_data *td;
	struct toedev *tod;
	struct vi_info *vi;
	int i, rc, v;

	ASSERT_SYNCHRONIZED_OP(sc);

	/* per-adapter softc for TOM */
	td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
	if (td == NULL)
		return (ENOMEM);

	/* List of TOE PCBs and associated lock */
	mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
	TAILQ_INIT(&td->toep_list);
	TAILQ_INIT(&td->synqe_list);
	TAILQ_INIT(&td->stranded_atids);
	TAILQ_INIT(&td->stranded_tids);
	TASK_INIT(&td->cleanup_stranded_tids, 0, cleanup_stranded_tids, td);

	/* Listen context */
	mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
	td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
	    &td->listen_mask, HASH_NOWAIT);

	/* List of WRs for which L2 resolution failed */
	mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF);
	STAILQ_INIT(&td->unsent_wr_list);
	TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td);

	/* TID tables */
	rc = alloc_tid_tabs(sc);
	if (rc != 0)
		goto done;

	rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp,
	    t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods");
	if (rc != 0)
		goto done;
	t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK,
	    V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask);

	alloc_tcb_history(sc, td);

	/* toedev ops */
	tod = &td->tod;
	init_toedev(tod);
	tod->tod_softc = sc;
	tod->tod_connect = t4_connect;
	tod->tod_listen_start = t4_listen_start;
	tod->tod_listen_stop = t4_listen_stop;
	tod->tod_rcvd = t4_rcvd;
	tod->tod_output = t4_tod_output;
	tod->tod_send_rst = t4_send_rst;
	tod->tod_send_fin = t4_send_fin;
	tod->tod_pcb_detach = t4_pcb_detach;
	tod->tod_l2_update = t4_l2_update;
	tod->tod_syncache_added = t4_syncache_added;
	tod->tod_syncache_removed = t4_syncache_removed;
	tod->tod_syncache_respond = t4_syncache_respond;
	tod->tod_offload_socket = t4_offload_socket;
	tod->tod_ctloutput = t4_ctloutput;
	tod->tod_tcp_info = t4_tcp_info;
#ifdef KERN_TLS
	tod->tod_alloc_tls_session = t4_alloc_tls_session;
#endif
	tod->tod_pmtu_update = t4_pmtu_update;

	for_each_port(sc, i) {
		for_each_vi(sc->port[i], v, vi) {
			SETTOEDEV(vi->ifp, &td->tod);
		}
	}

	sc->tom_softc = td;
	register_toedev(sc->tom_softc);

done:
	if (rc != 0)
		free_tom_data(sc, td);
	return (rc);
}
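
/*
 * t4_tom_activate runs under the adapter's synchronized op.  It is reached
 * through ULD activation the first time TOE is enabled on any port of the
 * adapter (t4_activate_uld(sc, ULD_TOM), called from the IFCAP_TOE
 * capability path in the base driver).
 */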

static int
t4_tom_deactivate(struct adapter *sc)
{
	int rc = 0;
	struct tom_data *td = sc->tom_softc;

	ASSERT_SYNCHRONIZED_OP(sc);

	if (td == NULL)
		return (0);	/* XXX. KASSERT? */

	if (sc->offload_map != 0)
		return (EBUSY);	/* at least one port has IFCAP_TOE enabled */

	if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI))
		return (EBUSY);	/* both iWARP and iSCSI rely on the TOE. */

	mtx_lock(&td->toep_list_lock);
	if (!TAILQ_EMPTY(&td->toep_list))
		rc = EBUSY;
	MPASS(TAILQ_EMPTY(&td->synqe_list));
	MPASS(TAILQ_EMPTY(&td->stranded_tids));
	mtx_unlock(&td->toep_list_lock);

	mtx_lock(&td->lctx_hash_lock);
	if (td->lctx_count > 0)
		rc = EBUSY;
	mtx_unlock(&td->lctx_hash_lock);

	taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources);
	taskqueue_drain(taskqueue_thread, &td->cleanup_stranded_tids);
	mtx_lock(&td->unsent_wr_lock);
	if (!STAILQ_EMPTY(&td->unsent_wr_list))
		rc = EBUSY;
	mtx_unlock(&td->unsent_wr_lock);

	if (rc == 0) {
		unregister_toedev(sc->tom_softc);
		free_tom_data(sc, td);
		sc->tom_softc = NULL;
	}

	return (rc);
}

static void
stop_atids(struct adapter *sc)
{
	struct tom_data *td = sc->tom_softc;
	struct tid_info *t = &sc->tids;
	struct toepcb *toep;
	int atid;

	/*
	 * Hashfilters and T6-KTLS are the only other users of atids but
	 * they're both mutually exclusive with TOE.  That means t4_tom owns
	 * all the atids in the table.
	 */
	MPASS(!is_hashfilter(sc));
	if (is_t6(sc))
		MPASS(!(sc->flags & KERN_TLS_ON));

	/* New atids are not being allocated. */
#ifdef INVARIANTS
	mtx_lock(&t->atid_lock);
	MPASS(t->atid_alloc_stopped == true);
	mtx_unlock(&t->atid_lock);
#endif

	/*
	 * In-use atids fall in one of these two categories:
	 * a) Those waiting for L2 resolution before being submitted to
	 *    hardware.
	 * b) Those that have been submitted to hardware and are awaiting
	 *    replies that will never arrive because the LLD is stopped.
	 */
	for (atid = 0; atid < t->natids; atid++) {
		toep = lookup_atid(sc, atid);
		if ((uintptr_t)toep >= (uintptr_t)&t->atid_tab[0] &&
		    (uintptr_t)toep < (uintptr_t)&t->atid_tab[t->natids])
			continue;
		MPASS(toep->tid == atid);
		MPASS(toep->incarnation == sc->incarnation);
		/*
		 * Take the atid out of the lookup table.  toep->tid is stale
		 * after this but useful for debug.
		 */
		CTR(KTR_CXGBE, "%s: atid %d@%d STRANDED, removed from table",
		    __func__, atid, toep->incarnation);
		free_atid(sc, toep->tid);
#if 0
		toep->tid = -1;
#endif
		mtx_lock(&td->toep_list_lock);
		TAILQ_INSERT_TAIL(&td->stranded_atids, toep, link);
		mtx_unlock(&td->toep_list_lock);
	}
	MPASS(atomic_load_int(&t->atids_in_use) == 0);
}
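
/*
 * The address-range test in stop_atids tells free and allocated atid slots
 * apart: a free slot stores a pointer to the next free slot, i.e. a pointer
 * back into atid_tab[] itself, while an in-use slot stores the caller's
 * cookie (a toepcb here).  Simplified sketch of the layout this relies on:
 *
 *	union aopen_entry {
 *		void *data;			// in use: caller's cookie
 *		union aopen_entry *next;	// free: next free entry
 *	};
 */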

static void
stop_tids(struct adapter *sc)
{
	struct tom_data *td = sc->tom_softc;
	struct toepcb *toep;
#ifdef INVARIANTS
	struct tid_info *t = &sc->tids;
#endif

	/*
	 * The LLD's offload queues are stopped so do_act_establish and
	 * do_pass_accept_req cannot run and insert tids in parallel with this
	 * thread.  stop_stid_tab has also run and removed the synq entries'
	 * tids from the table.  The only tids in the table are for connections
	 * at or beyond ESTABLISHED that are still waiting for the final CPL.
	 */
	mtx_lock(&td->toep_list_lock);
	TAILQ_FOREACH(toep, &td->toep_list, link) {
		MPASS(sc->incarnation == toep->incarnation);
		MPASS(toep->tid >= 0);
		MPASS(toep == lookup_tid(sc, toep->tid));
		/* Remove tid from the lookup table immediately. */
		CTR(KTR_CXGBE, "%s: tid %d@%d STRANDED, removed from table",
		    __func__, toep->tid, toep->incarnation);
		remove_tid(sc, toep->tid, toep->ce ? 2 : 1);
#if 0
		/* toep->tid is stale now but left alone for debug. */
		toep->tid = -1;
#endif
		/* All toep in this list will get bulk moved to stranded_tids */
		toep->flags &= ~TPF_IN_TOEP_LIST;
	}
	MPASS(TAILQ_EMPTY(&td->stranded_tids));
	TAILQ_CONCAT(&td->stranded_tids, &td->toep_list, link);
	MPASS(TAILQ_EMPTY(&td->toep_list));
	mtx_unlock(&td->toep_list_lock);

	MPASS(atomic_load_int(&t->tids_in_use) == 0);
}
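
/*
 * Note that stop_tids moves the entire toep_list in one step: TAILQ_CONCAT
 * is O(1), it splices the whole list onto stranded_tids and leaves
 * toep_list empty, which is why the loop above only clears
 * TPF_IN_TOEP_LIST instead of removing entries one at a time.
 */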

/*
 * L2T is stable because
 * 1. stop_lld stopped all new allocations.
 * 2. stop_lld also stopped the tx wrq so nothing is enqueueing new WRs to the
 *    queue or to l2t_entry->wr_list.
 * 3. t4_l2t_update is ignoring all L2 updates.
 */
static void
stop_tom_l2t(struct adapter *sc)
{
	struct l2t_data *d = sc->l2t;
	struct l2t_entry *e;
	int i;

	for (i = 0; i < d->l2t_size; i++) {
		e = &d->l2tab[i];
		mtx_lock(&e->lock);
		if (e->state == L2T_STATE_VALID)
			e->state = L2T_STATE_RESOLVING;
		if (!STAILQ_EMPTY(&e->wr_list))
			CXGBE_UNIMPLEMENTED("l2t e->wr_list");
		mtx_unlock(&e->lock);
	}
}

static int
t4_tom_stop(struct adapter *sc)
{
	struct tid_info *t = &sc->tids;
	struct tom_data *td = sc->tom_softc;

	ASSERT_SYNCHRONIZED_OP(sc);

	stop_tom_l2t(sc);
	if (atomic_load_int(&t->atids_in_use) > 0)
		stop_atids(sc);
	if (atomic_load_int(&t->stids_in_use) > 0)
		stop_stid_tab(sc);
	if (atomic_load_int(&t->tids_in_use) > 0)
		stop_tids(sc);
	taskqueue_enqueue(taskqueue_thread, &td->cleanup_stranded_tids);

	return (0);
}
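
/*
 * Ordering in t4_tom_stop matters: the L2 table is frozen first so no new
 * work requests can be generated, each tid class is then stranded, and only
 * after that is cleanup_stranded_tids queued to perform the teardown from
 * taskqueue_thread context.
 */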

static int
t4_tom_restart(struct adapter *sc)
{
	ASSERT_SYNCHRONIZED_OP(sc);

	restart_stid_tab(sc);

	return (0);
}

static int
t4_ctloutput_tom(struct socket *so, struct sockopt *sopt)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	int error, optval;

	if (sopt->sopt_level == IPPROTO_TCP &&
	    sopt->sopt_name == TCP_USE_DDP) {
		if (sopt->sopt_dir != SOPT_SET)
			return (EOPNOTSUPP);

		if (sopt->sopt_td != NULL) {
			/* Only settable by the kernel. */
			return (EPERM);
		}

		error = sooptcopyin(sopt, &optval, sizeof(optval),
		    sizeof(optval));
		if (error != 0)
			return (error);

		if (optval != 0)
			return (t4_enable_ddp_rcv(so, toep));
		else
			return (EOPNOTSUPP);
	}
	return (tcp_ctloutput(so, sopt));
}
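
/*
 * TCP_USE_DDP is kernel-only (sopt_td must be NULL).  An in-kernel consumer
 * could enable DDP receive roughly like this (illustrative sketch):
 *
 *	int one = 1;
 *	struct sockopt sopt = {
 *		.sopt_dir = SOPT_SET,
 *		.sopt_level = IPPROTO_TCP,
 *		.sopt_name = TCP_USE_DDP,
 *		.sopt_val = &one,
 *		.sopt_valsize = sizeof(one),
 *		.sopt_td = NULL,	// NULL identifies a kernel request
 *	};
 *	error = sosetopt(so, &sopt);
 */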

static int
t4_aio_queue_tom(struct socket *so, struct kaiocb *job)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	int error;

	/*
	 * No lock is needed as TOE sockets never change between
	 * active and passive.
	 */
	if (SOLISTENING(so))
		return (EINVAL);

	if (ulp_mode(toep) == ULP_MODE_TCPDDP ||
	    ulp_mode(toep) == ULP_MODE_NONE) {
		error = t4_aio_queue_ddp(so, job);
		if (error != EOPNOTSUPP)
			return (error);
	}

	return (t4_aio_queue_aiotx(so, job));
}
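
/*
 * t4_aio_queue_tom tries zero-copy DDP placement first and falls back to
 * the aiotx path if DDP declines the job.  Plain POSIX AIO on an offloaded
 * socket ends up here, e.g. (illustrative userland sketch):
 *
 *	struct aiocb cb = {
 *		.aio_fildes = s,
 *		.aio_buf = buf,
 *		.aio_nbytes = sizeof(buf),
 *	};
 *	aio_read(&cb);		// receive may be satisfied by DDP
 */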

static int
t4_tom_mod_load(void)
{
	/* CPL handlers */
	t4_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
	t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl2,
	    CPL_COOKIE_TOM);
	t4_init_connect_cpl_handlers();
	t4_init_listen_cpl_handlers();
	t4_init_cpl_io_handlers();

	t4_ddp_mod_load();
	t4_tls_mod_load();

	bcopy(&tcp_protosw, &toe_protosw, sizeof(toe_protosw));
	toe_protosw.pr_ctloutput = t4_ctloutput_tom;
	toe_protosw.pr_aio_queue = t4_aio_queue_tom;

	bcopy(&tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw));
	toe6_protosw.pr_ctloutput = t4_ctloutput_tom;
	toe6_protosw.pr_aio_queue = t4_aio_queue_tom;

	return (t4_register_uld(&tom_uld_info, ULD_TOM));
}
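
/*
 * The protosw clones set up in t4_tom_mod_load let offloaded sockets keep
 * stock TCP behavior for everything except setsockopt and AIO queueing,
 * which are routed through the TOM-specific handlers above.
 */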

static void
tom_uninit(struct adapter *sc, void *arg __unused)
{
	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun"))
		return;

	/* Try to free resources (works only if no port has IFCAP_TOE) */
	if (uld_active(sc, ULD_TOM))
		t4_deactivate_uld(sc, ULD_TOM);

	end_synchronized_op(sc, 0);
}

static int
t4_tom_mod_unload(void)
{
	t4_iterate(tom_uninit, NULL);

	if (t4_unregister_uld(&tom_uld_info, ULD_TOM) == EBUSY)
		return (EBUSY);

	t4_tls_mod_unload();
	t4_ddp_mod_unload();

	t4_uninit_connect_cpl_handlers();
	t4_uninit_listen_cpl_handlers();
	t4_uninit_cpl_io_handlers();
	t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, NULL, CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_GET_TCB_RPL, NULL);

	return (0);
}
#endif	/* TCP_OFFLOAD */

static int
t4_tom_modevent(module_t mod, int cmd, void *arg)
{
	int rc = 0;

#ifdef TCP_OFFLOAD
	switch (cmd) {
	case MOD_LOAD:
		rc = t4_tom_mod_load();
		break;

	case MOD_UNLOAD:
		rc = t4_tom_mod_unload();
		break;

	default:
		rc = EINVAL;
	}
#else
	printf("t4_tom: compiled without TCP_OFFLOAD support.\n");
	rc = EOPNOTSUPP;
#endif
	return (rc);
}

static moduledata_t t4_tom_moddata = {
	"t4_tom",
	t4_tom_modevent,
	0
};

MODULE_VERSION(t4_tom, 1);
MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
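
/*
 * Typical usage (illustrative): loading this module pulls in t4nex and
 * toecore via the MODULE_DEPEND declarations above, after which TOE can be
 * enabled per port:
 *
 *	# kldload t4_tom
 *	# ifconfig cxl0 toe
 */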
2303