xref: /freebsd/sys/dev/cxgbe/tom/t4_tom.c (revision acc1a9ef)
/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/limits.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/taskqueue.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet6/scope6_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#ifdef TCP_OFFLOAD
#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_regs_values.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

static struct protosw ddp_protosw;
static struct pr_usrreqs ddp_usrreqs;

static struct protosw ddp6_protosw;
static struct pr_usrreqs ddp6_usrreqs;

/* Module ops */
static int t4_tom_mod_load(void);
static int t4_tom_mod_unload(void);
static int t4_tom_modevent(module_t, int, void *);

/* ULD ops and helpers */
static int t4_tom_activate(struct adapter *);
static int t4_tom_deactivate(struct adapter *);

static struct uld_info tom_uld_info = {
	.uld_id = ULD_TOM,
	.activate = t4_tom_activate,
	.deactivate = t4_tom_deactivate,
};

static void queue_tid_release(struct adapter *, int);
static void release_offload_resources(struct toepcb *);
static int alloc_tid_tabs(struct tid_info *);
static void free_tid_tabs(struct tid_info *);
static int add_lip(struct adapter *, struct in6_addr *);
static int delete_lip(struct adapter *, struct in6_addr *);
static struct clip_entry *search_lip(struct tom_data *, struct in6_addr *);
static void init_clip_table(struct adapter *, struct tom_data *);
static void update_clip(struct adapter *, void *);
static void t4_clip_task(void *, int);
static void update_clip_table(struct adapter *, struct tom_data *);
static void destroy_clip_table(struct adapter *, struct tom_data *);
static void free_tom_data(struct adapter *, struct tom_data *);
static void reclaim_wr_resources(void *, int);

static int in6_ifaddr_gen;
static eventhandler_tag ifaddr_evhandler;
static struct timeout_task clip_task;

struct toepcb *
alloc_toepcb(struct vi_info *vi, int txqid, int rxqid, int flags)
{
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	struct toepcb *toep;
	int tx_credits, txsd_total, len;

	/*
	 * The firmware counts tx work request credits in units of 16 bytes
	 * each.  Reserve room for an ABORT_REQ so the driver never has to worry
	 * about tx credits if it wants to abort a connection.
	 */
	tx_credits = sc->params.ofldq_wr_cred;
	tx_credits -= howmany(sizeof(struct cpl_abort_req), 16);

	/*
	 * The shortest possible tx work request is a fw_ofld_tx_data_wr + 1
	 * byte of immediate payload, and the firmware counts tx work request
	 * credits in units of 16 bytes.  Calculate the maximum number of work
	 * requests possible.
	 */
	txsd_total = tx_credits /
	    howmany((sizeof(struct fw_ofld_tx_data_wr) + 1), 16);
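	/*
	 * Worked example (illustrative only; the real numbers come from the
	 * firmware and the shared structure definitions): if the firmware
	 * grants ofldq_wr_cred = 546 credits and cpl_abort_req is 24 bytes,
	 * then howmany(24, 16) = 2 and tx_credits = 544.  With a 16-byte
	 * fw_ofld_tx_data_wr, the 17-byte minimum WR costs howmany(17, 16) = 2
	 * credits, so txsd_total = 544 / 2 = 272 send descriptors.
	 */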

	if (txqid < 0)
		txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq;
	KASSERT(txqid >= vi->first_ofld_txq &&
	    txqid < vi->first_ofld_txq + vi->nofldtxq,
	    ("%s: txqid %d for vi %p (first %d, n %d)", __func__, txqid, vi,
		vi->first_ofld_txq, vi->nofldtxq));

	if (rxqid < 0)
		rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq;
	KASSERT(rxqid >= vi->first_ofld_rxq &&
	    rxqid < vi->first_ofld_rxq + vi->nofldrxq,
	    ("%s: rxqid %d for vi %p (first %d, n %d)", __func__, rxqid, vi,
		vi->first_ofld_rxq, vi->nofldrxq));

	len = offsetof(struct toepcb, txsd) +
	    txsd_total * sizeof(struct ofld_tx_sdesc);

	toep = malloc(len, M_CXGBE, M_ZERO | flags);
	if (toep == NULL)
		return (NULL);

	toep->td = sc->tom_softc;
	toep->vi = vi;
	toep->tx_total = tx_credits;
	toep->tx_credits = tx_credits;
	toep->ofld_txq = &sc->sge.ofld_txq[txqid];
	toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid];
	toep->ctrlq = &sc->sge.ctrlq[pi->port_id];
	mbufq_init(&toep->ulp_pduq, INT_MAX);
	mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX);
	toep->txsd_total = txsd_total;
	toep->txsd_avail = txsd_total;
	toep->txsd_pidx = 0;
	toep->txsd_cidx = 0;

	return (toep);
}
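
/*
 * Usage sketch (not part of this file; a minimal illustration of the
 * contract above): callers that do not care about queue placement pass
 * -1 for both queue ids and must handle allocation failure, e.g.
 *
 *	toep = alloc_toepcb(vi, -1, -1, M_NOWAIT);
 *	if (toep == NULL)
 *		return (ENOMEM);
 *	...
 *	free_toepcb(toep);
 */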

void
free_toepcb(struct toepcb *toep)
{

	KASSERT(!(toep->flags & TPF_ATTACHED),
	    ("%s: attached to an inpcb", __func__));
	KASSERT(!(toep->flags & TPF_CPL_PENDING),
	    ("%s: CPL pending", __func__));

	free(toep, M_CXGBE);
}

/*
 * Set up the socket for TCP offload.
 */
void
offload_socket(struct socket *so, struct toepcb *toep)
{
	struct tom_data *td = toep->td;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct sockbuf *sb;

	INP_WLOCK_ASSERT(inp);

	/* Update socket */
	sb = &so->so_snd;
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOCOALESCE;
	SOCKBUF_UNLOCK(sb);
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOCOALESCE;
	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
		if (inp->inp_vflag & INP_IPV6)
			so->so_proto = &ddp6_protosw;
		else
			so->so_proto = &ddp_protosw;
	}
	SOCKBUF_UNLOCK(sb);

	/* Update TCP PCB */
	tp->tod = &td->tod;
	tp->t_toe = toep;
	tp->t_flags |= TF_TOE;

	/* Install an extra hold on inp */
	toep->inp = inp;
	toep->flags |= TPF_ATTACHED;
	in_pcbref(inp);

	/* Add the TOE PCB to the active list */
	mtx_lock(&td->toep_list_lock);
	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

/* This is _not_ the normal way to "unoffload" a socket. */
void
undo_offload_socket(struct socket *so)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct tom_data *td = toep->td;
	struct sockbuf *sb;

	INP_WLOCK_ASSERT(inp);

	sb = &so->so_snd;
	SOCKBUF_LOCK(sb);
	sb->sb_flags &= ~SB_NOCOALESCE;
	SOCKBUF_UNLOCK(sb);
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	sb->sb_flags &= ~SB_NOCOALESCE;
	SOCKBUF_UNLOCK(sb);

	tp->tod = NULL;
	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;

	toep->inp = NULL;
	toep->flags &= ~TPF_ATTACHED;
	if (in_pcbrele_wlocked(inp))
		panic("%s: inp freed.", __func__);

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

static void
release_offload_resources(struct toepcb *toep)
{
	struct tom_data *td = toep->td;
	struct adapter *sc = td_adapter(td);
	int tid = toep->tid;

	KASSERT(!(toep->flags & TPF_CPL_PENDING),
	    ("%s: %p has CPL pending.", __func__, toep));
	KASSERT(!(toep->flags & TPF_ATTACHED),
	    ("%s: %p is still attached.", __func__, toep));

	CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)",
	    __func__, toep, tid, toep->l2te, toep->ce);

	/*
	 * These queues should have been emptied at approximately the same time
	 * that a normal connection's socket's so_snd would have been purged or
	 * drained.  Do _not_ clean up here.
	 */
	MPASS(mbufq_len(&toep->ulp_pduq) == 0);
	MPASS(mbufq_len(&toep->ulp_pdu_reclaimq) == 0);

	if (toep->ulp_mode == ULP_MODE_TCPDDP)
		release_ddp_resources(toep);

	if (toep->l2te)
		t4_l2t_release(toep->l2te);

	if (tid >= 0) {
		remove_tid(sc, tid);
		release_tid(sc, tid, toep->ctrlq);
	}

	if (toep->ce)
		release_lip(td, toep->ce);

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);

	free_toepcb(toep);
}

/*
 * The kernel is done with the TCP PCB and this is our opportunity to unhook the
 * toepcb hanging off of it.  If the TOE driver is also done with the toepcb (no
 * pending CPL) then it is time to release all resources tied to the toepcb.
 *
 * Also gets called when an offloaded active open fails and the TOM wants the
 * kernel to take the TCP PCB back.
 */
static void
t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
{
#if defined(KTR) || defined(INVARIANTS)
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);

	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
	KASSERT(toep->flags & TPF_ATTACHED,
	    ("%s: not attached", __func__));

#ifdef KTR
	if (tp->t_state == TCPS_SYN_SENT) {
		CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)",
		    __func__, toep->tid, toep, toep->flags, inp,
		    inp->inp_flags);
	} else {
		CTR6(KTR_CXGBE,
		    "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)",
		    toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp,
		    inp->inp_flags);
	}
#endif

	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;
	toep->flags &= ~TPF_ATTACHED;

	if (!(toep->flags & TPF_CPL_PENDING))
		release_offload_resources(toep);
}

/*
 * setsockopt handler.
 */
static void
t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name)
{
	struct adapter *sc = tod->tod_softc;
	struct toepcb *toep = tp->t_toe;

	if (dir == SOPT_GET)
		return;

	CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name);

	switch (name) {
	case TCP_NODELAY:
		t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS, V_TF_NAGLE(1),
		    V_TF_NAGLE(tp->t_flags & TF_NODELAY ? 0 : 1));
		break;
	default:
		break;
	}
}

/*
 * The TOE driver will not receive any more CPLs for the tid associated with the
 * toepcb; release the hold on the inpcb.
 */
void
final_cpl_received(struct toepcb *toep)
{
	struct inpcb *inp = toep->inp;

	KASSERT(inp != NULL, ("%s: inp is NULL", __func__));
	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_CPL_PENDING,
	    ("%s: CPL not pending already?", __func__));

	CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
	    __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);

	toep->inp = NULL;
	toep->flags &= ~TPF_CPL_PENDING;
	mbufq_drain(&toep->ulp_pdu_reclaimq);

	if (!(toep->flags & TPF_ATTACHED))
		release_offload_resources(toep);

	if (!in_pcbrele_wlocked(inp))
		INP_WUNLOCK(inp);
}

void
insert_tid(struct adapter *sc, int tid, void *ctx)
{
	struct tid_info *t = &sc->tids;

	t->tid_tab[tid] = ctx;
	atomic_add_int(&t->tids_in_use, 1);
}

void *
lookup_tid(struct adapter *sc, int tid)
{
	struct tid_info *t = &sc->tids;

	return (t->tid_tab[tid]);
}

void
update_tid(struct adapter *sc, int tid, void *ctx)
{
	struct tid_info *t = &sc->tids;

	t->tid_tab[tid] = ctx;
}

void
remove_tid(struct adapter *sc, int tid)
{
	struct tid_info *t = &sc->tids;

	t->tid_tab[tid] = NULL;
	atomic_subtract_int(&t->tids_in_use, 1);
}

void
release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq)
{
	struct wrqe *wr;
	struct cpl_tid_release *req;

	wr = alloc_wrqe(sizeof(*req), ctrlq);
	if (wr == NULL) {
		queue_tid_release(sc, tid);	/* defer */
		return;
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid);

	t4_wrq_tx(sc, wr);
}

static void
queue_tid_release(struct adapter *sc, int tid)
{

	CXGBE_UNIMPLEMENTED("deferred tid release");
}

/*
 * What mtu_idx to use, given a 4-tuple and/or an MSS cap
 */
int
find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
{
	unsigned short *mtus = &sc->params.mtus[0];
	int i, mss, n;

	KASSERT(inc != NULL || pmss > 0,
	    ("%s: at least one of inc/pmss must be specified", __func__));

	mss = inc ? tcp_mssopt(inc) : pmss;
	if (pmss > 0 && mss > pmss)
		mss = pmss;

	if (inc != NULL && (inc->inc_flags & INC_ISIPV6))
		n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		n = sizeof(struct ip) + sizeof(struct tcphdr);

	for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mss + n; i++)
		continue;

	return (i);
}
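
/*
 * Worked example (illustrative; the actual MTU table is programmed into
 * the chip at attach time): for an IPv4 connection with a peer MSS of
 * 1460, mss + n = 1460 + 40 = 1500, so the loop returns the index of
 * the largest table entry that is <= 1500.
 */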

/*
 * Determine the receive window size for a socket.
 */
u_long
select_rcv_wnd(struct socket *so)
{
	unsigned long wnd;

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	wnd = sbspace(&so->so_rcv);
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	return (min(wnd, MAX_RCV_WND));
}

int
select_rcv_wscale(void)
{
	int wscale = 0;
	unsigned long space = sb_max;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
		wscale++;

	return (wscale);
}
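
/*
 * Worked example (illustrative): with sb_max at its common default of
 * 2MB (2097152) and MAX_RCV_WND at least that large, the loop stops at
 * wscale = 6 because TCP_MAXWIN << 5 = 2097120 is still < 2097152 but
 * TCP_MAXWIN << 6 = 4194240 is not.
 */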

extern int always_keepalive;
#define VIID_SMACIDX(v)	(((unsigned int)(v) & 0x7f) << 1)

/*
 * Note that the socket (so) could be a listening socket too.
 */
uint64_t
calc_opt0(struct socket *so, struct vi_info *vi, struct l2t_entry *e,
    int mtu_idx, int rscale, int rx_credits, int ulp_mode)
{
	uint64_t opt0;

	KASSERT(rx_credits <= M_RCV_BUFSIZ,
	    ("%s: rcv_bufsiz too high", __func__));

	opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) |
	    V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits);

	if (so != NULL) {
		struct inpcb *inp = sotoinpcb(so);
		struct tcpcb *tp = intotcpcb(inp);
		int keepalive = always_keepalive ||
		    so_options_get(so) & SO_KEEPALIVE;

		opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
		opt0 |= V_KEEP_ALIVE(keepalive != 0);
	}

	if (e != NULL)
		opt0 |= V_L2T_IDX(e->idx);

	if (vi != NULL) {
		opt0 |= V_SMAC_SEL(VIID_SMACIDX(vi->viid));
		opt0 |= V_TX_CHAN(vi->pi->tx_chan);
	}

	return (htobe64(opt0));
}

uint64_t
select_ntuple(struct vi_info *vi, struct l2t_entry *e)
{
	struct adapter *sc = vi->pi->adapter;
	struct tp_params *tp = &sc->params.tp;
	uint16_t viid = vi->viid;
	uint64_t ntuple = 0;

	/*
	 * Initialize each of the fields that we care about and that are
	 * present in the Compressed Filter Tuple.
	 */
	if (tp->vlan_shift >= 0 && e->vlan != CPL_L2T_VLAN_NONE)
		ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift;

	if (tp->port_shift >= 0)
		ntuple |= (uint64_t)e->lport << tp->port_shift;

	if (tp->protocol_shift >= 0)
		ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift;

	if (tp->vnic_shift >= 0) {
		uint32_t vf = G_FW_VIID_VIN(viid);
		uint32_t pf = G_FW_VIID_PFN(viid);
		uint32_t vld = G_FW_VIID_VIVLD(viid);

		ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vf) | V_FT_VNID_ID_PF(pf) |
		    V_FT_VNID_ID_VLD(vld)) << tp->vnic_shift;
	}

	if (is_t4(sc))
		return (htobe32((uint32_t)ntuple));
	else
		return (htobe64(V_FILTER_TUPLE(ntuple)));
}
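
/*
 * Illustrative example (the shift positions below are hypothetical; the
 * real ones are read from the chip's TP parameters at attach time):
 * with port_shift = 9, protocol_shift = 0, and the other fields
 * disabled, the tuple for lport 1 would be
 * (1 << 9) | IPPROTO_TCP = 0x206 before byte-swapping.
 */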

void
set_tcpddp_ulp_mode(struct toepcb *toep)
{

	toep->ulp_mode = ULP_MODE_TCPDDP;
	toep->ddp_flags = DDP_OK;
	toep->ddp_score = DDP_LOW_SCORE;
}

int
negative_advice(int status)
{

	return (status == CPL_ERR_RTX_NEG_ADVICE ||
	    status == CPL_ERR_PERSIST_NEG_ADVICE ||
	    status == CPL_ERR_KEEPALV_NEG_ADVICE);
}

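/*
 * Layout note (summarizing the allocation below): all three lookup
 * tables come from a single contiguous allocation,
 *
 *	tid_tab[ntids] | atid_tab[natids] | stid_tab[nstids]
 *
 * which is why only tid_tab is malloc'ed and freed and the other two
 * pointers are carved out of it.
 */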
static int
alloc_tid_tabs(struct tid_info *t)
{
	size_t size;
	unsigned int i;

	size = t->ntids * sizeof(*t->tid_tab) +
	    t->natids * sizeof(*t->atid_tab) +
	    t->nstids * sizeof(*t->stid_tab);

	t->tid_tab = malloc(size, M_CXGBE, M_ZERO | M_NOWAIT);
	if (t->tid_tab == NULL)
		return (ENOMEM);

	mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF);
	t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids];
	t->afree = t->atid_tab;
	t->atids_in_use = 0;
	for (i = 1; i < t->natids; i++)
		t->atid_tab[i - 1].next = &t->atid_tab[i];
	t->atid_tab[t->natids - 1].next = NULL;

	mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
	t->stid_tab = (struct listen_ctx **)&t->atid_tab[t->natids];
	t->stids_in_use = 0;
	TAILQ_INIT(&t->stids);
	t->nstids_free_head = t->nstids;

	atomic_store_rel_int(&t->tids_in_use, 0);

	return (0);
}

static void
free_tid_tabs(struct tid_info *t)
{
	KASSERT(t->tids_in_use == 0,
	    ("%s: %d tids still in use.", __func__, t->tids_in_use));
	KASSERT(t->atids_in_use == 0,
	    ("%s: %d atids still in use.", __func__, t->atids_in_use));
	KASSERT(t->stids_in_use == 0,
	    ("%s: %d stids still in use.", __func__, t->stids_in_use));

	free(t->tid_tab, M_CXGBE);
	t->tid_tab = NULL;

	if (mtx_initialized(&t->atid_lock))
		mtx_destroy(&t->atid_lock);
	if (mtx_initialized(&t->stid_lock))
		mtx_destroy(&t->stid_lock);
}

static int
add_lip(struct adapter *sc, struct in6_addr *lip)
{
	struct fw_clip_cmd c;

	ASSERT_SYNCHRONIZED_OP(sc);
	/* mtx_assert(&td->clip_table_lock, MA_OWNED); */

	memset(&c, 0, sizeof(c));
	c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST |
	    F_FW_CMD_WRITE);
	c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c));
	c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
	c.ip_lo = *(uint64_t *)&lip->s6_addr[8];

	return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c));
}

static int
delete_lip(struct adapter *sc, struct in6_addr *lip)
{
	struct fw_clip_cmd c;

	ASSERT_SYNCHRONIZED_OP(sc);
	/* mtx_assert(&td->clip_table_lock, MA_OWNED); */

	memset(&c, 0, sizeof(c));
	c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST |
	    F_FW_CMD_READ);
	c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c));
	c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
	c.ip_lo = *(uint64_t *)&lip->s6_addr[8];

	return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c));
}

static struct clip_entry *
search_lip(struct tom_data *td, struct in6_addr *lip)
{
	struct clip_entry *ce;

	mtx_assert(&td->clip_table_lock, MA_OWNED);

	TAILQ_FOREACH(ce, &td->clip_table, link) {
		if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip))
			return (ce);
	}

	return (NULL);
}

struct clip_entry *
hold_lip(struct tom_data *td, struct in6_addr *lip)
{
	struct clip_entry *ce;

	mtx_lock(&td->clip_table_lock);
	ce = search_lip(td, lip);
	if (ce != NULL)
		ce->refcount++;
	mtx_unlock(&td->clip_table_lock);

	return (ce);
}

void
release_lip(struct tom_data *td, struct clip_entry *ce)
{

	mtx_lock(&td->clip_table_lock);
	KASSERT(search_lip(td, &ce->lip) == ce,
	    ("%s: CLIP entry %p not in CLIP table.", __func__, ce));
	KASSERT(ce->refcount > 0,
	    ("%s: CLIP entry %p has refcount 0", __func__, ce));
	--ce->refcount;
	mtx_unlock(&td->clip_table_lock);
}

static void
init_clip_table(struct adapter *sc, struct tom_data *td)
{

	ASSERT_SYNCHRONIZED_OP(sc);

	mtx_init(&td->clip_table_lock, "CLIP table lock", NULL, MTX_DEF);
	TAILQ_INIT(&td->clip_table);
	td->clip_gen = -1;

	update_clip_table(sc, td);
}

static void
update_clip(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4tomuc"))
		return;

	if (uld_active(sc, ULD_TOM))
		update_clip_table(sc, sc->tom_softc);

	end_synchronized_op(sc, LOCK_HELD);
}

static void
t4_clip_task(void *arg, int count)
{

	t4_iterate(update_clip, NULL);
}

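/*
 * Overview of the algorithm below: the CLIP table is resynchronized
 * with the kernel's IPv6 address list only when the generation count
 * bumped by t4_tom_ifaddr_event has changed.  Existing entries are
 * moved to a temporary "stale" list; entries that still correspond to a
 * configured address are moved back, new addresses are pushed to the
 * hardware, and stale entries with no remaining references are deleted.
 */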
static void
update_clip_table(struct adapter *sc, struct tom_data *td)
{
	struct rm_priotracker in6_ifa_tracker;
	struct in6_ifaddr *ia;
	struct in6_addr *lip, tlip;
	struct clip_head stale;
	struct clip_entry *ce, *ce_temp;
	int rc, gen = atomic_load_acq_int(&in6_ifaddr_gen);

	ASSERT_SYNCHRONIZED_OP(sc);

	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
	mtx_lock(&td->clip_table_lock);

	if (gen == td->clip_gen)
		goto done;

	TAILQ_INIT(&stale);
	TAILQ_CONCAT(&stale, &td->clip_table, link);

	TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
		lip = &ia->ia_addr.sin6_addr;

		KASSERT(!IN6_IS_ADDR_MULTICAST(lip),
		    ("%s: mcast address in in6_ifaddr list", __func__));

		if (IN6_IS_ADDR_LOOPBACK(lip))
			continue;
		if (IN6_IS_SCOPE_EMBED(lip)) {
			/* Remove the embedded scope */
			tlip = *lip;
			lip = &tlip;
			in6_clearscope(lip);
		}
		/*
		 * XXX: how to weed out the link local address for the loopback
		 * interface?  It's fe80::1 usually (always?).
		 */

		/*
		 * If it's in the main list then we already know it's not stale.
		 */
		TAILQ_FOREACH(ce, &td->clip_table, link) {
			if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip))
				goto next;
		}

		/*
		 * If it's in the stale list we should move it to the main list.
		 */
		TAILQ_FOREACH(ce, &stale, link) {
			if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) {
				TAILQ_REMOVE(&stale, ce, link);
				TAILQ_INSERT_TAIL(&td->clip_table, ce, link);
				goto next;
			}
		}

		/* A new IP6 address; add it to the CLIP table */
		ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT);
		if (ce == NULL)
			continue;	/* XXX: lost until the next ifaddr event */
		memcpy(&ce->lip, lip, sizeof(ce->lip));
		ce->refcount = 0;
		rc = add_lip(sc, lip);
		if (rc == 0)
			TAILQ_INSERT_TAIL(&td->clip_table, ce, link);
		else {
			char ip[INET6_ADDRSTRLEN];

			inet_ntop(AF_INET6, &ce->lip, &ip[0], sizeof(ip));
			log(LOG_ERR, "%s: could not add %s (%d)\n",
			    __func__, ip, rc);
			free(ce, M_CXGBE);
		}
next:
		continue;
	}

	/*
	 * Remove stale addresses (those no longer in V_in6_ifaddrhead) that are
	 * no longer referenced by the driver.
	 */
	TAILQ_FOREACH_SAFE(ce, &stale, link, ce_temp) {
		if (ce->refcount == 0) {
			rc = delete_lip(sc, &ce->lip);
			if (rc == 0) {
				TAILQ_REMOVE(&stale, ce, link);
				free(ce, M_CXGBE);
			} else {
				char ip[INET6_ADDRSTRLEN];

				inet_ntop(AF_INET6, &ce->lip, &ip[0],
				    sizeof(ip));
				log(LOG_ERR, "%s: could not delete %s (%d)\n",
				    __func__, ip, rc);
			}
		}
	}
	/* The ones that are still referenced need to stay in the CLIP table */
	TAILQ_CONCAT(&td->clip_table, &stale, link);

	td->clip_gen = gen;
done:
	mtx_unlock(&td->clip_table_lock);
	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
}

static void
destroy_clip_table(struct adapter *sc, struct tom_data *td)
{
	struct clip_entry *ce, *ce_temp;

	if (mtx_initialized(&td->clip_table_lock)) {
		mtx_lock(&td->clip_table_lock);
		TAILQ_FOREACH_SAFE(ce, &td->clip_table, link, ce_temp) {
			KASSERT(ce->refcount == 0,
			    ("%s: CLIP entry %p still in use (%d)", __func__,
			    ce, ce->refcount));
			TAILQ_REMOVE(&td->clip_table, ce, link);
			delete_lip(sc, &ce->lip);
			free(ce, M_CXGBE);
		}
		mtx_unlock(&td->clip_table_lock);
		mtx_destroy(&td->clip_table_lock);
	}
}

static void
free_tom_data(struct adapter *sc, struct tom_data *td)
{

	ASSERT_SYNCHRONIZED_OP(sc);

	KASSERT(TAILQ_EMPTY(&td->toep_list),
	    ("%s: TOE PCB list is not empty.", __func__));
	KASSERT(td->lctx_count == 0,
	    ("%s: lctx hash table is not empty.", __func__));

	t4_uninit_l2t_cpl_handlers(sc);
	t4_uninit_cpl_io_handlers(sc);
	t4_uninit_ddp(sc, td);
	destroy_clip_table(sc, td);

	if (td->listen_mask != 0)
		hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);

	if (mtx_initialized(&td->unsent_wr_lock))
		mtx_destroy(&td->unsent_wr_lock);
	if (mtx_initialized(&td->lctx_hash_lock))
		mtx_destroy(&td->lctx_hash_lock);
	if (mtx_initialized(&td->toep_list_lock))
		mtx_destroy(&td->toep_list_lock);

	free_tid_tabs(&sc->tids);
	free(td, M_CXGBE);
}

static void
reclaim_wr_resources(void *arg, int count)
{
	struct tom_data *td = arg;
	STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list);
	struct cpl_act_open_req *cpl;
	u_int opcode, atid;
	struct wrqe *wr;
	struct adapter *sc;

	mtx_lock(&td->unsent_wr_lock);
	STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe);
	mtx_unlock(&td->unsent_wr_lock);

	while ((wr = STAILQ_FIRST(&twr_list)) != NULL) {
		STAILQ_REMOVE_HEAD(&twr_list, link);

		cpl = wrtod(wr);
		opcode = GET_OPCODE(cpl);

		switch (opcode) {
		case CPL_ACT_OPEN_REQ:
		case CPL_ACT_OPEN_REQ6:
			atid = G_TID_TID(be32toh(OPCODE_TID(cpl)));
			sc = td_adapter(td);

			CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid);
			act_open_failure_cleanup(sc, atid, EHOSTUNREACH);
			free(wr, M_CXGBE);
			break;
		default:
			log(LOG_ERR, "%s: leaked work request %p, wr_len %d, "
			    "opcode %x\n", __func__, wr, wr->wr_len, opcode);
			/* WR not freed here; go look at it with a debugger.  */
		}
	}
}

/*
 * Ground control to Major TOM
 * Commencing countdown, engines on
 */
static int
t4_tom_activate(struct adapter *sc)
{
	struct tom_data *td;
	struct toedev *tod;
	struct vi_info *vi;
	int i, rc, v;

	ASSERT_SYNCHRONIZED_OP(sc);

	/* per-adapter softc for TOM */
	td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
	if (td == NULL)
		return (ENOMEM);

	/* List of TOE PCBs and associated lock */
	mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
	TAILQ_INIT(&td->toep_list);

	/* Listen context */
	mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
	td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
	    &td->listen_mask, HASH_NOWAIT);

	/* List of WRs for which L2 resolution failed */
	mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF);
	STAILQ_INIT(&td->unsent_wr_list);
	TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td);

	/* TID tables */
	rc = alloc_tid_tabs(&sc->tids);
	if (rc != 0)
		goto done;

	/* DDP page pods and CPL handlers */
	t4_init_ddp(sc, td);

	/* CLIP table for IPv6 offload */
	init_clip_table(sc, td);

	/* CPL handlers */
	t4_init_connect_cpl_handlers(sc);
	t4_init_l2t_cpl_handlers(sc);
	t4_init_listen_cpl_handlers(sc);
	t4_init_cpl_io_handlers(sc);

	/* toedev ops */
	tod = &td->tod;
	init_toedev(tod);
	tod->tod_softc = sc;
	tod->tod_connect = t4_connect;
	tod->tod_listen_start = t4_listen_start;
	tod->tod_listen_stop = t4_listen_stop;
	tod->tod_rcvd = t4_rcvd;
	tod->tod_output = t4_tod_output;
	tod->tod_send_rst = t4_send_rst;
	tod->tod_send_fin = t4_send_fin;
	tod->tod_pcb_detach = t4_pcb_detach;
	tod->tod_l2_update = t4_l2_update;
	tod->tod_syncache_added = t4_syncache_added;
	tod->tod_syncache_removed = t4_syncache_removed;
	tod->tod_syncache_respond = t4_syncache_respond;
	tod->tod_offload_socket = t4_offload_socket;
	tod->tod_ctloutput = t4_ctloutput;

	for_each_port(sc, i) {
		for_each_vi(sc->port[i], v, vi) {
			TOEDEV(vi->ifp) = &td->tod;
		}
	}

	sc->tom_softc = td;
	register_toedev(sc->tom_softc);

done:
	if (rc != 0)
		free_tom_data(sc, td);
	return (rc);
}

static int
t4_tom_deactivate(struct adapter *sc)
{
	int rc = 0;
	struct tom_data *td = sc->tom_softc;

	ASSERT_SYNCHRONIZED_OP(sc);

	if (td == NULL)
		return (0);	/* XXX. KASSERT? */

	if (sc->offload_map != 0)
		return (EBUSY);	/* at least one port has IFCAP_TOE enabled */

	if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI))
		return (EBUSY);	/* both iWARP and iSCSI rely on the TOE. */

	mtx_lock(&td->toep_list_lock);
	if (!TAILQ_EMPTY(&td->toep_list))
		rc = EBUSY;
	mtx_unlock(&td->toep_list_lock);

	mtx_lock(&td->lctx_hash_lock);
	if (td->lctx_count > 0)
		rc = EBUSY;
	mtx_unlock(&td->lctx_hash_lock);

	taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources);
	mtx_lock(&td->unsent_wr_lock);
	if (!STAILQ_EMPTY(&td->unsent_wr_list))
		rc = EBUSY;
	mtx_unlock(&td->unsent_wr_lock);

	if (rc == 0) {
		unregister_toedev(sc->tom_softc);
		free_tom_data(sc, td);
		sc->tom_softc = NULL;
	}

	return (rc);
}

static void
t4_tom_ifaddr_event(void *arg __unused, struct ifnet *ifp)
{

	atomic_add_rel_int(&in6_ifaddr_gen, 1);
	taskqueue_enqueue_timeout(taskqueue_thread, &clip_task, -hz / 4);
}

static int
t4_tom_mod_load(void)
{
	int rc;
	struct protosw *tcp_protosw, *tcp6_protosw;

	tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	if (tcp_protosw == NULL)
		return (ENOPROTOOPT);
	bcopy(tcp_protosw, &ddp_protosw, sizeof(ddp_protosw));
	bcopy(tcp_protosw->pr_usrreqs, &ddp_usrreqs, sizeof(ddp_usrreqs));
	ddp_usrreqs.pru_soreceive = t4_soreceive_ddp;
	ddp_protosw.pr_usrreqs = &ddp_usrreqs;

	tcp6_protosw = pffindproto(PF_INET6, IPPROTO_TCP, SOCK_STREAM);
	if (tcp6_protosw == NULL)
		return (ENOPROTOOPT);
	bcopy(tcp6_protosw, &ddp6_protosw, sizeof(ddp6_protosw));
	bcopy(tcp6_protosw->pr_usrreqs, &ddp6_usrreqs, sizeof(ddp6_usrreqs));
	ddp6_usrreqs.pru_soreceive = t4_soreceive_ddp;
	ddp6_protosw.pr_usrreqs = &ddp6_usrreqs;

	TIMEOUT_TASK_INIT(taskqueue_thread, &clip_task, 0, t4_clip_task, NULL);
	ifaddr_evhandler = EVENTHANDLER_REGISTER(ifaddr_event,
	    t4_tom_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY);

	rc = t4_register_uld(&tom_uld_info);
	if (rc != 0)
		t4_tom_mod_unload();

	return (rc);
}

static void
tom_uninit(struct adapter *sc, void *arg __unused)
{
	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun"))
		return;

	/* Try to free resources (works only if no port has IFCAP_TOE) */
	if (uld_active(sc, ULD_TOM))
		t4_deactivate_uld(sc, ULD_TOM);

	end_synchronized_op(sc, 0);
}

static int
t4_tom_mod_unload(void)
{
	t4_iterate(tom_uninit, NULL);

	if (t4_unregister_uld(&tom_uld_info) == EBUSY)
		return (EBUSY);

	if (ifaddr_evhandler) {
		EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_evhandler);
		taskqueue_cancel_timeout(taskqueue_thread, &clip_task, NULL);
	}

	return (0);
}
#endif	/* TCP_OFFLOAD */

static int
t4_tom_modevent(module_t mod, int cmd, void *arg)
{
	int rc = 0;

#ifdef TCP_OFFLOAD
	switch (cmd) {
	case MOD_LOAD:
		rc = t4_tom_mod_load();
		break;

	case MOD_UNLOAD:
		rc = t4_tom_mod_unload();
		break;

	default:
		rc = EINVAL;
	}
#else
	printf("t4_tom: compiled without TCP_OFFLOAD support.\n");
	rc = EOPNOTSUPP;
#endif
	return (rc);
}

static moduledata_t t4_tom_moddata = {
	"t4_tom",
	t4_tom_modevent,
	0
};

MODULE_VERSION(t4_tom, 1);
MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);