xref: /freebsd/sys/dev/cxgbe/cxgbei/cxgbei.c (revision 1f474190)
1 /*-
2  * Copyright (c) 2012 Chelsio Communications, Inc.
3  * All rights reserved.
4  *
5  * Chelsio T5xx iSCSI driver
6  *
7  * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_inet.h"
35 #include "opt_inet6.h"
36 
37 #include <sys/types.h>
38 #include <sys/param.h>
39 #include <sys/kernel.h>
40 #include <sys/ktr.h>
41 #include <sys/module.h>
42 #include <sys/systm.h>
43 
44 #ifdef TCP_OFFLOAD
45 #include <sys/errno.h>
46 #include <sys/kthread.h>
47 #include <sys/smp.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/mbuf.h>
51 #include <sys/lock.h>
52 #include <sys/mutex.h>
53 #include <sys/condvar.h>
54 
55 #include <netinet/in.h>
56 #include <netinet/in_pcb.h>
57 #include <netinet/toecore.h>
58 #include <netinet/tcp_var.h>
59 #include <netinet/tcp_fsm.h>
60 
61 #include <cam/scsi/scsi_all.h>
62 #include <cam/scsi/scsi_da.h>
63 #include <cam/ctl/ctl_io.h>
64 #include <cam/ctl/ctl.h>
65 #include <cam/ctl/ctl_backend.h>
66 #include <cam/ctl/ctl_error.h>
67 #include <cam/ctl/ctl_frontend.h>
68 #include <cam/ctl/ctl_debug.h>
69 #include <cam/ctl/ctl_ha.h>
70 #include <cam/ctl/ctl_ioctl.h>
71 
72 #include <dev/iscsi/icl.h>
73 #include <dev/iscsi/iscsi_proto.h>
74 #include <dev/iscsi/iscsi_ioctl.h>
75 #include <dev/iscsi/iscsi.h>
76 #include <cam/ctl/ctl_frontend_iscsi.h>
77 
78 #include <cam/cam.h>
79 #include <cam/cam_ccb.h>
80 #include <cam/cam_xpt.h>
81 #include <cam/cam_debug.h>
82 #include <cam/cam_sim.h>
83 #include <cam/cam_xpt_sim.h>
84 #include <cam/cam_xpt_periph.h>
85 #include <cam/cam_periph.h>
86 #include <cam/cam_compat.h>
87 #include <cam/scsi/scsi_message.h>
88 
89 #include "common/common.h"
90 #include "common/t4_msg.h"
91 #include "common/t4_regs.h"     /* for PCIE_MEM_ACCESS */
92 #include "tom/t4_tom.h"
93 #include "cxgbei.h"
94 
/* Number of usable entries in cwt_softc; set by start_worker_threads(). */
static int worker_thread_count;
/* Per-thread state for the rx worker threads, indexed by thread number. */
static struct cxgbei_worker_thread_softc *cwt_softc;
/* Process handle passed to kproc_kthread_add() for the worker threads. */
static struct proc *cxgbei_proc;

/*
 * XXXNP: these are not defined in this file; the prototypes belong in some
 * shared header instead.
 */
struct icl_pdu *icl_cxgbei_new_pdu(int flags);
void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic);
void icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip);
103 
104 static void
105 free_ci_counters(struct cxgbei_data *ci)
106 {
107 
108 #define FREE_CI_COUNTER(x) do { \
109 	if (ci->x != NULL) { \
110 		counter_u64_free(ci->x); \
111 		ci->x = NULL; \
112 	} \
113 } while (0)
114 
115 	FREE_CI_COUNTER(ddp_setup_ok);
116 	FREE_CI_COUNTER(ddp_setup_error);
117 	FREE_CI_COUNTER(ddp_bytes);
118 	FREE_CI_COUNTER(ddp_pdus);
119 	FREE_CI_COUNTER(fl_bytes);
120 	FREE_CI_COUNTER(fl_pdus);
121 #undef FREE_CI_COUNTER
122 }
123 
124 static int
125 alloc_ci_counters(struct cxgbei_data *ci)
126 {
127 
128 #define ALLOC_CI_COUNTER(x) do { \
129 	ci->x = counter_u64_alloc(M_WAITOK); \
130 	if (ci->x == NULL) \
131 		goto fail; \
132 } while (0)
133 
134 	ALLOC_CI_COUNTER(ddp_setup_ok);
135 	ALLOC_CI_COUNTER(ddp_setup_error);
136 	ALLOC_CI_COUNTER(ddp_bytes);
137 	ALLOC_CI_COUNTER(ddp_pdus);
138 	ALLOC_CI_COUNTER(fl_bytes);
139 	ALLOC_CI_COUNTER(fl_pdus);
140 #undef ALLOC_CI_COUNTER
141 
142 	return (0);
143 fail:
144 	free_ci_counters(ci);
145 	return (ENOMEM);
146 }
147 
148 static void
149 read_pdu_limits(struct adapter *sc, uint32_t *max_tx_pdu_len,
150     uint32_t *max_rx_pdu_len)
151 {
152 	uint32_t tx_len, rx_len, r, v;
153 
154 	rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
155 	tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
156 
157 	r = t4_read_reg(sc, A_TP_PARA_REG2);
158 	rx_len = min(rx_len, G_MAXRXDATA(r));
159 	tx_len = min(tx_len, G_MAXRXDATA(r));
160 
161 	r = t4_read_reg(sc, A_TP_PARA_REG7);
162 	v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
163 	rx_len = min(rx_len, v);
164 	tx_len = min(tx_len, v);
165 
166 	/* Remove after FW_FLOWC_MNEM_TXDATAPLEN_MAX fix in firmware. */
167 	tx_len = min(tx_len, 3 * 4096);
168 
169 	*max_tx_pdu_len = rounddown2(tx_len, 512);
170 	*max_rx_pdu_len = rounddown2(rx_len, 512);
171 }
172 
173 /*
174  * Initialize the software state of the iSCSI ULP driver.
175  *
176  * ENXIO means firmware didn't set up something that it was supposed to.
177  */
178 static int
179 cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
180 {
181 	struct sysctl_oid *oid;
182 	struct sysctl_oid_list *children;
183 	struct ppod_region *pr;
184 	uint32_t r;
185 	int rc;
186 
187 	MPASS(sc->vres.iscsi.size > 0);
188 	MPASS(ci != NULL);
189 
190 	rc = alloc_ci_counters(ci);
191 	if (rc != 0)
192 		return (rc);
193 
194 	read_pdu_limits(sc, &ci->max_tx_pdu_len, &ci->max_rx_pdu_len);
195 
196 	pr = &ci->pr;
197 	r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
198 	rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
199 	if (rc != 0) {
200 		device_printf(sc->dev,
201 		    "%s: failed to initialize the iSCSI page pod region: %u.\n",
202 		    __func__, rc);
203 		free_ci_counters(ci);
204 		return (rc);
205 	}
206 
207 	r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK);
208 	r &= V_ISCSITAGMASK(M_ISCSITAGMASK);
209 	if (r != pr->pr_tag_mask) {
210 		/*
211 		 * Recent firmwares are supposed to set up the iSCSI tagmask
212 		 * but we'll do it ourselves it the computed value doesn't match
213 		 * what's in the register.
214 		 */
215 		device_printf(sc->dev,
216 		    "tagmask 0x%08x does not match computed mask 0x%08x.\n", r,
217 		    pr->pr_tag_mask);
218 		t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK,
219 		    V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask);
220 	}
221 
222 	sysctl_ctx_init(&ci->ctx);
223 	oid = device_get_sysctl_tree(sc->dev);	/* dev.t5nex.X */
224 	children = SYSCTL_CHILDREN(oid);
225 
226 	oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi",
227 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "iSCSI ULP statistics");
228 	children = SYSCTL_CHILDREN(oid);
229 
230 	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_setup_ok",
231 	    CTLFLAG_RD, &ci->ddp_setup_ok,
232 	    "# of times DDP buffer was setup successfully.");
233 
234 	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_setup_error",
235 	    CTLFLAG_RD, &ci->ddp_setup_error,
236 	    "# of times DDP buffer setup failed.");
237 
238 	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_bytes",
239 	    CTLFLAG_RD, &ci->ddp_bytes, "# of bytes placed directly");
240 
241 	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_pdus",
242 	    CTLFLAG_RD, &ci->ddp_pdus, "# of PDUs with data placed directly.");
243 
244 	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "fl_bytes",
245 	    CTLFLAG_RD, &ci->fl_bytes, "# of data bytes delivered in freelist");
246 
247 	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "fl_pdus",
248 	    CTLFLAG_RD, &ci->fl_pdus,
249 	    "# of PDUs with data delivered in freelist");
250 
251 	ci->ddp_threshold = 2048;
252 	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
253 	    CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");
254 
255 	return (0);
256 }
257 
258 static int
259 do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
260 {
261 	struct adapter *sc = iq->adapter;
262 	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
263 	u_int tid = GET_TID(cpl);
264 	struct toepcb *toep = lookup_tid(sc, tid);
265 	struct icl_pdu *ip;
266 	struct icl_cxgbei_pdu *icp;
267 	uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
268 	uint16_t len = be16toh(cpl->len);
269 
270 	M_ASSERTPKTHDR(m);
271 	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
272 
273 	ip = icl_cxgbei_new_pdu(M_NOWAIT);
274 	if (ip == NULL)
275 		CXGBE_UNIMPLEMENTED("PDU allocation failure");
276 	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
277 	ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
278 	icp = ip_to_icp(ip);
279 	icp->icp_seq = ntohl(cpl->seq);
280 	icp->icp_flags = ICPF_RX_HDR;
281 
282 	/* This is the start of a new PDU.  There should be no old state. */
283 	MPASS(toep->ulpcb2 == NULL);
284 	toep->ulpcb2 = icp;
285 
286 #if 0
287 	CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
288 	    __func__, tid, len, len_ddp, icp);
289 #endif
290 
291 	m_freem(m);
292 	return (0);
293 }
294 
295 static int
296 do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
297 {
298 	struct adapter *sc = iq->adapter;
299 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
300 	struct cpl_iscsi_data *cpl =  mtod(m, struct cpl_iscsi_data *);
301 	u_int tid = GET_TID(cpl);
302 	struct toepcb *toep = lookup_tid(sc, tid);
303 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
304 
305 	M_ASSERTPKTHDR(m);
306 	MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));
307 
308 	/* Must already have received the header (but not the data). */
309 	MPASS(icp != NULL);
310 	MPASS(icp->icp_flags == ICPF_RX_HDR);
311 	MPASS(icp->ip.ip_data_mbuf == NULL);
312 
313 
314 	m_adj(m, sizeof(*cpl));
315 	MPASS(icp->ip.ip_data_len == m->m_pkthdr.len);
316 
317 	icp->icp_flags |= ICPF_RX_FLBUF;
318 	icp->ip.ip_data_mbuf = m;
319 	counter_u64_add(ci->fl_pdus, 1);
320 	counter_u64_add(ci->fl_bytes, m->m_pkthdr.len);
321 
322 #if 0
323 	CTR3(KTR_CXGBE, "%s: tid %u, cpl->len %u", __func__, tid,
324 	    be16toh(cpl->len));
325 #endif
326 
327 	return (0);
328 }
329 
330 static int
331 do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
332 {
333 	struct adapter *sc = iq->adapter;
334 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
335 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
336 	u_int tid = GET_TID(cpl);
337 	struct toepcb *toep = lookup_tid(sc, tid);
338 	struct inpcb *inp = toep->inp;
339 	struct socket *so;
340 	struct sockbuf *sb;
341 	struct tcpcb *tp;
342 	struct icl_cxgbei_conn *icc;
343 	struct icl_conn *ic;
344 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
345 	struct icl_pdu *ip;
346 	u_int pdu_len, val;
347 	struct epoch_tracker et;
348 
349 	MPASS(m == NULL);
350 
351 	/* Must already be assembling a PDU. */
352 	MPASS(icp != NULL);
353 	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
354 	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
355 
356 	pdu_len = be16toh(cpl->len);	/* includes everything. */
357 	val = be32toh(cpl->ddpvld);
358 
359 #if 0
360 	CTR5(KTR_CXGBE,
361 	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
362 	    __func__, tid, pdu_len, val, icp->icp_flags);
363 #endif
364 
365 	icp->icp_flags |= ICPF_RX_STATUS;
366 	ip = &icp->ip;
367 	if (val & F_DDP_PADDING_ERR)
368 		icp->icp_flags |= ICPF_PAD_ERR;
369 	if (val & F_DDP_HDRCRC_ERR)
370 		icp->icp_flags |= ICPF_HCRC_ERR;
371 	if (val & F_DDP_DATACRC_ERR)
372 		icp->icp_flags |= ICPF_DCRC_ERR;
373 	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
374 		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
375 		MPASS(ip->ip_data_len > 0);
376 		icp->icp_flags |= ICPF_RX_DDP;
377 		counter_u64_add(ci->ddp_pdus, 1);
378 		counter_u64_add(ci->ddp_bytes, ip->ip_data_len);
379 	}
380 
381 	INP_WLOCK(inp);
382 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
383 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
384 		    __func__, tid, pdu_len, inp->inp_flags);
385 		INP_WUNLOCK(inp);
386 		icl_cxgbei_conn_pdu_free(NULL, ip);
387 #ifdef INVARIANTS
388 		toep->ulpcb2 = NULL;
389 #endif
390 		return (0);
391 	}
392 
393 	tp = intotcpcb(inp);
394 	MPASS(icp->icp_seq == tp->rcv_nxt);
395 	MPASS(tp->rcv_wnd >= pdu_len);
396 	tp->rcv_nxt += pdu_len;
397 	tp->rcv_wnd -= pdu_len;
398 	tp->t_rcvtime = ticks;
399 
400 	/* update rx credits */
401 	t4_rcvd(&toep->td->tod, tp);	/* XXX: sc->tom_softc.tod */
402 
403 	so = inp->inp_socket;
404 	sb = &so->so_rcv;
405 	SOCKBUF_LOCK(sb);
406 
407 	icc = toep->ulpcb;
408 	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
409 		CTR5(KTR_CXGBE,
410 		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
411 		    __func__, tid, pdu_len, icc, sb->sb_state);
412 		SOCKBUF_UNLOCK(sb);
413 		INP_WUNLOCK(inp);
414 
415 		NET_EPOCH_ENTER(et);
416 		INP_WLOCK(inp);
417 		tp = tcp_drop(tp, ECONNRESET);
418 		if (tp)
419 			INP_WUNLOCK(inp);
420 		NET_EPOCH_EXIT(et);
421 
422 		icl_cxgbei_conn_pdu_free(NULL, ip);
423 #ifdef INVARIANTS
424 		toep->ulpcb2 = NULL;
425 #endif
426 		return (0);
427 	}
428 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
429 	ic = &icc->ic;
430 	icl_cxgbei_new_pdu_set_conn(ip, ic);
431 
432 	MPASS(m == NULL); /* was unused, we'll use it now. */
433 	m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */
434 	if (__predict_false(m != NULL)) {
435 		int len = m_length(m, NULL);
436 
437 		/*
438 		 * PDUs were received before the tid transitioned to ULP mode.
439 		 * Convert them to icl_cxgbei_pdus and send them to ICL before
440 		 * the PDU in icp/ip.
441 		 */
442 		CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
443 		    len);
444 
445 		/* XXXNP: needs to be rewritten. */
446 		if (len == sizeof(struct iscsi_bhs) || len == 4 + sizeof(struct
447 		    iscsi_bhs)) {
448 			struct icl_cxgbei_pdu *icp0;
449 			struct icl_pdu *ip0;
450 
451 			ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
452 			if (ip0 == NULL)
453 				CXGBE_UNIMPLEMENTED("PDU allocation failure");
454 			icl_cxgbei_new_pdu_set_conn(ip0, ic);
455 			icp0 = ip_to_icp(ip0);
456 			icp0->icp_seq = 0; /* XXX */
457 			icp0->icp_flags = ICPF_RX_HDR | ICPF_RX_STATUS;
458 			m_copydata(m, 0, sizeof(struct iscsi_bhs), (void *)ip0->ip_bhs);
459 			STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
460 		}
461 		m_freem(m);
462 	}
463 
464 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
465 	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
466 		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];
467 
468 		mtx_lock(&cwt->cwt_lock);
469 		icc->rx_flags |= RXF_ACTIVE;
470 		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
471 		if (cwt->cwt_state == CWT_SLEEPING) {
472 			cwt->cwt_state = CWT_RUNNING;
473 			cv_signal(&cwt->cwt_cv);
474 		}
475 		mtx_unlock(&cwt->cwt_lock);
476 	}
477 	SOCKBUF_UNLOCK(sb);
478 	INP_WUNLOCK(inp);
479 
480 #ifdef INVARIANTS
481 	toep->ulpcb2 = NULL;
482 #endif
483 
484 	return (0);
485 }
486 
487 static int
488 cxgbei_activate(struct adapter *sc)
489 {
490 	struct cxgbei_data *ci;
491 	int rc;
492 
493 	ASSERT_SYNCHRONIZED_OP(sc);
494 
495 	if (uld_active(sc, ULD_ISCSI)) {
496 		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
497 		    __func__, sc));
498 		return (0);
499 	}
500 
501 	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
502 		device_printf(sc->dev,
503 		    "not iSCSI offload capable, or capability disabled.\n");
504 		return (ENOSYS);
505 	}
506 
507 	/* per-adapter softc for iSCSI */
508 	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);
509 	if (ci == NULL)
510 		return (ENOMEM);
511 
512 	rc = cxgbei_init(sc, ci);
513 	if (rc != 0) {
514 		free(ci, M_CXGBE);
515 		return (rc);
516 	}
517 
518 	sc->iscsi_ulp_softc = ci;
519 
520 	return (0);
521 }
522 
523 static int
524 cxgbei_deactivate(struct adapter *sc)
525 {
526 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
527 
528 	ASSERT_SYNCHRONIZED_OP(sc);
529 
530 	if (ci != NULL) {
531 		sysctl_ctx_free(&ci->ctx);
532 		t4_free_ppod_region(&ci->pr);
533 		free_ci_counters(ci);
534 		free(ci, M_CXGBE);
535 		sc->iscsi_ulp_softc = NULL;
536 	}
537 
538 	return (0);
539 }
540 
541 static void
542 cxgbei_activate_all(struct adapter *sc, void *arg __unused)
543 {
544 
545 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
546 		return;
547 
548 	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
549 	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
550 		(void) t4_activate_uld(sc, ULD_ISCSI);
551 
552 	end_synchronized_op(sc, 0);
553 }
554 
555 static void
556 cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
557 {
558 
559 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
560 		return;
561 
562 	if (uld_active(sc, ULD_ISCSI))
563 	    (void) t4_deactivate_uld(sc, ULD_ISCSI);
564 
565 	end_synchronized_op(sc, 0);
566 }
567 
568 static struct uld_info cxgbei_uld_info = {
569 	.uld_id = ULD_ISCSI,
570 	.activate = cxgbei_activate,
571 	.deactivate = cxgbei_deactivate,
572 };
573 
574 static void
575 cwt_main(void *arg)
576 {
577 	struct cxgbei_worker_thread_softc *cwt = arg;
578 	struct icl_cxgbei_conn *icc = NULL;
579 	struct icl_conn *ic;
580 	struct icl_pdu *ip;
581 	struct sockbuf *sb;
582 	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);
583 
584 	MPASS(cwt != NULL);
585 
586 	mtx_lock(&cwt->cwt_lock);
587 	MPASS(cwt->cwt_state == 0);
588 	cwt->cwt_state = CWT_RUNNING;
589 	cv_signal(&cwt->cwt_cv);
590 
591 	while (__predict_true(cwt->cwt_state != CWT_STOP)) {
592 		cwt->cwt_state = CWT_RUNNING;
593 		while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
594 			TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
595 			mtx_unlock(&cwt->cwt_lock);
596 
597 			ic = &icc->ic;
598 			sb = &ic->ic_socket->so_rcv;
599 
600 			SOCKBUF_LOCK(sb);
601 			MPASS(icc->rx_flags & RXF_ACTIVE);
602 			if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
603 				MPASS(STAILQ_EMPTY(&rx_pdus));
604 				STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
605 				SOCKBUF_UNLOCK(sb);
606 
607 				/* Hand over PDUs to ICL. */
608 				while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
609 					STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
610 					ic->ic_receive(ip);
611 				}
612 
613 				SOCKBUF_LOCK(sb);
614 				MPASS(STAILQ_EMPTY(&rx_pdus));
615 			}
616 			MPASS(icc->rx_flags & RXF_ACTIVE);
617 			if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
618 			    __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
619 				icc->rx_flags &= ~RXF_ACTIVE;
620 			} else {
621 				/*
622 				 * More PDUs were received while we were busy
623 				 * handing over the previous batch to ICL.
624 				 * Re-add this connection to the end of the
625 				 * queue.
626 				 */
627 				mtx_lock(&cwt->cwt_lock);
628 				TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
629 				    rx_link);
630 				mtx_unlock(&cwt->cwt_lock);
631 			}
632 			SOCKBUF_UNLOCK(sb);
633 
634 			mtx_lock(&cwt->cwt_lock);
635 		}
636 
637 		/* Inner loop doesn't check for CWT_STOP, do that first. */
638 		if (__predict_false(cwt->cwt_state == CWT_STOP))
639 			break;
640 		cwt->cwt_state = CWT_SLEEPING;
641 		cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
642 	}
643 
644 	MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
645 	mtx_assert(&cwt->cwt_lock, MA_OWNED);
646 	cwt->cwt_state = CWT_STOPPED;
647 	cv_signal(&cwt->cwt_cv);
648 	mtx_unlock(&cwt->cwt_lock);
649 	kthread_exit();
650 }
651 
652 static int
653 start_worker_threads(void)
654 {
655 	int i, rc;
656 	struct cxgbei_worker_thread_softc *cwt;
657 
658 	worker_thread_count = min(mp_ncpus, 32);
659 	cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
660 	    M_WAITOK | M_ZERO);
661 
662 	MPASS(cxgbei_proc == NULL);
663 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
664 		mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
665 		cv_init(&cwt->cwt_cv, "cwt cv");
666 		TAILQ_INIT(&cwt->rx_head);
667 		rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
668 		    "cxgbei", "%d", i);
669 		if (rc != 0) {
670 			printf("cxgbei: failed to start thread #%d/%d (%d)\n",
671 			    i + 1, worker_thread_count, rc);
672 			mtx_destroy(&cwt->cwt_lock);
673 			cv_destroy(&cwt->cwt_cv);
674 			bzero(cwt, sizeof(*cwt));
675 			if (i == 0) {
676 				free(cwt_softc, M_CXGBE);
677 				worker_thread_count = 0;
678 
679 				return (rc);
680 			}
681 
682 			/* Not fatal, carry on with fewer threads. */
683 			worker_thread_count = i;
684 			rc = 0;
685 			break;
686 		}
687 
688 		/* Wait for thread to start before moving on to the next one. */
689 		mtx_lock(&cwt->cwt_lock);
690 		while (cwt->cwt_state == 0)
691 			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
692 		mtx_unlock(&cwt->cwt_lock);
693 	}
694 
695 	MPASS(cwt_softc != NULL);
696 	MPASS(worker_thread_count > 0);
697 	return (0);
698 }
699 
700 static void
701 stop_worker_threads(void)
702 {
703 	int i;
704 	struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0];
705 
706 	MPASS(worker_thread_count >= 0);
707 
708 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
709 		mtx_lock(&cwt->cwt_lock);
710 		MPASS(cwt->cwt_state == CWT_RUNNING ||
711 		    cwt->cwt_state == CWT_SLEEPING);
712 		cwt->cwt_state = CWT_STOP;
713 		cv_signal(&cwt->cwt_cv);
714 		do {
715 			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
716 		} while (cwt->cwt_state != CWT_STOPPED);
717 		mtx_unlock(&cwt->cwt_lock);
718 		mtx_destroy(&cwt->cwt_lock);
719 		cv_destroy(&cwt->cwt_cv);
720 	}
721 	free(cwt_softc, M_CXGBE);
722 }
723 
724 /* Select a worker thread for a connection. */
725 u_int
726 cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
727 {
728 	struct adapter *sc = icc->sc;
729 	struct toepcb *toep = icc->toep;
730 	u_int i, n;
731 
732 	n = worker_thread_count / sc->sge.nofldrxq;
733 	if (n > 0)
734 		i = toep->vi->pi->port_id * n + arc4random() % n;
735 	else
736 		i = arc4random() % worker_thread_count;
737 
738 	CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);
739 
740 	return (i);
741 }
742 
743 static int
744 cxgbei_mod_load(void)
745 {
746 	int rc;
747 
748 	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
749 	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
750 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
751 
752 	rc = start_worker_threads();
753 	if (rc != 0)
754 		return (rc);
755 
756 	rc = t4_register_uld(&cxgbei_uld_info);
757 	if (rc != 0) {
758 		stop_worker_threads();
759 		return (rc);
760 	}
761 
762 	t4_iterate(cxgbei_activate_all, NULL);
763 
764 	return (rc);
765 }
766 
767 static int
768 cxgbei_mod_unload(void)
769 {
770 
771 	t4_iterate(cxgbei_deactivate_all, NULL);
772 
773 	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
774 		return (EBUSY);
775 
776 	stop_worker_threads();
777 
778 	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
779 	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
780 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
781 
782 	return (0);
783 }
784 #endif
785 
786 static int
787 cxgbei_modevent(module_t mod, int cmd, void *arg)
788 {
789 	int rc = 0;
790 
791 #ifdef TCP_OFFLOAD
792 	switch (cmd) {
793 	case MOD_LOAD:
794 		rc = cxgbei_mod_load();
795 		if (rc == 0)
796 			rc = icl_cxgbei_mod_load();
797 		break;
798 
799 	case MOD_UNLOAD:
800 		rc = icl_cxgbei_mod_unload();
801 		if (rc == 0)
802 			rc = cxgbei_mod_unload();
803 		break;
804 
805 	default:
806 		rc = EINVAL;
807 	}
808 #else
809 	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
810 	rc = EOPNOTSUPP;
811 #endif
812 
813 	return (rc);
814 }
815 
816 static moduledata_t cxgbei_mod = {
817 	"cxgbei",
818 	cxgbei_modevent,
819 	NULL,
820 };
821 
822 MODULE_VERSION(cxgbei, 1);
823 DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
824 MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
825 MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
826 MODULE_DEPEND(cxgbei, icl, 1, 1, 1);
827