xref: /freebsd/sys/dev/cxgbe/cxgbei/cxgbei.c (revision 076ad2f8)
1 /*-
2  * Copyright (c) 2012 Chelsio Communications, Inc.
3  * All rights reserved.
4  *
5  * Chelsio T5xx iSCSI driver
6  *
7  * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_inet.h"
35 #include "opt_inet6.h"
36 
37 #include <sys/types.h>
38 #include <sys/param.h>
39 #include <sys/kernel.h>
40 #include <sys/module.h>
41 #include <sys/systm.h>
42 
43 #ifdef TCP_OFFLOAD
44 #include <sys/errno.h>
45 #include <sys/kthread.h>
46 #include <sys/smp.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/mbuf.h>
50 #include <sys/lock.h>
51 #include <sys/mutex.h>
52 #include <sys/condvar.h>
53 
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/toecore.h>
57 #include <netinet/tcp_var.h>
58 #include <netinet/tcp_fsm.h>
59 
60 #include <cam/scsi/scsi_all.h>
61 #include <cam/scsi/scsi_da.h>
62 #include <cam/ctl/ctl_io.h>
63 #include <cam/ctl/ctl.h>
64 #include <cam/ctl/ctl_backend.h>
65 #include <cam/ctl/ctl_error.h>
66 #include <cam/ctl/ctl_frontend.h>
67 #include <cam/ctl/ctl_debug.h>
68 #include <cam/ctl/ctl_ha.h>
69 #include <cam/ctl/ctl_ioctl.h>
70 
71 #include <dev/iscsi/icl.h>
72 #include <dev/iscsi/iscsi_proto.h>
73 #include <dev/iscsi/iscsi_ioctl.h>
74 #include <dev/iscsi/iscsi.h>
75 #include <cam/ctl/ctl_frontend_iscsi.h>
76 
77 #include <cam/cam.h>
78 #include <cam/cam_ccb.h>
79 #include <cam/cam_xpt.h>
80 #include <cam/cam_debug.h>
81 #include <cam/cam_sim.h>
82 #include <cam/cam_xpt_sim.h>
83 #include <cam/cam_xpt_periph.h>
84 #include <cam/cam_periph.h>
85 #include <cam/cam_compat.h>
86 #include <cam/scsi/scsi_message.h>
87 
88 #include "common/common.h"
89 #include "common/t4_msg.h"
90 #include "common/t4_regs.h"     /* for PCIE_MEM_ACCESS */
91 #include "tom/t4_tom.h"
92 #include "cxgbei.h"
93 
/*
 * Worker thread bookkeeping: the process handle handed to
 * kproc_kthread_add(), the array of per-thread state, and the number of
 * usable entries in that array.
 */
static struct proc *cxgbei_proc;
static struct cxgbei_worker_thread_softc *cwt_softc;
static int worker_thread_count;

/*
 * PDU helpers that are used here but not defined in this file.
 * XXXNP: these prototypes should come from some header instead.
 */
struct icl_pdu *icl_cxgbei_new_pdu(int flags);
void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic);
void icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip);
102 
103 static void
104 free_ci_counters(struct cxgbei_data *ci)
105 {
106 
107 #define FREE_CI_COUNTER(x) do { \
108 	if (ci->x != NULL) { \
109 		counter_u64_free(ci->x); \
110 		ci->x = NULL; \
111 	} \
112 } while (0)
113 
114 	FREE_CI_COUNTER(ddp_setup_ok);
115 	FREE_CI_COUNTER(ddp_setup_error);
116 	FREE_CI_COUNTER(ddp_bytes);
117 	FREE_CI_COUNTER(ddp_pdus);
118 	FREE_CI_COUNTER(fl_bytes);
119 	FREE_CI_COUNTER(fl_pdus);
120 #undef FREE_CI_COUNTER
121 }
122 
123 static int
124 alloc_ci_counters(struct cxgbei_data *ci)
125 {
126 
127 #define ALLOC_CI_COUNTER(x) do { \
128 	ci->x = counter_u64_alloc(M_WAITOK); \
129 	if (ci->x == NULL) \
130 		goto fail; \
131 } while (0)
132 
133 	ALLOC_CI_COUNTER(ddp_setup_ok);
134 	ALLOC_CI_COUNTER(ddp_setup_error);
135 	ALLOC_CI_COUNTER(ddp_bytes);
136 	ALLOC_CI_COUNTER(ddp_pdus);
137 	ALLOC_CI_COUNTER(fl_bytes);
138 	ALLOC_CI_COUNTER(fl_pdus);
139 #undef ALLOC_CI_COUNTER
140 
141 	return (0);
142 fail:
143 	free_ci_counters(ci);
144 	return (ENOMEM);
145 }
146 
147 static void
148 read_pdu_limits(struct adapter *sc, uint32_t *max_tx_pdu_len,
149     uint32_t *max_rx_pdu_len)
150 {
151 	uint32_t tx_len, rx_len, r, v;
152 
153 	rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
154 	tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
155 
156 	r = t4_read_reg(sc, A_TP_PARA_REG2);
157 	rx_len = min(rx_len, G_MAXRXDATA(r));
158 	tx_len = min(tx_len, G_MAXRXDATA(r));
159 
160 	r = t4_read_reg(sc, A_TP_PARA_REG7);
161 	v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
162 	rx_len = min(rx_len, v);
163 	tx_len = min(tx_len, v);
164 
165 	/* Remove after FW_FLOWC_MNEM_TXDATAPLEN_MAX fix in firmware. */
166 	tx_len = min(tx_len, 3 * 4096);
167 
168 	*max_tx_pdu_len = rounddown2(tx_len, 512);
169 	*max_rx_pdu_len = rounddown2(rx_len, 512);
170 }
171 
172 /*
173  * Initialize the software state of the iSCSI ULP driver.
174  *
175  * ENXIO means firmware didn't set up something that it was supposed to.
176  */
177 static int
178 cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
179 {
180 	struct sysctl_oid *oid;
181 	struct sysctl_oid_list *children;
182 	struct ppod_region *pr;
183 	uint32_t r;
184 	int rc;
185 
186 	MPASS(sc->vres.iscsi.size > 0);
187 	MPASS(ci != NULL);
188 
189 	rc = alloc_ci_counters(ci);
190 	if (rc != 0)
191 		return (rc);
192 
193 	read_pdu_limits(sc, &ci->max_tx_pdu_len, &ci->max_rx_pdu_len);
194 
195 	pr = &ci->pr;
196 	r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
197 	rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
198 	if (rc != 0) {
199 		device_printf(sc->dev,
200 		    "%s: failed to initialize the iSCSI page pod region: %u.\n",
201 		    __func__, rc);
202 		free_ci_counters(ci);
203 		return (rc);
204 	}
205 
206 	r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK);
207 	r &= V_ISCSITAGMASK(M_ISCSITAGMASK);
208 	if (r != pr->pr_tag_mask) {
209 		/*
210 		 * Recent firmwares are supposed to set up the iSCSI tagmask
211 		 * but we'll do it ourselves it the computed value doesn't match
212 		 * what's in the register.
213 		 */
214 		device_printf(sc->dev,
215 		    "tagmask 0x%08x does not match computed mask 0x%08x.\n", r,
216 		    pr->pr_tag_mask);
217 		t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK,
218 		    V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask);
219 	}
220 
221 	sysctl_ctx_init(&ci->ctx);
222 	oid = device_get_sysctl_tree(sc->dev);	/* dev.t5nex.X */
223 	children = SYSCTL_CHILDREN(oid);
224 
225 	oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi", CTLFLAG_RD,
226 	    NULL, "iSCSI ULP statistics");
227 	children = SYSCTL_CHILDREN(oid);
228 
229 	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_setup_ok",
230 	    CTLFLAG_RD, &ci->ddp_setup_ok,
231 	    "# of times DDP buffer was setup successfully.");
232 
233 	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_setup_error",
234 	    CTLFLAG_RD, &ci->ddp_setup_error,
235 	    "# of times DDP buffer setup failed.");
236 
237 	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_bytes",
238 	    CTLFLAG_RD, &ci->ddp_bytes, "# of bytes placed directly");
239 
240 	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_pdus",
241 	    CTLFLAG_RD, &ci->ddp_pdus, "# of PDUs with data placed directly.");
242 
243 	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "fl_bytes",
244 	    CTLFLAG_RD, &ci->fl_bytes, "# of data bytes delivered in freelist");
245 
246 	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "fl_pdus",
247 	    CTLFLAG_RD, &ci->fl_pdus,
248 	    "# of PDUs with data delivered in freelist");
249 
250 	ci->ddp_threshold = 2048;
251 	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
252 	    CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");
253 
254 	return (0);
255 }
256 
257 static int
258 do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
259 {
260 	struct adapter *sc = iq->adapter;
261 	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
262 	u_int tid = GET_TID(cpl);
263 	struct toepcb *toep = lookup_tid(sc, tid);
264 	struct icl_pdu *ip;
265 	struct icl_cxgbei_pdu *icp;
266 	uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
267 	uint16_t len = be16toh(cpl->len);
268 
269 	M_ASSERTPKTHDR(m);
270 	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
271 
272 	ip = icl_cxgbei_new_pdu(M_NOWAIT);
273 	if (ip == NULL)
274 		CXGBE_UNIMPLEMENTED("PDU allocation failure");
275 	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
276 	ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
277 	icp = ip_to_icp(ip);
278 	icp->icp_seq = ntohl(cpl->seq);
279 	icp->icp_flags = ICPF_RX_HDR;
280 
281 	/* This is the start of a new PDU.  There should be no old state. */
282 	MPASS(toep->ulpcb2 == NULL);
283 	toep->ulpcb2 = icp;
284 
285 #if 0
286 	CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
287 	    __func__, tid, len, len_ddp, icp);
288 #endif
289 
290 	m_freem(m);
291 	return (0);
292 }
293 
294 static int
295 do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
296 {
297 	struct adapter *sc = iq->adapter;
298 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
299 	struct cpl_iscsi_data *cpl =  mtod(m, struct cpl_iscsi_data *);
300 	u_int tid = GET_TID(cpl);
301 	struct toepcb *toep = lookup_tid(sc, tid);
302 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
303 
304 	M_ASSERTPKTHDR(m);
305 	MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));
306 
307 	/* Must already have received the header (but not the data). */
308 	MPASS(icp != NULL);
309 	MPASS(icp->icp_flags == ICPF_RX_HDR);
310 	MPASS(icp->ip.ip_data_mbuf == NULL);
311 
312 
313 	m_adj(m, sizeof(*cpl));
314 	MPASS(icp->ip.ip_data_len == m->m_pkthdr.len);
315 
316 	icp->icp_flags |= ICPF_RX_FLBUF;
317 	icp->ip.ip_data_mbuf = m;
318 	counter_u64_add(ci->fl_pdus, 1);
319 	counter_u64_add(ci->fl_bytes, m->m_pkthdr.len);
320 
321 #if 0
322 	CTR3(KTR_CXGBE, "%s: tid %u, cpl->len %u", __func__, tid,
323 	    be16toh(cpl->len));
324 #endif
325 
326 	return (0);
327 }
328 
329 static int
330 do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
331 {
332 	struct adapter *sc = iq->adapter;
333 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
334 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
335 	u_int tid = GET_TID(cpl);
336 	struct toepcb *toep = lookup_tid(sc, tid);
337 	struct inpcb *inp = toep->inp;
338 	struct socket *so;
339 	struct sockbuf *sb;
340 	struct tcpcb *tp;
341 	struct icl_cxgbei_conn *icc;
342 	struct icl_conn *ic;
343 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
344 	struct icl_pdu *ip;
345 	u_int pdu_len, val;
346 
347 	MPASS(m == NULL);
348 
349 	/* Must already be assembling a PDU. */
350 	MPASS(icp != NULL);
351 	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
352 	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
353 
354 	pdu_len = be16toh(cpl->len);	/* includes everything. */
355 	val = be32toh(cpl->ddpvld);
356 
357 #if 0
358 	CTR5(KTR_CXGBE,
359 	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
360 	    __func__, tid, pdu_len, val, icp->icp_flags);
361 #endif
362 
363 	icp->icp_flags |= ICPF_RX_STATUS;
364 	ip = &icp->ip;
365 	if (val & F_DDP_PADDING_ERR)
366 		icp->icp_flags |= ICPF_PAD_ERR;
367 	if (val & F_DDP_HDRCRC_ERR)
368 		icp->icp_flags |= ICPF_HCRC_ERR;
369 	if (val & F_DDP_DATACRC_ERR)
370 		icp->icp_flags |= ICPF_DCRC_ERR;
371 	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
372 		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
373 		MPASS(ip->ip_data_len > 0);
374 		icp->icp_flags |= ICPF_RX_DDP;
375 		counter_u64_add(ci->ddp_pdus, 1);
376 		counter_u64_add(ci->ddp_bytes, ip->ip_data_len);
377 	}
378 
379 	INP_WLOCK(inp);
380 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
381 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
382 		    __func__, tid, pdu_len, inp->inp_flags);
383 		INP_WUNLOCK(inp);
384 		icl_cxgbei_conn_pdu_free(NULL, ip);
385 #ifdef INVARIANTS
386 		toep->ulpcb2 = NULL;
387 #endif
388 		return (0);
389 	}
390 
391 	tp = intotcpcb(inp);
392 	MPASS(icp->icp_seq == tp->rcv_nxt);
393 	MPASS(tp->rcv_wnd >= pdu_len);
394 	tp->rcv_nxt += pdu_len;
395 	tp->rcv_wnd -= pdu_len;
396 	tp->t_rcvtime = ticks;
397 
398 	/* update rx credits */
399 	toep->rx_credits += pdu_len;
400 	t4_rcvd(&toep->td->tod, tp);	/* XXX: sc->tom_softc.tod */
401 
402 	so = inp->inp_socket;
403 	sb = &so->so_rcv;
404 	SOCKBUF_LOCK(sb);
405 
406 	icc = toep->ulpcb;
407 	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
408 		CTR5(KTR_CXGBE,
409 		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
410 		    __func__, tid, pdu_len, icc, sb->sb_state);
411 		SOCKBUF_UNLOCK(sb);
412 		INP_WUNLOCK(inp);
413 
414 		INP_INFO_RLOCK(&V_tcbinfo);
415 		INP_WLOCK(inp);
416 		tp = tcp_drop(tp, ECONNRESET);
417 		if (tp)
418 			INP_WUNLOCK(inp);
419 		INP_INFO_RUNLOCK(&V_tcbinfo);
420 
421 		icl_cxgbei_conn_pdu_free(NULL, ip);
422 #ifdef INVARIANTS
423 		toep->ulpcb2 = NULL;
424 #endif
425 		return (0);
426 	}
427 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
428 	ic = &icc->ic;
429 	icl_cxgbei_new_pdu_set_conn(ip, ic);
430 
431 	MPASS(m == NULL); /* was unused, we'll use it now. */
432 	m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */
433 	if (__predict_false(m != NULL)) {
434 		int len = m_length(m, NULL);
435 
436 		/*
437 		 * PDUs were received before the tid transitioned to ULP mode.
438 		 * Convert them to icl_cxgbei_pdus and send them to ICL before
439 		 * the PDU in icp/ip.
440 		 */
441 		CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
442 		    len);
443 
444 		/* XXXNP: needs to be rewritten. */
445 		if (len == sizeof(struct iscsi_bhs) || len == 4 + sizeof(struct
446 		    iscsi_bhs)) {
447 			struct icl_cxgbei_pdu *icp0;
448 			struct icl_pdu *ip0;
449 
450 			ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
451 			icl_cxgbei_new_pdu_set_conn(ip0, ic);
452 			if (ip0 == NULL)
453 				CXGBE_UNIMPLEMENTED("PDU allocation failure");
454 			icp0 = ip_to_icp(ip0);
455 			icp0->icp_seq = 0; /* XXX */
456 			icp0->icp_flags = ICPF_RX_HDR | ICPF_RX_STATUS;
457 			m_copydata(m, 0, sizeof(struct iscsi_bhs), (void *)ip0->ip_bhs);
458 			STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
459 		}
460 		m_freem(m);
461 	}
462 
463 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
464 	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
465 		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];
466 
467 		mtx_lock(&cwt->cwt_lock);
468 		icc->rx_flags |= RXF_ACTIVE;
469 		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
470 		if (cwt->cwt_state == CWT_SLEEPING) {
471 			cwt->cwt_state = CWT_RUNNING;
472 			cv_signal(&cwt->cwt_cv);
473 		}
474 		mtx_unlock(&cwt->cwt_lock);
475 	}
476 	SOCKBUF_UNLOCK(sb);
477 	INP_WUNLOCK(inp);
478 
479 #ifdef INVARIANTS
480 	toep->ulpcb2 = NULL;
481 #endif
482 
483 	return (0);
484 }
485 
486 static int
487 cxgbei_activate(struct adapter *sc)
488 {
489 	struct cxgbei_data *ci;
490 	int rc;
491 
492 	ASSERT_SYNCHRONIZED_OP(sc);
493 
494 	if (uld_active(sc, ULD_ISCSI)) {
495 		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
496 		    __func__, sc));
497 		return (0);
498 	}
499 
500 	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
501 		device_printf(sc->dev,
502 		    "not iSCSI offload capable, or capability disabled.\n");
503 		return (ENOSYS);
504 	}
505 
506 	/* per-adapter softc for iSCSI */
507 	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);
508 	if (ci == NULL)
509 		return (ENOMEM);
510 
511 	rc = cxgbei_init(sc, ci);
512 	if (rc != 0) {
513 		free(ci, M_CXGBE);
514 		return (rc);
515 	}
516 
517 	sc->iscsi_ulp_softc = ci;
518 
519 	return (0);
520 }
521 
522 static int
523 cxgbei_deactivate(struct adapter *sc)
524 {
525 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
526 
527 	ASSERT_SYNCHRONIZED_OP(sc);
528 
529 	if (ci != NULL) {
530 		sysctl_ctx_free(&ci->ctx);
531 		t4_free_ppod_region(&ci->pr);
532 		free_ci_counters(ci);
533 		free(ci, M_CXGBE);
534 		sc->iscsi_ulp_softc = NULL;
535 	}
536 
537 	return (0);
538 }
539 
540 static void
541 cxgbei_activate_all(struct adapter *sc, void *arg __unused)
542 {
543 
544 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
545 		return;
546 
547 	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
548 	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
549 		(void) t4_activate_uld(sc, ULD_ISCSI);
550 
551 	end_synchronized_op(sc, 0);
552 }
553 
554 static void
555 cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
556 {
557 
558 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
559 		return;
560 
561 	if (uld_active(sc, ULD_ISCSI))
562 	    (void) t4_deactivate_uld(sc, ULD_ISCSI);
563 
564 	end_synchronized_op(sc, 0);
565 }
566 
567 static struct uld_info cxgbei_uld_info = {
568 	.uld_id = ULD_ISCSI,
569 	.activate = cxgbei_activate,
570 	.deactivate = cxgbei_deactivate,
571 };
572 
573 static void
574 cwt_main(void *arg)
575 {
576 	struct cxgbei_worker_thread_softc *cwt = arg;
577 	struct icl_cxgbei_conn *icc = NULL;
578 	struct icl_conn *ic;
579 	struct icl_pdu *ip;
580 	struct sockbuf *sb;
581 	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);
582 
583 	MPASS(cwt != NULL);
584 
585 	mtx_lock(&cwt->cwt_lock);
586 	MPASS(cwt->cwt_state == 0);
587 	cwt->cwt_state = CWT_RUNNING;
588 	cv_signal(&cwt->cwt_cv);
589 
590 	while (__predict_true(cwt->cwt_state != CWT_STOP)) {
591 		cwt->cwt_state = CWT_RUNNING;
592 		while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
593 			TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
594 			mtx_unlock(&cwt->cwt_lock);
595 
596 			ic = &icc->ic;
597 			sb = &ic->ic_socket->so_rcv;
598 
599 			SOCKBUF_LOCK(sb);
600 			MPASS(icc->rx_flags & RXF_ACTIVE);
601 			if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
602 				MPASS(STAILQ_EMPTY(&rx_pdus));
603 				STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
604 				SOCKBUF_UNLOCK(sb);
605 
606 				/* Hand over PDUs to ICL. */
607 				while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
608 					STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
609 					ic->ic_receive(ip);
610 				}
611 
612 				SOCKBUF_LOCK(sb);
613 				MPASS(STAILQ_EMPTY(&rx_pdus));
614 			}
615 			MPASS(icc->rx_flags & RXF_ACTIVE);
616 			if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
617 			    __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
618 				icc->rx_flags &= ~RXF_ACTIVE;
619 			} else {
620 				/*
621 				 * More PDUs were received while we were busy
622 				 * handing over the previous batch to ICL.
623 				 * Re-add this connection to the end of the
624 				 * queue.
625 				 */
626 				mtx_lock(&cwt->cwt_lock);
627 				TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
628 				    rx_link);
629 				mtx_unlock(&cwt->cwt_lock);
630 			}
631 			SOCKBUF_UNLOCK(sb);
632 
633 			mtx_lock(&cwt->cwt_lock);
634 		}
635 
636 		/* Inner loop doesn't check for CWT_STOP, do that first. */
637 		if (__predict_false(cwt->cwt_state == CWT_STOP))
638 			break;
639 		cwt->cwt_state = CWT_SLEEPING;
640 		cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
641 	}
642 
643 	MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
644 	mtx_assert(&cwt->cwt_lock, MA_OWNED);
645 	cwt->cwt_state = CWT_STOPPED;
646 	cv_signal(&cwt->cwt_cv);
647 	mtx_unlock(&cwt->cwt_lock);
648 	kthread_exit();
649 }
650 
651 static int
652 start_worker_threads(void)
653 {
654 	int i, rc;
655 	struct cxgbei_worker_thread_softc *cwt;
656 
657 	worker_thread_count = min(mp_ncpus, 32);
658 	cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
659 	    M_WAITOK | M_ZERO);
660 
661 	MPASS(cxgbei_proc == NULL);
662 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
663 		mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
664 		cv_init(&cwt->cwt_cv, "cwt cv");
665 		TAILQ_INIT(&cwt->rx_head);
666 		rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
667 		    "cxgbei", "%d", i);
668 		if (rc != 0) {
669 			printf("cxgbei: failed to start thread #%d/%d (%d)\n",
670 			    i + 1, worker_thread_count, rc);
671 			mtx_destroy(&cwt->cwt_lock);
672 			cv_destroy(&cwt->cwt_cv);
673 			bzero(&cwt, sizeof(*cwt));
674 			if (i == 0) {
675 				free(cwt_softc, M_CXGBE);
676 				worker_thread_count = 0;
677 
678 				return (rc);
679 			}
680 
681 			/* Not fatal, carry on with fewer threads. */
682 			worker_thread_count = i;
683 			rc = 0;
684 			break;
685 		}
686 
687 		/* Wait for thread to start before moving on to the next one. */
688 		mtx_lock(&cwt->cwt_lock);
689 		while (cwt->cwt_state == 0)
690 			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
691 		mtx_unlock(&cwt->cwt_lock);
692 	}
693 
694 	MPASS(cwt_softc != NULL);
695 	MPASS(worker_thread_count > 0);
696 	return (0);
697 }
698 
699 static void
700 stop_worker_threads(void)
701 {
702 	int i;
703 	struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0];
704 
705 	MPASS(worker_thread_count >= 0);
706 
707 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
708 		mtx_lock(&cwt->cwt_lock);
709 		MPASS(cwt->cwt_state == CWT_RUNNING ||
710 		    cwt->cwt_state == CWT_SLEEPING);
711 		cwt->cwt_state = CWT_STOP;
712 		cv_signal(&cwt->cwt_cv);
713 		do {
714 			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
715 		} while (cwt->cwt_state != CWT_STOPPED);
716 		mtx_unlock(&cwt->cwt_lock);
717 	}
718 	free(cwt_softc, M_CXGBE);
719 }
720 
721 /* Select a worker thread for a connection. */
722 u_int
723 cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
724 {
725 	struct adapter *sc = icc->sc;
726 	struct toepcb *toep = icc->toep;
727 	u_int i, n;
728 
729 	n = worker_thread_count / sc->sge.nofldrxq;
730 	if (n > 0)
731 		i = toep->vi->pi->port_id * n + arc4random() % n;
732 	else
733 		i = arc4random() % worker_thread_count;
734 
735 	CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);
736 
737 	return (i);
738 }
739 
740 static int
741 cxgbei_mod_load(void)
742 {
743 	int rc;
744 
745 	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
746 	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
747 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
748 
749 	rc = start_worker_threads();
750 	if (rc != 0)
751 		return (rc);
752 
753 	rc = t4_register_uld(&cxgbei_uld_info);
754 	if (rc != 0) {
755 		stop_worker_threads();
756 		return (rc);
757 	}
758 
759 	t4_iterate(cxgbei_activate_all, NULL);
760 
761 	return (rc);
762 }
763 
764 static int
765 cxgbei_mod_unload(void)
766 {
767 
768 	t4_iterate(cxgbei_deactivate_all, NULL);
769 
770 	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
771 		return (EBUSY);
772 
773 	stop_worker_threads();
774 
775 	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
776 	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
777 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
778 
779 	return (0);
780 }
781 #endif
782 
783 static int
784 cxgbei_modevent(module_t mod, int cmd, void *arg)
785 {
786 	int rc = 0;
787 
788 #ifdef TCP_OFFLOAD
789 	switch (cmd) {
790 	case MOD_LOAD:
791 		rc = cxgbei_mod_load();
792 		if (rc == 0)
793 			rc = icl_cxgbei_mod_load();
794 		break;
795 
796 	case MOD_UNLOAD:
797 		rc = icl_cxgbei_mod_unload();
798 		if (rc == 0)
799 			rc = cxgbei_mod_unload();
800 		break;
801 
802 	default:
803 		rc = EINVAL;
804 	}
805 #else
806 	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
807 	rc = EOPNOTSUPP;
808 #endif
809 
810 	return (rc);
811 }
812 
813 static moduledata_t cxgbei_mod = {
814 	"cxgbei",
815 	cxgbei_modevent,
816 	NULL,
817 };
818 
819 MODULE_VERSION(cxgbei, 1);
820 DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
821 MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
822 MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
823 MODULE_DEPEND(cxgbei, icl, 1, 1, 1);
824