xref: /freebsd/sys/dev/cxgbe/cxgbei/cxgbei.c (revision 685dc743)
1 /*-
2  * Copyright (c) 2012 Chelsio Communications, Inc.
3  * All rights reserved.
4  *
5  * Chelsio T5xx iSCSI driver
6  *
7  * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 #include "opt_inet.h"
33 #include "opt_inet6.h"
34 
35 #include <sys/types.h>
36 #include <sys/param.h>
37 #include <sys/kernel.h>
38 #include <sys/ktr.h>
39 #include <sys/module.h>
40 #include <sys/systm.h>
41 
42 #ifdef TCP_OFFLOAD
43 #include <sys/errno.h>
44 #include <sys/gsb_crc32.h>
45 #include <sys/kthread.h>
46 #include <sys/smp.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/mbuf.h>
50 #include <sys/lock.h>
51 #include <sys/mutex.h>
52 #include <sys/condvar.h>
53 #include <sys/uio.h>
54 
55 #include <netinet/in.h>
56 #include <netinet/in_pcb.h>
57 #include <netinet/toecore.h>
58 #include <netinet/tcp_var.h>
59 #include <netinet/tcp_fsm.h>
60 
61 #include <cam/scsi/scsi_all.h>
62 #include <cam/scsi/scsi_da.h>
63 #include <cam/ctl/ctl_io.h>
64 #include <cam/ctl/ctl.h>
65 #include <cam/ctl/ctl_backend.h>
66 #include <cam/ctl/ctl_error.h>
67 #include <cam/ctl/ctl_frontend.h>
68 #include <cam/ctl/ctl_debug.h>
69 #include <cam/ctl/ctl_ha.h>
70 #include <cam/ctl/ctl_ioctl.h>
71 
72 #include <dev/iscsi/icl.h>
73 #include <dev/iscsi/iscsi_proto.h>
74 #include <dev/iscsi/iscsi_ioctl.h>
75 #include <dev/iscsi/iscsi.h>
76 #include <cam/ctl/ctl_frontend_iscsi.h>
77 
78 #include <cam/cam.h>
79 #include <cam/cam_ccb.h>
80 #include <cam/cam_xpt.h>
81 #include <cam/cam_debug.h>
82 #include <cam/cam_sim.h>
83 #include <cam/cam_xpt_sim.h>
84 #include <cam/cam_xpt_periph.h>
85 #include <cam/cam_periph.h>
86 #include <cam/cam_compat.h>
87 #include <cam/scsi/scsi_message.h>
88 
89 #include "common/common.h"
90 #include "common/t4_msg.h"
91 #include "common/t4_regs.h"	/* for PCIE_MEM_ACCESS */
92 #include "tom/t4_tom.h"
93 #include "cxgbei.h"
94 
95 static void
96 read_pdu_limits(struct adapter *sc, uint32_t *max_tx_data_len,
97     uint32_t *max_rx_data_len, struct ppod_region *pr)
98 {
99 	uint32_t tx_len, rx_len, r, v;
100 
101 	rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
102 	tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
103 
104 	r = t4_read_reg(sc, A_TP_PARA_REG2);
105 	rx_len = min(rx_len, G_MAXRXDATA(r));
106 	tx_len = min(tx_len, G_MAXRXDATA(r));
107 
108 	r = t4_read_reg(sc, A_TP_PARA_REG7);
109 	v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
110 	rx_len = min(rx_len, v);
111 	tx_len = min(tx_len, v);
112 
113 	/*
114 	 * AHS is not supported by the kernel so we'll not account for
115 	 * it either in our PDU len -> data segment len conversions.
116 	 */
117 	rx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
118 	    ISCSI_DATA_DIGEST_SIZE;
119 	tx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
120 	    ISCSI_DATA_DIGEST_SIZE;
121 
122 	/*
123 	 * DDP can place only 4 pages for a single PDU.  A single
124 	 * request might use larger pages than the smallest page size,
125 	 * but that cannot be guaranteed.  Assume the smallest DDP
126 	 * page size for this limit.
127 	 */
128 	rx_len = min(rx_len, 4 * (1U << pr->pr_page_shift[0]));
129 
130 	if (chip_id(sc) == CHELSIO_T5) {
131 		tx_len = min(tx_len, 15360);
132 
133 		rx_len = rounddown2(rx_len, 512);
134 		tx_len = rounddown2(tx_len, 512);
135 	}
136 
137 	*max_tx_data_len = tx_len;
138 	*max_rx_data_len = rx_len;
139 }
140 
141 /*
142  * Initialize the software state of the iSCSI ULP driver.
143  *
144  * ENXIO means firmware didn't set up something that it was supposed to.
145  */
146 static int
147 cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
148 {
149 	struct sysctl_oid *oid;
150 	struct sysctl_oid_list *children;
151 	struct ppod_region *pr;
152 	uint32_t r;
153 	int rc;
154 
155 	MPASS(sc->vres.iscsi.size > 0);
156 	MPASS(ci != NULL);
157 
158 	pr = &ci->pr;
159 	r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
160 	rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
161 	if (rc != 0) {
162 		device_printf(sc->dev,
163 		    "%s: failed to initialize the iSCSI page pod region: %u.\n",
164 		    __func__, rc);
165 		return (rc);
166 	}
167 
168 	read_pdu_limits(sc, &ci->max_tx_data_len, &ci->max_rx_data_len, pr);
169 
170 	sysctl_ctx_init(&ci->ctx);
171 	oid = device_get_sysctl_tree(sc->dev);	/* dev.t5nex.X */
172 	children = SYSCTL_CHILDREN(oid);
173 
174 	oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi",
175 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "iSCSI ULP settings");
176 	children = SYSCTL_CHILDREN(oid);
177 
178 	ci->ddp_threshold = 2048;
179 	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
180 	    CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");
181 
182 	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_rx_data_len",
183 	    CTLFLAG_RW, &ci->max_rx_data_len, 0,
184 	    "Maximum receive data segment length");
185 	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_tx_data_len",
186 	    CTLFLAG_RW, &ci->max_tx_data_len, 0,
187 	    "Maximum transmit data segment length");
188 
189 	return (0);
190 }
191 
192 static int
193 do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
194 {
195 	struct adapter *sc = iq->adapter;
196 	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
197 	u_int tid = GET_TID(cpl);
198 	struct toepcb *toep = lookup_tid(sc, tid);
199 	struct icl_pdu *ip;
200 	struct icl_cxgbei_pdu *icp;
201 	uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
202 	uint16_t len = be16toh(cpl->len);
203 
204 	M_ASSERTPKTHDR(m);
205 	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
206 
207 	ip = icl_cxgbei_new_pdu(M_NOWAIT);
208 	if (ip == NULL)
209 		CXGBE_UNIMPLEMENTED("PDU allocation failure");
210 	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
211 	ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
212 	icp = ip_to_icp(ip);
213 	icp->icp_seq = ntohl(cpl->seq);
214 	icp->icp_flags = ICPF_RX_HDR;
215 
216 	/* This is the start of a new PDU.  There should be no old state. */
217 	MPASS(toep->ulpcb2 == NULL);
218 	toep->ulpcb2 = icp;
219 
220 #if 0
221 	CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
222 	    __func__, tid, len, len_ddp, icp);
223 #endif
224 
225 	m_freem(m);
226 	return (0);
227 }
228 
229 static int
230 do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
231 {
232 	struct adapter *sc = iq->adapter;
233 	struct cpl_iscsi_data *cpl =  mtod(m, struct cpl_iscsi_data *);
234 	u_int tid = GET_TID(cpl);
235 	struct toepcb *toep = lookup_tid(sc, tid);
236 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
237 	struct icl_pdu *ip;
238 
239 	M_ASSERTPKTHDR(m);
240 	MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));
241 
242 	if (icp == NULL) {
243 		/*
244 		 * T6 completion enabled, start of a new pdu. Header
245 		 * will come in completion CPL.
246 		 */
247 	        ip = icl_cxgbei_new_pdu(M_NOWAIT);
248 	        if (ip == NULL)
249 			CXGBE_UNIMPLEMENTED("PDU allocation failure");
250 		icp = ip_to_icp(ip);
251 	} else {
252 		/* T5 mode, header is already received. */
253 		MPASS(icp->icp_flags == ICPF_RX_HDR);
254 		MPASS(icp->ip.ip_data_mbuf == NULL);
255 		MPASS(icp->ip.ip_data_len == m->m_pkthdr.len - sizeof(*cpl));
256 	}
257 
258 	/* Trim the cpl header from mbuf. */
259 	m_adj(m, sizeof(*cpl));
260 
261 	icp->icp_flags |= ICPF_RX_FLBUF;
262 	icp->ip.ip_data_mbuf = m;
263 	toep->ofld_rxq->rx_iscsi_fl_pdus++;
264 	toep->ofld_rxq->rx_iscsi_fl_octets += m->m_pkthdr.len;
265 
266 	/*
267 	 * For T6, save the icp for further processing in the
268 	 * completion handler.
269 	 */
270 	if (icp->icp_flags == ICPF_RX_FLBUF) {
271 		MPASS(toep->ulpcb2 == NULL);
272 		toep->ulpcb2 = icp;
273 	}
274 
275 #if 0
276 	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len %u, icp %p", __func__, tid,
277 	    be16toh(cpl->len), icp);
278 #endif
279 
280 	return (0);
281 }
282 
283 static int
284 mbuf_crc32c_helper(void *arg, void *data, u_int len)
285 {
286 	uint32_t *digestp = arg;
287 
288 	*digestp = calculate_crc32c(*digestp, data, len);
289 	return (0);
290 }
291 
292 static struct icl_pdu *
293 parse_pdu(struct socket *so, struct toepcb *toep, struct icl_cxgbei_conn *icc,
294     struct sockbuf *sb, u_int total_len)
295 {
296 	struct uio uio;
297 	struct iovec iov[2];
298 	struct iscsi_bhs bhs;
299 	struct mbuf *m;
300 	struct icl_pdu *ip;
301 	u_int ahs_len, data_len, header_len, pdu_len;
302 	uint32_t calc_digest, wire_digest;
303 	int error;
304 
305 	uio.uio_segflg = UIO_SYSSPACE;
306 	uio.uio_rw = UIO_READ;
307 	uio.uio_td = curthread;
308 
309 	header_len = sizeof(struct iscsi_bhs);
310 	if (icc->ic.ic_header_crc32c)
311 		header_len += ISCSI_HEADER_DIGEST_SIZE;
312 
313 	if (total_len < header_len) {
314 		ICL_WARN("truncated pre-offload PDU with len %u", total_len);
315 		return (NULL);
316 	}
317 
318 	iov[0].iov_base = &bhs;
319 	iov[0].iov_len = sizeof(bhs);
320 	iov[1].iov_base = &wire_digest;
321 	iov[1].iov_len = sizeof(wire_digest);
322 	uio.uio_iov = iov;
323 	uio.uio_iovcnt = 1;
324 	uio.uio_offset = 0;
325 	uio.uio_resid = header_len;
326 	error = soreceive(so, NULL, &uio, NULL, NULL, NULL);
327 	if (error != 0) {
328 		ICL_WARN("failed to read BHS from pre-offload PDU: %d", error);
329 		return (NULL);
330 	}
331 
332 	ahs_len = bhs.bhs_total_ahs_len * 4;
333 	data_len = bhs.bhs_data_segment_len[0] << 16 |
334 	    bhs.bhs_data_segment_len[1] << 8 |
335 	    bhs.bhs_data_segment_len[2];
336 	pdu_len = header_len + ahs_len + roundup2(data_len, 4);
337 	if (icc->ic.ic_data_crc32c && data_len != 0)
338 		pdu_len += ISCSI_DATA_DIGEST_SIZE;
339 
340 	if (total_len < pdu_len) {
341 		ICL_WARN("truncated pre-offload PDU len %u vs %u", total_len,
342 		    pdu_len);
343 		return (NULL);
344 	}
345 
346 	if (ahs_len != 0) {
347 		ICL_WARN("received pre-offload PDU with AHS");
348 		return (NULL);
349 	}
350 
351 	if (icc->ic.ic_header_crc32c) {
352 		calc_digest = calculate_crc32c(0xffffffff, (caddr_t)&bhs,
353 		    sizeof(bhs));
354 		calc_digest ^= 0xffffffff;
355 		if (calc_digest != wire_digest) {
356 			ICL_WARN("received pre-offload PDU 0x%02x with "
357 			    "invalid header digest (0x%x vs 0x%x)",
358 			    bhs.bhs_opcode, wire_digest, calc_digest);
359 			toep->ofld_rxq->rx_iscsi_header_digest_errors++;
360 			return (NULL);
361 		}
362 	}
363 
364 	m = NULL;
365 	if (data_len != 0) {
366 		uio.uio_iov = NULL;
367 		uio.uio_resid = roundup2(data_len, 4);
368 		if (icc->ic.ic_data_crc32c)
369 			uio.uio_resid += ISCSI_DATA_DIGEST_SIZE;
370 
371 		error = soreceive(so, NULL, &uio, &m, NULL, NULL);
372 		if (error != 0) {
373 			ICL_WARN("failed to read data payload from "
374 			    "pre-offload PDU: %d", error);
375 			return (NULL);
376 		}
377 
378 		if (icc->ic.ic_data_crc32c) {
379 			m_copydata(m, roundup2(data_len, 4),
380 			    sizeof(wire_digest), (caddr_t)&wire_digest);
381 
382 			calc_digest = 0xffffffff;
383 			m_apply(m, 0, roundup2(data_len, 4), mbuf_crc32c_helper,
384 			    &calc_digest);
385 			calc_digest ^= 0xffffffff;
386 			if (calc_digest != wire_digest) {
387 				ICL_WARN("received pre-offload PDU 0x%02x "
388 				    "with invalid data digest (0x%x vs 0x%x)",
389 				    bhs.bhs_opcode, wire_digest, calc_digest);
390 				toep->ofld_rxq->rx_iscsi_data_digest_errors++;
391 				m_freem(m);
392 				return (NULL);
393 			}
394 		}
395 	}
396 
397 	ip = icl_cxgbei_new_pdu(M_WAITOK);
398 	icl_cxgbei_new_pdu_set_conn(ip, &icc->ic);
399 	*ip->ip_bhs = bhs;
400 	ip->ip_data_len = data_len;
401 	ip->ip_data_mbuf = m;
402 	return (ip);
403 }
404 
405 void
406 parse_pdus(struct icl_cxgbei_conn *icc, struct sockbuf *sb)
407 {
408 	struct icl_conn *ic = &icc->ic;
409 	struct socket *so = ic->ic_socket;
410 	struct toepcb *toep = icc->toep;
411 	struct icl_pdu *ip, *lastip;
412 	u_int total_len;
413 
414 	SOCKBUF_LOCK_ASSERT(sb);
415 
416 	CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, toep->tid,
417 	    sbused(sb));
418 
419 	lastip = NULL;
420 	while (sbused(sb) != 0 && (sb->sb_state & SBS_CANTRCVMORE) == 0) {
421 		total_len = sbused(sb);
422 		SOCKBUF_UNLOCK(sb);
423 
424 		ip = parse_pdu(so, toep, icc, sb, total_len);
425 
426 		if (ip == NULL) {
427 			ic->ic_error(ic);
428 			SOCKBUF_LOCK(sb);
429 			return;
430 		}
431 
432 		if (lastip == NULL)
433 			STAILQ_INSERT_HEAD(&icc->rcvd_pdus, ip, ip_next);
434 		else
435 			STAILQ_INSERT_AFTER(&icc->rcvd_pdus, lastip, ip,
436 			    ip_next);
437 		lastip = ip;
438 
439 		SOCKBUF_LOCK(sb);
440 	}
441 }
442 
443 static int
444 do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
445 {
446 	struct adapter *sc = iq->adapter;
447 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
448 	u_int tid = GET_TID(cpl);
449 	struct toepcb *toep = lookup_tid(sc, tid);
450 	struct inpcb *inp = toep->inp;
451 	struct socket *so;
452 	struct sockbuf *sb;
453 	struct tcpcb *tp;
454 	struct icl_cxgbei_conn *icc;
455 	struct icl_conn *ic;
456 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
457 	struct icl_pdu *ip;
458 	u_int pdu_len, val;
459 	struct epoch_tracker et;
460 
461 	MPASS(m == NULL);
462 
463 	/* Must already be assembling a PDU. */
464 	MPASS(icp != NULL);
465 	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
466 	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
467 
468 	pdu_len = be16toh(cpl->len);	/* includes everything. */
469 	val = be32toh(cpl->ddpvld);
470 
471 #if 0
472 	CTR5(KTR_CXGBE,
473 	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
474 	    __func__, tid, pdu_len, val, icp->icp_flags);
475 #endif
476 
477 	icp->icp_flags |= ICPF_RX_STATUS;
478 	ip = &icp->ip;
479 	if (val & F_DDP_PADDING_ERR) {
480 		ICL_WARN("received PDU 0x%02x with invalid padding",
481 		    ip->ip_bhs->bhs_opcode);
482 		toep->ofld_rxq->rx_iscsi_padding_errors++;
483 	}
484 	if (val & F_DDP_HDRCRC_ERR) {
485 		ICL_WARN("received PDU 0x%02x with invalid header digest",
486 		    ip->ip_bhs->bhs_opcode);
487 		toep->ofld_rxq->rx_iscsi_header_digest_errors++;
488 	}
489 	if (val & F_DDP_DATACRC_ERR) {
490 		ICL_WARN("received PDU 0x%02x with invalid data digest",
491 		    ip->ip_bhs->bhs_opcode);
492 		toep->ofld_rxq->rx_iscsi_data_digest_errors++;
493 	}
494 	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
495 		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
496 		MPASS(ip->ip_data_len > 0);
497 		icp->icp_flags |= ICPF_RX_DDP;
498 		toep->ofld_rxq->rx_iscsi_ddp_pdus++;
499 		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
500 	}
501 
502 	INP_WLOCK(inp);
503 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
504 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
505 		    __func__, tid, pdu_len, inp->inp_flags);
506 		INP_WUNLOCK(inp);
507 		icl_cxgbei_conn_pdu_free(NULL, ip);
508 		toep->ulpcb2 = NULL;
509 		return (0);
510 	}
511 
512 	/*
513 	 * T6+ does not report data PDUs received via DDP without F
514 	 * set.  This can result in gaps in the TCP sequence space.
515 	 */
516 	tp = intotcpcb(inp);
517 	MPASS(chip_id(sc) >= CHELSIO_T6 || icp->icp_seq == tp->rcv_nxt);
518 	tp->rcv_nxt = icp->icp_seq + pdu_len;
519 	tp->t_rcvtime = ticks;
520 
521 	/*
522 	 * Don't update the window size or return credits since RX
523 	 * flow control is disabled.
524 	 */
525 
526 	so = inp->inp_socket;
527 	sb = &so->so_rcv;
528 	SOCKBUF_LOCK(sb);
529 
530 	icc = toep->ulpcb;
531 	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
532 		CTR5(KTR_CXGBE,
533 		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
534 		    __func__, tid, pdu_len, icc, sb->sb_state);
535 		SOCKBUF_UNLOCK(sb);
536 		INP_WUNLOCK(inp);
537 
538 		CURVNET_SET(so->so_vnet);
539 		NET_EPOCH_ENTER(et);
540 		INP_WLOCK(inp);
541 		tp = tcp_drop(tp, ECONNRESET);
542 		if (tp)
543 			INP_WUNLOCK(inp);
544 		NET_EPOCH_EXIT(et);
545 		CURVNET_RESTORE();
546 
547 		icl_cxgbei_conn_pdu_free(NULL, ip);
548 		toep->ulpcb2 = NULL;
549 		return (0);
550 	}
551 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
552 	ic = &icc->ic;
553 	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
554 	    F_DDP_DATACRC_ERR)) != 0) {
555 		SOCKBUF_UNLOCK(sb);
556 		INP_WUNLOCK(inp);
557 
558 		icl_cxgbei_conn_pdu_free(NULL, ip);
559 		toep->ulpcb2 = NULL;
560 		ic->ic_error(ic);
561 		return (0);
562 	}
563 
564 	icl_cxgbei_new_pdu_set_conn(ip, ic);
565 
566 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
567 	if (!icc->rx_active) {
568 		icc->rx_active = true;
569 		wakeup(&icc->rx_active);
570 	}
571 	SOCKBUF_UNLOCK(sb);
572 	INP_WUNLOCK(inp);
573 
574 	toep->ulpcb2 = NULL;
575 
576 	return (0);
577 }
578 
579 static int
580 do_rx_iscsi_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
581 {
582 	struct epoch_tracker et;
583 	struct adapter *sc = iq->adapter;
584 	struct cpl_rx_iscsi_cmp *cpl = mtod(m, struct cpl_rx_iscsi_cmp *);
585 	u_int tid = GET_TID(cpl);
586 	struct toepcb *toep = lookup_tid(sc, tid);
587 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
588 	struct icl_pdu *ip;
589 	struct cxgbei_cmp *cmp;
590 	struct inpcb *inp = toep->inp;
591 #ifdef INVARIANTS
592 	uint16_t len = be16toh(cpl->len);
593 	u_int data_digest_len;
594 #endif
595 	struct socket *so;
596 	struct sockbuf *sb;
597 	struct tcpcb *tp;
598 	struct icl_cxgbei_conn *icc;
599 	struct icl_conn *ic;
600 	struct iscsi_bhs_data_out *bhsdo;
601 	u_int val = be32toh(cpl->ddpvld);
602 	u_int npdus, pdu_len;
603 	uint32_t prev_seg_len;
604 
605 	M_ASSERTPKTHDR(m);
606 	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
607 
608 	if ((val & F_DDP_PDU) == 0) {
609 		MPASS(icp != NULL);
610 		MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
611 		ip = &icp->ip;
612 	}
613 
614 	if (icp == NULL) {
615 		/* T6 completion enabled, start of a new PDU. */
616 		ip = icl_cxgbei_new_pdu(M_NOWAIT);
617 		if (ip == NULL)
618 			CXGBE_UNIMPLEMENTED("PDU allocation failure");
619 		icp = ip_to_icp(ip);
620 	}
621 	pdu_len = G_ISCSI_PDU_LEN(be16toh(cpl->pdu_len_ddp));
622 
623 #if 0
624 	CTR5(KTR_CXGBE,
625 	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp %p",
626 	    __func__, tid, pdu_len, val, icp);
627 #endif
628 
629 	/* Copy header */
630 	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
631 	bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
632 	ip->ip_data_len = bhsdo->bhsdo_data_segment_len[0] << 16 |
633 	    bhsdo->bhsdo_data_segment_len[1] << 8 |
634 	    bhsdo->bhsdo_data_segment_len[2];
635 	icp->icp_seq = ntohl(cpl->seq);
636 	icp->icp_flags |= ICPF_RX_HDR;
637 	icp->icp_flags |= ICPF_RX_STATUS;
638 
639 	if (val & F_DDP_PADDING_ERR) {
640 		ICL_WARN("received PDU 0x%02x with invalid padding",
641 		    ip->ip_bhs->bhs_opcode);
642 		toep->ofld_rxq->rx_iscsi_padding_errors++;
643 	}
644 	if (val & F_DDP_HDRCRC_ERR) {
645 		ICL_WARN("received PDU 0x%02x with invalid header digest",
646 		    ip->ip_bhs->bhs_opcode);
647 		toep->ofld_rxq->rx_iscsi_header_digest_errors++;
648 	}
649 	if (val & F_DDP_DATACRC_ERR) {
650 		ICL_WARN("received PDU 0x%02x with invalid data digest",
651 		    ip->ip_bhs->bhs_opcode);
652 		toep->ofld_rxq->rx_iscsi_data_digest_errors++;
653 	}
654 
655 	INP_WLOCK(inp);
656 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
657 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
658 		    __func__, tid, pdu_len, inp->inp_flags);
659 		INP_WUNLOCK(inp);
660 		icl_cxgbei_conn_pdu_free(NULL, ip);
661 		toep->ulpcb2 = NULL;
662 		m_freem(m);
663 		return (0);
664 	}
665 
666 	tp = intotcpcb(inp);
667 
668 	/*
669 	 * If icc is NULL, the connection is being closed in
670 	 * icl_cxgbei_conn_close(), just drop this data.
671 	 */
672 	icc = toep->ulpcb;
673 	if (__predict_false(icc == NULL)) {
674 		CTR4(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p",
675 		    __func__, tid, pdu_len, icc);
676 
677 		/*
678 		 * Update rcv_nxt so the sequence number of the FIN
679 		 * doesn't appear wrong.
680 		 */
681 		tp->rcv_nxt = icp->icp_seq + pdu_len;
682 		tp->t_rcvtime = ticks;
683 		INP_WUNLOCK(inp);
684 
685 		icl_cxgbei_conn_pdu_free(NULL, ip);
686 		toep->ulpcb2 = NULL;
687 		m_freem(m);
688 		return (0);
689 	}
690 
691 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
692 	ic = &icc->ic;
693 	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
694 	    F_DDP_DATACRC_ERR)) != 0) {
695 		INP_WUNLOCK(inp);
696 
697 		icl_cxgbei_conn_pdu_free(NULL, ip);
698 		toep->ulpcb2 = NULL;
699 		m_freem(m);
700 		ic->ic_error(ic);
701 		return (0);
702 	}
703 
704 #ifdef INVARIANTS
705 	data_digest_len = (icc->ulp_submode & ULP_CRC_DATA) ?
706 	    ISCSI_DATA_DIGEST_SIZE : 0;
707 	MPASS(roundup2(ip->ip_data_len, 4) == pdu_len - len - data_digest_len);
708 #endif
709 
710 	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
711 		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
712 		MPASS(ip->ip_data_len > 0);
713 		icp->icp_flags |= ICPF_RX_DDP;
714 		bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
715 
716 		switch (ip->ip_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) {
717 		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
718 			cmp = cxgbei_find_cmp(icc,
719 			    be32toh(bhsdo->bhsdo_initiator_task_tag));
720 			break;
721 		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
722 			cmp = cxgbei_find_cmp(icc,
723 			    be32toh(bhsdo->bhsdo_target_transfer_tag));
724 			break;
725 		default:
726 			__assert_unreachable();
727 		}
728 		MPASS(cmp != NULL);
729 
730 		/*
731 		 * The difference between the end of the last burst
732 		 * and the offset of the last PDU in this burst is
733 		 * the additional data received via DDP.
734 		 */
735 		prev_seg_len = be32toh(bhsdo->bhsdo_buffer_offset) -
736 		    cmp->next_buffer_offset;
737 
738 		if (prev_seg_len != 0) {
739 			uint32_t orig_datasn;
740 
741 			/*
742 			 * Return a "large" PDU representing the burst
743 			 * of PDUs.  Adjust the offset and length of
744 			 * this PDU to represent the entire burst.
745 			 */
746 			ip->ip_data_len += prev_seg_len;
747 			bhsdo->bhsdo_data_segment_len[2] = ip->ip_data_len;
748 			bhsdo->bhsdo_data_segment_len[1] = ip->ip_data_len >> 8;
749 			bhsdo->bhsdo_data_segment_len[0] = ip->ip_data_len >> 16;
750 			bhsdo->bhsdo_buffer_offset =
751 			    htobe32(cmp->next_buffer_offset);
752 
753 			orig_datasn = htobe32(bhsdo->bhsdo_datasn);
754 			npdus = orig_datasn - cmp->last_datasn;
755 			bhsdo->bhsdo_datasn = htobe32(cmp->last_datasn + 1);
756 			cmp->last_datasn = orig_datasn;
757 			ip->ip_additional_pdus = npdus - 1;
758 		} else {
759 			MPASS(htobe32(bhsdo->bhsdo_datasn) ==
760 			    cmp->last_datasn + 1);
761 			npdus = 1;
762 			cmp->last_datasn = htobe32(bhsdo->bhsdo_datasn);
763 		}
764 
765 		cmp->next_buffer_offset += ip->ip_data_len;
766 		toep->ofld_rxq->rx_iscsi_ddp_pdus += npdus;
767 		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
768 	} else {
769 		MPASS(icp->icp_flags & (ICPF_RX_FLBUF));
770 		MPASS(ip->ip_data_len == ip->ip_data_mbuf->m_pkthdr.len);
771 	}
772 
773 	tp->rcv_nxt = icp->icp_seq + pdu_len;
774 	tp->t_rcvtime = ticks;
775 
776 	/*
777 	 * Don't update the window size or return credits since RX
778 	 * flow control is disabled.
779 	 */
780 
781 	so = inp->inp_socket;
782 	sb = &so->so_rcv;
783 	SOCKBUF_LOCK(sb);
784 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
785 		CTR5(KTR_CXGBE,
786 		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
787 		    __func__, tid, pdu_len, icc, sb->sb_state);
788 		SOCKBUF_UNLOCK(sb);
789 		INP_WUNLOCK(inp);
790 
791 		CURVNET_SET(so->so_vnet);
792 		NET_EPOCH_ENTER(et);
793 		INP_WLOCK(inp);
794 		tp = tcp_drop(tp, ECONNRESET);
795 		if (tp != NULL)
796 			INP_WUNLOCK(inp);
797 		NET_EPOCH_EXIT(et);
798 		CURVNET_RESTORE();
799 
800 		icl_cxgbei_conn_pdu_free(NULL, ip);
801 		toep->ulpcb2 = NULL;
802 		m_freem(m);
803 		return (0);
804 	}
805 
806 	icl_cxgbei_new_pdu_set_conn(ip, ic);
807 
808 	/* Enqueue the PDU to the received pdus queue. */
809 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
810 	if (!icc->rx_active) {
811 		icc->rx_active = true;
812 		wakeup(&icc->rx_active);
813 	}
814 	SOCKBUF_UNLOCK(sb);
815 	INP_WUNLOCK(inp);
816 
817 	toep->ulpcb2 = NULL;
818 	m_freem(m);
819 
820 	return (0);
821 }
822 
823 static int
824 cxgbei_activate(struct adapter *sc)
825 {
826 	struct cxgbei_data *ci;
827 	int rc;
828 
829 	ASSERT_SYNCHRONIZED_OP(sc);
830 
831 	if (uld_active(sc, ULD_ISCSI)) {
832 		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
833 		    __func__, sc));
834 		return (0);
835 	}
836 
837 	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
838 		device_printf(sc->dev,
839 		    "not iSCSI offload capable, or capability disabled.\n");
840 		return (ENOSYS);
841 	}
842 
843 	/* per-adapter softc for iSCSI */
844 	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);
845 	if (ci == NULL)
846 		return (ENOMEM);
847 
848 	rc = cxgbei_init(sc, ci);
849 	if (rc != 0) {
850 		free(ci, M_CXGBE);
851 		return (rc);
852 	}
853 
854 	sc->iscsi_ulp_softc = ci;
855 
856 	return (0);
857 }
858 
859 static int
860 cxgbei_deactivate(struct adapter *sc)
861 {
862 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
863 
864 	ASSERT_SYNCHRONIZED_OP(sc);
865 
866 	if (ci != NULL) {
867 		sysctl_ctx_free(&ci->ctx);
868 		t4_free_ppod_region(&ci->pr);
869 		free(ci, M_CXGBE);
870 		sc->iscsi_ulp_softc = NULL;
871 	}
872 
873 	return (0);
874 }
875 
876 static void
877 cxgbei_activate_all(struct adapter *sc, void *arg __unused)
878 {
879 
880 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
881 		return;
882 
883 	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
884 	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
885 		(void) t4_activate_uld(sc, ULD_ISCSI);
886 
887 	end_synchronized_op(sc, 0);
888 }
889 
890 static void
891 cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
892 {
893 
894 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
895 		return;
896 
897 	if (uld_active(sc, ULD_ISCSI))
898 	    (void) t4_deactivate_uld(sc, ULD_ISCSI);
899 
900 	end_synchronized_op(sc, 0);
901 }
902 
903 static struct uld_info cxgbei_uld_info = {
904 	.uld_id = ULD_ISCSI,
905 	.activate = cxgbei_activate,
906 	.deactivate = cxgbei_deactivate,
907 };
908 
909 static int
910 cxgbei_mod_load(void)
911 {
912 	int rc;
913 
914 	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
915 	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
916 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
917 	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, do_rx_iscsi_cmp);
918 
919 	rc = t4_register_uld(&cxgbei_uld_info);
920 	if (rc != 0)
921 		return (rc);
922 
923 	t4_iterate(cxgbei_activate_all, NULL);
924 
925 	return (rc);
926 }
927 
928 static int
929 cxgbei_mod_unload(void)
930 {
931 
932 	t4_iterate(cxgbei_deactivate_all, NULL);
933 
934 	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
935 		return (EBUSY);
936 
937 	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
938 	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
939 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
940 	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, NULL);
941 
942 	return (0);
943 }
944 #endif
945 
946 static int
947 cxgbei_modevent(module_t mod, int cmd, void *arg)
948 {
949 	int rc = 0;
950 
951 #ifdef TCP_OFFLOAD
952 	switch (cmd) {
953 	case MOD_LOAD:
954 		rc = cxgbei_mod_load();
955 		if (rc == 0)
956 			rc = icl_cxgbei_mod_load();
957 		break;
958 
959 	case MOD_UNLOAD:
960 		rc = icl_cxgbei_mod_unload();
961 		if (rc == 0)
962 			rc = cxgbei_mod_unload();
963 		break;
964 
965 	default:
966 		rc = EINVAL;
967 	}
968 #else
969 	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
970 	rc = EOPNOTSUPP;
971 #endif
972 
973 	return (rc);
974 }
975 
976 static moduledata_t cxgbei_mod = {
977 	"cxgbei",
978 	cxgbei_modevent,
979 	NULL,
980 };
981 
982 MODULE_VERSION(cxgbei, 1);
983 DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
984 MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
985 MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
986 MODULE_DEPEND(cxgbei, icl, 1, 1, 1);
987