xref: /freebsd/sys/dev/cxgbe/tom/t4_ddp.c (revision 5f757f3f)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2012 Chelsio Communications, Inc.
5  * All rights reserved.
6  * Written by: Navdeep Parhar <np@FreeBSD.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 #include "opt_inet.h"
32 
33 #include <sys/param.h>
34 #include <sys/aio.h>
35 #include <sys/bio.h>
36 #include <sys/file.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/ktr.h>
40 #include <sys/module.h>
41 #include <sys/protosw.h>
42 #include <sys/proc.h>
43 #include <sys/domain.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/taskqueue.h>
47 #include <sys/uio.h>
48 #include <netinet/in.h>
49 #include <netinet/in_pcb.h>
50 #include <netinet/ip.h>
51 #include <netinet/tcp_var.h>
52 #define TCPSTATES
53 #include <netinet/tcp_fsm.h>
54 #include <netinet/toecore.h>
55 
56 #include <vm/vm.h>
57 #include <vm/vm_extern.h>
58 #include <vm/vm_param.h>
59 #include <vm/pmap.h>
60 #include <vm/vm_map.h>
61 #include <vm/vm_page.h>
62 #include <vm/vm_object.h>
63 
64 #include <cam/scsi/scsi_all.h>
65 #include <cam/ctl/ctl_io.h>
66 
67 #ifdef TCP_OFFLOAD
68 #include "common/common.h"
69 #include "common/t4_msg.h"
70 #include "common/t4_regs.h"
71 #include "common/t4_tcb.h"
72 #include "tom/t4_tom.h"
73 
74 /*
75  * Use the 'backend3' field in AIO jobs to store the amount of data
76  * received by the AIO job so far.
77  */
78 #define	aio_received	backend3
79 
80 static void aio_ddp_requeue_task(void *context, int pending);
81 static void ddp_complete_all(struct toepcb *toep, int error);
82 static void t4_aio_cancel_active(struct kaiocb *job);
83 static void t4_aio_cancel_queued(struct kaiocb *job);
84 static int t4_alloc_page_pods_for_rcvbuf(struct ppod_region *pr,
85     struct ddp_rcv_buffer *drb);
86 static int t4_write_page_pods_for_rcvbuf(struct adapter *sc,
87     struct sge_wrq *wrq, int tid, struct ddp_rcv_buffer *drb);
88 
89 static TAILQ_HEAD(, pageset) ddp_orphan_pagesets;
90 static struct mtx ddp_orphan_pagesets_lock;
91 static struct task ddp_orphan_task;
92 
93 #define MAX_DDP_BUFFER_SIZE		(M_TCB_RX_DDP_BUF0_LEN)
94 
95 /*
96  * A page set holds information about a user buffer used for AIO DDP.
97  * The page set holds resources such as the VM pages backing the
98  * buffer (either held or wired) and the page pods associated with the
99  * buffer.  Recently used page sets are cached to allow for efficient
100  * reuse of buffers (avoiding the need to re-fault in pages, hold
101  * them, etc.).  Note that cached page sets keep the backing pages
102  * wired.  The number of wired pages is capped by only allowing for
103  * two wired pagesets per connection.  This is not a perfect cap, but
104  * is a trade-off for performance.
105  *
106  * If an application ping-pongs two buffers for a connection via
107  * aio_read(2) then those buffers should remain wired and expensive VM
108  * fault lookups should be avoided after each buffer has been used
109  * once.  If an application uses more than two buffers then this will
110  * fall back to doing expensive VM fault lookups for each operation.
111  */
112 static void
113 free_pageset(struct tom_data *td, struct pageset *ps)
114 {
115 	vm_page_t p;
116 	int i;
117 
118 	if (ps->prsv.prsv_nppods > 0)
119 		t4_free_page_pods(&ps->prsv);
120 
121 	for (i = 0; i < ps->npages; i++) {
122 		p = ps->pages[i];
123 		vm_page_unwire(p, PQ_INACTIVE);
124 	}
125 	mtx_lock(&ddp_orphan_pagesets_lock);
126 	TAILQ_INSERT_TAIL(&ddp_orphan_pagesets, ps, link);
127 	taskqueue_enqueue(taskqueue_thread, &ddp_orphan_task);
128 	mtx_unlock(&ddp_orphan_pagesets_lock);
129 }
130 
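/*
 * Taskqueue handler that drains the global orphan list, releasing each
 * orphaned pageset's vmspace reference and freeing the pageset.  Pagesets
 * are handed off to this task by free_pageset() so that the final teardown
 * runs from a taskqueue thread rather than the caller's context.
 */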
131 static void
132 ddp_free_orphan_pagesets(void *context, int pending)
133 {
134 	struct pageset *ps;
135 
136 	mtx_lock(&ddp_orphan_pagesets_lock);
137 	while (!TAILQ_EMPTY(&ddp_orphan_pagesets)) {
138 		ps = TAILQ_FIRST(&ddp_orphan_pagesets);
139 		TAILQ_REMOVE(&ddp_orphan_pagesets, ps, link);
140 		mtx_unlock(&ddp_orphan_pagesets_lock);
141 		if (ps->vm)
142 			vmspace_free(ps->vm);
143 		free(ps, M_CXGBE);
144 		mtx_lock(&ddp_orphan_pagesets_lock);
145 	}
146 	mtx_unlock(&ddp_orphan_pagesets_lock);
147 }
148 
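/*
 * Return a pageset to the connection's cache of recently used pagesets
 * (keeping its pages wired for reuse), unless the connection is being
 * torn down, in which case the pageset is freed.
 */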
149 static void
150 recycle_pageset(struct toepcb *toep, struct pageset *ps)
151 {
152 
153 	DDP_ASSERT_LOCKED(toep);
154 	if (!(toep->ddp.flags & DDP_DEAD)) {
155 		KASSERT(toep->ddp.cached_count + toep->ddp.active_count <
156 		    nitems(toep->ddp.db), ("too many wired pagesets"));
157 		TAILQ_INSERT_HEAD(&toep->ddp.cached_pagesets, ps, link);
158 		toep->ddp.cached_count++;
159 	} else
160 		free_pageset(toep->td, ps);
161 }
162 
163 static void
164 ddp_complete_one(struct kaiocb *job, int error)
165 {
166 	long copied;
167 
168 	/*
169 	 * If this job had copied data out of the socket buffer before
170 	 * it was cancelled, report it as a short read rather than an
171 	 * error.
172 	 */
173 	copied = job->aio_received;
174 	if (copied != 0 || error == 0)
175 		aio_complete(job, copied, 0);
176 	else
177 		aio_complete(job, -1, error);
178 }
179 
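/*
 * Release a DDP receive buffer: free its page pods and its contiguous
 * backing memory, and drop the toepcb reference taken when the buffer
 * was allocated.
 */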
180 static void
181 free_ddp_rcv_buffer(struct toepcb *toep, struct ddp_rcv_buffer *drb)
182 {
183 	t4_free_page_pods(&drb->prsv);
184 	contigfree(drb->buf, drb->len, M_CXGBE);
185 	free(drb, M_CXGBE);
186 	counter_u64_add(toep->ofld_rxq->ddp_buffer_free, 1);
187 	free_toepcb(toep);
188 }
189 
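/*
 * Return a DDP receive buffer to the per-connection cache if the
 * connection is still alive and the cache holds fewer than
 * t4_ddp_rcvbuf_cache buffers; otherwise free it.
 */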
190 static void
191 recycle_ddp_rcv_buffer(struct toepcb *toep, struct ddp_rcv_buffer *drb)
192 {
193 	DDP_CACHE_LOCK(toep);
194 	if (!(toep->ddp.flags & DDP_DEAD) &&
195 	    toep->ddp.cached_count < t4_ddp_rcvbuf_cache) {
196 		TAILQ_INSERT_HEAD(&toep->ddp.cached_buffers, drb, link);
197 		toep->ddp.cached_count++;
198 		DDP_CACHE_UNLOCK(toep);
199 	} else {
200 		DDP_CACHE_UNLOCK(toep);
201 		free_ddp_rcv_buffer(toep, drb);
202 	}
203 }
204 
205 static struct ddp_rcv_buffer *
206 alloc_cached_ddp_rcv_buffer(struct toepcb *toep)
207 {
208 	struct ddp_rcv_buffer *drb;
209 
210 	DDP_CACHE_LOCK(toep);
211 	if (!TAILQ_EMPTY(&toep->ddp.cached_buffers)) {
212 		drb = TAILQ_FIRST(&toep->ddp.cached_buffers);
213 		TAILQ_REMOVE(&toep->ddp.cached_buffers, drb, link);
214 		toep->ddp.cached_count--;
215 		counter_u64_add(toep->ofld_rxq->ddp_buffer_reuse, 1);
216 	} else
217 		drb = NULL;
218 	DDP_CACHE_UNLOCK(toep);
219 	return (drb);
220 }
221 
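/*
 * Allocate a new DDP receive buffer backed by physically contiguous
 * memory of t4_ddp_rcvbuf_len bytes, then allocate and write the page
 * pods that describe it.  The buffer holds a reference on the toepcb
 * until it is freed.
 */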
222 static struct ddp_rcv_buffer *
223 alloc_ddp_rcv_buffer(struct toepcb *toep, int how)
224 {
225 	struct tom_data *td = toep->td;
226 	struct adapter *sc = td_adapter(td);
227 	struct ddp_rcv_buffer *drb;
228 	int error;
229 
230 	drb = malloc(sizeof(*drb), M_CXGBE, how | M_ZERO);
231 	if (drb == NULL)
232 		return (NULL);
233 
234 	drb->buf = contigmalloc(t4_ddp_rcvbuf_len, M_CXGBE, how, 0, ~0,
235 	    t4_ddp_rcvbuf_len, 0);
236 	if (drb->buf == NULL) {
237 		free(drb, M_CXGBE);
238 		return (NULL);
239 	}
240 	drb->len = t4_ddp_rcvbuf_len;
241 	drb->refs = 1;
242 
243 	error = t4_alloc_page_pods_for_rcvbuf(&td->pr, drb);
244 	if (error != 0) {
245 		contigfree(drb->buf, drb->len, M_CXGBE);
246 		free(drb, M_CXGBE);
247 		return (NULL);
248 	}
249 
250 	error = t4_write_page_pods_for_rcvbuf(sc, toep->ctrlq, toep->tid, drb);
251 	if (error != 0) {
252 		t4_free_page_pods(&drb->prsv);
253 		contigfree(drb->buf, drb->len, M_CXGBE);
254 		free(drb, M_CXGBE);
255 		return (NULL);
256 	}
257 
258 	hold_toepcb(toep);
259 	counter_u64_add(toep->ofld_rxq->ddp_buffer_alloc, 1);
260 	return (drb);
261 }
262 
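/*
 * Release whatever one of the two DDP buffer slots holds: the receive
 * buffer in rcvbuf mode, or the pending AIO job (completed as an EOF /
 * short read) and its pageset in AIO mode.
 */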
263 static void
264 free_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db)
265 {
266 	if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
267 		if (db->drb != NULL)
268 			free_ddp_rcv_buffer(toep, db->drb);
269 #ifdef INVARIANTS
270 		db->drb = NULL;
271 #endif
272 		return;
273 	}
274 
275 	if (db->job) {
276 		/*
277 		 * XXX: If we are un-offloading the socket then we
278 		 * should requeue these on the socket somehow.  If we
279 		 * got a FIN from the remote end, then this completes
280 		 * any remaining requests with an EOF read.
281 		 */
282 		if (!aio_clear_cancel_function(db->job))
283 			ddp_complete_one(db->job, 0);
284 #ifdef INVARIANTS
285 		db->job = NULL;
286 #endif
287 	}
288 
289 	if (db->ps) {
290 		free_pageset(toep->td, db->ps);
291 #ifdef INVARIANTS
292 		db->ps = NULL;
293 #endif
294 	}
295 }
296 
297 static void
298 ddp_init_toep(struct toepcb *toep)
299 {
300 
301 	toep->ddp.flags = DDP_OK;
302 	toep->ddp.active_id = -1;
303 	mtx_init(&toep->ddp.lock, "t4 ddp", NULL, MTX_DEF);
304 	mtx_init(&toep->ddp.cache_lock, "t4 ddp cache", NULL, MTX_DEF);
305 }
306 
307 void
308 ddp_uninit_toep(struct toepcb *toep)
309 {
310 
311 	mtx_destroy(&toep->ddp.lock);
312 	mtx_destroy(&toep->ddp.cache_lock);
313 }
314 
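/*
 * Tear down all DDP state for a connection: mark it dead, release both
 * DDP buffer slots, and drain any cached pagesets or receive buffers,
 * completing any queued AIO requests.
 */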
315 void
316 release_ddp_resources(struct toepcb *toep)
317 {
318 	struct ddp_rcv_buffer *drb;
319 	struct pageset *ps;
320 	int i;
321 
322 	DDP_LOCK(toep);
323 	DDP_CACHE_LOCK(toep);
324 	toep->ddp.flags |= DDP_DEAD;
325 	DDP_CACHE_UNLOCK(toep);
326 	for (i = 0; i < nitems(toep->ddp.db); i++) {
327 		free_ddp_buffer(toep, &toep->ddp.db[i]);
328 	}
329 	if ((toep->ddp.flags & DDP_AIO) != 0) {
330 		while ((ps = TAILQ_FIRST(&toep->ddp.cached_pagesets)) != NULL) {
331 			TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
332 			free_pageset(toep->td, ps);
333 		}
334 		ddp_complete_all(toep, 0);
335 	}
336 	if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
337 		DDP_CACHE_LOCK(toep);
338 		while ((drb = TAILQ_FIRST(&toep->ddp.cached_buffers)) != NULL) {
339 			TAILQ_REMOVE(&toep->ddp.cached_buffers, drb, link);
340 			free_ddp_rcv_buffer(toep, drb);
341 		}
342 		DDP_CACHE_UNLOCK(toep);
343 	}
344 	DDP_UNLOCK(toep);
345 }
346 
347 #ifdef INVARIANTS
348 void
349 ddp_assert_empty(struct toepcb *toep)
350 {
351 	int i;
352 
353 	MPASS((toep->ddp.flags & (DDP_TASK_ACTIVE | DDP_DEAD)) != DDP_TASK_ACTIVE);
354 	for (i = 0; i < nitems(toep->ddp.db); i++) {
355 		if ((toep->ddp.flags & DDP_AIO) != 0) {
356 			MPASS(toep->ddp.db[i].job == NULL);
357 			MPASS(toep->ddp.db[i].ps == NULL);
358 		} else
359 			MPASS(toep->ddp.db[i].drb == NULL);
360 	}
361 	if ((toep->ddp.flags & DDP_AIO) != 0) {
362 		MPASS(TAILQ_EMPTY(&toep->ddp.cached_pagesets));
363 		MPASS(TAILQ_EMPTY(&toep->ddp.aiojobq));
364 	}
365 	if ((toep->ddp.flags & DDP_RCVBUF) != 0)
366 		MPASS(TAILQ_EMPTY(&toep->ddp.cached_buffers));
367 }
368 #endif
369 
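/*
 * Common bookkeeping when the hardware is finished with a DDP buffer
 * slot: fix up the active buffer accounting, recycle the slot's pageset
 * or receive buffer, and clear the slot's active flag.
 */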
370 static void
371 complete_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db,
372     unsigned int db_idx)
373 {
374 	struct ddp_rcv_buffer *drb;
375 	unsigned int db_flag;
376 
377 	toep->ddp.active_count--;
378 	if (toep->ddp.active_id == db_idx) {
379 		if (toep->ddp.active_count == 0) {
380 			if ((toep->ddp.flags & DDP_AIO) != 0)
381 				KASSERT(toep->ddp.db[db_idx ^ 1].job == NULL,
382 				    ("%s: active_count mismatch", __func__));
383 			else
384 				KASSERT(toep->ddp.db[db_idx ^ 1].drb == NULL,
385 				    ("%s: active_count mismatch", __func__));
386 			toep->ddp.active_id = -1;
387 		} else
388 			toep->ddp.active_id ^= 1;
389 #ifdef VERBOSE_TRACES
390 		CTR3(KTR_CXGBE, "%s: tid %u, ddp_active_id = %d", __func__,
391 		    toep->tid, toep->ddp.active_id);
392 #endif
393 	} else {
394 		KASSERT(toep->ddp.active_count != 0 &&
395 		    toep->ddp.active_id != -1,
396 		    ("%s: active count mismatch", __func__));
397 	}
398 
399 	if ((toep->ddp.flags & DDP_AIO) != 0) {
400 		db->cancel_pending = 0;
401 		db->job = NULL;
402 		recycle_pageset(toep, db->ps);
403 		db->ps = NULL;
404 	} else {
405 		drb = db->drb;
406 		if (atomic_fetchadd_int(&drb->refs, -1) == 1)
407 			recycle_ddp_rcv_buffer(toep, drb);
408 		db->drb = NULL;
409 		db->placed = 0;
410 	}
411 
412 	db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
413 	KASSERT(toep->ddp.flags & db_flag,
414 	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x",
415 	    __func__, toep, toep->ddp.flags));
416 	toep->ddp.flags &= ~db_flag;
417 }
418 
419 /* Called when m_free drops the last reference. */
420 static void
421 ddp_rcv_mbuf_done(struct mbuf *m)
422 {
423 	struct toepcb *toep = m->m_ext.ext_arg1;
424 	struct ddp_rcv_buffer *drb = m->m_ext.ext_arg2;
425 
426 	recycle_ddp_rcv_buffer(toep, drb);
427 }
428 
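/*
 * Wrap the next 'len' bytes of a DDP receive buffer in a zero-copy
 * external mbuf (which holds a reference on the buffer) and append it
 * to the socket's receive buffer.  Called with the socket buffer locked.
 */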
429 static void
430 queue_ddp_rcvbuf_mbuf(struct toepcb *toep, u_int db_idx, u_int len)
431 {
432 	struct inpcb *inp = toep->inp;
433 	struct sockbuf *sb;
434 	struct ddp_buffer *db;
435 	struct ddp_rcv_buffer *drb;
436 	struct mbuf *m;
437 
438 	m = m_gethdr(M_NOWAIT, MT_DATA);
439 	if (m == NULL) {
440 		printf("%s: failed to allocate mbuf\n", __func__);
441 		return;
442 	}
443 	m->m_pkthdr.rcvif = toep->vi->ifp;
444 
445 	db = &toep->ddp.db[db_idx];
446 	drb = db->drb;
447 	m_extaddref(m, (char *)drb->buf + db->placed, len, &drb->refs,
448 	    ddp_rcv_mbuf_done, toep, drb);
449 	m->m_pkthdr.len = len;
450 	m->m_len = len;
451 
452 	sb = &inp->inp_socket->so_rcv;
453 	SOCKBUF_LOCK_ASSERT(sb);
454 	sbappendstream_locked(sb, m, 0);
455 
456 	db->placed += len;
457 	toep->ofld_rxq->rx_toe_ddp_octets += len;
458 }
459 
460 /* XXX: handle_ddp_data code duplication */
461 void
462 insert_ddp_data(struct toepcb *toep, uint32_t n)
463 {
464 	struct inpcb *inp = toep->inp;
465 	struct tcpcb *tp = intotcpcb(inp);
466 	struct ddp_buffer *db;
467 	struct kaiocb *job;
468 	size_t placed;
469 	long copied;
470 	unsigned int db_idx;
471 #ifdef INVARIANTS
472 	unsigned int db_flag;
473 #endif
474 	bool ddp_rcvbuf;
475 
476 	INP_WLOCK_ASSERT(inp);
477 	DDP_ASSERT_LOCKED(toep);
478 
479 	ddp_rcvbuf = (toep->ddp.flags & DDP_RCVBUF) != 0;
480 	tp->rcv_nxt += n;
481 #ifndef USE_DDP_RX_FLOW_CONTROL
482 	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
483 	tp->rcv_wnd -= n;
484 #endif
485 	CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP",
486 	    __func__, n);
487 	while (toep->ddp.active_count > 0) {
488 		MPASS(toep->ddp.active_id != -1);
489 		db_idx = toep->ddp.active_id;
490 #ifdef INVARIANTS
491 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
492 #endif
493 		MPASS((toep->ddp.flags & db_flag) != 0);
494 		db = &toep->ddp.db[db_idx];
495 		if (ddp_rcvbuf) {
496 			placed = n;
497 			if (placed > db->drb->len - db->placed)
498 				placed = db->drb->len - db->placed;
499 			if (placed != 0)
500 				queue_ddp_rcvbuf_mbuf(toep, db_idx, placed);
501 			complete_ddp_buffer(toep, db, db_idx);
502 			n -= placed;
503 			continue;
504 		}
505 		job = db->job;
506 		copied = job->aio_received;
507 		placed = n;
508 		if (placed > job->uaiocb.aio_nbytes - copied)
509 			placed = job->uaiocb.aio_nbytes - copied;
510 		if (placed > 0) {
511 			job->msgrcv = 1;
512 			toep->ofld_rxq->rx_aio_ddp_jobs++;
513 		}
514 		toep->ofld_rxq->rx_aio_ddp_octets += placed;
515 		if (!aio_clear_cancel_function(job)) {
516 			/*
517 			 * Update the copied length for when
518 			 * t4_aio_cancel_active() completes this
519 			 * request.
520 			 */
521 			job->aio_received += placed;
522 		} else if (copied + placed != 0) {
523 			CTR4(KTR_CXGBE,
524 			    "%s: completing %p (copied %ld, placed %lu)",
525 			    __func__, job, copied, placed);
526 			/* XXX: This always completes if there is some data. */
527 			aio_complete(job, copied + placed, 0);
528 		} else if (aio_set_cancel_function(job, t4_aio_cancel_queued)) {
529 			TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
530 			toep->ddp.waiting_count++;
531 		} else
532 			aio_cancel(job);
533 		n -= placed;
534 		complete_ddp_buffer(toep, db, db_idx);
535 	}
536 
537 	MPASS(n == 0);
538 }
539 
540 /* SET_TCB_FIELD sent as a ULP command looks like this */
541 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
542     sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
543 
544 /* RX_DATA_ACK sent as a ULP command looks like this */
545 #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
546     sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))
547 
548 static inline void *
549 mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
550     uint64_t word, uint64_t mask, uint64_t val)
551 {
552 	struct ulptx_idata *ulpsc;
553 	struct cpl_set_tcb_field_core *req;
554 
555 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
556 	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
557 
558 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
559 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
560 	ulpsc->len = htobe32(sizeof(*req));
561 
562 	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
563 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
564 	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
565 	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
566 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
567 	req->mask = htobe64(mask);
568 	req->val = htobe64(val);
569 
570 	ulpsc = (struct ulptx_idata *)(req + 1);
571 	if (LEN__SET_TCB_FIELD_ULP % 16) {
572 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
573 		ulpsc->len = htobe32(0);
574 		return (ulpsc + 1);
575 	}
576 	return (ulpsc);
577 }
578 
579 static inline void *
580 mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
581 {
582 	struct ulptx_idata *ulpsc;
583 	struct cpl_rx_data_ack_core *req;
584 
585 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
586 	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));
587 
588 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
589 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
590 	ulpsc->len = htobe32(sizeof(*req));
591 
592 	req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
593 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
594 	req->credit_dack = htobe32(F_RX_MODULATE_RX);
595 
596 	ulpsc = (struct ulptx_idata *)(req + 1);
597 	if (LEN__RX_DATA_ACK_ULP % 16) {
598 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
599 		ulpsc->len = htobe32(0);
600 		return (ulpsc + 1);
601 	}
602 	return (ulpsc);
603 }
604 
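/*
 * Construct the compound work request that programs one DDP buffer
 * slot: its tag, offset, and length, plus the DDP flag updates and a
 * gratuitous RX_DATA_ACK.
 */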
605 static struct wrqe *
606 mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
607     struct ppod_reservation *prsv, int offset, uint32_t len,
608     uint64_t ddp_flags, uint64_t ddp_flags_mask)
609 {
610 	struct wrqe *wr;
611 	struct work_request_hdr *wrh;
612 	struct ulp_txpkt *ulpmc;
613 	int wrlen;
614 
615 	KASSERT(db_idx == 0 || db_idx == 1,
616 	    ("%s: bad DDP buffer index %d", __func__, db_idx));
617 
618 	/*
619 	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
620 	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
621 	 *
622 	 * The work request header is 16B and always ends at a 16B boundary.
623 	 * The ULPTX master commands that follow must all end at 16B boundaries
624 	 * too so we round up the size to 16.
625 	 */
626 	wrlen = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
627 	    roundup2(LEN__RX_DATA_ACK_ULP, 16);
628 
629 	wr = alloc_wrqe(wrlen, toep->ctrlq);
630 	if (wr == NULL)
631 		return (NULL);
632 	wrh = wrtod(wr);
633 	INIT_ULPTX_WRH(wrh, wrlen, 1, 0);	/* atomic */
634 	ulpmc = (struct ulp_txpkt *)(wrh + 1);
635 
636 	/* Write the buffer's tag */
637 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
638 	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
639 	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
640 	    V_TCB_RX_DDP_BUF0_TAG(prsv->prsv_tag));
641 
642 	/* Update the current offset in the DDP buffer and its total length */
643 	if (db_idx == 0)
644 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
645 		    W_TCB_RX_DDP_BUF0_OFFSET,
646 		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
647 		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
648 		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
649 		    V_TCB_RX_DDP_BUF0_LEN(len));
650 	else
651 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
652 		    W_TCB_RX_DDP_BUF1_OFFSET,
653 		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
654 		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
655 		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
656 		    V_TCB_RX_DDP_BUF1_LEN((u64)len << 32));
657 
658 	/* Update DDP flags */
659 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
660 	    ddp_flags_mask, ddp_flags);
661 
662 	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
663 	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);
664 
665 	return (wr);
666 }
667 
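/*
 * Handle data placed into the active DDP buffer of an AIO-mode
 * connection (RX_DATA_DDP or RX_DDP_COMPLETE).  Completes or updates
 * the AIO job that owns the buffer depending on its cancel state.
 */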
668 static int
669 handle_ddp_data_aio(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt,
670     int len)
671 {
672 	uint32_t report = be32toh(ddp_report);
673 	unsigned int db_idx;
674 	struct inpcb *inp = toep->inp;
675 	struct ddp_buffer *db;
676 	struct tcpcb *tp;
677 	struct socket *so;
678 	struct sockbuf *sb;
679 	struct kaiocb *job;
680 	long copied;
681 
682 	db_idx = report & F_DDP_BUF_IDX ? 1 : 0;
683 
684 	if (__predict_false(!(report & F_DDP_INV)))
685 		CXGBE_UNIMPLEMENTED("DDP buffer still valid");
686 
687 	INP_WLOCK(inp);
688 	so = inp_inpcbtosocket(inp);
689 	sb = &so->so_rcv;
690 	DDP_LOCK(toep);
691 
692 	KASSERT(toep->ddp.active_id == db_idx,
693 	    ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx,
694 	    toep->ddp.active_id, toep->tid));
695 	db = &toep->ddp.db[db_idx];
696 	job = db->job;
697 
698 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
699 		/*
700 		 * This can happen due to an administrative tcpdrop(8).
701 		 * Just fail the request with ECONNRESET.
702 		 */
703 		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
704 		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
705 		if (aio_clear_cancel_function(job))
706 			ddp_complete_one(job, ECONNRESET);
707 		goto completed;
708 	}
709 
710 	tp = intotcpcb(inp);
711 
712 	/*
713 	 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
714 	 * sequence number of the next byte to receive.  The length of
715 	 * the data received for this message must be computed by
716 	 * comparing the new and old values of rcv_nxt.
717 	 *
718 	 * For RX_DATA_DDP, len might be non-zero, but it is only the
719 	 * length of the most recent DMA.  It does not include the
720 	 * total length of the data received since the previous update
721 	 * for this DDP buffer.  rcv_nxt is the sequence number of the
722 	 * first received byte from the most recent DMA.
723 	 */
724 	len += be32toh(rcv_nxt) - tp->rcv_nxt;
725 	tp->rcv_nxt += len;
726 	tp->t_rcvtime = ticks;
727 #ifndef USE_DDP_RX_FLOW_CONTROL
728 	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
729 	tp->rcv_wnd -= len;
730 #endif
731 #ifdef VERBOSE_TRACES
732 	CTR5(KTR_CXGBE, "%s: tid %u, DDP[%d] placed %d bytes (%#x)", __func__,
733 	    toep->tid, db_idx, len, report);
734 #endif
735 
736 	/* receive buffer autosize */
737 	MPASS(toep->vnet == so->so_vnet);
738 	CURVNET_SET(toep->vnet);
739 	SOCKBUF_LOCK(sb);
740 	if (sb->sb_flags & SB_AUTOSIZE &&
741 	    V_tcp_do_autorcvbuf &&
742 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
743 	    len > (sbspace(sb) / 8 * 7)) {
744 		struct adapter *sc = td_adapter(toep->td);
745 		unsigned int hiwat = sb->sb_hiwat;
746 		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
747 		    V_tcp_autorcvbuf_max);
748 
749 		if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
750 			sb->sb_flags &= ~SB_AUTOSIZE;
751 	}
752 	SOCKBUF_UNLOCK(sb);
753 	CURVNET_RESTORE();
754 
755 	job->msgrcv = 1;
756 	toep->ofld_rxq->rx_aio_ddp_jobs++;
757 	toep->ofld_rxq->rx_aio_ddp_octets += len;
758 	if (db->cancel_pending) {
759 		/*
760 		 * Update the job's length but defer completion to the
761 		 * TCB_RPL callback.
762 		 */
763 		job->aio_received += len;
764 		goto out;
765 	} else if (!aio_clear_cancel_function(job)) {
766 		/*
767 		 * Update the copied length for when
768 		 * t4_aio_cancel_active() completes this request.
769 		 */
770 		job->aio_received += len;
771 	} else {
772 		copied = job->aio_received;
773 #ifdef VERBOSE_TRACES
774 		CTR5(KTR_CXGBE,
775 		    "%s: tid %u, completing %p (copied %ld, placed %d)",
776 		    __func__, toep->tid, job, copied, len);
777 #endif
778 		aio_complete(job, copied + len, 0);
779 		t4_rcvd(&toep->td->tod, tp);
780 	}
781 
782 completed:
783 	complete_ddp_buffer(toep, db, db_idx);
784 	if (toep->ddp.waiting_count > 0)
785 		ddp_queue_toep(toep);
786 out:
787 	DDP_UNLOCK(toep);
788 	INP_WUNLOCK(inp);
789 
790 	return (0);
791 }
792 
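/*
 * Program a free DDP buffer slot with the given receive buffer and hand
 * it to the hardware.  Returns false (and recycles the buffer) if the
 * work request could not be allocated.
 */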
793 static bool
794 queue_ddp_rcvbuf(struct toepcb *toep, struct ddp_rcv_buffer *drb)
795 {
796 	struct adapter *sc = td_adapter(toep->td);
797 	struct ddp_buffer *db;
798 	struct wrqe *wr;
799 	uint64_t ddp_flags, ddp_flags_mask;
800 	int buf_flag, db_idx;
801 
802 	DDP_ASSERT_LOCKED(toep);
803 
804 	KASSERT((toep->ddp.flags & DDP_DEAD) == 0, ("%s: DDP_DEAD", __func__));
805 	KASSERT(toep->ddp.active_count < nitems(toep->ddp.db),
806 	    ("%s: no empty DDP buffer slot", __func__));
807 
808 	/* Determine which DDP buffer to use. */
809 	if (toep->ddp.db[0].drb == NULL) {
810 		db_idx = 0;
811 	} else {
812 		MPASS(toep->ddp.db[1].drb == NULL);
813 		db_idx = 1;
814 	}
815 
816 	/*
817 	 * Permit PSH to trigger a partial completion without
818 	 * invalidating the rest of the buffer, but disable the PUSH
819 	 * timer.
820 	 */
821 	ddp_flags = 0;
822 	ddp_flags_mask = 0;
823 	if (db_idx == 0) {
824 		ddp_flags |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
825 		    V_TF_DDP_PUSH_DISABLE_0(0) | V_TF_DDP_PSHF_ENABLE_0(1) |
826 		    V_TF_DDP_BUF0_VALID(1);
827 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
828 		    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) |
829 		    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1);
830 		buf_flag = DDP_BUF0_ACTIVE;
831 	} else {
832 		ddp_flags |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
833 		    V_TF_DDP_PUSH_DISABLE_1(0) | V_TF_DDP_PSHF_ENABLE_1(1) |
834 		    V_TF_DDP_BUF1_VALID(1);
835 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
836 		    V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) |
837 		    V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1);
838 		buf_flag = DDP_BUF1_ACTIVE;
839 	}
840 	MPASS((toep->ddp.flags & buf_flag) == 0);
841 	if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) {
842 		MPASS(db_idx == 0);
843 		MPASS(toep->ddp.active_id == -1);
844 		MPASS(toep->ddp.active_count == 0);
845 		ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1);
846 	}
847 
848 	/*
849 	 * The TID for this connection should still be valid.  If
850 	 * DDP_DEAD is set, SBS_CANTRCVMORE should be set, so we
851 	 * shouldn't be this far anyway.
852 	 */
853 	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, &drb->prsv, 0, drb->len,
854 	    ddp_flags, ddp_flags_mask);
855 	if (wr == NULL) {
856 		recycle_ddp_rcv_buffer(toep, drb);
857 		printf("%s: mk_update_tcb_for_ddp failed\n", __func__);
858 		return (false);
859 	}
860 
861 #ifdef VERBOSE_TRACES
862 	CTR(KTR_CXGBE,
863 	    "%s: tid %u, scheduling DDP[%d] (flags %#lx/%#lx)", __func__,
864 	    toep->tid, db_idx, ddp_flags, ddp_flags_mask);
865 #endif
866 	/*
867 	 * Hold a reference on scheduled buffers that is dropped in
868 	 * complete_ddp_buffer.
869 	 */
870 	drb->refs = 1;
871 
872 	/* Give the chip the go-ahead. */
873 	t4_wrq_tx(sc, wr);
874 	db = &toep->ddp.db[db_idx];
875 	db->drb = drb;
876 	toep->ddp.flags |= buf_flag;
877 	toep->ddp.active_count++;
878 	if (toep->ddp.active_count == 1) {
879 		MPASS(toep->ddp.active_id == -1);
880 		toep->ddp.active_id = db_idx;
881 		CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
882 		    toep->ddp.active_id);
883 	}
884 	return (true);
885 }
886 
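/*
 * Handle data placed into the active DDP buffer of a rcvbuf-mode
 * connection.  The placed bytes are appended to the socket buffer as
 * zero-copy mbufs, and a replacement receive buffer is queued if a
 * buffer slot is free.
 */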
887 static int
888 handle_ddp_data_rcvbuf(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt,
889     int len)
890 {
891 	uint32_t report = be32toh(ddp_report);
892 	struct inpcb *inp = toep->inp;
893 	struct tcpcb *tp;
894 	struct socket *so;
895 	struct sockbuf *sb;
896 	struct ddp_buffer *db;
897 	struct ddp_rcv_buffer *drb;
898 	unsigned int db_idx;
899 	bool invalidated;
900 
901 	db_idx = report & F_DDP_BUF_IDX ? 1 : 0;
902 
903 	invalidated = (report & F_DDP_INV) != 0;
904 
905 	INP_WLOCK(inp);
906 	so = inp_inpcbtosocket(inp);
907 	sb = &so->so_rcv;
908 	DDP_LOCK(toep);
909 
910 	KASSERT(toep->ddp.active_id == db_idx,
911 	    ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx,
912 	    toep->ddp.active_id, toep->tid));
913 	db = &toep->ddp.db[db_idx];
914 
915 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
916 		/*
917 		 * This can happen due to an administrative tcpdrop(8).
918 		 * Just ignore the received data.
919 		 */
920 		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
921 		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
922 		if (invalidated)
923 			complete_ddp_buffer(toep, db, db_idx);
924 		goto out;
925 	}
926 
927 	tp = intotcpcb(inp);
928 
929 	/*
930 	 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
931 	 * sequence number of the next byte to receive.  The length of
932 	 * the data received for this message must be computed by
933 	 * comparing the new and old values of rcv_nxt.
934 	 *
935 	 * For RX_DATA_DDP, len might be non-zero, but it is only the
936 	 * length of the most recent DMA.  It does not include the
937 	 * total length of the data received since the previous update
938 	 * for this DDP buffer.  rcv_nxt is the sequence number of the
939 	 * first received byte from the most recent DMA.
940 	 */
941 	len += be32toh(rcv_nxt) - tp->rcv_nxt;
942 	tp->rcv_nxt += len;
943 	tp->t_rcvtime = ticks;
944 #ifndef USE_DDP_RX_FLOW_CONTROL
945 	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
946 	tp->rcv_wnd -= len;
947 #endif
948 #ifdef VERBOSE_TRACES
949 	CTR5(KTR_CXGBE, "%s: tid %u, DDP[%d] placed %d bytes (%#x)", __func__,
950 	    toep->tid, db_idx, len, report);
951 #endif
952 
953 	/* receive buffer autosize */
954 	MPASS(toep->vnet == so->so_vnet);
955 	CURVNET_SET(toep->vnet);
956 	SOCKBUF_LOCK(sb);
957 	if (sb->sb_flags & SB_AUTOSIZE &&
958 	    V_tcp_do_autorcvbuf &&
959 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
960 	    len > (sbspace(sb) / 8 * 7)) {
961 		struct adapter *sc = td_adapter(toep->td);
962 		unsigned int hiwat = sb->sb_hiwat;
963 		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
964 		    V_tcp_autorcvbuf_max);
965 
966 		if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
967 			sb->sb_flags &= ~SB_AUTOSIZE;
968 	}
969 
970 	if (len > 0) {
971 		queue_ddp_rcvbuf_mbuf(toep, db_idx, len);
972 		t4_rcvd_locked(&toep->td->tod, tp);
973 	}
974 	sorwakeup_locked(so);
975 	SOCKBUF_UNLOCK_ASSERT(sb);
976 	CURVNET_RESTORE();
977 
978 	if (invalidated)
979 		complete_ddp_buffer(toep, db, db_idx);
980 	else
981 		KASSERT(db->placed < db->drb->len,
982 		    ("%s: full DDP buffer not invalidated", __func__));
983 
984 	if (toep->ddp.active_count != nitems(toep->ddp.db)) {
985 		drb = alloc_cached_ddp_rcv_buffer(toep);
986 		if (drb == NULL)
987 			drb = alloc_ddp_rcv_buffer(toep, M_NOWAIT);
988 		if (drb == NULL)
989 			ddp_queue_toep(toep);
990 		else if (!queue_ddp_rcvbuf(toep, drb))
991 			ddp_queue_toep(toep);
995 	}
996 out:
997 	DDP_UNLOCK(toep);
998 	INP_WUNLOCK(inp);
999 
1000 	return (0);
1001 }
1002 
1003 static int
1004 handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
1005 {
1006 	if ((toep->ddp.flags & DDP_RCVBUF) != 0)
1007 		return (handle_ddp_data_rcvbuf(toep, ddp_report, rcv_nxt, len));
1008 	else
1009 		return (handle_ddp_data_aio(toep, ddp_report, rcv_nxt, len));
1010 }
1011 
1012 void
1013 handle_ddp_indicate(struct toepcb *toep)
1014 {
1015 
1016 	DDP_ASSERT_LOCKED(toep);
1017 	if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
1018 		/*
1019 		 * Indicates are not meaningful for RCVBUF since
1020 		 * buffers are activated when the socket option is
1021 		 * set.
1022 		 */
1023 		return;
1024 	}
1025 
1026 	MPASS(toep->ddp.active_count == 0);
1027 	MPASS((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0);
1028 	if (toep->ddp.waiting_count == 0) {
1029 		/*
1030 		 * indicate were cancelled.  Those cancels should have
1031 		 * an indicate were cancelled.  Those cancels should have
1032 		 * already disabled DDP.  Just ignore this as the data is
1033 		 * going into the socket buffer anyway.
1034 		 */
1035 		return;
1036 	}
1037 	CTR3(KTR_CXGBE, "%s: tid %d indicated (%d waiting)", __func__,
1038 	    toep->tid, toep->ddp.waiting_count);
1039 	ddp_queue_toep(toep);
1040 }
1041 
1042 CTASSERT(CPL_COOKIE_DDP0 + 1 == CPL_COOKIE_DDP1);
1043 
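/*
 * Handle the CPL_SET_TCB_RPL for the TCB update issued when an active
 * DDP AIO job is cancelled: either cancel the job outright or complete
 * it with whatever data had already been placed.
 */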
1044 static int
1045 do_ddp_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1046 {
1047 	struct adapter *sc = iq->adapter;
1048 	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
1049 	unsigned int tid = GET_TID(cpl);
1050 	unsigned int db_idx;
1051 	struct toepcb *toep;
1052 	struct inpcb *inp;
1053 	struct ddp_buffer *db;
1054 	struct kaiocb *job;
1055 	long copied;
1056 
1057 	if (cpl->status != CPL_ERR_NONE)
1058 		panic("XXX: tcp_rpl failed: %d", cpl->status);
1059 
1060 	toep = lookup_tid(sc, tid);
1061 	inp = toep->inp;
1062 	switch (cpl->cookie) {
1063 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(CPL_COOKIE_DDP0):
1064 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(CPL_COOKIE_DDP1):
1065 		/*
1066 		 * XXX: This duplicates a lot of code with handle_ddp_data().
1067 		 */
1068 		KASSERT((toep->ddp.flags & DDP_AIO) != 0,
1069 		    ("%s: DDP_RCVBUF", __func__));
1070 		db_idx = G_COOKIE(cpl->cookie) - CPL_COOKIE_DDP0;
1071 		MPASS(db_idx < nitems(toep->ddp.db));
1072 		INP_WLOCK(inp);
1073 		DDP_LOCK(toep);
1074 		db = &toep->ddp.db[db_idx];
1075 
1076 		/*
1077 		 * handle_ddp_data() should leave the job around until
1078 		 * this callback runs once a cancel is pending.
1079 		 */
1080 		MPASS(db != NULL);
1081 		MPASS(db->job != NULL);
1082 		MPASS(db->cancel_pending);
1083 
1084 		/*
1085 		 * XXX: It's not clear what happens if there is data
1086 		 * placed when the buffer is invalidated.  I suspect we
1087 		 * need to read the TCB to see how much data was placed.
1088 		 *
1089 		 * For now this just pretends like nothing was placed.
1090 		 *
1091 		 * XXX: Note that if we did check the PCB we would need to
1092 		 * also take care of updating the tp, etc.
1093 		 */
1094 		job = db->job;
1095 		copied = job->aio_received;
1096 		if (copied == 0) {
1097 			CTR2(KTR_CXGBE, "%s: cancelling %p", __func__, job);
1098 			aio_cancel(job);
1099 		} else {
1100 			CTR3(KTR_CXGBE, "%s: completing %p (copied %ld)",
1101 			    __func__, job, copied);
1102 			aio_complete(job, copied, 0);
1103 			t4_rcvd(&toep->td->tod, intotcpcb(inp));
1104 		}
1105 
1106 		complete_ddp_buffer(toep, db, db_idx);
1107 		if (toep->ddp.waiting_count > 0)
1108 			ddp_queue_toep(toep);
1109 		DDP_UNLOCK(toep);
1110 		INP_WUNLOCK(inp);
1111 		break;
1112 	default:
1113 		panic("XXX: unknown tcb_rpl offset %#x, cookie %#x",
1114 		    G_WORD(cpl->cookie), G_COOKIE(cpl->cookie));
1115 	}
1116 
1117 	return (0);
1118 }
1119 
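/*
 * Account for data placed into DDP buffers just before the peer's FIN
 * and complete the affected AIO jobs (or, in rcvbuf mode, queue the
 * placed bytes to the socket buffer).
 */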
1120 void
1121 handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt)
1122 {
1123 	struct socket *so = toep->inp->inp_socket;
1124 	struct sockbuf *sb = &so->so_rcv;
1125 	struct ddp_buffer *db;
1126 	struct kaiocb *job;
1127 	long copied;
1128 	unsigned int db_idx;
1129 #ifdef INVARIANTS
1130 	unsigned int db_flag;
1131 #endif
1132 	int len, placed;
1133 	bool ddp_rcvbuf;
1134 
1135 	INP_WLOCK_ASSERT(toep->inp);
1136 	DDP_ASSERT_LOCKED(toep);
1137 
1138 	ddp_rcvbuf = (toep->ddp.flags & DDP_RCVBUF) != 0;
1139 
1140 	/* - 1 is to ignore the byte for FIN */
1141 	len = be32toh(rcv_nxt) - tp->rcv_nxt - 1;
1142 	tp->rcv_nxt += len;
1143 
1144 	CTR(KTR_CXGBE, "%s: tid %d placed %u bytes before FIN", __func__,
1145 	    toep->tid, len);
1146 	while (toep->ddp.active_count > 0) {
1147 		MPASS(toep->ddp.active_id != -1);
1148 		db_idx = toep->ddp.active_id;
1149 #ifdef INVARIANTS
1150 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
1151 #endif
1152 		MPASS((toep->ddp.flags & db_flag) != 0);
1153 		db = &toep->ddp.db[db_idx];
1154 		if (ddp_rcvbuf) {
1155 			placed = len;
1156 			if (placed > db->drb->len - db->placed)
1157 				placed = db->drb->len - db->placed;
1158 			if (placed != 0) {
1159 				SOCKBUF_LOCK(sb);
1160 				queue_ddp_rcvbuf_mbuf(toep, db_idx, placed);
1161 				sorwakeup_locked(so);
1162 				SOCKBUF_UNLOCK_ASSERT(sb);
1163 			}
1164 			complete_ddp_buffer(toep, db, db_idx);
1165 			len -= placed;
1166 			continue;
1167 		}
1168 		job = db->job;
1169 		copied = job->aio_received;
1170 		placed = len;
1171 		if (placed > job->uaiocb.aio_nbytes - copied)
1172 			placed = job->uaiocb.aio_nbytes - copied;
1173 		if (placed > 0) {
1174 			job->msgrcv = 1;
1175 			toep->ofld_rxq->rx_aio_ddp_jobs++;
1176 		}
1177 		toep->ofld_rxq->rx_aio_ddp_octets += placed;
1178 		if (!aio_clear_cancel_function(job)) {
1179 			/*
1180 			 * Update the copied length for when
1181 			 * t4_aio_cancel_active() completes this
1182 			 * request.
1183 			 */
1184 			job->aio_received += placed;
1185 		} else {
1186 			CTR4(KTR_CXGBE, "%s: tid %d completed buf %d len %d",
1187 			    __func__, toep->tid, db_idx, placed);
1188 			aio_complete(job, copied + placed, 0);
1189 		}
1190 		len -= placed;
1191 		complete_ddp_buffer(toep, db, db_idx);
1192 	}
1193 
1194 	MPASS(len == 0);
1195 	if ((toep->ddp.flags & DDP_AIO) != 0)
1196 		ddp_complete_all(toep, 0);
1197 }
1198 
1199 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
1200 	 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
1201 	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
1202 	 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)
1203 
1204 extern cpl_handler_t t4_cpl_handler[];
1205 
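/*
 * Handle CPL_RX_DATA_DDP.  iSCSI connections are redirected to the
 * iSCSI handler; otherwise the placed data is accounted for via
 * handle_ddp_data().
 */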
1206 static int
1207 do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1208 {
1209 	struct adapter *sc = iq->adapter;
1210 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
1211 	unsigned int tid = GET_TID(cpl);
1212 	uint32_t vld;
1213 	struct toepcb *toep = lookup_tid(sc, tid);
1214 
1215 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1216 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
1217 	KASSERT(!(toep->flags & TPF_SYNQE),
1218 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
1219 
1220 	vld = be32toh(cpl->ddpvld);
1221 	if (__predict_false(vld & DDP_ERR)) {
1222 		panic("%s: DDP error 0x%x (tid %d, toep %p)",
1223 		    __func__, vld, tid, toep);
1224 	}
1225 
1226 	if (ulp_mode(toep) == ULP_MODE_ISCSI) {
1227 		t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m);
1228 		return (0);
1229 	}
1230 
1231 	handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));
1232 
1233 	return (0);
1234 }
1235 
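/*
 * Handle CPL_RX_DDP_COMPLETE: a DDP buffer has completed.  The amount
 * of data placed is derived from rcv_nxt in handle_ddp_data().
 */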
1236 static int
1237 do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
1238     struct mbuf *m)
1239 {
1240 	struct adapter *sc = iq->adapter;
1241 	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
1242 	unsigned int tid = GET_TID(cpl);
1243 	struct toepcb *toep = lookup_tid(sc, tid);
1244 
1245 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1246 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
1247 	KASSERT(!(toep->flags & TPF_SYNQE),
1248 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
1249 
1250 	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);
1251 
1252 	return (0);
1253 }
1254 
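/*
 * Switch a connection to ULP_MODE_TCPDDP by rewriting the relevant TCB
 * fields with a single atomic compound work request.  Returns false if
 * DDP is disabled by the tunable or the work request cannot be
 * allocated.
 */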
1255 static bool
1256 set_ddp_ulp_mode(struct toepcb *toep)
1257 {
1258 	struct adapter *sc = toep->vi->adapter;
1259 	struct wrqe *wr;
1260 	struct work_request_hdr *wrh;
1261 	struct ulp_txpkt *ulpmc;
1262 	int fields, len;
1263 
1264 	if (!sc->tt.ddp)
1265 		return (false);
1266 
1267 	fields = 0;
1268 
1269 	/* Overlay region including W_TCB_RX_DDP_FLAGS */
1270 	fields += 3;
1271 
1272 	/* W_TCB_ULP_TYPE */
1273 	fields++;
1274 
1275 #ifdef USE_DDP_RX_FLOW_CONTROL
1276 	/* W_TCB_T_FLAGS */
1277 	fields++;
1278 #endif
1279 
1280 	len = sizeof(*wrh) + fields * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
1281 	KASSERT(len <= SGE_MAX_WR_LEN,
1282 	    ("%s: WR with %d TCB field updates too large", __func__, fields));
1283 
1284 	wr = alloc_wrqe(len, toep->ctrlq);
1285 	if (wr == NULL)
1286 		return (false);
1287 
1288 	CTR(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
1289 
1290 	wrh = wrtod(wr);
1291 	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
1292 	ulpmc = (struct ulp_txpkt *)(wrh + 1);
1293 
1294 	/*
1295 	 * Words 26/27 are zero except for the DDP_OFF flag in
1296 	 * W_TCB_RX_DDP_FLAGS (27).
1297 	 */
1298 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 26,
1299 	    0xffffffffffffffff, (uint64_t)V_TF_DDP_OFF(1) << 32);
1300 
1301 	/* Words 28/29 are zero. */
1302 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 28,
1303 	    0xffffffffffffffff, 0);
1304 
1305 	/* Words 30/31 are zero. */
1306 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 30,
1307 	    0xffffffffffffffff, 0);
1308 
1309 	/* Set the ULP mode to ULP_MODE_TCPDDP. */
1310 	toep->params.ulp_mode = ULP_MODE_TCPDDP;
1311 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_ULP_TYPE,
1312 	    V_TCB_ULP_TYPE(M_TCB_ULP_TYPE),
1313 	    V_TCB_ULP_TYPE(ULP_MODE_TCPDDP));
1314 
1315 #ifdef USE_DDP_RX_FLOW_CONTROL
1316 	/* Set TF_RX_FLOW_CONTROL_DDP. */
1317 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_T_FLAGS,
1318 	    V_TF_RX_FLOW_CONTROL_DDP(1), V_TF_RX_FLOW_CONTROL_DDP(1));
1319 #endif
1320 
1321 	ddp_init_toep(toep);
1322 
1323 	t4_wrq_tx(sc, wr);
1324 	return (true);
1325 }
1326 
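/*
 * Turn DDP on for a connection by clearing TF_DDP_OFF (and, for AIO
 * mode, enabling indications) and disabling receive coalescing.
 */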
1327 static void
1328 enable_ddp(struct adapter *sc, struct toepcb *toep)
1329 {
1330 	uint64_t ddp_flags;
1331 
1332 	KASSERT((toep->ddp.flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
1333 	    ("%s: toep %p has bad ddp_flags 0x%x",
1334 	    __func__, toep, toep->ddp.flags));
1335 
1336 	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
1337 	    __func__, toep->tid, time_uptime);
1338 
1339 	ddp_flags = 0;
1340 	if ((toep->ddp.flags & DDP_AIO) != 0)
1341 		ddp_flags |= V_TF_DDP_BUF0_INDICATE(1) |
1342 		    V_TF_DDP_BUF1_INDICATE(1);
1343 	DDP_ASSERT_LOCKED(toep);
1344 	toep->ddp.flags |= DDP_SC_REQ;
1345 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_RX_DDP_FLAGS,
1346 	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
1347 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
1348 	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1), ddp_flags, 0, 0);
1349 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
1350 	    V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0);
1351 }
1352 
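/*
 * Highest common factor of two segment lengths, via Euclid's algorithm
 * (e.g. calculate_hcf(16384, 24576) == 8192).
 */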
1353 static int
1354 calculate_hcf(int n1, int n2)
1355 {
1356 	int a, b, t;
1357 
1358 	if (n1 <= n2) {
1359 		a = n1;
1360 		b = n2;
1361 	} else {
1362 		a = n2;
1363 		b = n1;
1364 	}
1365 
1366 	while (a != 0) {
1367 		t = a;
1368 		a = b % a;
1369 		b = t;
1370 	}
1371 
1372 	return (b);
1373 }
1374 
1375 static inline int
1376 pages_to_nppods(int npages, int ddp_page_shift)
1377 {
1378 
1379 	MPASS(ddp_page_shift >= PAGE_SHIFT);
1380 
1381 	return (howmany(npages >> (ddp_page_shift - PAGE_SHIFT), PPOD_PAGES));
1382 }
1383 
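/*
 * Reserve 'nppods' page pods from the region's vmem arena and record
 * the resulting DDP tag (which encodes the page-size index) in the
 * reservation.
 */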
1384 static int
1385 alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx,
1386     struct ppod_reservation *prsv)
1387 {
1388 	vmem_addr_t addr;       /* relative to start of region */
1389 
1390 	if (vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT,
1391 	    &addr) != 0)
1392 		return (ENOMEM);
1393 
1394 #ifdef VERBOSE_TRACES
1395 	CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d",
1396 	    __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask,
1397 	    nppods, 1 << pr->pr_page_shift[pgsz_idx]);
1398 #endif
1399 
1400 	/*
1401 	 * The hardware tagmask includes an extra invalid bit but the arena was
1402 	 * seeded with valid values only.  An allocation out of this arena will
1403 	 * fit inside the tagmask but won't have the invalid bit set.
1404 	 */
1405 	MPASS((addr & pr->pr_tag_mask) == addr);
1406 	MPASS((addr & pr->pr_invalid_bit) == 0);
1407 
1408 	prsv->prsv_pr = pr;
1409 	prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr;
1410 	prsv->prsv_nppods = nppods;
1411 
1412 	return (0);
1413 }
1414 
1415 static int
1416 t4_alloc_page_pods_for_vmpages(struct ppod_region *pr, vm_page_t *pages,
1417     int npages, struct ppod_reservation *prsv)
1418 {
1419 	int i, hcf, seglen, idx, nppods;
1420 
1421 	/*
1422 	 * The DDP page size is unrelated to the VM page size.  We combine
1423 	 * contiguous physical pages into larger segments to get the best DDP
1424 	 * page size possible.  This is the largest of the four sizes in
1425 	 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
1426 	 * the page list.
1427 	 */
1428 	hcf = 0;
1429 	for (i = 0; i < npages; i++) {
1430 		seglen = PAGE_SIZE;
1431 		while (i < npages - 1 &&
1432 		    VM_PAGE_TO_PHYS(pages[i]) + PAGE_SIZE ==
1433 		    VM_PAGE_TO_PHYS(pages[i + 1])) {
1434 			seglen += PAGE_SIZE;
1435 			i++;
1436 		}
1437 
1438 		hcf = calculate_hcf(hcf, seglen);
1439 		if (hcf < (1 << pr->pr_page_shift[1])) {
1440 			idx = 0;
1441 			goto have_pgsz;	/* give up, short circuit */
1442 		}
1443 	}
1444 
1445 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
1446 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
1447 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
1448 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
1449 			break;
1450 	}
1451 #undef PR_PAGE_MASK
1452 
1453 have_pgsz:
1454 	MPASS(idx <= M_PPOD_PGSZ);
1455 
1456 	nppods = pages_to_nppods(npages, pr->pr_page_shift[idx]);
1457 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
1458 		return (ENOMEM);
1459 	MPASS(prsv->prsv_nppods > 0);
1460 
1461 	return (0);
1462 }
1463 
1464 int
1465 t4_alloc_page_pods_for_ps(struct ppod_region *pr, struct pageset *ps)
1466 {
1467 	struct ppod_reservation *prsv = &ps->prsv;
1468 
1469 	KASSERT(prsv->prsv_nppods == 0,
1470 	    ("%s: page pods already allocated", __func__));
1471 
1472 	return (t4_alloc_page_pods_for_vmpages(pr, ps->pages, ps->npages,
1473 	    prsv));
1474 }
1475 
1476 int
1477 t4_alloc_page_pods_for_bio(struct ppod_region *pr, struct bio *bp,
1478     struct ppod_reservation *prsv)
1479 {
1480 
1481 	MPASS(bp->bio_flags & BIO_UNMAPPED);
1482 
1483 	return (t4_alloc_page_pods_for_vmpages(pr, bp->bio_ma, bp->bio_ma_n,
1484 	    prsv));
1485 }
1486 
1487 int
1488 t4_alloc_page_pods_for_buf(struct ppod_region *pr, vm_offset_t buf, int len,
1489     struct ppod_reservation *prsv)
1490 {
1491 	int hcf, seglen, idx, npages, nppods;
1492 	uintptr_t start_pva, end_pva, pva, p1;
1493 
1494 	MPASS(buf > 0);
1495 	MPASS(len > 0);
1496 
1497 	/*
1498 	 * The DDP page size is unrelated to the VM page size.  We combine
1499 	 * contiguous physical pages into larger segments to get the best DDP
1500 	 * page size possible.  This is the largest of the four sizes in
1501 	 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
1502 	 * in the page list.
1503 	 */
1504 	hcf = 0;
1505 	start_pva = trunc_page(buf);
1506 	end_pva = trunc_page(buf + len - 1);
1507 	pva = start_pva;
1508 	while (pva <= end_pva) {
1509 		seglen = PAGE_SIZE;
1510 		p1 = pmap_kextract(pva);
1511 		pva += PAGE_SIZE;
1512 		while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) {
1513 			seglen += PAGE_SIZE;
1514 			pva += PAGE_SIZE;
1515 		}
1516 
1517 		hcf = calculate_hcf(hcf, seglen);
1518 		if (hcf < (1 << pr->pr_page_shift[1])) {
1519 			idx = 0;
1520 			goto have_pgsz;	/* give up, short circuit */
1521 		}
1522 	}
1523 
1524 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
1525 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
1526 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
1527 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
1528 			break;
1529 	}
1530 #undef PR_PAGE_MASK
1531 
1532 have_pgsz:
1533 	MPASS(idx <= M_PPOD_PGSZ);
1534 
1535 	npages = 1;
1536 	npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
1537 	nppods = howmany(npages, PPOD_PAGES);
1538 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
1539 		return (ENOMEM);
1540 	MPASS(prsv->prsv_nppods > 0);
1541 
1542 	return (0);
1543 }
1544 
1545 static int
1546 t4_alloc_page_pods_for_rcvbuf(struct ppod_region *pr,
1547     struct ddp_rcv_buffer *drb)
1548 {
1549 	struct ppod_reservation *prsv = &drb->prsv;
1550 
1551 	KASSERT(prsv->prsv_nppods == 0,
1552 	    ("%s: page pods already allocated", __func__));
1553 
1554 	return (t4_alloc_page_pods_for_buf(pr, (vm_offset_t)drb->buf, drb->len,
1555 	    prsv));
1556 }
1557 
1558 int
1559 t4_alloc_page_pods_for_sgl(struct ppod_region *pr, struct ctl_sg_entry *sgl,
1560     int entries, struct ppod_reservation *prsv)
1561 {
1562 	int hcf, seglen, idx = 0, npages, nppods, i, len;
1563 	uintptr_t start_pva, end_pva, pva, p1;
1564 	vm_offset_t buf;
1565 	struct ctl_sg_entry *sge;
1566 
1567 	MPASS(entries > 0);
1568 	MPASS(sgl);
1569 
1570 	/*
1571 	 * The DDP page size is unrelated to the VM page size.  We combine
1572 	 * contiguous physical pages into larger segments to get the best DDP
1573 	 * page size possible.  This is the largest of the four sizes in
1574 	 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
1575 	 * in the page list.
1576 	 */
1577 	hcf = 0;
1578 	for (i = entries - 1; i >= 0; i--) {
1579 		sge = sgl + i;
1580 		buf = (vm_offset_t)sge->addr;
1581 		len = sge->len;
1582 		start_pva = trunc_page(buf);
1583 		end_pva = trunc_page(buf + len - 1);
1584 		pva = start_pva;
1585 		while (pva <= end_pva) {
1586 			seglen = PAGE_SIZE;
1587 			p1 = pmap_kextract(pva);
1588 			pva += PAGE_SIZE;
1589 			while (pva <= end_pva && p1 + seglen ==
1590 			    pmap_kextract(pva)) {
1591 				seglen += PAGE_SIZE;
1592 				pva += PAGE_SIZE;
1593 			}
1594 
1595 			hcf = calculate_hcf(hcf, seglen);
1596 			if (hcf < (1 << pr->pr_page_shift[1])) {
1597 				idx = 0;
1598 				goto have_pgsz; /* give up, short circuit */
1599 			}
1600 		}
1601 	}
1602 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
1603 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
1604 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
1605 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
1606 			break;
1607 	}
1608 #undef PR_PAGE_MASK
1609 
1610 have_pgsz:
1611 	MPASS(idx <= M_PPOD_PGSZ);
1612 
1613 	npages = 0;
1614 	while (entries--) {
1615 		npages++;
1616 		start_pva = trunc_page((vm_offset_t)sgl->addr);
1617 		end_pva = trunc_page((vm_offset_t)sgl->addr + sgl->len - 1);
1618 		npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
1619 		sgl = sgl + 1;
1620 	}
1621 	nppods = howmany(npages, PPOD_PAGES);
1622 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
1623 		return (ENOMEM);
1624 	MPASS(prsv->prsv_nppods > 0);
1625 	return (0);
1626 }
1627 
1628 void
1629 t4_free_page_pods(struct ppod_reservation *prsv)
1630 {
1631 	struct ppod_region *pr = prsv->prsv_pr;
1632 	vmem_addr_t addr;
1633 
1634 	MPASS(prsv != NULL);
1635 	MPASS(prsv->prsv_nppods != 0);
1636 
1637 	addr = prsv->prsv_tag & pr->pr_tag_mask;
1638 	MPASS((addr & pr->pr_invalid_bit) == 0);
1639 
1640 #ifdef VERBOSE_TRACES
1641 	CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__,
1642 	    pr->pr_arena, addr, prsv->prsv_nppods);
1643 #endif
1644 
1645 	vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods));
1646 	prsv->prsv_nppods = 0;
1647 }
1648 
1649 #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
1650 
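/*
 * Write the page pods describing a pageset to card memory using
 * ULP_TX_MEM_WRITE work requests, at most NUM_ULP_TX_SC_IMM_PPODS pods
 * per request.
 */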
1651 int
1652 t4_write_page_pods_for_ps(struct adapter *sc, struct sge_wrq *wrq, int tid,
1653     struct pageset *ps)
1654 {
1655 	struct wrqe *wr;
1656 	struct ulp_mem_io *ulpmc;
1657 	struct ulptx_idata *ulpsc;
1658 	struct pagepod *ppod;
1659 	int i, j, k, n, chunk, len, ddp_pgsz, idx;
1660 	u_int ppod_addr;
1661 	uint32_t cmd;
1662 	struct ppod_reservation *prsv = &ps->prsv;
1663 	struct ppod_region *pr = prsv->prsv_pr;
1664 	vm_paddr_t pa;
1665 
1666 	KASSERT(!(ps->flags & PS_PPODS_WRITTEN),
1667 	    ("%s: page pods already written", __func__));
1668 	MPASS(prsv->prsv_nppods > 0);
1669 
1670 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
1671 	if (is_t4(sc))
1672 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
1673 	else
1674 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
1675 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
1676 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
1677 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
1678 		/* How many page pods are we writing in this cycle */
1679 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
1680 		chunk = PPOD_SZ(n);
1681 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
1682 
1683 		wr = alloc_wrqe(len, wrq);
1684 		if (wr == NULL)
1685 			return (ENOMEM);	/* ok to just bail out */
1686 		ulpmc = wrtod(wr);
1687 
1688 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
1689 		ulpmc->cmd = cmd;
1690 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
1691 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
1692 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
1693 
1694 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
1695 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
1696 		ulpsc->len = htobe32(chunk);
1697 
1698 		ppod = (struct pagepod *)(ulpsc + 1);
1699 		for (j = 0; j < n; i++, j++, ppod++) {
1700 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
1701 			    V_PPOD_TID(tid) | prsv->prsv_tag);
1702 			ppod->len_offset = htobe64(V_PPOD_LEN(ps->len) |
1703 			    V_PPOD_OFST(ps->offset));
1704 			ppod->rsvd = 0;
1705 			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
1706 			for (k = 0; k < nitems(ppod->addr); k++) {
1707 				if (idx < ps->npages) {
1708 					pa = VM_PAGE_TO_PHYS(ps->pages[idx]);
1709 					ppod->addr[k] = htobe64(pa);
1710 					idx += ddp_pgsz / PAGE_SIZE;
1711 				} else
1712 					ppod->addr[k] = 0;
1713 #if 0
1714 				CTR5(KTR_CXGBE,
1715 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
1716 				    __func__, tid, i, k,
1717 				    be64toh(ppod->addr[k]));
1718 #endif
1719 			}
1720 
1721 		}
1722 
1723 		t4_wrq_tx(sc, wr);
1724 	}
1725 	ps->flags |= PS_PPODS_WRITTEN;
1726 
1727 	return (0);
1728 }
1729 
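/*
 * Same as t4_write_page_pods_for_ps() but for a DDP receive buffer,
 * whose pages are resolved with pmap_kextract() on its KVA instead of
 * coming from a vm_page array.
 */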
1730 static int
1731 t4_write_page_pods_for_rcvbuf(struct adapter *sc, struct sge_wrq *wrq, int tid,
1732     struct ddp_rcv_buffer *drb)
1733 {
1734 	struct wrqe *wr;
1735 	struct ulp_mem_io *ulpmc;
1736 	struct ulptx_idata *ulpsc;
1737 	struct pagepod *ppod;
1738 	int i, j, k, n, chunk, len, ddp_pgsz;
1739 	u_int ppod_addr, offset;
1740 	uint32_t cmd;
1741 	struct ppod_reservation *prsv = &drb->prsv;
1742 	struct ppod_region *pr = prsv->prsv_pr;
1743 	uintptr_t end_pva, pva;
1744 	vm_paddr_t pa;
1745 
1746 	MPASS(prsv->prsv_nppods > 0);
1747 
1748 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
1749 	if (is_t4(sc))
1750 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
1751 	else
1752 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
1753 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
1754 	offset = (uintptr_t)drb->buf & PAGE_MASK;
1755 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
1756 	pva = trunc_page((uintptr_t)drb->buf);
1757 	end_pva = trunc_page((uintptr_t)drb->buf + drb->len - 1);
1758 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
1759 		/* How many page pods are we writing in this cycle */
1760 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
1761 		MPASS(n > 0);
1762 		chunk = PPOD_SZ(n);
1763 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
1764 
1765 		wr = alloc_wrqe(len, wrq);
1766 		if (wr == NULL)
1767 			return (ENOMEM);	/* ok to just bail out */
1768 		ulpmc = wrtod(wr);
1769 
1770 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
1771 		ulpmc->cmd = cmd;
1772 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
1773 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
1774 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
1775 
1776 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
1777 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
1778 		ulpsc->len = htobe32(chunk);
1779 
1780 		ppod = (struct pagepod *)(ulpsc + 1);
1781 		for (j = 0; j < n; i++, j++, ppod++) {
1782 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
1783 			    V_PPOD_TID(tid) | prsv->prsv_tag);
1784 			ppod->len_offset = htobe64(V_PPOD_LEN(drb->len) |
1785 			    V_PPOD_OFST(offset));
1786 			ppod->rsvd = 0;
1787 
1788 			for (k = 0; k < nitems(ppod->addr); k++) {
1789 				if (pva > end_pva)
1790 					ppod->addr[k] = 0;
1791 				else {
1792 					pa = pmap_kextract(pva);
1793 					ppod->addr[k] = htobe64(pa);
1794 					pva += ddp_pgsz;
1795 				}
1796 #if 0
1797 				CTR5(KTR_CXGBE,
1798 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
1799 				    __func__, tid, i, k,
1800 				    be64toh(ppod->addr[k]));
1801 #endif
1802 			}
1803 
1804 			/*
1805 			 * Walk back 1 segment so that the first address in the
1806 			 * next pod is the same as the last one in the current
1807 			 * pod.
1808 			 */
1809 			pva -= ddp_pgsz;
1810 		}
1811 
1812 		t4_wrq_tx(sc, wr);
1813 	}
1814 
1815 	MPASS(pva <= end_pva);
1816 
1817 	return (0);
1818 }
1819 
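/*
 * Allocate an mbuf to carry a raw work request of 'len' bytes and mark it
 * as such.  Returns NULL if 'len' does not fit in a cluster or if the
 * allocation fails.
 */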
1820 static struct mbuf *
1821 alloc_raw_wr_mbuf(int len)
1822 {
1823 	struct mbuf *m;
1824 
1825 	if (len <= MHLEN)
1826 		m = m_gethdr(M_NOWAIT, MT_DATA);
1827 	else if (len <= MCLBYTES)
1828 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
1829 	else
1830 		m = NULL;
1831 	if (m == NULL)
1832 		return (NULL);
1833 	m->m_pkthdr.len = len;
1834 	m->m_len = len;
1835 	set_mbuf_raw_wr(m, true);
1836 	return (m);
1837 }
1838 
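/*
 * Build the work requests that write the page pods for an unmapped bio.
 * The requests are placed in raw-WR mbufs and appended to 'wrq'; the
 * caller is responsible for transmitting them.
 */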
1839 int
1840 t4_write_page_pods_for_bio(struct adapter *sc, struct toepcb *toep,
1841     struct ppod_reservation *prsv, struct bio *bp, struct mbufq *wrq)
1842 {
1843 	struct ulp_mem_io *ulpmc;
1844 	struct ulptx_idata *ulpsc;
1845 	struct pagepod *ppod;
1846 	int i, j, k, n, chunk, len, ddp_pgsz, idx;
1847 	u_int ppod_addr;
1848 	uint32_t cmd;
1849 	struct ppod_region *pr = prsv->prsv_pr;
1850 	vm_paddr_t pa;
1851 	struct mbuf *m;
1852 
1853 	MPASS(bp->bio_flags & BIO_UNMAPPED);
1854 
1855 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
1856 	if (is_t4(sc))
1857 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
1858 	else
1859 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
1860 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
1861 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
1862 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
1863 
1864 		/* How many page pods are we writing in this cycle */
1865 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
1866 		MPASS(n > 0);
1867 		chunk = PPOD_SZ(n);
1868 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
1869 
1870 		m = alloc_raw_wr_mbuf(len);
1871 		if (m == NULL)
1872 			return (ENOMEM);
1873 
1874 		ulpmc = mtod(m, struct ulp_mem_io *);
1875 		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
1876 		ulpmc->cmd = cmd;
1877 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
1878 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
1879 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
1880 
1881 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
1882 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
1883 		ulpsc->len = htobe32(chunk);
1884 
1885 		ppod = (struct pagepod *)(ulpsc + 1);
1886 		for (j = 0; j < n; i++, j++, ppod++) {
1887 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
1888 			    V_PPOD_TID(toep->tid) |
1889 			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
1890 			ppod->len_offset = htobe64(V_PPOD_LEN(bp->bio_bcount) |
1891 			    V_PPOD_OFST(bp->bio_ma_offset));
1892 			ppod->rsvd = 0;
1893 			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
1894 			for (k = 0; k < nitems(ppod->addr); k++) {
1895 				if (idx < bp->bio_ma_n) {
1896 					pa = VM_PAGE_TO_PHYS(bp->bio_ma[idx]);
1897 					ppod->addr[k] = htobe64(pa);
1898 					idx += ddp_pgsz / PAGE_SIZE;
1899 				} else
1900 					ppod->addr[k] = 0;
1901 #if 0
1902 				CTR5(KTR_CXGBE,
1903 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
1904 				    __func__, toep->tid, i, k,
1905 				    be64toh(ppod->addr[k]));
1906 #endif
1907 			}
1908 		}
1909 
1910 		mbufq_enqueue(wrq, m);
1911 	}
1912 
1913 	return (0);
1914 }
1915 
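/*
 * Like t4_write_page_pods_for_bio(), but for a contiguous kernel virtual
 * buffer whose physical addresses are resolved with pmap_kextract().  The
 * buffer is walked one DDP page (ddp_pgsz) at a time; the last addr[]
 * slot of each pod is repeated as the first slot of the next pod, which
 * is why 'pva' is walked back one DDP page at the end of each pod.
 */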
1916 int
1917 t4_write_page_pods_for_buf(struct adapter *sc, struct toepcb *toep,
1918     struct ppod_reservation *prsv, vm_offset_t buf, int buflen,
1919     struct mbufq *wrq)
1920 {
1921 	struct ulp_mem_io *ulpmc;
1922 	struct ulptx_idata *ulpsc;
1923 	struct pagepod *ppod;
1924 	int i, j, k, n, chunk, len, ddp_pgsz;
1925 	u_int ppod_addr, offset;
1926 	uint32_t cmd;
1927 	struct ppod_region *pr = prsv->prsv_pr;
1928 	uintptr_t end_pva, pva;
1929 	vm_paddr_t pa;
1930 	struct mbuf *m;
1931 
1932 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
1933 	if (is_t4(sc))
1934 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
1935 	else
1936 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
1937 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
1938 	offset = buf & PAGE_MASK;
1939 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
1940 	pva = trunc_page(buf);
1941 	end_pva = trunc_page(buf + buflen - 1);
1942 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
1943 
1944 		/* How many page pods are we writing in this cycle */
1945 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
1946 		MPASS(n > 0);
1947 		chunk = PPOD_SZ(n);
1948 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
1949 
1950 		m = alloc_raw_wr_mbuf(len);
1951 		if (m == NULL)
1952 			return (ENOMEM);
1953 		ulpmc = mtod(m, struct ulp_mem_io *);
1954 
1955 		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
1956 		ulpmc->cmd = cmd;
1957 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
1958 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
1959 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
1960 
1961 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
1962 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
1963 		ulpsc->len = htobe32(chunk);
1964 
1965 		ppod = (struct pagepod *)(ulpsc + 1);
1966 		for (j = 0; j < n; i++, j++, ppod++) {
1967 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
1968 			    V_PPOD_TID(toep->tid) |
1969 			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
1970 			ppod->len_offset = htobe64(V_PPOD_LEN(buflen) |
1971 			    V_PPOD_OFST(offset));
1972 			ppod->rsvd = 0;
1973 
1974 			for (k = 0; k < nitems(ppod->addr); k++) {
1975 				if (pva > end_pva)
1976 					ppod->addr[k] = 0;
1977 				else {
1978 					pa = pmap_kextract(pva);
1979 					ppod->addr[k] = htobe64(pa);
1980 					pva += ddp_pgsz;
1981 				}
1982 #if 0
1983 				CTR5(KTR_CXGBE,
1984 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
1985 				    __func__, toep->tid, i, k,
1986 				    be64toh(ppod->addr[k]));
1987 #endif
1988 			}
1989 
1990 			/*
1991 			 * Walk back 1 segment so that the first address in the
1992 			 * next pod is the same as the last one in the current
1993 			 * pod.
1994 			 */
1995 			pva -= ddp_pgsz;
1996 		}
1997 
1998 		mbufq_enqueue(wrq, m);
1999 	}
2000 
2001 	MPASS(pva <= end_pva);
2002 
2003 	return (0);
2004 }
2005 
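/*
 * Write page pods for a CTL scatter/gather list describing 'xferlen'
 * bytes.  The pod offset comes from the first sgl entry; addresses are
 * taken by walking each entry in ddp_pgsz steps, moving to the next
 * entry once the current one is exhausted.  The last addr[] slot of a
 * pod is reused as the first slot of the next pod.
 */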
2006 int
2007 t4_write_page_pods_for_sgl(struct adapter *sc, struct toepcb *toep,
2008     struct ppod_reservation *prsv, struct ctl_sg_entry *sgl, int entries,
2009     int xferlen, struct mbufq *wrq)
2010 {
2011 	struct ulp_mem_io *ulpmc;
2012 	struct ulptx_idata *ulpsc;
2013 	struct pagepod *ppod;
2014 	int i, j, k, n, chunk, len, ddp_pgsz;
2015 	u_int ppod_addr, offset, sg_offset = 0;
2016 	uint32_t cmd;
2017 	struct ppod_region *pr = prsv->prsv_pr;
2018 	uintptr_t pva;
2019 	vm_paddr_t pa;
2020 	struct mbuf *m;
2021 
2022 	MPASS(sgl != NULL);
2023 	MPASS(entries > 0);
2024 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
2025 	if (is_t4(sc))
2026 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
2027 	else
2028 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
2029 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
2030 	offset = (vm_offset_t)sgl->addr & PAGE_MASK;
2031 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
2032 	pva = trunc_page((vm_offset_t)sgl->addr);
2033 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
2034 
2035 		/* How many page pods are we writing in this cycle */
2036 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
2037 		MPASS(n > 0);
2038 		chunk = PPOD_SZ(n);
2039 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
2040 
2041 		m = alloc_raw_wr_mbuf(len);
2042 		if (m == NULL)
2043 			return (ENOMEM);
2044 		ulpmc = mtod(m, struct ulp_mem_io *);
2045 
2046 		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
2047 		ulpmc->cmd = cmd;
2048 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
2049 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
2050 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
2051 
2052 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
2053 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
2054 		ulpsc->len = htobe32(chunk);
2055 
2056 		ppod = (struct pagepod *)(ulpsc + 1);
2057 		for (j = 0; j < n; i++, j++, ppod++) {
2058 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
2059 			    V_PPOD_TID(toep->tid) |
2060 			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
2061 			ppod->len_offset = htobe64(V_PPOD_LEN(xferlen) |
2062 			    V_PPOD_OFST(offset));
2063 			ppod->rsvd = 0;
2064 
2065 			for (k = 0; k < nitems(ppod->addr); k++) {
2066 				if (entries != 0) {
2067 					pa = pmap_kextract(pva + sg_offset);
2068 					ppod->addr[k] = htobe64(pa);
2069 				} else
2070 					ppod->addr[k] = 0;
2071 
2072 #if 0
2073 				CTR5(KTR_CXGBE,
2074 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
2075 				    __func__, toep->tid, i, k,
2076 				    be64toh(ppod->addr[k]));
2077 #endif
2078 
2079 				/*
2080 				 * If this is the last entry in a pod,
2081 				 * reuse the same entry as the first
2082 				 * address in the next pod.
2083 				 */
2084 				if (k + 1 == nitems(ppod->addr))
2085 					break;
2086 
2087 				/*
2088 				 * Don't move to the next DDP page if the
2089 				 * sgl is already finished.
2090 				 */
2091 				if (entries == 0)
2092 					continue;
2093 
2094 				sg_offset += ddp_pgsz;
2095 				if (sg_offset == sgl->len) {
2096 					/*
2097 					 * This sgl entry is done.  Go
2098 					 * to the next.
2099 					 */
2100 					entries--;
2101 					sgl++;
2102 					sg_offset = 0;
2103 					if (entries != 0)
2104 						pva = trunc_page(
2105 						    (vm_offset_t)sgl->addr);
2106 				}
2107 			}
2108 		}
2109 
2110 		mbufq_enqueue(wrq, m);
2111 	}
2112 
2113 	return (0);
2114 }
2115 
2116 /*
2117  * Prepare a pageset for DDP.  Sets up page pods; 1 on success, 0 on failure.
2118  */
2119 static int
2120 prep_pageset(struct adapter *sc, struct toepcb *toep, struct pageset *ps)
2121 {
2122 	struct tom_data *td = sc->tom_softc;
2123 
2124 	if (ps->prsv.prsv_nppods == 0 &&
2125 	    t4_alloc_page_pods_for_ps(&td->pr, ps) != 0) {
2126 		return (0);
2127 	}
2128 	if (!(ps->flags & PS_PPODS_WRITTEN) &&
2129 	    t4_write_page_pods_for_ps(sc, toep->ctrlq, toep->tid, ps) != 0) {
2130 		return (0);
2131 	}
2132 
2133 	return (1);
2134 }
2135 
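/*
 * Initialize a page pod region covering the adapter memory range 'r'.
 * 'psz' carries the four HPZ page-size codes; the resulting page shifts
 * are 12 + HPZ and must be strictly increasing (e.g., HPZ codes of
 * 0/4/8/12 would give 4KB/64KB/1MB/16MB DDP pages).  A vmem arena is
 * created for allocating runs of pods, and the DDP tag space is split
 * into an offset portion (pr_tag_mask) and the remaining alias bits.
 */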
2136 int
2137 t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz,
2138     const char *name)
2139 {
2140 	int i;
2141 
2142 	MPASS(pr != NULL);
2143 	MPASS(r->size > 0);
2144 
2145 	pr->pr_start = r->start;
2146 	pr->pr_len = r->size;
2147 	pr->pr_page_shift[0] = 12 + G_HPZ0(psz);
2148 	pr->pr_page_shift[1] = 12 + G_HPZ1(psz);
2149 	pr->pr_page_shift[2] = 12 + G_HPZ2(psz);
2150 	pr->pr_page_shift[3] = 12 + G_HPZ3(psz);
2151 
2152 	/* The SGL -> page pod algorithm requires the sizes to be in order. */
2153 	for (i = 1; i < nitems(pr->pr_page_shift); i++) {
2154 		if (pr->pr_page_shift[i] <= pr->pr_page_shift[i - 1])
2155 			return (ENXIO);
2156 	}
2157 
2158 	pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG);
2159 	pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask;
2160 	if (pr->pr_tag_mask == 0 || pr->pr_alias_mask == 0)
2161 		return (ENXIO);
2162 	pr->pr_alias_shift = fls(pr->pr_tag_mask);
2163 	pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1);
2164 
2165 	pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0,
2166 	    M_FIRSTFIT | M_NOWAIT);
2167 	if (pr->pr_arena == NULL)
2168 		return (ENOMEM);
2169 
2170 	return (0);
2171 }
2172 
2173 void
2174 t4_free_ppod_region(struct ppod_region *pr)
2175 {
2176 
2177 	MPASS(pr != NULL);
2178 
2179 	if (pr->pr_arena)
2180 		vmem_destroy(pr->pr_arena);
2181 	bzero(pr, sizeof(*pr));
2182 }
2183 
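/*
 * Returns 0 if the cached pageset describes the same user buffer: same
 * starting page, page count, offset, length, vmspace, and VM map
 * generation.  Returns non-zero otherwise.
 */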
2184 static int
2185 pscmp(struct pageset *ps, struct vmspace *vm, vm_offset_t start, int npages,
2186     int pgoff, int len)
2187 {
2188 
2189 	if (ps->start != start || ps->npages != npages ||
2190 	    ps->offset != pgoff || ps->len != len)
2191 		return (1);
2192 
2193 	return (ps->vm != vm || ps->vm_timestamp != vm->vm_map.timestamp);
2194 }
2195 
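/*
 * Find or create a pageset wiring the user pages that back an AIO read.
 * Oversized requests are truncated to MAX_DDP_BUFFER_SIZE (rounded down
 * to a cache line).  A matching cached pageset is reused when possible;
 * otherwise the DDP lock is dropped while the pages are faulted in and
 * held.
 */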
2196 static int
2197 hold_aio(struct toepcb *toep, struct kaiocb *job, struct pageset **pps)
2198 {
2199 	struct vmspace *vm;
2200 	vm_map_t map;
2201 	vm_offset_t start, end, pgoff;
2202 	struct pageset *ps;
2203 	int n;
2204 
2205 	DDP_ASSERT_LOCKED(toep);
2206 
2207 	/*
2208 	 * The AIO subsystem will cancel and drain all requests before
2209 	 * permitting a process to exit or exec, so p_vmspace should
2210 	 * be stable here.
2211 	 */
2212 	vm = job->userproc->p_vmspace;
2213 	map = &vm->vm_map;
2214 	start = (uintptr_t)job->uaiocb.aio_buf;
2215 	pgoff = start & PAGE_MASK;
2216 	end = round_page(start + job->uaiocb.aio_nbytes);
2217 	start = trunc_page(start);
2218 
2219 	if (end - start > MAX_DDP_BUFFER_SIZE) {
2220 		/*
2221 		 * Truncate the request to a short read.
2222 		 * Alternatively, we could DDP in chunks to the larger
2223 		 * buffer, but that would be quite a bit more work.
2224 		 *
2225 		 * When truncating, round the request down to avoid
2226 		 * crossing a cache line on the final transaction.
2227 		 */
2228 		end = rounddown2(start + MAX_DDP_BUFFER_SIZE, CACHE_LINE_SIZE);
2229 #ifdef VERBOSE_TRACES
2230 		CTR4(KTR_CXGBE, "%s: tid %d, truncating size from %lu to %lu",
2231 		    __func__, toep->tid, (unsigned long)job->uaiocb.aio_nbytes,
2232 		    (unsigned long)(end - (start + pgoff)));
2233 #endif
2234 		job->uaiocb.aio_nbytes = end - (start + pgoff);
2235 		end = round_page(end);
2236 	}
2237 
2238 	n = atop(end - start);
2239 
2240 	/*
2241 	 * Try to reuse a cached pageset.
2242 	 */
2243 	TAILQ_FOREACH(ps, &toep->ddp.cached_pagesets, link) {
2244 		if (pscmp(ps, vm, start, n, pgoff,
2245 		    job->uaiocb.aio_nbytes) == 0) {
2246 			TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
2247 			toep->ddp.cached_count--;
2248 			*pps = ps;
2249 			return (0);
2250 		}
2251 	}
2252 
2253 	/*
2254 	 * If there are too many cached pagesets to create a new one,
2255 	 * free a pageset before creating a new one.
2256 	 */
2257 	KASSERT(toep->ddp.active_count + toep->ddp.cached_count <=
2258 	    nitems(toep->ddp.db), ("%s: too many wired pagesets", __func__));
2259 	if (toep->ddp.active_count + toep->ddp.cached_count ==
2260 	    nitems(toep->ddp.db)) {
2261 		KASSERT(toep->ddp.cached_count > 0,
2262 		    ("no cached pageset to free"));
2263 		ps = TAILQ_LAST(&toep->ddp.cached_pagesets, pagesetq);
2264 		TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
2265 		toep->ddp.cached_count--;
2266 		free_pageset(toep->td, ps);
2267 	}
2268 	DDP_UNLOCK(toep);
2269 
2270 	/* Create a new pageset. */
2271 	ps = malloc(sizeof(*ps) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
2272 	    M_ZERO);
2273 	ps->pages = (vm_page_t *)(ps + 1);
2274 	ps->vm_timestamp = map->timestamp;
2275 	ps->npages = vm_fault_quick_hold_pages(map, start, end - start,
2276 	    VM_PROT_WRITE, ps->pages, n);
2277 
2278 	DDP_LOCK(toep);
2279 	if (ps->npages < 0) {
2280 		free(ps, M_CXGBE);
2281 		return (EFAULT);
2282 	}
2283 
2284 	KASSERT(ps->npages == n, ("hold_aio: page count mismatch: %d vs %d",
2285 	    ps->npages, n));
2286 
2287 	ps->offset = pgoff;
2288 	ps->len = job->uaiocb.aio_nbytes;
2289 	refcount_acquire(&vm->vm_refcnt);
2290 	ps->vm = vm;
2291 	ps->start = start;
2292 
2293 	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
2294 	    __func__, toep->tid, ps, job, ps->npages);
2295 	*pps = ps;
2296 	return (0);
2297 }
2298 
2299 static void
2300 ddp_complete_all(struct toepcb *toep, int error)
2301 {
2302 	struct kaiocb *job;
2303 
2304 	DDP_ASSERT_LOCKED(toep);
2305 	KASSERT((toep->ddp.flags & DDP_AIO) != 0, ("%s: DDP_RCVBUF", __func__));
2306 	while (!TAILQ_EMPTY(&toep->ddp.aiojobq)) {
2307 		job = TAILQ_FIRST(&toep->ddp.aiojobq);
2308 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
2309 		toep->ddp.waiting_count--;
2310 		if (aio_clear_cancel_function(job))
2311 			ddp_complete_one(job, error);
2312 	}
2313 }
2314 
2315 static void
2316 aio_ddp_cancel_one(struct kaiocb *job)
2317 {
2318 	long copied;
2319 
2320 	/*
2321 	 * If this job had copied data out of the socket buffer before
2322 	 * it was cancelled, report it as a short read rather than an
2323 	 * error.
2324 	 */
2325 	copied = job->aio_received;
2326 	if (copied != 0)
2327 		aio_complete(job, copied, 0);
2328 	else
2329 		aio_cancel(job);
2330 }
2331 
2332 /*
2333  * Called when the main loop wants to requeue a job to retry it later.
2334  * Deals with the race of the job being cancelled while it was being
2335  * examined.
2336  */
2337 static void
2338 aio_ddp_requeue_one(struct toepcb *toep, struct kaiocb *job)
2339 {
2340 
2341 	DDP_ASSERT_LOCKED(toep);
2342 	if (!(toep->ddp.flags & DDP_DEAD) &&
2343 	    aio_set_cancel_function(job, t4_aio_cancel_queued)) {
2344 		TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
2345 		toep->ddp.waiting_count++;
2346 	} else
2347 		aio_ddp_cancel_one(job);
2348 }
2349 
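/*
 * Main scheduling loop for AIO DDP.  Completes jobs on socket errors or
 * EOF, copies any data already queued in the socket buffer directly into
 * the job's pages, enables DDP if it isn't on yet, and otherwise programs
 * one of the two hardware DDP buffers with the job's pageset.
 */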
2350 static void
2351 aio_ddp_requeue(struct toepcb *toep)
2352 {
2353 	struct adapter *sc = td_adapter(toep->td);
2354 	struct socket *so;
2355 	struct sockbuf *sb;
2356 	struct inpcb *inp;
2357 	struct kaiocb *job;
2358 	struct ddp_buffer *db;
2359 	size_t copied, offset, resid;
2360 	struct pageset *ps;
2361 	struct mbuf *m;
2362 	uint64_t ddp_flags, ddp_flags_mask;
2363 	struct wrqe *wr;
2364 	int buf_flag, db_idx, error;
2365 
2366 	DDP_ASSERT_LOCKED(toep);
2367 
2368 restart:
2369 	if (toep->ddp.flags & DDP_DEAD) {
2370 		MPASS(toep->ddp.waiting_count == 0);
2371 		MPASS(toep->ddp.active_count == 0);
2372 		return;
2373 	}
2374 
2375 	if (toep->ddp.waiting_count == 0 ||
2376 	    toep->ddp.active_count == nitems(toep->ddp.db)) {
2377 		return;
2378 	}
2379 
2380 	job = TAILQ_FIRST(&toep->ddp.aiojobq);
2381 	so = job->fd_file->f_data;
2382 	sb = &so->so_rcv;
2383 	SOCKBUF_LOCK(sb);
2384 
2385 	/* We will never get anything unless we are or were connected. */
2386 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
2387 		SOCKBUF_UNLOCK(sb);
2388 		ddp_complete_all(toep, ENOTCONN);
2389 		return;
2390 	}
2391 
2392 	KASSERT(toep->ddp.active_count == 0 || sbavail(sb) == 0,
2393 	    ("%s: pending sockbuf data and DDP is active", __func__));
2394 
2395 	/* Abort if socket has reported problems. */
2396 	/* XXX: Wait for any queued DDP's to finish and/or flush them? */
2397 	if (so->so_error && sbavail(sb) == 0) {
2398 		toep->ddp.waiting_count--;
2399 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
2400 		if (!aio_clear_cancel_function(job)) {
2401 			SOCKBUF_UNLOCK(sb);
2402 			goto restart;
2403 		}
2404 
2405 		/*
2406 		 * If this job has previously copied some data, report
2407 		 * a short read and leave the error to be reported by
2408 		 * a future request.
2409 		 */
2410 		copied = job->aio_received;
2411 		if (copied != 0) {
2412 			SOCKBUF_UNLOCK(sb);
2413 			aio_complete(job, copied, 0);
2414 			goto restart;
2415 		}
2416 		error = so->so_error;
2417 		so->so_error = 0;
2418 		SOCKBUF_UNLOCK(sb);
2419 		aio_complete(job, -1, error);
2420 		goto restart;
2421 	}
2422 
2423 	/*
2424 	 * Door is closed.  If there is pending data in the socket buffer,
2425 	 * deliver it.  If there are pending DDP requests, wait for those
2426 	 * to complete.  Once they have completed, return EOF reads.
2427 	 */
2428 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
2429 		SOCKBUF_UNLOCK(sb);
2430 		if (toep->ddp.active_count != 0)
2431 			return;
2432 		ddp_complete_all(toep, 0);
2433 		return;
2434 	}
2435 
2436 	/*
2437 	 * If DDP is not enabled and there is no pending socket buffer
2438 	 * data, try to enable DDP.
2439 	 */
2440 	if (sbavail(sb) == 0 && (toep->ddp.flags & DDP_ON) == 0) {
2441 		SOCKBUF_UNLOCK(sb);
2442 
2443 		/*
2444 		 * Wait for the card to ACK that DDP is enabled before
2445 		 * queueing any buffers.  Currently this waits for an
2446 		 * indicate to arrive.  This could instead use a
2447 		 * TCB_SET_FIELD_RPL message to learn that DDP was enabled,
2448 		 * which would avoid copying the indicate if no data is
2449 		 * pending.
2450 		 *
2451 		 * XXX: Might want to limit the indicate size to the size
2452 		 * of the first queued request.
2453 		 */
2454 		if ((toep->ddp.flags & DDP_SC_REQ) == 0)
2455 			enable_ddp(sc, toep);
2456 		return;
2457 	}
2458 	SOCKBUF_UNLOCK(sb);
2459 
2460 	/*
2461 	 * If another thread is queueing a buffer for DDP, let it
2462 	 * drain any work and return.
2463 	 */
2464 	if (toep->ddp.queueing != NULL)
2465 		return;
2466 
2467 	/* Take the next job to prep it for DDP. */
2468 	toep->ddp.waiting_count--;
2469 	TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
2470 	if (!aio_clear_cancel_function(job))
2471 		goto restart;
2472 	toep->ddp.queueing = job;
2473 
2474 	/* NB: This drops DDP_LOCK while it holds the backing VM pages. */
2475 	error = hold_aio(toep, job, &ps);
2476 	if (error != 0) {
2477 		ddp_complete_one(job, error);
2478 		toep->ddp.queueing = NULL;
2479 		goto restart;
2480 	}
2481 
2482 	SOCKBUF_LOCK(sb);
2483 	if (so->so_error && sbavail(sb) == 0) {
2484 		copied = job->aio_received;
2485 		if (copied != 0) {
2486 			SOCKBUF_UNLOCK(sb);
2487 			recycle_pageset(toep, ps);
2488 			aio_complete(job, copied, 0);
2489 			toep->ddp.queueing = NULL;
2490 			goto restart;
2491 		}
2492 
2493 		error = so->so_error;
2494 		so->so_error = 0;
2495 		SOCKBUF_UNLOCK(sb);
2496 		recycle_pageset(toep, ps);
2497 		aio_complete(job, -1, error);
2498 		toep->ddp.queueing = NULL;
2499 		goto restart;
2500 	}
2501 
2502 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
2503 		SOCKBUF_UNLOCK(sb);
2504 		recycle_pageset(toep, ps);
2505 		if (toep->ddp.active_count != 0) {
2506 			/*
2507 			 * The door is closed, but there are still pending
2508 			 * DDP buffers.  Requeue.  These jobs will all be
2509 			 * completed once those buffers drain.
2510 			 */
2511 			aio_ddp_requeue_one(toep, job);
2512 			toep->ddp.queueing = NULL;
2513 			return;
2514 		}
2515 		ddp_complete_one(job, 0);
2516 		ddp_complete_all(toep, 0);
2517 		toep->ddp.queueing = NULL;
2518 		return;
2519 	}
2520 
2521 sbcopy:
2522 	/*
2523 	 * If the toep is dead, there shouldn't be any data in the socket
2524 	 * buffer, so the above case should have handled this.
2525 	 */
2526 	MPASS(!(toep->ddp.flags & DDP_DEAD));
2527 
2528 	/*
2529 	 * If there is pending data in the socket buffer (either
2530 	 * from before the requests were queued or a DDP indicate),
2531 	 * copy those mbufs out directly.
2532 	 */
2533 	copied = 0;
2534 	offset = ps->offset + job->aio_received;
2535 	MPASS(job->aio_received <= job->uaiocb.aio_nbytes);
2536 	resid = job->uaiocb.aio_nbytes - job->aio_received;
2537 	m = sb->sb_mb;
2538 	KASSERT(m == NULL || toep->ddp.active_count == 0,
2539 	    ("%s: sockbuf data with active DDP", __func__));
2540 	while (m != NULL && resid > 0) {
2541 		struct iovec iov[1];
2542 		struct uio uio;
2543 #ifdef INVARIANTS
2544 		int error;
2545 #endif
2546 
2547 		iov[0].iov_base = mtod(m, void *);
2548 		iov[0].iov_len = m->m_len;
2549 		if (iov[0].iov_len > resid)
2550 			iov[0].iov_len = resid;
2551 		uio.uio_iov = iov;
2552 		uio.uio_iovcnt = 1;
2553 		uio.uio_offset = 0;
2554 		uio.uio_resid = iov[0].iov_len;
2555 		uio.uio_segflg = UIO_SYSSPACE;
2556 		uio.uio_rw = UIO_WRITE;
2557 #ifdef INVARIANTS
2558 		error = uiomove_fromphys(ps->pages, offset + copied,
2559 		    uio.uio_resid, &uio);
2560 #else
2561 		uiomove_fromphys(ps->pages, offset + copied, uio.uio_resid, &uio);
2562 #endif
2563 		MPASS(error == 0 && uio.uio_resid == 0);
2564 		copied += uio.uio_offset;
2565 		resid -= uio.uio_offset;
2566 		m = m->m_next;
2567 	}
2568 	if (copied != 0) {
2569 		sbdrop_locked(sb, copied);
2570 		job->aio_received += copied;
2571 		job->msgrcv = 1;
2572 		copied = job->aio_received;
2573 		inp = sotoinpcb(so);
2574 		if (!INP_TRY_WLOCK(inp)) {
2575 			/*
2576 			 * The reference on the socket file descriptor in
2577 			 * the AIO job should keep 'sb' and 'inp' stable.
2578 			 * Our caller has a reference on the 'toep' that
2579 			 * keeps it stable.
2580 			 */
2581 			SOCKBUF_UNLOCK(sb);
2582 			DDP_UNLOCK(toep);
2583 			INP_WLOCK(inp);
2584 			DDP_LOCK(toep);
2585 			SOCKBUF_LOCK(sb);
2586 
2587 			/*
2588 			 * If the socket has been closed, we should detect
2589 			 * that and complete this request if needed on
2590 			 * the next trip around the loop.
2591 			 */
2592 		}
2593 		t4_rcvd_locked(&toep->td->tod, intotcpcb(inp));
2594 		INP_WUNLOCK(inp);
2595 		if (resid == 0 || toep->ddp.flags & DDP_DEAD) {
2596 			/*
2597 			 * We filled the entire buffer with socket
2598 			 * data, DDP is not being used, or the socket
2599 			 * is being shut down, so complete the
2600 			 * request.
2601 			 */
2602 			SOCKBUF_UNLOCK(sb);
2603 			recycle_pageset(toep, ps);
2604 			aio_complete(job, copied, 0);
2605 			toep->ddp.queueing = NULL;
2606 			goto restart;
2607 		}
2608 
2609 		/*
2610 		 * If DDP is not enabled, requeue this request and restart.
2611 		 * This will either enable DDP or wait for more data to
2612 		 * arrive on the socket buffer.
2613 		 */
2614 		if ((toep->ddp.flags & (DDP_ON | DDP_SC_REQ)) != DDP_ON) {
2615 			SOCKBUF_UNLOCK(sb);
2616 			recycle_pageset(toep, ps);
2617 			aio_ddp_requeue_one(toep, job);
2618 			toep->ddp.queueing = NULL;
2619 			goto restart;
2620 		}
2621 
2622 		/*
2623 		 * An indicate might have arrived and been added to
2624 		 * the socket buffer while it was unlocked after the
2625 		 * copy to lock the INP.  If so, restart the copy.
2626 		 */
2627 		if (sbavail(sb) != 0)
2628 			goto sbcopy;
2629 	}
2630 	SOCKBUF_UNLOCK(sb);
2631 
2632 	if (prep_pageset(sc, toep, ps) == 0) {
2633 		recycle_pageset(toep, ps);
2634 		aio_ddp_requeue_one(toep, job);
2635 		toep->ddp.queueing = NULL;
2636 
2637 		/*
2638 		 * XXX: Need to retry this later.  Mostly need a trigger
2639 		 * when page pods are freed up.
2640 		 */
2641 		printf("%s: prep_pageset failed\n", __func__);
2642 		return;
2643 	}
2644 
2645 	/* Determine which DDP buffer to use. */
2646 	if (toep->ddp.db[0].job == NULL) {
2647 		db_idx = 0;
2648 	} else {
2649 		MPASS(toep->ddp.db[1].job == NULL);
2650 		db_idx = 1;
2651 	}
2652 
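	/*
	 * ddp_flags supplies the new values for the TCB receive-DDP flag
	 * bits selected by ddp_flags_mask; bits present only in the mask
	 * are cleared.
	 */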
2653 	ddp_flags = 0;
2654 	ddp_flags_mask = 0;
2655 	if (db_idx == 0) {
2656 		ddp_flags |= V_TF_DDP_BUF0_VALID(1);
2657 		if (so->so_state & SS_NBIO)
2658 			ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
2659 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
2660 		    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) |
2661 		    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1);
2662 		buf_flag = DDP_BUF0_ACTIVE;
2663 	} else {
2664 		ddp_flags |= V_TF_DDP_BUF1_VALID(1);
2665 		if (so->so_state & SS_NBIO)
2666 			ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
2667 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
2668 		    V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) |
2669 		    V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1);
2670 		buf_flag = DDP_BUF1_ACTIVE;
2671 	}
2672 	MPASS((toep->ddp.flags & buf_flag) == 0);
2673 	if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) {
2674 		MPASS(db_idx == 0);
2675 		MPASS(toep->ddp.active_id == -1);
2676 		MPASS(toep->ddp.active_count == 0);
2677 		ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1);
2678 	}
2679 
2680 	/*
2681 	 * The TID for this connection should still be valid.  If DDP_DEAD
2682 	 * is set, SBS_CANTRCVMORE should be set, so we shouldn't be
2683 	 * this far anyway.  Even if the socket is closing on the other
2684 	 * end, the AIO job holds a reference on this end of the socket
2685 	 * which will keep it open and keep the TCP PCB attached until
2686 	 * after the job is completed.
2687 	 */
2688 	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, &ps->prsv, ps->len,
2689 	    job->aio_received, ddp_flags, ddp_flags_mask);
2690 	if (wr == NULL) {
2691 		recycle_pageset(toep, ps);
2692 		aio_ddp_requeue_one(toep, job);
2693 		toep->ddp.queueing = NULL;
2694 
2695 		/*
2696 		 * XXX: Need a way to kick a retry here.
2697 		 *
2698 		 * XXX: We know the fixed size needed and could
2699 		 * preallocate this using a blocking request at the
2700 		 * start of the task to avoid having to handle this
2701 		 * edge case.
2702 		 */
2703 		printf("%s: mk_update_tcb_for_ddp failed\n", __func__);
2704 		return;
2705 	}
2706 
2707 	if (!aio_set_cancel_function(job, t4_aio_cancel_active)) {
2708 		free_wrqe(wr);
2709 		recycle_pageset(toep, ps);
2710 		aio_ddp_cancel_one(job);
2711 		toep->ddp.queueing = NULL;
2712 		goto restart;
2713 	}
2714 
2715 #ifdef VERBOSE_TRACES
2716 	CTR6(KTR_CXGBE,
2717 	    "%s: tid %u, scheduling %p for DDP[%d] (flags %#lx/%#lx)", __func__,
2718 	    toep->tid, job, db_idx, ddp_flags, ddp_flags_mask);
2719 #endif
2720 	/* Give the chip the go-ahead. */
2721 	t4_wrq_tx(sc, wr);
2722 	db = &toep->ddp.db[db_idx];
2723 	db->cancel_pending = 0;
2724 	db->job = job;
2725 	db->ps = ps;
2726 	toep->ddp.queueing = NULL;
2727 	toep->ddp.flags |= buf_flag;
2728 	toep->ddp.active_count++;
2729 	if (toep->ddp.active_count == 1) {
2730 		MPASS(toep->ddp.active_id == -1);
2731 		toep->ddp.active_id = db_idx;
2732 		CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
2733 		    toep->ddp.active_id);
2734 	}
2735 	goto restart;
2736 }
2737 
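/*
 * Schedule the connection's requeue task (AIO or rcvbuf flavor) unless it
 * is already pending.  The task holds its own toepcb reference, which it
 * drops when it completes.
 */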
2738 void
2739 ddp_queue_toep(struct toepcb *toep)
2740 {
2741 
2742 	DDP_ASSERT_LOCKED(toep);
2743 	if (toep->ddp.flags & DDP_TASK_ACTIVE)
2744 		return;
2745 	toep->ddp.flags |= DDP_TASK_ACTIVE;
2746 	hold_toepcb(toep);
2747 	soaio_enqueue(&toep->ddp.requeue_task);
2748 }
2749 
2750 static void
2751 aio_ddp_requeue_task(void *context, int pending)
2752 {
2753 	struct toepcb *toep = context;
2754 
2755 	DDP_LOCK(toep);
2756 	aio_ddp_requeue(toep);
2757 	toep->ddp.flags &= ~DDP_TASK_ACTIVE;
2758 	DDP_UNLOCK(toep);
2759 
2760 	free_toepcb(toep);
2761 }
2762 
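/*
 * Cancel an AIO job that is currently programmed into a hardware DDP
 * buffer.  The buffer's valid flag is cleared with a TCB update; the job
 * is cancelled or partially completed once the card acknowledges the
 * invalidation.
 */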
2763 static void
2764 t4_aio_cancel_active(struct kaiocb *job)
2765 {
2766 	struct socket *so = job->fd_file->f_data;
2767 	struct tcpcb *tp = sototcpcb(so);
2768 	struct toepcb *toep = tp->t_toe;
2769 	struct adapter *sc = td_adapter(toep->td);
2770 	uint64_t valid_flag;
2771 	int i;
2772 
2773 	DDP_LOCK(toep);
2774 	if (aio_cancel_cleared(job)) {
2775 		DDP_UNLOCK(toep);
2776 		aio_ddp_cancel_one(job);
2777 		return;
2778 	}
2779 
2780 	for (i = 0; i < nitems(toep->ddp.db); i++) {
2781 		if (toep->ddp.db[i].job == job) {
2782 			/* Should only ever get one cancel request for a job. */
2783 			MPASS(toep->ddp.db[i].cancel_pending == 0);
2784 
2785 			/*
2786 			 * Invalidate this buffer.  It will be
2787 			 * cancelled or partially completed once the
2788 			 * card ACKs the invalidate.
2789 			 */
2790 			valid_flag = i == 0 ? V_TF_DDP_BUF0_VALID(1) :
2791 			    V_TF_DDP_BUF1_VALID(1);
2792 			t4_set_tcb_field(sc, toep->ctrlq, toep,
2793 			    W_TCB_RX_DDP_FLAGS, valid_flag, 0, 1,
2794 			    CPL_COOKIE_DDP0 + i);
2795 			toep->ddp.db[i].cancel_pending = 1;
2796 			CTR2(KTR_CXGBE, "%s: request %p marked pending",
2797 			    __func__, job);
2798 			break;
2799 		}
2800 	}
2801 	DDP_UNLOCK(toep);
2802 }
2803 
2804 static void
2805 t4_aio_cancel_queued(struct kaiocb *job)
2806 {
2807 	struct socket *so = job->fd_file->f_data;
2808 	struct tcpcb *tp = sototcpcb(so);
2809 	struct toepcb *toep = tp->t_toe;
2810 
2811 	DDP_LOCK(toep);
2812 	if (!aio_cancel_cleared(job)) {
2813 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
2814 		toep->ddp.waiting_count--;
2815 		if (toep->ddp.waiting_count == 0)
2816 			ddp_queue_toep(toep);
2817 	}
2818 	CTR2(KTR_CXGBE, "%s: request %p cancelled", __func__, job);
2819 	DDP_UNLOCK(toep);
2820 
2821 	aio_ddp_cancel_one(job);
2822 }
2823 
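/*
 * Queue an AIO read for DDP on a TOE socket.  Writes are rejected, as
 * are connections already using DDP for normal receive.  The first job
 * on a connection initializes the AIO DDP state; the job is then queued
 * and one synchronous pass of aio_ddp_requeue() is attempted.
 */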
2824 int
2825 t4_aio_queue_ddp(struct socket *so, struct kaiocb *job)
2826 {
2827 	struct inpcb *inp = sotoinpcb(so);
2828 	struct tcpcb *tp = intotcpcb(inp);
2829 	struct toepcb *toep = tp->t_toe;
2830 
2831 	/* Ignore writes. */
2832 	if (job->uaiocb.aio_lio_opcode != LIO_READ)
2833 		return (EOPNOTSUPP);
2834 
2835 	INP_WLOCK(inp);
2836 	if (__predict_false(ulp_mode(toep) == ULP_MODE_NONE)) {
2837 		if (!set_ddp_ulp_mode(toep)) {
2838 			INP_WUNLOCK(inp);
2839 			return (EOPNOTSUPP);
2840 		}
2841 	}
2842 	INP_WUNLOCK(inp);
2843 
2844 	DDP_LOCK(toep);
2845 
2846 	/*
2847 	 * If DDP is being used for all normal receive, don't use it
2848 	 * for AIO.
2849 	 */
2850 	if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
2851 		DDP_UNLOCK(toep);
2852 		return (EOPNOTSUPP);
2853 	}
2854 
2855 	/*
2856 	 * XXX: Think about possibly returning errors for ENOTCONN,
2857 	 * etc.  Perhaps the caller would only queue the request
2858 	 * if it failed with EOPNOTSUPP?
2859 	 */
2860 
2861 	if ((toep->ddp.flags & DDP_AIO) == 0) {
2862 		toep->ddp.flags |= DDP_AIO;
2863 		TAILQ_INIT(&toep->ddp.cached_pagesets);
2864 		TAILQ_INIT(&toep->ddp.aiojobq);
2865 		TASK_INIT(&toep->ddp.requeue_task, 0, aio_ddp_requeue_task,
2866 		    toep);
2867 	}
2868 
2869 #ifdef VERBOSE_TRACES
2870 	CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
2871 #endif
2872 	if (!aio_set_cancel_function(job, t4_aio_cancel_queued))
2873 		panic("new job was cancelled");
2874 	TAILQ_INSERT_TAIL(&toep->ddp.aiojobq, job, list);
2875 	toep->ddp.waiting_count++;
2876 
2877 	/*
2878 	 * Try to handle this request synchronously.  If this has
2879 	 * to block because the task is running, it will just bail
2880 	 * and let the task handle it instead.
2881 	 */
2882 	aio_ddp_requeue(toep);
2883 	DDP_UNLOCK(toep);
2884 	return (0);
2885 }
2886 
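/*
 * Keep both hardware DDP buffers populated in DDP_RCVBUF mode.  A cached
 * receive buffer is reused if available; otherwise a new one is allocated
 * with the DDP lock dropped (the allocation may sleep).  Buffers are
 * handed to the hardware until both slots are active or the connection is
 * dead, closed, or reporting errors.
 */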
2887 static void
2888 ddp_rcvbuf_requeue(struct toepcb *toep)
2889 {
2890 	struct socket *so;
2891 	struct sockbuf *sb;
2892 	struct inpcb *inp;
2893 	struct ddp_rcv_buffer *drb;
2894 
2895 	DDP_ASSERT_LOCKED(toep);
2896 restart:
2897 	if ((toep->ddp.flags & DDP_DEAD) != 0) {
2898 		MPASS(toep->ddp.active_count == 0);
2899 		return;
2900 	}
2901 
2902 	/* If both buffers are active, nothing to do. */
2903 	if (toep->ddp.active_count == nitems(toep->ddp.db)) {
2904 		return;
2905 	}
2906 
2907 	inp = toep->inp;
2908 	so = inp->inp_socket;
2909 	sb = &so->so_rcv;
2910 
2911 	drb = alloc_cached_ddp_rcv_buffer(toep);
2912 	DDP_UNLOCK(toep);
2913 
2914 	if (drb == NULL) {
2915 		drb = alloc_ddp_rcv_buffer(toep, M_WAITOK);
2916 		if (drb == NULL) {
2917 			printf("%s: failed to allocate buffer\n", __func__);
2918 			DDP_LOCK(toep);
2919 			return;
2920 		}
2921 	}
2922 
2923 	DDP_LOCK(toep);
2924 	if ((toep->ddp.flags & DDP_DEAD) != 0 ||
2925 	    toep->ddp.active_count == nitems(toep->ddp.db)) {
2926 		recycle_ddp_rcv_buffer(toep, drb);
2927 		return;
2928 	}
2929 
2930 	/* We will never get anything unless we are or were connected. */
2931 	SOCKBUF_LOCK(sb);
2932 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
2933 		SOCKBUF_UNLOCK(sb);
2934 		recycle_ddp_rcv_buffer(toep, drb);
2935 		return;
2936 	}
2937 
2938 	/* Abort if socket has reported problems or is closed. */
2939 	if (so->so_error != 0 || (sb->sb_state & SBS_CANTRCVMORE) != 0) {
2940 		SOCKBUF_UNLOCK(sb);
2941 		recycle_ddp_rcv_buffer(toep, drb);
2942 		return;
2943 	}
2944 	SOCKBUF_UNLOCK(sb);
2945 
2946 	if (!queue_ddp_rcvbuf(toep, drb)) {
2947 		/*
2948 		 * XXX: Need a way to kick a retry here.
2949 		 *
2950 		 * XXX: We know the fixed size needed and could
2951 		 * preallocate the work request using a blocking
2952 		 * request at the start of the task to avoid having to
2953 		 * handle this edge case.
2954 		 */
2955 		return;
2956 	}
2957 	goto restart;
2958 }
2959 
2960 static void
2961 ddp_rcvbuf_requeue_task(void *context, int pending)
2962 {
2963 	struct toepcb *toep = context;
2964 
2965 	DDP_LOCK(toep);
2966 	ddp_rcvbuf_requeue(toep);
2967 	toep->ddp.flags &= ~DDP_TASK_ACTIVE;
2968 	DDP_UNLOCK(toep);
2969 
2970 	free_toepcb(toep);
2971 }
2972 
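/*
 * Switch a connection to using DDP for normal socket receive.  Fails if
 * the connection cannot use the DDP ULP mode, if AIO DDP is already in
 * use, or if receive DDP is already enabled.
 */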
2973 int
2974 t4_enable_ddp_rcv(struct socket *so, struct toepcb *toep)
2975 {
2976 	struct inpcb *inp = sotoinpcb(so);
2977 	struct adapter *sc = td_adapter(toep->td);
2978 
2979 	INP_WLOCK(inp);
2980 	switch (ulp_mode(toep)) {
2981 	case ULP_MODE_TCPDDP:
2982 		break;
2983 	case ULP_MODE_NONE:
2984 		if (set_ddp_ulp_mode(toep))
2985 			break;
2986 		/* FALLTHROUGH */
2987 	default:
2988 		INP_WUNLOCK(inp);
2989 		return (EOPNOTSUPP);
2990 	}
2991 	INP_WUNLOCK(inp);
2992 
2993 	DDP_LOCK(toep);
2994 
2995 	/*
2996 	 * If DDP is being used for AIO already, don't use it for
2997 	 * normal receive.
2998 	 */
2999 	if ((toep->ddp.flags & DDP_AIO) != 0) {
3000 		DDP_UNLOCK(toep);
3001 		return (EOPNOTSUPP);
3002 	}
3003 
3004 	if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
3005 		DDP_UNLOCK(toep);
3006 		return (EBUSY);
3007 	}
3008 
3009 	toep->ddp.flags |= DDP_RCVBUF;
3010 	TAILQ_INIT(&toep->ddp.cached_buffers);
3011 	enable_ddp(sc, toep);
3012 	TASK_INIT(&toep->ddp.requeue_task, 0, ddp_rcvbuf_requeue_task, toep);
3013 	ddp_queue_toep(toep);
3014 	DDP_UNLOCK(toep);
3015 	return (0);
3016 }
3017 
3018 void
3019 t4_ddp_mod_load(void)
3020 {
3021 	if (t4_ddp_rcvbuf_len < PAGE_SIZE)
3022 		t4_ddp_rcvbuf_len = PAGE_SIZE;
3023 	if (t4_ddp_rcvbuf_len > MAX_DDP_BUFFER_SIZE)
3024 		t4_ddp_rcvbuf_len = MAX_DDP_BUFFER_SIZE;
3025 	if (!powerof2(t4_ddp_rcvbuf_len))
3026 		t4_ddp_rcvbuf_len = 1 << fls(t4_ddp_rcvbuf_len);
3027 
3028 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl,
3029 	    CPL_COOKIE_DDP0);
3030 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl,
3031 	    CPL_COOKIE_DDP1);
3032 	t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
3033 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
3034 	TAILQ_INIT(&ddp_orphan_pagesets);
3035 	mtx_init(&ddp_orphan_pagesets_lock, "ddp orphans", NULL, MTX_DEF);
3036 	TASK_INIT(&ddp_orphan_task, 0, ddp_free_orphan_pagesets, NULL);
3037 }
3038 
3039 void
3040 t4_ddp_mod_unload(void)
3041 {
3042 
3043 	taskqueue_drain(taskqueue_thread, &ddp_orphan_task);
3044 	MPASS(TAILQ_EMPTY(&ddp_orphan_pagesets));
3045 	mtx_destroy(&ddp_orphan_pagesets_lock);
3046 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP0);
3047 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP1);
3048 	t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL);
3049 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL);
3050 }
3051 #endif
3052