/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/rds.h>

#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/ib.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

static struct kmem_cache *rdsv3_ib_incoming_slab;
static atomic_t	rdsv3_ib_allocation = ATOMIC_INIT(0);

void
rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic)
{
	struct rdsv3_ib_recv_work *recv;
	struct rdsv3_header *hdrp;
	uint32_t i;

	RDSV3_DPRINTF4("rdsv3_ib_recv_init_ring", "ic: %p", ic);

	hdrp = ic->i_recv_hdrs;
	for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
		recv->r_ibinc = NULL;
		recv->r_frag = NULL;

		/* initialize the hdr sgl permanently */
		recv->r_sge[0].ds_va = (ib_vaddr_t)(uintptr_t)hdrp++;
		recv->r_sge[0].ds_len = sizeof (struct rdsv3_header);
		recv->r_sge[0].ds_key = ic->i_mr->lkey;
	}
}

static void
rdsv3_ib_recv_clear_one(struct rdsv3_ib_connection *ic,
    struct rdsv3_ib_recv_work *recv)
{
	RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "ic: %p, recv: %p",
	    ic, recv);

	if (recv->r_ibinc) {
		rdsv3_inc_put(&recv->r_ibinc->ii_inc);
		recv->r_ibinc = NULL;
	}

	if (recv->r_frag) {
		kmem_cache_free(ic->rds_ibdev->ib_frag_slab, recv->r_frag);
		recv->r_frag = NULL;
	}

	RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "Return: ic: %p, recv: %p",
	    ic, recv);
}

void
rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic)
{
	uint32_t i;

	RDSV3_DPRINTF4("rdsv3_ib_recv_clear_ring", "ic: %p", ic);

	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
		rdsv3_ib_recv_clear_one(ic, &ic->i_recvs[i]);
}

extern int atomic_add_unless(atomic_t *, uint_t, ulong_t);

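/*
 * Make sure a single ring entry has everything it needs before it is
 * posted: an rdsv3_ib_incoming to queue the message onto a socket, and
 * a data fragment to receive into.  The header SGE was set up once in
 * rdsv3_ib_recv_init_ring(); only the data SGE is refreshed here.  The
 * number of incomings is capped by rdsv3_ib_sysctl_max_recv_allocation.
 */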
static int
rdsv3_ib_recv_refill_one(struct rdsv3_connection *conn,
    struct rdsv3_ib_recv_work *recv)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	ibt_mi_hdl_t mi_hdl;
	ibt_iov_attr_t iov_attr;
	ibt_iov_t iov_arr[1];

	RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "conn: %p, recv: %p",
	    conn, recv);

	if (!recv->r_ibinc) {
		if (!atomic_add_unless(&rdsv3_ib_allocation, 1,
		    rdsv3_ib_sysctl_max_recv_allocation)) {
			rdsv3_ib_stats_inc(s_ib_rx_alloc_limit);
			goto out;
		}
		recv->r_ibinc = kmem_cache_alloc(rdsv3_ib_incoming_slab,
		    KM_NOSLEEP);
		if (recv->r_ibinc == NULL) {
			atomic_add_32(&rdsv3_ib_allocation, -1);
			goto out;
		}
		rdsv3_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
		recv->r_ibinc->ii_ibdev = ic->rds_ibdev;
		recv->r_ibinc->ii_pool = ic->rds_ibdev->inc_pool;
	}

	if (!recv->r_frag) {
		recv->r_frag = kmem_cache_alloc(ic->rds_ibdev->ib_frag_slab,
		    KM_NOSLEEP);
		if (!recv->r_frag)
			goto out;
	}

	/* Data sge, structure copy */
	recv->r_sge[1] = recv->r_frag->f_sge;

	RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "Return: conn: %p, recv: %p",
	    conn, recv);

	return (0);
out:
	if (recv->r_ibinc) {
		kmem_cache_free(rdsv3_ib_incoming_slab, recv->r_ibinc);
		atomic_add_32(&rdsv3_ib_allocation, -1);
		recv->r_ibinc = NULL;
	}
	return (-ENOMEM);
}

/*
 * This tries to allocate and post unused work requests after making sure that
 * they have all the allocations they need to queue received fragments into
 * sockets.  Refills must be serialized by the caller so that ring_alloc and
 * _unalloc pairs don't go unmatched.
 *
 * A nonzero value is returned if the allocations or the post fail.
 */
int
rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int prefill)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_recv_work *recv;
	unsigned int posted = 0;
	int ret = 0, avail;
	uint32_t pos, i;

	RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "conn: %p, prefill: %d",
	    conn, prefill);

	if (prefill || rdsv3_conn_up(conn)) {
		uint_t w_nr = ic->i_recv_ring.w_nr;

		avail = rdsv3_ib_ring_alloc(&ic->i_recv_ring, w_nr, &pos);
		if ((avail <= 0) || (pos >= w_nr)) {
			RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
			    "Argh - ring alloc returned pos=%u, avail: %d",
			    pos, avail);
			return (-EINVAL);
		}

		/* populate the WRs */
		for (i = 0; i < avail; i++) {
			recv = &ic->i_recvs[pos];
			ret = rdsv3_ib_recv_refill_one(conn, recv);
			if (ret) {
				rdsv3_ib_ring_unalloc(&ic->i_recv_ring,
				    avail - i);
				break;
			}
			ic->i_recv_wrs[i].wr_id = (ibt_wrid_t)pos;
			ic->i_recv_wrs[i].wr_nds = RDSV3_IB_RECV_SGE;
			ic->i_recv_wrs[i].wr_sgl = &recv->r_sge[0];

			pos = (pos + 1) % w_nr;
		}

		if (i) {
			/* post the WRs at one shot */
			ret = ibt_post_recv(ib_get_ibt_channel_hdl(ic->i_cm_id),
			    &ic->i_recv_wrs[0], i, &posted);
			RDSV3_DPRINTF3("rdsv3_ib_recv_refill",
			    "attempted: %d posted: %d WRs ret %d",
			    i, posted, ret);
			if (ret) {
				RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
				    "recv post on %u.%u.%u.%u returned %d, "
				    "disconnecting and reconnecting\n",
				    NIPQUAD(conn->c_faddr), ret);
				rdsv3_ib_ring_unalloc(&ic->i_recv_ring,
				    i - posted);
				rdsv3_conn_drop(conn);
			}
		}
	}

	/* We're doing flow control - update the window. */
	if (ic->i_flowctl && posted)
		rdsv3_ib_advertise_credits(conn, posted);

	RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "Return: conn: %p, posted: %d",
	    conn, posted);
	return (ret);
}

/*
 * Pool of incomings whose freeing has been deferred.
 */
struct rdsv3_inc_pool {
	list_t			f_list;	/* list of incomings to be freed */
	kmutex_t		f_lock; /* protects f_list and f_listcnt */
	int32_t			f_listcnt;
};

void
rdsv3_ib_destroy_inc_pool(struct rdsv3_ib_device *rds_ibdev)
{
	struct rdsv3_inc_pool *pool = rds_ibdev->inc_pool;

	if (pool) {
		list_destroy(&pool->f_list);
		kmem_free((void *) pool, sizeof (*pool));
	}
}

int
rdsv3_ib_create_inc_pool(struct rdsv3_ib_device *rds_ibdev)
{
	struct rdsv3_inc_pool *pool;

	pool = (struct rdsv3_inc_pool *)kmem_zalloc(sizeof (*pool), KM_NOSLEEP);
	if (pool == NULL) {
		return (-ENOMEM);
	}
	list_create(&pool->f_list, sizeof (struct rdsv3_ib_incoming),
	    offsetof(struct rdsv3_ib_incoming, ii_obj));
	mutex_init(&pool->f_lock, NULL, MUTEX_DRIVER, NULL);
	rds_ibdev->inc_pool = pool;
	return (0);
}

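/*
 * Final teardown of an incoming: free each fragment back to the device's
 * fragment slab, free the incoming itself, and drop the global allocation
 * count taken in rdsv3_ib_recv_refill_one().
 */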
static void
rdsv3_ib_inc_drop(struct rdsv3_ib_incoming *ibinc)
{
	struct rdsv3_page_frag *frag;
	struct rdsv3_page_frag *pos;

	RDSV3_FOR_EACH_LIST_NODE_SAFE(frag, pos, &ibinc->ii_frags, f_item) {
		list_remove_node(&frag->f_item);
		kmem_cache_free(ibinc->ii_ibdev->ib_frag_slab, frag);
	}

	ASSERT(list_is_empty(&ibinc->ii_frags));
	kmem_cache_free(rdsv3_ib_incoming_slab, ibinc);
	atomic_dec_uint(&rdsv3_ib_allocation);
}

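/*
 * Drain the per-device pool of incomings whose freeing was deferred by
 * rdsv3_ib_inc_free(), releasing each one with rdsv3_ib_inc_drop().
 */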
void
rdsv3_ib_drain_inclist(void *data)
{
	struct rdsv3_inc_pool *pool = (struct rdsv3_inc_pool *)data;
	struct rdsv3_ib_incoming *ibinc;
	list_t *listp = &pool->f_list;
	kmutex_t *lockp = &pool->f_lock;
	int i = 0;

	for (;;) {
		mutex_enter(lockp);
		ibinc = (struct rdsv3_ib_incoming *)list_remove_head(listp);
		if (ibinc)
			pool->f_listcnt--;
		mutex_exit(lockp);
		if (!ibinc)
			break;
		i++;
		rdsv3_ib_inc_drop(ibinc);
	}
}

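/*
 * Freeing an incoming is deferred: queue it on the device's inc pool
 * and kick the worker thread rather than freeing it in this context.
 */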
void
rdsv3_ib_inc_free(struct rdsv3_incoming *inc)
{
	struct rdsv3_ib_incoming *ibinc;
	rdsv3_af_thr_t *af_thr;

	RDSV3_DPRINTF4("rdsv3_ib_inc_free", "inc: %p", inc);

	ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc);
	/* save af_thr in a local as ib_inc might be freed at mutex_exit */
	af_thr = ibinc->ii_ibdev->inc_soft_cq;

	mutex_enter(&ibinc->ii_pool->f_lock);
	list_insert_tail(&ibinc->ii_pool->f_list, ibinc);
	ibinc->ii_pool->f_listcnt++;
	mutex_exit(&ibinc->ii_pool->f_lock);

	rdsv3_af_thr_fire(af_thr);
}

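/*
 * Copy a received message out to the user's uio, walking the fragment
 * list and stopping at min(size, message length).  Returns the number
 * of bytes copied.
 */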
int
rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop,
    size_t size)
{
	struct rdsv3_ib_incoming *ibinc;
	struct rdsv3_page_frag *frag;
	unsigned long to_copy;
	unsigned long frag_off = 0;
	int copied = 0;
	int ret;
	uint32_t len;

	ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc);
	frag = list_head(&ibinc->ii_frags);
	len = ntohl(inc->i_hdr.h_len);

	RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user", "inc: %p, size: %d len: %d",
	    inc, size, len);

	while (copied < size && copied < len) {
		if (frag_off == RDSV3_FRAG_SIZE) {
			frag = list_next(&ibinc->ii_frags, frag);
			frag_off = 0;
		}

		to_copy = min(len - copied, RDSV3_FRAG_SIZE - frag_off);
		to_copy = min(size - copied, to_copy);

		RDSV3_DPRINTF5("rdsv3_ib_inc_copy_to_user",
		    "%lu bytes to user %p from frag [%p, %u] + %lu",
		    to_copy, uiop,
		    frag->f_page, frag->f_offset, frag_off);

		ret = uiomove((caddr_t)(frag->f_page +
		    frag->f_offset + frag_off),
		    to_copy, UIO_READ, uiop);
		if (ret) {
			RDSV3_DPRINTF2("rdsv3_ib_inc_copy_to_user",
			    "uiomove (%d) returned: %d", to_copy, ret);
			break;
		}

		frag_off += to_copy;
		copied += to_copy;
	}

	RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user",
	    "Return: inc: %p, copied: %d", inc, copied);

	return (copied);
}

/* ic starts out kmem_zalloc()ed */
void
rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic)
{
	ibt_send_wr_t *wr = &ic->i_ack_wr;
	ibt_wr_ds_t *sge = &ic->i_ack_sge;

	RDSV3_DPRINTF4("rdsv3_ib_recv_init_ack", "ic: %p", ic);

	sge->ds_va = ic->i_ack_dma;
	sge->ds_len = sizeof (struct rdsv3_header);
	sge->ds_key = ic->i_mr->lkey;

	wr->wr_sgl = sge;
	wr->wr_nds = 1;
	wr->wr_opcode = IBT_WRC_SEND;
	wr->wr_id = RDSV3_IB_ACK_WR_ID;
	wr->wr_flags = IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
}

/*
 * You'd think that with reliable IB connections you wouldn't need to ack
 * messages that have been received.  The problem is that IB hardware generates
 * an ack message before it has DMAed the message into memory.  This opens a
 * window for message loss if the HCA is disabled for any reason between when
 * it sends the ack and when the message is DMAed and processed.  This is only
 * a potential issue if another HCA is available for fail-over.
 *
 * When the remote host receives our ack they'll free the sent message from
 * their send queue.  To decrease the latency of this we always send an ack
 * immediately after we've received messages.
 *
 * For simplicity, we only have one ack in flight at a time.  This puts
 * pressure on senders to have deep enough send queues to absorb the latency of
 * a single ack frame being in flight.  This might not be good enough.
 *
 * This is implemented by having a long-lived send_wr and sge which point to a
 * statically allocated ack frame.  This ack wr does not fall under the ring
 * accounting that the tx and rx wrs do.  The QP attribute specifically makes
 * room for it beyond the ring size.  Send completion notices its special
 * wr_id and avoids working with the ring in that case.
 */
void
rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq,
    int ack_required)
{
	RDSV3_DPRINTF4("rdsv3_ib_set_ack", "ic: %p, seq: %lld ack: %d",
	    ic, seq, ack_required);

	mutex_enter(&ic->i_ack_lock);
	ic->i_ack_next = seq;
	if (ack_required)
		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
	mutex_exit(&ic->i_ack_lock);
}

static uint64_t
rdsv3_ib_get_ack(struct rdsv3_ib_connection *ic)
{
	uint64_t seq;

	RDSV3_DPRINTF4("rdsv3_ib_get_ack", "ic: %p", ic);

	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);

	mutex_enter(&ic->i_ack_lock);
	seq = ic->i_ack_next;
	mutex_exit(&ic->i_ack_lock);

	return (seq);
}

static void
rdsv3_ib_send_ack(struct rdsv3_ib_connection *ic, unsigned int adv_credits)
{
	struct rdsv3_header *hdr = ic->i_ack;
	uint64_t seq;
	int ret;

	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "ic: %p adv_credits: %d",
	    ic, adv_credits);

	seq = rdsv3_ib_get_ack(ic);

	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "send_ack: ic %p ack %llu",
	    ic, (unsigned long long) seq);
	rdsv3_message_populate_header(hdr, 0, 0, 0);
	hdr->h_ack = htonll(seq);
	hdr->h_credit = adv_credits;
	rdsv3_message_make_checksum(hdr);
	ic->i_ack_queued = jiffies;

	ret = ibt_post_send(RDSV3_QP2CHANHDL(ic->i_cm_id->qp), &ic->i_ack_wr, 1,
	    NULL);
	if (ret) {
		/*
		 * Failed to send. Release the WR, and
		 * force another ACK.
		 */
		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
		rdsv3_ib_stats_inc(s_ib_ack_send_failure);
		RDSV3_DPRINTF2("rdsv3_ib_send_ack", "sending ack failed\n");
		rdsv3_conn_drop(ic->conn);
	} else {
		rdsv3_ib_stats_inc(s_ib_ack_sent);
	}
	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "Return: ic: %p adv_credits: %d",
	    ic, adv_credits);
}

/*
 * There are 3 ways of getting acknowledgements to the peer:
 *  1.	We call rdsv3_ib_attempt_ack from the recv completion handler
 *	to send an ACK-only frame.
 *	However, there can be only one such frame in the send queue
 *	at any time, so we may have to postpone it.
 *  2.	When another (data) packet is transmitted while there's
 *	an ACK in the queue, we piggyback the ACK sequence number
 *	on the data packet.
 *  3.	If the ACK WR is done sending, we get called from the
 *	send queue completion handler, and check whether there's
 *	another ACK pending (postponed because the WR was on the
 *	queue). If so, we transmit it.
 *
 * We maintain 2 variables:
 *  -	i_ack_flags, which keeps track of whether the ACK WR
 *	is currently in the send queue or not (IB_ACK_IN_FLIGHT)
 *  -	i_ack_next, which is the last sequence number we received
 *
 * Potentially, send queue and receive queue handlers can run concurrently.
 * It would be nice to not have to use a spinlock to synchronize things,
 * but the one problem that rules this out is that 64bit updates are
 * not atomic on all platforms. Things would be a lot simpler if
 * we had atomic64 or maybe cmpxchg64 everywhere.
 *
 * Reconnecting complicates this picture just slightly. When we
 * reconnect, we may be seeing duplicate packets. The peer
 * is retransmitting them, because it hasn't seen an ACK for
 * them. It is important that we ACK these.
 *
 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
 * this flag set *MUST* be acknowledged immediately.
 */

/*
 * When we get here, we're called from the recv queue handler.
 * Check whether we ought to transmit an ACK.
 */
void
rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic)
{
	unsigned int adv_credits;

	RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "ic: %p", ic);

	if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
		return;

	if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
		rdsv3_ib_stats_inc(s_ib_ack_send_delayed);
		return;
	}

	/* Can we get a send credit? */
	if (!rdsv3_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
		rdsv3_ib_stats_inc(s_ib_tx_throttle);
		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
		return;
	}

	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
	rdsv3_ib_send_ack(ic, adv_credits);

	RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "Return: ic: %p", ic);
}

/*
 * We get here from the send completion handler, when the
 * adapter tells us the ACK frame was sent.
 */
void
rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic)
{
	RDSV3_DPRINTF4("rdsv3_ib_ack_send_complete", "ic: %p", ic);
	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
	rdsv3_ib_attempt_ack(ic);
}

/*
 * This is called by the regular xmit code when it wants to piggyback
 * an ACK on an outgoing frame.
 */
uint64_t
rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic)
{
	RDSV3_DPRINTF4("rdsv3_ib_piggyb_ack", "ic: %p", ic);
	if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) {
		rdsv3_ib_stats_inc(s_ib_ack_send_piggybacked);
	}
	return (rdsv3_ib_get_ack(ic));
}

/*
 * It's kind of lame that we're copying from the posted receive pages into
 * long-lived bitmaps.  We could have posted the bitmaps and rdma written into
 * them.  But receiving new congestion bitmaps should be a *rare* event, so
 * hopefully we won't need to invest that complexity in making it more
 * efficient.  By copying we can share a simpler core with TCP which has to
 * copy.
 */
static void
rdsv3_ib_cong_recv(struct rdsv3_connection *conn,
    struct rdsv3_ib_incoming *ibinc)
{
	struct rdsv3_cong_map *map;
	unsigned int map_off;
	unsigned int map_page;
	struct rdsv3_page_frag *frag;
	unsigned long frag_off;
	unsigned long to_copy;
	unsigned long copied;
	uint64_t uncongested = 0;
	caddr_t addr;

	RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "conn: %p, ibinc: %p",
	    conn, ibinc);

	/* catch completely corrupt packets */
	if (ntohl(ibinc->ii_inc.i_hdr.h_len) != RDSV3_CONG_MAP_BYTES)
		return;

	map = conn->c_fcong;
	map_page = 0;
	map_off = 0;

	frag = list_head(&ibinc->ii_frags);
	frag_off = 0;

	copied = 0;

	while (copied < RDSV3_CONG_MAP_BYTES) {
		uint64_t *src, *dst;
		unsigned int k;

		to_copy = min(RDSV3_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
		ASSERT(!(to_copy & 7)); /* Must be 64bit aligned. */

		addr = frag->f_page + frag->f_offset;

		src = (uint64_t *)(addr + frag_off);
		dst = (uint64_t *)(map->m_page_addrs[map_page] + map_off);
		RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
		    "src: %p dst: %p copied: %d", src, dst, copied);
		for (k = 0; k < to_copy; k += 8) {
			/*
			 * Record ports that became uncongested, ie
			 * bits that changed from 0 to 1.
			 */
			uncongested |= ~(*src) & *dst;
			*dst++ = *src++;
		}

		copied += to_copy;
		RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
		    "src: %p dst: %p copied: %d", src, dst, copied);

		map_off += to_copy;
		if (map_off == PAGE_SIZE) {
			map_off = 0;
			map_page++;
		}

		frag_off += to_copy;
		if (frag_off == RDSV3_FRAG_SIZE) {
			frag = list_next(&ibinc->ii_frags, frag);
			frag_off = 0;
		}
	}

#if 0
XXX
	/* the congestion map is in little endian order */
	uncongested = le64_to_cpu(uncongested);
#endif

	rdsv3_cong_map_updated(map, uncongested);

	RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "Return: conn: %p, ibinc: %p",
	    conn, ibinc);
}

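/*
 * Process one completed receive: validate the header, note the ACK and
 * credit information it carries, then chain the data fragment onto the
 * connection's current incoming.  When the last fragment of a message
 * arrives, hand it to rdsv3_ib_cong_recv() (congestion bitmaps) or
 * rdsv3_recv_incoming() (everything else).
 */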
static void
rdsv3_ib_process_recv(struct rdsv3_connection *conn,
    struct rdsv3_ib_recv_work *recv, uint32_t data_len,
    struct rdsv3_ib_ack_state *state)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_incoming *ibinc = ic->i_ibinc;
	struct rdsv3_header *ihdr, *hdr;

	/* XXX shut down the connection if port 0,0 are seen? */

	RDSV3_DPRINTF5("rdsv3_ib_process_recv",
	    "ic %p ibinc %p recv %p byte len %u", ic, ibinc, recv, data_len);

	if (data_len < sizeof (struct rdsv3_header)) {
		RDSV3_DPRINTF2("rdsv3_ib_process_recv",
		    "incoming message from %u.%u.%u.%u didn't include a "
		    "header, disconnecting and reconnecting",
		    NIPQUAD(conn->c_faddr));
		rdsv3_conn_drop(conn);
		return;
	}
	data_len -= sizeof (struct rdsv3_header);

	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];

	/* Validate the checksum. */
	if (!rdsv3_message_verify_checksum(ihdr)) {
		RDSV3_DPRINTF2("rdsv3_ib_process_recv", "incoming message "
		    "from %u.%u.%u.%u has corrupted header - "
		    "forcing a reconnect",
		    NIPQUAD(conn->c_faddr));
		rdsv3_conn_drop(conn);
		rdsv3_stats_inc(s_recv_drop_bad_checksum);
		return;
	}

	/* Process the ACK sequence which comes with every packet */
	state->ack_recv = ntohll(ihdr->h_ack);
	state->ack_recv_valid = 1;

	/* Process the credits update if there was one */
	if (ihdr->h_credit)
		rdsv3_ib_send_add_credits(conn, ihdr->h_credit);

	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
		/*
		 * This is an ACK-only packet. It gets special treatment
		 * here because, historically, ACKs were rather special
		 * beasts.
		 */
		rdsv3_ib_stats_inc(s_ib_ack_received);
		return;
	}

	/*
	 * If we don't already have an inc on the connection then this
	 * fragment has a header and starts a message.  Copy its header
	 * into the inc and save the inc so we can hang upcoming fragments
	 * off its list.
	 */
	if (!ibinc) {
		ibinc = recv->r_ibinc;
		recv->r_ibinc = NULL;
		ic->i_ibinc = ibinc;

		hdr = &ibinc->ii_inc.i_hdr;
		(void) memcpy(hdr, ihdr, sizeof (*hdr));
		ic->i_recv_data_rem = ntohl(hdr->h_len);

		RDSV3_DPRINTF5("rdsv3_ib_process_recv",
		    "ic %p ibinc %p rem %u flag 0x%x", ic, ibinc,
		    ic->i_recv_data_rem, hdr->h_flags);
	} else {
		hdr = &ibinc->ii_inc.i_hdr;
		/*
		 * We can't just use memcmp here; fragments of a
		 * single message may carry different ACKs
		 */
		if (hdr->h_sequence != ihdr->h_sequence ||
		    hdr->h_len != ihdr->h_len ||
		    hdr->h_sport != ihdr->h_sport ||
		    hdr->h_dport != ihdr->h_dport) {
			RDSV3_DPRINTF2("rdsv3_ib_process_recv",
			    "fragment header mismatch; forcing reconnect");
			rdsv3_conn_drop(conn);
			return;
		}
	}

	list_insert_tail(&ibinc->ii_frags, recv->r_frag);
	recv->r_frag = NULL;

	if (ic->i_recv_data_rem > RDSV3_FRAG_SIZE)
		ic->i_recv_data_rem -= RDSV3_FRAG_SIZE;
	else {
		ic->i_recv_data_rem = 0;
		ic->i_ibinc = NULL;

		if (ibinc->ii_inc.i_hdr.h_flags == RDSV3_FLAG_CONG_BITMAP)
			rdsv3_ib_cong_recv(conn, ibinc);
		else {
			rdsv3_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
			    &ibinc->ii_inc, KM_NOSLEEP);
			state->ack_next = ntohll(hdr->h_sequence);
			state->ack_next_valid = 1;
		}

		/*
		 * Evaluate the ACK_REQUIRED flag *after* we received
		 * the complete frame, and after bumping the next_rx
		 * sequence.
		 */
		if (hdr->h_flags & RDSV3_FLAG_ACK_REQUIRED) {
			rdsv3_stats_inc(s_recv_ack_required);
			state->ack_required = 1;
		}

		rdsv3_inc_put(&ibinc->ii_inc);
	}

	RDSV3_DPRINTF4("rdsv3_ib_process_recv",
	    "Return: conn: %p recv: %p len: %d state: %p",
	    conn, recv, data_len, state);
}

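/*
 * Per-completion receive handler: process the oldest posted WR (or drop
 * the connection on error), return its slot to the ring, and fire the
 * refill thread if the ring is running low.
 */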
void
rdsv3_ib_recv_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc,
    struct rdsv3_ib_ack_state *state)
{
	struct rdsv3_connection *conn = ic->conn;
	struct rdsv3_ib_recv_work *recv;
	struct rdsv3_ib_work_ring *recv_ringp = &ic->i_recv_ring;

	RDSV3_DPRINTF4("rdsv3_ib_recv_cqe_handler",
	    "rwc wc_id 0x%llx status %u byte_len %u imm_data %u\n",
	    (unsigned long long)wc->wc_id, wc->wc_status,
	    wc->wc_bytes_xfer, ntohl(wc->wc_immed_data));

	rdsv3_ib_stats_inc(s_ib_rx_cq_event);

	recv = &ic->i_recvs[rdsv3_ib_ring_oldest(recv_ringp)];

	/*
	 * Also process recvs in connecting state because it is possible
	 * to get a recv completion _before_ the rdmacm ESTABLISHED
	 * event is processed.
	 */
	if (rdsv3_conn_up(conn) || rdsv3_conn_connecting(conn)) {
		/* We expect errors as the qp is drained during shutdown */
		if (wc->wc_status == IBT_WC_SUCCESS) {
			rdsv3_ib_process_recv(conn, recv,
			    wc->wc_bytes_xfer, state);
		} else {
			RDSV3_DPRINTF2("rdsv3_ib_recv_cqe_handler",
			    "recv completion on "
			    "%u.%u.%u.%u had status %u, "
			    "disconnecting and reconnecting\n",
			    NIPQUAD(conn->c_faddr),
			    wc->wc_status);
			rdsv3_conn_drop(conn);
		}
	}

	rdsv3_ib_ring_free(recv_ringp, 1);

	/*
	 * If we ever end up with a really empty receive ring, we're
	 * in deep trouble, as the sender will definitely see RNR
	 * timeouts.
	 */
	if (rdsv3_ib_ring_empty(recv_ringp))
		rdsv3_ib_stats_inc(s_ib_rx_ring_empty);

	if (rdsv3_ib_ring_low(recv_ringp)) {
		rdsv3_af_thr_fire(ic->i_refill_rq);
	}
}

int
rdsv3_ib_recv(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	int ret = 0;

	RDSV3_DPRINTF4("rdsv3_ib_recv", "conn %p\n", conn);

	if (rdsv3_conn_up(conn))
		rdsv3_ib_attempt_ack(ic);

	RDSV3_DPRINTF4("rdsv3_ib_recv", "Return: conn: %p", conn);

	return (ret);
}

extern int rdsv3_ib_inc_constructor(void *buf, void *arg, int kmflags);
extern void rdsv3_ib_inc_destructor(void *buf, void *arg);

int
rdsv3_ib_recv_init(void)
{
	RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Enter");

	rdsv3_ib_incoming_slab = kmem_cache_create("rdsv3_ib_incoming",
	    sizeof (struct rdsv3_ib_incoming), 0, rdsv3_ib_inc_constructor,
	    rdsv3_ib_inc_destructor, NULL, NULL, NULL, 0);
	if (!rdsv3_ib_incoming_slab) {
		RDSV3_DPRINTF2("rdsv3_ib_recv_init", "kmem_cache_create "
		    "failed");
		return (-ENOMEM);
	}

	RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Return");
	return (0);
}

void
rdsv3_ib_recv_exit(void)
{
	RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Enter");
	kmem_cache_destroy(rdsv3_ib_incoming_slab);
	RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Return");
}