1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 /* Copyright (c) 1990 Mentat Inc. */
27 
28 /*
29  * An implementation of the IPoIB-CM standard based on PSARC 2009/593.
30  */
31 #include <sys/types.h>
32 #include <sys/conf.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/modctl.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strsun.h>
39 #include <sys/strsubr.h>
40 #include <sys/dlpi.h>
41 #include <sys/mac_provider.h>
42 
43 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
44 #include <sys/atomic.h>		/* for atomic_add*() */
45 #include <sys/ethernet.h>	/* for ETHERTYPE_IP */
46 #include <netinet/in.h>		/* for netinet/ip.h below */
47 #include <netinet/ip.h>		/* for struct ip */
48 #include <inet/common.h>	/* for inet/ip.h below */
49 #include <inet/ip.h>		/* for ipha_t */
50 #include <inet/ip_if.h>		/* for ETHERTYPE_IPV6 */
51 #include <inet/ip6.h>		/* for ip6_t */
52 #include <netinet/icmp6.h>	/* for icmp6_t */
53 #include <sys/ib/ibtl/ibvti.h>	/* for ace->ac_dest->ud_dst_qpn */
54 
55 #include <sys/ib/clients/ibd/ibd.h>
56 
57 extern ibd_global_state_t ibd_gstate;
58 
59 /* Per-interface tunables (for developers) */
60 extern uint_t ibd_rc_tx_copy_thresh;
61 /*
62  * ibd_rc_rx_copy_thresh
63  *     If (the size of incoming buffer <= ibd_rc_rx_copy_thresh), ibd will
64  * attempt to allocate a buffer and do a bcopy of the incoming data into
65  * the alocated buffer.
66  *
67  * ibd_rc_rx_rwqe_thresh
68  *     If (the number of available rwqe < ibd_rc_rx_rwqe_thresh), ibd will
69  * attempt to allocate a buffer and do a bcopy of the incoming data into
70  * the allocated buffer.
71  */
72 uint_t ibd_rc_rx_copy_thresh = 0x1000;
uint_t ibd_rc_rx_rwqe_thresh = 0x200;	/* previous default was 32 */
74 
75 /*
76  * ibd_rc_num_swqe
77  *	1) Send CQ size = ibd_rc_num_swqe
78  *	2) The send queue size = ibd_rc_num_swqe -1
79  *	3) Number of pre-allocated Tx buffers for ibt_post_send() =
80  * ibd_rc_num_swqe - 1.
81  */
82 uint_t ibd_rc_num_swqe = 0x1ff;
83 
84 /*
85  * ibd_rc_num_rwqe
86  *	1) For non-SRQ, we pre-post ibd_rc_num_rwqe number of WRs
87  * via ibt_post_receive() for receive queue of each RC channel.
88  *	2) For SRQ and non-SRQ, receive CQ size = ibd_rc_num_rwqe
89  */
90 uint_t ibd_rc_num_rwqe = 0x7ff;
91 
92 /*
93  * For SRQ
94  *	If using SRQ, we allocate ibd_rc_num_srq number of buffers (the size of
95  * each buffer is equal to RC mtu). And post them by ibt_post_srq().
96  *
97  *	ibd_rc_num_srq should not be larger than ibd_rc_num_rwqe, otherwise
98  * it will cause a bug with the following warnings:
99  * NOTICE: hermon0: Device Error: EQE cq overrun or protection error
100  * NOTICE: hermon0: Device Error: EQE local work queue catastrophic error
101  * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff catastrophic
102  * channel error
103  * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff completion queue
104  * error
105  */
106 uint_t ibd_rc_num_srq = 0x7fe;
107 
108 boolean_t ibd_rc_enable_cq_moderation = B_TRUE;
109 
110 /*
111  * Send CQ moderation parameters
112  */
113 uint_t ibd_rc_txcomp_count = 10;
114 uint_t ibd_rc_txcomp_usec = 300;
115 
116 /*
117  * Receive CQ moderation parameters
118  */
119 uint_t ibd_rc_rxcomp_count = 4;
120 uint_t ibd_rc_rxcomp_usec = 10;
121 
122 uint_t ibd_rc_tx_softintr = 1;
123 
124 /*
 * If the number of WRs in the receive queue of an RC connection drops
 * below IBD_RC_RX_WR_THRESHOLD, we post more receive WRs into it.
127  */
128 #define	IBD_RC_RX_WR_THRESHOLD		0x20
129 
130 /*
 * If the number of free SWQEs (or large Tx buffers) is greater than or
 * equal to IBD_RC_TX_FREE_THRESH, we call mac_tx_update() to notify GLD
 * that it can resume transmitting packets.
134  */
135 #define	IBD_RC_TX_FREE_THRESH		8
136 
137 #define	IBD_RC_QPN_TO_SID(qpn) \
138 	((uint64_t)(IBD_RC_SERVICE_ID | ((qpn) & 0xffffff)))
139 
140 /* For interop with legacy OFED */
141 #define	IBD_RC_QPN_TO_SID_OFED_INTEROP(qpn) \
142 	((uint64_t)(IBD_RC_SERVICE_ID_OFED_INTEROP | ((qpn) & 0xffffff)))
143 
144 /* Internet Header + 64 bits of Data Datagram. Refer to RFC 792 */
145 #define	IBD_RC_IP_ICMP_RETURN_DATA_BYTES	64
146 
147 
148 /* Functions for Reliable Connected Mode */
149 /* Connection Setup/Close Functions */
150 static ibt_cm_status_t ibd_rc_dispatch_pass_mad(void *,
151     ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
152 static ibt_cm_status_t ibd_rc_dispatch_actv_mad(void *,
153     ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
154 static int ibd_rc_pas_close(ibd_rc_chan_t *);
155 static void ibd_rc_act_close(ibd_rc_chan_t *);
156 
157 static inline void ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *,
158     ibd_rc_chan_t *);
159 static inline ibd_rc_chan_t *ibd_rc_rm_header_chan_list(
160     ibd_rc_chan_list_t *);
161 static inline void ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *,
162     ibd_rc_chan_t *);
163 
164 /* CQ handlers */
165 static void ibd_rc_rcq_handler(ibt_cq_hdl_t, void *);
166 static void ibd_rc_scq_handler(ibt_cq_hdl_t, void *);
167 static void ibd_rc_poll_rcq(ibd_rc_chan_t *, ibt_cq_hdl_t);
168 
169 /* Receive Functions */
170 static int ibd_rc_post_srq(ibd_state_t *, ibd_rwqe_t *);
171 static void ibd_rc_srq_freemsg_cb(char *);
172 static void ibd_rc_srq_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
173 
174 static int ibd_rc_post_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
175 static void ibd_rc_freemsg_cb(char *);
176 static void ibd_rc_process_rx(ibd_rc_chan_t *, ibd_rwqe_t *, ibt_wc_t *);
177 static void ibd_rc_free_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
178 static void ibd_rc_fini_rxlist(ibd_rc_chan_t *);
179 
180 
181 /* Send Functions */
182 static void ibd_rc_release_swqe(ibd_rc_chan_t *, ibd_swqe_t *);
183 static int ibd_rc_init_txlist(ibd_rc_chan_t *);
184 static void ibd_rc_fini_txlist(ibd_rc_chan_t *);
185 static uint_t ibd_rc_tx_recycle(caddr_t);
186 
187 
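/*
 * Async handler: close the active-side RC channel passed in via
 * req->rq_ptr, drop the ace reference that the channel held (returning
 * the ace to the free list, or deferring the recycle if it is still in
 * use), then repeat for any channels queued on rc_obs_act_chan_list.
 */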
188 void
189 ibd_async_rc_close_act_chan(ibd_state_t *state, ibd_req_t *req)
190 {
191 	ibd_rc_chan_t *rc_chan = req->rq_ptr;
192 	ibd_ace_t *ace;
193 
194 	while (rc_chan != NULL) {
195 		ace = rc_chan->ace;
196 		ASSERT(ace != NULL);
197 		/* Close old RC channel */
198 		ibd_rc_act_close(rc_chan);
199 		mutex_enter(&state->id_ac_mutex);
200 		ASSERT(ace->ac_ref != 0);
201 		atomic_dec_32(&ace->ac_ref);
202 		ace->ac_chan = NULL;
203 		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
204 			IBD_ACACHE_INSERT_FREE(state, ace);
205 			ace->ac_ref = 0;
206 		} else {
207 			ace->ac_ref |= CYCLEVAL;
208 			state->rc_delay_ace_recycle++;
209 		}
210 		mutex_exit(&state->id_ac_mutex);
211 		rc_chan = ibd_rc_rm_header_chan_list(
212 		    &state->rc_obs_act_chan_list);
213 	}
214 }
215 
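/*
 * Async handler: close the active RC channel attached to the ace in
 * req->rq_ptr, drop the ace reference (free-listing the ace or deferring
 * the recycle), and clear the pending rc_ace_recycle marker.
 */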
216 void
217 ibd_async_rc_recycle_ace(ibd_state_t *state, ibd_req_t *req)
218 {
219 	ibd_ace_t *ace = req->rq_ptr;
220 	ibd_rc_chan_t *rc_chan;
221 
222 	ASSERT(ace != NULL);
223 	rc_chan = ace->ac_chan;
224 	ASSERT(rc_chan != NULL);
225 	/* Close old RC channel */
226 	ibd_rc_act_close(rc_chan);
227 	mutex_enter(&state->id_ac_mutex);
228 	ASSERT(ace->ac_ref != 0);
229 	atomic_dec_32(&ace->ac_ref);
230 	ace->ac_chan = NULL;
231 	if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
232 		IBD_ACACHE_INSERT_FREE(state, ace);
233 		ace->ac_ref = 0;
234 	} else {
235 		ace->ac_ref |= CYCLEVAL;
236 		state->rc_delay_ace_recycle++;
237 	}
238 	mutex_exit(&state->id_ac_mutex);
239 	mutex_enter(&state->rc_ace_recycle_lock);
240 	state->rc_ace_recycle = NULL;
241 	mutex_exit(&state->rc_ace_recycle_lock);
242 }
243 
244 /* Simple ICMP IP Header Template */
245 static const ipha_t icmp_ipha = {
246 	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
247 };
248 
/*
 * Packet is too big: build an ICMP Fragmentation Needed packet and hand
 * it up to GLD so that the stack lowers its path MTU.
 */
250 void
251 ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req)
252 {
253 	mblk_t *mp = req->rq_ptr;
254 	ibd_ace_t *ace = req->rq_ptr2;
255 	uint16_t mtu = state->id_mtu - IPOIB_HDRSIZE;
256 	uint_t	len_needed;
257 	size_t	msg_len;
258 	mblk_t	*pmtu_mp;
259 	ushort_t	sap;
260 	ib_header_info_t *ibha;	/* ib header for pmtu_pkt */
261 	/*
262 	 * ipha: IP header for pmtu_pkt
263 	 * old_ipha: IP header for old packet
264 	 */
265 	ipha_t *ipha, *old_ipha;
266 	icmph_t	*icmph;
267 
268 	sap = ntohs(((ipoib_hdr_t *)mp->b_rptr)->ipoib_type);
269 
270 	if (!pullupmsg(mp, -1)) {
271 		DPRINT(40, "ibd_async_rc_process_too_big: pullupmsg fail");
272 		goto too_big_fail;
273 	}
274 	/* move to IP header. */
275 	mp->b_rptr += IPOIB_HDRSIZE;
276 	old_ipha = (ipha_t *)mp->b_rptr;
277 
278 	len_needed = IPH_HDR_LENGTH(old_ipha);
279 	if (old_ipha->ipha_protocol == IPPROTO_ENCAP) {
280 		len_needed += IPH_HDR_LENGTH(((uchar_t *)old_ipha +
281 		    len_needed));
282 	} else if (old_ipha->ipha_protocol == IPPROTO_IPV6) {
283 		ip6_t *ip6h = (ip6_t *)((uchar_t *)old_ipha
284 		    + len_needed);
285 		len_needed += ip_hdr_length_v6(mp, ip6h);
286 	}
287 	len_needed += IBD_RC_IP_ICMP_RETURN_DATA_BYTES;
288 	msg_len = msgdsize(mp);
289 	if (msg_len > len_needed) {
290 		(void) adjmsg(mp, len_needed - msg_len);
291 		msg_len = len_needed;
292 	}
293 
294 	if ((pmtu_mp = allocb(sizeof (ib_header_info_t) + sizeof (ipha_t)
295 	    + sizeof (icmph_t), BPRI_MED)) == NULL) {
296 		DPRINT(40, "ibd_async_rc_process_too_big: allocb fail");
297 		goto too_big_fail;
298 	}
299 	pmtu_mp->b_cont = mp;
300 	pmtu_mp->b_wptr = pmtu_mp->b_rptr + sizeof (ib_header_info_t)
301 	    + sizeof (ipha_t) + sizeof (icmph_t);
302 
303 	ibha = (ib_header_info_t *)pmtu_mp->b_rptr;
304 
305 	/* Fill IB header */
306 	bcopy(&state->id_macaddr, &ibha->ib_dst, IPOIB_ADDRL);
307 	/*
308 	 * If the GRH is not valid, indicate to GLDv3 by setting
309 	 * the VerTcFlow field to 0.
310 	 */
311 	ibha->ib_grh.ipoib_vertcflow = 0;
312 	ibha->ipib_rhdr.ipoib_type = htons(sap);
313 	ibha->ipib_rhdr.ipoib_mbz = 0;
314 
315 	/* Fill IP header */
316 	ipha = (ipha_t *)&ibha[1];
317 	*ipha = icmp_ipha;
318 	ipha->ipha_src = old_ipha->ipha_dst;
319 	ipha->ipha_dst = old_ipha->ipha_src;
320 	ipha->ipha_ttl = old_ipha->ipha_ttl;
321 	msg_len += sizeof (icmp_ipha) + sizeof (icmph_t);
322 	if (msg_len > IP_MAXPACKET) {
323 		ibd_print_warn(state, "ibd_rc_process_too_big_pkt: msg_len(%d) "
324 		    "> IP_MAXPACKET", (uint32_t)msg_len);
325 		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
326 		msg_len = IP_MAXPACKET;
327 	}
328 	ipha->ipha_length = htons((uint16_t)msg_len);
329 	ipha->ipha_hdr_checksum = 0;
330 	ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
331 
332 	/* Fill ICMP body */
333 	icmph = (icmph_t *)&ipha[1];
334 	bzero(icmph, sizeof (icmph_t));
335 	icmph->icmph_type = ICMP_DEST_UNREACHABLE;
336 	icmph->icmph_code = ICMP_FRAGMENTATION_NEEDED;
337 	icmph->icmph_du_mtu = htons(mtu);
338 	icmph->icmph_checksum = 0;
339 	icmph->icmph_checksum = IP_CSUM(pmtu_mp,
340 	    (int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0);
341 
342 	(void) hcksum_assoc(pmtu_mp, NULL, NULL, 0, 0, 0, 0,
343 	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
344 
345 	DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, "
346 	    "ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d",
347 	    sap, ipha->ipha_src, ipha->ipha_dst, ipha->ipha_ttl,
348 	    len_needed, (uint32_t)msg_len);
349 
350 	mac_rx(state->id_mh, state->id_rh, pmtu_mp);
351 
352 	mutex_enter(&ace->tx_too_big_mutex);
353 	ace->tx_too_big_ongoing = B_FALSE;
354 	mutex_exit(&ace->tx_too_big_mutex);
355 	return;
356 
357 too_big_fail:
358 	/* Drop packet */
359 	freemsg(mp);
360 	mutex_enter(&ace->tx_too_big_mutex);
361 	ace->tx_too_big_ongoing = B_FALSE;
362 	mutex_exit(&ace->tx_too_big_mutex);
363 }
364 
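/*
 * Read the per-instance "enable_rc" property array from ibd.conf to decide
 * whether RC mode is enabled for this interface instance, then set the RC
 * MTU and enable SRQ usage.  As an illustration, a driver.conf line such
 * as "enable_rc=1,0;" would enable RC mode on instance 0 and disable it
 * on instance 1.
 */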
365 void
366 ibd_rc_get_conf(ibd_state_t *state)
367 {
368 	int *props;
369 	uint_t num_props;
370 	int instance;
371 
372 	instance = ddi_get_instance(state->id_dip);
373 
374 	/*
375 	 * Get the array of "enable_rc" properties from "ibd.conf" file
376 	 */
377 	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, state->id_dip,
378 	    DDI_PROP_DONTPASS, "enable_rc", &props, &num_props)
379 	    == DDI_PROP_SUCCESS) {
380 		if (instance < num_props) {
381 			if (props[instance] == 1) {
382 				state->id_enable_rc = B_TRUE;
383 			} else {
384 				state->id_enable_rc = B_FALSE;
385 			}
386 		} else {
387 			/* not enough properties configured */
388 			state->id_enable_rc = B_FALSE;
389 			DPRINT(40, "ibd_rc_get_conf: Not enough "
390 			    "enable_rc values in ibd.conf,"
391 			    " disable RC mode, instance=%d", instance);
392 		}
393 
394 		/* free memory allocated for properties */
395 		ddi_prop_free(props);
396 	} else {
397 		state->id_enable_rc = B_FALSE;
398 		DPRINT(30, "ibd_rc_get_conf: fail to find "
399 		    "enable_rc in ibd.conf, disable RC mode");
400 	}
401 
402 	state->rc_mtu = 65524;
403 	state->rc_enable_srq = B_TRUE;
404 }
405 
406 #ifdef DEBUG
407 /*
408  * ibd_rc_update_stats - update driver private kstat counters
409  *
 * This routine copies ibd's internal Reliable Connected mode statistics
 * counters into the named kstat data area so they can be read on demand.
413  */
414 static int
415 ibd_rc_update_stats(kstat_t *ksp, int rw)
416 {
417 	ibd_state_t *state;
418 	ibd_rc_stat_t *ibd_rc_ksp;
419 
420 	if (rw == KSTAT_WRITE)
421 		return (EACCES);
422 
423 	state = (ibd_state_t *)ksp->ks_private;
424 	ASSERT(state != NULL);
425 	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;
426 
427 	ibd_rc_ksp->rc_rcv_trans_byte.value.ul = state->rc_rcv_trans_byte;
428 	ibd_rc_ksp->rc_rcv_trans_pkt.value.ul = state->rc_rcv_trans_pkt;
429 	ibd_rc_ksp->rc_rcv_copy_byte.value.ul = state->rc_rcv_copy_byte;
430 	ibd_rc_ksp->rc_rcv_copy_pkt.value.ul = state->rc_rcv_copy_pkt;
431 	ibd_rc_ksp->rc_rcv_alloc_fail.value.ul = state->rc_rcv_alloc_fail;
432 
433 	ibd_rc_ksp->rc_rcq_invoke.value.ul = state->rc_rcq_invoke;
434 	ibd_rc_ksp->rc_rcq_err.value.ul = state->rc_rcq_err;
435 	ibd_rc_ksp->rc_scq_invoke.value.ul = state->rc_scq_invoke;
436 
437 	ibd_rc_ksp->rc_rwqe_short.value.ul = state->rc_rwqe_short;
438 
439 	ibd_rc_ksp->rc_xmt_bytes.value.ul = state->rc_xmt_bytes;
440 	ibd_rc_ksp->rc_xmt_small_pkt.value.ul = state->rc_xmt_small_pkt;
441 	ibd_rc_ksp->rc_xmt_fragmented_pkt.value.ul =
442 	    state->rc_xmt_fragmented_pkt;
443 	ibd_rc_ksp->rc_xmt_map_fail_pkt.value.ul = state->rc_xmt_map_fail_pkt;
444 	ibd_rc_ksp->rc_xmt_map_succ_pkt.value.ul = state->rc_xmt_map_succ_pkt;
445 	ibd_rc_ksp->rc_ace_not_found.value.ul = state->rc_ace_not_found;
446 
447 	ibd_rc_ksp->rc_scq_no_swqe.value.ul = state->rc_scq_no_swqe;
448 	ibd_rc_ksp->rc_scq_no_largebuf.value.ul = state->rc_scq_no_largebuf;
449 	ibd_rc_ksp->rc_swqe_short.value.ul = state->rc_swqe_short;
450 	ibd_rc_ksp->rc_swqe_mac_update.value.ul = state->rc_swqe_mac_update;
451 	ibd_rc_ksp->rc_xmt_buf_short.value.ul = state->rc_xmt_buf_short;
452 	ibd_rc_ksp->rc_xmt_buf_mac_update.value.ul =
453 	    state->rc_xmt_buf_mac_update;
454 
455 	ibd_rc_ksp->rc_conn_succ.value.ul = state->rc_conn_succ;
456 	ibd_rc_ksp->rc_conn_fail.value.ul = state->rc_conn_fail;
457 	ibd_rc_ksp->rc_null_conn.value.ul = state->rc_null_conn;
458 	ibd_rc_ksp->rc_no_estab_conn.value.ul = state->rc_no_estab_conn;
459 
460 	ibd_rc_ksp->rc_act_close.value.ul = state->rc_act_close;
461 	ibd_rc_ksp->rc_pas_close.value.ul = state->rc_pas_close;
462 	ibd_rc_ksp->rc_delay_ace_recycle.value.ul = state->rc_delay_ace_recycle;
463 	ibd_rc_ksp->rc_act_close_simultaneous.value.ul =
464 	    state->rc_act_close_simultaneous;
465 	ibd_rc_ksp->rc_reset_cnt.value.ul = state->rc_reset_cnt;
466 
467 	return (0);
468 }
469 
470 
471 /*
472  * ibd_rc_init_stats - initialize kstat data structures
473  *
474  * This routine will create and initialize the driver private
475  * statistics counters.
476  */
477 int
478 ibd_rc_init_stats(ibd_state_t *state)
479 {
480 	kstat_t *ksp;
481 	ibd_rc_stat_t *ibd_rc_ksp;
482 
483 	/*
484 	 * Create and init kstat
485 	 */
486 	ksp = kstat_create("ibd", ddi_get_instance(state->id_dip),
487 	    "statistics", "net", KSTAT_TYPE_NAMED,
488 	    sizeof (ibd_rc_stat_t) / sizeof (kstat_named_t), 0);
489 
490 	if (ksp == NULL) {
491 		ibd_print_warn(state, "ibd_rc_init_stats: Could not create "
492 		    "kernel statistics");
493 		return (DDI_FAILURE);
494 	}
495 
	state->rc_ksp = ksp;	/* Save the kstat pointer for RC mode */
497 
498 	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;
499 
500 	/*
501 	 * Initialize all the statistics
502 	 */
503 	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_byte, "RC: Rx Bytes, "
504 	    "transfer mode", KSTAT_DATA_ULONG);
505 	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_pkt, "RC: Rx Pkts, "
506 	    "transfer mode", KSTAT_DATA_ULONG);
507 	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_byte, "RC: Rx Bytes, "
508 	    "copy mode", KSTAT_DATA_ULONG);
509 	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_pkt, "RC: Rx Pkts, "
510 	    "copy mode", KSTAT_DATA_ULONG);
511 	kstat_named_init(&ibd_rc_ksp->rc_rcv_alloc_fail, "RC: Rx alloc fail",
512 	    KSTAT_DATA_ULONG);
513 
514 	kstat_named_init(&ibd_rc_ksp->rc_rcq_invoke, "RC: invoke of Recv CQ "
515 	    "handler", KSTAT_DATA_ULONG);
516 	kstat_named_init(&ibd_rc_ksp->rc_rcq_err, "RC: fail in Recv CQ handler",
517 	    KSTAT_DATA_ULONG);
518 
519 	kstat_named_init(&ibd_rc_ksp->rc_scq_invoke, "RC: invoke of Send CQ "
520 	    "handler", KSTAT_DATA_ULONG);
521 
522 	kstat_named_init(&ibd_rc_ksp->rc_rwqe_short, "RC: Short rwqe",
523 	    KSTAT_DATA_ULONG);
524 
525 	kstat_named_init(&ibd_rc_ksp->rc_xmt_bytes, "RC: Sent Bytes",
526 	    KSTAT_DATA_ULONG);
527 	kstat_named_init(&ibd_rc_ksp->rc_xmt_small_pkt,
528 	    "RC: Tx pkt small size", KSTAT_DATA_ULONG);
529 	kstat_named_init(&ibd_rc_ksp->rc_xmt_fragmented_pkt,
530 	    "RC: Tx pkt fragmentary", KSTAT_DATA_ULONG);
531 	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_fail_pkt,
532 	    "RC: Tx pkt fail ibt_map_mem_iov()", KSTAT_DATA_ULONG);
533 	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_succ_pkt,
534 	    "RC: Tx pkt succ ibt_map_mem_iov()", KSTAT_DATA_ULONG);
535 	kstat_named_init(&ibd_rc_ksp->rc_ace_not_found, "RC: ace not found",
536 	    KSTAT_DATA_ULONG);
537 
538 	kstat_named_init(&ibd_rc_ksp->rc_scq_no_swqe, "RC: No swqe after "
539 	    "recycle", KSTAT_DATA_ULONG);
540 	kstat_named_init(&ibd_rc_ksp->rc_scq_no_largebuf, "RC: No large tx buf "
541 	    "after recycle", KSTAT_DATA_ULONG);
542 	kstat_named_init(&ibd_rc_ksp->rc_swqe_short, "RC: No swqe in ibd_send",
543 	    KSTAT_DATA_ULONG);
544 	kstat_named_init(&ibd_rc_ksp->rc_swqe_mac_update, "RC: mac_tx_update "
545 	    "#, swqe available", KSTAT_DATA_ULONG);
546 	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_short, "RC: No buf in "
547 	    "ibd_send", KSTAT_DATA_ULONG);
548 	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_mac_update, "RC: "
549 	    "mac_tx_update #, buf available", KSTAT_DATA_ULONG);
550 
551 	kstat_named_init(&ibd_rc_ksp->rc_conn_succ, "RC: succ connected",
552 	    KSTAT_DATA_ULONG);
553 	kstat_named_init(&ibd_rc_ksp->rc_conn_fail, "RC: fail connect",
554 	    KSTAT_DATA_ULONG);
555 	kstat_named_init(&ibd_rc_ksp->rc_null_conn, "RC: null conn for unicast "
556 	    "pkt", KSTAT_DATA_ULONG);
557 	kstat_named_init(&ibd_rc_ksp->rc_no_estab_conn, "RC: not in act estab "
558 	    "state", KSTAT_DATA_ULONG);
559 
560 	kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: call ibd_rc_act_close",
561 	    KSTAT_DATA_ULONG);
562 	kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: call ibd_rc_pas_close",
563 	    KSTAT_DATA_ULONG);
564 	kstat_named_init(&ibd_rc_ksp->rc_delay_ace_recycle, "RC: delay ace "
565 	    "recycle", KSTAT_DATA_ULONG);
566 	kstat_named_init(&ibd_rc_ksp->rc_act_close_simultaneous, "RC: "
567 	    "simultaneous ibd_rc_act_close", KSTAT_DATA_ULONG);
568 	kstat_named_init(&ibd_rc_ksp->rc_reset_cnt, "RC: Reset RC channel",
569 	    KSTAT_DATA_ULONG);
570 
571 	/*
572 	 * Function to provide kernel stat update on demand
573 	 */
574 	ksp->ks_update = ibd_rc_update_stats;
575 
576 	/*
577 	 * Pointer into provider's raw statistics
578 	 */
579 	ksp->ks_private = (void *)state;
580 
581 	/*
	 * Add kstat to the system's kstat chain
583 	 */
584 	kstat_install(ksp);
585 
586 	return (DDI_SUCCESS);
587 }
588 #endif
589 
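/*
 * Allocate an ibd_rc_chan_t and its IBT resources: the send and receive
 * CQs (with optional interrupt moderation), the Tx buffer list and soft
 * interrupt for a Tx channel, and finally the RC channel itself, which
 * is attached to the SRQ when SRQ mode is enabled.
 */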
590 static ibt_status_t
591 ibd_rc_alloc_chan(ibd_rc_chan_t **ret_chan, ibd_state_t *state,
592     boolean_t is_tx_chan)
593 {
594 	ibt_status_t result;
595 	ibd_rc_chan_t *chan;
596 	ibt_rc_chan_alloc_args_t alloc_args;
597 	ibt_chan_alloc_flags_t alloc_flags;
598 	ibt_chan_sizes_t sizes;
599 	ibt_cq_attr_t cq_atts;
600 	int rv;
601 
602 	chan = kmem_zalloc(sizeof (ibd_rc_chan_t), KM_SLEEP);
603 
604 	chan->state = state;
605 	mutex_init(&chan->rx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
606 	mutex_init(&chan->rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
607 	mutex_init(&chan->tx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
608 	mutex_init(&chan->tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
609 	mutex_init(&chan->tx_post_lock, NULL, MUTEX_DRIVER, NULL);
610 	mutex_init(&chan->tx_poll_lock, NULL, MUTEX_DRIVER, NULL);
611 
612 	/* Allocate IB structures for a new RC channel. */
613 	if (is_tx_chan) {
614 		chan->scq_size = ibd_rc_num_swqe;
615 		chan->rcq_size = IBD_RC_MIN_CQ_SIZE;
616 	} else {
617 		chan->scq_size = IBD_RC_MIN_CQ_SIZE;
618 		chan->rcq_size = ibd_rc_num_rwqe;
619 	}
620 	cq_atts.cq_size = chan->scq_size;
621 	cq_atts.cq_sched = NULL;
622 	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
623 	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->scq_hdl,
624 	    &chan->scq_size);
625 	if (result != IBT_SUCCESS) {
626 		DPRINT(40, "ibd_rc_alloc_chan: error <%d>"
627 		    "create scq completion queue (size <%d>)",
628 		    result, chan->scq_size);
629 		goto alloc_scq_err;
630 	}	/* if failure to alloc cq */
631 
632 	if (ibd_rc_enable_cq_moderation) {
633 		if (ibt_modify_cq(chan->scq_hdl, ibd_rc_txcomp_count,
634 		    ibd_rc_txcomp_usec, 0) != IBT_SUCCESS) {
635 			ibd_print_warn(state, "ibd_rc_alloc_chan: Send CQ "
636 			    "interrupt moderation failed");
637 		}
638 	}
639 
640 	ibt_set_cq_private(chan->scq_hdl, (void *) (uintptr_t)chan);
641 	ibt_set_cq_handler(chan->scq_hdl, ibd_rc_scq_handler,
642 	    (void *) (uintptr_t)chan);
643 
644 	cq_atts.cq_size = chan->rcq_size;
645 	cq_atts.cq_sched = NULL;
646 	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
647 	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->rcq_hdl,
648 	    &chan->rcq_size);
649 	if (result != IBT_SUCCESS) {
650 		ibd_print_warn(state, "ibd_rc_alloc_chan: error <%d> creating "
651 		    "rx completion queue (size <%d>)", result, chan->rcq_size);
652 		goto alloc_rcq_err;
653 	}	/* if failure to alloc cq */
654 
655 	if (ibd_rc_enable_cq_moderation) {
656 		if (ibt_modify_cq(chan->rcq_hdl, ibd_rc_rxcomp_count,
657 		    ibd_rc_rxcomp_usec, 0) != IBT_SUCCESS) {
658 			ibd_print_warn(state, "ibd_rc_alloc_chan: Receive CQ "
659 			    "interrupt moderation failed");
660 		}
661 	}
662 	ibt_set_cq_private(chan->rcq_hdl, (void *) (uintptr_t)chan);
663 	ibt_set_cq_handler(chan->rcq_hdl, ibd_rc_rcq_handler,
664 	    (void *)(uintptr_t)chan);
665 
666 	if (is_tx_chan) {
667 		chan->is_tx_chan = B_TRUE;
668 		if (ibd_rc_init_txlist(chan) != DDI_SUCCESS) {
669 			ibd_print_warn(state, "ibd_rc_alloc_chan: "
670 			    "ibd_rc_init_txlist failed");
671 			goto init_txlist_err;
672 		}
673 		if (ibd_rc_tx_softintr == 1) {
674 			if ((rv = ddi_add_softintr(state->id_dip,
675 			    DDI_SOFTINT_LOW, &chan->scq_softintr, NULL, NULL,
676 			    ibd_rc_tx_recycle, (caddr_t)chan)) !=
677 			    DDI_SUCCESS) {
678 				DPRINT(10, "ibd_rc_alloc_chan: failed in "
679 				    "ddi_add_softintr(scq_softintr), ret=%d",
680 				    rv);
681 				goto alloc_softintr_err;
682 			}
683 		}
684 	} else {
685 		chan->is_tx_chan = B_FALSE;
686 	}
687 
688 	/*
689 	 * enable completions
690 	 */
691 	result = ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION);
692 	if (result != IBT_SUCCESS) {
693 		ibd_print_warn(state, "ibd_rc_alloc_chan: ibt_enable_cq_notify"
694 		    "(scq) failed: status %d\n", result);
695 		goto alloc_scq_enable_err;
696 	}
697 
698 	/* We will enable chan->rcq_hdl later. */
699 
700 	/* alloc a RC channel */
701 	bzero(&alloc_args, sizeof (ibt_rc_chan_alloc_args_t));
702 	bzero(&sizes, sizeof (ibt_chan_sizes_t));
703 
704 	alloc_args.rc_flags = IBT_WR_SIGNALED;
705 	alloc_args.rc_control = IBT_CEP_NO_FLAGS;
706 
707 	alloc_args.rc_scq = chan->scq_hdl;
708 	alloc_args.rc_rcq = chan->rcq_hdl;
709 	alloc_args.rc_pd = state->id_pd_hdl;
710 
711 	alloc_args.rc_hca_port_num = state->id_port;
712 	alloc_args.rc_clone_chan = NULL;
713 
714 	/* scatter/gather */
715 	alloc_args.rc_sizes.cs_sq_sgl = state->rc_tx_max_sqseg;
716 
717 	/*
718 	 * For the number of SGL elements in receive side, I think it
719 	 * should be 1. Because ibd driver allocates a whole block memory
720 	 * for each ibt_post_recv().
721 	 */
722 	alloc_args.rc_sizes.cs_rq_sgl = 1;
723 
724 	/* The send queue size and the receive queue size */
725 	alloc_args.rc_sizes.cs_sq = chan->scq_size;
726 	alloc_args.rc_sizes.cs_rq = chan->rcq_size;
727 
728 	if (state->id_hca_res_lkey_capab) {
729 		alloc_args.rc_flags = IBT_FAST_REG_RES_LKEY;
730 	} else {
731 		DPRINT(40, "ibd_rc_alloc_chan: not support reserved lkey");
732 	}
733 
734 	if (state->rc_enable_srq) {
735 		alloc_flags = IBT_ACHAN_USES_SRQ;
736 		alloc_args.rc_srq = state->rc_srq_hdl;
737 	} else {
738 		alloc_flags = IBT_ACHAN_NO_FLAGS;
739 	}
740 
741 	result = ibt_alloc_rc_channel(state->id_hca_hdl,
742 	    alloc_flags, &alloc_args, &chan->chan_hdl, &sizes);
743 	if (result != IBT_SUCCESS) {
744 		ibd_print_warn(state, "ibd_rc_alloc_chan: ibd_rc_open_channel"
745 		    " fail:<%d>", result);
746 		goto alloc_scq_enable_err;
747 	}
748 
749 	*ret_chan = chan;
750 	return (IBT_SUCCESS);
751 
752 alloc_scq_enable_err:
753 	if (is_tx_chan) {
754 		if (ibd_rc_tx_softintr == 1) {
755 			ddi_remove_softintr(chan->scq_softintr);
756 		}
757 	}
758 alloc_softintr_err:
759 	if (is_tx_chan) {
760 		ibd_rc_fini_txlist(chan);
761 	}
762 init_txlist_err:
763 	(void) ibt_free_cq(chan->rcq_hdl);
764 alloc_rcq_err:
765 	(void) ibt_free_cq(chan->scq_hdl);
766 alloc_scq_err:
767 	mutex_destroy(&chan->tx_poll_lock);
768 	mutex_destroy(&chan->tx_post_lock);
769 	mutex_destroy(&chan->tx_rel_list.dl_mutex);
770 	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
771 	mutex_destroy(&chan->rx_free_list.dl_mutex);
772 	mutex_destroy(&chan->rx_wqe_list.dl_mutex);
773 	kmem_free(chan, sizeof (ibd_rc_chan_t));
774 	return (result);
775 }
776 
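/*
 * Tear down an RC channel: free the IBT channel and its CQs, release the
 * Tx or Rx buffer lists, remove the Tx soft interrupt if present, destroy
 * the mutexes and free the ibd_rc_chan_t itself.
 */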
777 static void
778 ibd_rc_free_chan(ibd_rc_chan_t *chan)
779 {
780 	ibt_status_t ret;
781 
782 	/* DPRINT(30, "ibd_rc_free_chan: chan=%p", chan); */
783 
784 	if (chan->chan_hdl != NULL) {
785 		ret = ibt_free_channel(chan->chan_hdl);
786 		if (ret != IBT_SUCCESS) {
787 			DPRINT(40, "ib_rc_free_chan: ibt_free_channel failed, "
788 			    "chan=%p, returned: %d", chan, ret);
789 			return;
790 		}
791 		chan->chan_hdl = NULL;
792 	}
793 
794 	if (chan->rcq_hdl != NULL) {
795 		ret = ibt_free_cq(chan->rcq_hdl);
796 		if (ret != IBT_SUCCESS) {
797 			DPRINT(40, "ib_rc_free_chan: ibt_free_cq(rcq) failed, "
798 			    "chan=%p, returned: %d", chan, ret);
799 			return;
800 		}
801 		chan->rcq_hdl = NULL;
802 	}
803 
804 	if (chan->scq_hdl != NULL) {
805 		ret = ibt_free_cq(chan->scq_hdl);
806 		if (ret != IBT_SUCCESS) {
807 			DPRINT(40, "ib_rc_free_chan: ibt_free_cq(scq) failed, "
808 			    "chan=%p, returned: %d", chan, ret);
809 			return;
810 		}
811 		chan->scq_hdl = NULL;
812 	}
813 
814 	/* Free buffers */
815 	if (chan->is_tx_chan) {
816 		ibd_rc_fini_txlist(chan);
817 		if (ibd_rc_tx_softintr == 1) {
818 			ddi_remove_softintr(chan->scq_softintr);
819 		}
820 	} else {
821 		if (!chan->state->rc_enable_srq) {
822 			ibd_rc_fini_rxlist(chan);
823 		}
824 	}
825 
826 	mutex_destroy(&chan->tx_poll_lock);
827 	mutex_destroy(&chan->tx_post_lock);
828 	mutex_destroy(&chan->tx_rel_list.dl_mutex);
829 	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
830 	mutex_destroy(&chan->rx_free_list.dl_mutex);
831 	mutex_destroy(&chan->rx_wqe_list.dl_mutex);
832 
833 	/*
834 	 * If it is a passive channel, must make sure it has been removed
835 	 * from chan->state->rc_pass_chan_list
836 	 */
837 	kmem_free(chan, sizeof (ibd_rc_chan_t));
838 }
839 
/* Add an RC channel to the head of a channel list */
841 static inline void
842 ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
843 {
844 	mutex_enter(&list->chan_list_mutex);
845 	if (list->chan_list == NULL) {
846 		list->chan_list = chan;
847 	} else {
848 		chan->next = list->chan_list;
849 		list->chan_list = chan;
850 	}
851 	mutex_exit(&list->chan_list_mutex);
852 }
853 
/* Remove an RC channel from a channel list */
855 static inline void
856 ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
857 {
858 	ibd_rc_chan_t *pre_chan;
859 
860 	mutex_enter(&list->chan_list_mutex);
861 	if (list->chan_list == chan) {
862 		DPRINT(30, "ibd_rc_rm_from_chan_list(first): found chan(%p)"
863 		    " in chan_list", chan);
864 		list->chan_list = chan->next;
865 	} else {
866 		pre_chan = list->chan_list;
867 		while (pre_chan != NULL) {
868 			if (pre_chan->next == chan) {
869 				DPRINT(30, "ibd_rc_rm_from_chan_list"
870 				    "(middle): found chan(%p) in "
871 				    "rc_pass_chan_list", chan);
872 				pre_chan->next = chan->next;
873 				break;
874 			}
875 			pre_chan = pre_chan->next;
876 		}
877 	}
878 	mutex_exit(&list->chan_list_mutex);
879 }
880 
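/* Remove and return the first RC channel on the list (NULL if empty) */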
881 static inline ibd_rc_chan_t *
882 ibd_rc_rm_header_chan_list(ibd_rc_chan_list_t *list)
883 {
884 	ibd_rc_chan_t *rc_chan;
885 
886 	mutex_enter(&list->chan_list_mutex);
887 	rc_chan = list->chan_list;
888 	if (rc_chan != NULL) {
889 		list->chan_list = rc_chan->next;
890 	}
891 	mutex_exit(&list->chan_list_mutex);
892 	return (rc_chan);
893 }
894 
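/*
 * Allocate one contiguous buffer area and the rwqe array for the SRQ,
 * and register the buffer area as a single memory region.
 */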
895 static int
896 ibd_rc_alloc_srq_copybufs(ibd_state_t *state)
897 {
898 	ibt_mr_attr_t mem_attr;
899 	uint_t rc_rx_bufs_sz;
900 
901 	/*
902 	 * Allocate one big chunk for all regular rx copy bufs
903 	 */
904 	rc_rx_bufs_sz =  (state->rc_mtu + IPOIB_GRH_SIZE) * state->rc_srq_size;
905 
906 	state->rc_srq_rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);
907 
908 	state->rc_srq_rwqes = kmem_zalloc(state->rc_srq_size *
909 	    sizeof (ibd_rwqe_t), KM_SLEEP);
910 
911 	/*
912 	 * Do one memory registration on the entire rxbuf area
913 	 */
914 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_srq_rx_bufs;
915 	mem_attr.mr_len = rc_rx_bufs_sz;
916 	mem_attr.mr_as = NULL;
917 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
918 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
919 	    &state->rc_srq_rx_mr_hdl, &state->rc_srq_rx_mr_desc)
920 	    != IBT_SUCCESS) {
921 		DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr() "
922 		    "failed");
923 		kmem_free(state->rc_srq_rwqes,
924 		    state->rc_srq_size * sizeof (ibd_rwqe_t));
925 		kmem_free(state->rc_srq_rx_bufs, rc_rx_bufs_sz);
926 		state->rc_srq_rx_bufs = NULL;
927 		state->rc_srq_rwqes = NULL;
928 		return (DDI_FAILURE);
929 	}
930 
931 	return (DDI_SUCCESS);
932 }
933 
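/*
 * Deregister the SRQ rxbuf memory region and free the rwqe array and
 * the rxbuf area allocated by ibd_rc_alloc_srq_copybufs().
 */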
934 static void
935 ibd_rc_free_srq_copybufs(ibd_state_t *state)
936 {
937 	uint_t rc_rx_buf_sz;
938 
939 	/*
940 	 * Don't change the value of state->rc_mtu at the period from call
941 	 * ibd_rc_alloc_srq_copybufs() to call ibd_rc_free_srq_copybufs().
942 	 */
943 	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;
944 
945 	/*
946 	 * Unregister rxbuf mr
947 	 */
948 	if (ibt_deregister_mr(state->id_hca_hdl,
949 	    state->rc_srq_rx_mr_hdl) != IBT_SUCCESS) {
950 		DPRINT(40, "ibd_rc_free_srq_copybufs: ibt_deregister_mr()"
951 		    " failed");
952 	}
953 	state->rc_srq_rx_mr_hdl = NULL;
954 
955 	/*
956 	 * Free rxbuf memory
957 	 */
958 	kmem_free(state->rc_srq_rwqes,
959 	    state->rc_srq_size * sizeof (ibd_rwqe_t));
960 	kmem_free(state->rc_srq_rx_bufs, state->rc_srq_size * rc_rx_buf_sz);
961 	state->rc_srq_rwqes = NULL;
962 	state->rc_srq_rx_bufs = NULL;
963 }
964 
965 /*
966  * Allocate and post a certain number of SRQ receive buffers and WRs.
967  */
968 int
969 ibd_rc_init_srq_list(ibd_state_t *state)
970 {
971 	ibd_rwqe_t *rwqe;
972 	ibt_lkey_t lkey;
973 	int i;
974 	uint_t len;
975 	uint8_t *bufaddr;
976 	ibt_srq_sizes_t srq_sizes;
977 	ibt_srq_sizes_t	 srq_real_sizes;
978 	ibt_status_t ret;
979 
980 	srq_sizes.srq_sgl_sz = 1;
981 	srq_sizes.srq_wr_sz = ibd_rc_num_srq;
982 	ret = ibt_alloc_srq(state->id_hca_hdl, IBT_SRQ_NO_FLAGS,
983 	    state->id_pd_hdl, &srq_sizes, &state->rc_srq_hdl, &srq_real_sizes);
984 	if (ret != IBT_SUCCESS) {
985 		DPRINT(10, "ibd_rc_init_srq_list: ibt_alloc_srq failed."
986 		    "req_sgl_sz=%d, req_wr_sz=0x%x, ret=%d",
987 		    srq_sizes.srq_sgl_sz, srq_sizes.srq_wr_sz, ret);
988 		return (DDI_FAILURE);
989 	}
990 
991 	state->rc_srq_size = srq_real_sizes.srq_wr_sz;
992 	if (ibd_rc_alloc_srq_copybufs(state) != DDI_SUCCESS) {
993 		ret = ibt_free_srq(state->rc_srq_hdl);
994 		if (ret != IBT_SUCCESS) {
995 			ibd_print_warn(state, "ibd_rc_init_srq_list: "
996 			    "ibt_free_srq fail, ret=%d", ret);
997 		}
998 		return (DDI_FAILURE);
999 	}
1000 
1001 	/*
1002 	 * Allocate and setup the rwqe list
1003 	 */
1004 	lkey = state->rc_srq_rx_mr_desc.md_lkey;
1005 	rwqe = state->rc_srq_rwqes;
1006 	bufaddr = state->rc_srq_rx_bufs;
1007 	len = state->rc_mtu + IPOIB_GRH_SIZE;
1008 	state->rc_srq_rwqe_list.dl_cnt = 0;
1009 	state->rc_srq_rwqe_list.dl_bufs_outstanding = 0;
1010 	for (i = 0; i < state->rc_srq_size; i++, rwqe++, bufaddr += len) {
1011 		rwqe->w_state = state;
1012 		rwqe->w_freeing_wqe = B_FALSE;
1013 		rwqe->w_freemsg_cb.free_func = ibd_rc_srq_freemsg_cb;
1014 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
1015 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
1016 
1017 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
1018 		    &rwqe->w_freemsg_cb)) == NULL) {
1019 			DPRINT(40, "ibd_rc_init_srq_list : desballoc() failed");
1020 			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
1021 			if (atomic_dec_32_nv(&state->id_running) != 0) {
1022 				cmn_err(CE_WARN, "ibd_rc_init_srq_list: "
1023 				    "id_running was not 1\n");
1024 			}
1025 			ibd_rc_fini_srq_list(state);
1026 			atomic_inc_32(&state->id_running);
1027 			return (DDI_FAILURE);
1028 		}
1029 
1030 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
1031 		/* Leave IPOIB_GRH_SIZE space */
1032 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
1033 		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
1034 		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
1035 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
1036 		rwqe->w_rwr.wr_nds = 1;
1037 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
1038 		(void) ibd_rc_post_srq(state, rwqe);
1039 	}
1040 
1041 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
1042 	state->rc_srq_free_list.dl_head = NULL;
1043 	state->rc_srq_free_list.dl_cnt = 0;
1044 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
1045 
1046 	return (DDI_SUCCESS);
1047 }
1048 
1049 /*
1050  * Free the statically allocated Rx buffer list for SRQ.
1051  */
1052 void
1053 ibd_rc_fini_srq_list(ibd_state_t *state)
1054 {
1055 	ibd_rwqe_t *rwqe;
1056 	int i;
1057 	ibt_status_t ret;
1058 
1059 	ASSERT(state->id_running == 0);
1060 	ret = ibt_free_srq(state->rc_srq_hdl);
1061 	if (ret != IBT_SUCCESS) {
1062 		ibd_print_warn(state, "ibd_rc_fini_srq_list: "
1063 		    "ibt_free_srq fail, ret=%d", ret);
1064 	}
1065 
1066 	mutex_enter(&state->rc_srq_rwqe_list.dl_mutex);
1067 	rwqe = state->rc_srq_rwqes;
1068 	for (i = 0; i < state->rc_srq_size; i++, rwqe++) {
1069 		if (rwqe->rwqe_im_mblk != NULL) {
1070 			rwqe->w_freeing_wqe = B_TRUE;
1071 			freemsg(rwqe->rwqe_im_mblk);
1072 		}
1073 	}
1074 	mutex_exit(&state->rc_srq_rwqe_list.dl_mutex);
1075 
1076 	ibd_rc_free_srq_copybufs(state);
1077 }
1078 
/* Repost the rwqes on state->rc_srq_free_list */
1080 int
1081 ibd_rc_repost_srq_free_list(ibd_state_t *state)
1082 {
1083 	ibd_rwqe_t *rwqe;
1084 	ibd_wqe_t *list;
1085 	uint_t len;
1086 
1087 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
1088 	if (state->rc_srq_free_list.dl_head != NULL) {
1089 		/* repost them */
1090 		len = state->rc_mtu + IPOIB_GRH_SIZE;
1091 		list = state->rc_srq_free_list.dl_head;
1092 		state->rc_srq_free_list.dl_head = NULL;
1093 		state->rc_srq_free_list.dl_cnt = 0;
1094 		mutex_exit(&state->rc_srq_free_list.dl_mutex);
1095 		while (list != NULL) {
1096 			rwqe = WQE_TO_RWQE(list);
1097 			if ((rwqe->rwqe_im_mblk == NULL) &&
1098 			    ((rwqe->rwqe_im_mblk = desballoc(
1099 			    rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
1100 			    &rwqe->w_freemsg_cb)) == NULL)) {
1101 				DPRINT(40, "ibd_rc_repost_srq_free_list: "
1102 				    "failed in desballoc()");
1103 				do {
1104 					ibd_rc_srq_free_rwqe(state, rwqe);
1105 					list = list->w_next;
1106 					rwqe = WQE_TO_RWQE(list);
1107 				} while (list != NULL);
1108 				return (DDI_FAILURE);
1109 			}
1110 			if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1111 				ibd_rc_srq_free_rwqe(state, rwqe);
1112 			}
1113 			list = list->w_next;
1114 		}
1115 		return (DDI_SUCCESS);
1116 	}
1117 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
1118 	return (DDI_SUCCESS);
1119 }
1120 
1121 /*
1122  * Free an allocated recv wqe.
1123  */
1124 static void
1125 ibd_rc_srq_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
1126 {
1127 	/*
1128 	 * desballoc() failed (no memory) or the posting of rwqe failed.
1129 	 *
1130 	 * This rwqe is placed on a free list so that it
1131 	 * can be reinstated in future.
1132 	 *
	 * They are reinstated later by ibd_rc_repost_srq_free_list().
1135 	 */
1136 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
1137 	state->rc_srq_free_list.dl_cnt++;
1138 	rwqe->rwqe_next = state->rc_srq_free_list.dl_head;
1139 	state->rc_srq_free_list.dl_head = RWQE_TO_WQE(rwqe);
1140 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
1141 }
1142 
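/*
 * STREAMS free callback for an SRQ receive buffer.  When the upper layer
 * releases the loaned mblk, rebuild it with desballoc() and repost the
 * rwqe to the SRQ; if the driver is stopping or the repost fails, place
 * the rwqe on the SRQ free list instead.
 */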
1143 static void
1144 ibd_rc_srq_freemsg_cb(char *arg)
1145 {
1146 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
1147 	ibd_state_t *state = rwqe->w_state;
1148 
1149 	ASSERT(state->rc_enable_srq);
1150 
1151 	/*
1152 	 * If the driver is stopped, just free the rwqe.
1153 	 */
1154 	if (atomic_add_32_nv(&state->id_running, 0) == 0) {
1155 		if (!rwqe->w_freeing_wqe) {
1156 			atomic_dec_32(
1157 			    &state->rc_srq_rwqe_list.dl_bufs_outstanding);
1158 			DPRINT(6, "ibd_rc_srq_freemsg_cb: wqe being freed");
1159 			rwqe->rwqe_im_mblk = NULL;
1160 			ibd_rc_srq_free_rwqe(state, rwqe);
1161 		}
1162 		return;
1163 	}
1164 
1165 	atomic_dec_32(&state->rc_srq_rwqe_list.dl_bufs_outstanding);
1166 
1167 	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
1168 	ASSERT(!rwqe->w_freeing_wqe);
1169 
1170 	/*
1171 	 * Upper layer has released held mblk, so we have
1172 	 * no more use for keeping the old pointer in
1173 	 * our rwqe.
1174 	 */
1175 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
1176 	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
1177 	if (rwqe->rwqe_im_mblk == NULL) {
1178 		DPRINT(40, "ibd_rc_srq_freemsg_cb: desballoc failed");
1179 		ibd_rc_srq_free_rwqe(state, rwqe);
1180 		return;
1181 	}
1182 
1183 	if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1184 		ibd_print_warn(state, "ibd_rc_srq_freemsg_cb: ibd_rc_post_srq"
1185 		    " failed");
1186 		ibd_rc_srq_free_rwqe(state, rwqe);
1187 		return;
1188 	}
1189 }
1190 
1191 /*
1192  * Post a rwqe to the hardware and add it to the Rx list.
1193  */
1194 static int
1195 ibd_rc_post_srq(ibd_state_t *state, ibd_rwqe_t *rwqe)
1196 {
1197 	/*
1198 	 * Here we should add dl_cnt before post recv, because
1199 	 * we would have to make sure dl_cnt is updated before
1200 	 * the corresponding ibd_rc_process_rx() is called.
1201 	 */
1202 	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
1203 	atomic_add_32(&state->rc_srq_rwqe_list.dl_cnt, 1);
1204 	if (ibt_post_srq(state->rc_srq_hdl, &rwqe->w_rwr, 1, NULL) !=
1205 	    IBT_SUCCESS) {
1206 		atomic_dec_32(&state->rc_srq_rwqe_list.dl_cnt);
1207 		DPRINT(40, "ibd_rc_post_srq : ibt_post_srq() failed");
1208 		return (DDI_FAILURE);
1209 	}
1210 
1211 	return (DDI_SUCCESS);
1212 }
1213 
1214 /*
1215  * Post a rwqe to the hardware and add it to the Rx list.
1216  */
1217 static int
1218 ibd_rc_post_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
1219 {
1220 	/*
1221 	 * Here we should add dl_cnt before post recv, because we would
1222 	 * have to make sure dl_cnt has already updated before
1223 	 * corresponding ibd_rc_process_rx() is called.
1224 	 */
1225 	atomic_add_32(&chan->rx_wqe_list.dl_cnt, 1);
1226 	if (ibt_post_recv(chan->chan_hdl, &rwqe->w_rwr, 1, NULL) !=
1227 	    IBT_SUCCESS) {
1228 		atomic_dec_32(&chan->rx_wqe_list.dl_cnt);
1229 		DPRINT(40, "ibd_rc_post_rwqe : failed in ibt_post_recv()");
1230 		return (DDI_FAILURE);
1231 	}
1232 	return (DDI_SUCCESS);
1233 }
1234 
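/*
 * Per-channel (non-SRQ) counterpart of ibd_rc_alloc_srq_copybufs():
 * allocate one contiguous Rx buffer area and the rwqe array for this
 * channel, and register the buffer area as a single memory region.
 */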
1235 static int
1236 ibd_rc_alloc_rx_copybufs(ibd_rc_chan_t *chan)
1237 {
1238 	ibd_state_t *state = chan->state;
1239 	ibt_mr_attr_t mem_attr;
1240 	uint_t rc_rx_bufs_sz;
1241 
1242 	/*
1243 	 * Allocate one big chunk for all regular rx copy bufs
1244 	 */
1245 	rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * chan->rcq_size;
1246 
1247 	chan->rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);
1248 
1249 	chan->rx_rwqes = kmem_zalloc(chan->rcq_size *
1250 	    sizeof (ibd_rwqe_t), KM_SLEEP);
1251 
1252 	/*
1253 	 * Do one memory registration on the entire rxbuf area
1254 	 */
1255 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->rx_bufs;
1256 	mem_attr.mr_len = rc_rx_bufs_sz;
1257 	mem_attr.mr_as = NULL;
1258 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
1259 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1260 	    &chan->rx_mr_hdl, &chan->rx_mr_desc) != IBT_SUCCESS) {
1261 		DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr failed");
1262 		kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
1263 		kmem_free(chan->rx_bufs, rc_rx_bufs_sz);
1264 		chan->rx_bufs = NULL;
1265 		chan->rx_rwqes = NULL;
1266 		return (DDI_FAILURE);
1267 	}
1268 
1269 	return (DDI_SUCCESS);
1270 }
1271 
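/*
 * Deregister the channel's rxbuf memory region and free the rwqe array
 * and the rxbuf area allocated by ibd_rc_alloc_rx_copybufs().
 */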
1272 static void
1273 ibd_rc_free_rx_copybufs(ibd_rc_chan_t *chan)
1274 {
1275 	ibd_state_t *state = chan->state;
1276 	uint_t rc_rx_buf_sz;
1277 
1278 	ASSERT(!state->rc_enable_srq);
1279 	ASSERT(chan->rx_rwqes != NULL);
1280 	ASSERT(chan->rx_bufs != NULL);
1281 
1282 	/*
1283 	 * Don't change the value of state->rc_mtu at the period from call
1284 	 * ibd_rc_alloc_rx_copybufs() to call ibd_rc_free_rx_copybufs().
1285 	 */
1286 	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;
1287 
1288 	/*
1289 	 * Unregister rxbuf mr
1290 	 */
1291 	if (ibt_deregister_mr(state->id_hca_hdl,
1292 	    chan->rx_mr_hdl) != IBT_SUCCESS) {
1293 		DPRINT(40, "ibd_rc_free_rx_copybufs: ibt_deregister_mr failed");
1294 	}
1295 	chan->rx_mr_hdl = NULL;
1296 
1297 	/*
1298 	 * Free rxbuf memory
1299 	 */
1300 	kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
1301 	chan->rx_rwqes = NULL;
1302 
1303 	kmem_free(chan->rx_bufs, chan->rcq_size * rc_rx_buf_sz);
1304 	chan->rx_bufs = NULL;
1305 }
1306 
1307 /*
1308  * Post a certain number of receive buffers and WRs on a RC channel.
1309  */
1310 static int
1311 ibd_rc_init_rxlist(ibd_rc_chan_t *chan)
1312 {
1313 	ibd_state_t *state = chan->state;
1314 	ibd_rwqe_t *rwqe;
1315 	ibt_lkey_t lkey;
1316 	int i;
1317 	uint_t len;
1318 	uint8_t *bufaddr;
1319 
1320 	ASSERT(!state->rc_enable_srq);
1321 	if (ibd_rc_alloc_rx_copybufs(chan) != DDI_SUCCESS)
1322 		return (DDI_FAILURE);
1323 
1324 	/*
1325 	 * Allocate and setup the rwqe list
1326 	 */
1327 	lkey = chan->rx_mr_desc.md_lkey;
1328 	rwqe = chan->rx_rwqes;
1329 	bufaddr = chan->rx_bufs;
1330 	len = state->rc_mtu + IPOIB_GRH_SIZE;
1331 	for (i = 0; i < chan->rcq_size; i++, rwqe++, bufaddr += len) {
1332 		rwqe->w_state = state;
1333 		rwqe->w_chan = chan;
1334 		rwqe->w_freeing_wqe = B_FALSE;
1335 		rwqe->w_freemsg_cb.free_func = ibd_rc_freemsg_cb;
1336 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
1337 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
1338 
1339 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
1340 		    &rwqe->w_freemsg_cb)) == NULL) {
1341 			DPRINT(40, "ibd_rc_init_srq_list: desballoc() failed");
1342 			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
1343 			ibd_rc_fini_rxlist(chan);
1344 			return (DDI_FAILURE);
1345 		}
1346 
1347 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
1348 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
1349 		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
1350 		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
1351 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
1352 		rwqe->w_rwr.wr_nds = 1;
1353 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
1354 		(void) ibd_rc_post_rwqe(chan, rwqe);
1355 	}
1356 
1357 	return (DDI_SUCCESS);
1358 }
1359 
1360 /*
1361  * Free the statically allocated Rx buffer list for SRQ.
1362  */
1363 static void
1364 ibd_rc_fini_rxlist(ibd_rc_chan_t *chan)
1365 {
1366 	ibd_rwqe_t *rwqe;
1367 	int i;
1368 
1369 	if (chan->rx_bufs == NULL) {
1370 		DPRINT(40, "ibd_rc_fini_rxlist: empty chan->rx_bufs, quit");
1371 		return;
1372 	}
1373 
1374 	/* bufs_outstanding must be 0 */
1375 	ASSERT((chan->rx_wqe_list.dl_head == NULL) ||
1376 	    (chan->rx_wqe_list.dl_bufs_outstanding == 0));
1377 
1378 	mutex_enter(&chan->rx_wqe_list.dl_mutex);
1379 	rwqe = chan->rx_rwqes;
1380 	for (i = 0; i < chan->rcq_size; i++, rwqe++) {
1381 		if (rwqe->rwqe_im_mblk != NULL) {
1382 			rwqe->w_freeing_wqe = B_TRUE;
1383 			freemsg(rwqe->rwqe_im_mblk);
1384 		}
1385 	}
1386 	mutex_exit(&chan->rx_wqe_list.dl_mutex);
1387 
1388 	ibd_rc_free_rx_copybufs(chan);
1389 }
1390 
1391 /*
1392  * Free an allocated recv wqe.
1393  */
1394 static void
1395 ibd_rc_free_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
1396 {
1397 	/*
1398 	 * desballoc() failed (no memory) or the posting of rwqe failed.
1399 	 *
1400 	 * This rwqe is placed on a free list so that it
1401 	 * can be reinstated in future.
1402 	 *
1403 	 * NOTE: no code currently exists to reinstate
1404 	 * these "lost" rwqes.
1405 	 */
1406 	mutex_enter(&chan->rx_free_list.dl_mutex);
1407 	chan->rx_free_list.dl_cnt++;
1408 	rwqe->rwqe_next = chan->rx_free_list.dl_head;
1409 	chan->rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
1410 	mutex_exit(&chan->rx_free_list.dl_mutex);
1411 }
1412 
1413 /*
1414  * Processing to be done after receipt of a packet; hand off to GLD
1415  * in the format expected by GLD.
1416  */
1417 static void
1418 ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
1419 {
1420 	ibd_state_t *state = chan->state;
1421 	ib_header_info_t *phdr;
1422 	ipoib_hdr_t *ipibp;
1423 	mblk_t *mp;
1424 	mblk_t *mpc;
1425 	int rxcnt;
1426 	ip6_t *ip6h;
1427 	int len;
1428 
1429 	/*
1430 	 * Track number handed to upper layer, and number still
1431 	 * available to receive packets.
1432 	 */
1433 	if (state->rc_enable_srq) {
1434 		rxcnt = atomic_dec_32_nv(&state->rc_srq_rwqe_list.dl_cnt);
1435 	} else {
1436 		rxcnt = atomic_dec_32_nv(&chan->rx_wqe_list.dl_cnt);
1437 	}
1438 
1439 	/*
1440 	 * It can not be a IBA multicast packet.
1441 	 */
1442 	ASSERT(!wc->wc_flags & IBT_WC_GRH_PRESENT);
1443 
1444 
1445 #ifdef DEBUG
1446 	if (rxcnt < ibd_rc_rx_rwqe_thresh) {
1447 		state->rc_rwqe_short++;
1448 	}
1449 #endif
1450 
1451 	/*
1452 	 * Possibly replenish the Rx pool if needed.
1453 	 */
1454 	if ((rxcnt >= ibd_rc_rx_rwqe_thresh) &&
1455 	    (wc->wc_bytes_xfer > ibd_rc_rx_copy_thresh)) {
1456 		atomic_add_64(&state->rc_rcv_trans_byte, wc->wc_bytes_xfer);
1457 		atomic_inc_64(&state->rc_rcv_trans_pkt);
1458 
1459 		/*
1460 		 * Record how many rwqe has been occupied by upper
1461 		 * network layer
1462 		 */
1463 		if (state->rc_enable_srq) {
1464 			atomic_add_32(&state->rc_srq_rwqe_list.
1465 			    dl_bufs_outstanding, 1);
1466 		} else {
1467 			atomic_add_32(&chan->rx_wqe_list.
1468 			    dl_bufs_outstanding, 1);
1469 		}
1470 		mp = rwqe->rwqe_im_mblk;
1471 	} else {
1472 		atomic_add_64(&state->rc_rcv_copy_byte, wc->wc_bytes_xfer);
1473 		atomic_inc_64(&state->rc_rcv_copy_pkt);
1474 
1475 		if ((mp = allocb(wc->wc_bytes_xfer + IPOIB_GRH_SIZE,
1476 		    BPRI_HI)) == NULL) {	/* no memory */
1477 			DPRINT(40, "ibd_rc_process_rx: allocb() failed");
1478 			state->rc_rcv_alloc_fail++;
1479 			if (state->rc_enable_srq) {
1480 				if (ibd_rc_post_srq(state, rwqe) ==
1481 				    DDI_FAILURE) {
1482 					ibd_rc_srq_free_rwqe(state, rwqe);
1483 				}
1484 			} else {
1485 				if (ibd_rc_post_rwqe(chan, rwqe) ==
1486 				    DDI_FAILURE) {
1487 					ibd_rc_free_rwqe(chan, rwqe);
1488 				}
1489 			}
1490 			return;
1491 		}
1492 
1493 		bcopy(rwqe->rwqe_im_mblk->b_rptr + IPOIB_GRH_SIZE,
1494 		    mp->b_wptr + IPOIB_GRH_SIZE, wc->wc_bytes_xfer);
1495 
1496 		if (state->rc_enable_srq) {
1497 			if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1498 				ibd_rc_srq_free_rwqe(state, rwqe);
1499 			}
1500 		} else {
1501 			if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1502 				ibd_rc_free_rwqe(chan, rwqe);
1503 			}
1504 		}
1505 	}
1506 
1507 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);
1508 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
1509 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
1510 		len = ntohs(ip6h->ip6_plen);
1511 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
1512 			/* LINTED: E_CONSTANT_CONDITION */
1513 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
1514 		}
1515 	}
1516 
1517 	phdr = (ib_header_info_t *)mp->b_rptr;
1518 	phdr->ib_grh.ipoib_vertcflow = 0;
1519 	ovbcopy(&state->id_macaddr, &phdr->ib_dst,
1520 	    sizeof (ipoib_mac_t));
	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer + IPOIB_GRH_SIZE;
1522 
1523 	/*
1524 	 * Can RC mode in IB guarantee its checksum correctness?
1525 	 *
1526 	 *	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
1527 	 *	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
1528 	 */
1529 
1530 	/*
1531 	 * Make sure this is NULL or we're in trouble.
1532 	 */
1533 	if (mp->b_next != NULL) {
1534 		ibd_print_warn(state,
1535 		    "ibd_rc_process_rx: got duplicate mp from rcq?");
1536 		mp->b_next = NULL;
1537 	}
1538 
1539 	/*
1540 	 * Add this mp to the list of processed mp's to send to
1541 	 * the nw layer
1542 	 */
1543 	if (state->rc_enable_srq) {
1544 		mutex_enter(&state->rc_rx_lock);
1545 		if (state->rc_rx_mp) {
1546 			ASSERT(state->rc_rx_mp_tail != NULL);
1547 			state->rc_rx_mp_tail->b_next = mp;
1548 		} else {
1549 			ASSERT(state->rc_rx_mp_tail == NULL);
1550 			state->rc_rx_mp = mp;
1551 		}
1552 
1553 		state->rc_rx_mp_tail = mp;
1554 		state->rc_rx_mp_len++;
1555 
1556 		if (state->rc_rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
1557 			mpc = state->rc_rx_mp;
1558 
1559 			state->rc_rx_mp = NULL;
1560 			state->rc_rx_mp_tail = NULL;
1561 			state->rc_rx_mp_len = 0;
1562 			mutex_exit(&state->rc_rx_lock);
1563 			mac_rx(state->id_mh, NULL, mpc);
1564 		} else {
1565 			mutex_exit(&state->rc_rx_lock);
1566 		}
1567 	} else {
1568 		mutex_enter(&chan->rx_lock);
1569 		if (chan->rx_mp) {
1570 			ASSERT(chan->rx_mp_tail != NULL);
1571 			chan->rx_mp_tail->b_next = mp;
1572 		} else {
1573 			ASSERT(chan->rx_mp_tail == NULL);
1574 			chan->rx_mp = mp;
1575 		}
1576 
1577 		chan->rx_mp_tail = mp;
1578 		chan->rx_mp_len++;
1579 
1580 		if (chan->rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
1581 			mpc = chan->rx_mp;
1582 
1583 			chan->rx_mp = NULL;
1584 			chan->rx_mp_tail = NULL;
1585 			chan->rx_mp_len = 0;
1586 			mutex_exit(&chan->rx_lock);
1587 			mac_rx(state->id_mh, NULL, mpc);
1588 		} else {
1589 			mutex_exit(&chan->rx_lock);
1590 		}
1591 	}
1592 }
1593 
1594 /*
 * Callback code invoked from STREAMS when the recv data buffer is free
1596  * for recycling.
1597  */
1598 static void
1599 ibd_rc_freemsg_cb(char *arg)
1600 {
1601 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
1602 	ibd_rc_chan_t *chan = rwqe->w_chan;
1603 	ibd_state_t *state = rwqe->w_state;
1604 
1605 	/*
1606 	 * If the wqe is being destructed, do not attempt recycling.
1607 	 */
1608 	if (rwqe->w_freeing_wqe == B_TRUE) {
1609 		return;
1610 	}
1611 
1612 	ASSERT(!state->rc_enable_srq);
1613 	ASSERT(chan->rx_wqe_list.dl_cnt < chan->rcq_size);
1614 
1615 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
1616 	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
1617 	if (rwqe->rwqe_im_mblk == NULL) {
1618 		DPRINT(40, "ibd_rc_freemsg_cb: desballoc() failed");
1619 		ibd_rc_free_rwqe(chan, rwqe);
1620 		return;
1621 	}
1622 
1623 	/*
1624 	 * Post back to h/w. We could actually have more than
1625 	 * id_num_rwqe WQEs on the list if there were multiple
1626 	 * ibd_freemsg_cb() calls outstanding (since the lock is
1627 	 * not held the entire time). This will start getting
1628 	 * corrected over subsequent ibd_freemsg_cb() calls.
1629 	 */
1630 	if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1631 		ibd_rc_free_rwqe(chan, rwqe);
1632 		return;
1633 	}
1634 	atomic_add_32(&chan->rx_wqe_list.dl_bufs_outstanding, -1);
1635 }
1636 
1637 /*
1638  * Common code for interrupt handling as well as for polling
1639  * for all completed wqe's while detaching.
1640  */
1641 static void
1642 ibd_rc_poll_rcq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
1643 {
1644 	ibd_wqe_t *wqe;
1645 	ibt_wc_t *wc, *wcs;
1646 	uint_t numwcs, real_numwcs;
1647 	int i;
1648 
1649 	wcs = chan->rx_wc;
1650 	numwcs = IBD_RC_MAX_CQ_WC;
1651 
1652 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
1653 		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
1654 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
1655 			if (wc->wc_status != IBT_WC_SUCCESS) {
1656 				chan->state->rc_rcq_err++;
1657 				/*
1658 				 * Channel being torn down.
1659 				 */
1660 				DPRINT(40, "ibd_rc_poll_rcq: wc_status(%d) != "
1661 				    "SUCC, chan=%p", wc->wc_status, chan);
1662 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
1663 					/*
1664 					 * Do not invoke Rx handler because
1665 					 * it might add buffers to the Rx pool
1666 					 * when we are trying to deinitialize.
1667 					 */
1668 					continue;
1669 				}
1670 			}
1671 			ibd_rc_process_rx(chan, WQE_TO_RWQE(wqe), wc);
1672 		}
1673 	}
1674 }
1675 
1676 /* Receive CQ handler */
1677 /* ARGSUSED */
1678 static void
1679 ibd_rc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1680 {
1681 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
1682 	ibd_state_t *state = chan->state;
1683 
1684 	ASSERT(chan->chan_state == IBD_RC_STATE_PAS_ESTAB);
1685 
1686 	/*
1687 	 * Poll for completed entries; the CQ will not interrupt any
1688 	 * more for incoming (or transmitted) packets.
1689 	 */
1690 	state->rc_rcq_invoke++;
1691 	ibd_rc_poll_rcq(chan, chan->rcq_hdl);
1692 
1693 	/*
1694 	 * Now enable CQ notifications; all packets that arrive now
1695 	 * (or complete transmission) will cause new interrupts.
1696 	 */
1697 	if (ibt_enable_cq_notify(chan->rcq_hdl, IBT_NEXT_COMPLETION) !=
1698 	    IBT_SUCCESS) {
1699 		/*
1700 		 * We do not expect a failure here.
1701 		 */
1702 		DPRINT(40, "ibd_rc_rcq_handler: ibt_enable_cq_notify() failed");
1703 	}
1704 
1705 	/*
1706 	 * Repoll to catch all packets that might have arrived after
1707 	 * we finished the first poll loop and before interrupts got
1708 	 * armed.
1709 	 */
1710 	ibd_rc_poll_rcq(chan, chan->rcq_hdl);
1711 
1712 	if (state->rc_enable_srq) {
1713 		mutex_enter(&state->rc_rx_lock);
1714 
1715 		if (state->rc_rx_mp != NULL) {
1716 			mblk_t *mpc;
1717 			mpc = state->rc_rx_mp;
1718 
1719 			state->rc_rx_mp = NULL;
1720 			state->rc_rx_mp_tail = NULL;
1721 			state->rc_rx_mp_len = 0;
1722 
1723 			mutex_exit(&state->rc_rx_lock);
1724 			mac_rx(state->id_mh, NULL, mpc);
1725 		} else {
1726 			mutex_exit(&state->rc_rx_lock);
1727 		}
1728 	} else {
1729 		mutex_enter(&chan->rx_lock);
1730 
1731 		if (chan->rx_mp != NULL) {
1732 			mblk_t *mpc;
1733 			mpc = chan->rx_mp;
1734 
1735 			chan->rx_mp = NULL;
1736 			chan->rx_mp_tail = NULL;
1737 			chan->rx_mp_len = 0;
1738 
1739 			mutex_exit(&chan->rx_lock);
1740 			mac_rx(state->id_mh, NULL, mpc);
1741 		} else {
1742 			mutex_exit(&chan->rx_lock);
1743 		}
1744 	}
1745 }
1746 
1747 /*
1748  * Allocate the statically allocated Tx buffer list.
1749  */
1750 int
1751 ibd_rc_init_tx_largebuf_list(ibd_state_t *state)
1752 {
1753 	ibd_rc_tx_largebuf_t *lbufp;
1754 	ibd_rc_tx_largebuf_t *tail;
1755 	uint8_t *memp;
1756 	ibt_mr_attr_t mem_attr;
1757 	uint32_t num_swqe;
1758 	size_t  mem_size;
1759 	int i;
1760 
1761 	num_swqe = ibd_rc_num_swqe - 1;
1762 
1763 	/*
1764 	 * Allocate one big chunk for all Tx large copy bufs
1765 	 */
1766 	/* Don't transfer IPOIB_GRH_SIZE bytes (40 bytes) */
1767 	mem_size = num_swqe * state->rc_mtu;
1768 	state->rc_tx_mr_bufs = kmem_zalloc(mem_size, KM_SLEEP);
1769 
1770 	mem_attr.mr_len = mem_size;
1771 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_tx_mr_bufs;
1772 	mem_attr.mr_as = NULL;
1773 	mem_attr.mr_flags = IBT_MR_SLEEP;
1774 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1775 	    &state->rc_tx_mr_hdl, &state->rc_tx_mr_desc) != IBT_SUCCESS) {
1776 		DPRINT(40, "ibd_rc_init_tx_largebuf_list: ibt_register_mr "
1777 		    "failed");
1778 		kmem_free(state->rc_tx_mr_bufs, mem_size);
1779 		state->rc_tx_mr_bufs = NULL;
1780 		return (DDI_FAILURE);
1781 	}
1782 
1783 	state->rc_tx_largebuf_desc_base = kmem_zalloc(num_swqe *
1784 	    sizeof (ibd_rc_tx_largebuf_t), KM_SLEEP);
1785 
1786 	/*
1787 	 * Set up the buf chain
1788 	 */
1789 	memp = state->rc_tx_mr_bufs;
1790 	mutex_enter(&state->rc_tx_large_bufs_lock);
1791 	lbufp = state->rc_tx_largebuf_desc_base;
1792 	for (i = 0; i < num_swqe; i++) {
1793 		lbufp->lb_buf = memp;
1794 		lbufp->lb_next = lbufp + 1;
1795 
1796 		tail = lbufp;
1797 
1798 		memp += state->rc_mtu;
1799 		lbufp++;
1800 	}
1801 	tail->lb_next = NULL;
1802 
1803 	/*
1804 	 * Set up the buffer information in ibd state
1805 	 */
1806 	state->rc_tx_largebuf_free_head = state->rc_tx_largebuf_desc_base;
1807 	state->rc_tx_largebuf_nfree = num_swqe;
1808 	mutex_exit(&state->rc_tx_large_bufs_lock);
1809 	return (DDI_SUCCESS);
1810 }
1811 
1812 void
1813 ibd_rc_fini_tx_largebuf_list(ibd_state_t *state)
1814 {
1815 	uint32_t num_swqe;
1816 
1817 	num_swqe = ibd_rc_num_swqe - 1;
1818 
1819 	if (ibt_deregister_mr(state->id_hca_hdl,
1820 	    state->rc_tx_mr_hdl) != IBT_SUCCESS) {
1821 		DPRINT(40, "ibd_rc_fini_tx_largebuf_list: ibt_deregister_mr() "
1822 		    "failed");
1823 	}
1824 	state->rc_tx_mr_hdl = NULL;
1825 
1826 	kmem_free(state->rc_tx_mr_bufs, num_swqe * state->rc_mtu);
1827 	state->rc_tx_mr_bufs = NULL;
1828 
1829 	kmem_free(state->rc_tx_largebuf_desc_base,
1830 	    num_swqe * sizeof (ibd_rc_tx_largebuf_t));
1831 	state->rc_tx_largebuf_desc_base = NULL;
1832 }
1833 
1834 static int
1835 ibd_rc_alloc_tx_copybufs(ibd_rc_chan_t *chan)
1836 {
1837 	ibt_mr_attr_t mem_attr;
1838 	ibd_state_t *state;
1839 
1840 	state = chan->state;
1841 	ASSERT(state != NULL);
1842 
1843 	/*
1844 	 * Allocate one big chunk for all regular tx copy bufs
1845 	 */
1846 	mem_attr.mr_len = chan->scq_size * ibd_rc_tx_copy_thresh;
1847 
1848 	chan->tx_mr_bufs = kmem_zalloc(mem_attr.mr_len, KM_SLEEP);
1849 
1850 	/*
1851 	 * Do one memory registration on the entire txbuf area
1852 	 */
1853 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->tx_mr_bufs;
1854 	mem_attr.mr_as = NULL;
1855 	mem_attr.mr_flags = IBT_MR_SLEEP;
1856 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1857 	    &chan->tx_mr_hdl, &chan->tx_mr_desc) != IBT_SUCCESS) {
1858 		DPRINT(40, "ibd_rc_alloc_tx_copybufs: ibt_register_mr failed");
1859 		ASSERT(mem_attr.mr_len ==
1860 		    chan->scq_size * ibd_rc_tx_copy_thresh);
1861 		kmem_free(chan->tx_mr_bufs, mem_attr.mr_len);
1862 		chan->tx_mr_bufs = NULL;
1863 		return (DDI_FAILURE);
1864 	}
1865 
1866 	return (DDI_SUCCESS);
1867 }
1868 
1869 /*
1870  * Allocate the statically allocated Tx buffer list.
1871  */
1872 static int
1873 ibd_rc_init_txlist(ibd_rc_chan_t *chan)
1874 {
1875 	ibd_swqe_t *swqe;
1876 	int i;
1877 	ibt_lkey_t lkey;
1878 
1879 	if (ibd_rc_alloc_tx_copybufs(chan) != DDI_SUCCESS)
1880 		return (DDI_FAILURE);
1881 
1882 	/*
1883 	 * Allocate and setup the swqe list
1884 	 */
1885 	lkey = chan->tx_mr_desc.md_lkey;
1886 	chan->tx_wqes = kmem_zalloc(chan->scq_size *
1887 	    sizeof (ibd_swqe_t), KM_SLEEP);
1888 	swqe = chan->tx_wqes;
1889 	for (i = 0; i < chan->scq_size; i++, swqe++) {
1890 		swqe->swqe_next = NULL;
1891 		swqe->swqe_im_mblk = NULL;
1892 
1893 		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
1894 		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
1895 
1896 		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
1897 		swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
1898 		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
1899 		    (chan->tx_mr_bufs + i * ibd_rc_tx_copy_thresh);
1900 		swqe->w_swr.wr_trans = IBT_RC_SRV;
1901 
1902 		/* Add to list */
1903 		mutex_enter(&chan->tx_wqe_list.dl_mutex);
1904 		chan->tx_wqe_list.dl_cnt++;
1905 		swqe->swqe_next = chan->tx_wqe_list.dl_head;
1906 		chan->tx_wqe_list.dl_head = SWQE_TO_WQE(swqe);
1907 		mutex_exit(&chan->tx_wqe_list.dl_mutex);
1908 	}
1909 
1910 	return (DDI_SUCCESS);
1911 }
1912 
1913 /*
1914  * Free the statically allocated Tx buffer list.
1915  */
1916 static void
1917 ibd_rc_fini_txlist(ibd_rc_chan_t *chan)
1918 {
1919 	if (chan->tx_mr_hdl != NULL) {
1920 		if (ibt_deregister_mr(chan->state->id_hca_hdl,
1921 		    chan->tx_mr_hdl) != IBT_SUCCESS) {
1922 			DPRINT(40, "ibd_rc_fini_txlist: ibt_deregister_mr "
1923 			    "failed");
1924 		}
1925 		chan->tx_mr_hdl = NULL;
1926 	}
1927 
1928 	if (chan->tx_mr_bufs != NULL) {
1929 		kmem_free(chan->tx_mr_bufs, chan->scq_size *
1930 		    ibd_rc_tx_copy_thresh);
1931 		chan->tx_mr_bufs = NULL;
1932 	}
1933 
1934 	if (chan->tx_wqes != NULL) {
1935 		kmem_free(chan->tx_wqes, chan->scq_size *
1936 		    sizeof (ibd_swqe_t));
1937 		chan->tx_wqes = NULL;
1938 	}
1939 }
1940 
1941 /*
1942  * Acquire send wqe from free list.
1943  * Returns error number and send wqe pointer.
1944  */
1945 ibd_swqe_t *
1946 ibd_rc_acquire_swqes(ibd_rc_chan_t *chan)
1947 {
1948 	ibd_swqe_t *wqe;
1949 
1950 	mutex_enter(&chan->tx_rel_list.dl_mutex);
1951 	if (chan->tx_rel_list.dl_head != NULL) {
		/* transfer tx_rel_list to tx_wqe_list */
1953 		chan->tx_wqe_list.dl_head =
1954 		    chan->tx_rel_list.dl_head;
1955 		chan->tx_wqe_list.dl_cnt =
1956 		    chan->tx_rel_list.dl_cnt;
1957 		chan->tx_wqe_list.dl_pending_sends = B_FALSE;
1958 
		/* clear tx_rel_list */
1960 		chan->tx_rel_list.dl_head = NULL;
1961 		chan->tx_rel_list.dl_cnt = 0;
1962 		mutex_exit(&chan->tx_rel_list.dl_mutex);
1963 
1964 		wqe = WQE_TO_SWQE(chan->tx_wqe_list.dl_head);
1965 		chan->tx_wqe_list.dl_cnt -= 1;
1966 		chan->tx_wqe_list.dl_head = wqe->swqe_next;
1967 	} else {	/* no free swqe */
1968 		mutex_exit(&chan->tx_rel_list.dl_mutex);
1969 		chan->tx_wqe_list.dl_pending_sends = B_TRUE;
1970 		wqe = NULL;
1971 	}
1972 	return (wqe);
1973 }
1974 
1975 /*
1976  * Release send wqe back into free list.
1977  */
1978 static void
1979 ibd_rc_release_swqe(ibd_rc_chan_t *chan, ibd_swqe_t *swqe)
1980 {
1981 	/*
1982 	 * Add back on Tx list for reuse.
1983 	 */
1984 	swqe->swqe_next = NULL;
1985 	mutex_enter(&chan->tx_rel_list.dl_mutex);
1986 	chan->tx_rel_list.dl_pending_sends = B_FALSE;
1987 	swqe->swqe_next = chan->tx_rel_list.dl_head;
1988 	chan->tx_rel_list.dl_head = SWQE_TO_WQE(swqe);
1989 	chan->tx_rel_list.dl_cnt++;
1990 	mutex_exit(&chan->tx_rel_list.dl_mutex);
1991 }
1992 
1993 void
1994 ibd_rc_post_send(ibd_rc_chan_t *chan, ibd_swqe_t *node)
1995 {
1996 	uint_t		i;
1997 	uint_t		num_posted;
1998 	uint_t		n_wrs;
1999 	ibt_status_t	ibt_status;
2000 	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
2001 	ibd_swqe_t	*tx_head, *elem;
2002 	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];
2003 
2004 	/* post the one request, then check for more */
2005 	ibt_status = ibt_post_send(chan->chan_hdl,
2006 	    &node->w_swr, 1, NULL);
2007 	if (ibt_status != IBT_SUCCESS) {
		ibd_print_warn(chan->state, "ibd_rc_post_send: "
		    "posting one wr failed: ret=%d", ibt_status);
2010 		ibd_rc_tx_cleanup(node);
2011 	}
2012 
2013 	tx_head = NULL;
2014 	for (;;) {
2015 		if (tx_head == NULL) {
2016 			mutex_enter(&chan->tx_post_lock);
2017 			tx_head = chan->tx_head;
2018 			if (tx_head == NULL) {
2019 				chan->tx_busy = 0;
2020 				mutex_exit(&chan->tx_post_lock);
2021 				return;
2022 			}
2023 			chan->tx_head = NULL;
2024 			mutex_exit(&chan->tx_post_lock);
2025 		}
2026 
2027 		/*
2028 		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
2029 		 * at a time if possible, and keep posting them.
2030 		 */
2031 		for (n_wrs = 0, elem = tx_head;
2032 		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
2033 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
2034 			nodes[n_wrs] = elem;
2035 			wrs[n_wrs] = elem->w_swr;
2036 		}
2037 		tx_head = elem;
2038 
2039 		ASSERT(n_wrs != 0);
2040 
2041 		/*
2042 		 * If posting fails for some reason, we'll never receive
2043 		 * completion intimation, so we'll need to cleanup. But
2044 		 * we need to make sure we don't clean up nodes whose
2045 		 * wrs have been successfully posted. We assume that the
2046 		 * hca driver returns on the first failure to post and
2047 		 * therefore the first 'num_posted' entries don't need
2048 		 * cleanup here.
2049 		 */
2050 		num_posted = 0;
2051 		ibt_status = ibt_post_send(chan->chan_hdl,
2052 		    wrs, n_wrs, &num_posted);
2053 		if (ibt_status != IBT_SUCCESS) {
			ibd_print_warn(chan->state, "ibd_rc_post_send: "
2055 			    "posting multiple wrs failed: "
2056 			    "requested=%d, done=%d, ret=%d",
2057 			    n_wrs, num_posted, ibt_status);
2058 
2059 			for (i = num_posted; i < n_wrs; i++)
2060 				ibd_rc_tx_cleanup(nodes[i]);
2061 		}
2062 	}
2063 }
2064 
2065 /*
2066  * Common code that deals with clean ups after a successful or
2067  * erroneous transmission attempt.
2068  */
2069 void
2070 ibd_rc_tx_cleanup(ibd_swqe_t *swqe)
2071 {
2072 	ibd_ace_t *ace = swqe->w_ahandle;
2073 	ibd_state_t *state;
2074 
2075 	ASSERT(ace != NULL);
2076 	ASSERT(ace->ac_chan != NULL);
2077 
2078 	state = ace->ac_chan->state;
2079 
2080 	/*
2081 	 * If this was a dynamic registration in ibd_send(),
2082 	 * deregister now.
2083 	 */
2084 	if (swqe->swqe_im_mblk != NULL) {
2085 		ASSERT(swqe->w_buftype == IBD_WQE_MAPPED);
2086 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
2087 			ibd_unmap_mem(state, swqe);
2088 		}
2089 		freemsg(swqe->swqe_im_mblk);
2090 		swqe->swqe_im_mblk = NULL;
2091 	} else {
2092 		ASSERT(swqe->w_buftype != IBD_WQE_MAPPED);
2093 	}
2094 
2095 	if (swqe->w_buftype == IBD_WQE_RC_COPYBUF) {
2096 		ibd_rc_tx_largebuf_t *lbufp;
2097 
2098 		lbufp = swqe->w_rc_tx_largebuf;
2099 		ASSERT(lbufp != NULL);
2100 
2101 		mutex_enter(&state->rc_tx_large_bufs_lock);
2102 		lbufp->lb_next = state->rc_tx_largebuf_free_head;
2103 		state->rc_tx_largebuf_free_head = lbufp;
2104 		state->rc_tx_largebuf_nfree ++;
2105 		mutex_exit(&state->rc_tx_large_bufs_lock);
2106 		swqe->w_rc_tx_largebuf = NULL;
2107 	}
2108 
2109 
2110 	/*
2111 	 * Release the send wqe for reuse.
2112 	 */
2113 	ibd_rc_release_swqe(ace->ac_chan, swqe);
2114 
2115 	/*
2116 	 * Drop the reference count on the AH; it can be reused
2117 	 * now for a different destination if there are no more
2118 	 * posted sends that will use it. This can be eliminated
2119 	 * if we can always associate each Tx buffer with an AH.
2120 	 * The ace can be null if we are cleaning up from the
2121 	 * ibd_send() error path.
2122 	 */
2123 	ibd_dec_ref_ace(state, ace);
2124 }
2125 
2126 void
2127 ibd_rc_drain_scq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
2128 {
2129 	ibd_state_t *state = chan->state;
2130 	ibd_wqe_t *wqe;
2131 	ibt_wc_t *wc, *wcs;
2132 	uint_t numwcs, real_numwcs;
2133 	int i;
2134 
2135 	wcs = chan->tx_wc;
2136 	numwcs = IBD_RC_MAX_CQ_WC;
2137 
2138 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
2139 		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
2140 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
2141 			if (wc->wc_status != IBT_WC_SUCCESS) {
2142 				chan->tx_trans_error_cnt ++;
2143 				DPRINT(30, "ibd_rc_drain_scq: "
2144 				    "wc_status(%d) != SUCC, "
2145 				    "chan=%p, ace=%p, link_state=%d",
2146 				    wc->wc_status, chan, chan->ace,
2147 				    chan->state->id_link_state);
2148 			} else {
2149 				chan->tx_trans_error_cnt = 0;
2150 			}
2151 			ibd_rc_tx_cleanup(WQE_TO_SWQE(wqe));
2152 		}
2153 
2154 		mutex_enter(&state->id_sched_lock);
2155 		if (state->id_sched_needed == 0) {
2156 			mutex_exit(&state->id_sched_lock);
2157 		} else if (state->id_sched_needed & IBD_RSRC_RC_SWQE) {
2158 			mutex_enter(&chan->tx_wqe_list.dl_mutex);
2159 			mutex_enter(&chan->tx_rel_list.dl_mutex);
2160 			if ((chan->tx_rel_list.dl_cnt +
2161 			    chan->tx_wqe_list.dl_cnt) > IBD_RC_TX_FREE_THRESH) {
2162 				state->id_sched_needed &= ~IBD_RSRC_RC_SWQE;
2163 				mutex_exit(&chan->tx_rel_list.dl_mutex);
2164 				mutex_exit(&chan->tx_wqe_list.dl_mutex);
2165 				mutex_exit(&state->id_sched_lock);
2166 				state->rc_swqe_mac_update++;
2167 				mac_tx_update(state->id_mh);
2168 			} else {
2169 				state->rc_scq_no_swqe++;
2170 				mutex_exit(&chan->tx_rel_list.dl_mutex);
2171 				mutex_exit(&chan->tx_wqe_list.dl_mutex);
2172 				mutex_exit(&state->id_sched_lock);
2173 			}
2174 		} else if (state->id_sched_needed & IBD_RSRC_RC_TX_LARGEBUF) {
2175 			mutex_enter(&state->rc_tx_large_bufs_lock);
2176 			if (state->rc_tx_largebuf_nfree >
2177 			    IBD_RC_TX_FREE_THRESH) {
2178 				ASSERT(state->rc_tx_largebuf_free_head != NULL);
2179 				state->id_sched_needed &=
2180 				    ~IBD_RSRC_RC_TX_LARGEBUF;
2181 				mutex_exit(&state->rc_tx_large_bufs_lock);
2182 				mutex_exit(&state->id_sched_lock);
2183 				state->rc_xmt_buf_mac_update++;
2184 				mac_tx_update(state->id_mh);
2185 			} else {
2186 				state->rc_scq_no_largebuf++;
2187 				mutex_exit(&state->rc_tx_large_bufs_lock);
2188 				mutex_exit(&state->id_sched_lock);
2189 			}
2190 		} else if (state->id_sched_needed & IBD_RSRC_SWQE) {
2191 			mutex_enter(&state->id_tx_list.dl_mutex);
2192 			mutex_enter(&state->id_tx_rel_list.dl_mutex);
2193 			if ((state->id_tx_list.dl_cnt +
2194 			    state->id_tx_rel_list.dl_cnt)
2195 			    > IBD_FREE_SWQES_THRESH) {
2196 				state->id_sched_needed &= ~IBD_RSRC_SWQE;
2197 				state->id_sched_cnt++;
2198 				mutex_exit(&state->id_tx_rel_list.dl_mutex);
2199 				mutex_exit(&state->id_tx_list.dl_mutex);
2200 				mutex_exit(&state->id_sched_lock);
2201 				mac_tx_update(state->id_mh);
2202 			} else {
2203 				mutex_exit(&state->id_tx_rel_list.dl_mutex);
2204 				mutex_exit(&state->id_tx_list.dl_mutex);
2205 				mutex_exit(&state->id_sched_lock);
2206 			}
2207 		} else {
2208 			mutex_exit(&state->id_sched_lock);
2209 		}
2210 	}
2211 }
2212 
/* Send CQ handler; calls ibd_rc_tx_cleanup() to recycle Tx buffers */
2214 /* ARGSUSED */
2215 static void
2216 ibd_rc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
2217 {
2218 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2219 
2220 	chan->state->rc_scq_invoke++;
2221 
2222 	if (ibd_rc_tx_softintr == 1) {
2223 		mutex_enter(&chan->tx_poll_lock);
2224 		if (chan->tx_poll_busy & IBD_CQ_POLLING) {
2225 			chan->tx_poll_busy |= IBD_REDO_CQ_POLLING;
2226 			mutex_exit(&chan->tx_poll_lock);
2227 			return;
2228 		} else {
2229 			mutex_exit(&chan->tx_poll_lock);
2230 			ddi_trigger_softintr(chan->scq_softintr);
2231 		}
2232 	} else
2233 		(void) ibd_rc_tx_recycle(arg);
2234 }
2235 
2236 static uint_t
2237 ibd_rc_tx_recycle(caddr_t arg)
2238 {
2239 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2240 	ibd_ace_t *ace;
2241 	ibd_state_t *state = chan->state;
2242 	int flag, redo_flag;
2243 	int redo = 1;
2244 
2245 	flag = IBD_CQ_POLLING;
2246 	redo_flag = IBD_REDO_CQ_POLLING;
2247 
2248 	mutex_enter(&chan->tx_poll_lock);
2249 	if (chan->tx_poll_busy & flag) {
2250 		ibd_print_warn(state, "ibd_rc_tx_recycle: multiple polling "
2251 		    "threads");
2252 		chan->tx_poll_busy |= redo_flag;
2253 		mutex_exit(&chan->tx_poll_lock);
2254 		return (DDI_INTR_CLAIMED);
2255 	}
2256 	chan->tx_poll_busy |= flag;
2257 	mutex_exit(&chan->tx_poll_lock);
2258 
2259 	/*
2260 	 * Poll for completed entries; the CQ will not interrupt any
2261 	 * more for completed packets.
2262 	 */
2263 	ibd_rc_drain_scq(chan, chan->scq_hdl);
2264 
2265 	/*
2266 	 * Now enable CQ notifications; all completions originating now
2267 	 * will cause new interrupts.
2268 	 */
2269 	do {
2270 		if (ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION) !=
2271 		    IBT_SUCCESS) {
2272 			/*
2273 			 * We do not expect a failure here.
2274 			 */
			DPRINT(40, "ibd_rc_tx_recycle: ibt_enable_cq_notify()"
			    " failed");
2277 		}
2278 
2279 		ibd_rc_drain_scq(chan, chan->scq_hdl);
2280 
2281 		if (chan->tx_trans_error_cnt > 3) {
2282 			mutex_enter(&chan->tx_poll_lock);
2283 			chan->tx_poll_busy = 0;
2284 			mutex_exit(&chan->tx_poll_lock);
2285 			goto error_reset_chan;
2286 		}
2287 		mutex_enter(&chan->tx_poll_lock);
2288 		if (chan->tx_poll_busy & redo_flag)
2289 			chan->tx_poll_busy &= ~redo_flag;
2290 		else {
2291 			chan->tx_poll_busy &= ~flag;
2292 			redo = 0;
2293 		}
2294 		mutex_exit(&chan->tx_poll_lock);
2295 
2296 	} while (redo);
2297 
2298 	return (DDI_INTR_CLAIMED);
2299 
2300 error_reset_chan:
2301 	/*
2302 	 * Channel being torn down.
2303 	 */
2304 	mutex_enter(&state->id_ac_mutex);
2305 	if ((chan->chan_state == IBD_RC_STATE_ACT_ESTAB) &&
2306 	    (chan->state->id_link_state == LINK_STATE_UP) &&
2307 	    ((ace = ibd_acache_find(state, &chan->ace->ac_mac, B_FALSE, 0))
2308 	    != NULL) && (ace == chan->ace)) {
2309 		ASSERT(ace->ac_mce == NULL);
2310 		INC_REF(ace, 1);
2311 		IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
2312 		chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
2313 		mutex_exit(&state->id_ac_mutex);
2314 		state->rc_reset_cnt++;
2315 		DPRINT(30, "ibd_rc_tx_recycle(chan=%p, ace=%p): "
2316 		    " reset RC channel", chan, chan->ace);
2317 		ibd_rc_signal_act_close(state, ace);
2318 	} else {
2319 		mutex_exit(&state->id_ac_mutex);
2320 		state->rc_act_close_simultaneous++;
2321 		DPRINT(40, "ibd_rc_tx_recycle: other thread is closing"
2322 		    " it. chan=%p, act_state=%d, link_state=%d, ace=%p",
2323 		    chan, chan->chan_state, state->id_link_state, ace);
2324 	}
2325 	return (DDI_INTR_CLAIMED);
2326 }
2327 
2328 static ibt_status_t
2329 ibd_register_service(ibt_srv_desc_t *srv, ib_svc_id_t sid,
2330     int num_sids, ibt_srv_hdl_t *srv_hdl, ib_svc_id_t *ret_sid)
2331 {
2332 	ibd_service_t *p;
2333 	ibt_status_t status;
2334 
2335 	mutex_enter(&ibd_gstate.ig_mutex);
2336 	for (p = ibd_gstate.ig_service_list; p != NULL; p = p->is_link) {
2337 		if (p->is_sid == sid) {
2338 			p->is_ref_cnt++;
2339 			*srv_hdl = p->is_srv_hdl;
2340 			*ret_sid = sid;
2341 			mutex_exit(&ibd_gstate.ig_mutex);
2342 			return (IBT_SUCCESS);
2343 		}
2344 	}
2345 	status = ibt_register_service(ibd_gstate.ig_ibt_hdl, srv, sid,
2346 	    num_sids, srv_hdl, ret_sid);
2347 	if (status == IBT_SUCCESS) {
2348 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
2349 		p->is_srv_hdl = *srv_hdl;
2350 		p->is_sid = sid;
2351 		p->is_ref_cnt = 1;
2352 		p->is_link = ibd_gstate.ig_service_list;
2353 		ibd_gstate.ig_service_list = p;
2354 	}
2355 	mutex_exit(&ibd_gstate.ig_mutex);
2356 	return (status);
2357 }
2358 
2359 static ibt_status_t
2360 ibd_deregister_service(ibt_srv_hdl_t srv_hdl)
2361 {
2362 	ibd_service_t *p, **pp;
2363 	ibt_status_t status;
2364 
2365 	mutex_enter(&ibd_gstate.ig_mutex);
2366 	for (pp = &ibd_gstate.ig_service_list; *pp != NULL;
2367 	    pp = &((*pp)->is_link)) {
2368 		p = *pp;
2369 		if (p->is_srv_hdl == srv_hdl) {	/* Found it */
2370 			if (--p->is_ref_cnt == 0) {
2371 				status = ibt_deregister_service(
2372 				    ibd_gstate.ig_ibt_hdl, srv_hdl);
2373 				*pp = p->is_link; /* link prev to next */
2374 				kmem_free(p, sizeof (*p));
2375 			} else {
2376 				status = IBT_SUCCESS;
2377 			}
2378 			mutex_exit(&ibd_gstate.ig_mutex);
2379 			return (status);
2380 		}
2381 	}
2382 	/* Should not ever get here */
2383 	mutex_exit(&ibd_gstate.ig_mutex);
2384 	return (IBT_FAILURE);
2385 }
2386 
2387 /* Listen with corresponding service ID */
2388 ibt_status_t
2389 ibd_rc_listen(ibd_state_t *state)
2390 {
2391 	ibt_srv_desc_t srvdesc;
2392 	ib_svc_id_t ret_sid;
2393 	ibt_status_t status;
2394 	ib_gid_t gid;
2395 
2396 	if (state->rc_listen_hdl != NULL) {
2397 		DPRINT(40, "ibd_rc_listen: rc_listen_hdl should be NULL");
2398 		return (IBT_FAILURE);
2399 	}
2400 
2401 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
2402 	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
2403 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
2404 
2405 	/*
2406 	 * Register the service with service id
2407 	 * Incoming connection requests should arrive on this service id.
2408 	 */
2409 	status = ibd_register_service(&srvdesc,
2410 	    IBD_RC_QPN_TO_SID(state->id_qpnum),
2411 	    1, &state->rc_listen_hdl, &ret_sid);
2412 	if (status != IBT_SUCCESS) {
2413 		DPRINT(40, "ibd_rc_listen: Service Registration Failed, "
2414 		    "ret=%d", status);
2415 		return (status);
2416 	}
2417 
2418 	gid = state->id_sgid;
2419 
2420 	/* pass state as cm_private */
2421 	status = ibt_bind_service(state->rc_listen_hdl,
2422 	    gid, NULL, state, &state->rc_listen_bind);
2423 	if (status != IBT_SUCCESS) {
2424 		DPRINT(40, "ibd_rc_listen:"
2425 		    " fail to bind port: <%d>", status);
2426 		(void) ibd_deregister_service(state->rc_listen_hdl);
2427 		return (status);
2428 	}
2429 
2430 	/*
2431 	 * Legacy OFED had used a wrong service ID (one additional zero digit)
2432 	 * for many years. To interop with legacy OFED, we support this wrong
2433 	 * service ID here.
2434 	 */
2435 	ASSERT(state->rc_listen_hdl_OFED_interop == NULL);
2436 
2437 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
2438 	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
2439 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
2440 
2441 	/*
2442 	 * Register the service with service id
2443 	 * Incoming connection requests should arrive on this service id.
2444 	 */
2445 	status = ibd_register_service(&srvdesc,
2446 	    IBD_RC_QPN_TO_SID_OFED_INTEROP(state->id_qpnum),
2447 	    1, &state->rc_listen_hdl_OFED_interop, &ret_sid);
2448 	if (status != IBT_SUCCESS) {
2449 		DPRINT(40,
2450 		    "ibd_rc_listen: Service Registration for Legacy OFED "
2451 		    "Failed %d", status);
2452 		(void) ibt_unbind_service(state->rc_listen_hdl,
2453 		    state->rc_listen_bind);
2454 		(void) ibd_deregister_service(state->rc_listen_hdl);
2455 		return (status);
2456 	}
2457 
2458 	gid = state->id_sgid;
2459 
2460 	/* pass state as cm_private */
2461 	status = ibt_bind_service(state->rc_listen_hdl_OFED_interop,
2462 	    gid, NULL, state, &state->rc_listen_bind_OFED_interop);
2463 	if (status != IBT_SUCCESS) {
2464 		DPRINT(40, "ibd_rc_listen: fail to bind port: <%d> for "
2465 		    "Legacy OFED listener", status);
2466 		(void) ibd_deregister_service(
2467 		    state->rc_listen_hdl_OFED_interop);
2468 		(void) ibt_unbind_service(state->rc_listen_hdl,
2469 		    state->rc_listen_bind);
2470 		(void) ibd_deregister_service(state->rc_listen_hdl);
2471 		return (status);
2472 	}
2473 
2474 	return (IBT_SUCCESS);
2475 }
2476 
2477 void
2478 ibd_rc_stop_listen(ibd_state_t *state)
2479 {
2480 	int ret;
2481 
2482 	/* Disable incoming connection requests */
2483 	if (state->rc_listen_hdl != NULL) {
2484 		ret = ibt_unbind_all_services(state->rc_listen_hdl);
2485 		if (ret != 0) {
2486 			DPRINT(40, "ibd_rc_stop_listen:"
2487 			    "ibt_unbind_all_services() failed, ret=%d", ret);
2488 		}
2489 		ret = ibd_deregister_service(state->rc_listen_hdl);
2490 		if (ret != 0) {
2491 			DPRINT(40, "ibd_rc_stop_listen:"
2492 			    "ibd_deregister_service() failed, ret=%d", ret);
2493 		} else {
2494 			state->rc_listen_hdl = NULL;
2495 		}
2496 	}
2497 
2498 	/* Disable incoming connection requests */
2499 	if (state->rc_listen_hdl_OFED_interop != NULL) {
2500 		ret = ibt_unbind_all_services(
2501 		    state->rc_listen_hdl_OFED_interop);
2502 		if (ret != 0) {
2503 			DPRINT(40, "ibd_rc_stop_listen:"
2504 			    "ibt_unbind_all_services() failed: %d", ret);
2505 		}
2506 		ret = ibd_deregister_service(state->rc_listen_hdl_OFED_interop);
2507 		if (ret != 0) {
2508 			DPRINT(40, "ibd_rc_stop_listen:"
2509 			    "ibd_deregister_service() failed: %d", ret);
2510 		} else {
2511 			state->rc_listen_hdl_OFED_interop = NULL;
2512 		}
2513 	}
2514 }
2515 
2516 void
2517 ibd_rc_close_all_chan(ibd_state_t *state)
2518 {
2519 	ibd_rc_chan_t *rc_chan;
2520 	ibd_ace_t *ace;
2521 	uint_t attempts;
2522 
2523 	/* Disable all Rx routines */
2524 	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
2525 	rc_chan = state->rc_pass_chan_list.chan_list;
2526 	while (rc_chan != NULL) {
2527 		ibt_set_cq_handler(rc_chan->rcq_hdl, 0, 0);
2528 		rc_chan = rc_chan->next;
2529 	}
2530 	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
2531 
2532 	if (state->rc_enable_srq) {
2533 		attempts = 10;
2534 		while (state->rc_srq_rwqe_list.dl_bufs_outstanding > 0) {
2535 			DPRINT(30, "ibd_rc_close_all_chan: outstanding > 0");
2536 			delay(drv_usectohz(100000));
2537 			if (--attempts == 0) {
2538 				/*
2539 				 * There are pending bufs with the network
2540 				 * layer and we have no choice but to wait
2541 				 * for them to be done with. Reap all the
2542 				 * Tx/Rx completions that were posted since
2543 				 * we turned off the notification and
2544 				 * return failure.
2545 				 */
2546 				break;
2547 			}
2548 		}
2549 	}
2550 
2551 	/* Close all passive RC channels */
2552 	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
2553 	while (rc_chan != NULL) {
2554 		(void) ibd_rc_pas_close(rc_chan);
2555 		rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
2556 	}
2557 
2558 	/* Close all active RC channels */
2559 	mutex_enter(&state->id_ac_mutex);
2560 	ace = list_head(&state->id_ah_active);
2561 	while (ace != NULL) {
2562 		if (ace->ac_chan != NULL) {
2563 			ibd_rc_add_to_chan_list(&state->rc_obs_act_chan_list,
2564 			    ace->ac_chan);
2565 		}
2566 		ace = list_next(&state->id_ah_active, ace);
2567 	}
2568 	mutex_exit(&state->id_ac_mutex);
2569 
2570 	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_obs_act_chan_list);
2571 	while (rc_chan != NULL) {
2572 		ace = rc_chan->ace;
2573 		ibd_rc_act_close(rc_chan);
2574 		if (ace != NULL)
2575 			ace->ac_chan = NULL;
2576 		rc_chan = ibd_rc_rm_header_chan_list(
2577 		    &state->rc_obs_act_chan_list);
2578 	}
2579 }
2580 
2581 void
2582 ibd_rc_try_connect(ibd_state_t *state, ibd_ace_t *ace,  ibt_path_info_t *path)
2583 {
2584 	ibt_status_t status;
2585 
2586 	status = ibd_rc_connect(state, ace, path,
2587 	    IBD_RC_SERVICE_ID_OFED_INTEROP);
2588 
2589 	if (status != IBT_SUCCESS) {
		/* wait for the peer side to remove the stale channel */
2591 		delay(drv_usectohz(10000));
2592 		status = ibd_rc_connect(state, ace, path,
2593 		    IBD_RC_SERVICE_ID_OFED_INTEROP);
2594 	}
2595 
2596 	if (status != IBT_SUCCESS) {
		/* wait for the peer side to remove the stale channel */
2598 		delay(drv_usectohz(10000));
2599 		(void) ibd_rc_connect(state, ace, path,
2600 		    IBD_RC_SERVICE_ID);
2601 	}
2602 }
2603 
2604 /*
2605  * Allocates channel and sets the ace->ac_chan to it.
2606  * Opens the channel.
2607  */
2608 ibt_status_t
2609 ibd_rc_connect(ibd_state_t *state, ibd_ace_t *ace,  ibt_path_info_t *path,
2610     uint64_t ietf_cm_service_id)
2611 {
2612 	ibt_status_t status = 0;
2613 	ibt_rc_returns_t open_returns;
2614 	ibt_chan_open_args_t open_args;
2615 	ibd_rc_msg_hello_t hello_req_msg;
2616 	ibd_rc_msg_hello_t *hello_ack_msg;
2617 	ibd_rc_chan_t *chan;
2618 
2619 	ASSERT(ace != NULL);
2620 	ASSERT(ace->ac_mce == NULL);
2621 	ASSERT(ace->ac_chan == NULL);
2622 
2623 	if ((status = ibd_rc_alloc_chan(&chan, state, B_TRUE)) != IBT_SUCCESS) {
2624 		DPRINT(10, "ibd_rc_connect: ibd_rc_alloc_chan() failed");
2625 		return (status);
2626 	}
2627 
2628 	ace->ac_chan = chan;
2629 	chan->state = state;
2630 	chan->ace = ace;
2631 
2632 	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)ace);
2633 
2634 	hello_ack_msg = kmem_zalloc(sizeof (ibd_rc_msg_hello_t), KM_SLEEP);
2635 
2636 	/*
2637 	 * open the channels
2638 	 */
2639 	bzero(&open_args, sizeof (ibt_chan_open_args_t));
2640 	bzero(&open_returns, sizeof (ibt_rc_returns_t));
2641 
2642 	open_args.oc_cm_handler = ibd_rc_dispatch_actv_mad;
2643 	open_args.oc_cm_clnt_private = (void *)(uintptr_t)ace;
2644 
2645 	/*
2646 	 * update path record with the SID
2647 	 */
2648 	path->pi_sid =
2649 	    ietf_cm_service_id | ((ace->ac_dest->ud_dst_qpn) & 0xffffff);
2650 
2651 
2652 	/* pre-allocate memory for hello ack message */
2653 	open_returns.rc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2654 	open_returns.rc_priv_data = hello_ack_msg;
2655 
2656 	open_args.oc_path = path;
2657 
2658 	open_args.oc_path_rnr_retry_cnt	= 7;
2659 	open_args.oc_path_retry_cnt = 7;
2660 
2661 	/* We don't do RDMA */
2662 	open_args.oc_rdma_ra_out = 0;
2663 	open_args.oc_rdma_ra_in	= 0;
2664 
2665 	hello_req_msg.reserved_qpn = htonl(state->id_qpnum);
2666 	hello_req_msg.rx_mtu = htonl(state->rc_mtu);
2667 	open_args.oc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2668 	open_args.oc_priv_data = (void *)(&hello_req_msg);
2669 
2670 	ASSERT(open_args.oc_priv_data_len <= IBT_REQ_PRIV_DATA_SZ);
2671 	ASSERT(open_returns.rc_priv_data_len <= IBT_REP_PRIV_DATA_SZ);
2672 	ASSERT(open_args.oc_cm_handler != NULL);
2673 
2674 	status = ibt_open_rc_channel(chan->chan_hdl, IBT_OCHAN_NO_FLAGS,
2675 	    IBT_BLOCKING, &open_args, &open_returns);
2676 
2677 	if (status == IBT_SUCCESS) {
2678 		/* Success! */
2679 		DPRINT(2, "ibd_rc_connect: call ibt_open_rc_channel succ!");
2680 		state->rc_conn_succ++;
2681 		kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2682 		return (IBT_SUCCESS);
2683 	}
2684 
2685 	/* failure */
2686 	(void) ibt_flush_channel(chan->chan_hdl);
2687 	ibd_rc_free_chan(chan);
2688 	ace->ac_chan = NULL;
2689 
	/* check open_returns, report the error and exit */
2691 	DPRINT(30, "ibd_rc_connect: call ibt_open_rc_chan fail."
2692 	    "ret status = %d, reason=%d, ace=%p, mtu=0x%x, qpn=0x%x,"
2693 	    " peer qpn=0x%x", status, (int)open_returns.rc_status, ace,
2694 	    hello_req_msg.rx_mtu, hello_req_msg.reserved_qpn,
2695 	    ace->ac_dest->ud_dst_qpn);
2696 	kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2697 	return (status);
2698 }
2699 
2700 void
2701 ibd_rc_signal_act_close(ibd_state_t *state, ibd_ace_t *ace)
2702 {
2703 	ibd_req_t *req;
2704 
2705 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
2706 	if (req == NULL) {
2707 		ibd_print_warn(state, "ibd_rc_signal_act_close: alloc "
2708 		    "ibd_req_t fail");
2709 		mutex_enter(&state->rc_obs_act_chan_list.chan_list_mutex);
2710 		ace->ac_chan->next = state->rc_obs_act_chan_list.chan_list;
2711 		state->rc_obs_act_chan_list.chan_list = ace->ac_chan;
2712 		mutex_exit(&state->rc_obs_act_chan_list.chan_list_mutex);
2713 	} else {
2714 		req->rq_ptr = ace->ac_chan;
2715 		ibd_queue_work_slot(state, req, IBD_ASYNC_RC_CLOSE_ACT_CHAN);
2716 	}
2717 }
2718 
2719 void
2720 ibd_rc_signal_ace_recycle(ibd_state_t *state, ibd_ace_t *ace)
2721 {
2722 	ibd_req_t *req;
2723 
2724 	mutex_enter(&state->rc_ace_recycle_lock);
2725 	if (state->rc_ace_recycle != NULL) {
2726 		mutex_exit(&state->rc_ace_recycle_lock);
2727 		return;
2728 	}
2729 
2730 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
2731 	if (req == NULL) {
2732 		mutex_exit(&state->rc_ace_recycle_lock);
2733 		return;
2734 	}
2735 
2736 	state->rc_ace_recycle = ace;
2737 	mutex_exit(&state->rc_ace_recycle_lock);
2738 	ASSERT(ace->ac_mce == NULL);
2739 	INC_REF(ace, 1);
2740 	IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
2741 	req->rq_ptr = ace;
2742 	ibd_queue_work_slot(state, req, IBD_ASYNC_RC_RECYCLE_ACE);
2743 }
2744 
2745 static void
2746 ibd_rc_act_close(ibd_rc_chan_t *chan)
2747 {
2748 	uint_t times;
2749 	ibt_status_t ret;
2750 
2751 	ASSERT(chan != NULL);
2752 
2753 	chan->state->rc_act_close++;
2754 	switch (chan->chan_state) {
2755 	case IBD_RC_STATE_ACT_CLOSING:	/* stale, close it */
2756 	case IBD_RC_STATE_ACT_ESTAB:
2757 		DPRINT(30, "ibd_rc_act_close-1: close and free chan, "
2758 		    "act_state=%d, chan=%p", chan->chan_state, chan);
2759 		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
2760 		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
		/* Wait for the send queue to drain */
2762 		times = 0;
2763 		mutex_enter(&chan->tx_wqe_list.dl_mutex);
2764 		mutex_enter(&chan->tx_rel_list.dl_mutex);
2765 		while (((chan->tx_wqe_list.dl_cnt + chan->tx_rel_list.dl_cnt)
2766 		    != chan->scq_size) && (times < 50)) {
2767 			DPRINT(30, "ibd_rc_act_close: dl_cnt(tx_wqe_list=%d,"
2768 			    " tx_rel_list=%d) != chan->scq_size=%d",
2769 			    chan->tx_wqe_list.dl_cnt, chan->tx_rel_list.dl_cnt,
2770 			    chan->scq_size);
2771 			mutex_exit(&chan->tx_rel_list.dl_mutex);
2772 			mutex_exit(&chan->tx_wqe_list.dl_mutex);
2773 			mutex_enter(&chan->tx_poll_lock);
2774 			if (chan->tx_poll_busy & IBD_CQ_POLLING) {
2775 				DPRINT(40, "ibd_rc_act_close: multiple "
2776 				    "polling threads");
2777 				mutex_exit(&chan->tx_poll_lock);
2778 			} else {
2779 				chan->tx_poll_busy = IBD_CQ_POLLING;
2780 				mutex_exit(&chan->tx_poll_lock);
2781 				ibd_rc_drain_scq(chan, chan->scq_hdl);
2782 				mutex_enter(&chan->tx_poll_lock);
2783 				chan->tx_poll_busy = 0;
2784 				mutex_exit(&chan->tx_poll_lock);
2785 			}
2786 			delay(drv_usectohz(100000));
2787 			times++;
2788 			mutex_enter(&chan->tx_wqe_list.dl_mutex);
2789 			mutex_enter(&chan->tx_rel_list.dl_mutex);
2790 		}
2791 		mutex_exit(&chan->tx_rel_list.dl_mutex);
2792 		mutex_exit(&chan->tx_wqe_list.dl_mutex);
2793 		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
2794 		ret = ibt_close_rc_channel(chan->chan_hdl,
2795 		    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
2796 		if (ret != IBT_SUCCESS) {
2797 			DPRINT(40, "ibd_rc_act_close-2: ibt_close_rc_channel "
2798 			    "fail, chan=%p, returned=%d", chan, ret);
2799 		} else {
2800 			DPRINT(30, "ibd_rc_act_close-2: ibt_close_rc_channel "
2801 			    "succ, chan=%p", chan);
2802 		}
2803 
2804 		ibd_rc_free_chan(chan);
2805 		break;
2806 	case IBD_RC_STATE_ACT_REP_RECV:
2807 		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
2808 		(void) ibt_flush_channel(chan->chan_hdl);
2809 		ibd_rc_free_chan(chan);
2810 		break;
2811 	case IBD_RC_STATE_ACT_ERROR:
2812 		DPRINT(40, "ibd_rc_act_close: IBD_RC_STATE_ERROR branch");
2813 		break;
2814 	default:
2815 		DPRINT(40, "ibd_rc_act_close: default branch, act_state=%d, "
2816 		    "chan=%p", chan->chan_state, chan);
2817 	}
2818 }
2819 
2820 static int
2821 ibd_rc_pas_close(ibd_rc_chan_t *chan)
2822 {
2823 	uint_t times;
2824 	ibt_status_t ret;
2825 
2826 	ASSERT(chan != NULL);
2827 	chan->state->rc_pas_close++;
2828 
2829 	switch (chan->chan_state) {
2830 	case IBD_RC_STATE_PAS_ESTAB:
2831 		/*
2832 		 * First, stop receive interrupts; this stops the
2833 		 * connection from handing up buffers to higher layers.
2834 		 * Wait for receive buffers to be returned; give up
2835 		 * after 5 seconds.
2836 		 */
2837 		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
2838 		if (!chan->state->rc_enable_srq) {
2839 			times = 50;
2840 			while (chan->rx_wqe_list.dl_bufs_outstanding > 0) {
2841 				delay(drv_usectohz(100000));
2842 				if (--times == 0) {
2843 					DPRINT(40, "ibd_rc_pas_close : "
2844 					    "reclaiming failed");
2845 					ibd_rc_poll_rcq(chan, chan->rcq_hdl);
2846 					ibt_set_cq_handler(chan->rcq_hdl,
2847 					    ibd_rc_rcq_handler,
2848 					    (void *)(uintptr_t)chan);
2849 					return (DDI_FAILURE);
2850 				}
2851 			}
2852 		}
2853 		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
2854 		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
2855 		DPRINT(30, "ibd_rc_pas_close-1: close and free chan, "
2856 		    "chan_state=%d, chan=%p", chan->chan_state, chan);
2857 		ret = ibt_close_rc_channel(chan->chan_hdl,
2858 		    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
2859 		if (ret != IBT_SUCCESS) {
2860 			DPRINT(40, "ibd_rc_pas_close-2: ibt_close_rc_channel()"
2861 			    " fail, chan=%p, returned=%d", chan, ret);
2862 		} else {
2863 			DPRINT(30, "ibd_rc_pas_close-2: ibt_close_rc_channel()"
2864 			    " succ, chan=%p", chan);
2865 		}
2866 
2867 		ibd_rc_free_chan(chan);
2868 		break;
2869 	case IBD_RC_STATE_PAS_REQ_RECV:
2870 		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
2871 		(void) ibt_flush_channel(chan->chan_hdl);
2872 		ibd_rc_free_chan(chan);
2873 		break;
2874 	default:
2875 		DPRINT(40, "ibd_rc_pas_close: default, chan_state=%d, chan=%p",
2876 		    chan->chan_state, chan);
2877 	}
2878 	return (DDI_SUCCESS);
2879 }
2880 
2881 /*
2882  * Remove duplicate RC channel which comes from the same mac
2883  *
2884  * From the IP point of view, we could check for same MAC:
2885  * GID, P_Key (or QPN, though in a reboot this is likely to
2886  * change so P_Key is better). The GID usually will equate to
2887  * port (since typically it uses the port GUID in the low 64 bits).
2888  * These fields exists in the REQ messages.
2889  */
2890 void
2891 ibd_rc_handle_req_rm_dup(ibd_state_t *state, ibt_cm_event_t *ibt_cm_event)
2892 {
2893 	ibd_rc_chan_t *chan, *pre_chan;
2894 
2895 	pre_chan = NULL;
2896 	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
2897 	chan = state->rc_pass_chan_list.chan_list;
2898 	while (chan != NULL) {
2899 		if ((bcmp(&chan->requester_gid,
2900 		    &ibt_cm_event->cm_event.req.req_prim_addr.av_dgid,
2901 		    sizeof (ib_gid_t)) == 0) && (chan->requester_pkey ==
2902 		    ibt_cm_event->cm_event.req.req_pkey)) {
2903 			if (pre_chan == NULL) {
2904 				state->rc_pass_chan_list.chan_list = chan->next;
2905 			} else {
2906 				pre_chan->next = chan->next;
2907 			}
2908 			break;
2909 		}
2910 		pre_chan = chan;
2911 		chan = chan->next;
2912 	}
2913 	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
2914 	if (chan) {
		DPRINT(30, "ibd_rc_handle_req_rm_dup: same gid and pkey, "
		    "remove duplicate channel, chan=%p", chan);
2917 		if (ibd_rc_pas_close(chan) != DDI_SUCCESS) {
2918 			ibd_rc_add_to_chan_list(&state->rc_pass_chan_list,
2919 			    chan);
2920 		}
2921 	}
2922 }
2923 
2924 /*
2925  * Passive Side:
2926  *	Handle an incoming CM REQ from active side.
2927  *
2928  *	If success, this function allocates an ibd_rc_chan_t, then
2929  * assigns it to "*ret_conn".
2930  */
2931 static ibt_cm_status_t
2932 ibd_rc_handle_req(void *arg, ibd_rc_chan_t **ret_conn,
2933     ibt_cm_event_t *ibt_cm_event, ibt_cm_return_args_t *ret_args,
2934     void *ret_priv_data)
2935 {
2936 	ibd_rc_msg_hello_t *hello_msg;
2937 	ibd_state_t *state = (ibd_state_t *)arg;
2938 	ibd_rc_chan_t *chan;
2939 
2940 	ibd_rc_handle_req_rm_dup(state, ibt_cm_event);
2941 
2942 	if (ibd_rc_alloc_chan(&chan, state, B_FALSE) != IBT_SUCCESS) {
2943 		DPRINT(40, "ibd_rc_handle_req: ibd_rc_alloc_chan() failed");
2944 		return (IBT_CM_REJECT);
2945 	}
2946 
2947 	ibd_rc_add_to_chan_list(&state->rc_pass_chan_list, chan);
2948 
2949 	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)chan);
2950 
2951 	if (!state->rc_enable_srq) {
2952 		if (ibd_rc_init_rxlist(chan) != DDI_SUCCESS) {
2953 			ibd_rc_free_chan(chan);
2954 			DPRINT(40, "ibd_rc_handle_req: ibd_rc_init_rxlist() "
2955 			    "failed");
2956 			return (IBT_CM_REJECT);
2957 		}
2958 	}
2959 
2960 	ret_args->cm_ret.rep.cm_channel = chan->chan_hdl;
2961 
2962 	/* We don't do RDMA */
2963 	ret_args->cm_ret.rep.cm_rdma_ra_out = 0;
2964 	ret_args->cm_ret.rep.cm_rdma_ra_in = 0;
2965 
2966 	ret_args->cm_ret.rep.cm_rnr_retry_cnt = 7;
2967 	ret_args->cm_ret_len = sizeof (ibd_rc_msg_hello_t);
2968 
2969 	hello_msg = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
2970 	DPRINT(30, "ibd_rc_handle_req(): peer qpn=0x%x, peer mtu=0x%x",
2971 	    ntohl(hello_msg->reserved_qpn), ntohl(hello_msg->rx_mtu));
2972 
2973 	hello_msg = (ibd_rc_msg_hello_t *)ret_priv_data;
2974 	hello_msg->reserved_qpn = htonl(state->id_qpnum);
2975 	hello_msg->rx_mtu = htonl(state->rc_mtu);
2976 
2977 	chan->requester_gid = ibt_cm_event->cm_event.req.req_prim_addr.av_dgid;
2978 	chan->requester_pkey = ibt_cm_event->cm_event.req.req_pkey;
2979 	chan->chan_state = IBD_RC_STATE_PAS_REQ_RECV;	/* ready to receive */
2980 	*ret_conn = chan;
2981 
2982 	return (IBT_CM_ACCEPT);
2983 }
2984 
2985 /*
2986  * ibd_rc_handle_act_estab -- handler for connection established completion
 * for the active side.
2988  */
2989 static ibt_cm_status_t
2990 ibd_rc_handle_act_estab(ibd_ace_t *ace)
2991 {
2992 	ibt_status_t result;
2993 
2994 	switch (ace->ac_chan->chan_state) {
2995 		case IBD_RC_STATE_ACT_REP_RECV:
2996 			ace->ac_chan->chan_state = IBD_RC_STATE_ACT_ESTAB;
2997 			result = ibt_enable_cq_notify(ace->ac_chan->rcq_hdl,
2998 			    IBT_NEXT_COMPLETION);
2999 			if (result != IBT_SUCCESS) {
3000 				DPRINT(40, "ibd_rc_handle_act_estab: "
3001 				    "ibt_enable_cq_notify(rcq) "
3002 				    "failed: status %d", result);
3003 				return (IBT_CM_REJECT);
3004 			}
3005 			break;
3006 		default:
3007 			DPRINT(40, "ibd_rc_handle_act_estab: default "
3008 			    "branch, act_state=%d", ace->ac_chan->chan_state);
3009 			return (IBT_CM_REJECT);
3010 	}
3011 	return (IBT_CM_ACCEPT);
3012 }
3013 
3014 /*
3015  * ibd_rc_handle_pas_estab -- handler for connection established completion
 * for the passive side.
3017  */
3018 static ibt_cm_status_t
3019 ibd_rc_handle_pas_estab(ibd_rc_chan_t *chan)
3020 {
3021 	ibt_status_t result;
3022 
3023 	switch (chan->chan_state) {
3024 		case IBD_RC_STATE_PAS_REQ_RECV:
3025 			chan->chan_state = IBD_RC_STATE_PAS_ESTAB;
3026 
3027 			result = ibt_enable_cq_notify(chan->rcq_hdl,
3028 			    IBT_NEXT_COMPLETION);
3029 			if (result != IBT_SUCCESS) {
3030 				DPRINT(40, "ibd_rc_handle_pas_estab: "
3031 				    "ibt_enable_cq_notify(rcq) "
3032 				    "failed: status %d", result);
3033 				return (IBT_CM_REJECT);
3034 			}
3035 			break;
3036 		default:
3037 			DPRINT(40, "ibd_rc_handle_pas_estab: default "
3038 			    "branch, chan_state=%d", chan->chan_state);
3039 			return (IBT_CM_REJECT);
3040 	}
3041 	return (IBT_CM_ACCEPT);
3042 }
3043 
3044 /* ARGSUSED */
3045 static ibt_cm_status_t
3046 ibd_rc_dispatch_actv_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
3047     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
3048     ibt_priv_data_len_t ret_len_max)
3049 {
3050 	ibt_cm_status_t result = IBT_CM_ACCEPT;
3051 	ibd_ace_t *ace = (ibd_ace_t *)(uintptr_t)arg;
3052 	ibd_rc_chan_t *rc_chan;
3053 	ibd_state_t *state;
3054 	ibd_rc_msg_hello_t *hello_ack;
3055 	uint_t times;
3056 
3057 	switch (ibt_cm_event->cm_type) {
3058 	case IBT_CM_EVENT_REP_RCV:
3059 		ASSERT(ace->ac_chan != NULL);
3060 		ASSERT(ace->ac_chan->chan_state == IBD_RC_STATE_INIT);
3061 		hello_ack = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
3062 		DPRINT(30, "ibd_rc_handle_rep: hello_ack->mtu=0x%x, "
3063 		    "hello_ack->qpn=0x%x", ntohl(hello_ack->rx_mtu),
3064 		    ntohl(hello_ack->reserved_qpn));
3065 		ace->ac_chan->chan_state = IBD_RC_STATE_ACT_REP_RECV;
3066 		break;
3067 
3068 	case IBT_CM_EVENT_CONN_EST:
3069 		ASSERT(ace->ac_chan != NULL);
3070 		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_CONN_EST, "
3071 		    "ace=%p, act_state=%d, chan=%p",
3072 		    ace, ace->ac_chan->chan_state, ace->ac_chan);
3073 		result = ibd_rc_handle_act_estab(ace);
3074 		break;
3075 
3076 	case IBT_CM_EVENT_CONN_CLOSED:
3077 		rc_chan = ace->ac_chan;
3078 		if (rc_chan == NULL) {
3079 			DPRINT(40, "ibd_rc_dispatch_actv_mad: "
3080 			    "rc_chan==NULL, IBT_CM_EVENT_CONN_CLOSED");
3081 			return (IBT_CM_ACCEPT);
3082 		}
3083 		state = rc_chan->state;
3084 		mutex_enter(&state->id_ac_mutex);
3085 		if ((rc_chan->chan_state == IBD_RC_STATE_ACT_ESTAB) &&
3086 		    ((ace = ibd_acache_find(state, &ace->ac_mac, B_FALSE, 0))
3087 		    != NULL) && (ace == rc_chan->ace)) {
3088 			rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
3089 			ASSERT(ace->ac_mce == NULL);
3090 			INC_REF(ace, 1);
3091 			IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
3092 			mutex_exit(&state->id_ac_mutex);
3093 			DPRINT(30, "ibd_rc_dispatch_actv_mad: "
3094 			    "IBT_CM_EVENT_CONN_CLOSED, ace=%p, chan=%p, "
3095 			    "reason=%d", ace, rc_chan,
3096 			    ibt_cm_event->cm_event.closed);
3097 		} else {
3098 			mutex_exit(&state->id_ac_mutex);
3099 			state->rc_act_close_simultaneous++;
3100 			DPRINT(40, "ibd_rc_dispatch_actv_mad: other thread "
3101 			    "is closing it, IBT_CM_EVENT_CONN_CLOSED, "
3102 			    "chan_state=%d", rc_chan->chan_state);
3103 			return (IBT_CM_ACCEPT);
3104 		}
		/* wait until the send queue is drained */
3106 		times = 0;
3107 		mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
3108 		mutex_enter(&rc_chan->tx_rel_list.dl_mutex);
3109 		while (((rc_chan->tx_wqe_list.dl_cnt +
3110 		    rc_chan->tx_rel_list.dl_cnt)
3111 		    != rc_chan->scq_size) && (times < 50)) {
3112 			DPRINT(40, "ibd_rc_dispatch_act_mad: dl_cnt"
3113 			    "(tx_wqe_list=%d, tx_rel_list=%d) != "
3114 			    "chan->scq_size=%d",
3115 			    rc_chan->tx_wqe_list.dl_cnt,
3116 			    rc_chan->tx_rel_list.dl_cnt,
3117 			    rc_chan->scq_size);
3118 			mutex_exit(&rc_chan->tx_rel_list.dl_mutex);
3119 			mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
3120 			mutex_enter(&rc_chan->tx_poll_lock);
3121 			if (rc_chan->tx_poll_busy & IBD_CQ_POLLING) {
3122 				DPRINT(40, "ibd_rc_dispatch_actv_mad: "
3123 				    "multiple polling threads");
3124 				mutex_exit(&rc_chan->tx_poll_lock);
3125 			} else {
3126 				rc_chan->tx_poll_busy = IBD_CQ_POLLING;
3127 				mutex_exit(&rc_chan->tx_poll_lock);
3128 				ibd_rc_drain_scq(rc_chan, rc_chan->scq_hdl);
3129 				mutex_enter(&rc_chan->tx_poll_lock);
3130 				rc_chan->tx_poll_busy = 0;
3131 				mutex_exit(&rc_chan->tx_poll_lock);
3132 			}
3133 			delay(drv_usectohz(100000));
3134 			times++;
3135 			mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
3136 			mutex_enter(&rc_chan->tx_rel_list.dl_mutex);
3137 		}
3138 		mutex_exit(&rc_chan->tx_rel_list.dl_mutex);
3139 		mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
3140 		rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
3141 		ibd_rc_free_chan(rc_chan);
3142 		DPRINT(30, "ibd_rc_dispatch_actv_mad: "
3143 		    "IBT_CM_EVENT_CONN_CLOSED, ref=%x", ace->ac_ref);
3144 		mutex_enter(&state->id_ac_mutex);
3145 		ace->ac_chan = NULL;
3146 		ASSERT(ace->ac_ref != 0);
3147 		atomic_dec_32(&ace->ac_ref);
3148 		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
3149 			IBD_ACACHE_INSERT_FREE(state, ace);
3150 			ace->ac_ref = 0;
3151 		} else {
3152 			ace->ac_ref |= CYCLEVAL;
3153 			state->rc_delay_ace_recycle++;
3154 		}
3155 		mutex_exit(&state->id_ac_mutex);
3156 		break;
3157 
3158 	case IBT_CM_EVENT_FAILURE:
3159 		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_FAILURE,"
3160 		    "ace=%p, chan=%p, code: %d, msg: %d, reason=%d",
3161 		    ace, ace->ac_chan,
3162 		    ibt_cm_event->cm_event.failed.cf_code,
3163 		    ibt_cm_event->cm_event.failed.cf_msg,
3164 		    ibt_cm_event->cm_event.failed.cf_reason);
3165 		/*
3166 		 * Don't need free resource here. The resource is freed
3167 		 * at function ibd_rc_connect()
3168 		 */
3169 		break;
3170 
3171 	case IBT_CM_EVENT_MRA_RCV:
3172 		DPRINT(40, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_MRA_RCV");
3173 		break;
3174 	case IBT_CM_EVENT_LAP_RCV:
3175 		DPRINT(40, "ibd_rc_dispatch_actv_mad: LAP message received");
3176 		break;
3177 	case IBT_CM_EVENT_APR_RCV:
3178 		DPRINT(40, "ibd_rc_dispatch_actv_mad: APR message received");
3179 		break;
3180 	default:
3181 		DPRINT(40, "ibd_rc_dispatch_actv_mad: default branch, "
3182 		    "ibt_cm_event->cm_type=%d", ibt_cm_event->cm_type);
3183 		break;
3184 	}
3185 
3186 	return (result);
3187 }
3188 
3189 /* ARGSUSED */
3190 static ibt_cm_status_t
3191 ibd_rc_dispatch_pass_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
3192     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
3193     ibt_priv_data_len_t ret_len_max)
3194 {
3195 	ibt_cm_status_t result = IBT_CM_ACCEPT;
3196 	ibd_rc_chan_t *chan;
3197 
3198 	if (ibt_cm_event->cm_type == IBT_CM_EVENT_REQ_RCV) {
3199 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_REQ_RCV,"
3200 		    "req_pkey=%x", ibt_cm_event->cm_event.req.req_pkey);
3201 		/* Receive an incoming CM REQ from active side */
3202 		result = ibd_rc_handle_req(arg, &chan, ibt_cm_event, ret_args,
3203 		    ret_priv_data);
3204 		return (result);
3205 	}
3206 
3207 	if (ibt_cm_event->cm_channel == 0) {
3208 		DPRINT(30, "ibd_rc_dispatch_pass_mad: "
3209 		    "ERROR ibt_cm_event->cm_channel == 0");
3210 		return (IBT_CM_REJECT);
3211 	}
3212 
3213 	chan =
3214 	    (ibd_rc_chan_t *)ibt_get_chan_private(ibt_cm_event->cm_channel);
3215 	if (chan == NULL) {
		DPRINT(40, "ibd_rc_dispatch_pass_mad: chan == NULL");
3217 		return (IBT_CM_REJECT);
3218 	}
3219 
3220 	switch (ibt_cm_event->cm_type) {
3221 	case IBT_CM_EVENT_CONN_EST:
3222 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_EST, "
3223 		    "chan=%p", chan);
3224 		result = ibd_rc_handle_pas_estab(chan);
3225 		break;
3226 	case IBT_CM_EVENT_CONN_CLOSED:
3227 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_CLOSED,"
3228 		    " chan=%p, reason=%d", chan, ibt_cm_event->cm_event.closed);
3229 		ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list, chan);
3230 		ibd_rc_free_chan(chan);
3231 		break;
3232 	case IBT_CM_EVENT_FAILURE:
3233 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_FAILURE,"
3234 		    " chan=%p, code: %d, msg: %d, reason=%d", chan,
3235 		    ibt_cm_event->cm_event.failed.cf_code,
3236 		    ibt_cm_event->cm_event.failed.cf_msg,
3237 		    ibt_cm_event->cm_event.failed.cf_reason);
3238 
3239 		ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list, chan);
3240 		ibd_rc_free_chan(chan);
3241 		return (IBT_CM_ACCEPT);
3242 	case IBT_CM_EVENT_MRA_RCV:
3243 		DPRINT(40, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_MRA_RCV");
3244 		break;
3245 	case IBT_CM_EVENT_LAP_RCV:
3246 		DPRINT(40, "ibd_rc_dispatch_pass_mad: LAP message received");
3247 		break;
3248 	case IBT_CM_EVENT_APR_RCV:
3249 		DPRINT(40, "ibd_rc_dispatch_pass_mad: APR message received");
3250 		break;
3251 	default:
3252 		DPRINT(40, "ibd_rc_dispatch_pass_mad: default, type=%d, "
3253 		    "chan=%p", ibt_cm_event->cm_type, chan);
3254 		break;
3255 	}
3256 
3257 	return (result);
3258 }
3259