1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 /*
28  * An implementation of the IPoIB-CM standard based on PSARC 2009/593.
29  */
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/ddi.h>
33 #include <sys/sunddi.h>
34 #include <sys/modctl.h>
35 #include <sys/stropts.h>
36 #include <sys/stream.h>
37 #include <sys/strsun.h>
38 #include <sys/strsubr.h>
39 #include <sys/dlpi.h>
40 #include <sys/mac_provider.h>
41 
42 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
43 #include <sys/atomic.h>		/* for atomic_add*() */
44 #include <sys/ethernet.h>	/* for ETHERTYPE_IP */
45 #include <netinet/in.h>		/* for netinet/ip.h below */
46 #include <netinet/ip.h>		/* for struct ip */
47 #include <inet/common.h>	/* for inet/ip.h below */
48 #include <inet/ip.h>		/* for ipha_t */
49 #include <inet/ip_if.h>		/* for ETHERTYPE_IPV6 */
50 #include <inet/ip6.h>		/* for ip6_t */
51 #include <netinet/icmp6.h>	/* for icmp6_t */
52 #include <sys/ib/ibtl/ibvti.h>	/* for ace->ac_dest->ud_dst_qpn */
53 
54 #include <sys/ib/clients/ibd/ibd.h>
55 
56 extern ibd_global_state_t ibd_gstate;
57 uint_t ibd_rc_tx_softintr = 1;
58 /*
59  * If the number of WRs in the receive queue of an RC connection drops
60  * below IBD_RC_RX_WR_THRESHOLD, we post more receive WRs to it.
61  */
62 #define	IBD_RC_RX_WR_THRESHOLD		0x20
63 
64 /*
65  * If the number of free SWQEs (or large Tx bufs) is greater than or equal
66  * to IBD_RC_TX_FREE_THRESH, we call mac_tx_update() to notify GLD that it
67  * can resume transmitting packets.
68  */
69 #define	IBD_RC_TX_FREE_THRESH		8
70 
71 #define	IBD_RC_QPN_TO_SID(qpn) \
72 	((uint64_t)(IBD_RC_SERVICE_ID | ((qpn) & 0xffffff)))
73 
74 /* For interop with legacy OFED */
75 #define	IBD_RC_QPN_TO_SID_OFED_INTEROP(qpn) \
76 	((uint64_t)(IBD_RC_SERVICE_ID_OFED_INTEROP | ((qpn) & 0xffffff)))
77 
78 /* IP header + 64 bytes of the original datagram's data. Refer to RFC 792 */
79 #define	IBD_RC_IP_ICMP_RETURN_DATA_BYTES	64
80 
81 
82 /* Functions for Reliable Connected Mode */
83 /* Connection Setup/Close Functions */
84 static ibt_cm_status_t ibd_rc_dispatch_pass_mad(void *,
85     ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
86 static ibt_cm_status_t ibd_rc_dispatch_actv_mad(void *,
87     ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
88 static int ibd_rc_pas_close(ibd_rc_chan_t *);
89 static void ibd_rc_act_close(ibd_rc_chan_t *);
90 
91 static inline void ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *,
92     ibd_rc_chan_t *);
93 static inline ibd_rc_chan_t *ibd_rc_rm_header_chan_list(
94     ibd_rc_chan_list_t *);
95 static inline void ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *,
96     ibd_rc_chan_t *);
97 
98 /* CQ handlers */
99 static void ibd_rc_rcq_handler(ibt_cq_hdl_t, void *);
100 static void ibd_rc_scq_handler(ibt_cq_hdl_t, void *);
101 static void ibd_rc_poll_rcq(ibd_rc_chan_t *, ibt_cq_hdl_t);
102 
103 /* Receive Functions */
104 static int ibd_rc_post_srq(ibd_state_t *, ibd_rwqe_t *);
105 static void ibd_rc_srq_freemsg_cb(char *);
106 static void ibd_rc_srq_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
107 
108 static int ibd_rc_post_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
109 static void ibd_rc_freemsg_cb(char *);
110 static void ibd_rc_process_rx(ibd_rc_chan_t *, ibd_rwqe_t *, ibt_wc_t *);
111 static void ibd_rc_free_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
112 static void ibd_rc_fini_rxlist(ibd_rc_chan_t *);
113 
114 
115 /* Send Functions */
116 static void ibd_rc_release_swqe(ibd_rc_chan_t *, ibd_swqe_t *);
117 static int ibd_rc_init_txlist(ibd_rc_chan_t *);
118 static void ibd_rc_fini_txlist(ibd_rc_chan_t *);
119 static uint_t ibd_rc_tx_recycle(caddr_t);
120 
121 
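/*
 * Close the active-side RC channel carried by the async request and then
 * any channels queued on state->rc_obs_act_chan_list.  For each channel,
 * drop the reference its ACE held and either return the ACE to the free
 * list or mark it for delayed recycling.
 */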
122 void
123 ibd_async_rc_close_act_chan(ibd_state_t *state, ibd_req_t *req)
124 {
125 	ibd_rc_chan_t *rc_chan = req->rq_ptr;
126 	ibd_ace_t *ace;
127 
128 	while (rc_chan != NULL) {
129 		ace = rc_chan->ace;
130 		ASSERT(ace != NULL);
131 		/* Close old RC channel */
132 		ibd_rc_act_close(rc_chan);
133 		mutex_enter(&state->id_ac_mutex);
134 		ASSERT(ace->ac_ref != 0);
135 		atomic_dec_32(&ace->ac_ref);
136 		ace->ac_chan = NULL;
137 		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
138 			IBD_ACACHE_INSERT_FREE(state, ace);
139 			ace->ac_ref = 0;
140 		} else {
141 			ace->ac_ref |= CYCLEVAL;
142 			state->rc_delay_ace_recycle++;
143 		}
144 		mutex_exit(&state->id_ac_mutex);
145 		rc_chan = ibd_rc_rm_header_chan_list(
146 		    &state->rc_obs_act_chan_list);
147 	}
148 }
149 
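/*
 * Recycle an ACE whose RC channel must be torn down: close the channel,
 * drop the channel's reference on the ACE (freeing it or marking it for
 * delayed recycling), and clear state->rc_ace_recycle.
 */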
150 void
151 ibd_async_rc_recycle_ace(ibd_state_t *state, ibd_req_t *req)
152 {
153 	ibd_ace_t *ace = req->rq_ptr;
154 	ibd_rc_chan_t *rc_chan;
155 
156 	ASSERT(ace != NULL);
157 	rc_chan = ace->ac_chan;
158 	ASSERT(rc_chan != NULL);
159 	/* Close old RC channel */
160 	ibd_rc_act_close(rc_chan);
161 	mutex_enter(&state->id_ac_mutex);
162 	ASSERT(ace->ac_ref != 0);
163 	atomic_dec_32(&ace->ac_ref);
164 	ace->ac_chan = NULL;
165 	if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
166 		IBD_ACACHE_INSERT_FREE(state, ace);
167 		ace->ac_ref = 0;
168 	} else {
169 		ace->ac_ref |= CYCLEVAL;
170 		state->rc_delay_ace_recycle++;
171 	}
172 	mutex_exit(&state->id_ac_mutex);
173 	mutex_enter(&state->rc_ace_recycle_lock);
174 	state->rc_ace_recycle = NULL;
175 	mutex_exit(&state->rc_ace_recycle_lock);
176 }
177 
178 /* Simple ICMP IP Header Template */
179 static const ipha_t icmp_ipha = {
180 	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
181 };
182 
183 /* Packet is too big. Send ICMP packet to GLD to request a smaller MTU */
184 void
185 ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req)
186 {
187 	mblk_t *mp = req->rq_ptr;
188 	ibd_ace_t *ace = req->rq_ptr2;
189 	uint16_t mtu = state->id_mtu - IPOIB_HDRSIZE;
190 	uint_t	len_needed;
191 	size_t	msg_len;
192 	mblk_t	*pmtu_mp;
193 	ushort_t	sap;
194 	ib_header_info_t *ibha;	/* ib header for pmtu_pkt */
195 	/*
196 	 * ipha: IP header for pmtu_pkt
197 	 * old_ipha: IP header for old packet
198 	 */
199 	ipha_t *ipha, *old_ipha;
200 	icmph_t	*icmph;
201 
202 	sap = ntohs(((ipoib_hdr_t *)mp->b_rptr)->ipoib_type);
203 
204 	if (!pullupmsg(mp, -1)) {
205 		DPRINT(40, "ibd_async_rc_process_too_big: pullupmsg fail");
206 		goto too_big_fail;
207 	}
208 	/* move to IP header. */
209 	mp->b_rptr += IPOIB_HDRSIZE;
210 	old_ipha = (ipha_t *)mp->b_rptr;
211 
212 	len_needed = IPH_HDR_LENGTH(old_ipha);
213 	if (old_ipha->ipha_protocol == IPPROTO_ENCAP) {
214 		len_needed += IPH_HDR_LENGTH(((uchar_t *)old_ipha +
215 		    len_needed));
216 	} else if (old_ipha->ipha_protocol == IPPROTO_IPV6) {
217 		ip6_t *ip6h = (ip6_t *)((uchar_t *)old_ipha
218 		    + len_needed);
219 		len_needed += ip_hdr_length_v6(mp, ip6h);
220 	}
221 	len_needed += IBD_RC_IP_ICMP_RETURN_DATA_BYTES;
222 	msg_len = msgdsize(mp);
223 	if (msg_len > len_needed) {
224 		(void) adjmsg(mp, len_needed - msg_len);
225 		msg_len = len_needed;
226 	}
227 
228 	if ((pmtu_mp = allocb(sizeof (ib_header_info_t) + sizeof (ipha_t)
229 	    + sizeof (icmph_t), BPRI_MED)) == NULL) {
230 		DPRINT(40, "ibd_async_rc_process_too_big: allocb fail");
231 		goto too_big_fail;
232 	}
233 	pmtu_mp->b_cont = mp;
234 	pmtu_mp->b_wptr = pmtu_mp->b_rptr + sizeof (ib_header_info_t)
235 	    + sizeof (ipha_t) + sizeof (icmph_t);
236 
237 	ibha = (ib_header_info_t *)pmtu_mp->b_rptr;
238 
239 	/* Fill IB header */
240 	bcopy(&state->id_macaddr, &ibha->ib_dst, IPOIB_ADDRL);
241 	/*
242 	 * If the GRH is not valid, indicate to GLDv3 by setting
243 	 * the VerTcFlow field to 0.
244 	 */
245 	ibha->ib_grh.ipoib_vertcflow = 0;
246 	ibha->ipib_rhdr.ipoib_type = htons(sap);
247 	ibha->ipib_rhdr.ipoib_mbz = 0;
248 
249 	/* Fill IP header */
250 	ipha = (ipha_t *)&ibha[1];
251 	*ipha = icmp_ipha;
252 	ipha->ipha_src = old_ipha->ipha_dst;
253 	ipha->ipha_dst = old_ipha->ipha_src;
254 	ipha->ipha_ttl = old_ipha->ipha_ttl;
255 	msg_len += sizeof (icmp_ipha) + sizeof (icmph_t);
256 	if (msg_len > IP_MAXPACKET) {
257 		ibd_print_warn(state, "ibd_rc_process_too_big_pkt: msg_len(%d) "
258 		    "> IP_MAXPACKET", (uint32_t)msg_len);
259 		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
260 		msg_len = IP_MAXPACKET;
261 	}
262 	ipha->ipha_length = htons((uint16_t)msg_len);
263 	ipha->ipha_hdr_checksum = 0;
264 	ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
265 
266 	/* Fill ICMP body */
267 	icmph = (icmph_t *)&ipha[1];
268 	bzero(icmph, sizeof (icmph_t));
269 	icmph->icmph_type = ICMP_DEST_UNREACHABLE;
270 	icmph->icmph_code = ICMP_FRAGMENTATION_NEEDED;
271 	icmph->icmph_du_mtu = htons(mtu);
272 	icmph->icmph_checksum = 0;
273 	icmph->icmph_checksum = IP_CSUM(pmtu_mp,
274 	    (int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0);
275 
276 	(void) hcksum_assoc(pmtu_mp, NULL, NULL, 0, 0, 0, 0,
277 	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
278 
279 	DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, "
280 	    "ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d",
281 	    sap, ipha->ipha_src, ipha->ipha_dst, ipha->ipha_ttl,
282 	    len_needed, (uint32_t)msg_len);
283 
284 	mac_rx(state->id_mh, state->id_rh, pmtu_mp);
285 
286 	mutex_enter(&ace->tx_too_big_mutex);
287 	ace->tx_too_big_ongoing = B_FALSE;
288 	mutex_exit(&ace->tx_too_big_mutex);
289 	return;
290 
291 too_big_fail:
292 	/* Drop packet */
293 	freemsg(mp);
294 	mutex_enter(&ace->tx_too_big_mutex);
295 	ace->tx_too_big_ongoing = B_FALSE;
296 	mutex_exit(&ace->tx_too_big_mutex);
297 }
298 
299 #ifdef DEBUG
300 /*
301  * ibd_rc_update_stats - update driver private kstat counters
302  *
303  * This routine copies ibd's internal Reliable Connected mode statistics
304  * counters into the kstat data area so that they can be read through
305  * the kernel statistics framework.
306  */
307 static int
308 ibd_rc_update_stats(kstat_t *ksp, int rw)
309 {
310 	ibd_state_t *state;
311 	ibd_rc_stat_t *ibd_rc_ksp;
312 
313 	if (rw == KSTAT_WRITE)
314 		return (EACCES);
315 
316 	state = (ibd_state_t *)ksp->ks_private;
317 	ASSERT(state != NULL);
318 	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;
319 
320 	ibd_rc_ksp->rc_rcv_trans_byte.value.ul = state->rc_rcv_trans_byte;
321 	ibd_rc_ksp->rc_rcv_trans_pkt.value.ul = state->rc_rcv_trans_pkt;
322 	ibd_rc_ksp->rc_rcv_copy_byte.value.ul = state->rc_rcv_copy_byte;
323 	ibd_rc_ksp->rc_rcv_copy_pkt.value.ul = state->rc_rcv_copy_pkt;
324 	ibd_rc_ksp->rc_rcv_alloc_fail.value.ul = state->rc_rcv_alloc_fail;
325 
326 	ibd_rc_ksp->rc_rcq_invoke.value.ul = state->rc_rcq_invoke;
327 	ibd_rc_ksp->rc_rcq_err.value.ul = state->rc_rcq_err;
328 	ibd_rc_ksp->rc_scq_invoke.value.ul = state->rc_scq_invoke;
329 
330 	ibd_rc_ksp->rc_rwqe_short.value.ul = state->rc_rwqe_short;
331 
332 	ibd_rc_ksp->rc_xmt_bytes.value.ul = state->rc_xmt_bytes;
333 	ibd_rc_ksp->rc_xmt_small_pkt.value.ul = state->rc_xmt_small_pkt;
334 	ibd_rc_ksp->rc_xmt_fragmented_pkt.value.ul =
335 	    state->rc_xmt_fragmented_pkt;
336 	ibd_rc_ksp->rc_xmt_map_fail_pkt.value.ul = state->rc_xmt_map_fail_pkt;
337 	ibd_rc_ksp->rc_xmt_map_succ_pkt.value.ul = state->rc_xmt_map_succ_pkt;
338 	ibd_rc_ksp->rc_ace_not_found.value.ul = state->rc_ace_not_found;
339 
340 	ibd_rc_ksp->rc_scq_no_swqe.value.ul = state->rc_scq_no_swqe;
341 	ibd_rc_ksp->rc_scq_no_largebuf.value.ul = state->rc_scq_no_largebuf;
342 	ibd_rc_ksp->rc_swqe_short.value.ul = state->rc_swqe_short;
343 	ibd_rc_ksp->rc_swqe_mac_update.value.ul = state->rc_swqe_mac_update;
344 	ibd_rc_ksp->rc_xmt_buf_short.value.ul = state->rc_xmt_buf_short;
345 	ibd_rc_ksp->rc_xmt_buf_mac_update.value.ul =
346 	    state->rc_xmt_buf_mac_update;
347 
348 	ibd_rc_ksp->rc_conn_succ.value.ul = state->rc_conn_succ;
349 	ibd_rc_ksp->rc_conn_fail.value.ul = state->rc_conn_fail;
350 	ibd_rc_ksp->rc_null_conn.value.ul = state->rc_null_conn;
351 	ibd_rc_ksp->rc_no_estab_conn.value.ul = state->rc_no_estab_conn;
352 
353 	ibd_rc_ksp->rc_act_close.value.ul = state->rc_act_close;
354 	ibd_rc_ksp->rc_pas_close.value.ul = state->rc_pas_close;
355 	ibd_rc_ksp->rc_delay_ace_recycle.value.ul = state->rc_delay_ace_recycle;
356 	ibd_rc_ksp->rc_act_close_simultaneous.value.ul =
357 	    state->rc_act_close_simultaneous;
358 	ibd_rc_ksp->rc_reset_cnt.value.ul = state->rc_reset_cnt;
359 
360 	return (0);
361 }
362 
363 
364 /*
365  * ibd_rc_init_stats - initialize kstat data structures
366  *
367  * This routine will create and initialize the driver private
368  * statistics counters.
369  */
370 int
371 ibd_rc_init_stats(ibd_state_t *state)
372 {
373 	kstat_t *ksp;
374 	ibd_rc_stat_t *ibd_rc_ksp;
375 	char stat_name[32];
376 	int inst;
377 
378 	/*
379 	 * Create and init kstat
380 	 */
381 	inst = ddi_get_instance(state->id_dip);
382 	(void) snprintf(stat_name, 31, "statistics%d_%x", inst, state->id_pkey);
383 	ksp = kstat_create("ibd", 0, stat_name, "net", KSTAT_TYPE_NAMED,
384 	    sizeof (ibd_rc_stat_t) / sizeof (kstat_named_t), 0);
385 
386 	if (ksp == NULL) {
387 		ibd_print_warn(state, "ibd_rc_init_stats: Could not create "
388 		    "kernel statistics");
389 		return (DDI_FAILURE);
390 	}
391 
392 	state->rc_ksp = ksp;	/* Fill in the ksp of ibd over RC mode */
393 
394 	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;
395 
396 	/*
397 	 * Initialize all the statistics
398 	 */
399 	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_byte, "RC: Rx Bytes, "
400 	    "transfer mode", KSTAT_DATA_ULONG);
401 	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_pkt, "RC: Rx Pkts, "
402 	    "transfer mode", KSTAT_DATA_ULONG);
403 	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_byte, "RC: Rx Bytes, "
404 	    "copy mode", KSTAT_DATA_ULONG);
405 	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_pkt, "RC: Rx Pkts, "
406 	    "copy mode", KSTAT_DATA_ULONG);
407 	kstat_named_init(&ibd_rc_ksp->rc_rcv_alloc_fail, "RC: Rx alloc fail",
408 	    KSTAT_DATA_ULONG);
409 
410 	kstat_named_init(&ibd_rc_ksp->rc_rcq_invoke, "RC: invoke of Recv CQ "
411 	    "handler", KSTAT_DATA_ULONG);
412 	kstat_named_init(&ibd_rc_ksp->rc_rcq_err, "RC: fail in Recv CQ handler",
413 	    KSTAT_DATA_ULONG);
414 
415 	kstat_named_init(&ibd_rc_ksp->rc_scq_invoke, "RC: invoke of Send CQ "
416 	    "handler", KSTAT_DATA_ULONG);
417 
418 	kstat_named_init(&ibd_rc_ksp->rc_rwqe_short, "RC: Short rwqe",
419 	    KSTAT_DATA_ULONG);
420 
421 	kstat_named_init(&ibd_rc_ksp->rc_xmt_bytes, "RC: Sent Bytes",
422 	    KSTAT_DATA_ULONG);
423 	kstat_named_init(&ibd_rc_ksp->rc_xmt_small_pkt,
424 	    "RC: Tx pkt small size", KSTAT_DATA_ULONG);
425 	kstat_named_init(&ibd_rc_ksp->rc_xmt_fragmented_pkt,
426 	    "RC: Tx pkt fragmentary", KSTAT_DATA_ULONG);
427 	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_fail_pkt,
428 	    "RC: Tx pkt fail ibt_map_mem_iov()", KSTAT_DATA_ULONG);
429 	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_succ_pkt,
430 	    "RC: Tx pkt succ ibt_map_mem_iov()", KSTAT_DATA_ULONG);
431 	kstat_named_init(&ibd_rc_ksp->rc_ace_not_found, "RC: ace not found",
432 	    KSTAT_DATA_ULONG);
433 
434 	kstat_named_init(&ibd_rc_ksp->rc_scq_no_swqe, "RC: No swqe after "
435 	    "recycle", KSTAT_DATA_ULONG);
436 	kstat_named_init(&ibd_rc_ksp->rc_scq_no_largebuf, "RC: No large tx buf "
437 	    "after recycle", KSTAT_DATA_ULONG);
438 	kstat_named_init(&ibd_rc_ksp->rc_swqe_short, "RC: No swqe in ibd_send",
439 	    KSTAT_DATA_ULONG);
440 	kstat_named_init(&ibd_rc_ksp->rc_swqe_mac_update, "RC: mac_tx_update "
441 	    "#, swqe available", KSTAT_DATA_ULONG);
442 	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_short, "RC: No buf in "
443 	    "ibd_send", KSTAT_DATA_ULONG);
444 	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_mac_update, "RC: "
445 	    "mac_tx_update #, buf available", KSTAT_DATA_ULONG);
446 
447 	kstat_named_init(&ibd_rc_ksp->rc_conn_succ, "RC: succ connected",
448 	    KSTAT_DATA_ULONG);
449 	kstat_named_init(&ibd_rc_ksp->rc_conn_fail, "RC: fail connect",
450 	    KSTAT_DATA_ULONG);
451 	kstat_named_init(&ibd_rc_ksp->rc_null_conn, "RC: null conn for unicast "
452 	    "pkt", KSTAT_DATA_ULONG);
453 	kstat_named_init(&ibd_rc_ksp->rc_no_estab_conn, "RC: not in act estab "
454 	    "state", KSTAT_DATA_ULONG);
455 
456 	kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: call ibd_rc_act_close",
457 	    KSTAT_DATA_ULONG);
458 	kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: call ibd_rc_pas_close",
459 	    KSTAT_DATA_ULONG);
460 	kstat_named_init(&ibd_rc_ksp->rc_delay_ace_recycle, "RC: delay ace "
461 	    "recycle", KSTAT_DATA_ULONG);
462 	kstat_named_init(&ibd_rc_ksp->rc_act_close_simultaneous, "RC: "
463 	    "simultaneous ibd_rc_act_close", KSTAT_DATA_ULONG);
464 	kstat_named_init(&ibd_rc_ksp->rc_reset_cnt, "RC: Reset RC channel",
465 	    KSTAT_DATA_ULONG);
466 
467 	/*
468 	 * Function to provide kernel stat update on demand
469 	 */
470 	ksp->ks_update = ibd_rc_update_stats;
471 
472 	/*
473 	 * Pointer into provider's raw statistics
474 	 */
475 	ksp->ks_private = (void *)state;
476 
477 	/*
478 	 * Add kstat to the system's kstat chain
479 	 */
480 	kstat_install(ksp);
481 
482 	return (DDI_SUCCESS);
483 }
484 #endif
485 
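/*
 * Allocate and initialize an RC channel: create and tune the send and
 * receive CQs, set up the Tx WQE list and optional Tx soft interrupt for
 * Tx channels, enable send CQ notification, and allocate the IBT RC
 * channel itself.  On failure, everything built so far is torn down.
 */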
486 static ibt_status_t
487 ibd_rc_alloc_chan(ibd_rc_chan_t **ret_chan, ibd_state_t *state,
488     boolean_t is_tx_chan)
489 {
490 	ibt_status_t result;
491 	ibd_rc_chan_t *chan;
492 	ibt_rc_chan_alloc_args_t alloc_args;
493 	ibt_chan_alloc_flags_t alloc_flags;
494 	ibt_chan_sizes_t sizes;
495 	ibt_cq_attr_t cq_atts;
496 	int rv;
497 
498 	chan = kmem_zalloc(sizeof (ibd_rc_chan_t), KM_SLEEP);
499 
500 	chan->state = state;
501 	mutex_init(&chan->rx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
502 	mutex_init(&chan->rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
503 	mutex_init(&chan->tx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
504 	mutex_init(&chan->tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
505 	mutex_init(&chan->tx_post_lock, NULL, MUTEX_DRIVER, NULL);
506 	mutex_init(&chan->tx_poll_lock, NULL, MUTEX_DRIVER, NULL);
507 
508 	/* Allocate IB structures for a new RC channel. */
509 	if (is_tx_chan) {
510 		chan->scq_size = state->id_rc_num_swqe;
511 		chan->rcq_size = IBD_RC_MIN_CQ_SIZE;
512 	} else {
513 		chan->scq_size = IBD_RC_MIN_CQ_SIZE;
514 		chan->rcq_size = state->id_rc_num_rwqe;
515 	}
516 	cq_atts.cq_size = chan->scq_size;
517 	cq_atts.cq_sched = NULL;
518 	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
519 	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->scq_hdl,
520 	    &chan->scq_size);
521 	if (result != IBT_SUCCESS) {
522 		DPRINT(40, "ibd_rc_alloc_chan: error <%d> creating "
523 		    "tx completion queue (size <%d>)",
524 		    result, chan->scq_size);
525 		goto alloc_scq_err;
526 	}	/* if failure to alloc cq */
527 
528 	if (ibt_modify_cq(chan->scq_hdl, state->id_rc_tx_comp_count,
529 	    state->id_rc_tx_comp_usec, 0) != IBT_SUCCESS) {
530 		ibd_print_warn(state, "ibd_rc_alloc_chan: Send CQ "
531 		    "interrupt moderation failed");
532 	}
533 
534 	ibt_set_cq_private(chan->scq_hdl, (void *) (uintptr_t)chan);
535 	ibt_set_cq_handler(chan->scq_hdl, ibd_rc_scq_handler,
536 	    (void *) (uintptr_t)chan);
537 
538 	cq_atts.cq_size = chan->rcq_size;
539 	cq_atts.cq_sched = NULL;
540 	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
541 	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->rcq_hdl,
542 	    &chan->rcq_size);
543 	if (result != IBT_SUCCESS) {
544 		ibd_print_warn(state, "ibd_rc_alloc_chan: error <%d> creating "
545 		    "rx completion queue (size <%d>)", result, chan->rcq_size);
546 		goto alloc_rcq_err;
547 	}	/* if failure to alloc cq */
548 
549 	if (ibt_modify_cq(chan->rcq_hdl, state->id_rc_rx_comp_count,
550 	    state->id_rc_rx_comp_usec, 0) != IBT_SUCCESS) {
551 		ibd_print_warn(state, "ibd_rc_alloc_chan: Receive CQ "
552 		    "interrupt moderation failed");
553 	}
554 
555 	ibt_set_cq_private(chan->rcq_hdl, (void *) (uintptr_t)chan);
556 	ibt_set_cq_handler(chan->rcq_hdl, ibd_rc_rcq_handler,
557 	    (void *)(uintptr_t)chan);
558 
559 	if (is_tx_chan) {
560 		chan->is_tx_chan = B_TRUE;
561 		if (ibd_rc_init_txlist(chan) != DDI_SUCCESS) {
562 			ibd_print_warn(state, "ibd_rc_alloc_chan: "
563 			    "ibd_rc_init_txlist failed");
564 			goto init_txlist_err;
565 		}
566 		if (ibd_rc_tx_softintr == 1) {
567 			if ((rv = ddi_add_softintr(state->id_dip,
568 			    DDI_SOFTINT_LOW, &chan->scq_softintr, NULL, NULL,
569 			    ibd_rc_tx_recycle, (caddr_t)chan)) !=
570 			    DDI_SUCCESS) {
571 				DPRINT(10, "ibd_rc_alloc_chan: failed in "
572 				    "ddi_add_softintr(scq_softintr), ret=%d",
573 				    rv);
574 				goto alloc_softintr_err;
575 			}
576 		}
577 	} else {
578 		chan->is_tx_chan = B_FALSE;
579 	}
580 
581 	/*
582 	 * enable completions
583 	 */
584 	result = ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION);
585 	if (result != IBT_SUCCESS) {
586 		ibd_print_warn(state, "ibd_rc_alloc_chan: ibt_enable_cq_notify"
587 		    "(scq) failed: status %d\n", result);
588 		goto alloc_scq_enable_err;
589 	}
590 
591 	/* We will enable chan->rcq_hdl later. */
592 
593 	/* Allocate an RC channel */
594 	bzero(&alloc_args, sizeof (ibt_rc_chan_alloc_args_t));
595 	bzero(&sizes, sizeof (ibt_chan_sizes_t));
596 
597 	alloc_args.rc_flags = IBT_WR_SIGNALED;
598 	alloc_args.rc_control = IBT_CEP_NO_FLAGS;
599 
600 	alloc_args.rc_scq = chan->scq_hdl;
601 	alloc_args.rc_rcq = chan->rcq_hdl;
602 	alloc_args.rc_pd = state->id_pd_hdl;
603 
604 	alloc_args.rc_hca_port_num = state->id_port;
605 	alloc_args.rc_clone_chan = NULL;
606 
607 	/* scatter/gather */
608 	alloc_args.rc_sizes.cs_sq_sgl = state->rc_tx_max_sqseg;
609 
610 	/*
611 	 * Use a single SGL element on the receive side: the ibd driver
612 	 * allocates one contiguous buffer for each ibt_post_recv(), so no
613 	 * scatter list is needed.
614 	 */
615 	alloc_args.rc_sizes.cs_rq_sgl = 1;
616 
617 	/* The send queue size and the receive queue size */
618 	alloc_args.rc_sizes.cs_sq = chan->scq_size;
619 	alloc_args.rc_sizes.cs_rq = chan->rcq_size;
620 
621 	if (state->id_hca_res_lkey_capab) {
622 		alloc_args.rc_flags = IBT_FAST_REG_RES_LKEY;
623 	} else {
624 		DPRINT(40, "ibd_rc_alloc_chan: reserved lkey not supported");
625 	}
626 
627 	if (state->rc_enable_srq) {
628 		alloc_flags = IBT_ACHAN_USES_SRQ;
629 		alloc_args.rc_srq = state->rc_srq_hdl;
630 	} else {
631 		alloc_flags = IBT_ACHAN_NO_FLAGS;
632 	}
633 
634 	result = ibt_alloc_rc_channel(state->id_hca_hdl,
635 	    alloc_flags, &alloc_args, &chan->chan_hdl, &sizes);
636 	if (result != IBT_SUCCESS) {
637 		ibd_print_warn(state, "ibd_rc_alloc_chan: "
638 		    "ibt_alloc_rc_channel failed: <%d>", result);
639 		goto alloc_scq_enable_err;
640 	}
641 
642 	*ret_chan = chan;
643 	return (IBT_SUCCESS);
644 
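	/*
	 * Error unwinding: each label below undoes one allocation step and
	 * falls through to the next, releasing resources in reverse order.
	 */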
645 alloc_scq_enable_err:
646 	if (is_tx_chan) {
647 		if (ibd_rc_tx_softintr == 1) {
648 			ddi_remove_softintr(chan->scq_softintr);
649 		}
650 	}
651 alloc_softintr_err:
652 	if (is_tx_chan) {
653 		ibd_rc_fini_txlist(chan);
654 	}
655 init_txlist_err:
656 	(void) ibt_free_cq(chan->rcq_hdl);
657 alloc_rcq_err:
658 	(void) ibt_free_cq(chan->scq_hdl);
659 alloc_scq_err:
660 	mutex_destroy(&chan->tx_poll_lock);
661 	mutex_destroy(&chan->tx_post_lock);
662 	mutex_destroy(&chan->tx_rel_list.dl_mutex);
663 	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
664 	mutex_destroy(&chan->rx_free_list.dl_mutex);
665 	mutex_destroy(&chan->rx_wqe_list.dl_mutex);
666 	kmem_free(chan, sizeof (ibd_rc_chan_t));
667 	return (result);
668 }
669 
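/*
 * Free an RC channel: release the channel handle and both CQs, free the
 * Tx or Rx buffer lists as appropriate (and the Tx soft interrupt), then
 * destroy the mutexes and free the channel structure itself.
 */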
670 static void
671 ibd_rc_free_chan(ibd_rc_chan_t *chan)
672 {
673 	ibt_status_t ret;
674 
675 	/* DPRINT(30, "ibd_rc_free_chan: chan=%p", chan); */
676 
677 	if (chan->chan_hdl != NULL) {
678 		ret = ibt_free_channel(chan->chan_hdl);
679 		if (ret != IBT_SUCCESS) {
680 			DPRINT(40, "ibd_rc_free_chan: ibt_free_channel failed, "
681 			    "chan=%p, returned: %d", chan, ret);
682 			return;
683 		}
684 		chan->chan_hdl = NULL;
685 	}
686 
687 	if (chan->rcq_hdl != NULL) {
688 		ret = ibt_free_cq(chan->rcq_hdl);
689 		if (ret != IBT_SUCCESS) {
690 			DPRINT(40, "ibd_rc_free_chan: ibt_free_cq(rcq) failed, "
691 			    "chan=%p, returned: %d", chan, ret);
692 			return;
693 		}
694 		chan->rcq_hdl = NULL;
695 	}
696 
697 	if (chan->scq_hdl != NULL) {
698 		ret = ibt_free_cq(chan->scq_hdl);
699 		if (ret != IBT_SUCCESS) {
700 			DPRINT(40, "ibd_rc_free_chan: ibt_free_cq(scq) failed, "
701 			    "chan=%p, returned: %d", chan, ret);
702 			return;
703 		}
704 		chan->scq_hdl = NULL;
705 	}
706 
707 	/* Free buffers */
708 	if (chan->is_tx_chan) {
709 		ibd_rc_fini_txlist(chan);
710 		if (ibd_rc_tx_softintr == 1) {
711 			ddi_remove_softintr(chan->scq_softintr);
712 		}
713 	} else {
714 		if (!chan->state->rc_enable_srq) {
715 			ibd_rc_fini_rxlist(chan);
716 		}
717 	}
718 
719 	mutex_destroy(&chan->tx_poll_lock);
720 	mutex_destroy(&chan->tx_post_lock);
721 	mutex_destroy(&chan->tx_rel_list.dl_mutex);
722 	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
723 	mutex_destroy(&chan->rx_free_list.dl_mutex);
724 	mutex_destroy(&chan->rx_wqe_list.dl_mutex);
725 
726 	/*
727 	 * If this is a passive channel, the caller must already have removed
728 	 * it from chan->state->rc_pass_chan_list.
729 	 */
730 	kmem_free(chan, sizeof (ibd_rc_chan_t));
731 }
732 
733 /* Add an RC channel to the head of the channel list */
734 static inline void
735 ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
736 {
737 	mutex_enter(&list->chan_list_mutex);
738 	if (list->chan_list == NULL) {
739 		list->chan_list = chan;
740 	} else {
741 		chan->next = list->chan_list;
742 		list->chan_list = chan;
743 	}
744 	mutex_exit(&list->chan_list_mutex);
745 }
746 
747 /* Remove an RC channel from the channel list */
748 static inline void
749 ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
750 {
751 	ibd_rc_chan_t *pre_chan;
752 
753 	mutex_enter(&list->chan_list_mutex);
754 	if (list->chan_list == chan) {
755 		DPRINT(30, "ibd_rc_rm_from_chan_list(first): found chan(%p)"
756 		    " in chan_list", chan);
757 		list->chan_list = chan->next;
758 	} else {
759 		pre_chan = list->chan_list;
760 		while (pre_chan != NULL) {
761 			if (pre_chan->next == chan) {
762 				DPRINT(30, "ibd_rc_rm_from_chan_list"
763 				    "(middle): found chan(%p) in "
764 				    "rc_pass_chan_list", chan);
765 				pre_chan->next = chan->next;
766 				break;
767 			}
768 			pre_chan = pre_chan->next;
769 		}
770 	}
771 	mutex_exit(&list->chan_list_mutex);
772 }
773 
774 static inline ibd_rc_chan_t *
775 ibd_rc_rm_header_chan_list(ibd_rc_chan_list_t *list)
776 {
777 	ibd_rc_chan_t *rc_chan;
778 
779 	mutex_enter(&list->chan_list_mutex);
780 	rc_chan = list->chan_list;
781 	if (rc_chan != NULL) {
782 		list->chan_list = rc_chan->next;
783 	}
784 	mutex_exit(&list->chan_list_mutex);
785 	return (rc_chan);
786 }
787 
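/*
 * Allocate one contiguous chunk of Rx copy buffers and the rwqe array for
 * the SRQ, and register the entire buffer area as a single memory region.
 */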
788 static int
789 ibd_rc_alloc_srq_copybufs(ibd_state_t *state)
790 {
791 	ibt_mr_attr_t mem_attr;
792 	uint_t rc_rx_bufs_sz;
793 
794 	/*
795 	 * Allocate one big chunk for all regular rx copy bufs
796 	 */
797 	rc_rx_bufs_sz =  (state->rc_mtu + IPOIB_GRH_SIZE) * state->rc_srq_size;
798 
799 	state->rc_srq_rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);
800 
801 	state->rc_srq_rwqes = kmem_zalloc(state->rc_srq_size *
802 	    sizeof (ibd_rwqe_t), KM_SLEEP);
803 
804 	/*
805 	 * Do one memory registration on the entire rxbuf area
806 	 */
807 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_srq_rx_bufs;
808 	mem_attr.mr_len = rc_rx_bufs_sz;
809 	mem_attr.mr_as = NULL;
810 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
811 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
812 	    &state->rc_srq_rx_mr_hdl, &state->rc_srq_rx_mr_desc)
813 	    != IBT_SUCCESS) {
814 		DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr() "
815 		    "failed");
816 		kmem_free(state->rc_srq_rwqes,
817 		    state->rc_srq_size * sizeof (ibd_rwqe_t));
818 		kmem_free(state->rc_srq_rx_bufs, rc_rx_bufs_sz);
819 		state->rc_srq_rx_bufs = NULL;
820 		state->rc_srq_rwqes = NULL;
821 		return (DDI_FAILURE);
822 	}
823 
824 	return (DDI_SUCCESS);
825 }
826 
827 static void
828 ibd_rc_free_srq_copybufs(ibd_state_t *state)
829 {
830 	uint_t rc_rx_buf_sz;
831 
832 	/*
833 	 * state->rc_mtu must not change between the calls to
834 	 * ibd_rc_alloc_srq_copybufs() and ibd_rc_free_srq_copybufs().
835 	 */
836 	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;
837 
838 	/*
839 	 * Unregister rxbuf mr
840 	 */
841 	if (ibt_deregister_mr(state->id_hca_hdl,
842 	    state->rc_srq_rx_mr_hdl) != IBT_SUCCESS) {
843 		DPRINT(40, "ibd_rc_free_srq_copybufs: ibt_deregister_mr()"
844 		    " failed");
845 	}
846 	state->rc_srq_rx_mr_hdl = NULL;
847 
848 	/*
849 	 * Free rxbuf memory
850 	 */
851 	kmem_free(state->rc_srq_rwqes,
852 	    state->rc_srq_size * sizeof (ibd_rwqe_t));
853 	kmem_free(state->rc_srq_rx_bufs, state->rc_srq_size * rc_rx_buf_sz);
854 	state->rc_srq_rwqes = NULL;
855 	state->rc_srq_rx_bufs = NULL;
856 }
857 
858 /*
859  * Allocate and post a certain number of SRQ receive buffers and WRs.
860  */
861 int
862 ibd_rc_init_srq_list(ibd_state_t *state)
863 {
864 	ibd_rwqe_t *rwqe;
865 	ibt_lkey_t lkey;
866 	int i;
867 	uint_t len;
868 	uint8_t *bufaddr;
869 	ibt_srq_sizes_t srq_sizes;
870 	ibt_srq_sizes_t	 srq_real_sizes;
871 	ibt_status_t ret;
872 
873 	srq_sizes.srq_sgl_sz = 1;
874 	srq_sizes.srq_wr_sz = state->id_rc_num_srq;
875 	ret = ibt_alloc_srq(state->id_hca_hdl, IBT_SRQ_NO_FLAGS,
876 	    state->id_pd_hdl, &srq_sizes, &state->rc_srq_hdl, &srq_real_sizes);
877 	if (ret != IBT_SUCCESS) {
878 		DPRINT(10, "ibd_rc_init_srq_list: ibt_alloc_srq failed, "
879 		    "req_sgl_sz=%d, req_wr_sz=0x%x, ret=%d",
880 		    srq_sizes.srq_sgl_sz, srq_sizes.srq_wr_sz, ret);
881 		return (DDI_FAILURE);
882 	}
883 
884 	state->rc_srq_size = srq_real_sizes.srq_wr_sz;
885 	if (ibd_rc_alloc_srq_copybufs(state) != DDI_SUCCESS) {
886 		ret = ibt_free_srq(state->rc_srq_hdl);
887 		if (ret != IBT_SUCCESS) {
888 			ibd_print_warn(state, "ibd_rc_init_srq_list: "
889 			    "ibt_free_srq fail, ret=%d", ret);
890 		}
891 		return (DDI_FAILURE);
892 	}
893 
894 	/*
895 	 * Allocate and setup the rwqe list
896 	 */
897 	lkey = state->rc_srq_rx_mr_desc.md_lkey;
898 	rwqe = state->rc_srq_rwqes;
899 	bufaddr = state->rc_srq_rx_bufs;
900 	len = state->rc_mtu + IPOIB_GRH_SIZE;
901 	state->rc_srq_rwqe_list.dl_cnt = 0;
902 	state->rc_srq_rwqe_list.dl_bufs_outstanding = 0;
903 	for (i = 0; i < state->rc_srq_size; i++, rwqe++, bufaddr += len) {
904 		rwqe->w_state = state;
905 		rwqe->w_freeing_wqe = B_FALSE;
906 		rwqe->w_freemsg_cb.free_func = ibd_rc_srq_freemsg_cb;
907 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
908 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
909 
910 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
911 		    &rwqe->w_freemsg_cb)) == NULL) {
912 			DPRINT(40, "ibd_rc_init_srq_list: desballoc() failed");
913 			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
914 			if (atomic_dec_32_nv(&state->id_running) != 0) {
915 				cmn_err(CE_WARN, "ibd_rc_init_srq_list: "
916 				    "id_running was not 1\n");
917 			}
918 			ibd_rc_fini_srq_list(state);
919 			atomic_inc_32(&state->id_running);
920 			return (DDI_FAILURE);
921 		}
922 
923 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
924 		/* Leave IPOIB_GRH_SIZE space */
925 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
926 		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
927 		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
928 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
929 		rwqe->w_rwr.wr_nds = 1;
930 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
931 		(void) ibd_rc_post_srq(state, rwqe);
932 	}
933 
934 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
935 	state->rc_srq_free_list.dl_head = NULL;
936 	state->rc_srq_free_list.dl_cnt = 0;
937 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
938 
939 	return (DDI_SUCCESS);
940 }
941 
942 /*
943  * Free the statically allocated Rx buffer list for SRQ.
944  */
945 void
946 ibd_rc_fini_srq_list(ibd_state_t *state)
947 {
948 	ibd_rwqe_t *rwqe;
949 	int i;
950 	ibt_status_t ret;
951 
952 	ASSERT(state->id_running == 0);
953 	ret = ibt_free_srq(state->rc_srq_hdl);
954 	if (ret != IBT_SUCCESS) {
955 		ibd_print_warn(state, "ibd_rc_fini_srq_list: "
956 		    "ibt_free_srq fail, ret=%d", ret);
957 	}
958 
959 	mutex_enter(&state->rc_srq_rwqe_list.dl_mutex);
960 	rwqe = state->rc_srq_rwqes;
961 	for (i = 0; i < state->rc_srq_size; i++, rwqe++) {
962 		if (rwqe->rwqe_im_mblk != NULL) {
963 			rwqe->w_freeing_wqe = B_TRUE;
964 			freemsg(rwqe->rwqe_im_mblk);
965 		}
966 	}
967 	mutex_exit(&state->rc_srq_rwqe_list.dl_mutex);
968 
969 	ibd_rc_free_srq_copybufs(state);
970 }
971 
972 /* Repost the elements in state->rc_srq_free_list */
973 int
974 ibd_rc_repost_srq_free_list(ibd_state_t *state)
975 {
976 	ibd_rwqe_t *rwqe;
977 	ibd_wqe_t *list;
978 	uint_t len;
979 
980 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
981 	if (state->rc_srq_free_list.dl_head != NULL) {
982 		/* repost them */
983 		len = state->rc_mtu + IPOIB_GRH_SIZE;
984 		list = state->rc_srq_free_list.dl_head;
985 		state->rc_srq_free_list.dl_head = NULL;
986 		state->rc_srq_free_list.dl_cnt = 0;
987 		mutex_exit(&state->rc_srq_free_list.dl_mutex);
988 		while (list != NULL) {
989 			rwqe = WQE_TO_RWQE(list);
990 			if ((rwqe->rwqe_im_mblk == NULL) &&
991 			    ((rwqe->rwqe_im_mblk = desballoc(
992 			    rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
993 			    &rwqe->w_freemsg_cb)) == NULL)) {
994 				DPRINT(40, "ibd_rc_repost_srq_free_list: "
995 				    "failed in desballoc()");
996 				do {
997 					ibd_rc_srq_free_rwqe(state, rwqe);
998 					list = list->w_next;
999 					rwqe = WQE_TO_RWQE(list);
1000 				} while (list != NULL);
1001 				return (DDI_FAILURE);
1002 			}
1003 			if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1004 				ibd_rc_srq_free_rwqe(state, rwqe);
1005 			}
1006 			list = list->w_next;
1007 		}
1008 		return (DDI_SUCCESS);
1009 	}
1010 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
1011 	return (DDI_SUCCESS);
1012 }
1013 
1014 /*
1015  * Free an allocated recv wqe.
1016  */
1017 static void
1018 ibd_rc_srq_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
1019 {
1020 	/*
1021 	 * desballoc() failed (no memory) or the posting of rwqe failed.
1022 	 *
1023 	 * This rwqe is placed on the SRQ free list so that it
1024 	 * can be reinstated later.
1025 	 *
1026 	 * Entries on this list can be reposted via
1027 	 * ibd_rc_repost_srq_free_list().
1028 	 */
1029 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
1030 	state->rc_srq_free_list.dl_cnt++;
1031 	rwqe->rwqe_next = state->rc_srq_free_list.dl_head;
1032 	state->rc_srq_free_list.dl_head = RWQE_TO_WQE(rwqe);
1033 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
1034 }
1035 
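/*
 * STREAMS free callback for an SRQ receive buffer: when the upper layer
 * releases the mblk, wrap the buffer in a fresh mblk and repost it to the
 * SRQ; if the driver is stopping or the re-wrap/repost fails, place the
 * rwqe on the SRQ free list instead.
 */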
1036 static void
1037 ibd_rc_srq_freemsg_cb(char *arg)
1038 {
1039 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
1040 	ibd_state_t *state = rwqe->w_state;
1041 
1042 	ASSERT(state->rc_enable_srq);
1043 
1044 	/*
1045 	 * If the driver is stopped, just free the rwqe.
1046 	 */
1047 	if (atomic_add_32_nv(&state->id_running, 0) == 0) {
1048 		if (!rwqe->w_freeing_wqe) {
1049 			atomic_dec_32(
1050 			    &state->rc_srq_rwqe_list.dl_bufs_outstanding);
1051 			DPRINT(6, "ibd_rc_srq_freemsg_cb: wqe being freed");
1052 			rwqe->rwqe_im_mblk = NULL;
1053 			ibd_rc_srq_free_rwqe(state, rwqe);
1054 		}
1055 		return;
1056 	}
1057 
1058 	atomic_dec_32(&state->rc_srq_rwqe_list.dl_bufs_outstanding);
1059 
1060 	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
1061 	ASSERT(!rwqe->w_freeing_wqe);
1062 
1063 	/*
1064 	 * The upper layer has released the mblk it held, so the old
1065 	 * pointer kept in our rwqe is no longer useful; wrap the
1066 	 * buffer in a fresh mblk.
1067 	 */
1068 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
1069 	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
1070 	if (rwqe->rwqe_im_mblk == NULL) {
1071 		DPRINT(40, "ibd_rc_srq_freemsg_cb: desballoc failed");
1072 		ibd_rc_srq_free_rwqe(state, rwqe);
1073 		return;
1074 	}
1075 
1076 	if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1077 		ibd_print_warn(state, "ibd_rc_srq_freemsg_cb: ibd_rc_post_srq"
1078 		    " failed");
1079 		ibd_rc_srq_free_rwqe(state, rwqe);
1080 		return;
1081 	}
1082 }
1083 
1084 /*
1085  * Post a rwqe to the hardware and add it to the Rx list.
1086  */
1087 static int
1088 ibd_rc_post_srq(ibd_state_t *state, ibd_rwqe_t *rwqe)
1089 {
1090 	/*
1091 	 * Increment dl_cnt before posting the receive WR, so that dl_cnt
1092 	 * is guaranteed to be up to date before the corresponding
1093 	 * ibd_rc_process_rx() is called.
1094 	 */
1095 	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
1096 	atomic_add_32(&state->rc_srq_rwqe_list.dl_cnt, 1);
1097 	if (ibt_post_srq(state->rc_srq_hdl, &rwqe->w_rwr, 1, NULL) !=
1098 	    IBT_SUCCESS) {
1099 		atomic_dec_32(&state->rc_srq_rwqe_list.dl_cnt);
1100 		DPRINT(40, "ibd_rc_post_srq: ibt_post_srq() failed");
1101 		return (DDI_FAILURE);
1102 	}
1103 
1104 	return (DDI_SUCCESS);
1105 }
1106 
1107 /*
1108  * Post a rwqe to the hardware and add it to the Rx list.
1109  */
1110 static int
1111 ibd_rc_post_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
1112 {
1113 	/*
1114 	 * Increment dl_cnt before posting the receive WR, so that dl_cnt
1115 	 * is guaranteed to be up to date before the corresponding
1116 	 * ibd_rc_process_rx() is called.
1117 	 */
1118 	atomic_add_32(&chan->rx_wqe_list.dl_cnt, 1);
1119 	if (ibt_post_recv(chan->chan_hdl, &rwqe->w_rwr, 1, NULL) !=
1120 	    IBT_SUCCESS) {
1121 		atomic_dec_32(&chan->rx_wqe_list.dl_cnt);
1122 		DPRINT(40, "ibd_rc_post_rwqe: ibt_post_recv() failed");
1123 		return (DDI_FAILURE);
1124 	}
1125 	return (DDI_SUCCESS);
1126 }
1127 
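/*
 * Per-channel (non-SRQ) counterpart of ibd_rc_alloc_srq_copybufs():
 * allocate one contiguous chunk of Rx copy buffers and the rwqe array for
 * this channel, and register the buffer area as a single memory region.
 */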
1128 static int
1129 ibd_rc_alloc_rx_copybufs(ibd_rc_chan_t *chan)
1130 {
1131 	ibd_state_t *state = chan->state;
1132 	ibt_mr_attr_t mem_attr;
1133 	uint_t rc_rx_bufs_sz;
1134 
1135 	/*
1136 	 * Allocate one big chunk for all regular rx copy bufs
1137 	 */
1138 	rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * chan->rcq_size;
1139 
1140 	chan->rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);
1141 
1142 	chan->rx_rwqes = kmem_zalloc(chan->rcq_size *
1143 	    sizeof (ibd_rwqe_t), KM_SLEEP);
1144 
1145 	/*
1146 	 * Do one memory registration on the entire rxbuf area
1147 	 */
1148 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->rx_bufs;
1149 	mem_attr.mr_len = rc_rx_bufs_sz;
1150 	mem_attr.mr_as = NULL;
1151 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
1152 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1153 	    &chan->rx_mr_hdl, &chan->rx_mr_desc) != IBT_SUCCESS) {
1154 		DPRINT(40, "ibd_rc_alloc_rx_copybufs: ibt_register_mr failed");
1155 		kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
1156 		kmem_free(chan->rx_bufs, rc_rx_bufs_sz);
1157 		chan->rx_bufs = NULL;
1158 		chan->rx_rwqes = NULL;
1159 		return (DDI_FAILURE);
1160 	}
1161 
1162 	return (DDI_SUCCESS);
1163 }
1164 
1165 static void
1166 ibd_rc_free_rx_copybufs(ibd_rc_chan_t *chan)
1167 {
1168 	ibd_state_t *state = chan->state;
1169 	uint_t rc_rx_buf_sz;
1170 
1171 	ASSERT(!state->rc_enable_srq);
1172 	ASSERT(chan->rx_rwqes != NULL);
1173 	ASSERT(chan->rx_bufs != NULL);
1174 
1175 	/*
1176 	 * state->rc_mtu must not change between the calls to
1177 	 * ibd_rc_alloc_rx_copybufs() and ibd_rc_free_rx_copybufs().
1178 	 */
1179 	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;
1180 
1181 	/*
1182 	 * Unregister rxbuf mr
1183 	 */
1184 	if (ibt_deregister_mr(state->id_hca_hdl,
1185 	    chan->rx_mr_hdl) != IBT_SUCCESS) {
1186 		DPRINT(40, "ibd_rc_free_rx_copybufs: ibt_deregister_mr failed");
1187 	}
1188 	chan->rx_mr_hdl = NULL;
1189 
1190 	/*
1191 	 * Free rxbuf memory
1192 	 */
1193 	kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
1194 	chan->rx_rwqes = NULL;
1195 
1196 	kmem_free(chan->rx_bufs, chan->rcq_size * rc_rx_buf_sz);
1197 	chan->rx_bufs = NULL;
1198 }
1199 
1200 /*
1201  * Post a certain number of receive buffers and WRs on a RC channel.
1202  */
1203 static int
1204 ibd_rc_init_rxlist(ibd_rc_chan_t *chan)
1205 {
1206 	ibd_state_t *state = chan->state;
1207 	ibd_rwqe_t *rwqe;
1208 	ibt_lkey_t lkey;
1209 	int i;
1210 	uint_t len;
1211 	uint8_t *bufaddr;
1212 
1213 	ASSERT(!state->rc_enable_srq);
1214 	if (ibd_rc_alloc_rx_copybufs(chan) != DDI_SUCCESS)
1215 		return (DDI_FAILURE);
1216 
1217 	/*
1218 	 * Allocate and setup the rwqe list
1219 	 */
1220 	lkey = chan->rx_mr_desc.md_lkey;
1221 	rwqe = chan->rx_rwqes;
1222 	bufaddr = chan->rx_bufs;
1223 	len = state->rc_mtu + IPOIB_GRH_SIZE;
1224 	for (i = 0; i < chan->rcq_size; i++, rwqe++, bufaddr += len) {
1225 		rwqe->w_state = state;
1226 		rwqe->w_chan = chan;
1227 		rwqe->w_freeing_wqe = B_FALSE;
1228 		rwqe->w_freemsg_cb.free_func = ibd_rc_freemsg_cb;
1229 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
1230 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
1231 
1232 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
1233 		    &rwqe->w_freemsg_cb)) == NULL) {
1234 			DPRINT(40, "ibd_rc_init_rxlist: desballoc() failed");
1235 			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
1236 			ibd_rc_fini_rxlist(chan);
1237 			return (DDI_FAILURE);
1238 		}
1239 
1240 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
1241 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
1242 		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
1243 		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
1244 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
1245 		rwqe->w_rwr.wr_nds = 1;
1246 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
1247 		(void) ibd_rc_post_rwqe(chan, rwqe);
1248 	}
1249 
1250 	return (DDI_SUCCESS);
1251 }
1252 
1253 /*
1254  * Free the statically allocated Rx buffer list of a non-SRQ RC channel.
1255  */
1256 static void
1257 ibd_rc_fini_rxlist(ibd_rc_chan_t *chan)
1258 {
1259 	ibd_rwqe_t *rwqe;
1260 	int i;
1261 
1262 	if (chan->rx_bufs == NULL) {
1263 		DPRINT(40, "ibd_rc_fini_rxlist: empty chan->rx_bufs, quit");
1264 		return;
1265 	}
1266 
1267 	/* bufs_outstanding must be 0 */
1268 	ASSERT((chan->rx_wqe_list.dl_head == NULL) ||
1269 	    (chan->rx_wqe_list.dl_bufs_outstanding == 0));
1270 
1271 	mutex_enter(&chan->rx_wqe_list.dl_mutex);
1272 	rwqe = chan->rx_rwqes;
1273 	for (i = 0; i < chan->rcq_size; i++, rwqe++) {
1274 		if (rwqe->rwqe_im_mblk != NULL) {
1275 			rwqe->w_freeing_wqe = B_TRUE;
1276 			freemsg(rwqe->rwqe_im_mblk);
1277 		}
1278 	}
1279 	mutex_exit(&chan->rx_wqe_list.dl_mutex);
1280 
1281 	ibd_rc_free_rx_copybufs(chan);
1282 }
1283 
1284 /*
1285  * Free an allocated recv wqe.
1286  */
1287 static void
1288 ibd_rc_free_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
1289 {
1290 	/*
1291 	 * desballoc() failed (no memory) or the posting of rwqe failed.
1292 	 *
1293 	 * This rwqe is placed on a free list so that it
1294 	 * can be reinstated in future.
1295 	 *
1296 	 * NOTE: no code currently exists to reinstate
1297 	 * these "lost" rwqes.
1298 	 */
1299 	mutex_enter(&chan->rx_free_list.dl_mutex);
1300 	chan->rx_free_list.dl_cnt++;
1301 	rwqe->rwqe_next = chan->rx_free_list.dl_head;
1302 	chan->rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
1303 	mutex_exit(&chan->rx_free_list.dl_mutex);
1304 }
1305 
1306 /*
1307  * Processing to be done after receipt of a packet; hand off to GLD
1308  * in the format expected by GLD.
1309  */
1310 static void
1311 ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
1312 {
1313 	ibd_state_t *state = chan->state;
1314 	ib_header_info_t *phdr;
1315 	ipoib_hdr_t *ipibp;
1316 	mblk_t *mp;
1317 	mblk_t *mpc;
1318 	int rxcnt;
1319 	ip6_t *ip6h;
1320 	int len;
1321 
1322 	/*
1323 	 * Track number handed to upper layer, and number still
1324 	 * available to receive packets.
1325 	 */
1326 	if (state->rc_enable_srq) {
1327 		rxcnt = atomic_dec_32_nv(&state->rc_srq_rwqe_list.dl_cnt);
1328 	} else {
1329 		rxcnt = atomic_dec_32_nv(&chan->rx_wqe_list.dl_cnt);
1330 	}
1331 
1332 	/*
1333 	 * It cannot be an IBA multicast packet.
1334 	 */
1335 	ASSERT(!(wc->wc_flags & IBT_WC_GRH_PRESENT));
1336 
1337 
1338 #ifdef DEBUG
1339 	if (rxcnt < state->id_rc_rx_rwqe_thresh) {
1340 		state->rc_rwqe_short++;
1341 	}
1342 #endif
1343 
1344 	/*
1345 	 * Pass the Rx buffer up directly when possible; else copy and repost.
1346 	 */
1347 	if ((rxcnt >= state->id_rc_rx_rwqe_thresh) &&
1348 	    (wc->wc_bytes_xfer > state->id_rc_rx_copy_thresh)) {
1349 		atomic_add_64(&state->rc_rcv_trans_byte, wc->wc_bytes_xfer);
1350 		atomic_inc_64(&state->rc_rcv_trans_pkt);
1351 
1352 		/*
1353 		 * Record how many rwqes are currently held by the upper
1354 		 * network layer.
1355 		 */
1356 		if (state->rc_enable_srq) {
1357 			atomic_add_32(&state->rc_srq_rwqe_list.
1358 			    dl_bufs_outstanding, 1);
1359 		} else {
1360 			atomic_add_32(&chan->rx_wqe_list.
1361 			    dl_bufs_outstanding, 1);
1362 		}
1363 		mp = rwqe->rwqe_im_mblk;
1364 	} else {
1365 		atomic_add_64(&state->rc_rcv_copy_byte, wc->wc_bytes_xfer);
1366 		atomic_inc_64(&state->rc_rcv_copy_pkt);
1367 
1368 		if ((mp = allocb(wc->wc_bytes_xfer + IPOIB_GRH_SIZE,
1369 		    BPRI_HI)) == NULL) {	/* no memory */
1370 			DPRINT(40, "ibd_rc_process_rx: allocb() failed");
1371 			state->rc_rcv_alloc_fail++;
1372 			if (state->rc_enable_srq) {
1373 				if (ibd_rc_post_srq(state, rwqe) ==
1374 				    DDI_FAILURE) {
1375 					ibd_rc_srq_free_rwqe(state, rwqe);
1376 				}
1377 			} else {
1378 				if (ibd_rc_post_rwqe(chan, rwqe) ==
1379 				    DDI_FAILURE) {
1380 					ibd_rc_free_rwqe(chan, rwqe);
1381 				}
1382 			}
1383 			return;
1384 		}
1385 
1386 		bcopy(rwqe->rwqe_im_mblk->b_rptr + IPOIB_GRH_SIZE,
1387 		    mp->b_wptr + IPOIB_GRH_SIZE, wc->wc_bytes_xfer);
1388 
1389 		if (state->rc_enable_srq) {
1390 			if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1391 				ibd_rc_srq_free_rwqe(state, rwqe);
1392 			}
1393 		} else {
1394 			if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1395 				ibd_rc_free_rwqe(chan, rwqe);
1396 			}
1397 		}
1398 	}
1399 
1400 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);
1401 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
1402 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
1403 		len = ntohs(ip6h->ip6_plen);
1404 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
1405 			/* LINTED: E_CONSTANT_CONDITION */
1406 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
1407 		}
1408 	}
1409 
1410 	phdr = (ib_header_info_t *)mp->b_rptr;
1411 	phdr->ib_grh.ipoib_vertcflow = 0;
1412 	ovbcopy(&state->id_macaddr, &phdr->ib_dst,
1413 	    sizeof (ipoib_mac_t));
1414 	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer + IPOIB_GRH_SIZE;
1415 
1416 	/*
1417 	 * Can RC mode in IB guarantee its checksum correctness?
1418 	 *
1419 	 *	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
1420 	 *	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
1421 	 */
1422 
1423 	/*
1424 	 * Make sure this is NULL or we're in trouble.
1425 	 */
1426 	if (mp->b_next != NULL) {
1427 		ibd_print_warn(state,
1428 		    "ibd_rc_process_rx: got duplicate mp from rcq?");
1429 		mp->b_next = NULL;
1430 	}
1431 
1432 	/*
1433 	 * Add this mp to the list of processed mblks to be handed up to
1434 	 * the network layer.
1435 	 */
1436 	if (state->rc_enable_srq) {
1437 		mutex_enter(&state->rc_rx_lock);
1438 		if (state->rc_rx_mp) {
1439 			ASSERT(state->rc_rx_mp_tail != NULL);
1440 			state->rc_rx_mp_tail->b_next = mp;
1441 		} else {
1442 			ASSERT(state->rc_rx_mp_tail == NULL);
1443 			state->rc_rx_mp = mp;
1444 		}
1445 
1446 		state->rc_rx_mp_tail = mp;
1447 		state->rc_rx_mp_len++;
1448 
1449 		if (state->rc_rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
1450 			mpc = state->rc_rx_mp;
1451 
1452 			state->rc_rx_mp = NULL;
1453 			state->rc_rx_mp_tail = NULL;
1454 			state->rc_rx_mp_len = 0;
1455 			mutex_exit(&state->rc_rx_lock);
1456 			mac_rx(state->id_mh, NULL, mpc);
1457 		} else {
1458 			mutex_exit(&state->rc_rx_lock);
1459 		}
1460 	} else {
1461 		mutex_enter(&chan->rx_lock);
1462 		if (chan->rx_mp) {
1463 			ASSERT(chan->rx_mp_tail != NULL);
1464 			chan->rx_mp_tail->b_next = mp;
1465 		} else {
1466 			ASSERT(chan->rx_mp_tail == NULL);
1467 			chan->rx_mp = mp;
1468 		}
1469 
1470 		chan->rx_mp_tail = mp;
1471 		chan->rx_mp_len++;
1472 
1473 		if (chan->rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
1474 			mpc = chan->rx_mp;
1475 
1476 			chan->rx_mp = NULL;
1477 			chan->rx_mp_tail = NULL;
1478 			chan->rx_mp_len = 0;
1479 			mutex_exit(&chan->rx_lock);
1480 			mac_rx(state->id_mh, NULL, mpc);
1481 		} else {
1482 			mutex_exit(&chan->rx_lock);
1483 		}
1484 	}
1485 }
1486 
1487 /*
1488  * Callback code invoked from STREAMs when the recv data buffer is free
1489  * for recycling.
1490  */
1491 static void
1492 ibd_rc_freemsg_cb(char *arg)
1493 {
1494 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
1495 	ibd_rc_chan_t *chan = rwqe->w_chan;
1496 	ibd_state_t *state = rwqe->w_state;
1497 
1498 	/*
1499 	 * If the wqe is being destructed, do not attempt recycling.
1500 	 */
1501 	if (rwqe->w_freeing_wqe == B_TRUE) {
1502 		return;
1503 	}
1504 
1505 	ASSERT(!state->rc_enable_srq);
1506 	ASSERT(chan->rx_wqe_list.dl_cnt < chan->rcq_size);
1507 
1508 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
1509 	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
1510 	if (rwqe->rwqe_im_mblk == NULL) {
1511 		DPRINT(40, "ibd_rc_freemsg_cb: desballoc() failed");
1512 		ibd_rc_free_rwqe(chan, rwqe);
1513 		return;
1514 	}
1515 
1516 	/*
1517 	 * Post back to h/w. We could actually have more than
1518 	 * id_num_rwqe WQEs on the list if there were multiple
1519 	 * ibd_freemsg_cb() calls outstanding (since the lock is
1520 	 * not held the entire time). This will start getting
1521 	 * corrected over subsequent ibd_freemsg_cb() calls.
1522 	 */
1523 	if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1524 		ibd_rc_free_rwqe(chan, rwqe);
1525 		return;
1526 	}
1527 	atomic_add_32(&chan->rx_wqe_list.dl_bufs_outstanding, -1);
1528 }
1529 
1530 /*
1531  * Common code for interrupt handling as well as for polling
1532  * for all completed wqe's while detaching.
1533  */
1534 static void
1535 ibd_rc_poll_rcq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
1536 {
1537 	ibd_wqe_t *wqe;
1538 	ibt_wc_t *wc, *wcs;
1539 	uint_t numwcs, real_numwcs;
1540 	int i;
1541 
1542 	wcs = chan->rx_wc;
1543 	numwcs = IBD_RC_MAX_CQ_WC;
1544 
1545 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
1546 		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
1547 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
1548 			if (wc->wc_status != IBT_WC_SUCCESS) {
1549 				chan->state->rc_rcq_err++;
1550 				/*
1551 				 * Completion returned with an error status.
1552 				 */
1553 				DPRINT(40, "ibd_rc_poll_rcq: wc_status(%d) != "
1554 				    "SUCC, chan=%p", wc->wc_status, chan);
1555 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
1556 					/*
1557 					 * Do not invoke Rx handler because
1558 					 * it might add buffers to the Rx pool
1559 					 * when we are trying to deinitialize.
1560 					 */
1561 					continue;
1562 				}
1563 			}
1564 			ibd_rc_process_rx(chan, WQE_TO_RWQE(wqe), wc);
1565 		}
1566 	}
1567 }
1568 
1569 /* Receive CQ handler */
1570 /* ARGSUSED */
1571 static void
1572 ibd_rc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1573 {
1574 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
1575 	ibd_state_t *state = chan->state;
1576 
1577 	ASSERT(chan->chan_state == IBD_RC_STATE_PAS_ESTAB);
1578 
1579 	/*
1580 	 * Poll for completed entries; the CQ will not interrupt any
1581 	 * more for incoming (or transmitted) packets.
1582 	 */
1583 	state->rc_rcq_invoke++;
1584 	ibd_rc_poll_rcq(chan, chan->rcq_hdl);
1585 
1586 	/*
1587 	 * Now enable CQ notifications; all packets that arrive now
1588 	 * (or complete transmission) will cause new interrupts.
1589 	 */
1590 	if (ibt_enable_cq_notify(chan->rcq_hdl, IBT_NEXT_COMPLETION) !=
1591 	    IBT_SUCCESS) {
1592 		/*
1593 		 * We do not expect a failure here.
1594 		 */
1595 		DPRINT(40, "ibd_rc_rcq_handler: ibt_enable_cq_notify() failed");
1596 	}
1597 
1598 	/*
1599 	 * Repoll to catch all packets that might have arrived after
1600 	 * we finished the first poll loop and before interrupts got
1601 	 * armed.
1602 	 */
1603 	ibd_rc_poll_rcq(chan, chan->rcq_hdl);
1604 
1605 	if (state->rc_enable_srq) {
1606 		mutex_enter(&state->rc_rx_lock);
1607 
1608 		if (state->rc_rx_mp != NULL) {
1609 			mblk_t *mpc;
1610 			mpc = state->rc_rx_mp;
1611 
1612 			state->rc_rx_mp = NULL;
1613 			state->rc_rx_mp_tail = NULL;
1614 			state->rc_rx_mp_len = 0;
1615 
1616 			mutex_exit(&state->rc_rx_lock);
1617 			mac_rx(state->id_mh, NULL, mpc);
1618 		} else {
1619 			mutex_exit(&state->rc_rx_lock);
1620 		}
1621 	} else {
1622 		mutex_enter(&chan->rx_lock);
1623 
1624 		if (chan->rx_mp != NULL) {
1625 			mblk_t *mpc;
1626 			mpc = chan->rx_mp;
1627 
1628 			chan->rx_mp = NULL;
1629 			chan->rx_mp_tail = NULL;
1630 			chan->rx_mp_len = 0;
1631 
1632 			mutex_exit(&chan->rx_lock);
1633 			mac_rx(state->id_mh, NULL, mpc);
1634 		} else {
1635 			mutex_exit(&chan->rx_lock);
1636 		}
1637 	}
1638 }
1639 
1640 /*
1641  * Allocate, register, and chain the large Tx copy buffers.
1642  */
1643 int
1644 ibd_rc_init_tx_largebuf_list(ibd_state_t *state)
1645 {
1646 	ibd_rc_tx_largebuf_t *lbufp;
1647 	ibd_rc_tx_largebuf_t *tail;
1648 	uint8_t *memp;
1649 	ibt_mr_attr_t mem_attr;
1650 	uint32_t num_swqe;
1651 	size_t  mem_size;
1652 	int i;
1653 
1654 	num_swqe = state->id_rc_num_swqe - 1;
1655 
1656 	/*
1657 	 * Allocate one big chunk for all Tx large copy bufs
1658 	 */
1659 	/* The GRH (IPOIB_GRH_SIZE bytes) is not sent, so rc_mtu suffices */
1660 	mem_size = num_swqe * state->rc_mtu;
1661 	state->rc_tx_mr_bufs = kmem_zalloc(mem_size, KM_SLEEP);
1662 
1663 	mem_attr.mr_len = mem_size;
1664 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_tx_mr_bufs;
1665 	mem_attr.mr_as = NULL;
1666 	mem_attr.mr_flags = IBT_MR_SLEEP;
1667 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1668 	    &state->rc_tx_mr_hdl, &state->rc_tx_mr_desc) != IBT_SUCCESS) {
1669 		DPRINT(40, "ibd_rc_init_tx_largebuf_list: ibt_register_mr "
1670 		    "failed");
1671 		kmem_free(state->rc_tx_mr_bufs, mem_size);
1672 		state->rc_tx_mr_bufs = NULL;
1673 		return (DDI_FAILURE);
1674 	}
1675 
1676 	state->rc_tx_largebuf_desc_base = kmem_zalloc(num_swqe *
1677 	    sizeof (ibd_rc_tx_largebuf_t), KM_SLEEP);
1678 
1679 	/*
1680 	 * Set up the buf chain
1681 	 */
1682 	memp = state->rc_tx_mr_bufs;
1683 	mutex_enter(&state->rc_tx_large_bufs_lock);
1684 	lbufp = state->rc_tx_largebuf_desc_base;
1685 	for (i = 0; i < num_swqe; i++) {
1686 		lbufp->lb_buf = memp;
1687 		lbufp->lb_next = lbufp + 1;
1688 
1689 		tail = lbufp;
1690 
1691 		memp += state->rc_mtu;
1692 		lbufp++;
1693 	}
1694 	tail->lb_next = NULL;
1695 
1696 	/*
1697 	 * Set up the buffer information in ibd state
1698 	 */
1699 	state->rc_tx_largebuf_free_head = state->rc_tx_largebuf_desc_base;
1700 	state->rc_tx_largebuf_nfree = num_swqe;
1701 	mutex_exit(&state->rc_tx_large_bufs_lock);
1702 	return (DDI_SUCCESS);
1703 }
1704 
1705 void
1706 ibd_rc_fini_tx_largebuf_list(ibd_state_t *state)
1707 {
1708 	uint32_t num_swqe;
1709 
1710 	num_swqe = state->id_rc_num_swqe - 1;
1711 
1712 	if (ibt_deregister_mr(state->id_hca_hdl,
1713 	    state->rc_tx_mr_hdl) != IBT_SUCCESS) {
1714 		DPRINT(40, "ibd_rc_fini_tx_largebuf_list: ibt_deregister_mr() "
1715 		    "failed");
1716 	}
1717 	state->rc_tx_mr_hdl = NULL;
1718 
1719 	kmem_free(state->rc_tx_mr_bufs, num_swqe * state->rc_mtu);
1720 	state->rc_tx_mr_bufs = NULL;
1721 
1722 	kmem_free(state->rc_tx_largebuf_desc_base,
1723 	    num_swqe * sizeof (ibd_rc_tx_largebuf_t));
1724 	state->rc_tx_largebuf_desc_base = NULL;
1725 }
1726 
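/*
 * Allocate one contiguous chunk of Tx copy buffers for this channel (one
 * buffer of id_rc_tx_copy_thresh bytes per send WQE) and register it as a
 * single memory region.
 */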
1727 static int
1728 ibd_rc_alloc_tx_copybufs(ibd_rc_chan_t *chan)
1729 {
1730 	ibt_mr_attr_t mem_attr;
1731 	ibd_state_t *state;
1732 
1733 	state = chan->state;
1734 	ASSERT(state != NULL);
1735 
1736 	/*
1737 	 * Allocate one big chunk for all regular tx copy bufs
1738 	 */
1739 	mem_attr.mr_len = chan->scq_size * state->id_rc_tx_copy_thresh;
1740 
1741 	chan->tx_mr_bufs = kmem_zalloc(mem_attr.mr_len, KM_SLEEP);
1742 
1743 	/*
1744 	 * Do one memory registration on the entire txbuf area
1745 	 */
1746 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->tx_mr_bufs;
1747 	mem_attr.mr_as = NULL;
1748 	mem_attr.mr_flags = IBT_MR_SLEEP;
1749 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1750 	    &chan->tx_mr_hdl, &chan->tx_mr_desc) != IBT_SUCCESS) {
1751 		DPRINT(40, "ibd_rc_alloc_tx_copybufs: ibt_register_mr failed");
1752 		ASSERT(mem_attr.mr_len ==
1753 		    chan->scq_size * state->id_rc_tx_copy_thresh);
1754 		kmem_free(chan->tx_mr_bufs, mem_attr.mr_len);
1755 		chan->tx_mr_bufs = NULL;
1756 		return (DDI_FAILURE);
1757 	}
1758 
1759 	return (DDI_SUCCESS);
1760 }
1761 
1762 /*
1763  * Allocate the per-channel Tx copy buffers and swqe free list.
1764  */
1765 static int
1766 ibd_rc_init_txlist(ibd_rc_chan_t *chan)
1767 {
1768 	ibd_swqe_t *swqe;
1769 	int i;
1770 	ibt_lkey_t lkey;
1771 	ibd_state_t *state = chan->state;
1772 
1773 	if (ibd_rc_alloc_tx_copybufs(chan) != DDI_SUCCESS)
1774 		return (DDI_FAILURE);
1775 
1776 	/*
1777 	 * Allocate and setup the swqe list
1778 	 */
1779 	lkey = chan->tx_mr_desc.md_lkey;
1780 	chan->tx_wqes = kmem_zalloc(chan->scq_size *
1781 	    sizeof (ibd_swqe_t), KM_SLEEP);
1782 	swqe = chan->tx_wqes;
1783 	for (i = 0; i < chan->scq_size; i++, swqe++) {
1784 		swqe->swqe_next = NULL;
1785 		swqe->swqe_im_mblk = NULL;
1786 
1787 		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
1788 		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
1789 
1790 		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
1791 		swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
1792 		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
1793 		    (chan->tx_mr_bufs + i * state->id_rc_tx_copy_thresh);
1794 		swqe->w_swr.wr_trans = IBT_RC_SRV;
1795 
1796 		/* Add to list */
1797 		mutex_enter(&chan->tx_wqe_list.dl_mutex);
1798 		chan->tx_wqe_list.dl_cnt++;
1799 		swqe->swqe_next = chan->tx_wqe_list.dl_head;
1800 		chan->tx_wqe_list.dl_head = SWQE_TO_WQE(swqe);
1801 		mutex_exit(&chan->tx_wqe_list.dl_mutex);
1802 	}
1803 
1804 	return (DDI_SUCCESS);
1805 }
1806 
1807 /*
1808  * Free the statically allocated Tx buffer list.
1809  */
1810 static void
1811 ibd_rc_fini_txlist(ibd_rc_chan_t *chan)
1812 {
1813 	ibd_state_t *state = chan->state;
1814 	if (chan->tx_mr_hdl != NULL) {
1815 		if (ibt_deregister_mr(chan->state->id_hca_hdl,
1816 		    chan->tx_mr_hdl) != IBT_SUCCESS) {
1817 			DPRINT(40, "ibd_rc_fini_txlist: ibt_deregister_mr "
1818 			    "failed");
1819 		}
1820 		chan->tx_mr_hdl = NULL;
1821 	}
1822 
1823 	if (chan->tx_mr_bufs != NULL) {
1824 		kmem_free(chan->tx_mr_bufs, chan->scq_size *
1825 		    state->id_rc_tx_copy_thresh);
1826 		chan->tx_mr_bufs = NULL;
1827 	}
1828 
1829 	if (chan->tx_wqes != NULL) {
1830 		kmem_free(chan->tx_wqes, chan->scq_size *
1831 		    sizeof (ibd_swqe_t));
1832 		chan->tx_wqes = NULL;
1833 	}
1834 }
1835 
1836 /*
1837  * Acquire a send wqe from the free list.
1838  * Returns the swqe pointer, or NULL if none is available.
1839  */
1840 ibd_swqe_t *
1841 ibd_rc_acquire_swqes(ibd_rc_chan_t *chan)
1842 {
1843 	ibd_swqe_t *wqe;
1844 
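	/*
	 * Tx completions release swqes onto tx_rel_list.  Move the entire
	 * release list over to the working tx_wqe_list (callers are
	 * expected to ask for more swqes only once the working list is
	 * empty) and hand back the first entry.
	 */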
1845 	mutex_enter(&chan->tx_rel_list.dl_mutex);
1846 	if (chan->tx_rel_list.dl_head != NULL) {
1847 		/* transfer tx_rel_list to tx_wqe_list */
1848 		chan->tx_wqe_list.dl_head =
1849 		    chan->tx_rel_list.dl_head;
1850 		chan->tx_wqe_list.dl_cnt =
1851 		    chan->tx_rel_list.dl_cnt;
1852 		chan->tx_wqe_list.dl_pending_sends = B_FALSE;
1853 
1854 		/* clear tx_rel_list */
1855 		chan->tx_rel_list.dl_head = NULL;
1856 		chan->tx_rel_list.dl_cnt = 0;
1857 		mutex_exit(&chan->tx_rel_list.dl_mutex);
1858 
1859 		wqe = WQE_TO_SWQE(chan->tx_wqe_list.dl_head);
1860 		chan->tx_wqe_list.dl_cnt -= 1;
1861 		chan->tx_wqe_list.dl_head = wqe->swqe_next;
1862 	} else {	/* no free swqe */
1863 		mutex_exit(&chan->tx_rel_list.dl_mutex);
1864 		chan->tx_wqe_list.dl_pending_sends = B_TRUE;
1865 		wqe = NULL;
1866 	}
1867 	return (wqe);
1868 }
1869 
1870 /*
1871  * Release send wqe back into free list.
1872  */
1873 static void
1874 ibd_rc_release_swqe(ibd_rc_chan_t *chan, ibd_swqe_t *swqe)
1875 {
1876 	/*
1877 	 * Add back on Tx list for reuse.
1878 	 */
1879 	swqe->swqe_next = NULL;
1880 	mutex_enter(&chan->tx_rel_list.dl_mutex);
1881 	chan->tx_rel_list.dl_pending_sends = B_FALSE;
1882 	swqe->swqe_next = chan->tx_rel_list.dl_head;
1883 	chan->tx_rel_list.dl_head = SWQE_TO_WQE(swqe);
1884 	chan->tx_rel_list.dl_cnt++;
1885 	mutex_exit(&chan->tx_rel_list.dl_mutex);
1886 }
1887 
1888 void
1889 ibd_rc_post_send(ibd_rc_chan_t *chan, ibd_swqe_t *node)
1890 {
1891 	uint_t		i;
1892 	uint_t		num_posted;
1893 	uint_t		n_wrs;
1894 	ibt_status_t	ibt_status;
1895 	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
1896 	ibd_swqe_t	*tx_head, *elem;
1897 	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];
1898 
1899 	/* post the one request, then check for more */
1900 	ibt_status = ibt_post_send(chan->chan_hdl,
1901 	    &node->w_swr, 1, NULL);
1902 	if (ibt_status != IBT_SUCCESS) {
1903 		ibd_print_warn(chan->state, "ibd_rc_post_send: "
1904 		    "posting one wr failed: ret=%d", ibt_status);
1905 		ibd_rc_tx_cleanup(node);
1906 	}
1907 
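	/*
	 * While tx_busy is held, other send threads queue their swqes on
	 * chan->tx_head instead of posting directly.  Drain that backlog
	 * here, posting up to IBD_MAX_TX_POST_MULTIPLE WRs per
	 * ibt_post_send() call, and clear tx_busy once no work is left.
	 */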
1908 	tx_head = NULL;
1909 	for (;;) {
1910 		if (tx_head == NULL) {
1911 			mutex_enter(&chan->tx_post_lock);
1912 			tx_head = chan->tx_head;
1913 			if (tx_head == NULL) {
1914 				chan->tx_busy = 0;
1915 				mutex_exit(&chan->tx_post_lock);
1916 				return;
1917 			}
1918 			chan->tx_head = NULL;
1919 			mutex_exit(&chan->tx_post_lock);
1920 		}
1921 
1922 		/*
1923 		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
1924 		 * at a time if possible, and keep posting them.
1925 		 */
1926 		for (n_wrs = 0, elem = tx_head;
1927 		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
1928 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
1929 			nodes[n_wrs] = elem;
1930 			wrs[n_wrs] = elem->w_swr;
1931 		}
1932 		tx_head = elem;
1933 
1934 		ASSERT(n_wrs != 0);
1935 
1936 		/*
1937 		 * If posting fails for some reason, we'll never receive
1938 		 * completion intimation, so we'll need to cleanup. But
1939 		 * we need to make sure we don't clean up nodes whose
1940 		 * wrs have been successfully posted. We assume that the
1941 		 * hca driver returns on the first failure to post and
1942 		 * therefore the first 'num_posted' entries don't need
1943 		 * cleanup here.
1944 		 */
1945 		num_posted = 0;
1946 		ibt_status = ibt_post_send(chan->chan_hdl,
1947 		    wrs, n_wrs, &num_posted);
1948 		if (ibt_status != IBT_SUCCESS) {
1949 			ibd_print_warn(chan->state, "ibd_rc_post_send: "
1950 			    "posting multiple wrs failed: "
1951 			    "requested=%d, done=%d, ret=%d",
1952 			    n_wrs, num_posted, ibt_status);
1953 
1954 			for (i = num_posted; i < n_wrs; i++)
1955 				ibd_rc_tx_cleanup(nodes[i]);
1956 		}
1957 	}
1958 }
1959 
1960 /*
1961  * Common code that deals with cleanup after a successful or
1962  * erroneous transmission attempt.
1963  */
1964 void
1965 ibd_rc_tx_cleanup(ibd_swqe_t *swqe)
1966 {
1967 	ibd_ace_t *ace = swqe->w_ahandle;
1968 	ibd_state_t *state;
1969 
1970 	ASSERT(ace != NULL);
1971 	ASSERT(ace->ac_chan != NULL);
1972 
1973 	state = ace->ac_chan->state;
1974 
1975 	/*
1976 	 * If this was a dynamic registration in ibd_send(),
1977 	 * deregister now.
1978 	 */
1979 	if (swqe->swqe_im_mblk != NULL) {
1980 		ASSERT(swqe->w_buftype == IBD_WQE_MAPPED);
1981 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
1982 			ibd_unmap_mem(state, swqe);
1983 		}
1984 		freemsg(swqe->swqe_im_mblk);
1985 		swqe->swqe_im_mblk = NULL;
1986 	} else {
1987 		ASSERT(swqe->w_buftype != IBD_WQE_MAPPED);
1988 	}
1989 
1990 	if (swqe->w_buftype == IBD_WQE_RC_COPYBUF) {
1991 		ibd_rc_tx_largebuf_t *lbufp;
1992 
1993 		lbufp = swqe->w_rc_tx_largebuf;
1994 		ASSERT(lbufp != NULL);
1995 
1996 		mutex_enter(&state->rc_tx_large_bufs_lock);
1997 		lbufp->lb_next = state->rc_tx_largebuf_free_head;
1998 		state->rc_tx_largebuf_free_head = lbufp;
1999 		state->rc_tx_largebuf_nfree++;
2000 		mutex_exit(&state->rc_tx_large_bufs_lock);
2001 		swqe->w_rc_tx_largebuf = NULL;
2002 	}
2003 
2004 
2005 	/*
2006 	 * Release the send wqe for reuse.
2007 	 */
2008 	ibd_rc_release_swqe(ace->ac_chan, swqe);
2009 
2010 	/*
2011 	 * Drop the reference count on the AH; it can be reused
2012 	 * now for a different destination if there are no more
2013 	 * posted sends that will use it. This can be eliminated
2014 	 * if we can always associate each Tx buffer with an AH.
2015 	 * The ace is never NULL here; see the ASSERT at the top of
2016 	 * this function.
2017 	 */
2018 	ibd_dec_ref_ace(state, ace);
2019 }
2020 
2021 void
2022 ibd_rc_drain_scq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
2023 {
2024 	ibd_state_t *state = chan->state;
2025 	ibd_wqe_t *wqe;
2026 	ibt_wc_t *wc, *wcs;
2027 	uint_t numwcs, real_numwcs;
2028 	int i;
2029 
2030 	wcs = chan->tx_wc;
2031 	numwcs = IBD_RC_MAX_CQ_WC;
2032 
2033 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
2034 		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
2035 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
2036 			if (wc->wc_status != IBT_WC_SUCCESS) {
2037 				chan->tx_trans_error_cnt++;
2038 				DPRINT(30, "ibd_rc_drain_scq: "
2039 				    "wc_status(%d) != SUCC, "
2040 				    "chan=%p, ace=%p, link_state=%d",
2041 				    wc->wc_status, chan, chan->ace,
2042 				    chan->state->id_link_state);
2043 			} else {
2044 				chan->tx_trans_error_cnt = 0;
2045 			}
2046 			ibd_rc_tx_cleanup(WQE_TO_SWQE(wqe));
2047 		}
2048 
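		/*
		 * Tx resources were just freed above; if the send path had
		 * stalled waiting for RC swqes, RC large bufs or UD swqes,
		 * ask GLDv3 (mac_tx_update) to resume transmission once
		 * enough of the needed resource is free again.
		 */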
2049 		mutex_enter(&state->id_sched_lock);
2050 		if (state->id_sched_needed == 0) {
2051 			mutex_exit(&state->id_sched_lock);
2052 		} else if (state->id_sched_needed & IBD_RSRC_RC_SWQE) {
2053 			mutex_enter(&chan->tx_wqe_list.dl_mutex);
2054 			mutex_enter(&chan->tx_rel_list.dl_mutex);
2055 			if ((chan->tx_rel_list.dl_cnt +
2056 			    chan->tx_wqe_list.dl_cnt) > IBD_RC_TX_FREE_THRESH) {
2057 				state->id_sched_needed &= ~IBD_RSRC_RC_SWQE;
2058 				mutex_exit(&chan->tx_rel_list.dl_mutex);
2059 				mutex_exit(&chan->tx_wqe_list.dl_mutex);
2060 				mutex_exit(&state->id_sched_lock);
2061 				state->rc_swqe_mac_update++;
2062 				mac_tx_update(state->id_mh);
2063 			} else {
2064 				state->rc_scq_no_swqe++;
2065 				mutex_exit(&chan->tx_rel_list.dl_mutex);
2066 				mutex_exit(&chan->tx_wqe_list.dl_mutex);
2067 				mutex_exit(&state->id_sched_lock);
2068 			}
2069 		} else if (state->id_sched_needed & IBD_RSRC_RC_TX_LARGEBUF) {
2070 			mutex_enter(&state->rc_tx_large_bufs_lock);
2071 			if (state->rc_tx_largebuf_nfree >
2072 			    IBD_RC_TX_FREE_THRESH) {
2073 				ASSERT(state->rc_tx_largebuf_free_head != NULL);
2074 				state->id_sched_needed &=
2075 				    ~IBD_RSRC_RC_TX_LARGEBUF;
2076 				mutex_exit(&state->rc_tx_large_bufs_lock);
2077 				mutex_exit(&state->id_sched_lock);
2078 				state->rc_xmt_buf_mac_update++;
2079 				mac_tx_update(state->id_mh);
2080 			} else {
2081 				state->rc_scq_no_largebuf++;
2082 				mutex_exit(&state->rc_tx_large_bufs_lock);
2083 				mutex_exit(&state->id_sched_lock);
2084 			}
2085 		} else if (state->id_sched_needed & IBD_RSRC_SWQE) {
2086 			mutex_enter(&state->id_tx_list.dl_mutex);
2087 			mutex_enter(&state->id_tx_rel_list.dl_mutex);
2088 			if ((state->id_tx_list.dl_cnt +
2089 			    state->id_tx_rel_list.dl_cnt)
2090 			    > IBD_FREE_SWQES_THRESH) {
2091 				state->id_sched_needed &= ~IBD_RSRC_SWQE;
2092 				state->id_sched_cnt++;
2093 				mutex_exit(&state->id_tx_rel_list.dl_mutex);
2094 				mutex_exit(&state->id_tx_list.dl_mutex);
2095 				mutex_exit(&state->id_sched_lock);
2096 				mac_tx_update(state->id_mh);
2097 			} else {
2098 				mutex_exit(&state->id_tx_rel_list.dl_mutex);
2099 				mutex_exit(&state->id_tx_list.dl_mutex);
2100 				mutex_exit(&state->id_sched_lock);
2101 			}
2102 		} else {
2103 			mutex_exit(&state->id_sched_lock);
2104 		}
2105 	}
2106 }
2107 
2108 /* Send CQ handler; recycles Tx buffers via ibd_rc_tx_recycle() */
2109 /* ARGSUSED */
2110 static void
2111 ibd_rc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
2112 {
2113 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2114 
2115 	chan->state->rc_scq_invoke++;
2116 
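	/*
	 * When the Tx soft interrupt is enabled, defer the CQ drain to
	 * ibd_rc_tx_recycle() via the softintr; if a drain is already in
	 * progress, just flag it to run once more.  Otherwise recycle
	 * inline in interrupt context.
	 */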
2117 	if (ibd_rc_tx_softintr == 1) {
2118 		mutex_enter(&chan->tx_poll_lock);
2119 		if (chan->tx_poll_busy & IBD_CQ_POLLING) {
2120 			chan->tx_poll_busy |= IBD_REDO_CQ_POLLING;
2121 			mutex_exit(&chan->tx_poll_lock);
2122 			return;
2123 		} else {
2124 			mutex_exit(&chan->tx_poll_lock);
2125 			ddi_trigger_softintr(chan->scq_softintr);
2126 		}
2127 	} else
2128 		(void) ibd_rc_tx_recycle(arg);
2129 }
2130 
2131 static uint_t
2132 ibd_rc_tx_recycle(caddr_t arg)
2133 {
2134 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2135 	ibd_ace_t *ace;
2136 	ibd_state_t *state = chan->state;
2137 	int flag, redo_flag;
2138 	int redo = 1;
2139 
2140 	flag = IBD_CQ_POLLING;
2141 	redo_flag = IBD_REDO_CQ_POLLING;
2142 
2143 	mutex_enter(&chan->tx_poll_lock);
2144 	if (chan->tx_poll_busy & flag) {
2145 		ibd_print_warn(state, "ibd_rc_tx_recycle: multiple polling "
2146 		    "threads");
2147 		chan->tx_poll_busy |= redo_flag;
2148 		mutex_exit(&chan->tx_poll_lock);
2149 		return (DDI_INTR_CLAIMED);
2150 	}
2151 	chan->tx_poll_busy |= flag;
2152 	mutex_exit(&chan->tx_poll_lock);
2153 
2154 	/*
2155 	 * Poll for completed entries; the CQ will not interrupt any
2156 	 * more for completed packets.
2157 	 */
2158 	ibd_rc_drain_scq(chan, chan->scq_hdl);
2159 
2160 	/*
2161 	 * Now enable CQ notifications; all completions originating now
2162 	 * will cause new interrupts.
2163 	 */
2164 	do {
2165 		if (ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION) !=
2166 		    IBT_SUCCESS) {
2167 			/*
2168 			 * We do not expect a failure here.
2169 			 */
2170 			DPRINT(40, "ibd_rc_tx_recycle: ibt_enable_cq_notify()"
2171 			    " failed");
2172 		}
2173 
2174 		ibd_rc_drain_scq(chan, chan->scq_hdl);
2175 
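		/*
		 * tx_trans_error_cnt counts consecutive failed send
		 * completions (it is reset on any success).  More than
		 * three in a row is treated as a broken connection and
		 * the channel is reset below.
		 */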
2176 		if (chan->tx_trans_error_cnt > 3) {
2177 			mutex_enter(&chan->tx_poll_lock);
2178 			chan->tx_poll_busy = 0;
2179 			mutex_exit(&chan->tx_poll_lock);
2180 			goto error_reset_chan;
2181 		}
2182 		mutex_enter(&chan->tx_poll_lock);
2183 		if (chan->tx_poll_busy & redo_flag)
2184 			chan->tx_poll_busy &= ~redo_flag;
2185 		else {
2186 			chan->tx_poll_busy &= ~flag;
2187 			redo = 0;
2188 		}
2189 		mutex_exit(&chan->tx_poll_lock);
2190 
2191 	} while (redo);
2192 
2193 	return (DDI_INTR_CLAIMED);
2194 
2195 error_reset_chan:
2196 	/*
2197 	 * Too many consecutive Tx errors; reset this RC channel.
2198 	 */
2199 	mutex_enter(&state->id_ac_mutex);
2200 	if ((chan->chan_state == IBD_RC_STATE_ACT_ESTAB) &&
2201 	    (chan->state->id_link_state == LINK_STATE_UP) &&
2202 	    ((ace = ibd_acache_find(state, &chan->ace->ac_mac, B_FALSE, 0))
2203 	    != NULL) && (ace == chan->ace)) {
2204 		ASSERT(ace->ac_mce == NULL);
2205 		INC_REF(ace, 1);
2206 		IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
2207 		chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
2208 		mutex_exit(&state->id_ac_mutex);
2209 		state->rc_reset_cnt++;
2210 		DPRINT(30, "ibd_rc_tx_recycle(chan=%p, ace=%p): "
2211 		    " reset RC channel", chan, chan->ace);
2212 		ibd_rc_signal_act_close(state, ace);
2213 	} else {
2214 		mutex_exit(&state->id_ac_mutex);
2215 		state->rc_act_close_simultaneous++;
2216 		DPRINT(40, "ibd_rc_tx_recycle: other thread is closing"
2217 		    " it. chan=%p, act_state=%d, link_state=%d, ace=%p",
2218 		    chan, chan->chan_state, state->id_link_state, ace);
2219 	}
2220 	return (DDI_INTR_CLAIMED);
2221 }
2222 
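/*
 * Register an IBTF service, reference counted through the global
 * ibd_gstate.ig_service_list so that a service ID already registered
 * by another ibd instance is reused instead of being registered twice.
 */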
2223 static ibt_status_t
2224 ibd_register_service(ibt_srv_desc_t *srv, ib_svc_id_t sid,
2225     int num_sids, ibt_srv_hdl_t *srv_hdl, ib_svc_id_t *ret_sid)
2226 {
2227 	ibd_service_t *p;
2228 	ibt_status_t status;
2229 
2230 	mutex_enter(&ibd_gstate.ig_mutex);
2231 	for (p = ibd_gstate.ig_service_list; p != NULL; p = p->is_link) {
2232 		if (p->is_sid == sid) {
2233 			p->is_ref_cnt++;
2234 			*srv_hdl = p->is_srv_hdl;
2235 			*ret_sid = sid;
2236 			mutex_exit(&ibd_gstate.ig_mutex);
2237 			return (IBT_SUCCESS);
2238 		}
2239 	}
2240 	status = ibt_register_service(ibd_gstate.ig_ibt_hdl, srv, sid,
2241 	    num_sids, srv_hdl, ret_sid);
2242 	if (status == IBT_SUCCESS) {
2243 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
2244 		p->is_srv_hdl = *srv_hdl;
2245 		p->is_sid = sid;
2246 		p->is_ref_cnt = 1;
2247 		p->is_link = ibd_gstate.ig_service_list;
2248 		ibd_gstate.ig_service_list = p;
2249 	}
2250 	mutex_exit(&ibd_gstate.ig_mutex);
2251 	return (status);
2252 }
2253 
2254 static ibt_status_t
2255 ibd_deregister_service(ibt_srv_hdl_t srv_hdl)
2256 {
2257 	ibd_service_t *p, **pp;
2258 	ibt_status_t status;
2259 
2260 	mutex_enter(&ibd_gstate.ig_mutex);
2261 	for (pp = &ibd_gstate.ig_service_list; *pp != NULL;
2262 	    pp = &((*pp)->is_link)) {
2263 		p = *pp;
2264 		if (p->is_srv_hdl == srv_hdl) {	/* Found it */
2265 			if (--p->is_ref_cnt == 0) {
2266 				status = ibt_deregister_service(
2267 				    ibd_gstate.ig_ibt_hdl, srv_hdl);
2268 				*pp = p->is_link; /* link prev to next */
2269 				kmem_free(p, sizeof (*p));
2270 			} else {
2271 				status = IBT_SUCCESS;
2272 			}
2273 			mutex_exit(&ibd_gstate.ig_mutex);
2274 			return (status);
2275 		}
2276 	}
2277 	/* Should not ever get here */
2278 	mutex_exit(&ibd_gstate.ig_mutex);
2279 	return (IBT_FAILURE);
2280 }
2281 
2282 /* Listen on the service IDs derived from our QP number */
2283 ibt_status_t
2284 ibd_rc_listen(ibd_state_t *state)
2285 {
2286 	ibt_srv_desc_t srvdesc;
2287 	ib_svc_id_t ret_sid;
2288 	ibt_status_t status;
2289 	ib_gid_t gid;
2290 
2291 	if (state->rc_listen_hdl != NULL) {
2292 		DPRINT(40, "ibd_rc_listen: rc_listen_hdl should be NULL");
2293 		return (IBT_FAILURE);
2294 	}
2295 
2296 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
2297 	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
2298 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
2299 
2300 	/*
2301 	 * Register the service with service id
2302 	 * Incoming connection requests should arrive on this service id.
2303 	 */
2304 	status = ibd_register_service(&srvdesc,
2305 	    IBD_RC_QPN_TO_SID(state->id_qpnum),
2306 	    1, &state->rc_listen_hdl, &ret_sid);
2307 	if (status != IBT_SUCCESS) {
2308 		DPRINT(40, "ibd_rc_listen: Service Registration Failed, "
2309 		    "ret=%d", status);
2310 		return (status);
2311 	}
2312 
2313 	gid = state->id_sgid;
2314 
2315 	/* pass state as cm_private */
2316 	status = ibt_bind_service(state->rc_listen_hdl,
2317 	    gid, NULL, state, &state->rc_listen_bind);
2318 	if (status != IBT_SUCCESS) {
2319 		DPRINT(40, "ibd_rc_listen:"
2320 		    " failed to bind port: <%d>", status);
2321 		(void) ibd_deregister_service(state->rc_listen_hdl);
2322 		return (status);
2323 	}
2324 
2325 	/*
2326 	 * Legacy OFED used an incorrect service ID (with one additional zero
2327 	 * digit) for many years. To interoperate with legacy OFED, we also
2328 	 * support that service ID here.
2329 	 */
2330 	ASSERT(state->rc_listen_hdl_OFED_interop == NULL);
2331 
2332 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
2333 	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
2334 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
2335 
2336 	/*
2337 	 * Register the service with service id
2338 	 * Incoming connection requests should arrive on this service id.
2339 	 */
2340 	status = ibd_register_service(&srvdesc,
2341 	    IBD_RC_QPN_TO_SID_OFED_INTEROP(state->id_qpnum),
2342 	    1, &state->rc_listen_hdl_OFED_interop, &ret_sid);
2343 	if (status != IBT_SUCCESS) {
2344 		DPRINT(40,
2345 		    "ibd_rc_listen: Service Registration for Legacy OFED "
2346 		    "Failed %d", status);
2347 		(void) ibt_unbind_service(state->rc_listen_hdl,
2348 		    state->rc_listen_bind);
2349 		(void) ibd_deregister_service(state->rc_listen_hdl);
2350 		return (status);
2351 	}
2352 
2353 	gid = state->id_sgid;
2354 
2355 	/* pass state as cm_private */
2356 	status = ibt_bind_service(state->rc_listen_hdl_OFED_interop,
2357 	    gid, NULL, state, &state->rc_listen_bind_OFED_interop);
2358 	if (status != IBT_SUCCESS) {
2359 		DPRINT(40, "ibd_rc_listen: failed to bind port: <%d> for "
2360 		    "Legacy OFED listener", status);
2361 		(void) ibd_deregister_service(
2362 		    state->rc_listen_hdl_OFED_interop);
2363 		(void) ibt_unbind_service(state->rc_listen_hdl,
2364 		    state->rc_listen_bind);
2365 		(void) ibd_deregister_service(state->rc_listen_hdl);
2366 		return (status);
2367 	}
2368 
2369 	return (IBT_SUCCESS);
2370 }
2371 
2372 void
2373 ibd_rc_stop_listen(ibd_state_t *state)
2374 {
2375 	int ret;
2376 
2377 	/* Disable incoming connection requests */
2378 	if (state->rc_listen_hdl != NULL) {
2379 		ret = ibt_unbind_all_services(state->rc_listen_hdl);
2380 		if (ret != 0) {
2381 			DPRINT(40, "ibd_rc_stop_listen:"
2382 			    "ibt_unbind_all_services() failed, ret=%d", ret);
2383 		}
2384 		ret = ibd_deregister_service(state->rc_listen_hdl);
2385 		if (ret != 0) {
2386 			DPRINT(40, "ibd_rc_stop_listen:"
2387 			    "ibd_deregister_service() failed, ret=%d", ret);
2388 		} else {
2389 			state->rc_listen_hdl = NULL;
2390 		}
2391 	}
2392 
2393 	/* Disable incoming connection requests on the legacy OFED SID */
2394 	if (state->rc_listen_hdl_OFED_interop != NULL) {
2395 		ret = ibt_unbind_all_services(
2396 		    state->rc_listen_hdl_OFED_interop);
2397 		if (ret != 0) {
2398 			DPRINT(40, "ibd_rc_stop_listen:"
2399 			    "ibt_unbind_all_services() failed: %d", ret);
2400 		}
2401 		ret = ibd_deregister_service(state->rc_listen_hdl_OFED_interop);
2402 		if (ret != 0) {
2403 			DPRINT(40, "ibd_rc_stop_listen:"
2404 			    "ibd_deregister_service() failed: %d", ret);
2405 		} else {
2406 			state->rc_listen_hdl_OFED_interop = NULL;
2407 		}
2408 	}
2409 }
2410 
2411 void
2412 ibd_rc_close_all_chan(ibd_state_t *state)
2413 {
2414 	ibd_rc_chan_t *rc_chan;
2415 	ibd_ace_t *ace;
2416 	uint_t attempts;
2417 
2418 	/* Disable all Rx routines */
2419 	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
2420 	rc_chan = state->rc_pass_chan_list.chan_list;
2421 	while (rc_chan != NULL) {
2422 		ibt_set_cq_handler(rc_chan->rcq_hdl, 0, 0);
2423 		rc_chan = rc_chan->next;
2424 	}
2425 	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
2426 
2427 	if (state->rc_enable_srq) {
2428 		attempts = 10;
2429 		while (state->rc_srq_rwqe_list.dl_bufs_outstanding > 0) {
2430 			DPRINT(30, "ibd_rc_close_all_chan: outstanding > 0");
2431 			delay(drv_usectohz(100000));
2432 			if (--attempts == 0) {
2433 				/*
2434 				 * Receive buffers are still held by the
2435 				 * network layer and we have no choice
2436 				 * but to stop waiting for them.  Give
2437 				 * up after these attempts and proceed
2438 				 * with closing the passive and active
2439 				 * channels below.
2440 				 */
2441 				break;
2442 			}
2443 		}
2444 	}
2445 
2446 	/* Close all passive RC channels */
2447 	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
2448 	while (rc_chan != NULL) {
2449 		(void) ibd_rc_pas_close(rc_chan);
2450 		rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
2451 	}
2452 
2453 	/* Close all active RC channels */
2454 	mutex_enter(&state->id_ac_mutex);
2455 	ace = list_head(&state->id_ah_active);
2456 	while (ace != NULL) {
2457 		if (ace->ac_chan != NULL) {
2458 			ibd_rc_add_to_chan_list(&state->rc_obs_act_chan_list,
2459 			    ace->ac_chan);
2460 		}
2461 		ace = list_next(&state->id_ah_active, ace);
2462 	}
2463 	mutex_exit(&state->id_ac_mutex);
2464 
2465 	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_obs_act_chan_list);
2466 	while (rc_chan != NULL) {
2467 		ace = rc_chan->ace;
2468 		ibd_rc_act_close(rc_chan);
2469 		if (ace != NULL)
2470 			ace->ac_chan = NULL;
2471 		rc_chan = ibd_rc_rm_header_chan_list(
2472 		    &state->rc_obs_act_chan_list);
2473 	}
2474 }
2475 
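/*
 * Attempt an RC connection, first using the legacy OFED interop
 * service ID (retrying once after a short delay so the peer can clean
 * up any stale channel), then falling back to the standard service ID.
 */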
2476 void
2477 ibd_rc_try_connect(ibd_state_t *state, ibd_ace_t *ace,  ibt_path_info_t *path)
2478 {
2479 	ibt_status_t status;
2480 
2481 	status = ibd_rc_connect(state, ace, path,
2482 	    IBD_RC_SERVICE_ID_OFED_INTEROP);
2483 
2484 	if (status != IBT_SUCCESS) {
2485 		/* wait for the peer side to remove any stale channel */
2486 		delay(drv_usectohz(10000));
2487 		status = ibd_rc_connect(state, ace, path,
2488 		    IBD_RC_SERVICE_ID_OFED_INTEROP);
2489 	}
2490 
2491 	if (status != IBT_SUCCESS) {
2492 		/* wait for the peer side to remove any stale channel */
2493 		delay(drv_usectohz(10000));
2494 		(void) ibd_rc_connect(state, ace, path,
2495 		    IBD_RC_SERVICE_ID);
2496 	}
2497 }
2498 
2499 /*
2500  * Allocates an RC channel, sets ace->ac_chan to it, and opens
2501  * the channel to the peer.
2502  */
2503 ibt_status_t
2504 ibd_rc_connect(ibd_state_t *state, ibd_ace_t *ace,  ibt_path_info_t *path,
2505     uint64_t ietf_cm_service_id)
2506 {
2507 	ibt_status_t status = 0;
2508 	ibt_rc_returns_t open_returns;
2509 	ibt_chan_open_args_t open_args;
2510 	ibd_rc_msg_hello_t hello_req_msg;
2511 	ibd_rc_msg_hello_t *hello_ack_msg;
2512 	ibd_rc_chan_t *chan;
2513 
2514 	ASSERT(ace != NULL);
2515 	ASSERT(ace->ac_mce == NULL);
2516 	ASSERT(ace->ac_chan == NULL);
2517 
2518 	if ((status = ibd_rc_alloc_chan(&chan, state, B_TRUE)) != IBT_SUCCESS) {
2519 		DPRINT(10, "ibd_rc_connect: ibd_rc_alloc_chan() failed");
2520 		return (status);
2521 	}
2522 
2523 	ace->ac_chan = chan;
2524 	chan->state = state;
2525 	chan->ace = ace;
2526 
2527 	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)ace);
2528 
2529 	hello_ack_msg = kmem_zalloc(sizeof (ibd_rc_msg_hello_t), KM_SLEEP);
2530 
2531 	/*
2532 	 * open the channel
2533 	 */
2534 	bzero(&open_args, sizeof (ibt_chan_open_args_t));
2535 	bzero(&open_returns, sizeof (ibt_rc_returns_t));
2536 
2537 	open_args.oc_cm_handler = ibd_rc_dispatch_actv_mad;
2538 	open_args.oc_cm_clnt_private = (void *)(uintptr_t)ace;
2539 
2540 	/*
2541 	 * update path record with the SID
2542 	 */
2543 	path->pi_sid =
2544 	    ietf_cm_service_id | ((ace->ac_dest->ud_dst_qpn) & 0xffffff);
2545 
2546 
2547 	/* pre-allocate memory for hello ack message */
2548 	open_returns.rc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2549 	open_returns.rc_priv_data = hello_ack_msg;
2550 
2551 	open_args.oc_path = path;
2552 
2553 	open_args.oc_path_rnr_retry_cnt	= 7;
2554 	open_args.oc_path_retry_cnt = 7;
2555 
2556 	/* We don't do RDMA */
2557 	open_args.oc_rdma_ra_out = 0;
2558 	open_args.oc_rdma_ra_in	= 0;
2559 
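
	/*
	 * The REQ private data carries our UD QPN and RC MTU in network
	 * byte order; the peer returns its own values in the REP private
	 * data (the hello ack buffer supplied above).
	 */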
2560 	hello_req_msg.reserved_qpn = htonl(state->id_qpnum);
2561 	hello_req_msg.rx_mtu = htonl(state->rc_mtu);
2562 	open_args.oc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2563 	open_args.oc_priv_data = (void *)(&hello_req_msg);
2564 
2565 	ASSERT(open_args.oc_priv_data_len <= IBT_REQ_PRIV_DATA_SZ);
2566 	ASSERT(open_returns.rc_priv_data_len <= IBT_REP_PRIV_DATA_SZ);
2567 	ASSERT(open_args.oc_cm_handler != NULL);
2568 
2569 	status = ibt_open_rc_channel(chan->chan_hdl, IBT_OCHAN_NO_FLAGS,
2570 	    IBT_BLOCKING, &open_args, &open_returns);
2571 
2572 	if (status == IBT_SUCCESS) {
2573 		/* Success! */
2574 		DPRINT(2, "ibd_rc_connect: call ibt_open_rc_channel succ!");
2575 		state->rc_conn_succ++;
2576 		kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2577 		return (IBT_SUCCESS);
2578 	}
2579 
2580 	/* failure */
2581 	(void) ibt_flush_channel(chan->chan_hdl);
2582 	ibd_rc_free_chan(chan);
2583 	ace->ac_chan = NULL;
2584 
2585 	/* check open_returns, report the error and exit */
2586 	DPRINT(30, "ibd_rc_connect: ibt_open_rc_channel() failed. "
2587 	    "ret status = %d, reason=%d, ace=%p, mtu=0x%x, qpn=0x%x,"
2588 	    " peer qpn=0x%x", status, (int)open_returns.rc_status, ace,
2589 	    hello_req_msg.rx_mtu, hello_req_msg.reserved_qpn,
2590 	    ace->ac_dest->ud_dst_qpn);
2591 	kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2592 	return (status);
2593 }
2594 
2595 void
2596 ibd_rc_signal_act_close(ibd_state_t *state, ibd_ace_t *ace)
2597 {
2598 	ibd_req_t *req;
2599 
2600 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
2601 	if (req == NULL) {
2602 		ibd_print_warn(state, "ibd_rc_signal_act_close: alloc "
2603 		    "ibd_req_t fail");
2604 		mutex_enter(&state->rc_obs_act_chan_list.chan_list_mutex);
2605 		ace->ac_chan->next = state->rc_obs_act_chan_list.chan_list;
2606 		state->rc_obs_act_chan_list.chan_list = ace->ac_chan;
2607 		mutex_exit(&state->rc_obs_act_chan_list.chan_list_mutex);
2608 	} else {
2609 		req->rq_ptr = ace->ac_chan;
2610 		ibd_queue_work_slot(state, req, IBD_ASYNC_RC_CLOSE_ACT_CHAN);
2611 	}
2612 }
2613 
2614 void
2615 ibd_rc_signal_ace_recycle(ibd_state_t *state, ibd_ace_t *ace)
2616 {
2617 	ibd_req_t *req;
2618 
2619 	mutex_enter(&state->rc_ace_recycle_lock);
2620 	if (state->rc_ace_recycle != NULL) {
2621 		mutex_exit(&state->rc_ace_recycle_lock);
2622 		return;
2623 	}
2624 
2625 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
2626 	if (req == NULL) {
2627 		mutex_exit(&state->rc_ace_recycle_lock);
2628 		return;
2629 	}
2630 
2631 	state->rc_ace_recycle = ace;
2632 	mutex_exit(&state->rc_ace_recycle_lock);
2633 	ASSERT(ace->ac_mce == NULL);
2634 	INC_REF(ace, 1);
2635 	IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
2636 	req->rq_ptr = ace;
2637 	ibd_queue_work_slot(state, req, IBD_ASYNC_RC_RECYCLE_ACE);
2638 }
2639 
2640 static void
2641 ibd_rc_act_close(ibd_rc_chan_t *chan)
2642 {
2643 	uint_t times;
2644 	ibt_status_t ret;
2645 
2646 	ASSERT(chan != NULL);
2647 
2648 	chan->state->rc_act_close++;
2649 	switch (chan->chan_state) {
2650 	case IBD_RC_STATE_ACT_CLOSING:	/* stale, close it */
2651 	case IBD_RC_STATE_ACT_ESTAB:
2652 		DPRINT(30, "ibd_rc_act_close-1: close and free chan, "
2653 		    "act_state=%d, chan=%p", chan->chan_state, chan);
2654 		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
2655 		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
2656 		/* Wait until the send queue is empty */
2657 		times = 0;
2658 		mutex_enter(&chan->tx_wqe_list.dl_mutex);
2659 		mutex_enter(&chan->tx_rel_list.dl_mutex);
2660 		while (((chan->tx_wqe_list.dl_cnt + chan->tx_rel_list.dl_cnt)
2661 		    != chan->scq_size) && (times < 50)) {
2662 			DPRINT(30, "ibd_rc_act_close: dl_cnt(tx_wqe_list=%d,"
2663 			    " tx_rel_list=%d) != chan->scq_size=%d",
2664 			    chan->tx_wqe_list.dl_cnt, chan->tx_rel_list.dl_cnt,
2665 			    chan->scq_size);
2666 			mutex_exit(&chan->tx_rel_list.dl_mutex);
2667 			mutex_exit(&chan->tx_wqe_list.dl_mutex);
2668 			mutex_enter(&chan->tx_poll_lock);
2669 			if (chan->tx_poll_busy & IBD_CQ_POLLING) {
2670 				DPRINT(40, "ibd_rc_act_close: multiple "
2671 				    "polling threads");
2672 				mutex_exit(&chan->tx_poll_lock);
2673 			} else {
2674 				chan->tx_poll_busy = IBD_CQ_POLLING;
2675 				mutex_exit(&chan->tx_poll_lock);
2676 				ibd_rc_drain_scq(chan, chan->scq_hdl);
2677 				mutex_enter(&chan->tx_poll_lock);
2678 				chan->tx_poll_busy = 0;
2679 				mutex_exit(&chan->tx_poll_lock);
2680 			}
2681 			delay(drv_usectohz(100000));
2682 			times++;
2683 			mutex_enter(&chan->tx_wqe_list.dl_mutex);
2684 			mutex_enter(&chan->tx_rel_list.dl_mutex);
2685 		}
2686 		mutex_exit(&chan->tx_rel_list.dl_mutex);
2687 		mutex_exit(&chan->tx_wqe_list.dl_mutex);
2688 		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
2689 		ret = ibt_close_rc_channel(chan->chan_hdl,
2690 		    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
2691 		if (ret != IBT_SUCCESS) {
2692 			DPRINT(40, "ibd_rc_act_close-2: ibt_close_rc_channel "
2693 			    "fail, chan=%p, returned=%d", chan, ret);
2694 		} else {
2695 			DPRINT(30, "ibd_rc_act_close-2: ibt_close_rc_channel "
2696 			    "succ, chan=%p", chan);
2697 		}
2698 
2699 		ibd_rc_free_chan(chan);
2700 		break;
2701 	case IBD_RC_STATE_ACT_REP_RECV:
2702 		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
2703 		(void) ibt_flush_channel(chan->chan_hdl);
2704 		ibd_rc_free_chan(chan);
2705 		break;
2706 	case IBD_RC_STATE_ACT_ERROR:
2707 		DPRINT(40, "ibd_rc_act_close: IBD_RC_STATE_ERROR branch");
2708 		break;
2709 	default:
2710 		DPRINT(40, "ibd_rc_act_close: default branch, act_state=%d, "
2711 		    "chan=%p", chan->chan_state, chan);
2712 	}
2713 }
2714 
2715 static int
2716 ibd_rc_pas_close(ibd_rc_chan_t *chan)
2717 {
2718 	uint_t times;
2719 	ibt_status_t ret;
2720 
2721 	ASSERT(chan != NULL);
2722 	chan->state->rc_pas_close++;
2723 
2724 	switch (chan->chan_state) {
2725 	case IBD_RC_STATE_PAS_ESTAB:
2726 		/*
2727 		 * First, stop receive interrupts; this stops the
2728 		 * connection from handing up buffers to higher layers.
2729 		 * Wait for receive buffers to be returned; give up
2730 		 * after 5 seconds.
2731 		 */
2732 		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
2733 		if (!chan->state->rc_enable_srq) {
2734 			times = 50;
2735 			while (chan->rx_wqe_list.dl_bufs_outstanding > 0) {
2736 				delay(drv_usectohz(100000));
2737 				if (--times == 0) {
2738 					DPRINT(40, "ibd_rc_pas_close: "
2739 					    "reclaiming failed");
2740 					ibd_rc_poll_rcq(chan, chan->rcq_hdl);
2741 					ibt_set_cq_handler(chan->rcq_hdl,
2742 					    ibd_rc_rcq_handler,
2743 					    (void *)(uintptr_t)chan);
2744 					return (DDI_FAILURE);
2745 				}
2746 			}
2747 		}
2748 		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
2749 		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
2750 		DPRINT(30, "ibd_rc_pas_close-1: close and free chan, "
2751 		    "chan_state=%d, chan=%p", chan->chan_state, chan);
2752 		ret = ibt_close_rc_channel(chan->chan_hdl,
2753 		    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
2754 		if (ret != IBT_SUCCESS) {
2755 			DPRINT(40, "ibd_rc_pas_close-2: ibt_close_rc_channel()"
2756 			    " fail, chan=%p, returned=%d", chan, ret);
2757 		} else {
2758 			DPRINT(30, "ibd_rc_pas_close-2: ibt_close_rc_channel()"
2759 			    " succ, chan=%p", chan);
2760 		}
2761 
2762 		ibd_rc_free_chan(chan);
2763 		break;
2764 	case IBD_RC_STATE_PAS_REQ_RECV:
2765 		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
2766 		(void) ibt_flush_channel(chan->chan_hdl);
2767 		ibd_rc_free_chan(chan);
2768 		break;
2769 	default:
2770 		DPRINT(40, "ibd_rc_pas_close: default, chan_state=%d, chan=%p",
2771 		    chan->chan_state, chan);
2772 	}
2773 	return (DDI_SUCCESS);
2774 }
2775 
2776 /*
2777  * Remove a duplicate RC channel that comes from the same MAC.
2778  *
2779  * From the IP point of view, we could check for same MAC:
2780  * GID, P_Key (or QPN, though in a reboot this is likely to
2781  * change so P_Key is better). The GID usually will equate to
2782  * port (since typically it uses the port GUID in the low 64 bits).
2783  * These fields exist in the REQ message.
2784  */
2785 void
2786 ibd_rc_handle_req_rm_dup(ibd_state_t *state, ibt_cm_event_t *ibt_cm_event)
2787 {
2788 	ibd_rc_chan_t *chan, *pre_chan;
2789 
2790 	pre_chan = NULL;
2791 	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
2792 	chan = state->rc_pass_chan_list.chan_list;
2793 	while (chan != NULL) {
2794 		if ((bcmp(&chan->requester_gid,
2795 		    &ibt_cm_event->cm_event.req.req_prim_addr.av_dgid,
2796 		    sizeof (ib_gid_t)) == 0) && (chan->requester_pkey ==
2797 		    ibt_cm_event->cm_event.req.req_pkey)) {
2798 			if (pre_chan == NULL) {
2799 				state->rc_pass_chan_list.chan_list = chan->next;
2800 			} else {
2801 				pre_chan->next = chan->next;
2802 			}
2803 			break;
2804 		}
2805 		pre_chan = chan;
2806 		chan = chan->next;
2807 	}
2808 	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
2809 	if (chan) {
2810 		DPRINT(30, "ibd_rc_handle_req_rm_dup: same gid and pkey, "
2811 		    "remove duplicate channel, chan=%p", chan);
2812 		if (ibd_rc_pas_close(chan) != DDI_SUCCESS) {
2813 			ibd_rc_add_to_chan_list(&state->rc_pass_chan_list,
2814 			    chan);
2815 		}
2816 	}
2817 }
2818 
2819 /*
2820  * Passive Side:
2821  *	Handle an incoming CM REQ from active side.
2822  *
2823  *	If success, this function allocates an ibd_rc_chan_t, then
2824  * assigns it to "*ret_conn".
2825  */
2826 static ibt_cm_status_t
2827 ibd_rc_handle_req(void *arg, ibd_rc_chan_t **ret_conn,
2828     ibt_cm_event_t *ibt_cm_event, ibt_cm_return_args_t *ret_args,
2829     void *ret_priv_data)
2830 {
2831 	ibd_rc_msg_hello_t *hello_msg;
2832 	ibd_state_t *state = (ibd_state_t *)arg;
2833 	ibd_rc_chan_t *chan;
2834 
2835 	ibd_rc_handle_req_rm_dup(state, ibt_cm_event);
2836 
2837 	if (ibd_rc_alloc_chan(&chan, state, B_FALSE) != IBT_SUCCESS) {
2838 		DPRINT(40, "ibd_rc_handle_req: ibd_rc_alloc_chan() failed");
2839 		return (IBT_CM_REJECT);
2840 	}
2841 
2842 	ibd_rc_add_to_chan_list(&state->rc_pass_chan_list, chan);
2843 
2844 	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)chan);
2845 
2846 	if (!state->rc_enable_srq) {
2847 		if (ibd_rc_init_rxlist(chan) != DDI_SUCCESS) {
2848 			ibd_rc_free_chan(chan);
2849 			DPRINT(40, "ibd_rc_handle_req: ibd_rc_init_rxlist() "
2850 			    "failed");
2851 			return (IBT_CM_REJECT);
2852 		}
2853 	}
2854 
2855 	ret_args->cm_ret.rep.cm_channel = chan->chan_hdl;
2856 
2857 	/* We don't do RDMA */
2858 	ret_args->cm_ret.rep.cm_rdma_ra_out = 0;
2859 	ret_args->cm_ret.rep.cm_rdma_ra_in = 0;
2860 
2861 	ret_args->cm_ret.rep.cm_rnr_retry_cnt = 7;
2862 	ret_args->cm_ret_len = sizeof (ibd_rc_msg_hello_t);
2863 
2864 	hello_msg = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
2865 	DPRINT(30, "ibd_rc_handle_req(): peer qpn=0x%x, peer mtu=0x%x",
2866 	    ntohl(hello_msg->reserved_qpn), ntohl(hello_msg->rx_mtu));
2867 
2868 	hello_msg = (ibd_rc_msg_hello_t *)ret_priv_data;
2869 	hello_msg->reserved_qpn = htonl(state->id_qpnum);
2870 	hello_msg->rx_mtu = htonl(state->rc_mtu);
2871 
2872 	chan->requester_gid = ibt_cm_event->cm_event.req.req_prim_addr.av_dgid;
2873 	chan->requester_pkey = ibt_cm_event->cm_event.req.req_pkey;
2874 	chan->chan_state = IBD_RC_STATE_PAS_REQ_RECV;	/* ready to receive */
2875 	*ret_conn = chan;
2876 
2877 	return (IBT_CM_ACCEPT);
2878 }
2879 
2880 /*
2881  * ibd_rc_handle_act_estab -- handler for connection established completion
2882  * for active side.
2883  */
2884 static ibt_cm_status_t
2885 ibd_rc_handle_act_estab(ibd_ace_t *ace)
2886 {
2887 	ibt_status_t result;
2888 
2889 	switch (ace->ac_chan->chan_state) {
2890 		case IBD_RC_STATE_ACT_REP_RECV:
2891 			ace->ac_chan->chan_state = IBD_RC_STATE_ACT_ESTAB;
2892 			result = ibt_enable_cq_notify(ace->ac_chan->rcq_hdl,
2893 			    IBT_NEXT_COMPLETION);
2894 			if (result != IBT_SUCCESS) {
2895 				DPRINT(40, "ibd_rc_handle_act_estab: "
2896 				    "ibt_enable_cq_notify(rcq) "
2897 				    "failed: status %d", result);
2898 				return (IBT_CM_REJECT);
2899 			}
2900 			break;
2901 		default:
2902 			DPRINT(40, "ibd_rc_handle_act_estab: default "
2903 			    "branch, act_state=%d", ace->ac_chan->chan_state);
2904 			return (IBT_CM_REJECT);
2905 	}
2906 	return (IBT_CM_ACCEPT);
2907 }
2908 
2909 /*
2910  * ibd_rc_handle_pas_estab -- handler for connection established completion
2911  * for passive side.
2912  */
2913 static ibt_cm_status_t
2914 ibd_rc_handle_pas_estab(ibd_rc_chan_t *chan)
2915 {
2916 	ibt_status_t result;
2917 
2918 	switch (chan->chan_state) {
2919 		case IBD_RC_STATE_PAS_REQ_RECV:
2920 			chan->chan_state = IBD_RC_STATE_PAS_ESTAB;
2921 
2922 			result = ibt_enable_cq_notify(chan->rcq_hdl,
2923 			    IBT_NEXT_COMPLETION);
2924 			if (result != IBT_SUCCESS) {
2925 				DPRINT(40, "ibd_rc_handle_pas_estab: "
2926 				    "ibt_enable_cq_notify(rcq) "
2927 				    "failed: status %d", result);
2928 				return (IBT_CM_REJECT);
2929 			}
2930 			break;
2931 		default:
2932 			DPRINT(40, "ibd_rc_handle_pas_estab: default "
2933 			    "branch, chan_state=%d", chan->chan_state);
2934 			return (IBT_CM_REJECT);
2935 	}
2936 	return (IBT_CM_ACCEPT);
2937 }
2938 
2939 /* ARGSUSED */
2940 static ibt_cm_status_t
2941 ibd_rc_dispatch_actv_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
2942     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
2943     ibt_priv_data_len_t ret_len_max)
2944 {
2945 	ibt_cm_status_t result = IBT_CM_ACCEPT;
2946 	ibd_ace_t *ace = (ibd_ace_t *)(uintptr_t)arg;
2947 	ibd_rc_chan_t *rc_chan;
2948 	ibd_state_t *state;
2949 	ibd_rc_msg_hello_t *hello_ack;
2950 	uint_t times;
2951 
2952 	switch (ibt_cm_event->cm_type) {
2953 	case IBT_CM_EVENT_REP_RCV:
2954 		ASSERT(ace->ac_chan != NULL);
2955 		ASSERT(ace->ac_chan->chan_state == IBD_RC_STATE_INIT);
2956 		hello_ack = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
2957 		DPRINT(30, "ibd_rc_dispatch_actv_mad: hello_ack->mtu=0x%x, "
2958 		    "hello_ack->qpn=0x%x", ntohl(hello_ack->rx_mtu),
2959 		    ntohl(hello_ack->reserved_qpn));
2960 		ace->ac_chan->chan_state = IBD_RC_STATE_ACT_REP_RECV;
2961 		break;
2962 
2963 	case IBT_CM_EVENT_CONN_EST:
2964 		ASSERT(ace->ac_chan != NULL);
2965 		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_CONN_EST, "
2966 		    "ace=%p, act_state=%d, chan=%p",
2967 		    ace, ace->ac_chan->chan_state, ace->ac_chan);
2968 		result = ibd_rc_handle_act_estab(ace);
2969 		break;
2970 
2971 	case IBT_CM_EVENT_CONN_CLOSED:
2972 		rc_chan = ace->ac_chan;
2973 		if (rc_chan == NULL) {
2974 			DPRINT(40, "ibd_rc_dispatch_actv_mad: "
2975 			    "rc_chan==NULL, IBT_CM_EVENT_CONN_CLOSED");
2976 			return (IBT_CM_ACCEPT);
2977 		}
2978 		state = rc_chan->state;
2979 		mutex_enter(&state->id_ac_mutex);
2980 		if ((rc_chan->chan_state == IBD_RC_STATE_ACT_ESTAB) &&
2981 		    ((ace = ibd_acache_find(state, &ace->ac_mac, B_FALSE, 0))
2982 		    != NULL) && (ace == rc_chan->ace)) {
2983 			rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
2984 			ASSERT(ace->ac_mce == NULL);
2985 			INC_REF(ace, 1);
2986 			IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
2987 			mutex_exit(&state->id_ac_mutex);
2988 			DPRINT(30, "ibd_rc_dispatch_actv_mad: "
2989 			    "IBT_CM_EVENT_CONN_CLOSED, ace=%p, chan=%p, "
2990 			    "reason=%d", ace, rc_chan,
2991 			    ibt_cm_event->cm_event.closed);
2992 		} else {
2993 			mutex_exit(&state->id_ac_mutex);
2994 			state->rc_act_close_simultaneous++;
2995 			DPRINT(40, "ibd_rc_dispatch_actv_mad: other thread "
2996 			    "is closing it, IBT_CM_EVENT_CONN_CLOSED, "
2997 			    "chan_state=%d", rc_chan->chan_state);
2998 			return (IBT_CM_ACCEPT);
2999 		}
3000 		/* wait until the send queue is drained */
3001 		times = 0;
3002 		mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
3003 		mutex_enter(&rc_chan->tx_rel_list.dl_mutex);
3004 		while (((rc_chan->tx_wqe_list.dl_cnt +
3005 		    rc_chan->tx_rel_list.dl_cnt)
3006 		    != rc_chan->scq_size) && (times < 50)) {
3007 			DPRINT(40, "ibd_rc_dispatch_actv_mad: dl_cnt"
3008 			    "(tx_wqe_list=%d, tx_rel_list=%d) != "
3009 			    "chan->scq_size=%d",
3010 			    rc_chan->tx_wqe_list.dl_cnt,
3011 			    rc_chan->tx_rel_list.dl_cnt,
3012 			    rc_chan->scq_size);
3013 			mutex_exit(&rc_chan->tx_rel_list.dl_mutex);
3014 			mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
3015 			mutex_enter(&rc_chan->tx_poll_lock);
3016 			if (rc_chan->tx_poll_busy & IBD_CQ_POLLING) {
3017 				DPRINT(40, "ibd_rc_dispatch_actv_mad: "
3018 				    "multiple polling threads");
3019 				mutex_exit(&rc_chan->tx_poll_lock);
3020 			} else {
3021 				rc_chan->tx_poll_busy = IBD_CQ_POLLING;
3022 				mutex_exit(&rc_chan->tx_poll_lock);
3023 				ibd_rc_drain_scq(rc_chan, rc_chan->scq_hdl);
3024 				mutex_enter(&rc_chan->tx_poll_lock);
3025 				rc_chan->tx_poll_busy = 0;
3026 				mutex_exit(&rc_chan->tx_poll_lock);
3027 			}
3028 			delay(drv_usectohz(100000));
3029 			times++;
3030 			mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
3031 			mutex_enter(&rc_chan->tx_rel_list.dl_mutex);
3032 		}
3033 		mutex_exit(&rc_chan->tx_rel_list.dl_mutex);
3034 		mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
3035 		rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
3036 		ibd_rc_free_chan(rc_chan);
3037 		DPRINT(30, "ibd_rc_dispatch_actv_mad: "
3038 		    "IBT_CM_EVENT_CONN_CLOSED, ref=%x", ace->ac_ref);
3039 		mutex_enter(&state->id_ac_mutex);
3040 		ace->ac_chan = NULL;
3041 		ASSERT(ace->ac_ref != 0);
3042 		atomic_dec_32(&ace->ac_ref);
3043 		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
3044 			IBD_ACACHE_INSERT_FREE(state, ace);
3045 			ace->ac_ref = 0;
3046 		} else {
3047 			ace->ac_ref |= CYCLEVAL;
3048 			state->rc_delay_ace_recycle++;
3049 		}
3050 		mutex_exit(&state->id_ac_mutex);
3051 		break;
3052 
3053 	case IBT_CM_EVENT_FAILURE:
3054 		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_FAILURE,"
3055 		    "ace=%p, chan=%p, code: %d, msg: %d, reason=%d",
3056 		    ace, ace->ac_chan,
3057 		    ibt_cm_event->cm_event.failed.cf_code,
3058 		    ibt_cm_event->cm_event.failed.cf_msg,
3059 		    ibt_cm_event->cm_event.failed.cf_reason);
3060 		/*
3061 		 * No need to free resources here; they are freed in
3062 		 * ibd_rc_connect().
3063 		 */
3064 		break;
3065 
3066 	case IBT_CM_EVENT_MRA_RCV:
3067 		DPRINT(40, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_MRA_RCV");
3068 		break;
3069 	case IBT_CM_EVENT_LAP_RCV:
3070 		DPRINT(40, "ibd_rc_dispatch_actv_mad: LAP message received");
3071 		break;
3072 	case IBT_CM_EVENT_APR_RCV:
3073 		DPRINT(40, "ibd_rc_dispatch_actv_mad: APR message received");
3074 		break;
3075 	default:
3076 		DPRINT(40, "ibd_rc_dispatch_actv_mad: default branch, "
3077 		    "ibt_cm_event->cm_type=%d", ibt_cm_event->cm_type);
3078 		break;
3079 	}
3080 
3081 	return (result);
3082 }
3083 
3084 /* ARGSUSED */
3085 static ibt_cm_status_t
3086 ibd_rc_dispatch_pass_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
3087     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
3088     ibt_priv_data_len_t ret_len_max)
3089 {
3090 	ibt_cm_status_t result = IBT_CM_ACCEPT;
3091 	ibd_rc_chan_t *chan;
3092 
3093 	if (ibt_cm_event->cm_type == IBT_CM_EVENT_REQ_RCV) {
3094 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_REQ_RCV,"
3095 		    "req_pkey=%x", ibt_cm_event->cm_event.req.req_pkey);
3096 		/* Receive an incoming CM REQ from active side */
3097 		result = ibd_rc_handle_req(arg, &chan, ibt_cm_event, ret_args,
3098 		    ret_priv_data);
3099 		return (result);
3100 	}
3101 
3102 	if (ibt_cm_event->cm_channel == 0) {
3103 		DPRINT(30, "ibd_rc_dispatch_pass_mad: "
3104 		    "ERROR ibt_cm_event->cm_channel == 0");
3105 		return (IBT_CM_REJECT);
3106 	}
3107 
3108 	chan =
3109 	    (ibd_rc_chan_t *)ibt_get_chan_private(ibt_cm_event->cm_channel);
3110 	if (chan == NULL) {
3111 		DPRINT(40, "ibd_rc_dispatch_pass_mad: chan == NULL");
3112 		return (IBT_CM_REJECT);
3113 	}
3114 
3115 	switch (ibt_cm_event->cm_type) {
3116 	case IBT_CM_EVENT_CONN_EST:
3117 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_EST, "
3118 		    "chan=%p", chan);
3119 		result = ibd_rc_handle_pas_estab(chan);
3120 		break;
3121 	case IBT_CM_EVENT_CONN_CLOSED:
3122 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_CLOSED,"
3123 		    " chan=%p, reason=%d", chan, ibt_cm_event->cm_event.closed);
3124 		ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list, chan);
3125 		ibd_rc_free_chan(chan);
3126 		break;
3127 	case IBT_CM_EVENT_FAILURE:
3128 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_FAILURE,"
3129 		    " chan=%p, code: %d, msg: %d, reason=%d", chan,
3130 		    ibt_cm_event->cm_event.failed.cf_code,
3131 		    ibt_cm_event->cm_event.failed.cf_msg,
3132 		    ibt_cm_event->cm_event.failed.cf_reason);
3133 
3134 		ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list, chan);
3135 		ibd_rc_free_chan(chan);
3136 		return (IBT_CM_ACCEPT);
3137 	case IBT_CM_EVENT_MRA_RCV:
3138 		DPRINT(40, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_MRA_RCV");
3139 		break;
3140 	case IBT_CM_EVENT_LAP_RCV:
3141 		DPRINT(40, "ibd_rc_dispatch_pass_mad: LAP message received");
3142 		break;
3143 	case IBT_CM_EVENT_APR_RCV:
3144 		DPRINT(40, "ibd_rc_dispatch_pass_mad: APR message received");
3145 		break;
3146 	default:
3147 		DPRINT(40, "ibd_rc_dispatch_pass_mad: default, type=%d, "
3148 		    "chan=%p", ibt_cm_event->cm_type, chan);
3149 		break;
3150 	}
3151 
3152 	return (result);
3153 }
3154