1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 /* 28 * An implementation of the IPoIB-CM standard based on PSARC 2009/593. 
29 */ 30 #include <sys/types.h> 31 #include <sys/conf.h> 32 #include <sys/ddi.h> 33 #include <sys/sunddi.h> 34 #include <sys/modctl.h> 35 #include <sys/stropts.h> 36 #include <sys/stream.h> 37 #include <sys/strsun.h> 38 #include <sys/strsubr.h> 39 #include <sys/dlpi.h> 40 #include <sys/mac_provider.h> 41 42 #include <sys/pattr.h> /* for HCK_FULLCKSUM */ 43 #include <sys/atomic.h> /* for atomic_add*() */ 44 #include <sys/ethernet.h> /* for ETHERTYPE_IP */ 45 #include <netinet/in.h> /* for netinet/ip.h below */ 46 #include <netinet/ip.h> /* for struct ip */ 47 #include <inet/common.h> /* for inet/ip.h below */ 48 #include <inet/ip.h> /* for ipha_t */ 49 #include <inet/ip_if.h> /* for ETHERTYPE_IPV6 */ 50 #include <inet/ip6.h> /* for ip6_t */ 51 #include <netinet/icmp6.h> /* for icmp6_t */ 52 #include <sys/ib/ibtl/ibvti.h> /* for ace->ac_dest->ud_dst_qpn */ 53 54 #include <sys/ib/clients/ibd/ibd.h> 55 56 extern ibd_global_state_t ibd_gstate; 57 uint_t ibd_rc_tx_softintr = 1; 58 /* 59 * If the number of WRs in receive queue of each RC connection less than 60 * IBD_RC_RX_WR_THRESHOLD, we will post more receive WRs into it. 61 */ 62 #define IBD_RC_RX_WR_THRESHOLD 0x20 63 64 /* 65 * If the number of free SWQEs (or large Tx buf) is larger than or equal to 66 * IBD_RC_TX_FREE_THRESH, we will call mac_tx_update to notify GLD to continue 67 * transmitting packets. 68 */ 69 #define IBD_RC_TX_FREE_THRESH 8 70 71 #define IBD_RC_QPN_TO_SID(qpn) \ 72 ((uint64_t)(IBD_RC_SERVICE_ID | ((qpn) & 0xffffff))) 73 74 /* For interop with legacy OFED */ 75 #define IBD_RC_QPN_TO_SID_OFED_INTEROP(qpn) \ 76 ((uint64_t)(IBD_RC_SERVICE_ID_OFED_INTEROP | ((qpn) & 0xffffff))) 77 78 /* Internet Header + 64 bits of Data Datagram. 
Refer to RFC 792 */ 79 #define IBD_RC_IP_ICMP_RETURN_DATA_BYTES 64 80 81 82 /* Functions for Reliable Connected Mode */ 83 /* Connection Setup/Close Functions */ 84 static ibt_cm_status_t ibd_rc_dispatch_pass_mad(void *, 85 ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t); 86 static ibt_cm_status_t ibd_rc_dispatch_actv_mad(void *, 87 ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t); 88 static int ibd_rc_pas_close(ibd_rc_chan_t *); 89 static void ibd_rc_act_close(ibd_rc_chan_t *); 90 91 static inline void ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *, 92 ibd_rc_chan_t *); 93 static inline ibd_rc_chan_t *ibd_rc_rm_header_chan_list( 94 ibd_rc_chan_list_t *); 95 static inline void ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *, 96 ibd_rc_chan_t *); 97 98 /* CQ handlers */ 99 static void ibd_rc_rcq_handler(ibt_cq_hdl_t, void *); 100 static void ibd_rc_scq_handler(ibt_cq_hdl_t, void *); 101 static void ibd_rc_poll_rcq(ibd_rc_chan_t *, ibt_cq_hdl_t); 102 103 /* Receive Functions */ 104 static int ibd_rc_post_srq(ibd_state_t *, ibd_rwqe_t *); 105 static void ibd_rc_srq_freemsg_cb(char *); 106 static void ibd_rc_srq_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 107 108 static int ibd_rc_post_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *); 109 static void ibd_rc_freemsg_cb(char *); 110 static void ibd_rc_process_rx(ibd_rc_chan_t *, ibd_rwqe_t *, ibt_wc_t *); 111 static void ibd_rc_free_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *); 112 static void ibd_rc_fini_rxlist(ibd_rc_chan_t *); 113 114 115 /* Send Functions */ 116 static void ibd_rc_release_swqe(ibd_rc_chan_t *, ibd_swqe_t *); 117 static int ibd_rc_init_txlist(ibd_rc_chan_t *); 118 static void ibd_rc_fini_txlist(ibd_rc_chan_t *); 119 static uint_t ibd_rc_tx_recycle(caddr_t); 120 121 122 void 123 ibd_async_rc_close_act_chan(ibd_state_t *state, ibd_req_t *req) 124 { 125 ibd_rc_chan_t *rc_chan = req->rq_ptr; 126 ibd_ace_t *ace; 127 128 while (rc_chan != NULL) { 129 ace = rc_chan->ace; 130 ASSERT(ace != 
NULL); 131 /* Close old RC channel */ 132 ibd_rc_act_close(rc_chan); 133 mutex_enter(&state->id_ac_mutex); 134 ASSERT(ace->ac_ref != 0); 135 atomic_dec_32(&ace->ac_ref); 136 ace->ac_chan = NULL; 137 if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) { 138 IBD_ACACHE_INSERT_FREE(state, ace); 139 ace->ac_ref = 0; 140 } else { 141 ace->ac_ref |= CYCLEVAL; 142 state->rc_delay_ace_recycle++; 143 } 144 mutex_exit(&state->id_ac_mutex); 145 rc_chan = ibd_rc_rm_header_chan_list( 146 &state->rc_obs_act_chan_list); 147 } 148 } 149 150 void 151 ibd_async_rc_recycle_ace(ibd_state_t *state, ibd_req_t *req) 152 { 153 ibd_ace_t *ace = req->rq_ptr; 154 ibd_rc_chan_t *rc_chan; 155 156 ASSERT(ace != NULL); 157 rc_chan = ace->ac_chan; 158 ASSERT(rc_chan != NULL); 159 /* Close old RC channel */ 160 ibd_rc_act_close(rc_chan); 161 mutex_enter(&state->id_ac_mutex); 162 ASSERT(ace->ac_ref != 0); 163 atomic_dec_32(&ace->ac_ref); 164 ace->ac_chan = NULL; 165 if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) { 166 IBD_ACACHE_INSERT_FREE(state, ace); 167 ace->ac_ref = 0; 168 } else { 169 ace->ac_ref |= CYCLEVAL; 170 state->rc_delay_ace_recycle++; 171 } 172 mutex_exit(&state->id_ac_mutex); 173 mutex_enter(&state->rc_ace_recycle_lock); 174 state->rc_ace_recycle = NULL; 175 mutex_exit(&state->rc_ace_recycle_lock); 176 } 177 178 /* Simple ICMP IP Header Template */ 179 static const ipha_t icmp_ipha = { 180 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP 181 }; 182 183 /* Packet is too big. 
 * Send ICMP packet to GLD to request a smaller MTU
 */
void
ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req)
{
	mblk_t *mp = req->rq_ptr;	/* the over-sized outbound packet */
	ibd_ace_t *ace = req->rq_ptr2;
	/* MTU advertised to IP excludes the IPoIB header */
	uint16_t mtu = state->id_mtu - IPOIB_HDRSIZE;
	uint_t len_needed;
	size_t msg_len;
	mblk_t *pmtu_mp;	/* the ICMP "too big" message we build */
	ushort_t sap;
	ib_header_info_t *ibha;	/* ib header for pmtu_pkt */
	/*
	 * ipha: IP header for pmtu_pkt
	 * old_ipha: IP header for old packet
	 */
	ipha_t *ipha, *old_ipha;
	icmph_t *icmph;

	/* Remember the original SAP so the loopback frame matches it */
	sap = ntohs(((ipoib_hdr_t *)mp->b_rptr)->ipoib_type);

	if (!pullupmsg(mp, -1)) {
		DPRINT(40, "ibd_async_rc_process_too_big: pullupmsg fail");
		goto too_big_fail;
	}
	/* move to IP header. */
	mp->b_rptr += IPOIB_HDRSIZE;
	old_ipha = (ipha_t *)mp->b_rptr;

	/*
	 * The ICMP error must quote the offending IP header (including
	 * any encapsulated IP/IPv6 header) plus the first 64 bits/bytes
	 * of its data; see IBD_RC_IP_ICMP_RETURN_DATA_BYTES (RFC 792).
	 */
	len_needed = IPH_HDR_LENGTH(old_ipha);
	if (old_ipha->ipha_protocol == IPPROTO_ENCAP) {
		len_needed += IPH_HDR_LENGTH(((uchar_t *)old_ipha +
		    len_needed));
	} else if (old_ipha->ipha_protocol == IPPROTO_IPV6) {
		ip6_t *ip6h = (ip6_t *)((uchar_t *)old_ipha
		    + len_needed);
		len_needed += ip_hdr_length_v6(mp, ip6h);
	}
	len_needed += IBD_RC_IP_ICMP_RETURN_DATA_BYTES;
	msg_len = msgdsize(mp);
	if (msg_len > len_needed) {
		/* Trim the quoted packet down to what the error needs */
		(void) adjmsg(mp, len_needed - msg_len);
		msg_len = len_needed;
	}

	/* Allocate the leading block: IB header + IP header + ICMP */
	if ((pmtu_mp = allocb(sizeof (ib_header_info_t) + sizeof (ipha_t)
	    + sizeof (icmph_t), BPRI_MED)) == NULL) {
		DPRINT(40, "ibd_async_rc_process_too_big: allocb fail");
		goto too_big_fail;
	}
	pmtu_mp->b_cont = mp;
	pmtu_mp->b_wptr = pmtu_mp->b_rptr + sizeof (ib_header_info_t)
	    + sizeof (ipha_t) + sizeof (icmph_t);

	ibha = (ib_header_info_t *)pmtu_mp->b_rptr;

	/* Fill IB header */
	bcopy(&state->id_macaddr, &ibha->ib_dst, IPOIB_ADDRL);
	/*
	 * If the GRH is not valid, indicate to GLDv3 by setting
	 * the VerTcFlow field to 0.
	 */
	ibha->ib_grh.ipoib_vertcflow = 0;
	ibha->ipib_rhdr.ipoib_type = htons(sap);
	ibha->ipib_rhdr.ipoib_mbz = 0;

	/* Fill IP header: swap src/dst so it returns to the sender */
	ipha = (ipha_t *)&ibha[1];
	*ipha = icmp_ipha;
	ipha->ipha_src = old_ipha->ipha_dst;
	ipha->ipha_dst = old_ipha->ipha_src;
	ipha->ipha_ttl = old_ipha->ipha_ttl;
	msg_len += sizeof (icmp_ipha) + sizeof (icmph_t);
	if (msg_len > IP_MAXPACKET) {
		ibd_print_warn(state, "ibd_rc_process_too_big_pkt: msg_len(%d) "
		    "> IP_MAXPACKET", (uint32_t)msg_len);
		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
		msg_len = IP_MAXPACKET;
	}
	ipha->ipha_length = htons((uint16_t)msg_len);
	ipha->ipha_hdr_checksum = 0;
	ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);

	/* Fill ICMP body: "fragmentation needed", advertising our MTU */
	icmph = (icmph_t *)&ipha[1];
	bzero(icmph, sizeof (icmph_t));
	icmph->icmph_type = ICMP_DEST_UNREACHABLE;
	icmph->icmph_code = ICMP_FRAGMENTATION_NEEDED;
	icmph->icmph_du_mtu = htons(mtu);
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(pmtu_mp,
	    (int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0);

	/* Tell the stack the checksum is already verified */
	(void) hcksum_assoc(pmtu_mp, NULL, NULL, 0, 0, 0, 0,
	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);

	DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, "
	    "ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d",
	    sap, ipha->ipha_src, ipha->ipha_dst, ipha->ipha_ttl,
	    len_needed, (uint32_t)msg_len);

	/* Loop the ICMP error back up through GLDv3 */
	mac_rx(state->id_mh, state->id_rh, pmtu_mp);

	mutex_enter(&ace->tx_too_big_mutex);
	ace->tx_too_big_ongoing = B_FALSE;
	mutex_exit(&ace->tx_too_big_mutex);
	return;

too_big_fail:
	/* Drop packet */
	freemsg(mp);
	mutex_enter(&ace->tx_too_big_mutex);
	ace->tx_too_big_ongoing = B_FALSE;
	mutex_exit(&ace->tx_too_big_mutex);
}

#ifdef DEBUG
/*
 * ibd_rc_update_stats - update driver private kstat counters
 *
 * This routine will dump the internal statistics counters for
 * ibd's Reliable Connected Mode.  The current stats dump values will
 * be sent to the kernel status area.
 */
static int
ibd_rc_update_stats(kstat_t *ksp, int rw)
{
	ibd_state_t *state;
	ibd_rc_stat_t *ibd_rc_ksp;

	/* The RC-mode statistics are read-only via kstat */
	if (rw == KSTAT_WRITE)
		return (EACCES);

	state = (ibd_state_t *)ksp->ks_private;
	ASSERT(state != NULL);
	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;

	/* Receive-side counters */
	ibd_rc_ksp->rc_rcv_trans_byte.value.ul = state->rc_rcv_trans_byte;
	ibd_rc_ksp->rc_rcv_trans_pkt.value.ul = state->rc_rcv_trans_pkt;
	ibd_rc_ksp->rc_rcv_copy_byte.value.ul = state->rc_rcv_copy_byte;
	ibd_rc_ksp->rc_rcv_copy_pkt.value.ul = state->rc_rcv_copy_pkt;
	ibd_rc_ksp->rc_rcv_alloc_fail.value.ul = state->rc_rcv_alloc_fail;

	/* CQ handler activity */
	ibd_rc_ksp->rc_rcq_invoke.value.ul = state->rc_rcq_invoke;
	ibd_rc_ksp->rc_rcq_err.value.ul = state->rc_rcq_err;
	ibd_rc_ksp->rc_scq_invoke.value.ul = state->rc_scq_invoke;

	ibd_rc_ksp->rc_rwqe_short.value.ul = state->rc_rwqe_short;

	/* Transmit-side counters */
	ibd_rc_ksp->rc_xmt_bytes.value.ul = state->rc_xmt_bytes;
	ibd_rc_ksp->rc_xmt_small_pkt.value.ul = state->rc_xmt_small_pkt;
	ibd_rc_ksp->rc_xmt_fragmented_pkt.value.ul =
	    state->rc_xmt_fragmented_pkt;
	ibd_rc_ksp->rc_xmt_map_fail_pkt.value.ul = state->rc_xmt_map_fail_pkt;
	ibd_rc_ksp->rc_xmt_map_succ_pkt.value.ul = state->rc_xmt_map_succ_pkt;
	ibd_rc_ksp->rc_ace_not_found.value.ul = state->rc_ace_not_found;

	/* Tx resource shortages and flow-control restarts */
	ibd_rc_ksp->rc_scq_no_swqe.value.ul = state->rc_scq_no_swqe;
	ibd_rc_ksp->rc_scq_no_largebuf.value.ul = state->rc_scq_no_largebuf;
	ibd_rc_ksp->rc_swqe_short.value.ul = state->rc_swqe_short;
	ibd_rc_ksp->rc_swqe_mac_update.value.ul = state->rc_swqe_mac_update;
	ibd_rc_ksp->rc_xmt_buf_short.value.ul = state->rc_xmt_buf_short;
	ibd_rc_ksp->rc_xmt_buf_mac_update.value.ul =
	    state->rc_xmt_buf_mac_update;

	/* Connection management counters */
	ibd_rc_ksp->rc_conn_succ.value.ul = state->rc_conn_succ;
	ibd_rc_ksp->rc_conn_fail.value.ul = state->rc_conn_fail;
	ibd_rc_ksp->rc_null_conn.value.ul = state->rc_null_conn;
	ibd_rc_ksp->rc_no_estab_conn.value.ul = state->rc_no_estab_conn;

	ibd_rc_ksp->rc_act_close.value.ul = state->rc_act_close;
	ibd_rc_ksp->rc_pas_close.value.ul = state->rc_pas_close;
	ibd_rc_ksp->rc_delay_ace_recycle.value.ul = state->rc_delay_ace_recycle;
	ibd_rc_ksp->rc_act_close_simultaneous.value.ul =
	    state->rc_act_close_simultaneous;
	ibd_rc_ksp->rc_reset_cnt.value.ul = state->rc_reset_cnt;

	return (0);
}


/*
 * ibd_rc_init_stats - initialize kstat data structures
 *
 * This routine will create and initialize the driver private
 * statistics counters.
 */
int
ibd_rc_init_stats(ibd_state_t *state)
{
	kstat_t *ksp;
	ibd_rc_stat_t *ibd_rc_ksp;
	char stat_name[32];
	int inst;

	/*
	 * Create and init kstat; the name encodes both the instance
	 * number and the partition key of this port.
	 */
	inst = ddi_get_instance(state->id_dip);
	(void) snprintf(stat_name, 31, "statistics%d_%x", inst, state->id_pkey);
	ksp = kstat_create("ibd", 0, stat_name, "net", KSTAT_TYPE_NAMED,
	    sizeof (ibd_rc_stat_t) / sizeof (kstat_named_t), 0);

	if (ksp == NULL) {
		ibd_print_warn(state, "ibd_rc_init_stats: Could not create "
		    "kernel statistics");
		return (DDI_FAILURE);
	}

	state->rc_ksp = ksp;	/* Fill in the ksp of ibd over RC mode */

	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;

	/*
	 * Initialize all the statistics
	 */
	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_byte, "RC: Rx Bytes, "
	    "transfer mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_pkt, "RC: Rx Pkts, "
	    "transfer mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_byte, "RC: Rx Bytes, "
	    "copy mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_pkt, "RC: Rx Pkts, "
	    "copy mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_alloc_fail, "RC: Rx alloc fail",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcq_invoke, "RC: invoke of Recv CQ "
	    "handler", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcq_err, "RC: fail in Recv CQ handler",
	    KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_scq_invoke, "RC: invoke of Send CQ "
	    "handler", KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_rwqe_short, "RC: Short rwqe",
	    KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_xmt_bytes, "RC: Sent Bytes",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_small_pkt,
	    "RC: Tx pkt small size", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_fragmented_pkt,
	    "RC: Tx pkt fragmentary", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_fail_pkt,
	    "RC: Tx pkt fail ibt_map_mem_iov()", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_succ_pkt,
	    "RC: Tx pkt succ ibt_map_mem_iov()", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_ace_not_found, "RC: ace not found",
	    KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_scq_no_swqe, "RC: No swqe after "
	    "recycle", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_scq_no_largebuf, "RC: No large tx buf "
	    "after recycle", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_swqe_short, "RC: No swqe in ibd_send",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_swqe_mac_update, "RC: mac_tx_update "
	    "#, swqe available", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_short, "RC: No buf in "
	    "ibd_send", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_mac_update, "RC: "
	    "mac_tx_update #, buf available", KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_conn_succ, "RC: succ connected",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_conn_fail, "RC: fail connect",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_null_conn, "RC: null conn for unicast "
	    "pkt", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_no_estab_conn, "RC: not in act estab "
	    "state", KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: call ibd_rc_act_close",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: call ibd_rc_pas_close",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_delay_ace_recycle, "RC: delay ace "
	    "recycle", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_act_close_simultaneous, "RC: "
	    "simultaneous ibd_rc_act_close", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_reset_cnt, "RC: Reset RC channel",
	    KSTAT_DATA_ULONG);

	/*
	 * Function to provide kernel stat update on demand
	 */
	ksp->ks_update = ibd_rc_update_stats;

	/*
	 * Pointer into provider's raw statistics
	 */
	ksp->ks_private = (void *)state;

	/*
	 * Add kstat to systems kstat chain
	 */
	kstat_install(ksp);

	return (DDI_SUCCESS);
}
#endif

/*
 * ibd_rc_alloc_chan - allocate the IB resources for one RC channel:
 * send/receive CQs (with interrupt moderation), the Tx WQE list and a
 * soft interrupt for Tx recycling (Tx channels only), and finally the
 * RC channel itself.  On success *ret_chan is set and IBT_SUCCESS is
 * returned; on failure all partially allocated resources are released
 * via the error-label chain at the bottom.
 */
static ibt_status_t
ibd_rc_alloc_chan(ibd_rc_chan_t **ret_chan, ibd_state_t *state,
    boolean_t is_tx_chan)
{
	ibt_status_t result;
	ibd_rc_chan_t *chan;
	ibt_rc_chan_alloc_args_t alloc_args;
	ibt_chan_alloc_flags_t alloc_flags;
	ibt_chan_sizes_t sizes;
	ibt_cq_attr_t cq_atts;
	int rv;

	chan = kmem_zalloc(sizeof (ibd_rc_chan_t), KM_SLEEP);

	chan->state = state;
	mutex_init(&chan->rx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_post_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_poll_lock, NULL, MUTEX_DRIVER, NULL);

	/* Allocate IB structures for a new RC channel.
 */
	/*
	 * Only the direction actually used gets a full-sized CQ; the
	 * unused direction gets the minimum CQ size.
	 */
	if (is_tx_chan) {
		chan->scq_size = state->id_rc_num_swqe;
		chan->rcq_size = IBD_RC_MIN_CQ_SIZE;
	} else {
		chan->scq_size = IBD_RC_MIN_CQ_SIZE;
		chan->rcq_size = state->id_rc_num_rwqe;
	}
	cq_atts.cq_size = chan->scq_size;
	cq_atts.cq_sched = NULL;
	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
	/* ibt_alloc_cq may round scq_size up to what the HCA supports */
	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->scq_hdl,
	    &chan->scq_size);
	if (result != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_alloc_chan: error <%d>"
		    "create scq completion queue (size <%d>)",
		    result, chan->scq_size);
		goto alloc_scq_err;
	}	/* if failure to alloc cq */

	/* Interrupt moderation is best-effort; failure is non-fatal */
	if (ibt_modify_cq(chan->scq_hdl, state->id_rc_tx_comp_count,
	    state->id_rc_tx_comp_usec, 0) != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_alloc_chan: Send CQ "
		    "interrupt moderation failed");
	}

	ibt_set_cq_private(chan->scq_hdl, (void *) (uintptr_t)chan);
	ibt_set_cq_handler(chan->scq_hdl, ibd_rc_scq_handler,
	    (void *) (uintptr_t)chan);

	cq_atts.cq_size = chan->rcq_size;
	cq_atts.cq_sched = NULL;
	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->rcq_hdl,
	    &chan->rcq_size);
	if (result != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_alloc_chan: error <%d> creating "
		    "rx completion queue (size <%d>)", result, chan->rcq_size);
		goto alloc_rcq_err;
	}	/* if failure to alloc cq */

	if (ibt_modify_cq(chan->rcq_hdl, state->id_rc_rx_comp_count,
	    state->id_rc_rx_comp_usec, 0) != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_alloc_chan: Receive CQ "
		    "interrupt moderation failed");
	}

	ibt_set_cq_private(chan->rcq_hdl, (void *) (uintptr_t)chan);
	ibt_set_cq_handler(chan->rcq_hdl, ibd_rc_rcq_handler,
	    (void *)(uintptr_t)chan);

	if (is_tx_chan) {
		chan->is_tx_chan = B_TRUE;
		if (ibd_rc_init_txlist(chan) != DDI_SUCCESS) {
			ibd_print_warn(state, "ibd_rc_alloc_chan: "
			    "ibd_rc_init_txlist failed");
			goto init_txlist_err;
		}
		/*
		 * ibd_rc_tx_softintr is a global tunable: when 1, Tx
		 * recycling runs from a soft interrupt instead of the
		 * CQ handler context.
		 */
		if (ibd_rc_tx_softintr == 1) {
			if ((rv = ddi_add_softintr(state->id_dip,
			    DDI_SOFTINT_LOW, &chan->scq_softintr, NULL, NULL,
			    ibd_rc_tx_recycle, (caddr_t)chan)) !=
			    DDI_SUCCESS) {
				DPRINT(10, "ibd_rc_alloc_chan: failed in "
				    "ddi_add_softintr(scq_softintr), ret=%d",
				    rv);
				goto alloc_softintr_err;
			}
		}
	} else {
		chan->is_tx_chan = B_FALSE;
	}

	/*
	 * enable completions
	 */
	result = ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION);
	if (result != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_alloc_chan: ibt_enable_cq_notify"
		    "(scq) failed: status %d\n", result);
		goto alloc_scq_enable_err;
	}

	/* We will enable chan->rcq_hdl later. */

	/* alloc a RC channel */
	bzero(&alloc_args, sizeof (ibt_rc_chan_alloc_args_t));
	bzero(&sizes, sizeof (ibt_chan_sizes_t));

	alloc_args.rc_flags = IBT_WR_SIGNALED;
	alloc_args.rc_control = IBT_CEP_NO_FLAGS;

	alloc_args.rc_scq = chan->scq_hdl;
	alloc_args.rc_rcq = chan->rcq_hdl;
	alloc_args.rc_pd = state->id_pd_hdl;

	alloc_args.rc_hca_port_num = state->id_port;
	alloc_args.rc_clone_chan = NULL;

	/* scatter/gather */
	alloc_args.rc_sizes.cs_sq_sgl = state->rc_tx_max_sqseg;

	/*
	 * For the number of SGL elements in receive side, I think it
	 * should be 1. Because ibd driver allocates a whole block memory
	 * for each ibt_post_recv().
	 */
	alloc_args.rc_sizes.cs_rq_sgl = 1;

	/* The send queue size and the receive queue size */
	alloc_args.rc_sizes.cs_sq = chan->scq_size;
	alloc_args.rc_sizes.cs_rq = chan->rcq_size;

	/* NOTE(review): this overwrites IBT_WR_SIGNALED set above */
	if (state->id_hca_res_lkey_capab) {
		alloc_args.rc_flags = IBT_FAST_REG_RES_LKEY;
	} else {
		DPRINT(40, "ibd_rc_alloc_chan: not support reserved lkey");
	}

	if (state->rc_enable_srq) {
		alloc_flags = IBT_ACHAN_USES_SRQ;
		alloc_args.rc_srq = state->rc_srq_hdl;
	} else {
		alloc_flags = IBT_ACHAN_NO_FLAGS;
	}

	result = ibt_alloc_rc_channel(state->id_hca_hdl,
	    alloc_flags, &alloc_args, &chan->chan_hdl, &sizes);
	if (result != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_alloc_chan: ibd_rc_open_channel"
		    " fail:<%d>", result);
		goto alloc_scq_enable_err;
	}

	*ret_chan = chan;
	return (IBT_SUCCESS);

	/*
	 * Error unwinding: each label releases everything allocated
	 * before the corresponding failure point, in reverse order.
	 */
alloc_scq_enable_err:
	if (is_tx_chan) {
		if (ibd_rc_tx_softintr == 1) {
			ddi_remove_softintr(chan->scq_softintr);
		}
	}
alloc_softintr_err:
	if (is_tx_chan) {
		ibd_rc_fini_txlist(chan);
	}
init_txlist_err:
	(void) ibt_free_cq(chan->rcq_hdl);
alloc_rcq_err:
	(void) ibt_free_cq(chan->scq_hdl);
alloc_scq_err:
	mutex_destroy(&chan->tx_poll_lock);
	mutex_destroy(&chan->tx_post_lock);
	mutex_destroy(&chan->tx_rel_list.dl_mutex);
	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
	mutex_destroy(&chan->rx_free_list.dl_mutex);
	mutex_destroy(&chan->rx_wqe_list.dl_mutex);
	kmem_free(chan, sizeof (ibd_rc_chan_t));
	return (result);
}

/*
 * ibd_rc_free_chan - release all resources held by an RC channel:
 * the channel itself, both CQs, the Tx/Rx buffer lists and the soft
 * interrupt, then the mutexes and the channel structure.  If any IBT
 * free call fails, the teardown stops early and the remaining
 * resources are intentionally leaked rather than freed out of order.
 */
static void
ibd_rc_free_chan(ibd_rc_chan_t *chan)
{
	ibt_status_t ret;

	/* DPRINT(30, "ibd_rc_free_chan: chan=%p", chan); */

	if (chan->chan_hdl != NULL) {
		ret = ibt_free_channel(chan->chan_hdl);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ib_rc_free_chan: ibt_free_channel failed, "
			    "chan=%p, returned: %d", chan, ret);
			return;
		}
		chan->chan_hdl = NULL;
	}

	if (chan->rcq_hdl != NULL) {
		ret = ibt_free_cq(chan->rcq_hdl);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ib_rc_free_chan: ibt_free_cq(rcq) failed, "
			    "chan=%p, returned: %d", chan, ret);
			return;
		}
		chan->rcq_hdl = NULL;
	}

	if (chan->scq_hdl != NULL) {
		ret = ibt_free_cq(chan->scq_hdl);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ib_rc_free_chan: ibt_free_cq(scq) failed, "
			    "chan=%p, returned: %d", chan, ret);
			return;
		}
		chan->scq_hdl = NULL;
	}

	/* Free buffers */
	if (chan->is_tx_chan) {
		ibd_rc_fini_txlist(chan);
		if (ibd_rc_tx_softintr == 1) {
			ddi_remove_softintr(chan->scq_softintr);
		}
	} else {
		/* With SRQ enabled, Rx buffers are shared, not per-channel */
		if (!chan->state->rc_enable_srq) {
			ibd_rc_fini_rxlist(chan);
		}
	}

	mutex_destroy(&chan->tx_poll_lock);
	mutex_destroy(&chan->tx_post_lock);
	mutex_destroy(&chan->tx_rel_list.dl_mutex);
	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
	mutex_destroy(&chan->rx_free_list.dl_mutex);
	mutex_destroy(&chan->rx_wqe_list.dl_mutex);

	/*
	 * If it is a passive channel, must make sure it has been removed
	 * from chan->state->rc_pass_chan_list
	 */
	kmem_free(chan, sizeof (ibd_rc_chan_t));
}

/* Add a RC channel at the head of the list (LIFO) */
static inline void
ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
{
	mutex_enter(&list->chan_list_mutex);
	if (list->chan_list == NULL) {
		list->chan_list = chan;
	} else {
		chan->next = list->chan_list;
		list->chan_list = chan;
	}
	mutex_exit(&list->chan_list_mutex);
}

/*
 * Remove a RC channel from the list.  If the channel is not found the
 * list is left unchanged (no error is reported).
 */
static inline void
ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
{
	ibd_rc_chan_t *pre_chan;

	mutex_enter(&list->chan_list_mutex);
	if (list->chan_list == chan) {
		DPRINT(30, "ibd_rc_rm_from_chan_list(first): found chan(%p)"
		    " in chan_list", chan);
		list->chan_list = chan->next;
	} else {
		/* Walk the singly-linked list looking for the predecessor */
		pre_chan = list->chan_list;
		while (pre_chan != NULL) {
			if (pre_chan->next == chan) {
				DPRINT(30, "ibd_rc_rm_from_chan_list"
				    "(middle): found chan(%p) in "
				    "rc_pass_chan_list", chan);
				pre_chan->next = chan->next;
				break;
			}
			pre_chan = pre_chan->next;
		}
	}
	mutex_exit(&list->chan_list_mutex);
}

/* Pop the channel at the head of the list; NULL if the list is empty */
static inline ibd_rc_chan_t *
ibd_rc_rm_header_chan_list(ibd_rc_chan_list_t *list)
{
	ibd_rc_chan_t *rc_chan;

	mutex_enter(&list->chan_list_mutex);
	rc_chan = list->chan_list;
	if (rc_chan != NULL) {
		list->chan_list = rc_chan->next;
	}
	mutex_exit(&list->chan_list_mutex);
	return (rc_chan);
}

/*
 * Allocate the SRQ Rx copy buffers and their rwqe array, and register
 * the whole buffer area as a single memory region.  Returns DDI_SUCCESS
 * or DDI_FAILURE (with everything freed again on failure).
 */
static int
ibd_rc_alloc_srq_copybufs(ibd_state_t *state)
{
	ibt_mr_attr_t mem_attr;
	uint_t rc_rx_bufs_sz;

	/*
	 * Allocate one big chunk for all regular rx copy bufs
	 */
	rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * state->rc_srq_size;

	state->rc_srq_rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);

	state->rc_srq_rwqes = kmem_zalloc(state->rc_srq_size *
	    sizeof (ibd_rwqe_t), KM_SLEEP);

	/*
	 * Do one memory registration on the entire rxbuf area
	 */
	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_srq_rx_bufs;
	mem_attr.mr_len = rc_rx_bufs_sz;
	mem_attr.mr_as = NULL;
	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
	    &state->rc_srq_rx_mr_hdl, &state->rc_srq_rx_mr_desc)
	    != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr() "
		    "failed");
		kmem_free(state->rc_srq_rwqes,
		    state->rc_srq_size * sizeof (ibd_rwqe_t));
		kmem_free(state->rc_srq_rx_bufs, rc_rx_bufs_sz);
		state->rc_srq_rx_bufs = NULL;
		state->rc_srq_rwqes = NULL;
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

/*
 * Undo ibd_rc_alloc_srq_copybufs(): deregister the memory region and
 * free the rwqe array and the Rx copy buffer chunk.
 */
static void
ibd_rc_free_srq_copybufs(ibd_state_t *state)
{
	uint_t
	    rc_rx_buf_sz;

	/*
	 * Don't change the value of state->rc_mtu at the period from call
	 * ibd_rc_alloc_srq_copybufs() to call ibd_rc_free_srq_copybufs().
	 */
	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;

	/*
	 * Unregister rxbuf mr
	 */
	if (ibt_deregister_mr(state->id_hca_hdl,
	    state->rc_srq_rx_mr_hdl) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_free_srq_copybufs: ibt_deregister_mr()"
		    " failed");
	}
	state->rc_srq_rx_mr_hdl = NULL;

	/*
	 * Free rxbuf memory
	 */
	kmem_free(state->rc_srq_rwqes,
	    state->rc_srq_size * sizeof (ibd_rwqe_t));
	kmem_free(state->rc_srq_rx_bufs, state->rc_srq_size * rc_rx_buf_sz);
	state->rc_srq_rwqes = NULL;
	state->rc_srq_rx_bufs = NULL;
}

/*
 * Allocate and post a certain number of SRQ receive buffers and WRs.
 */
int
ibd_rc_init_srq_list(ibd_state_t *state)
{
	ibd_rwqe_t *rwqe;
	ibt_lkey_t lkey;
	int i;
	uint_t len;
	uint8_t *bufaddr;
	ibt_srq_sizes_t srq_sizes;
	ibt_srq_sizes_t srq_real_sizes;
	ibt_status_t ret;

	srq_sizes.srq_sgl_sz = 1;
	srq_sizes.srq_wr_sz = state->id_rc_num_srq;
	ret = ibt_alloc_srq(state->id_hca_hdl, IBT_SRQ_NO_FLAGS,
	    state->id_pd_hdl, &srq_sizes, &state->rc_srq_hdl, &srq_real_sizes);
	if (ret != IBT_SUCCESS) {
		DPRINT(10, "ibd_rc_init_srq_list: ibt_alloc_srq failed."
		    "req_sgl_sz=%d, req_wr_sz=0x%x, ret=%d",
		    srq_sizes.srq_sgl_sz, srq_sizes.srq_wr_sz, ret);
		return (DDI_FAILURE);
	}

	/* Use the size the HCA actually granted, not what we asked for */
	state->rc_srq_size = srq_real_sizes.srq_wr_sz;
	if (ibd_rc_alloc_srq_copybufs(state) != DDI_SUCCESS) {
		ret = ibt_free_srq(state->rc_srq_hdl);
		if (ret != IBT_SUCCESS) {
			ibd_print_warn(state, "ibd_rc_init_srq_list: "
			    "ibt_free_srq fail, ret=%d", ret);
		}
		return (DDI_FAILURE);
	}

	/*
	 * Allocate and setup the rwqe list
	 */
	lkey = state->rc_srq_rx_mr_desc.md_lkey;
	rwqe = state->rc_srq_rwqes;
	bufaddr = state->rc_srq_rx_bufs;
	len = state->rc_mtu + IPOIB_GRH_SIZE;
	state->rc_srq_rwqe_list.dl_cnt = 0;
	state->rc_srq_rwqe_list.dl_bufs_outstanding = 0;
	for (i = 0; i < state->rc_srq_size; i++, rwqe++, bufaddr += len) {
		rwqe->w_state = state;
		rwqe->w_freeing_wqe = B_FALSE;
		rwqe->w_freemsg_cb.free_func = ibd_rc_srq_freemsg_cb;
		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;

		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
		    &rwqe->w_freemsg_cb)) == NULL) {
			DPRINT(40, "ibd_rc_init_srq_list : desballoc() failed");
			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
			/*
			 * ibd_rc_fini_srq_list() asserts id_running == 0,
			 * so drop it around the cleanup call and restore
			 * it afterwards.
			 */
			if (atomic_dec_32_nv(&state->id_running) != 0) {
				cmn_err(CE_WARN, "ibd_rc_init_srq_list: "
				    "id_running was not 1\n");
			}
			ibd_rc_fini_srq_list(state);
			atomic_inc_32(&state->id_running);
			return (DDI_FAILURE);
		}

		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
		/* Leave IPOIB_GRH_SIZE space */
		rwqe->rwqe_copybuf.ic_sgl.ds_va =
		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
		rwqe->w_rwr.wr_nds = 1;
		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
		(void) ibd_rc_post_srq(state, rwqe);
	}

	mutex_enter(&state->rc_srq_free_list.dl_mutex);
	state->rc_srq_free_list.dl_head = NULL;
	state->rc_srq_free_list.dl_cnt = 0;
	mutex_exit(&state->rc_srq_free_list.dl_mutex);

	return (DDI_SUCCESS);
}

/*
 * Free the statically allocated Rx buffer list for SRQ.
 */
void
ibd_rc_fini_srq_list(ibd_state_t *state)
{
	ibd_rwqe_t *rwqe;
	int i;
	ibt_status_t ret;

	ASSERT(state->id_running == 0);
	ret = ibt_free_srq(state->rc_srq_hdl);
	if (ret != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_fini_srq_list: "
		    "ibt_free_srq fail, ret=%d", ret);
	}

	mutex_enter(&state->rc_srq_rwqe_list.dl_mutex);
	rwqe = state->rc_srq_rwqes;
	for (i = 0; i < state->rc_srq_size; i++, rwqe++) {
		if (rwqe->rwqe_im_mblk != NULL) {
			/*
			 * w_freeing_wqe tells the freemsg callback not to
			 * repost or re-queue this rwqe.
			 */
			rwqe->w_freeing_wqe = B_TRUE;
			freemsg(rwqe->rwqe_im_mblk);
		}
	}
	mutex_exit(&state->rc_srq_rwqe_list.dl_mutex);

	ibd_rc_free_srq_copybufs(state);
}

/* Repost the elements in state->ib_rc_free_list */
int
ibd_rc_repost_srq_free_list(ibd_state_t *state)
{
	ibd_rwqe_t *rwqe;
	ibd_wqe_t *list;
	uint_t len;

	mutex_enter(&state->rc_srq_free_list.dl_mutex);
	if (state->rc_srq_free_list.dl_head != NULL) {
		/* repost them */
		len = state->rc_mtu + IPOIB_GRH_SIZE;
		list = state->rc_srq_free_list.dl_head;
		state->rc_srq_free_list.dl_head = NULL;
		state->rc_srq_free_list.dl_cnt = 0;
		mutex_exit(&state->rc_srq_free_list.dl_mutex);
		while (list != NULL) {
			rwqe = WQE_TO_RWQE(list);
			/* Re-attach an mblk to the rwqe if it lost one */
			if ((rwqe->rwqe_im_mblk == NULL) &&
			    ((rwqe->rwqe_im_mblk = desballoc(
			    rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
			    &rwqe->w_freemsg_cb)) == NULL)) {
				DPRINT(40, "ibd_rc_repost_srq_free_list: "
				    "failed in desballoc()");
				/*
				 * Give up: push this and all remaining
				 * rwqes back onto the free list.
				 */
				do {
					ibd_rc_srq_free_rwqe(state, rwqe);
					list = list->w_next;
					rwqe = WQE_TO_RWQE(list);
				} while (list != NULL);
				return (DDI_FAILURE);
			}
			if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
				ibd_rc_srq_free_rwqe(state, rwqe);
			}
			list =
list->w_next; 1007 } 1008 return (DDI_SUCCESS); 1009 } 1010 mutex_exit(&state->rc_srq_free_list.dl_mutex); 1011 return (DDI_SUCCESS); 1012 } 1013 1014 /* 1015 * Free an allocated recv wqe. 1016 */ 1017 static void 1018 ibd_rc_srq_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 1019 { 1020 /* 1021 * desballoc() failed (no memory) or the posting of rwqe failed. 1022 * 1023 * This rwqe is placed on a free list so that it 1024 * can be reinstated in future. 1025 * 1026 * NOTE: no code currently exists to reinstate 1027 * these "lost" rwqes. 1028 */ 1029 mutex_enter(&state->rc_srq_free_list.dl_mutex); 1030 state->rc_srq_free_list.dl_cnt++; 1031 rwqe->rwqe_next = state->rc_srq_free_list.dl_head; 1032 state->rc_srq_free_list.dl_head = RWQE_TO_WQE(rwqe); 1033 mutex_exit(&state->rc_srq_free_list.dl_mutex); 1034 } 1035 1036 static void 1037 ibd_rc_srq_freemsg_cb(char *arg) 1038 { 1039 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 1040 ibd_state_t *state = rwqe->w_state; 1041 1042 ASSERT(state->rc_enable_srq); 1043 1044 /* 1045 * If the driver is stopped, just free the rwqe. 1046 */ 1047 if (atomic_add_32_nv(&state->id_running, 0) == 0) { 1048 if (!rwqe->w_freeing_wqe) { 1049 atomic_dec_32( 1050 &state->rc_srq_rwqe_list.dl_bufs_outstanding); 1051 DPRINT(6, "ibd_rc_srq_freemsg_cb: wqe being freed"); 1052 rwqe->rwqe_im_mblk = NULL; 1053 ibd_rc_srq_free_rwqe(state, rwqe); 1054 } 1055 return; 1056 } 1057 1058 atomic_dec_32(&state->rc_srq_rwqe_list.dl_bufs_outstanding); 1059 1060 ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size); 1061 ASSERT(!rwqe->w_freeing_wqe); 1062 1063 /* 1064 * Upper layer has released held mblk, so we have 1065 * no more use for keeping the old pointer in 1066 * our rwqe. 
1067 */ 1068 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 1069 state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 1070 if (rwqe->rwqe_im_mblk == NULL) { 1071 DPRINT(40, "ibd_rc_srq_freemsg_cb: desballoc failed"); 1072 ibd_rc_srq_free_rwqe(state, rwqe); 1073 return; 1074 } 1075 1076 if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) { 1077 ibd_print_warn(state, "ibd_rc_srq_freemsg_cb: ibd_rc_post_srq" 1078 " failed"); 1079 ibd_rc_srq_free_rwqe(state, rwqe); 1080 return; 1081 } 1082 } 1083 1084 /* 1085 * Post a rwqe to the hardware and add it to the Rx list. 1086 */ 1087 static int 1088 ibd_rc_post_srq(ibd_state_t *state, ibd_rwqe_t *rwqe) 1089 { 1090 /* 1091 * Here we should add dl_cnt before post recv, because 1092 * we would have to make sure dl_cnt is updated before 1093 * the corresponding ibd_rc_process_rx() is called. 1094 */ 1095 ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size); 1096 atomic_add_32(&state->rc_srq_rwqe_list.dl_cnt, 1); 1097 if (ibt_post_srq(state->rc_srq_hdl, &rwqe->w_rwr, 1, NULL) != 1098 IBT_SUCCESS) { 1099 atomic_dec_32(&state->rc_srq_rwqe_list.dl_cnt); 1100 DPRINT(40, "ibd_rc_post_srq : ibt_post_srq() failed"); 1101 return (DDI_FAILURE); 1102 } 1103 1104 return (DDI_SUCCESS); 1105 } 1106 1107 /* 1108 * Post a rwqe to the hardware and add it to the Rx list. 1109 */ 1110 static int 1111 ibd_rc_post_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe) 1112 { 1113 /* 1114 * Here we should add dl_cnt before post recv, because we would 1115 * have to make sure dl_cnt has already updated before 1116 * corresponding ibd_rc_process_rx() is called. 
1117 */ 1118 atomic_add_32(&chan->rx_wqe_list.dl_cnt, 1); 1119 if (ibt_post_recv(chan->chan_hdl, &rwqe->w_rwr, 1, NULL) != 1120 IBT_SUCCESS) { 1121 atomic_dec_32(&chan->rx_wqe_list.dl_cnt); 1122 DPRINT(40, "ibd_rc_post_rwqe : failed in ibt_post_recv()"); 1123 return (DDI_FAILURE); 1124 } 1125 return (DDI_SUCCESS); 1126 } 1127 1128 static int 1129 ibd_rc_alloc_rx_copybufs(ibd_rc_chan_t *chan) 1130 { 1131 ibd_state_t *state = chan->state; 1132 ibt_mr_attr_t mem_attr; 1133 uint_t rc_rx_bufs_sz; 1134 1135 /* 1136 * Allocate one big chunk for all regular rx copy bufs 1137 */ 1138 rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * chan->rcq_size; 1139 1140 chan->rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP); 1141 1142 chan->rx_rwqes = kmem_zalloc(chan->rcq_size * 1143 sizeof (ibd_rwqe_t), KM_SLEEP); 1144 1145 /* 1146 * Do one memory registration on the entire rxbuf area 1147 */ 1148 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->rx_bufs; 1149 mem_attr.mr_len = rc_rx_bufs_sz; 1150 mem_attr.mr_as = NULL; 1151 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 1152 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 1153 &chan->rx_mr_hdl, &chan->rx_mr_desc) != IBT_SUCCESS) { 1154 DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr failed"); 1155 kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t)); 1156 kmem_free(chan->rx_bufs, rc_rx_bufs_sz); 1157 chan->rx_bufs = NULL; 1158 chan->rx_rwqes = NULL; 1159 return (DDI_FAILURE); 1160 } 1161 1162 return (DDI_SUCCESS); 1163 } 1164 1165 static void 1166 ibd_rc_free_rx_copybufs(ibd_rc_chan_t *chan) 1167 { 1168 ibd_state_t *state = chan->state; 1169 uint_t rc_rx_buf_sz; 1170 1171 ASSERT(!state->rc_enable_srq); 1172 ASSERT(chan->rx_rwqes != NULL); 1173 ASSERT(chan->rx_bufs != NULL); 1174 1175 /* 1176 * Don't change the value of state->rc_mtu at the period from call 1177 * ibd_rc_alloc_rx_copybufs() to call ibd_rc_free_rx_copybufs(). 
1178 */ 1179 rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE; 1180 1181 /* 1182 * Unregister rxbuf mr 1183 */ 1184 if (ibt_deregister_mr(state->id_hca_hdl, 1185 chan->rx_mr_hdl) != IBT_SUCCESS) { 1186 DPRINT(40, "ibd_rc_free_rx_copybufs: ibt_deregister_mr failed"); 1187 } 1188 chan->rx_mr_hdl = NULL; 1189 1190 /* 1191 * Free rxbuf memory 1192 */ 1193 kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t)); 1194 chan->rx_rwqes = NULL; 1195 1196 kmem_free(chan->rx_bufs, chan->rcq_size * rc_rx_buf_sz); 1197 chan->rx_bufs = NULL; 1198 } 1199 1200 /* 1201 * Post a certain number of receive buffers and WRs on a RC channel. 1202 */ 1203 static int 1204 ibd_rc_init_rxlist(ibd_rc_chan_t *chan) 1205 { 1206 ibd_state_t *state = chan->state; 1207 ibd_rwqe_t *rwqe; 1208 ibt_lkey_t lkey; 1209 int i; 1210 uint_t len; 1211 uint8_t *bufaddr; 1212 1213 ASSERT(!state->rc_enable_srq); 1214 if (ibd_rc_alloc_rx_copybufs(chan) != DDI_SUCCESS) 1215 return (DDI_FAILURE); 1216 1217 /* 1218 * Allocate and setup the rwqe list 1219 */ 1220 lkey = chan->rx_mr_desc.md_lkey; 1221 rwqe = chan->rx_rwqes; 1222 bufaddr = chan->rx_bufs; 1223 len = state->rc_mtu + IPOIB_GRH_SIZE; 1224 for (i = 0; i < chan->rcq_size; i++, rwqe++, bufaddr += len) { 1225 rwqe->w_state = state; 1226 rwqe->w_chan = chan; 1227 rwqe->w_freeing_wqe = B_FALSE; 1228 rwqe->w_freemsg_cb.free_func = ibd_rc_freemsg_cb; 1229 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 1230 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr; 1231 1232 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0, 1233 &rwqe->w_freemsg_cb)) == NULL) { 1234 DPRINT(40, "ibd_rc_init_srq_list: desballoc() failed"); 1235 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 1236 ibd_rc_fini_rxlist(chan); 1237 return (DDI_FAILURE); 1238 } 1239 1240 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey; 1241 rwqe->rwqe_copybuf.ic_sgl.ds_va = 1242 (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE); 1243 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu; 1244 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 
1245 rwqe->w_rwr.wr_nds = 1; 1246 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 1247 (void) ibd_rc_post_rwqe(chan, rwqe); 1248 } 1249 1250 return (DDI_SUCCESS); 1251 } 1252 1253 /* 1254 * Free the statically allocated Rx buffer list for SRQ. 1255 */ 1256 static void 1257 ibd_rc_fini_rxlist(ibd_rc_chan_t *chan) 1258 { 1259 ibd_rwqe_t *rwqe; 1260 int i; 1261 1262 if (chan->rx_bufs == NULL) { 1263 DPRINT(40, "ibd_rc_fini_rxlist: empty chan->rx_bufs, quit"); 1264 return; 1265 } 1266 1267 /* bufs_outstanding must be 0 */ 1268 ASSERT((chan->rx_wqe_list.dl_head == NULL) || 1269 (chan->rx_wqe_list.dl_bufs_outstanding == 0)); 1270 1271 mutex_enter(&chan->rx_wqe_list.dl_mutex); 1272 rwqe = chan->rx_rwqes; 1273 for (i = 0; i < chan->rcq_size; i++, rwqe++) { 1274 if (rwqe->rwqe_im_mblk != NULL) { 1275 rwqe->w_freeing_wqe = B_TRUE; 1276 freemsg(rwqe->rwqe_im_mblk); 1277 } 1278 } 1279 mutex_exit(&chan->rx_wqe_list.dl_mutex); 1280 1281 ibd_rc_free_rx_copybufs(chan); 1282 } 1283 1284 /* 1285 * Free an allocated recv wqe. 1286 */ 1287 static void 1288 ibd_rc_free_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe) 1289 { 1290 /* 1291 * desballoc() failed (no memory) or the posting of rwqe failed. 1292 * 1293 * This rwqe is placed on a free list so that it 1294 * can be reinstated in future. 1295 * 1296 * NOTE: no code currently exists to reinstate 1297 * these "lost" rwqes. 1298 */ 1299 mutex_enter(&chan->rx_free_list.dl_mutex); 1300 chan->rx_free_list.dl_cnt++; 1301 rwqe->rwqe_next = chan->rx_free_list.dl_head; 1302 chan->rx_free_list.dl_head = RWQE_TO_WQE(rwqe); 1303 mutex_exit(&chan->rx_free_list.dl_mutex); 1304 } 1305 1306 /* 1307 * Processing to be done after receipt of a packet; hand off to GLD 1308 * in the format expected by GLD. 
1309 */ 1310 static void 1311 ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 1312 { 1313 ibd_state_t *state = chan->state; 1314 ib_header_info_t *phdr; 1315 ipoib_hdr_t *ipibp; 1316 mblk_t *mp; 1317 mblk_t *mpc; 1318 int rxcnt; 1319 ip6_t *ip6h; 1320 int len; 1321 1322 /* 1323 * Track number handed to upper layer, and number still 1324 * available to receive packets. 1325 */ 1326 if (state->rc_enable_srq) { 1327 rxcnt = atomic_dec_32_nv(&state->rc_srq_rwqe_list.dl_cnt); 1328 } else { 1329 rxcnt = atomic_dec_32_nv(&chan->rx_wqe_list.dl_cnt); 1330 } 1331 1332 /* 1333 * It can not be a IBA multicast packet. 1334 */ 1335 ASSERT(!wc->wc_flags & IBT_WC_GRH_PRESENT); 1336 1337 1338 #ifdef DEBUG 1339 if (rxcnt < state->id_rc_rx_rwqe_thresh) { 1340 state->rc_rwqe_short++; 1341 } 1342 #endif 1343 1344 /* 1345 * Possibly replenish the Rx pool if needed. 1346 */ 1347 if ((rxcnt >= state->id_rc_rx_rwqe_thresh) && 1348 (wc->wc_bytes_xfer > state->id_rc_rx_copy_thresh)) { 1349 atomic_add_64(&state->rc_rcv_trans_byte, wc->wc_bytes_xfer); 1350 atomic_inc_64(&state->rc_rcv_trans_pkt); 1351 1352 /* 1353 * Record how many rwqe has been occupied by upper 1354 * network layer 1355 */ 1356 if (state->rc_enable_srq) { 1357 atomic_add_32(&state->rc_srq_rwqe_list. 1358 dl_bufs_outstanding, 1); 1359 } else { 1360 atomic_add_32(&chan->rx_wqe_list. 
1361 dl_bufs_outstanding, 1); 1362 } 1363 mp = rwqe->rwqe_im_mblk; 1364 } else { 1365 atomic_add_64(&state->rc_rcv_copy_byte, wc->wc_bytes_xfer); 1366 atomic_inc_64(&state->rc_rcv_copy_pkt); 1367 1368 if ((mp = allocb(wc->wc_bytes_xfer + IPOIB_GRH_SIZE, 1369 BPRI_HI)) == NULL) { /* no memory */ 1370 DPRINT(40, "ibd_rc_process_rx: allocb() failed"); 1371 state->rc_rcv_alloc_fail++; 1372 if (state->rc_enable_srq) { 1373 if (ibd_rc_post_srq(state, rwqe) == 1374 DDI_FAILURE) { 1375 ibd_rc_srq_free_rwqe(state, rwqe); 1376 } 1377 } else { 1378 if (ibd_rc_post_rwqe(chan, rwqe) == 1379 DDI_FAILURE) { 1380 ibd_rc_free_rwqe(chan, rwqe); 1381 } 1382 } 1383 return; 1384 } 1385 1386 bcopy(rwqe->rwqe_im_mblk->b_rptr + IPOIB_GRH_SIZE, 1387 mp->b_wptr + IPOIB_GRH_SIZE, wc->wc_bytes_xfer); 1388 1389 if (state->rc_enable_srq) { 1390 if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) { 1391 ibd_rc_srq_free_rwqe(state, rwqe); 1392 } 1393 } else { 1394 if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) { 1395 ibd_rc_free_rwqe(chan, rwqe); 1396 } 1397 } 1398 } 1399 1400 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE); 1401 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) { 1402 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 1403 len = ntohs(ip6h->ip6_plen); 1404 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 1405 /* LINTED: E_CONSTANT_CONDITION */ 1406 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 1407 } 1408 } 1409 1410 phdr = (ib_header_info_t *)mp->b_rptr; 1411 phdr->ib_grh.ipoib_vertcflow = 0; 1412 ovbcopy(&state->id_macaddr, &phdr->ib_dst, 1413 sizeof (ipoib_mac_t)); 1414 mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer+ IPOIB_GRH_SIZE; 1415 1416 /* 1417 * Can RC mode in IB guarantee its checksum correctness? 1418 * 1419 * (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, 1420 * HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); 1421 */ 1422 1423 /* 1424 * Make sure this is NULL or we're in trouble. 
1425 */ 1426 if (mp->b_next != NULL) { 1427 ibd_print_warn(state, 1428 "ibd_rc_process_rx: got duplicate mp from rcq?"); 1429 mp->b_next = NULL; 1430 } 1431 1432 /* 1433 * Add this mp to the list of processed mp's to send to 1434 * the nw layer 1435 */ 1436 if (state->rc_enable_srq) { 1437 mutex_enter(&state->rc_rx_lock); 1438 if (state->rc_rx_mp) { 1439 ASSERT(state->rc_rx_mp_tail != NULL); 1440 state->rc_rx_mp_tail->b_next = mp; 1441 } else { 1442 ASSERT(state->rc_rx_mp_tail == NULL); 1443 state->rc_rx_mp = mp; 1444 } 1445 1446 state->rc_rx_mp_tail = mp; 1447 state->rc_rx_mp_len++; 1448 1449 if (state->rc_rx_mp_len >= IBD_MAX_RX_MP_LEN) { 1450 mpc = state->rc_rx_mp; 1451 1452 state->rc_rx_mp = NULL; 1453 state->rc_rx_mp_tail = NULL; 1454 state->rc_rx_mp_len = 0; 1455 mutex_exit(&state->rc_rx_lock); 1456 mac_rx(state->id_mh, NULL, mpc); 1457 } else { 1458 mutex_exit(&state->rc_rx_lock); 1459 } 1460 } else { 1461 mutex_enter(&chan->rx_lock); 1462 if (chan->rx_mp) { 1463 ASSERT(chan->rx_mp_tail != NULL); 1464 chan->rx_mp_tail->b_next = mp; 1465 } else { 1466 ASSERT(chan->rx_mp_tail == NULL); 1467 chan->rx_mp = mp; 1468 } 1469 1470 chan->rx_mp_tail = mp; 1471 chan->rx_mp_len++; 1472 1473 if (chan->rx_mp_len >= IBD_MAX_RX_MP_LEN) { 1474 mpc = chan->rx_mp; 1475 1476 chan->rx_mp = NULL; 1477 chan->rx_mp_tail = NULL; 1478 chan->rx_mp_len = 0; 1479 mutex_exit(&chan->rx_lock); 1480 mac_rx(state->id_mh, NULL, mpc); 1481 } else { 1482 mutex_exit(&chan->rx_lock); 1483 } 1484 } 1485 } 1486 1487 /* 1488 * Callback code invoked from STREAMs when the recv data buffer is free 1489 * for recycling. 1490 */ 1491 static void 1492 ibd_rc_freemsg_cb(char *arg) 1493 { 1494 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 1495 ibd_rc_chan_t *chan = rwqe->w_chan; 1496 ibd_state_t *state = rwqe->w_state; 1497 1498 /* 1499 * If the wqe is being destructed, do not attempt recycling. 
1500 */ 1501 if (rwqe->w_freeing_wqe == B_TRUE) { 1502 return; 1503 } 1504 1505 ASSERT(!state->rc_enable_srq); 1506 ASSERT(chan->rx_wqe_list.dl_cnt < chan->rcq_size); 1507 1508 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 1509 state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 1510 if (rwqe->rwqe_im_mblk == NULL) { 1511 DPRINT(40, "ibd_rc_freemsg_cb: desballoc() failed"); 1512 ibd_rc_free_rwqe(chan, rwqe); 1513 return; 1514 } 1515 1516 /* 1517 * Post back to h/w. We could actually have more than 1518 * id_num_rwqe WQEs on the list if there were multiple 1519 * ibd_freemsg_cb() calls outstanding (since the lock is 1520 * not held the entire time). This will start getting 1521 * corrected over subsequent ibd_freemsg_cb() calls. 1522 */ 1523 if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) { 1524 ibd_rc_free_rwqe(chan, rwqe); 1525 return; 1526 } 1527 atomic_add_32(&chan->rx_wqe_list.dl_bufs_outstanding, -1); 1528 } 1529 1530 /* 1531 * Common code for interrupt handling as well as for polling 1532 * for all completed wqe's while detaching. 1533 */ 1534 static void 1535 ibd_rc_poll_rcq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl) 1536 { 1537 ibd_wqe_t *wqe; 1538 ibt_wc_t *wc, *wcs; 1539 uint_t numwcs, real_numwcs; 1540 int i; 1541 1542 wcs = chan->rx_wc; 1543 numwcs = IBD_RC_MAX_CQ_WC; 1544 1545 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) { 1546 for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) { 1547 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 1548 if (wc->wc_status != IBT_WC_SUCCESS) { 1549 chan->state->rc_rcq_err++; 1550 /* 1551 * Channel being torn down. 1552 */ 1553 DPRINT(40, "ibd_rc_poll_rcq: wc_status(%d) != " 1554 "SUCC, chan=%p", wc->wc_status, chan); 1555 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 1556 /* 1557 * Do not invoke Rx handler because 1558 * it might add buffers to the Rx pool 1559 * when we are trying to deinitialize. 
1560 */ 1561 continue; 1562 } 1563 } 1564 ibd_rc_process_rx(chan, WQE_TO_RWQE(wqe), wc); 1565 } 1566 } 1567 } 1568 1569 /* Receive CQ handler */ 1570 /* ARGSUSED */ 1571 static void 1572 ibd_rc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1573 { 1574 ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg; 1575 ibd_state_t *state = chan->state; 1576 1577 ASSERT(chan->chan_state == IBD_RC_STATE_PAS_ESTAB); 1578 1579 /* 1580 * Poll for completed entries; the CQ will not interrupt any 1581 * more for incoming (or transmitted) packets. 1582 */ 1583 state->rc_rcq_invoke++; 1584 ibd_rc_poll_rcq(chan, chan->rcq_hdl); 1585 1586 /* 1587 * Now enable CQ notifications; all packets that arrive now 1588 * (or complete transmission) will cause new interrupts. 1589 */ 1590 if (ibt_enable_cq_notify(chan->rcq_hdl, IBT_NEXT_COMPLETION) != 1591 IBT_SUCCESS) { 1592 /* 1593 * We do not expect a failure here. 1594 */ 1595 DPRINT(40, "ibd_rc_rcq_handler: ibt_enable_cq_notify() failed"); 1596 } 1597 1598 /* 1599 * Repoll to catch all packets that might have arrived after 1600 * we finished the first poll loop and before interrupts got 1601 * armed. 
1602 */ 1603 ibd_rc_poll_rcq(chan, chan->rcq_hdl); 1604 1605 if (state->rc_enable_srq) { 1606 mutex_enter(&state->rc_rx_lock); 1607 1608 if (state->rc_rx_mp != NULL) { 1609 mblk_t *mpc; 1610 mpc = state->rc_rx_mp; 1611 1612 state->rc_rx_mp = NULL; 1613 state->rc_rx_mp_tail = NULL; 1614 state->rc_rx_mp_len = 0; 1615 1616 mutex_exit(&state->rc_rx_lock); 1617 mac_rx(state->id_mh, NULL, mpc); 1618 } else { 1619 mutex_exit(&state->rc_rx_lock); 1620 } 1621 } else { 1622 mutex_enter(&chan->rx_lock); 1623 1624 if (chan->rx_mp != NULL) { 1625 mblk_t *mpc; 1626 mpc = chan->rx_mp; 1627 1628 chan->rx_mp = NULL; 1629 chan->rx_mp_tail = NULL; 1630 chan->rx_mp_len = 0; 1631 1632 mutex_exit(&chan->rx_lock); 1633 mac_rx(state->id_mh, NULL, mpc); 1634 } else { 1635 mutex_exit(&chan->rx_lock); 1636 } 1637 } 1638 } 1639 1640 /* 1641 * Allocate the statically allocated Tx buffer list. 1642 */ 1643 int 1644 ibd_rc_init_tx_largebuf_list(ibd_state_t *state) 1645 { 1646 ibd_rc_tx_largebuf_t *lbufp; 1647 ibd_rc_tx_largebuf_t *tail; 1648 uint8_t *memp; 1649 ibt_mr_attr_t mem_attr; 1650 uint32_t num_swqe; 1651 size_t mem_size; 1652 int i; 1653 1654 num_swqe = state->id_rc_num_swqe - 1; 1655 1656 /* 1657 * Allocate one big chunk for all Tx large copy bufs 1658 */ 1659 /* Don't transfer IPOIB_GRH_SIZE bytes (40 bytes) */ 1660 mem_size = num_swqe * state->rc_mtu; 1661 state->rc_tx_mr_bufs = kmem_zalloc(mem_size, KM_SLEEP); 1662 1663 mem_attr.mr_len = mem_size; 1664 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_tx_mr_bufs; 1665 mem_attr.mr_as = NULL; 1666 mem_attr.mr_flags = IBT_MR_SLEEP; 1667 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 1668 &state->rc_tx_mr_hdl, &state->rc_tx_mr_desc) != IBT_SUCCESS) { 1669 DPRINT(40, "ibd_rc_init_tx_largebuf_list: ibt_register_mr " 1670 "failed"); 1671 kmem_free(state->rc_tx_mr_bufs, mem_size); 1672 state->rc_tx_mr_bufs = NULL; 1673 return (DDI_FAILURE); 1674 } 1675 1676 state->rc_tx_largebuf_desc_base = kmem_zalloc(num_swqe * 1677 
sizeof (ibd_rc_tx_largebuf_t), KM_SLEEP); 1678 1679 /* 1680 * Set up the buf chain 1681 */ 1682 memp = state->rc_tx_mr_bufs; 1683 mutex_enter(&state->rc_tx_large_bufs_lock); 1684 lbufp = state->rc_tx_largebuf_desc_base; 1685 for (i = 0; i < num_swqe; i++) { 1686 lbufp->lb_buf = memp; 1687 lbufp->lb_next = lbufp + 1; 1688 1689 tail = lbufp; 1690 1691 memp += state->rc_mtu; 1692 lbufp++; 1693 } 1694 tail->lb_next = NULL; 1695 1696 /* 1697 * Set up the buffer information in ibd state 1698 */ 1699 state->rc_tx_largebuf_free_head = state->rc_tx_largebuf_desc_base; 1700 state->rc_tx_largebuf_nfree = num_swqe; 1701 mutex_exit(&state->rc_tx_large_bufs_lock); 1702 return (DDI_SUCCESS); 1703 } 1704 1705 void 1706 ibd_rc_fini_tx_largebuf_list(ibd_state_t *state) 1707 { 1708 uint32_t num_swqe; 1709 1710 num_swqe = state->id_rc_num_swqe - 1; 1711 1712 if (ibt_deregister_mr(state->id_hca_hdl, 1713 state->rc_tx_mr_hdl) != IBT_SUCCESS) { 1714 DPRINT(40, "ibd_rc_fini_tx_largebuf_list: ibt_deregister_mr() " 1715 "failed"); 1716 } 1717 state->rc_tx_mr_hdl = NULL; 1718 1719 kmem_free(state->rc_tx_mr_bufs, num_swqe * state->rc_mtu); 1720 state->rc_tx_mr_bufs = NULL; 1721 1722 kmem_free(state->rc_tx_largebuf_desc_base, 1723 num_swqe * sizeof (ibd_rc_tx_largebuf_t)); 1724 state->rc_tx_largebuf_desc_base = NULL; 1725 } 1726 1727 static int 1728 ibd_rc_alloc_tx_copybufs(ibd_rc_chan_t *chan) 1729 { 1730 ibt_mr_attr_t mem_attr; 1731 ibd_state_t *state; 1732 1733 state = chan->state; 1734 ASSERT(state != NULL); 1735 1736 /* 1737 * Allocate one big chunk for all regular tx copy bufs 1738 */ 1739 mem_attr.mr_len = chan->scq_size * state->id_rc_tx_copy_thresh; 1740 1741 chan->tx_mr_bufs = kmem_zalloc(mem_attr.mr_len, KM_SLEEP); 1742 1743 /* 1744 * Do one memory registration on the entire txbuf area 1745 */ 1746 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->tx_mr_bufs; 1747 mem_attr.mr_as = NULL; 1748 mem_attr.mr_flags = IBT_MR_SLEEP; 1749 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 
&mem_attr, 1750 &chan->tx_mr_hdl, &chan->tx_mr_desc) != IBT_SUCCESS) { 1751 DPRINT(40, "ibd_rc_alloc_tx_copybufs: ibt_register_mr failed"); 1752 ASSERT(mem_attr.mr_len == 1753 chan->scq_size * state->id_rc_tx_copy_thresh); 1754 kmem_free(chan->tx_mr_bufs, mem_attr.mr_len); 1755 chan->tx_mr_bufs = NULL; 1756 return (DDI_FAILURE); 1757 } 1758 1759 return (DDI_SUCCESS); 1760 } 1761 1762 /* 1763 * Allocate the statically allocated Tx buffer list. 1764 */ 1765 static int 1766 ibd_rc_init_txlist(ibd_rc_chan_t *chan) 1767 { 1768 ibd_swqe_t *swqe; 1769 int i; 1770 ibt_lkey_t lkey; 1771 ibd_state_t *state = chan->state; 1772 1773 if (ibd_rc_alloc_tx_copybufs(chan) != DDI_SUCCESS) 1774 return (DDI_FAILURE); 1775 1776 /* 1777 * Allocate and setup the swqe list 1778 */ 1779 lkey = chan->tx_mr_desc.md_lkey; 1780 chan->tx_wqes = kmem_zalloc(chan->scq_size * 1781 sizeof (ibd_swqe_t), KM_SLEEP); 1782 swqe = chan->tx_wqes; 1783 for (i = 0; i < chan->scq_size; i++, swqe++) { 1784 swqe->swqe_next = NULL; 1785 swqe->swqe_im_mblk = NULL; 1786 1787 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 1788 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 1789 1790 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 1791 swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL; 1792 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 1793 (chan->tx_mr_bufs + i * state->id_rc_tx_copy_thresh); 1794 swqe->w_swr.wr_trans = IBT_RC_SRV; 1795 1796 /* Add to list */ 1797 mutex_enter(&chan->tx_wqe_list.dl_mutex); 1798 chan->tx_wqe_list.dl_cnt++; 1799 swqe->swqe_next = chan->tx_wqe_list.dl_head; 1800 chan->tx_wqe_list.dl_head = SWQE_TO_WQE(swqe); 1801 mutex_exit(&chan->tx_wqe_list.dl_mutex); 1802 } 1803 1804 return (DDI_SUCCESS); 1805 } 1806 1807 /* 1808 * Free the statically allocated Tx buffer list. 
1809 */ 1810 static void 1811 ibd_rc_fini_txlist(ibd_rc_chan_t *chan) 1812 { 1813 ibd_state_t *state = chan->state; 1814 if (chan->tx_mr_hdl != NULL) { 1815 if (ibt_deregister_mr(chan->state->id_hca_hdl, 1816 chan->tx_mr_hdl) != IBT_SUCCESS) { 1817 DPRINT(40, "ibd_rc_fini_txlist: ibt_deregister_mr " 1818 "failed"); 1819 } 1820 chan->tx_mr_hdl = NULL; 1821 } 1822 1823 if (chan->tx_mr_bufs != NULL) { 1824 kmem_free(chan->tx_mr_bufs, chan->scq_size * 1825 state->id_rc_tx_copy_thresh); 1826 chan->tx_mr_bufs = NULL; 1827 } 1828 1829 if (chan->tx_wqes != NULL) { 1830 kmem_free(chan->tx_wqes, chan->scq_size * 1831 sizeof (ibd_swqe_t)); 1832 chan->tx_wqes = NULL; 1833 } 1834 } 1835 1836 /* 1837 * Acquire send wqe from free list. 1838 * Returns error number and send wqe pointer. 1839 */ 1840 ibd_swqe_t * 1841 ibd_rc_acquire_swqes(ibd_rc_chan_t *chan) 1842 { 1843 ibd_swqe_t *wqe; 1844 1845 mutex_enter(&chan->tx_rel_list.dl_mutex); 1846 if (chan->tx_rel_list.dl_head != NULL) { 1847 /* transfer id_tx_rel_list to id_tx_list */ 1848 chan->tx_wqe_list.dl_head = 1849 chan->tx_rel_list.dl_head; 1850 chan->tx_wqe_list.dl_cnt = 1851 chan->tx_rel_list.dl_cnt; 1852 chan->tx_wqe_list.dl_pending_sends = B_FALSE; 1853 1854 /* clear id_tx_rel_list */ 1855 chan->tx_rel_list.dl_head = NULL; 1856 chan->tx_rel_list.dl_cnt = 0; 1857 mutex_exit(&chan->tx_rel_list.dl_mutex); 1858 1859 wqe = WQE_TO_SWQE(chan->tx_wqe_list.dl_head); 1860 chan->tx_wqe_list.dl_cnt -= 1; 1861 chan->tx_wqe_list.dl_head = wqe->swqe_next; 1862 } else { /* no free swqe */ 1863 mutex_exit(&chan->tx_rel_list.dl_mutex); 1864 chan->tx_wqe_list.dl_pending_sends = B_TRUE; 1865 wqe = NULL; 1866 } 1867 return (wqe); 1868 } 1869 1870 /* 1871 * Release send wqe back into free list. 1872 */ 1873 static void 1874 ibd_rc_release_swqe(ibd_rc_chan_t *chan, ibd_swqe_t *swqe) 1875 { 1876 /* 1877 * Add back on Tx list for reuse. 
1878 */ 1879 swqe->swqe_next = NULL; 1880 mutex_enter(&chan->tx_rel_list.dl_mutex); 1881 chan->tx_rel_list.dl_pending_sends = B_FALSE; 1882 swqe->swqe_next = chan->tx_rel_list.dl_head; 1883 chan->tx_rel_list.dl_head = SWQE_TO_WQE(swqe); 1884 chan->tx_rel_list.dl_cnt++; 1885 mutex_exit(&chan->tx_rel_list.dl_mutex); 1886 } 1887 1888 void 1889 ibd_rc_post_send(ibd_rc_chan_t *chan, ibd_swqe_t *node) 1890 { 1891 uint_t i; 1892 uint_t num_posted; 1893 uint_t n_wrs; 1894 ibt_status_t ibt_status; 1895 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; 1896 ibd_swqe_t *tx_head, *elem; 1897 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; 1898 1899 /* post the one request, then check for more */ 1900 ibt_status = ibt_post_send(chan->chan_hdl, 1901 &node->w_swr, 1, NULL); 1902 if (ibt_status != IBT_SUCCESS) { 1903 ibd_print_warn(chan->state, "ibd_post_send: " 1904 "posting one wr failed: ret=%d", ibt_status); 1905 ibd_rc_tx_cleanup(node); 1906 } 1907 1908 tx_head = NULL; 1909 for (;;) { 1910 if (tx_head == NULL) { 1911 mutex_enter(&chan->tx_post_lock); 1912 tx_head = chan->tx_head; 1913 if (tx_head == NULL) { 1914 chan->tx_busy = 0; 1915 mutex_exit(&chan->tx_post_lock); 1916 return; 1917 } 1918 chan->tx_head = NULL; 1919 mutex_exit(&chan->tx_post_lock); 1920 } 1921 1922 /* 1923 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs 1924 * at a time if possible, and keep posting them. 1925 */ 1926 for (n_wrs = 0, elem = tx_head; 1927 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); 1928 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 1929 nodes[n_wrs] = elem; 1930 wrs[n_wrs] = elem->w_swr; 1931 } 1932 tx_head = elem; 1933 1934 ASSERT(n_wrs != 0); 1935 1936 /* 1937 * If posting fails for some reason, we'll never receive 1938 * completion intimation, so we'll need to cleanup. But 1939 * we need to make sure we don't clean up nodes whose 1940 * wrs have been successfully posted. 
We assume that the 1941 * hca driver returns on the first failure to post and 1942 * therefore the first 'num_posted' entries don't need 1943 * cleanup here. 1944 */ 1945 num_posted = 0; 1946 ibt_status = ibt_post_send(chan->chan_hdl, 1947 wrs, n_wrs, &num_posted); 1948 if (ibt_status != IBT_SUCCESS) { 1949 ibd_print_warn(chan->state, "ibd_post_send: " 1950 "posting multiple wrs failed: " 1951 "requested=%d, done=%d, ret=%d", 1952 n_wrs, num_posted, ibt_status); 1953 1954 for (i = num_posted; i < n_wrs; i++) 1955 ibd_rc_tx_cleanup(nodes[i]); 1956 } 1957 } 1958 } 1959 1960 /* 1961 * Common code that deals with clean ups after a successful or 1962 * erroneous transmission attempt. 1963 */ 1964 void 1965 ibd_rc_tx_cleanup(ibd_swqe_t *swqe) 1966 { 1967 ibd_ace_t *ace = swqe->w_ahandle; 1968 ibd_state_t *state; 1969 1970 ASSERT(ace != NULL); 1971 ASSERT(ace->ac_chan != NULL); 1972 1973 state = ace->ac_chan->state; 1974 1975 /* 1976 * If this was a dynamic registration in ibd_send(), 1977 * deregister now. 1978 */ 1979 if (swqe->swqe_im_mblk != NULL) { 1980 ASSERT(swqe->w_buftype == IBD_WQE_MAPPED); 1981 if (swqe->w_buftype == IBD_WQE_MAPPED) { 1982 ibd_unmap_mem(state, swqe); 1983 } 1984 freemsg(swqe->swqe_im_mblk); 1985 swqe->swqe_im_mblk = NULL; 1986 } else { 1987 ASSERT(swqe->w_buftype != IBD_WQE_MAPPED); 1988 } 1989 1990 if (swqe->w_buftype == IBD_WQE_RC_COPYBUF) { 1991 ibd_rc_tx_largebuf_t *lbufp; 1992 1993 lbufp = swqe->w_rc_tx_largebuf; 1994 ASSERT(lbufp != NULL); 1995 1996 mutex_enter(&state->rc_tx_large_bufs_lock); 1997 lbufp->lb_next = state->rc_tx_largebuf_free_head; 1998 state->rc_tx_largebuf_free_head = lbufp; 1999 state->rc_tx_largebuf_nfree ++; 2000 mutex_exit(&state->rc_tx_large_bufs_lock); 2001 swqe->w_rc_tx_largebuf = NULL; 2002 } 2003 2004 2005 /* 2006 * Release the send wqe for reuse. 
2007 */ 2008 ibd_rc_release_swqe(ace->ac_chan, swqe); 2009 2010 /* 2011 * Drop the reference count on the AH; it can be reused 2012 * now for a different destination if there are no more 2013 * posted sends that will use it. This can be eliminated 2014 * if we can always associate each Tx buffer with an AH. 2015 * The ace can be null if we are cleaning up from the 2016 * ibd_send() error path. 2017 */ 2018 ibd_dec_ref_ace(state, ace); 2019 } 2020 2021 void 2022 ibd_rc_drain_scq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl) 2023 { 2024 ibd_state_t *state = chan->state; 2025 ibd_wqe_t *wqe; 2026 ibt_wc_t *wc, *wcs; 2027 uint_t numwcs, real_numwcs; 2028 int i; 2029 2030 wcs = chan->tx_wc; 2031 numwcs = IBD_RC_MAX_CQ_WC; 2032 2033 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) { 2034 for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) { 2035 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 2036 if (wc->wc_status != IBT_WC_SUCCESS) { 2037 chan->tx_trans_error_cnt ++; 2038 DPRINT(30, "ibd_rc_drain_scq: " 2039 "wc_status(%d) != SUCC, " 2040 "chan=%p, ace=%p, link_state=%d", 2041 wc->wc_status, chan, chan->ace, 2042 chan->state->id_link_state); 2043 } else { 2044 chan->tx_trans_error_cnt = 0; 2045 } 2046 ibd_rc_tx_cleanup(WQE_TO_SWQE(wqe)); 2047 } 2048 2049 mutex_enter(&state->id_sched_lock); 2050 if (state->id_sched_needed == 0) { 2051 mutex_exit(&state->id_sched_lock); 2052 } else if (state->id_sched_needed & IBD_RSRC_RC_SWQE) { 2053 mutex_enter(&chan->tx_wqe_list.dl_mutex); 2054 mutex_enter(&chan->tx_rel_list.dl_mutex); 2055 if ((chan->tx_rel_list.dl_cnt + 2056 chan->tx_wqe_list.dl_cnt) > IBD_RC_TX_FREE_THRESH) { 2057 state->id_sched_needed &= ~IBD_RSRC_RC_SWQE; 2058 mutex_exit(&chan->tx_rel_list.dl_mutex); 2059 mutex_exit(&chan->tx_wqe_list.dl_mutex); 2060 mutex_exit(&state->id_sched_lock); 2061 state->rc_swqe_mac_update++; 2062 mac_tx_update(state->id_mh); 2063 } else { 2064 state->rc_scq_no_swqe++; 2065 mutex_exit(&chan->tx_rel_list.dl_mutex); 2066 
mutex_exit(&chan->tx_wqe_list.dl_mutex); 2067 mutex_exit(&state->id_sched_lock); 2068 } 2069 } else if (state->id_sched_needed & IBD_RSRC_RC_TX_LARGEBUF) { 2070 mutex_enter(&state->rc_tx_large_bufs_lock); 2071 if (state->rc_tx_largebuf_nfree > 2072 IBD_RC_TX_FREE_THRESH) { 2073 ASSERT(state->rc_tx_largebuf_free_head != NULL); 2074 state->id_sched_needed &= 2075 ~IBD_RSRC_RC_TX_LARGEBUF; 2076 mutex_exit(&state->rc_tx_large_bufs_lock); 2077 mutex_exit(&state->id_sched_lock); 2078 state->rc_xmt_buf_mac_update++; 2079 mac_tx_update(state->id_mh); 2080 } else { 2081 state->rc_scq_no_largebuf++; 2082 mutex_exit(&state->rc_tx_large_bufs_lock); 2083 mutex_exit(&state->id_sched_lock); 2084 } 2085 } else if (state->id_sched_needed & IBD_RSRC_SWQE) { 2086 mutex_enter(&state->id_tx_list.dl_mutex); 2087 mutex_enter(&state->id_tx_rel_list.dl_mutex); 2088 if ((state->id_tx_list.dl_cnt + 2089 state->id_tx_rel_list.dl_cnt) 2090 > IBD_FREE_SWQES_THRESH) { 2091 state->id_sched_needed &= ~IBD_RSRC_SWQE; 2092 state->id_sched_cnt++; 2093 mutex_exit(&state->id_tx_rel_list.dl_mutex); 2094 mutex_exit(&state->id_tx_list.dl_mutex); 2095 mutex_exit(&state->id_sched_lock); 2096 mac_tx_update(state->id_mh); 2097 } else { 2098 mutex_exit(&state->id_tx_rel_list.dl_mutex); 2099 mutex_exit(&state->id_tx_list.dl_mutex); 2100 mutex_exit(&state->id_sched_lock); 2101 } 2102 } else { 2103 mutex_exit(&state->id_sched_lock); 2104 } 2105 } 2106 } 2107 2108 /* Send CQ handler, call ibd_rx_tx_cleanup to recycle Tx buffers */ 2109 /* ARGSUSED */ 2110 static void 2111 ibd_rc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 2112 { 2113 ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg; 2114 2115 chan->state->rc_scq_invoke++; 2116 2117 if (ibd_rc_tx_softintr == 1) { 2118 mutex_enter(&chan->tx_poll_lock); 2119 if (chan->tx_poll_busy & IBD_CQ_POLLING) { 2120 chan->tx_poll_busy |= IBD_REDO_CQ_POLLING; 2121 mutex_exit(&chan->tx_poll_lock); 2122 return; 2123 } else { 2124 mutex_exit(&chan->tx_poll_lock); 2125 
			ddi_trigger_softintr(chan->scq_softintr);
		}
	} else
		/* Drain synchronously in interrupt context */
		(void) ibd_rc_tx_recycle(arg);
}

/*
 * Soft-interrupt (or direct) handler that reclaims Tx resources of an RC
 * channel.  Serializes pollers of the send CQ with tx_poll_busy; a
 * concurrent invocation only sets IBD_REDO_CQ_POLLING and returns, and
 * the active poller loops again.  The drain / enable-notify / drain
 * sequence closes the race where a completion arrives between the first
 * drain and re-arming the CQ.
 *
 * If too many consecutive transport errors were seen (tx_trans_error_cnt
 * > 3), the channel is assumed broken: pull its ACE out of the active
 * cache and schedule an active-side close via ibd_rc_signal_act_close().
 *
 * Always returns DDI_INTR_CLAIMED.
 */
static uint_t
ibd_rc_tx_recycle(caddr_t arg)
{
	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
	ibd_ace_t *ace;
	ibd_state_t *state = chan->state;
	int flag, redo_flag;
	int redo = 1;

	flag = IBD_CQ_POLLING;
	redo_flag = IBD_REDO_CQ_POLLING;

	mutex_enter(&chan->tx_poll_lock);
	if (chan->tx_poll_busy & flag) {
		ibd_print_warn(state, "ibd_rc_tx_recycle: multiple polling "
		    "threads");
		chan->tx_poll_busy |= redo_flag;
		mutex_exit(&chan->tx_poll_lock);
		return (DDI_INTR_CLAIMED);
	}
	chan->tx_poll_busy |= flag;
	mutex_exit(&chan->tx_poll_lock);

	/*
	 * Poll for completed entries; the CQ will not interrupt any
	 * more for completed packets.
	 */
	ibd_rc_drain_scq(chan, chan->scq_hdl);

	/*
	 * Now enable CQ notifications; all completions originating now
	 * will cause new interrupts.
	 */
	do {
		if (ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION) !=
		    IBT_SUCCESS) {
			/*
			 * We do not expect a failure here.
			 */
			DPRINT(40, "ibd_rc_scq_handler: ibt_enable_cq_notify()"
			    " failed");
		}

		/* Catch completions that raced the re-arm above */
		ibd_rc_drain_scq(chan, chan->scq_hdl);

		if (chan->tx_trans_error_cnt > 3) {
			mutex_enter(&chan->tx_poll_lock);
			chan->tx_poll_busy = 0;
			mutex_exit(&chan->tx_poll_lock);
			goto error_reset_chan;
		}
		mutex_enter(&chan->tx_poll_lock);
		if (chan->tx_poll_busy & redo_flag)
			chan->tx_poll_busy &= ~redo_flag;
		else {
			chan->tx_poll_busy &= ~flag;
			redo = 0;
		}
		mutex_exit(&chan->tx_poll_lock);

	} while (redo);

	return (DDI_INTR_CLAIMED);

error_reset_chan:
	/*
	 * Channel being torn down.
	 */
	mutex_enter(&state->id_ac_mutex);
	if ((chan->chan_state == IBD_RC_STATE_ACT_ESTAB) &&
	    (chan->state->id_link_state == LINK_STATE_UP) &&
	    ((ace = ibd_acache_find(state, &chan->ace->ac_mac, B_FALSE, 0))
	    != NULL) && (ace == chan->ace)) {
		ASSERT(ace->ac_mce == NULL);
		INC_REF(ace, 1);
		IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
		chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
		mutex_exit(&state->id_ac_mutex);
		state->rc_reset_cnt++;
		DPRINT(30, "ibd_rc_tx_recycle(chan=%p, ace=%p): "
		    " reset RC channel", chan, chan->ace);
		ibd_rc_signal_act_close(state, ace);
	} else {
		mutex_exit(&state->id_ac_mutex);
		state->rc_act_close_simultaneous++;
		/*
		 * NOTE(review): if the first two conditions above
		 * short-circuit, ibd_acache_find() never ran and "ace"
		 * is read uninitialized by this DPRINT — debug-only,
		 * but worth initializing ace to NULL; confirm.
		 */
		DPRINT(40, "ibd_rc_tx_recycle: other thread is closing"
		    " it. chan=%p, act_state=%d, link_state=%d, ace=%p",
		    chan, chan->chan_state, state->id_link_state, ace);
	}
	return (DDI_INTR_CLAIMED);
}

/*
 * Register (or re-use) an IBTF service for the given service ID.
 * Services are kept on the global, refcounted ibd_gstate.ig_service_list
 * under ig_mutex, so multiple ibd instances sharing a SID share one
 * ibt_srv_hdl_t.  On first registration the entry is allocated
 * (KM_SLEEP) and pushed on the list head.
 */
static ibt_status_t
ibd_register_service(ibt_srv_desc_t *srv, ib_svc_id_t sid,
    int num_sids, ibt_srv_hdl_t *srv_hdl, ib_svc_id_t *ret_sid)
{
	ibd_service_t *p;
	ibt_status_t status;

	mutex_enter(&ibd_gstate.ig_mutex);
	for (p = ibd_gstate.ig_service_list; p != NULL; p = p->is_link) {
		if (p->is_sid == sid) {
			/* Already registered; bump the refcount */
			p->is_ref_cnt++;
			*srv_hdl = p->is_srv_hdl;
			*ret_sid = sid;
			mutex_exit(&ibd_gstate.ig_mutex);
			return (IBT_SUCCESS);
		}
	}
	status = ibt_register_service(ibd_gstate.ig_ibt_hdl, srv, sid,
	    num_sids, srv_hdl, ret_sid);
	if (status == IBT_SUCCESS) {
		p = kmem_alloc(sizeof (*p), KM_SLEEP);
		p->is_srv_hdl = *srv_hdl;
		p->is_sid = sid;
		p->is_ref_cnt = 1;
		p->is_link = ibd_gstate.ig_service_list;
		ibd_gstate.ig_service_list = p;
	}
	mutex_exit(&ibd_gstate.ig_mutex);
	return (status);
}

/*
 * Drop a reference on a service registered via ibd_register_service();
 * deregister with IBTF and free the list entry when the count hits zero.
 * Returns IBT_FAILURE if the handle is not on the list (unexpected).
 */
static ibt_status_t
ibd_deregister_service(ibt_srv_hdl_t srv_hdl)
{
	ibd_service_t *p, **pp;
	ibt_status_t status;

	mutex_enter(&ibd_gstate.ig_mutex);
	for (pp = &ibd_gstate.ig_service_list; *pp != NULL;
	    pp = &((*pp)->is_link)) {
		p = *pp;
		if (p->is_srv_hdl == srv_hdl) {	/* Found it */
			if (--p->is_ref_cnt == 0) {
				status = ibt_deregister_service(
				    ibd_gstate.ig_ibt_hdl, srv_hdl);
				*pp = p->is_link;	/* link prev to next */
				kmem_free(p, sizeof (*p));
			} else {
				status = IBT_SUCCESS;
			}
			mutex_exit(&ibd_gstate.ig_mutex);
			return (status);
		}
	}
	/* Should not ever get here */
	mutex_exit(&ibd_gstate.ig_mutex);
	return (IBT_FAILURE);
}

/*
 * Listen with corresponding service ID.
 *
 * Registers and binds two passive-side CM listeners, both dispatching to
 * ibd_rc_dispatch_pass_mad with "state" as cm_private:
 *   1. the standard SID derived from our QPN (IBD_RC_QPN_TO_SID), and
 *   2. the legacy-OFED SID (IBD_RC_QPN_TO_SID_OFED_INTEROP) — see the
 *      interop comment below.
 * On any failure, everything set up so far is unwound and the IBTF
 * status is returned; on success returns IBT_SUCCESS with
 * rc_listen_hdl/rc_listen_bind (and the _OFED_interop pair) populated.
 */
ibt_status_t
ibd_rc_listen(ibd_state_t *state)
{
	ibt_srv_desc_t srvdesc;
	ib_svc_id_t ret_sid;
	ibt_status_t status;
	ib_gid_t gid;

	if (state->rc_listen_hdl != NULL) {
		DPRINT(40, "ibd_rc_listen: rc_listen_hdl should be NULL");
		return (IBT_FAILURE);
	}

	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;

	/*
	 * Register the service with service id
	 * Incoming connection requests should arrive on this service id.
	 */
	status = ibd_register_service(&srvdesc,
	    IBD_RC_QPN_TO_SID(state->id_qpnum),
	    1, &state->rc_listen_hdl, &ret_sid);
	if (status != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_listen: Service Registration Failed, "
		    "ret=%d", status);
		return (status);
	}

	gid = state->id_sgid;

	/* pass state as cm_private */
	status = ibt_bind_service(state->rc_listen_hdl,
	    gid, NULL, state, &state->rc_listen_bind);
	if (status != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_listen:"
		    " fail to bind port: <%d>", status);
		(void) ibd_deregister_service(state->rc_listen_hdl);
		return (status);
	}

	/*
	 * Legacy OFED had used a wrong service ID (one additional zero digit)
	 * for many years. To interop with legacy OFED, we support this wrong
	 * service ID here.
	 */
	ASSERT(state->rc_listen_hdl_OFED_interop == NULL);

	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;

	/*
	 * Register the service with service id
	 * Incoming connection requests should arrive on this service id.
	 */
	status = ibd_register_service(&srvdesc,
	    IBD_RC_QPN_TO_SID_OFED_INTEROP(state->id_qpnum),
	    1, &state->rc_listen_hdl_OFED_interop, &ret_sid);
	if (status != IBT_SUCCESS) {
		DPRINT(40,
		    "ibd_rc_listen: Service Registration for Legacy OFED "
		    "Failed %d", status);
		/* Unwind the standard listener set up above */
		(void) ibt_unbind_service(state->rc_listen_hdl,
		    state->rc_listen_bind);
		(void) ibd_deregister_service(state->rc_listen_hdl);
		return (status);
	}

	gid = state->id_sgid;

	/* pass state as cm_private */
	status = ibt_bind_service(state->rc_listen_hdl_OFED_interop,
	    gid, NULL, state, &state->rc_listen_bind_OFED_interop);
	if (status != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_listen: fail to bind port: <%d> for "
		    "Legacy OFED listener", status);
		/* Unwind both registrations and the first bind */
		(void) ibd_deregister_service(
		    state->rc_listen_hdl_OFED_interop);
		(void) ibt_unbind_service(state->rc_listen_hdl,
		    state->rc_listen_bind);
		(void) ibd_deregister_service(state->rc_listen_hdl);
		return (status);
	}

	return (IBT_SUCCESS);
}

/*
 * Tear down both CM listeners created by ibd_rc_listen().  Each handle
 * is NULLed only if its deregistration succeeded, so a failed teardown
 * can be retried later.
 */
void
ibd_rc_stop_listen(ibd_state_t *state)
{
	int ret;

	/* Disable incoming connection requests */
	if (state->rc_listen_hdl != NULL) {
		ret = ibt_unbind_all_services(state->rc_listen_hdl);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen:"
			    "ibt_unbind_all_services() failed, ret=%d", ret);
		}
		ret = ibd_deregister_service(state->rc_listen_hdl);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen:"
			    "ibd_deregister_service() failed, ret=%d", ret);
		} else {
			state->rc_listen_hdl = NULL;
		}
	}

	/* Disable incoming connection requests */
	if (state->rc_listen_hdl_OFED_interop != NULL) {
		ret = ibt_unbind_all_services(
		    state->rc_listen_hdl_OFED_interop);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen:"
			    "ibt_unbind_all_services() failed: %d", ret);
		}
		ret =
		    ibd_deregister_service(state->rc_listen_hdl_OFED_interop);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen:"
			    "ibd_deregister_service() failed: %d", ret);
		} else {
			state->rc_listen_hdl_OFED_interop = NULL;
		}
	}
}

/*
 * Close every RC channel of this instance: first quiesce all passive Rx
 * CQ handlers, wait (bounded) for SRQ receive buffers loaned to the
 * network layer to drain, then close all passive channels and finally
 * all active channels (collected from the active ACE cache onto
 * rc_obs_act_chan_list).
 */
void
ibd_rc_close_all_chan(ibd_state_t *state)
{
	ibd_rc_chan_t *rc_chan;
	ibd_ace_t *ace;
	uint_t attempts;

	/* Disable all Rx routines */
	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
	rc_chan = state->rc_pass_chan_list.chan_list;
	while (rc_chan != NULL) {
		ibt_set_cq_handler(rc_chan->rcq_hdl, 0, 0);
		rc_chan = rc_chan->next;
	}
	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);

	if (state->rc_enable_srq) {
		/* Wait up to 10 x 100ms for loaned Rx buffers to return */
		attempts = 10;
		while (state->rc_srq_rwqe_list.dl_bufs_outstanding > 0) {
			DPRINT(30, "ibd_rc_close_all_chan: outstanding > 0");
			delay(drv_usectohz(100000));
			if (--attempts == 0) {
				/*
				 * There are pending bufs with the network
				 * layer and we have no choice but to wait
				 * for them to be done with. Reap all the
				 * Tx/Rx completions that were posted since
				 * we turned off the notification and
				 * return failure.
				 */
				break;
			}
		}
	}

	/* Close all passive RC channels */
	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
	while (rc_chan != NULL) {
		(void) ibd_rc_pas_close(rc_chan);
		rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
	}

	/* Close all active RC channels */
	mutex_enter(&state->id_ac_mutex);
	ace = list_head(&state->id_ah_active);
	while (ace != NULL) {
		if (ace->ac_chan != NULL) {
			ibd_rc_add_to_chan_list(&state->rc_obs_act_chan_list,
			    ace->ac_chan);
		}
		ace = list_next(&state->id_ah_active, ace);
	}
	mutex_exit(&state->id_ac_mutex);

	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_obs_act_chan_list);
	while (rc_chan != NULL) {
		ace = rc_chan->ace;
		ibd_rc_act_close(rc_chan);
		if (ace != NULL)
			ace->ac_chan = NULL;
		rc_chan = ibd_rc_rm_header_chan_list(
		    &state->rc_obs_act_chan_list);
	}
}

/*
 * Active-side connect with retries: try the legacy-OFED service ID
 * twice (pausing 10ms between attempts so the peer can clean up a stale
 * channel), then fall back to the standard IETF service ID once.  The
 * final attempt's status is intentionally discarded; success is visible
 * to callers via ace->ac_chan.
 */
void
ibd_rc_try_connect(ibd_state_t *state, ibd_ace_t *ace, ibt_path_info_t *path)
{
	ibt_status_t status;

	status = ibd_rc_connect(state, ace, path,
	    IBD_RC_SERVICE_ID_OFED_INTEROP);

	if (status != IBT_SUCCESS) {
		/* wait peer side remove stale channel */
		delay(drv_usectohz(10000));
		status = ibd_rc_connect(state, ace, path,
		    IBD_RC_SERVICE_ID_OFED_INTEROP);
	}

	if (status != IBT_SUCCESS) {
		/* wait peer side remove stale channel */
		delay(drv_usectohz(10000));
		(void) ibd_rc_connect(state, ace, path,
		    IBD_RC_SERVICE_ID);
	}
}

/*
 * Allocates channel and sets the ace->ac_chan to it.
 * Opens the channel.
 */
ibt_status_t
ibd_rc_connect(ibd_state_t *state, ibd_ace_t *ace, ibt_path_info_t *path,
    uint64_t ietf_cm_service_id)
{
	ibt_status_t status = 0;
	ibt_rc_returns_t open_returns;
	ibt_chan_open_args_t open_args;
	ibd_rc_msg_hello_t hello_req_msg;	/* our QPN/MTU, sent in REQ */
	ibd_rc_msg_hello_t *hello_ack_msg;	/* peer's reply, from REP */
	ibd_rc_chan_t *chan;

	ASSERT(ace != NULL);
	ASSERT(ace->ac_mce == NULL);
	ASSERT(ace->ac_chan == NULL);

	if ((status = ibd_rc_alloc_chan(&chan, state, B_TRUE)) != IBT_SUCCESS) {
		DPRINT(10, "ibd_rc_connect: ibd_rc_alloc_chan() failed");
		return (status);
	}

	ace->ac_chan = chan;
	chan->state = state;
	chan->ace = ace;

	/* CM callbacks recover the ACE from the channel private */
	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)ace);

	hello_ack_msg = kmem_zalloc(sizeof (ibd_rc_msg_hello_t), KM_SLEEP);

	/*
	 * open the channels
	 */
	bzero(&open_args, sizeof (ibt_chan_open_args_t));
	bzero(&open_returns, sizeof (ibt_rc_returns_t));

	open_args.oc_cm_handler = ibd_rc_dispatch_actv_mad;
	open_args.oc_cm_clnt_private = (void *)(uintptr_t)ace;

	/*
	 * update path record with the SID
	 */
	path->pi_sid =
	    ietf_cm_service_id | ((ace->ac_dest->ud_dst_qpn) & 0xffffff);


	/* pre-allocate memory for hello ack message */
	open_returns.rc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
	open_returns.rc_priv_data = hello_ack_msg;

	open_args.oc_path = path;

	open_args.oc_path_rnr_retry_cnt = 7;
	open_args.oc_path_retry_cnt = 7;

	/* We don't do RDMA */
	open_args.oc_rdma_ra_out = 0;
	open_args.oc_rdma_ra_in = 0;

	/* Hello request carries our QPN and RC MTU in network order */
	hello_req_msg.reserved_qpn = htonl(state->id_qpnum);
	hello_req_msg.rx_mtu = htonl(state->rc_mtu);
	open_args.oc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
	open_args.oc_priv_data = (void *)(&hello_req_msg);

	ASSERT(open_args.oc_priv_data_len <= IBT_REQ_PRIV_DATA_SZ);
	ASSERT(open_returns.rc_priv_data_len <= IBT_REP_PRIV_DATA_SZ);
	ASSERT(open_args.oc_cm_handler != NULL);

	/* Blocking open: returns after the CM exchange completes */
	status = ibt_open_rc_channel(chan->chan_hdl, IBT_OCHAN_NO_FLAGS,
	    IBT_BLOCKING, &open_args, &open_returns);

	if (status == IBT_SUCCESS) {
		/* Success! */
		DPRINT(2, "ibd_rc_connect: call ibt_open_rc_channel succ!");
		state->rc_conn_succ++;
		kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
		return (IBT_SUCCESS);
	}

	/* failure */
	(void) ibt_flush_channel(chan->chan_hdl);
	ibd_rc_free_chan(chan);
	ace->ac_chan = NULL;

	/* check open_returns report error and exit */
	DPRINT(30, "ibd_rc_connect: call ibt_open_rc_chan fail."
	    "ret status = %d, reason=%d, ace=%p, mtu=0x%x, qpn=0x%x,"
	    " peer qpn=0x%x", status, (int)open_returns.rc_status, ace,
	    hello_req_msg.rx_mtu, hello_req_msg.reserved_qpn,
	    ace->ac_dest->ud_dst_qpn);
	kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
	return (status);
}

/*
 * Ask the async worker to close an active-side channel.  If no request
 * structure can be allocated (KM_NOSLEEP), fall back to parking the
 * channel on rc_obs_act_chan_list so it is reaped later.
 */
void
ibd_rc_signal_act_close(ibd_state_t *state, ibd_ace_t *ace)
{
	ibd_req_t *req;

	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
	if (req == NULL) {
		ibd_print_warn(state, "ibd_rc_signal_act_close: alloc "
		    "ibd_req_t fail");
		mutex_enter(&state->rc_obs_act_chan_list.chan_list_mutex);
		ace->ac_chan->next = state->rc_obs_act_chan_list.chan_list;
		state->rc_obs_act_chan_list.chan_list = ace->ac_chan;
		mutex_exit(&state->rc_obs_act_chan_list.chan_list_mutex);
	} else {
		req->rq_ptr = ace->ac_chan;
		ibd_queue_work_slot(state, req, IBD_ASYNC_RC_CLOSE_ACT_CHAN);
	}
}

/*
 * Ask the async worker to recycle an ACE.  Only one recycle may be in
 * flight (rc_ace_recycle); if one already is, or the request allocation
 * fails, this is a silent no-op (best effort).
 */
void
ibd_rc_signal_ace_recycle(ibd_state_t *state, ibd_ace_t *ace)
{
	ibd_req_t *req;

	mutex_enter(&state->rc_ace_recycle_lock);
	if (state->rc_ace_recycle != NULL) {
		mutex_exit(&state->rc_ace_recycle_lock);
		return;
	}

	req =
	    kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
	if (req == NULL) {
		mutex_exit(&state->rc_ace_recycle_lock);
		return;
	}

	state->rc_ace_recycle = ace;
	mutex_exit(&state->rc_ace_recycle_lock);
	ASSERT(ace->ac_mce == NULL);
	INC_REF(ace, 1);
	IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
	req->rq_ptr = ace;
	ibd_queue_work_slot(state, req, IBD_ASYNC_RC_RECYCLE_ACE);
}

/*
 * Close an active-side RC channel according to its state.  For an
 * established (or already-closing) channel: stop the Rx CQ handler,
 * wait — up to 50 x 100ms, draining the send CQ ourselves when no other
 * poller holds tx_poll_busy — for all SWQEs to return to the free/rel
 * lists, then stop the Tx CQ handler, close the channel with
 * IBT_BLOCKING|IBT_NOCALLBACKS, and free it.
 */
static void
ibd_rc_act_close(ibd_rc_chan_t *chan)
{
	uint_t times;
	ibt_status_t ret;

	ASSERT(chan != NULL);

	chan->state->rc_act_close++;
	switch (chan->chan_state) {
	case IBD_RC_STATE_ACT_CLOSING:	/* stale, close it */
	case IBD_RC_STATE_ACT_ESTAB:
		DPRINT(30, "ibd_rc_act_close-1: close and free chan, "
		    "act_state=%d, chan=%p", chan->chan_state, chan);
		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
		/* Wait send queue empty */
		times = 0;
		mutex_enter(&chan->tx_wqe_list.dl_mutex);
		mutex_enter(&chan->tx_rel_list.dl_mutex);
		/* All SWQEs are back when free + rel counts == scq_size */
		while (((chan->tx_wqe_list.dl_cnt + chan->tx_rel_list.dl_cnt)
		    != chan->scq_size) && (times < 50)) {
			DPRINT(30, "ibd_rc_act_close: dl_cnt(tx_wqe_list=%d,"
			    " tx_rel_list=%d) != chan->scq_size=%d",
			    chan->tx_wqe_list.dl_cnt, chan->tx_rel_list.dl_cnt,
			    chan->scq_size);
			mutex_exit(&chan->tx_rel_list.dl_mutex);
			mutex_exit(&chan->tx_wqe_list.dl_mutex);
			mutex_enter(&chan->tx_poll_lock);
			if (chan->tx_poll_busy & IBD_CQ_POLLING) {
				DPRINT(40, "ibd_rc_act_close: multiple "
				    "polling threads");
				mutex_exit(&chan->tx_poll_lock);
			} else {
				/* No poller active; drain the CQ ourselves */
				chan->tx_poll_busy = IBD_CQ_POLLING;
				mutex_exit(&chan->tx_poll_lock);
				ibd_rc_drain_scq(chan, chan->scq_hdl);
				mutex_enter(&chan->tx_poll_lock);
				chan->tx_poll_busy = 0;
				mutex_exit(&chan->tx_poll_lock);
			}
			delay(drv_usectohz(100000));
			times++;
			mutex_enter(&chan->tx_wqe_list.dl_mutex);
			mutex_enter(&chan->tx_rel_list.dl_mutex);
		}
		mutex_exit(&chan->tx_rel_list.dl_mutex);
		mutex_exit(&chan->tx_wqe_list.dl_mutex);
		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
		ret = ibt_close_rc_channel(chan->chan_hdl,
		    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ibd_rc_act_close-2: ibt_close_rc_channel "
			    "fail, chan=%p, returned=%d", chan, ret);
		} else {
			DPRINT(30, "ibd_rc_act_close-2: ibt_close_rc_channel "
			    "succ, chan=%p", chan);
		}

		ibd_rc_free_chan(chan);
		break;
	case IBD_RC_STATE_ACT_REP_RECV:
		/* REP seen but never established; just flush and free */
		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
		(void) ibt_flush_channel(chan->chan_hdl);
		ibd_rc_free_chan(chan);
		break;
	case IBD_RC_STATE_ACT_ERROR:
		DPRINT(40, "ibd_rc_act_close: IBD_RC_STATE_ERROR branch");
		break;
	default:
		DPRINT(40, "ibd_rc_act_close: default branch, act_state=%d, "
		    "chan=%p", chan->chan_state, chan);
	}
}

/*
 * Close a passive-side RC channel.  Returns DDI_SUCCESS when the channel
 * was closed and freed; DDI_FAILURE when Rx buffers loaned upstream did
 * not drain in time (in which case the Rx handler is re-armed and the
 * channel is left intact for the caller to retry).
 */
static int
ibd_rc_pas_close(ibd_rc_chan_t *chan)
{
	uint_t times;
	ibt_status_t ret;

	ASSERT(chan != NULL);
	chan->state->rc_pas_close++;

	switch (chan->chan_state) {
	case IBD_RC_STATE_PAS_ESTAB:
		/*
		 * First, stop receive interrupts; this stops the
		 * connection from handing up buffers to higher layers.
		 * Wait for receive buffers to be returned; give up
		 * after 5 seconds.
		 */
		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
		if (!chan->state->rc_enable_srq) {
			/* 50 x 100ms = the 5 second budget noted above */
			times = 50;
			while (chan->rx_wqe_list.dl_bufs_outstanding > 0) {
				delay(drv_usectohz(100000));
				if (--times == 0) {
					DPRINT(40, "ibd_rc_pas_close : "
					    "reclaiming failed");
					/* Re-arm Rx and report failure */
					ibd_rc_poll_rcq(chan, chan->rcq_hdl);
					ibt_set_cq_handler(chan->rcq_hdl,
					    ibd_rc_rcq_handler,
					    (void *)(uintptr_t)chan);
					return (DDI_FAILURE);
				}
			}
		}
		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
		DPRINT(30, "ibd_rc_pas_close-1: close and free chan, "
		    "chan_state=%d, chan=%p", chan->chan_state, chan);
		ret = ibt_close_rc_channel(chan->chan_hdl,
		    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ibd_rc_pas_close-2: ibt_close_rc_channel()"
			    " fail, chan=%p, returned=%d", chan, ret);
		} else {
			DPRINT(30, "ibd_rc_pas_close-2: ibt_close_rc_channel()"
			    " succ, chan=%p", chan);
		}

		ibd_rc_free_chan(chan);
		break;
	case IBD_RC_STATE_PAS_REQ_RECV:
		/* REQ seen but never established; just flush and free */
		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
		(void) ibt_flush_channel(chan->chan_hdl);
		ibd_rc_free_chan(chan);
		break;
	default:
		DPRINT(40, "ibd_rc_pas_close: default, chan_state=%d, chan=%p",
		    chan->chan_state, chan);
	}
	return (DDI_SUCCESS);
}

/*
 * Remove duplicate RC channel which comes from the same mac
 *
 * From the IP point of view, we could check for same MAC:
 * GID, P_Key (or QPN, though in a reboot this is likely to
 * change so P_Key is better). The GID usually will equate to
 * port (since typically it uses the port GUID in the low 64 bits).
 * These fields exists in the REQ messages.
 */
void
ibd_rc_handle_req_rm_dup(ibd_state_t *state, ibt_cm_event_t *ibt_cm_event)
{
	ibd_rc_chan_t *chan, *pre_chan;

	/* Unlink (under the list mutex) any channel matching GID + P_Key */
	pre_chan = NULL;
	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
	chan = state->rc_pass_chan_list.chan_list;
	while (chan != NULL) {
		if ((bcmp(&chan->requester_gid,
		    &ibt_cm_event->cm_event.req.req_prim_addr.av_dgid,
		    sizeof (ib_gid_t)) == 0) && (chan->requester_pkey ==
		    ibt_cm_event->cm_event.req.req_pkey)) {
			if (pre_chan == NULL) {
				state->rc_pass_chan_list.chan_list = chan->next;
			} else {
				pre_chan->next = chan->next;
			}
			break;
		}
		pre_chan = chan;
		chan = chan->next;
	}
	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
	if (chan) {
		DPRINT(30, "ibd_rc_handle_req_rm_dup: same gid and pkey, "
		    "remove duplicate channal, chan=%p", chan);
		/* If the close couldn't drain Rx buffers, put it back */
		if (ibd_rc_pas_close(chan) != DDI_SUCCESS) {
			ibd_rc_add_to_chan_list(&state->rc_pass_chan_list,
			    chan);
		}
	}
}

/*
 * Passive Side:
 * Handle an incoming CM REQ from active side.
 *
 * If success, this function allocates an ibd_rc_chan_t, then
 * assigns it to "*ret_conn".
 */
static ibt_cm_status_t
ibd_rc_handle_req(void *arg, ibd_rc_chan_t **ret_conn,
    ibt_cm_event_t *ibt_cm_event, ibt_cm_return_args_t *ret_args,
    void *ret_priv_data)
{
	ibd_rc_msg_hello_t *hello_msg;
	ibd_state_t *state = (ibd_state_t *)arg;
	ibd_rc_chan_t *chan;

	/* Drop any stale channel from the same requester (GID + P_Key) */
	ibd_rc_handle_req_rm_dup(state, ibt_cm_event);

	if (ibd_rc_alloc_chan(&chan, state, B_FALSE) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_handle_req: ibd_rc_alloc_chan() failed");
		return (IBT_CM_REJECT);
	}

	ibd_rc_add_to_chan_list(&state->rc_pass_chan_list, chan);

	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)chan);

	if (!state->rc_enable_srq) {
		/* Per-channel Rx list only when not using a shared RQ */
		if (ibd_rc_init_rxlist(chan) != DDI_SUCCESS) {
			ibd_rc_free_chan(chan);
			DPRINT(40, "ibd_rc_handle_req: ibd_rc_init_rxlist() "
			    "failed");
			return (IBT_CM_REJECT);
		}
	}

	ret_args->cm_ret.rep.cm_channel = chan->chan_hdl;

	/* We don't do RDMA */
	ret_args->cm_ret.rep.cm_rdma_ra_out = 0;
	ret_args->cm_ret.rep.cm_rdma_ra_in = 0;

	ret_args->cm_ret.rep.cm_rnr_retry_cnt = 7;
	ret_args->cm_ret_len = sizeof (ibd_rc_msg_hello_t);

	hello_msg = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
	DPRINT(30, "ibd_rc_handle_req(): peer qpn=0x%x, peer mtu=0x%x",
	    ntohl(hello_msg->reserved_qpn), ntohl(hello_msg->rx_mtu));

	/* Fill the REP private data with our own QPN and RC MTU */
	hello_msg = (ibd_rc_msg_hello_t *)ret_priv_data;
	hello_msg->reserved_qpn = htonl(state->id_qpnum);
	hello_msg->rx_mtu = htonl(state->rc_mtu);

	/* Remember the requester so future duplicate REQs can be matched */
	chan->requester_gid = ibt_cm_event->cm_event.req.req_prim_addr.av_dgid;
	chan->requester_pkey = ibt_cm_event->cm_event.req.req_pkey;
	chan->chan_state = IBD_RC_STATE_PAS_REQ_RECV;	/* ready to receive */
	*ret_conn = chan;

	return (IBT_CM_ACCEPT);
}

/*
 * ibd_rc_handle_act_estab -- handler for connection established completion
 * for active side.  Moves the channel from ACT_REP_RECV to ACT_ESTAB and
 * arms the Rx CQ; any other state, or a failure to arm, rejects.
 */
static ibt_cm_status_t
ibd_rc_handle_act_estab(ibd_ace_t *ace)
{
	ibt_status_t result;

	switch (ace->ac_chan->chan_state) {
	case IBD_RC_STATE_ACT_REP_RECV:
		ace->ac_chan->chan_state = IBD_RC_STATE_ACT_ESTAB;
		result = ibt_enable_cq_notify(ace->ac_chan->rcq_hdl,
		    IBT_NEXT_COMPLETION);
		if (result != IBT_SUCCESS) {
			DPRINT(40, "ibd_rc_handle_act_estab: "
			    "ibt_enable_cq_notify(rcq) "
			    "failed: status %d", result);
			return (IBT_CM_REJECT);
		}
		break;
	default:
		DPRINT(40, "ibd_rc_handle_act_estab: default "
		    "branch, act_state=%d", ace->ac_chan->chan_state);
		return (IBT_CM_REJECT);
	}
	return (IBT_CM_ACCEPT);
}

/*
 * ibd_rc_handle_pas_estab -- handler for connection established completion
 * for passive side.  Moves the channel from PAS_REQ_RECV to PAS_ESTAB and
 * arms the Rx CQ; any other state, or a failure to arm, rejects.
 */
static ibt_cm_status_t
ibd_rc_handle_pas_estab(ibd_rc_chan_t *chan)
{
	ibt_status_t result;

	switch (chan->chan_state) {
	case IBD_RC_STATE_PAS_REQ_RECV:
		chan->chan_state = IBD_RC_STATE_PAS_ESTAB;

		result = ibt_enable_cq_notify(chan->rcq_hdl,
		    IBT_NEXT_COMPLETION);
		if (result != IBT_SUCCESS) {
			DPRINT(40, "ibd_rc_handle_pas_estab: "
			    "ibt_enable_cq_notify(rcq) "
			    "failed: status %d", result);
			return (IBT_CM_REJECT);
		}
		break;
	default:
		DPRINT(40, "ibd_rc_handle_pas_estab: default "
		    "branch, chan_state=%d", chan->chan_state);
		return (IBT_CM_REJECT);
	}
	return (IBT_CM_ACCEPT);
}

/*
 * Active-side CM event dispatcher; "arg" is the ACE passed as
 * oc_cm_clnt_private by ibd_rc_connect().
 */
/* ARGSUSED */
static ibt_cm_status_t
ibd_rc_dispatch_actv_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
    ibt_cm_return_args_t *ret_args, void *ret_priv_data,
    ibt_priv_data_len_t ret_len_max)
{
	ibt_cm_status_t result = IBT_CM_ACCEPT;
	ibd_ace_t *ace = (ibd_ace_t *)(uintptr_t)arg;
	ibd_rc_chan_t *rc_chan;
	ibd_state_t *state;
	ibd_rc_msg_hello_t *hello_ack;
	uint_t times;

	switch
	    (ibt_cm_event->cm_type) {
	case IBT_CM_EVENT_REP_RCV:
		/* Peer's REP carries its hello (QPN/MTU) private data */
		ASSERT(ace->ac_chan != NULL);
		ASSERT(ace->ac_chan->chan_state == IBD_RC_STATE_INIT);
		hello_ack = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
		DPRINT(30, "ibd_rc_handle_rep: hello_ack->mtu=0x%x, "
		    "hello_ack->qpn=0x%x", ntohl(hello_ack->rx_mtu),
		    ntohl(hello_ack->reserved_qpn));
		ace->ac_chan->chan_state = IBD_RC_STATE_ACT_REP_RECV;
		break;

	case IBT_CM_EVENT_CONN_EST:
		ASSERT(ace->ac_chan != NULL);
		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_CONN_EST, "
		    "ace=%p, act_state=%d, chan=%p",
		    ace, ace->ac_chan->chan_state, ace->ac_chan);
		result = ibd_rc_handle_act_estab(ace);
		break;

	case IBT_CM_EVENT_CONN_CLOSED:
		rc_chan = ace->ac_chan;
		if (rc_chan == NULL) {
			DPRINT(40, "ibd_rc_dispatch_actv_mad: "
			    "rc_chan==NULL, IBT_CM_EVENT_CONN_CLOSED");
			return (IBT_CM_ACCEPT);
		}
		state = rc_chan->state;
		/*
		 * Claim the teardown: re-look up the ACE under id_ac_mutex
		 * and proceed only if it is still the one bound to this
		 * channel; otherwise another thread owns the close.
		 */
		mutex_enter(&state->id_ac_mutex);
		if ((rc_chan->chan_state == IBD_RC_STATE_ACT_ESTAB) &&
		    ((ace = ibd_acache_find(state, &ace->ac_mac, B_FALSE, 0))
		    != NULL) && (ace == rc_chan->ace)) {
			rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
			ASSERT(ace->ac_mce == NULL);
			INC_REF(ace, 1);
			IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
			mutex_exit(&state->id_ac_mutex);
			DPRINT(30, "ibd_rc_dispatch_actv_mad: "
			    "IBT_CM_EVENT_CONN_CLOSED, ace=%p, chan=%p, "
			    "reason=%d", ace, rc_chan,
			    ibt_cm_event->cm_event.closed);
		} else {
			mutex_exit(&state->id_ac_mutex);
			state->rc_act_close_simultaneous++;
			DPRINT(40, "ibd_rc_dispatch_actv_mad: other thread "
			    "is closing it, IBT_CM_EVENT_CONN_CLOSED, "
			    "chan_state=%d", rc_chan->chan_state);
			return (IBT_CM_ACCEPT);
		}
		/* wait until the send queue clean */
		times = 0;
		mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
		mutex_enter(&rc_chan->tx_rel_list.dl_mutex);
		/* Same drain-until-all-SWQEs-return loop as ibd_rc_act_close */
		while (((rc_chan->tx_wqe_list.dl_cnt +
		    rc_chan->tx_rel_list.dl_cnt)
		    != rc_chan->scq_size) && (times < 50)) {
			DPRINT(40, "ibd_rc_dispatch_act_mad: dl_cnt"
			    "(tx_wqe_list=%d, tx_rel_list=%d) != "
			    "chan->scq_size=%d",
			    rc_chan->tx_wqe_list.dl_cnt,
			    rc_chan->tx_rel_list.dl_cnt,
			    rc_chan->scq_size);
			mutex_exit(&rc_chan->tx_rel_list.dl_mutex);
			mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
			mutex_enter(&rc_chan->tx_poll_lock);
			if (rc_chan->tx_poll_busy & IBD_CQ_POLLING) {
				DPRINT(40, "ibd_rc_dispatch_actv_mad: "
				    "multiple polling threads");
				mutex_exit(&rc_chan->tx_poll_lock);
			} else {
				rc_chan->tx_poll_busy = IBD_CQ_POLLING;
				mutex_exit(&rc_chan->tx_poll_lock);
				ibd_rc_drain_scq(rc_chan, rc_chan->scq_hdl);
				mutex_enter(&rc_chan->tx_poll_lock);
				rc_chan->tx_poll_busy = 0;
				mutex_exit(&rc_chan->tx_poll_lock);
			}
			delay(drv_usectohz(100000));
			times++;
			mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
			mutex_enter(&rc_chan->tx_rel_list.dl_mutex);
		}
		mutex_exit(&rc_chan->tx_rel_list.dl_mutex);
		mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
		rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
		ibd_rc_free_chan(rc_chan);
		DPRINT(30, "ibd_rc_dispatch_actv_mad: "
		    "IBT_CM_EVENT_CONN_CLOSED, ref=%x", ace->ac_ref);
		/* Drop the reference taken above and recycle the ACE */
		mutex_enter(&state->id_ac_mutex);
		ace->ac_chan = NULL;
		ASSERT(ace->ac_ref != 0);
		atomic_dec_32(&ace->ac_ref);
		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
			IBD_ACACHE_INSERT_FREE(state, ace);
			ace->ac_ref = 0;
		} else {
			/* Still referenced; mark for delayed recycling */
			ace->ac_ref |= CYCLEVAL;
			state->rc_delay_ace_recycle++;
		}
		mutex_exit(&state->id_ac_mutex);
		break;

	case IBT_CM_EVENT_FAILURE:
		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_FAILURE,"
		    "ace=%p, chan=%p, code: %d, msg: %d, reason=%d",
		    ace, ace->ac_chan,
		    ibt_cm_event->cm_event.failed.cf_code,
		    ibt_cm_event->cm_event.failed.cf_msg,
		    ibt_cm_event->cm_event.failed.cf_reason);
		/*
		 * Don't need free resource here. The resource is freed
		 * at function ibd_rc_connect()
		 */
		break;

	case IBT_CM_EVENT_MRA_RCV:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_MRA_RCV");
		break;
	case IBT_CM_EVENT_LAP_RCV:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: LAP message received");
		break;
	case IBT_CM_EVENT_APR_RCV:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: APR message received");
		break;
	default:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: default branch, "
		    "ibt_cm_event->cm_type=%d", ibt_cm_event->cm_type);
		break;
	}

	return (result);
}

/*
 * Passive-side CM event dispatcher for the listeners registered in
 * ibd_rc_listen(); "arg" is the ibd_state_t passed as cm_private to
 * ibt_bind_service().  REQ events create a channel via
 * ibd_rc_handle_req(); later events recover the channel from the
 * channel private set there.
 */
/* ARGSUSED */
static ibt_cm_status_t
ibd_rc_dispatch_pass_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
    ibt_cm_return_args_t *ret_args, void *ret_priv_data,
    ibt_priv_data_len_t ret_len_max)
{
	ibt_cm_status_t result = IBT_CM_ACCEPT;
	ibd_rc_chan_t *chan;

	if (ibt_cm_event->cm_type == IBT_CM_EVENT_REQ_RCV) {
		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_REQ_RCV,"
		    "req_pkey=%x", ibt_cm_event->cm_event.req.req_pkey);
		/* Receive an incoming CM REQ from active side */
		result = ibd_rc_handle_req(arg, &chan, ibt_cm_event, ret_args,
		    ret_priv_data);
		return (result);
	}

	if (ibt_cm_event->cm_channel == 0) {
		DPRINT(30, "ibd_rc_dispatch_pass_mad: "
		    "ERROR ibt_cm_event->cm_channel == 0");
		return (IBT_CM_REJECT);
	}

	chan =
	    (ibd_rc_chan_t *)ibt_get_chan_private(ibt_cm_event->cm_channel);
	if (chan == NULL) {
		DPRINT(40, "ibd_rc_dispatch_pass_mad: conn == 0");
		return (IBT_CM_REJECT);
	}

	switch (ibt_cm_event->cm_type) {
	case IBT_CM_EVENT_CONN_EST:
		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_EST, "
		    "chan=%p", chan);
		result = ibd_rc_handle_pas_estab(chan);
		break;
	case IBT_CM_EVENT_CONN_CLOSED:
		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_CLOSED,"
		    " chan=%p, reason=%d", chan, ibt_cm_event->cm_event.closed);
		ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list, chan);
		ibd_rc_free_chan(chan);
		break;
	case IBT_CM_EVENT_FAILURE:
		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_FAILURE,"
		    " chan=%p, code: %d, msg: %d, reason=%d", chan,
		    ibt_cm_event->cm_event.failed.cf_code,
		    ibt_cm_event->cm_event.failed.cf_msg,
		    ibt_cm_event->cm_event.failed.cf_reason);

		ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list, chan);
		ibd_rc_free_chan(chan);
		return (IBT_CM_ACCEPT);
	case IBT_CM_EVENT_MRA_RCV:
		DPRINT(40, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_MRA_RCV");
		break;
	case IBT_CM_EVENT_LAP_RCV:
		DPRINT(40, "ibd_rc_dispatch_pass_mad: LAP message received");
		break;
	case IBT_CM_EVENT_APR_RCV:
		DPRINT(40, "ibd_rc_dispatch_pass_mad: APR message received");
		break;
	default:
		DPRINT(40, "ibd_rc_dispatch_pass_mad: default, type=%d, "
		    "chan=%p", ibt_cm_event->cm_type, chan);
		break;
	}

	return (result);
}