1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* 26 * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved. 27 * 28 * This software is available to you under a choice of one of two 29 * licenses. You may choose to be licensed under the terms of the GNU 30 * General Public License (GPL) Version 2, available from the file 31 * COPYING in the main directory of this source tree, or the 32 * OpenIB.org BSD license below: 33 * 34 * Redistribution and use in source and binary forms, with or 35 * without modification, are permitted provided that the following 36 * conditions are met: 37 * 38 * - Redistributions of source code must retain the above 39 * copyright notice, this list of conditions and the following 40 * disclaimer. 41 * 42 * - Redistributions in binary form must reproduce the above 43 * copyright notice, this list of conditions and the following 44 * disclaimer in the documentation and/or other materials 45 * provided with the distribution. 
46 * 47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 54 * SOFTWARE. 55 * 56 */ 57 /* 58 * Sun elects to include this software in Sun product 59 * under the OpenIB BSD license. 60 * 61 * 62 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 63 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 65 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 66 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 67 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 68 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 69 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 70 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 71 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 72 * POSSIBILITY OF SUCH DAMAGE. 
 */

#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ib/clients/rds/rdsib_cm.h>
#include <sys/ib/clients/rds/rdsib_ib.h>
#include <sys/ib/clients/rds/rdsib_buf.h>
#include <sys/ib/clients/rds/rdsib_ep.h>
#include <sys/ib/clients/rds/rds_kstat.h>

/* IBTF async event handler registered via rds_ib_modinfo below */
static void rds_async_handler(void *clntp, ibt_hca_hdl_t hdl,
    ibt_async_code_t code, ibt_async_event_t *event);

/* IBTF client registration information for the RDS driver */
static struct ibt_clnt_modinfo_s rds_ib_modinfo = {
	IBTI_V_CURR,
	IBT_NETWORK,
	rds_async_handler,
	NULL,
	"RDS"
};

/* performance tunables */
uint_t rds_no_interrupts = 0;		/* non-zero: poll CQs, limit WC loops */
uint_t rds_poll_percent_full = 25;
uint_t rds_wc_signal = IBT_NEXT_SOLICITED; /* CQ notify type for recv CQs */
uint_t rds_waittime_ms = 100; /* ms */

extern dev_info_t *rdsib_dev_info;
extern void rds_close_sessions();

/*
 * Clamp the global queue-sizing tunables (MaxData*/ /*Ctrl* buffers and
 * NDataRX) so they do not exceed what this HCA supports (max channel
 * size, max CQ size and max memory-region length).  Called once per HCA
 * at init time; only ever lowers the values.
 */
static void
rdsib_validate_chan_sizes(ibt_hca_attr_t *hattrp)
{
	/* The SQ size should not be more than that supported by the HCA */
	if (((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_chan_sz) ||
	    ((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_cq_sz)) {
		RDS_DPRINTF2("RDSIB", "MaxDataSendBuffers + %d is greater "
		    "than that supported by the HCA driver "
		    "(%d + %d > %d or %d), lowering it to a supported value.",
		    RDS_NUM_ACKS, MaxDataSendBuffers, RDS_NUM_ACKS,
		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);

		/* leave room for the ACK WRs on the same SQ/CQ */
		MaxDataSendBuffers = (hattrp->hca_max_chan_sz >
		    hattrp->hca_max_cq_sz) ?
		    hattrp->hca_max_cq_sz - RDS_NUM_ACKS :
		    hattrp->hca_max_chan_sz - RDS_NUM_ACKS;
	}

	/* The RQ size should not be more than that supported by the HCA */
	if ((MaxDataRecvBuffers > hattrp->hca_max_chan_sz) ||
	    (MaxDataRecvBuffers > hattrp->hca_max_cq_sz)) {
		RDS_DPRINTF2("RDSIB", "MaxDataRecvBuffers is greater than that "
		    "supported by the HCA driver (%d > %d or %d), lowering it "
		    "to a supported value.", MaxDataRecvBuffers,
		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);

		MaxDataRecvBuffers = (hattrp->hca_max_chan_sz >
		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
		    hattrp->hca_max_chan_sz;
	}

	/* The SQ size should not be more than that supported by the HCA */
	if ((MaxCtrlSendBuffers > hattrp->hca_max_chan_sz) ||
	    (MaxCtrlSendBuffers > hattrp->hca_max_cq_sz)) {
		RDS_DPRINTF2("RDSIB", "MaxCtrlSendBuffers is greater than that "
		    "supported by the HCA driver (%d > %d or %d), lowering it "
		    "to a supported value.", MaxCtrlSendBuffers,
		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);

		MaxCtrlSendBuffers = (hattrp->hca_max_chan_sz >
		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
		    hattrp->hca_max_chan_sz;
	}

	/* The RQ size should not be more than that supported by the HCA */
	if ((MaxCtrlRecvBuffers > hattrp->hca_max_chan_sz) ||
	    (MaxCtrlRecvBuffers > hattrp->hca_max_cq_sz)) {
		RDS_DPRINTF2("RDSIB", "MaxCtrlRecvBuffers is greater than that "
		    "supported by the HCA driver (%d > %d or %d), lowering it "
		    "to a supported value.", MaxCtrlRecvBuffers,
		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);

		MaxCtrlRecvBuffers = (hattrp->hca_max_chan_sz >
		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
		    hattrp->hca_max_chan_sz;
	}

	/* The MaxRecvMemory should be less than that supported by the HCA */
	if ((NDataRX * RdsPktSize) > hattrp->hca_max_memr_len) {
		RDS_DPRINTF2("RDSIB", "MaxRecvMemory is greater than that "
		    "supported by the HCA driver (%d > %d), lowering it to %d",
		    NDataRX * RdsPktSize, hattrp->hca_max_memr_len,
		    hattrp->hca_max_memr_len);

		NDataRX = hattrp->hca_max_memr_len/RdsPktSize;
	}
}

/*
 * Return hcap, given the hca guid.
 * Walks the global HCA list under rds_hca_lock (reader); returns the
 * entry unconditionally (unlike rds_get_hcap(), no state check is done).
 * Returns NULL if the guid is not on the list.
 */
rds_hca_t *
rds_lkup_hca(ib_guid_t hca_guid)
{
	rds_hca_t *hcap;

	RDS_DPRINTF4("rds_lkup_hca", "Enter: statep: 0x%p "
	    "guid: %llx", rdsib_statep, hca_guid);

	rw_enter(&rdsib_statep->rds_hca_lock, RW_READER);

	hcap = rdsib_statep->rds_hcalistp;
	while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
		hcap = hcap->hca_nextp;
	}

	rw_exit(&rdsib_statep->rds_hca_lock);

	RDS_DPRINTF4("rds_lkup_hca", "return");

	return (hcap);
}


/*
 * Open and initialize one HCA: open it with IBTF, query its attributes
 * and port info, allocate the per-HCA PD, clamp the buffer tunables to
 * the HCA limits, and (for a new HCA) link it onto the global list in
 * RDS_HCA_STATE_OPEN.  Returns the hcap on success, NULL on any failure
 * or if the HCA is already initialized.
 */
static rds_hca_t *
rdsib_init_hca(ib_guid_t hca_guid)
{
	rds_hca_t *hcap;
	boolean_t alloc = B_FALSE;
	int ret;

	RDS_DPRINTF2("rdsib_init_hca", "enter: HCA 0x%llx", hca_guid);

	/* Do a HCA lookup */
	hcap = rds_lkup_hca(hca_guid);

	if (hcap != NULL && hcap->hca_hdl != NULL) {
		/*
		 * This can happen if we get IBT_HCA_ATTACH_EVENT on an HCA
		 * that we have already opened. Just return NULL so that
		 * we'll not end up reinitializing the HCA again.
		 */
		RDS_DPRINTF2("rdsib_init_hca", "HCA already initialized");
		return (NULL);
	}

	if (hcap == NULL) {
		RDS_DPRINTF2("rdsib_init_hca", "New HCA is added");
		hcap = (rds_hca_t *)kmem_zalloc(sizeof (rds_hca_t), KM_SLEEP);
		alloc = B_TRUE;
	}

	hcap->hca_guid = hca_guid;
	ret = ibt_open_hca(rdsib_statep->rds_ibhdl, hca_guid,
	    &hcap->hca_hdl);
	if (ret != IBT_SUCCESS) {
		if (ret == IBT_HCA_IN_USE) {
			RDS_DPRINTF2("rdsib_init_hca",
			    "ibt_open_hca: 0x%llx returned IBT_HCA_IN_USE",
			    hca_guid);
		} else {
			RDS_DPRINTF2("rdsib_init_hca",
			    "ibt_open_hca: 0x%llx failed: %d", hca_guid, ret);
		}
		/* only free if we allocated it here; a pre-existing
		 * (closed) entry stays on the list for a later retry */
		if (alloc == B_TRUE) {
			kmem_free(hcap, sizeof (rds_hca_t));
		}
		return (NULL);
	}

	ret = ibt_query_hca(hcap->hca_hdl, &hcap->hca_attr);
	if (ret != IBT_SUCCESS) {
		RDS_DPRINTF2("rdsib_init_hca",
		    "Query HCA: 0x%llx failed: %d", hca_guid, ret);
		ret = ibt_close_hca(hcap->hca_hdl);
		ASSERT(ret == IBT_SUCCESS);
		if (alloc == B_TRUE) {
			kmem_free(hcap, sizeof (rds_hca_t));
		} else {
			hcap->hca_hdl = NULL;
		}
		return (NULL);
	}

	ret = ibt_query_hca_ports(hcap->hca_hdl, 0,
	    &hcap->hca_pinfop, &hcap->hca_nports, &hcap->hca_pinfo_sz);
	if (ret != IBT_SUCCESS) {
		RDS_DPRINTF2("rdsib_init_hca",
		    "Query HCA 0x%llx ports failed: %d", hca_guid,
		    ret);
		ret = ibt_close_hca(hcap->hca_hdl);
		hcap->hca_hdl = NULL;
		ASSERT(ret == IBT_SUCCESS);
		if (alloc == B_TRUE) {
			kmem_free(hcap, sizeof (rds_hca_t));
		} else {
			hcap->hca_hdl = NULL;
		}
		return (NULL);
	}

	/* Only one PD per HCA is allocated, so do it here */
	ret = ibt_alloc_pd(hcap->hca_hdl, IBT_PD_NO_FLAGS,
	    &hcap->hca_pdhdl);
	if (ret != IBT_SUCCESS) {
		RDS_DPRINTF2("rdsib_init_hca",
		    "ibt_alloc_pd 0x%llx failed: %d", hca_guid, ret);
		(void) ibt_free_portinfo(hcap->hca_pinfop,
		    hcap->hca_pinfo_sz);
		ret = ibt_close_hca(hcap->hca_hdl);

		ASSERT(ret == IBT_SUCCESS);
		hcap->hca_hdl = NULL;
		if (alloc == B_TRUE) {
			kmem_free(hcap, sizeof (rds_hca_t));
		} else {
			hcap->hca_hdl = NULL;
		}
		return (NULL);
	}

	rdsib_validate_chan_sizes(&hcap->hca_attr);

	rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
	hcap->hca_state = RDS_HCA_STATE_OPEN;
	if (alloc == B_TRUE) {
		/* this is a new HCA, add it to the list */
		rdsib_statep->rds_nhcas++;
		hcap->hca_nextp = rdsib_statep->rds_hcalistp;
		rdsib_statep->rds_hcalistp = hcap;
	}
	rw_exit(&rdsib_statep->rds_hca_lock);

	RDS_DPRINTF2("rdsib_init_hca", "return: HCA 0x%llx", hca_guid);

	return (hcap);
}

/*
 * Called from attach.
 * Registers RDS with IBTF and initializes every HCA found on the system.
 * Succeeds (returns 0) if at least one HCA initializes; returns -1 if no
 * HCA is available/initializable or if ibt_attach() fails.
 */
int
rdsib_initialize_ib()
{
	ib_guid_t *guidp;
	rds_hca_t *hcap;
	uint_t ix, hcaix, nhcas;
	int ret;

	RDS_DPRINTF2("rdsib_initialize_ib", "enter: statep %p", rdsib_statep);

	ASSERT(rdsib_statep != NULL);
	if (rdsib_statep == NULL) {
		RDS_DPRINTF1("rdsib_initialize_ib",
		    "RDS Statep not initialized");
		return (-1);
	}

	/* How many hcas are there? */
	nhcas = ibt_get_hca_list(&guidp);
	if (nhcas == 0) {
		RDS_DPRINTF2("rdsib_initialize_ib", "No IB HCAs Available");
		return (-1);
	}

	RDS_DPRINTF3("rdsib_initialize_ib", "Number of HCAs: %d", nhcas);

	/* Register with IBTF */
	ret = ibt_attach(&rds_ib_modinfo, rdsib_dev_info, rdsib_statep,
	    &rdsib_statep->rds_ibhdl);
	if (ret != IBT_SUCCESS) {
		RDS_DPRINTF2("rdsib_initialize_ib", "ibt_attach failed: %d",
		    ret);
		(void) ibt_free_hca_list(guidp, nhcas);
		return (-1);
	}

	/*
	 * Open each HCA and gather its information. Don't care about HCAs
	 * that cannot be opened. It is OK as long as atleast one HCA can be
	 * opened.
	 * Initialize a HCA only if all the information is available.
	 */
	for (ix = 0, hcaix = 0; ix < nhcas; ix++) {
		RDS_DPRINTF3(LABEL, "Open HCA: 0x%llx", guidp[ix]);

		hcap = rdsib_init_hca(guidp[ix]);
		if (hcap != NULL) hcaix++;
	}

	/* free the HCA list, we are done with it */
	(void) ibt_free_hca_list(guidp, nhcas);

	if (hcaix == 0) {
		/* Failed to Initialize even one HCA */
		RDS_DPRINTF2("rdsib_initialize_ib", "No HCAs are initialized");
		(void) ibt_detach(rdsib_statep->rds_ibhdl);
		rdsib_statep->rds_ibhdl = NULL;
		return (-1);
	}

	if (hcaix < nhcas) {
		RDS_DPRINTF2("rdsib_open_ib", "HCAs %d/%d failed to initialize",
		    (nhcas - hcaix), nhcas);
	}

	RDS_DPRINTF2("rdsib_initialize_ib", "return: statep %p", rdsib_statep);

	return (0);
}

/*
 * Called from detach.
 * Tears down everything rdsib_initialize_ib() set up: closes all
 * sessions, releases every HCA's PD/portinfo/handle, frees the HCA
 * list entries, and detaches from IBTF.
 */
void
rdsib_deinitialize_ib()
{
	rds_hca_t *hcap, *nextp;
	int ret;

	RDS_DPRINTF2("rdsib_deinitialize_ib", "enter: statep %p", rdsib_statep);

	/* close and destroy all the sessions */
	rds_close_sessions(NULL);

	/* Release all HCA resources */
	rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
	RDS_DPRINTF2("rdsib_deinitialize_ib", "HCA List: %p, NHCA: %d",
	    rdsib_statep->rds_hcalistp, rdsib_statep->rds_nhcas);
	/* unhook the whole list under the lock, then free outside it */
	hcap = rdsib_statep->rds_hcalistp;
	rdsib_statep->rds_hcalistp = NULL;
	rdsib_statep->rds_nhcas = 0;
	rw_exit(&rdsib_statep->rds_hca_lock);

	while (hcap != NULL) {
		nextp = hcap->hca_nextp;

		if (hcap->hca_hdl != NULL) {
			ret = ibt_free_pd(hcap->hca_hdl, hcap->hca_pdhdl);
			ASSERT(ret == IBT_SUCCESS);

			(void) ibt_free_portinfo(hcap->hca_pinfop,
			    hcap->hca_pinfo_sz);

			ret = ibt_close_hca(hcap->hca_hdl);
			ASSERT(ret == IBT_SUCCESS);
		}

		kmem_free(hcap, sizeof (rds_hca_t));
		hcap = nextp;
	}

	/* Deregister with IBTF */
	if (rdsib_statep->rds_ibhdl != NULL) {
		(void) ibt_detach(rdsib_statep->rds_ibhdl);
		rdsib_statep->rds_ibhdl = NULL;
	}

	RDS_DPRINTF2("rdsib_deinitialize_ib", "return: statep %p",
	    rdsib_statep);
}

/*
 * Called on open of first RDS socket.
 * Registers and binds the CM service so incoming connection requests
 * can be accepted.  Note: a bind failure is only logged; the function
 * still returns 0 once the service itself is registered.
 */
int
rdsib_open_ib()
{
	int ret;

	RDS_DPRINTF2("rdsib_open_ib", "enter: statep %p", rdsib_statep);

	/* Enable incoming connection requests */
	if (rdsib_statep->rds_srvhdl == NULL) {
		rdsib_statep->rds_srvhdl =
		    rds_register_service(rdsib_statep->rds_ibhdl);
		if (rdsib_statep->rds_srvhdl == NULL) {
			RDS_DPRINTF2("rdsib_open_ib",
			    "Service registration failed");
			return (-1);
		} else {
			/* bind the service on all available ports */
			ret = rds_bind_service(rdsib_statep);
			if (ret != 0) {
				RDS_DPRINTF2("rdsib_open_ib",
				    "Bind service failed: %d", ret);
			}
		}
	}

	RDS_DPRINTF2("rdsib_open_ib", "return: statep %p", rdsib_statep);

	return (0);
}

/*
 * Called when all ports are closed.
 * Unbinds and deregisters the CM service so no new connections arrive.
 * rds_srvhdl is cleared only if deregistration succeeds.
 */
void
rdsib_close_ib()
{
	int ret;

	RDS_DPRINTF2("rdsib_close_ib", "enter: statep %p", rdsib_statep);

	/* Disable incoming connection requests */
	if (rdsib_statep->rds_srvhdl != NULL) {
		ret = ibt_unbind_all_services(rdsib_statep->rds_srvhdl);
		if (ret != 0) {
			RDS_DPRINTF2("rdsib_close_ib",
			    "ibt_unbind_all_services failed: %d\n", ret);
		}
		ret = ibt_deregister_service(rdsib_statep->rds_ibhdl,
		    rdsib_statep->rds_srvhdl);
		if (ret != 0) {
			RDS_DPRINTF2("rdsib_close_ib",
			    "ibt_deregister_service failed: %d\n", ret);
		} else {
			rdsib_statep->rds_srvhdl = NULL;
		}
	}

	RDS_DPRINTF2("rdsib_close_ib", "return: statep %p", rdsib_statep);
}

/*
 * Return hcap, given the hca guid.
 * Unlike rds_lkup_hca(), this only returns the HCA once its state is
 * RDS_HCA_STATE_MEM_REGISTERED (recv memory registered); otherwise NULL.
 */
rds_hca_t *
rds_get_hcap(rds_state_t *statep, ib_guid_t hca_guid)
{
	rds_hca_t *hcap;

	RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: Enter: statep: 0x%p "
	    "guid: %llx", statep, hca_guid);

	rw_enter(&statep->rds_hca_lock, RW_READER);

	hcap = statep->rds_hcalistp;
	while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
		hcap = hcap->hca_nextp;
	}

	/*
	 * don't let anyone use this HCA until the RECV memory
	 * is registered with this HCA
	 */
	if ((hcap != NULL) &&
	    (hcap->hca_state == RDS_HCA_STATE_MEM_REGISTERED)) {
		ASSERT(hcap->hca_mrhdl != NULL);
		rw_exit(&statep->rds_hca_lock);
		return (hcap);
	}

	RDS_DPRINTF2("rds_get_hcap",
	    "HCA (0x%p, 0x%llx) is not initialized", hcap, hca_guid);
	rw_exit(&statep->rds_hca_lock);

	RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: return");

	return (NULL);
}

/*
 * Return hcap, given a gid.
 * Scans each MEM_REGISTERED HCA's ports and matches the first sgid
 * table entry (p_sgid_tbl[0]) against the given gid.  NULL if no match.
 */
rds_hca_t *
rds_gid_to_hcap(rds_state_t *statep, ib_gid_t gid)
{
	rds_hca_t *hcap;
	uint_t ix;

	RDS_DPRINTF4("rds_gid_to_hcap", "Enter: statep: 0x%p gid: %llx:%llx",
	    statep, gid.gid_prefix, gid.gid_guid);

	rw_enter(&statep->rds_hca_lock, RW_READER);

	hcap = statep->rds_hcalistp;
	while (hcap != NULL) {

		/*
		 * don't let anyone use this HCA until the RECV memory
		 * is registered with this HCA
		 */
		if (hcap->hca_state != RDS_HCA_STATE_MEM_REGISTERED) {
			RDS_DPRINTF3("rds_gid_to_hcap",
			    "HCA (0x%p, 0x%llx) is not initialized",
			    hcap, gid.gid_guid);
			hcap = hcap->hca_nextp;
			continue;
		}

		for (ix = 0; ix < hcap->hca_nports; ix++) {
			if ((hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_prefix ==
			    gid.gid_prefix) &&
			    (hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_guid ==
			    gid.gid_guid)) {
				RDS_DPRINTF4("rds_gid_to_hcap",
				    "gid found in hcap: 0x%p", hcap);
				rw_exit(&statep->rds_hca_lock);
				return (hcap);
			}
		}
		hcap = hcap->hca_nextp;
	}

	rw_exit(&statep->rds_hca_lock);

	return (NULL);
}

/*
 * This is called from the send CQ handler (on RDMA-write completion of
 * the previous ACK).  If new messages arrived since the last ACK was
 * sent, post the next ACK carrying the latest received buffer id;
 * otherwise drop the outstanding-RDMA count and return.
 */
void
rds_send_acknowledgement(rds_ep_t *ep)
{
	int ret;
	uint_t ix;

	RDS_DPRINTF4("rds_send_acknowledgement", "Enter EP(%p)", ep);

	mutex_enter(&ep->ep_lock);

	ASSERT(ep->ep_rdmacnt != 0);

	/*
	 * The previous ACK completed successfully, send the next one
	 * if more messages were received after sending the last ACK
	 */
	if (ep->ep_rbufid != *(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va) {
		/* stash the bufid being ACKed in the ACK data segment */
		*(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va = ep->ep_rbufid;
		mutex_exit(&ep->ep_lock);

		/* send acknowledgement */
		RDS_INCR_TXACKS();
		ret = ibt_post_send(ep->ep_chanhdl, &ep->ep_ackwr, 1, &ix);
		if (ret != IBT_SUCCESS) {
			RDS_DPRINTF2("rds_send_acknowledgement",
			    "EP(%p): ibt_post_send for acknowledgement "
			    "failed: %d, SQ depth: %d",
			    ep, ret, ep->ep_sndpool.pool_nbusy);
			mutex_enter(&ep->ep_lock);
			ep->ep_rdmacnt--;
			mutex_exit(&ep->ep_lock);
		}
	} else {
		/* ACKed all messages, no more to ACK */
		ep->ep_rdmacnt--;
		mutex_exit(&ep->ep_lock);
		return;
	}

	RDS_DPRINTF4("rds_send_acknowledgement", "Return EP(%p)", ep);
}

/*
 * Poll one work completion off a control channel's recv CQ.
 * On success: dispatches the control packet to
 * rds_handle_control_message() and frees the buffer; also refills the
 * RQ (via taskq) when the level drops to the low-water mark.
 * Returns the ibt_poll_cq() status (IBT_CQ_EMPTY ends the caller's loop).
 */
static int
rds_poll_ctrl_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
{
	ibt_wc_t wc;
	uint_t npolled;
	rds_buf_t *bp;
	rds_ctrl_pkt_t *cpkt;
	rds_qp_t *recvqp;
	int ret = IBT_SUCCESS;

	RDS_DPRINTF4("rds_poll_ctrl_completions", "Enter: EP(%p)", ep);

	bzero(&wc, sizeof (ibt_wc_t));
	ret = ibt_poll_cq(cq, &wc, 1, &npolled);
	if (ret != IBT_SUCCESS) {
		if (ret != IBT_CQ_EMPTY) {
			RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: %d", ep, cq, ret);
		} else {
			RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: IBT_CQ_EMPTY", ep, cq);
		}
		return (ret);
	}

	/* wr_id was set to the buffer pointer when the WR was posted */
	bp = (rds_buf_t *)(uintptr_t)wc.wc_id;

	if (wc.wc_status != IBT_WC_SUCCESS) {
		mutex_enter(&ep->ep_recvqp.qp_lock);
		ep->ep_recvqp.qp_level--;
		mutex_exit(&ep->ep_recvqp.qp_lock);

		/* Free the buffer */
		bp->buf_state = RDS_RCVBUF_FREE;
		rds_free_recv_buf(bp, 1);

		/* Receive completion failure */
		if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
			RDS_DPRINTF2("rds_poll_ctrl_completions",
			    "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
			    ep, cq, wc.wc_id, wc.wc_status);
		}
		return (ret);
	}

	/* there is one less in the RQ */
	recvqp = &ep->ep_recvqp;
	mutex_enter(&recvqp->qp_lock);
	recvqp->qp_level--;
	if ((recvqp->qp_taskqpending == B_FALSE) &&
	    (recvqp->qp_level <= recvqp->qp_lwm)) {
		/* Time to post more buffers into the RQ */
		recvqp->qp_taskqpending = B_TRUE;
		mutex_exit(&recvqp->qp_lock);

		ret = ddi_taskq_dispatch(rds_taskq,
		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
			    ret);
			/* dispatch failed; allow a later retry */
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
	} else {
		mutex_exit(&recvqp->qp_lock);
	}

	cpkt = (rds_ctrl_pkt_t *)(uintptr_t)bp->buf_ds.ds_va;
	rds_handle_control_message(ep->ep_sp, cpkt);

	bp->buf_state = RDS_RCVBUF_FREE;
	rds_free_recv_buf(bp, 1);

	RDS_DPRINTF4("rds_poll_ctrl_completions", "Return: EP(%p)", ep);

	return (ret);
}

/* batch size for ibt_post_recv() calls below */
#define	RDS_POST_FEW_ATATIME	100
/* Post recv WRs into the RQ. Assumes the ep->refcnt is already incremented */
void
rds_post_recv_buf(void *arg)
{
	ibt_channel_hdl_t chanhdl;
	rds_ep_t *ep;
	rds_session_t *sp;
	rds_qp_t *recvqp;
	rds_bufpool_t *gp;
	rds_buf_t *bp, *bp1;
	ibt_recv_wr_t *wrp, wr[RDS_POST_FEW_ATATIME];
	rds_hca_t *hcap;
	uint_t npost, nspace, rcv_len;
	uint_t ix, jx, kx;
	int ret;

	chanhdl = (ibt_channel_hdl_t)arg;
	RDS_DPRINTF4("rds_post_recv_buf", "Enter: CHAN(%p)", chanhdl);
	RDS_INCR_POST_RCV_BUF_CALLS();

	ep = (rds_ep_t *)ibt_get_chan_private(chanhdl);
	ASSERT(ep != NULL);
	sp = ep->ep_sp;
	recvqp = &ep->ep_recvqp;

	RDS_DPRINTF5("rds_post_recv_buf", "EP(%p)", ep);

	/* get the hcap for the HCA hosting this channel */
	hcap = rds_lkup_hca(ep->ep_hca_guid);
	if (hcap == NULL) {
		RDS_DPRINTF2("rds_post_recv_buf", "HCA (0x%llx) not found",
		    ep->ep_hca_guid);
		return;
	}

	/* Make sure the session is still connected */
	rw_enter(&sp->session_lock, RW_READER);
	if ((sp->session_state != RDS_SESSION_STATE_INIT) &&
	    (sp->session_state != RDS_SESSION_STATE_CONNECTED) &&
	    (sp->session_state != RDS_SESSION_STATE_HCA_CLOSING)) {
		RDS_DPRINTF2("rds_post_recv_buf", "EP(%p): Session is not "
		    "in active state (%d)", ep, sp->session_state);
		rw_exit(&sp->session_lock);
		return;
	}
	rw_exit(&sp->session_lock);

	/* how many can be posted */
	mutex_enter(&recvqp->qp_lock);
	nspace = recvqp->qp_depth - recvqp->qp_level;
	if (nspace == 0) {
		RDS_DPRINTF2("rds_post_recv_buf", "RQ is FULL");
		recvqp->qp_taskqpending = B_FALSE;
		mutex_exit(&recvqp->qp_lock);
		return;
	}
	mutex_exit(&recvqp->qp_lock);

	/* pick the pool/packet size matching the endpoint type */
	if (ep->ep_type == RDS_EP_TYPE_DATA) {
		gp = &rds_dpool;
		rcv_len = RdsPktSize;
	} else {
		gp = &rds_cpool;
		rcv_len = RDS_CTRLPKT_SIZE;
	}

	bp = rds_get_buf(gp, nspace, &jx);
	if (bp == NULL) {
		RDS_DPRINTF2(LABEL, "EP(%p): No Recv buffers available", ep);
		/* try again later */
		ret = ddi_taskq_dispatch(rds_taskq, rds_post_recv_buf,
		    (void *)chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
			    ret);
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
		return;
	}

	if (jx != nspace) {
		RDS_DPRINTF2(LABEL, "EP(%p): Recv buffers "
		    "needed: %d available: %d", ep, nspace, jx);
		nspace = jx;
	}

	/* prepare each buffer: owner EP, lkey and receive length */
	bp1 = bp;
	for (ix = 0; ix < nspace; ix++) {
		bp1->buf_ep = ep;
		ASSERT(bp1->buf_state == RDS_RCVBUF_FREE);
		bp1->buf_state = RDS_RCVBUF_POSTED;
		bp1->buf_ds.ds_key = hcap->hca_lkey;
		bp1->buf_ds.ds_len = rcv_len;
		bp1 = bp1->buf_nextp;
	}

#if 0
	wrp = kmem_zalloc(RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t),
	    KM_SLEEP);
#else
	wrp = &wr[0];
#endif

	/* post in batches of at most RDS_POST_FEW_ATATIME WRs */
	npost = nspace;
	while (npost) {
		jx = (npost > RDS_POST_FEW_ATATIME) ?
		    RDS_POST_FEW_ATATIME : npost;
		for (ix = 0; ix < jx; ix++) {
			wrp[ix].wr_id = (uintptr_t)bp;
			wrp[ix].wr_nds = 1;
			wrp[ix].wr_sgl = &bp->buf_ds;
			bp = bp->buf_nextp;
		}

		ret = ibt_post_recv(chanhdl, wrp, jx, &kx);
		if ((ret != IBT_SUCCESS) || (kx != jx)) {
			RDS_DPRINTF2(LABEL, "ibt_post_recv for %d WRs failed: "
			    "%d", npost, ret);
			/* kx WRs of this batch did get posted */
			npost -= kx;
			break;
		}

		npost -= jx;
	}

	mutex_enter(&recvqp->qp_lock);
	if (npost != 0) {
		RDS_DPRINTF2("rds_post_recv_buf",
		    "EP(%p) Failed to post %d WRs", ep, npost);
		recvqp->qp_level += (nspace - npost);
	} else {
		recvqp->qp_level += nspace;
	}

	/*
	 * sometimes, the recv WRs can get consumed as soon as they are
	 * posted. In that case, taskq thread to post more WRs to the RQ will
	 * not be scheduled as the taskqpending flag is still set.
	 */
	if (recvqp->qp_level == 0) {
		mutex_exit(&recvqp->qp_lock);
		ret = ddi_taskq_dispatch(rds_taskq,
		    rds_post_recv_buf, (void *)chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF2("rds_post_recv_buf",
			    "ddi_taskq_dispatch failed: %d", ret);
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
	} else {
		recvqp->qp_taskqpending = B_FALSE;
		mutex_exit(&recvqp->qp_lock);
	}

#if 0
	kmem_free(wrp, RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t));
#endif

	RDS_DPRINTF4("rds_post_recv_buf", "Return: EP(%p)", ep);
}

/*
 * Poll one work completion off a data channel's recv CQ.
 * On success: refills the RQ if at the low-water mark, then either
 * delivers the message (single packet, or last packet of a segmented
 * message) via rds_received_msg(), or chains the buffer onto the
 * endpoint's in-progress segment list (ep_segfbp/ep_seglbp).
 * Returns the ibt_poll_cq() status (IBT_CQ_EMPTY ends the caller's loop).
 */
static int
rds_poll_data_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
{
	ibt_wc_t wc;
	rds_buf_t *bp;
	rds_data_hdr_t *pktp;
	rds_qp_t *recvqp;
	uint_t npolled;
	int ret = IBT_SUCCESS;


	RDS_DPRINTF4("rds_poll_data_completions", "Enter: EP(%p)", ep);

	bzero(&wc, sizeof (ibt_wc_t));
	ret = ibt_poll_cq(cq, &wc, 1, &npolled);
	if (ret != IBT_SUCCESS) {
		if (ret != IBT_CQ_EMPTY) {
			RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: %d", ep, cq, ret);
		} else {
			RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: IBT_CQ_EMPTY", ep, cq);
		}
		return (ret);
	}

	bp = (rds_buf_t *)(uintptr_t)wc.wc_id;
	ASSERT(bp->buf_state == RDS_RCVBUF_POSTED);
	/* tentatively mark for the socket queue; reset below on WC error */
	bp->buf_state = RDS_RCVBUF_ONSOCKQ;
	bp->buf_nextp = NULL;

	if (wc.wc_status != IBT_WC_SUCCESS) {
		mutex_enter(&ep->ep_recvqp.qp_lock);
		ep->ep_recvqp.qp_level--;
		mutex_exit(&ep->ep_recvqp.qp_lock);

		/* free the buffer */
		bp->buf_state = RDS_RCVBUF_FREE;
		rds_free_recv_buf(bp, 1);

		/* Receive completion failure */
		if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
			RDS_DPRINTF2("rds_poll_data_completions",
			    "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
			    ep, cq, wc.wc_id, wc.wc_status);
			RDS_INCR_RXERRS();
		}
		return (ret);
	}

	/* there is one less in the RQ */
	recvqp = &ep->ep_recvqp;
	mutex_enter(&recvqp->qp_lock);
	recvqp->qp_level--;
	if ((recvqp->qp_taskqpending == B_FALSE) &&
	    (recvqp->qp_level <= recvqp->qp_lwm)) {
		/* Time to post more buffers into the RQ */
		recvqp->qp_taskqpending = B_TRUE;
		mutex_exit(&recvqp->qp_lock);

		ret = ddi_taskq_dispatch(rds_taskq,
		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
			    ret);
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
	} else {
		mutex_exit(&recvqp->qp_lock);
	}

	pktp = (rds_data_hdr_t *)(uintptr_t)bp->buf_ds.ds_va;
	ASSERT(pktp->dh_datalen != 0);

	RDS_DPRINTF5(LABEL, "Message Received: sendIP: 0x%x recvIP: 0x%x "
	    "sendport: %d recvport: %d npkts: %d pktno: %d", ep->ep_remip,
	    ep->ep_myip, pktp->dh_sendport, pktp->dh_recvport,
	    pktp->dh_npkts, pktp->dh_psn);

	RDS_DPRINTF3(LABEL, "BP(%p): npkts: %d psn: %d", bp,
	    pktp->dh_npkts, pktp->dh_psn);

	if (pktp->dh_npkts == 1) {
		/* single pkt or last packet */
		if (pktp->dh_psn != 0) {
			/* last packet of a segmented message */
			ASSERT(ep->ep_seglbp != NULL);
			ep->ep_seglbp->buf_nextp = bp;
			ep->ep_seglbp = bp;
			rds_received_msg(ep, ep->ep_segfbp);
			ep->ep_segfbp = NULL;
			ep->ep_seglbp = NULL;
		} else {
			/* single packet */
			rds_received_msg(ep, bp);
		}
	} else {
		/* multi-pkt msg */
		if (pktp->dh_psn == 0) {
			/* first packet */
			ASSERT(ep->ep_segfbp == NULL);
			ep->ep_segfbp = bp;
			ep->ep_seglbp = bp;
		} else {
			/* intermediate packet */
			ASSERT(ep->ep_segfbp != NULL);
			ep->ep_seglbp->buf_nextp = bp;
			ep->ep_seglbp = bp;
		}
	}

	RDS_DPRINTF4("rds_poll_data_completions", "Return: EP(%p)", ep);

	return (ret);
}

/*
 * Recv CQ completion handler for both data and control endpoints.
 * Drains the CQ, re-arms notification, then drains again to close the
 * race where a completion arrives between the first drain and the
 * re-arm.
 */
void
rds_recvcq_handler(ibt_cq_hdl_t cq, void *arg)
{
	rds_ep_t *ep;
	int ret = IBT_SUCCESS;
	int (*func)(ibt_cq_hdl_t, rds_ep_t *);

	ep = (rds_ep_t *)arg;

	RDS_DPRINTF4("rds_recvcq_handler", "enter: EP(%p)", ep);

	if (ep->ep_type == RDS_EP_TYPE_DATA) {
		func = rds_poll_data_completions;
	} else {
		func = rds_poll_ctrl_completions;
	}

	do {
		ret = func(cq, ep);
	} while (ret != IBT_CQ_EMPTY);

	/* enable the CQ */
	ret = ibt_enable_cq_notify(cq, rds_wc_signal);
	if (ret != IBT_SUCCESS) {
		RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
		    "failed: %d", ep, cq, ret);
		return;
	}

	/* catch completions that raced with the re-arm above */
	do {
		ret = func(cq, ep);
	} while (ret != IBT_CQ_EMPTY);

	RDS_DPRINTF4("rds_recvcq_handler", "Return: EP(%p)", ep);
}

/*
 * Drain the send CQ: reclaim completed send buffers (returned to the
 * pool in one batch at the end), account RDMA-write ACK completions,
 * and on a hard (non-flush) send error mark the session for teardown
 * and call rds_handle_send_error().  'lock' is passed through to
 * rds_free_send_buf().
 */
void
rds_poll_send_completions(ibt_cq_hdl_t cq, rds_ep_t *ep, boolean_t lock)
{
	ibt_wc_t wc[RDS_NUM_DATA_SEND_WCS];
	uint_t npolled, nret, send_error = 0;
	rds_buf_t *headp, *tailp, *bp;
	int ret, ix;

	RDS_DPRINTF4("rds_poll_send_completions", "Enter EP(%p)", ep);

	headp = NULL;
	tailp = NULL;
	npolled = 0;
	do {
		ret = ibt_poll_cq(cq, wc, RDS_NUM_DATA_SEND_WCS, &nret);
		if (ret != IBT_SUCCESS) {
			if (ret != IBT_CQ_EMPTY) {
				RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): "
				    "ibt_poll_cq returned: %d", ep, cq, ret);
			} else {
				RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): "
				    "ibt_poll_cq returned: IBT_CQ_EMPTY",
				    ep, cq);
			}

			break;
		}

		for (ix = 0; ix < nret; ix++) {
			if (wc[ix].wc_status == IBT_WC_SUCCESS) {
				/* RDMA-write WC == completed ACK */
				if (wc[ix].wc_type == IBT_WRC_RDMAW) {
					rds_send_acknowledgement(ep);
					continue;
				}

				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
				bp->buf_state = RDS_SNDBUF_FREE;
			} else if (wc[ix].wc_status == IBT_WC_WR_FLUSHED_ERR) {
				RDS_INCR_TXERRS();
				RDS_DPRINTF5("rds_poll_send_completions",
				    "EP(%p): WC ID: %p ERROR: %d", ep,
				    wc[ix].wc_id, wc[ix].wc_status);

				/* flushed ACK WR: no buffer to reclaim */
				if (wc[ix].wc_id == RDS_RDMAW_WRID) {
					mutex_enter(&ep->ep_lock);
					ep->ep_rdmacnt--;
					mutex_exit(&ep->ep_lock);
					continue;
				}

				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
				bp->buf_state = RDS_SNDBUF_FREE;
			} else {
				RDS_INCR_TXERRS();
				RDS_DPRINTF2("rds_poll_send_completions",
				    "EP(%p): WC ID: %p ERROR: %d", ep,
				    wc[ix].wc_id, wc[ix].wc_status);
				if (send_error == 0) {
					rds_session_t *sp = ep->ep_sp;

					/* don't let anyone send anymore */
					rw_enter(&sp->session_lock, RW_WRITER);
					if (sp->session_state !=
					    RDS_SESSION_STATE_ERROR) {
						sp->session_state =
						    RDS_SESSION_STATE_ERROR;
						/* Make this the active end */
						sp->session_type =
						    RDS_SESSION_ACTIVE;
					}
					rw_exit(&sp->session_lock);
				}

				send_error++;

				if (wc[ix].wc_id == RDS_RDMAW_WRID) {
					mutex_enter(&ep->ep_lock);
					ep->ep_rdmacnt--;
					mutex_exit(&ep->ep_lock);
					continue;
				}

				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
				bp->buf_state = RDS_SNDBUF_FREE;
			}

			/* chain the reclaimed buffer for a single bulk free */
			bp->buf_nextp = NULL;
			if (headp) {
				tailp->buf_nextp = bp;
				tailp = bp;
			} else {
				headp = bp;
				tailp = bp;
			}

			npolled++;
		}

		/* in polled (no-interrupt) mode, bound the work per call */
		if (rds_no_interrupts && (npolled > 100)) {
			break;
		}

		if (rds_no_interrupts == 1) {
			break;
		}
	} while (ret != IBT_CQ_EMPTY);

	RDS_DPRINTF5("rds_poll_send_completions", "Npolled: %d send_error: %d",
	    npolled, send_error);

	/* put the buffers to the pool */
	if (npolled != 0) {
		rds_free_send_buf(ep, headp, tailp, npolled, lock);
	}

	if (send_error != 0) {
		rds_handle_send_error(ep);
	}

	RDS_DPRINTF4("rds_poll_send_completions", "Return EP(%p)", ep);
}

/*
 * Send CQ completion handler.  Re-arms the CQ for the next completion
 * and then drains it (without the send pool lock held by the caller).
 */
void
rds_sendcq_handler(ibt_cq_hdl_t cq, void *arg)
{
	rds_ep_t *ep;
	int ret;

	ep = (rds_ep_t *)arg;

	RDS_DPRINTF4("rds_sendcq_handler", "Enter: EP(%p)", ep);

	/* enable the CQ */
	ret = ibt_enable_cq_notify(cq, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
		    "failed: %d", ep, cq, ret);
		return;
	}

	rds_poll_send_completions(cq, ep, B_FALSE);

	RDS_DPRINTF4("rds_sendcq_handler", "Return: EP(%p)", ep);
}

/*
 * Free the RC channel and its send/recv CQs for an endpoint.
 * Flushes the channel and waits for the RQ to drain before freeing.
 * Caller must hold ep->ep_lock (asserted).
 */
void
rds_ep_free_rc_channel(rds_ep_t *ep)
{
	int ret;

	RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Enter", ep);

	ASSERT(mutex_owned(&ep->ep_lock));

	/* free the QP */
	if (ep->ep_chanhdl != NULL) {
		/* wait until the RQ is empty */
		(void) ibt_flush_channel(ep->ep_chanhdl);
		(void) rds_is_recvq_empty(ep, B_TRUE);
		ret = ibt_free_channel(ep->ep_chanhdl);
		if (ret != IBT_SUCCESS) {
			RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) "
			    "ibt_free_channel returned: %d", ep, ret);
		}
		ep->ep_chanhdl = NULL;
	} else {
		RDS_DPRINTF2("rds_ep_free_rc_channel",
		    "EP(%p) Channel is ALREADY FREE", ep);
	}

	/* free the Send CQ */
	if (ep->ep_sendcq != NULL) {
		ret = ibt_free_cq(ep->ep_sendcq);
		if (ret != IBT_SUCCESS) {
			RDS_DPRINTF2("rds_ep_free_rc_channel",
			    "EP(%p) - for sendcq, ibt_free_cq returned %d",
			    ep, ret);
		}
		ep->ep_sendcq = NULL;
	} else {
		RDS_DPRINTF2("rds_ep_free_rc_channel",
		    "EP(%p) SendCQ is ALREADY FREE", ep);
	}

	/* free the Recv CQ */
	if (ep->ep_recvcq != NULL) {
		ret = ibt_free_cq(ep->ep_recvcq);
		if (ret != IBT_SUCCESS) {
			RDS_DPRINTF2("rds_ep_free_rc_channel",
			    "EP(%p) - for recvcq, ibt_free_cq returned %d",
			    ep, ret);
		}
		ep->ep_recvcq = NULL;
	} else {
		RDS_DPRINTF2("rds_ep_free_rc_channel",
		    "EP(%p) RecvCQ is ALREADY FREE", ep);
	}

	RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Return", ep);
}

/* Allocate resources for RC channel */
ibt_channel_hdl_t
rds_ep_alloc_rc_channel(rds_ep_t *ep, uint8_t hca_port)
{
	int ret = IBT_SUCCESS;
	ibt_cq_attr_t scqattr, rcqattr;
	ibt_rc_chan_alloc_args_t chanargs;
	ibt_channel_hdl_t chanhdl;
	rds_session_t *sp;
	rds_hca_t *hcap;

	RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Enter: 0x%p port: %d",
	    ep, hca_port);

	/* Update the EP with the right IP address and HCA guid */
	sp = ep->ep_sp;
	ASSERT(sp != NULL);
	rw_enter(&sp->session_lock, RW_READER);
	mutex_enter(&ep->ep_lock);
	ep->ep_myip = sp->session_myip;
	ep->ep_remip = sp->session_remip;
	hcap = rds_gid_to_hcap(rdsib_statep, sp->session_lgid);
	ep->ep_hca_guid = hcap->hca_guid;
	mutex_exit(&ep->ep_lock);
	rw_exit(&sp->session_lock);

	/* reset taskqpending flag here */
	ep->ep_recvqp.qp_taskqpending = B_FALSE;

	/* control channels use the Ctrl sizing tunables, data channels
	 * the Data ones (plus RDS_NUM_ACKS slots on the SQ for ACK WRs) */
	if (ep->ep_type == RDS_EP_TYPE_CTRL) {
		scqattr.cq_size = MaxCtrlSendBuffers;
		scqattr.cq_sched = NULL;
		scqattr.cq_flags = IBT_CQ_NO_FLAGS;

		rcqattr.cq_size = MaxCtrlRecvBuffers;
		rcqattr.cq_sched = NULL;
		rcqattr.cq_flags = IBT_CQ_NO_FLAGS;

		chanargs.rc_sizes.cs_sq = MaxCtrlSendBuffers;
		chanargs.rc_sizes.cs_rq = MaxCtrlRecvBuffers;
		chanargs.rc_sizes.cs_sq_sgl = 1;
		chanargs.rc_sizes.cs_rq_sgl = 1;
	} else {
		scqattr.cq_size = MaxDataSendBuffers + RDS_NUM_ACKS;
		scqattr.cq_sched = NULL;
		scqattr.cq_flags = IBT_CQ_NO_FLAGS;

		rcqattr.cq_size = MaxDataRecvBuffers;
		rcqattr.cq_sched = NULL;
		rcqattr.cq_flags = IBT_CQ_NO_FLAGS;

		chanargs.rc_sizes.cs_sq = MaxDataSendBuffers + RDS_NUM_ACKS;
		chanargs.rc_sizes.cs_rq =
MaxDataRecvBuffers; 1276 chanargs.rc_sizes.cs_sq_sgl = 1; 1277 chanargs.rc_sizes.cs_rq_sgl = 1; 1278 } 1279 1280 mutex_enter(&ep->ep_lock); 1281 if (ep->ep_sendcq == NULL) { 1282 /* returned size is always greater than the requested size */ 1283 ret = ibt_alloc_cq(hcap->hca_hdl, &scqattr, 1284 &ep->ep_sendcq, NULL); 1285 if (ret != IBT_SUCCESS) { 1286 RDS_DPRINTF2(LABEL, "ibt_alloc_cq for sendCQ " 1287 "failed, size = %d: %d", scqattr.cq_size, ret); 1288 mutex_exit(&ep->ep_lock); 1289 return (NULL); 1290 } 1291 1292 (void) ibt_set_cq_handler(ep->ep_sendcq, rds_sendcq_handler, 1293 ep); 1294 1295 if (rds_no_interrupts == 0) { 1296 ret = ibt_enable_cq_notify(ep->ep_sendcq, 1297 IBT_NEXT_COMPLETION); 1298 if (ret != IBT_SUCCESS) { 1299 RDS_DPRINTF2(LABEL, 1300 "ibt_enable_cq_notify failed: %d", ret); 1301 (void) ibt_free_cq(ep->ep_sendcq); 1302 ep->ep_sendcq = NULL; 1303 mutex_exit(&ep->ep_lock); 1304 return (NULL); 1305 } 1306 } 1307 } 1308 1309 if (ep->ep_recvcq == NULL) { 1310 /* returned size is always greater than the requested size */ 1311 ret = ibt_alloc_cq(hcap->hca_hdl, &rcqattr, 1312 &ep->ep_recvcq, NULL); 1313 if (ret != IBT_SUCCESS) { 1314 RDS_DPRINTF2(LABEL, "ibt_alloc_cq for recvCQ " 1315 "failed, size = %d: %d", rcqattr.cq_size, ret); 1316 (void) ibt_free_cq(ep->ep_sendcq); 1317 ep->ep_sendcq = NULL; 1318 mutex_exit(&ep->ep_lock); 1319 return (NULL); 1320 } 1321 1322 (void) ibt_set_cq_handler(ep->ep_recvcq, rds_recvcq_handler, 1323 ep); 1324 1325 ret = ibt_enable_cq_notify(ep->ep_recvcq, rds_wc_signal); 1326 if (ret != IBT_SUCCESS) { 1327 RDS_DPRINTF2(LABEL, 1328 "ibt_enable_cq_notify failed: %d", ret); 1329 (void) ibt_free_cq(ep->ep_recvcq); 1330 ep->ep_recvcq = NULL; 1331 (void) ibt_free_cq(ep->ep_sendcq); 1332 ep->ep_sendcq = NULL; 1333 mutex_exit(&ep->ep_lock); 1334 return (NULL); 1335 } 1336 } 1337 1338 chanargs.rc_flags = IBT_ALL_SIGNALED; 1339 chanargs.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR | 1340 IBT_CEP_ATOMIC; 1341 
chanargs.rc_hca_port_num = hca_port; 1342 chanargs.rc_scq = ep->ep_sendcq; 1343 chanargs.rc_rcq = ep->ep_recvcq; 1344 chanargs.rc_pd = hcap->hca_pdhdl; 1345 chanargs.rc_srq = NULL; 1346 1347 ret = ibt_alloc_rc_channel(hcap->hca_hdl, 1348 IBT_ACHAN_NO_FLAGS, &chanargs, &chanhdl, NULL); 1349 if (ret != IBT_SUCCESS) { 1350 RDS_DPRINTF2(LABEL, "ibt_alloc_rc_channel fail: %d", 1351 ret); 1352 (void) ibt_free_cq(ep->ep_recvcq); 1353 ep->ep_recvcq = NULL; 1354 (void) ibt_free_cq(ep->ep_sendcq); 1355 ep->ep_sendcq = NULL; 1356 mutex_exit(&ep->ep_lock); 1357 return (NULL); 1358 } 1359 mutex_exit(&ep->ep_lock); 1360 1361 /* Chan private should contain the ep */ 1362 (void) ibt_set_chan_private(chanhdl, ep); 1363 1364 RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Return: 0x%p", chanhdl); 1365 1366 return (chanhdl); 1367 } 1368 1369 1370 #if 0 1371 1372 /* Return node guid given a port gid */ 1373 ib_guid_t 1374 rds_gid_to_node_guid(ib_gid_t gid) 1375 { 1376 ibt_node_info_t nodeinfo; 1377 int ret; 1378 1379 RDS_DPRINTF4("rds_gid_to_node_guid", "Enter: gid: %llx:%llx", 1380 gid.gid_prefix, gid.gid_guid); 1381 1382 ret = ibt_gid_to_node_info(gid, &nodeinfo); 1383 if (ret != IBT_SUCCESS) { 1384 RDS_DPRINTF2(LABEL, "ibt_gid_node_info for gid: %llx:%llx " 1385 "failed", gid.gid_prefix, gid.gid_guid); 1386 return (0LL); 1387 } 1388 1389 RDS_DPRINTF4("rds_gid_to_node_guid", "Return: Node guid: %llx", 1390 nodeinfo.n_node_guid); 1391 1392 return (nodeinfo.n_node_guid); 1393 } 1394 1395 #endif 1396 1397 static void 1398 rds_handle_portup_event(rds_state_t *statep, ibt_hca_hdl_t hdl, 1399 ibt_async_event_t *event) 1400 { 1401 rds_hca_t *hcap; 1402 ibt_hca_portinfo_t *newpinfop, *oldpinfop; 1403 uint_t newsize, oldsize, nport; 1404 ib_gid_t gid; 1405 int ret; 1406 1407 RDS_DPRINTF2("rds_handle_portup_event", 1408 "Enter: GUID: 0x%llx Statep: %p", event->ev_hca_guid, statep); 1409 1410 rw_enter(&statep->rds_hca_lock, RW_WRITER); 1411 1412 hcap = statep->rds_hcalistp; 1413 while ((hcap != NULL) 
&& (hcap->hca_guid != event->ev_hca_guid)) { 1414 hcap = hcap->hca_nextp; 1415 } 1416 1417 if (hcap == NULL) { 1418 RDS_DPRINTF2("rds_handle_portup_event", "HCA: 0x%llx is " 1419 "not in our list", event->ev_hca_guid); 1420 rw_exit(&statep->rds_hca_lock); 1421 return; 1422 } 1423 1424 ret = ibt_query_hca_ports(hdl, 0, &newpinfop, &nport, &newsize); 1425 if (ret != IBT_SUCCESS) { 1426 RDS_DPRINTF2(LABEL, "ibt_query_hca_ports failed: %d", ret); 1427 rw_exit(&statep->rds_hca_lock); 1428 return; 1429 } 1430 1431 oldpinfop = hcap->hca_pinfop; 1432 oldsize = hcap->hca_pinfo_sz; 1433 hcap->hca_pinfop = newpinfop; 1434 hcap->hca_pinfo_sz = newsize; 1435 1436 (void) ibt_free_portinfo(oldpinfop, oldsize); 1437 1438 /* If RDS service is not registered then no bind is needed */ 1439 if (statep->rds_srvhdl == NULL) { 1440 RDS_DPRINTF2("rds_handle_portup_event", 1441 "RDS Service is not registered, so no action needed"); 1442 rw_exit(&statep->rds_hca_lock); 1443 return; 1444 } 1445 1446 /* 1447 * If the service was previously bound on this port and 1448 * if this port has changed state down and now up, we do not 1449 * need to bind the service again. The bind is expected to 1450 * persist across state changes. If the service was never bound 1451 * before then we bind it this time. 
1452 */ 1453 if (hcap->hca_bindhdl[event->ev_port - 1] == NULL) { 1454 1455 /* structure copy */ 1456 gid = newpinfop[event->ev_port - 1].p_sgid_tbl[0]; 1457 1458 /* bind RDS service on the port, pass statep as cm_private */ 1459 ret = ibt_bind_service(statep->rds_srvhdl, gid, NULL, statep, 1460 &hcap->hca_bindhdl[event->ev_port - 1]); 1461 if (ret != IBT_SUCCESS) { 1462 RDS_DPRINTF2("rds_handle_portup_event", 1463 "Bind service for HCA: 0x%llx Port: %d " 1464 "gid %llx:%llx returned: %d", event->ev_hca_guid, 1465 event->ev_port, gid.gid_prefix, gid.gid_guid, ret); 1466 } 1467 } 1468 1469 rw_exit(&statep->rds_hca_lock); 1470 1471 RDS_DPRINTF2("rds_handle_portup_event", "Return: GUID: 0x%llx", 1472 event->ev_hca_guid); 1473 } 1474 1475 static void 1476 rdsib_add_hca(ib_guid_t hca_guid) 1477 { 1478 rds_hca_t *hcap; 1479 ibt_mr_attr_t mem_attr; 1480 ibt_mr_desc_t mem_desc; 1481 int ret; 1482 1483 RDS_DPRINTF2("rdsib_add_hca", "Enter: GUID: 0x%llx", hca_guid); 1484 1485 hcap = rdsib_init_hca(hca_guid); 1486 if (hcap == NULL) 1487 return; 1488 1489 /* register the recv memory with this hca */ 1490 mutex_enter(&rds_dpool.pool_lock); 1491 if (rds_dpool.pool_memp == NULL) { 1492 /* no memory to register */ 1493 RDS_DPRINTF2("rdsib_add_hca", "No memory to register"); 1494 mutex_exit(&rds_dpool.pool_lock); 1495 return; 1496 } 1497 1498 mem_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)rds_dpool.pool_memp; 1499 mem_attr.mr_len = rds_dpool.pool_memsize; 1500 mem_attr.mr_as = NULL; 1501 mem_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE; 1502 1503 ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl, &mem_attr, 1504 &hcap->hca_mrhdl, &mem_desc); 1505 1506 mutex_exit(&rds_dpool.pool_lock); 1507 1508 if (ret != IBT_SUCCESS) { 1509 RDS_DPRINTF2("rdsib_add_hca", "ibt_register_mr failed: %d", 1510 ret); 1511 } else { 1512 rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER); 1513 hcap->hca_state = RDS_HCA_STATE_MEM_REGISTERED; 1514 hcap->hca_lkey = mem_desc.md_lkey; 1515 hcap->hca_rkey = 
mem_desc.md_rkey; 1516 rw_exit(&rdsib_statep->rds_hca_lock); 1517 } 1518 1519 RDS_DPRINTF2("rdsib_add_hca", "Retrun: GUID: 0x%llx", hca_guid); 1520 } 1521 1522 void rds_close_this_session(rds_session_t *sp, uint8_t wait); 1523 int rds_post_control_message(rds_session_t *sp, uint8_t code, in_port_t port); 1524 1525 static void 1526 rdsib_del_hca(rds_state_t *statep, ib_guid_t hca_guid) 1527 { 1528 rds_session_t *sp; 1529 rds_hca_t *hcap; 1530 rds_hca_state_t saved_state; 1531 int ret, ix; 1532 1533 RDS_DPRINTF2("rdsib_del_hca", "Enter: GUID: 0x%llx", hca_guid); 1534 1535 /* 1536 * This should be a write lock as we don't want anyone to get access 1537 * to the hcap while we are modifing its contents 1538 */ 1539 rw_enter(&statep->rds_hca_lock, RW_WRITER); 1540 1541 hcap = statep->rds_hcalistp; 1542 while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) { 1543 hcap = hcap->hca_nextp; 1544 } 1545 1546 /* Prevent initiating any new activity on this HCA */ 1547 ASSERT(hcap != NULL); 1548 saved_state = hcap->hca_state; 1549 hcap->hca_state = RDS_HCA_STATE_STOPPING; 1550 1551 rw_exit(&statep->rds_hca_lock); 1552 1553 /* 1554 * stop the outgoing traffic and close any active sessions on this hca. 1555 * Any pending messages in the SQ will be allowed to complete. 1556 */ 1557 rw_enter(&statep->rds_sessionlock, RW_READER); 1558 sp = statep->rds_sessionlistp; 1559 while (sp) { 1560 if (sp->session_hca_guid != hca_guid) { 1561 sp = sp->session_nextp; 1562 continue; 1563 } 1564 1565 rw_enter(&sp->session_lock, RW_WRITER); 1566 RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp, 1567 sp->session_state); 1568 /* 1569 * We are changing the session state in advance. This prevents 1570 * further messages to be posted to the SQ. We then 1571 * send a control message to the remote and tell it close 1572 * the session. 
1573 */ 1574 sp->session_state = RDS_SESSION_STATE_HCA_CLOSING; 1575 RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State " 1576 "RDS_SESSION_STATE_PASSIVE_CLOSING", sp); 1577 rw_exit(&sp->session_lock); 1578 1579 /* 1580 * wait until the sendq is empty then tell the remote to 1581 * close this session. This enables for graceful shutdown of 1582 * the session 1583 */ 1584 rds_is_sendq_empty(&sp->session_dataep, 2); 1585 (void) rds_post_control_message(sp, 1586 RDS_CTRL_CODE_CLOSE_SESSION, 0); 1587 1588 sp = sp->session_nextp; 1589 } 1590 1591 /* wait until all the sessions are off this HCA */ 1592 sp = statep->rds_sessionlistp; 1593 while (sp) { 1594 if (sp->session_hca_guid != hca_guid) { 1595 sp = sp->session_nextp; 1596 continue; 1597 } 1598 1599 rw_enter(&sp->session_lock, RW_READER); 1600 RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp, 1601 sp->session_state); 1602 1603 while ((sp->session_state == RDS_SESSION_STATE_HCA_CLOSING) || 1604 (sp->session_state == RDS_SESSION_STATE_ERROR) || 1605 (sp->session_state == RDS_SESSION_STATE_PASSIVE_CLOSING) || 1606 (sp->session_state == RDS_SESSION_STATE_CLOSED)) { 1607 rw_exit(&sp->session_lock); 1608 delay(drv_usectohz(1000000)); 1609 rw_enter(&sp->session_lock, RW_READER); 1610 RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp, 1611 sp->session_state); 1612 } 1613 1614 rw_exit(&sp->session_lock); 1615 1616 sp = sp->session_nextp; 1617 } 1618 rw_exit(&statep->rds_sessionlock); 1619 1620 /* 1621 * if rdsib_close_ib was called before this, then that would have 1622 * unbound the service on all ports. In that case, the HCA structs 1623 * will contain stale bindhdls. Hence, we do not call unbind unless 1624 * the service is still registered. 
1625 */ 1626 if (statep->rds_srvhdl != NULL) { 1627 /* unbind RDS service on all ports on this HCA */ 1628 for (ix = 0; ix < hcap->hca_nports; ix++) { 1629 if (hcap->hca_bindhdl[ix] == NULL) { 1630 continue; 1631 } 1632 1633 RDS_DPRINTF2("rdsib_del_hca", 1634 "Unbinding Service: port: %d, bindhdl: %p", 1635 ix + 1, hcap->hca_bindhdl[ix]); 1636 (void) ibt_unbind_service(rdsib_statep->rds_srvhdl, 1637 hcap->hca_bindhdl[ix]); 1638 hcap->hca_bindhdl[ix] = NULL; 1639 } 1640 } 1641 1642 RDS_DPRINTF2("rdsib_del_hca", "HCA(%p) State: %d", hcap, 1643 hcap->hca_state); 1644 1645 switch (saved_state) { 1646 case RDS_HCA_STATE_MEM_REGISTERED: 1647 ASSERT(hcap->hca_mrhdl != NULL); 1648 ret = ibt_deregister_mr(hcap->hca_hdl, hcap->hca_mrhdl); 1649 if (ret != IBT_SUCCESS) { 1650 RDS_DPRINTF2("rdsib_del_hca", 1651 "ibt_deregister_mr failed: %d", ret); 1652 return; 1653 } 1654 hcap->hca_mrhdl = NULL; 1655 /* FALLTHRU */ 1656 case RDS_HCA_STATE_OPEN: 1657 ASSERT(hcap->hca_hdl != NULL); 1658 ASSERT(hcap->hca_pdhdl != NULL); 1659 1660 1661 ret = ibt_free_pd(hcap->hca_hdl, hcap->hca_pdhdl); 1662 if (ret != IBT_SUCCESS) { 1663 RDS_DPRINTF2("rdsib_del_hca", 1664 "ibt_free_pd failed: %d", ret); 1665 } 1666 1667 (void) ibt_free_portinfo(hcap->hca_pinfop, hcap->hca_pinfo_sz); 1668 1669 ret = ibt_close_hca(hcap->hca_hdl); 1670 if (ret != IBT_SUCCESS) { 1671 RDS_DPRINTF2("rdsib_del_hca", 1672 "ibt_close_hca failed: %d", ret); 1673 } 1674 1675 hcap->hca_hdl = NULL; 1676 hcap->hca_pdhdl = NULL; 1677 hcap->hca_lkey = 0; 1678 hcap->hca_rkey = 0; 1679 } 1680 1681 /* 1682 * This should be a write lock as we don't want anyone to get access 1683 * to the hcap while we are modifing its contents 1684 */ 1685 rw_enter(&statep->rds_hca_lock, RW_WRITER); 1686 hcap->hca_state = RDS_HCA_STATE_REMOVED; 1687 rw_exit(&statep->rds_hca_lock); 1688 1689 RDS_DPRINTF2("rdsib_del_hca", "Return: GUID: 0x%llx", hca_guid); 1690 } 1691 1692 static void 1693 rds_async_handler(void *clntp, ibt_hca_hdl_t hdl, 
ibt_async_code_t code, 1694 ibt_async_event_t *event) 1695 { 1696 rds_state_t *statep = (rds_state_t *)clntp; 1697 1698 RDS_DPRINTF2("rds_async_handler", "Async code: %d", code); 1699 1700 switch (code) { 1701 case IBT_EVENT_PORT_UP: 1702 rds_handle_portup_event(statep, hdl, event); 1703 break; 1704 case IBT_HCA_ATTACH_EVENT: 1705 /* 1706 * NOTE: In some error recovery paths, it is possible to 1707 * receive IBT_HCA_ATTACH_EVENTs on already known HCAs. 1708 */ 1709 (void) rdsib_add_hca(event->ev_hca_guid); 1710 break; 1711 case IBT_HCA_DETACH_EVENT: 1712 (void) rdsib_del_hca(statep, event->ev_hca_guid); 1713 break; 1714 1715 default: 1716 RDS_DPRINTF2(LABEL, "Async event: %d not handled", code); 1717 } 1718 1719 RDS_DPRINTF2("rds_async_handler", "Return: code: %d", code); 1720 } 1721