/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
46 * 47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 54 * SOFTWARE. 55 * 56 */ 57 #include <sys/rds.h> 58 59 #include <sys/ib/clients/of/ofed_kernel.h> 60 #include <sys/ib/clients/of/rdma/ib_addr.h> 61 #include <sys/ib/clients/of/rdma/rdma_cm.h> 62 63 #include <sys/ib/clients/rdsv3/rdsv3.h> 64 #include <sys/ib/clients/rdsv3/ib.h> 65 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 66 67 extern ddi_taskq_t *rdsv3_taskq; 68 69 /* 70 * Set the selected protocol version 71 */ 72 static void 73 rdsv3_ib_set_protocol(struct rdsv3_connection *conn, unsigned int version) 74 { 75 RDSV3_DPRINTF4("rdsv3_ib_set_protocol", "conn: %p version: %d", 76 conn, version); 77 conn->c_version = version; 78 } 79 80 /* 81 * Set up flow control 82 */ 83 static void 84 rdsv3_ib_set_flow_control(struct rdsv3_connection *conn, uint32_t credits) 85 { 86 struct rdsv3_ib_connection *ic = conn->c_transport_data; 87 88 RDSV3_DPRINTF2("rdsv3_ib_set_flow_control", 89 "Enter: conn: %p credits: %d", conn, credits); 90 91 if (rdsv3_ib_sysctl_flow_control && credits != 0) { 92 /* We're doing flow control */ 93 ic->i_flowctl = 1; 94 rdsv3_ib_send_add_credits(conn, credits); 95 } else { 96 ic->i_flowctl = 0; 97 } 98 99 RDSV3_DPRINTF2("rdsv3_ib_set_flow_control", 100 "Return: conn: %p credits: %d", 101 conn, credits); 102 } 103 104 /* 105 * Tune RNR behavior. Without flow control, we use a rather 106 * low timeout, but not the absolute minimum - this should 107 * be tunable. 
108 * 109 * We already set the RNR retry count to 7 (which is the 110 * smallest infinite number :-) above. 111 * If flow control is off, we want to change this back to 0 112 * so that we learn quickly when our credit accounting is 113 * buggy. 114 * 115 * Caller passes in a qp_attr pointer - don't waste stack spacv 116 * by allocation this twice. 117 */ 118 static void 119 rdsv3_ib_tune_rnr(struct rdsv3_ib_connection *ic, struct ib_qp_attr *attr) 120 { 121 int ret; 122 123 RDSV3_DPRINTF2("rdsv3_ib_tune_rnr", "Enter ic: %p attr: %p", 124 ic, attr); 125 126 attr->min_rnr_timer = IB_RNR_TIMER_000_32; 127 ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER); 128 if (ret) 129 RDSV3_DPRINTF2("rdsv3_ib_tune_rnr", 130 "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d", -ret); 131 } 132 133 /* 134 * Connection established. 135 * We get here for both outgoing and incoming connection. 136 */ 137 void 138 rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn, 139 struct rdma_cm_event *event) 140 { 141 const struct rdsv3_ib_connect_private *dp = NULL; 142 struct rdsv3_ib_connection *ic = conn->c_transport_data; 143 struct rdsv3_ib_device *rds_ibdev; 144 struct ib_qp_attr qp_attr; 145 int err; 146 147 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", 148 "Enter conn: %p event: %p", conn, event); 149 150 if (event->param.conn.private_data_len >= sizeof (*dp)) { 151 dp = event->param.conn.private_data; 152 153 /* make sure it isn't empty data */ 154 if (dp->dp_protocol_major) { 155 rdsv3_ib_set_protocol(conn, 156 RDS_PROTOCOL(dp->dp_protocol_major, 157 dp->dp_protocol_minor)); 158 rdsv3_ib_set_flow_control(conn, 159 ntohl(dp->dp_credit)); 160 } 161 } 162 163 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", 164 "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s", 165 NIPQUAD(conn->c_faddr), 166 RDS_PROTOCOL_MAJOR(conn->c_version), 167 RDS_PROTOCOL_MINOR(conn->c_version), 168 ic->i_flowctl ? ", flow control" : ""); 169 170 /* 171 * Init rings and fill recv. 
this needs to wait until protocol 172 * negotiation 173 * is complete, since ring layout is different from 3.0 to 3.1. 174 */ 175 rdsv3_ib_send_init_ring(ic); 176 rdsv3_ib_recv_init_ring(ic); 177 /* 178 * Post receive buffers - as a side effect, this will update 179 * the posted credit count. 180 */ 181 (void) rdsv3_ib_recv_refill(conn, KM_NOSLEEP, 0, 1); 182 183 /* Tune RNR behavior */ 184 rdsv3_ib_tune_rnr(ic, &qp_attr); 185 186 qp_attr.qp_state = IB_QPS_RTS; 187 err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); 188 if (err) 189 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", 190 "ib_modify_qp(IB_QP_STATE, RTS): err=%d", err); 191 192 /* update ib_device with this local ipaddr & conn */ 193 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client); 194 err = rdsv3_ib_update_ipaddr(rds_ibdev, conn->c_laddr); 195 if (err) 196 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", 197 "rdsv3_ib_update_ipaddr failed (%d)", err); 198 rdsv3_ib_add_conn(rds_ibdev, conn); 199 200 /* 201 * If the peer gave us the last packet it saw, process this as if 202 * we had received a regular ACK. 203 */ 204 if (dp && dp->dp_ack_seq) 205 rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL); 206 207 rdsv3_connect_complete(conn); 208 209 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", 210 "Return conn: %p event: %p", 211 conn, event); 212 } 213 214 static void 215 rdsv3_ib_cm_fill_conn_param(struct rdsv3_connection *conn, 216 struct rdma_conn_param *conn_param, 217 struct rdsv3_ib_connect_private *dp, 218 uint32_t protocol_version) 219 { 220 RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param", 221 "Enter conn: %p conn_param: %p private: %p version: %d", 222 conn, conn_param, dp, protocol_version); 223 224 (void) memset(conn_param, 0, sizeof (struct rdma_conn_param)); 225 /* XXX tune these? 
*/ 226 conn_param->responder_resources = 1; 227 conn_param->initiator_depth = 1; 228 conn_param->retry_count = min(rdsv3_ib_retry_count, 7); 229 conn_param->rnr_retry_count = 7; 230 231 if (dp) { 232 struct rdsv3_ib_connection *ic = conn->c_transport_data; 233 234 (void) memset(dp, 0, sizeof (*dp)); 235 dp->dp_saddr = conn->c_laddr; 236 dp->dp_daddr = conn->c_faddr; 237 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); 238 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); 239 dp->dp_protocol_minor_mask = 240 htons(RDSV3_IB_SUPPORTED_PROTOCOLS); 241 dp->dp_ack_seq = rdsv3_ib_piggyb_ack(ic); 242 243 /* Advertise flow control */ 244 if (ic->i_flowctl) { 245 unsigned int credits; 246 247 credits = IB_GET_POST_CREDITS( 248 atomic_get(&ic->i_credits)); 249 dp->dp_credit = htonl(credits); 250 atomic_add_32(&ic->i_credits, 251 -IB_SET_POST_CREDITS(credits)); 252 } 253 254 conn_param->private_data = dp; 255 conn_param->private_data_len = sizeof (*dp); 256 } 257 258 RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param", 259 "Return conn: %p conn_param: %p private: %p version: %d", 260 conn, conn_param, dp, protocol_version); 261 } 262 263 static void 264 rdsv3_ib_cq_event_handler(struct ib_event *event, void *data) 265 { 266 RDSV3_DPRINTF3("rdsv3_ib_cq_event_handler", "event %u data %p", 267 event->event, data); 268 } 269 270 static void 271 rdsv3_ib_qp_event_handler(struct ib_event *event, void *data) 272 { 273 struct rdsv3_connection *conn = data; 274 struct rdsv3_ib_connection *ic = conn->c_transport_data; 275 276 RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "conn %p ic %p event %u", 277 conn, ic, event->event); 278 279 switch (event->event) { 280 case IB_EVENT_COMM_EST: 281 (void) rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); 282 break; 283 default: 284 if (conn) { 285 RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", 286 "RDS/IB: Fatal QP Event %u - " 287 "connection %u.%u.%u.%u ->%u.%u.%u.%u " 288 "...reconnecting", 289 event->event, NIPQUAD(conn->c_laddr), 290 
NIPQUAD(conn->c_faddr)); 291 rdsv3_conn_drop(conn); 292 } else { 293 RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", 294 "RDS/IB: Fatal QP Event %u - connection" 295 "...reconnecting", event->event); 296 } 297 break; 298 } 299 300 RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "Return conn: %p event: %p", 301 conn, event); 302 } 303 304 extern int rdsv3_ib_alloc_hdrs(ib_device_t *dev, 305 struct rdsv3_ib_connection *ic); 306 extern void rdsv3_ib_free_hdrs(ib_device_t *dev, 307 struct rdsv3_ib_connection *ic); 308 309 /* 310 * This needs to be very careful to not leave IS_ERR pointers around for 311 * cleanup to trip over. 312 */ 313 static int 314 rdsv3_ib_setup_qp(struct rdsv3_connection *conn) 315 { 316 struct rdsv3_ib_connection *ic = conn->c_transport_data; 317 struct ib_device *dev = ic->i_cm_id->device; 318 struct ib_qp_init_attr attr; 319 struct rdsv3_ib_device *rds_ibdev; 320 ibt_send_wr_t *wrp; 321 ibt_wr_ds_t *sgl; 322 int ret, i; 323 324 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "Enter conn: %p", conn); 325 326 /* 327 * rdsv3_ib_add_one creates a rdsv3_ib_device object per IB device, 328 * and allocates a protection domain, memory range and FMR pool 329 * for each. If that fails for any reason, it will not register 330 * the rds_ibdev at all. 
331 */ 332 rds_ibdev = ib_get_client_data(dev, &rdsv3_ib_client); 333 if (rds_ibdev == NULL) { 334 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 335 "RDS/IB: No client_data for device %s", dev->name); 336 return (-EOPNOTSUPP); 337 } 338 339 if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) 340 rdsv3_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); 341 if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1) 342 rdsv3_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1); 343 344 /* Protection domain and memory range */ 345 ic->i_pd = rds_ibdev->pd; 346 347 ic->i_send_cq = ib_create_cq(dev, rdsv3_ib_send_cq_comp_handler, 348 rdsv3_ib_cq_event_handler, conn, 349 ic->i_send_ring.w_nr + 1, 0); 350 if (IS_ERR(ic->i_send_cq)) { 351 ret = PTR_ERR(ic->i_send_cq); 352 ic->i_send_cq = NULL; 353 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 354 "ib_create_cq send failed: %d", ret); 355 goto out; 356 } 357 358 ic->i_recv_cq = ib_create_cq(dev, rdsv3_ib_recv_cq_comp_handler, 359 rdsv3_ib_cq_event_handler, conn, 360 ic->i_recv_ring.w_nr, 0); 361 if (IS_ERR(ic->i_recv_cq)) { 362 ret = PTR_ERR(ic->i_recv_cq); 363 ic->i_recv_cq = NULL; 364 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 365 "ib_create_cq recv failed: %d", ret); 366 goto out; 367 } 368 369 ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); 370 if (ret) { 371 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 372 "ib_req_notify_cq send failed: %d", ret); 373 goto out; 374 } 375 376 ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); 377 if (ret) { 378 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 379 "ib_req_notify_cq recv failed: %d", ret); 380 goto out; 381 } 382 383 /* XXX negotiate max send/recv with remote? 
*/ 384 (void) memset(&attr, 0, sizeof (attr)); 385 attr.event_handler = rdsv3_ib_qp_event_handler; 386 attr.qp_context = conn; 387 /* + 1 to allow for the single ack message */ 388 attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1; 389 attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1; 390 attr.cap.max_send_sge = rds_ibdev->max_sge; 391 attr.cap.max_recv_sge = RDSV3_IB_RECV_SGE; 392 attr.sq_sig_type = IB_SIGNAL_REQ_WR; 393 attr.qp_type = IB_QPT_RC; 394 attr.send_cq = ic->i_send_cq; 395 attr.recv_cq = ic->i_recv_cq; 396 397 /* 398 * XXX this can fail if max_*_wr is too large? Are we supposed 399 * to back off until we get a value that the hardware can support? 400 */ 401 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); 402 if (ret) { 403 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 404 "rdma_create_qp failed: %d", ret); 405 goto out; 406 } 407 408 ret = rdsv3_ib_alloc_hdrs(dev, ic); 409 if (ret != 0) { 410 ret = -ENOMEM; 411 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 412 "rdsv3_ib_alloc_hdrs failed: %d", ret); 413 goto out; 414 } 415 416 ic->i_sends = kmem_alloc(ic->i_send_ring.w_nr * 417 sizeof (struct rdsv3_ib_send_work), KM_NOSLEEP); 418 if (ic->i_sends == NULL) { 419 ret = -ENOMEM; 420 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 421 "send allocation failed: %d", ret); 422 goto out; 423 } 424 (void) memset(ic->i_sends, 0, ic->i_send_ring.w_nr * 425 sizeof (struct rdsv3_ib_send_work)); 426 427 ic->i_send_wrs = 428 kmem_alloc(RDSV3_IB_SEND_WRS * (sizeof (ibt_send_wr_t) + 429 RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)), KM_NOSLEEP); 430 if (ic->i_send_wrs == NULL) { 431 ret = -ENOMEM; 432 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 433 "WR allocation failed: %d", ret); 434 goto out; 435 } 436 sgl = (ibt_wr_ds_t *)((uint8_t *)ic->i_send_wrs + 437 (RDSV3_IB_SEND_WRS * sizeof (ibt_send_wr_t))); 438 RDSV3_DPRINTF4("rdsv3_ib_setup_qp", "i_send_wrs: %p sgl: %p", 439 ic->i_send_wrs, sgl); 440 for (i = 0; i < RDSV3_IB_SEND_WRS; i++) { 441 wrp = &ic->i_send_wrs[i]; 442 wrp->wr_sgl = &sgl[i * RDSV3_IB_MAX_SGE]; 
443 } 444 445 ic->i_recvs = kmem_alloc(ic->i_recv_ring.w_nr * 446 sizeof (struct rdsv3_ib_recv_work), KM_NOSLEEP); 447 if (ic->i_recvs == NULL) { 448 ret = -ENOMEM; 449 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 450 "recv allocation failed: %d", ret); 451 goto out; 452 } 453 (void) memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr * 454 sizeof (struct rdsv3_ib_recv_work)); 455 456 rdsv3_ib_recv_init_ack(ic); 457 458 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "conn %p pd %p mr %p cq %p %p", 459 conn, ic->i_pd, ic->i_mr, ic->i_send_cq, ic->i_recv_cq); 460 461 out: 462 return (ret); 463 } 464 465 static uint32_t 466 rdsv3_ib_protocol_compatible(struct rdma_cm_event *event) 467 { 468 const struct rdsv3_ib_connect_private *dp = 469 event->param.conn.private_data; 470 uint16_t common; 471 uint32_t version = 0; 472 473 RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Enter event: %p", 474 event); 475 476 /* 477 * rdma_cm private data is odd - when there is any private data in the 478 * request, we will be given a pretty large buffer without telling us 479 * the 480 * original size. The only way to tell the difference is by looking at 481 * the contents, which are initialized to zero. 482 * If the protocol version fields aren't set, 483 * this is a connection attempt 484 * from an older version. This could could be 3.0 or 2.0 - 485 * we can't tell. 486 * We really should have changed this for OFED 1.3 :-( 487 */ 488 489 /* Be paranoid. RDS always has privdata */ 490 if (!event->param.conn.private_data_len) { 491 RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", 492 "RDS incoming connection has no private data, rejecting"); 493 return (0); 494 } 495 496 /* Even if len is crap *now* I still want to check it. 
-ASG */ 497 if (event->param.conn.private_data_len < sizeof (*dp) || 498 dp->dp_protocol_major == 0) 499 return (RDS_PROTOCOL_3_0); 500 501 common = ntohs(dp->dp_protocol_minor_mask) & 502 RDSV3_IB_SUPPORTED_PROTOCOLS; 503 if (dp->dp_protocol_major == 3 && common) { 504 version = RDS_PROTOCOL_3_0; 505 while ((common >>= 1) != 0) 506 version++; 507 } else { 508 RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", 509 "RDS: Connection from %u.%u.%u.%u using " 510 "incompatible protocol version %u.%u\n", 511 NIPQUAD(dp->dp_saddr), 512 dp->dp_protocol_major, 513 dp->dp_protocol_minor); 514 } 515 516 RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Return event: %p", 517 event); 518 519 return (version); 520 } 521 522 int 523 rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 524 struct rdma_cm_event *event) 525 { 526 uint64_be_t lguid = cm_id->route.path_rec->sgid.global.interface_id; 527 uint64_be_t fguid = cm_id->route.path_rec->dgid.global.interface_id; 528 const struct rdsv3_ib_connect_private *dp = 529 event->param.conn.private_data; 530 struct rdsv3_ib_connect_private dp_rep; 531 struct rdsv3_connection *conn = NULL; 532 struct rdsv3_ib_connection *ic = NULL; 533 struct rdma_conn_param conn_param; 534 uint32_t version; 535 int err, destroy = 1; 536 boolean_t conn_created = B_FALSE; 537 538 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 539 "Enter cm_id: %p event: %p", cm_id, event); 540 541 /* Check whether the remote protocol version matches ours. 
*/ 542 version = rdsv3_ib_protocol_compatible(event); 543 if (!version) { 544 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 545 "version mismatch"); 546 goto out; 547 } 548 549 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 550 "saddr %u.%u.%u.%u daddr %u.%u.%u.%u RDSv%d.%d lguid 0x%llx fguid " 551 "0x%llx", NIPQUAD(dp->dp_saddr), NIPQUAD(dp->dp_daddr), 552 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), 553 (unsigned long long)ntohll(lguid), 554 (unsigned long long)ntohll(fguid)); 555 556 conn = rdsv3_conn_create(dp->dp_daddr, dp->dp_saddr, 557 &rdsv3_ib_transport, KM_NOSLEEP); 558 if (IS_ERR(conn)) { 559 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 560 "rdsv3_conn_create failed (%ld)", PTR_ERR(conn)); 561 conn = NULL; 562 goto out; 563 } 564 565 /* 566 * The connection request may occur while the 567 * previous connection exist, e.g. in case of failover. 568 * But as connections may be initiated simultaneously 569 * by both hosts, we have a random backoff mechanism - 570 * see the comment above rdsv3_queue_reconnect() 571 */ 572 mutex_enter(&conn->c_cm_lock); 573 if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, 574 RDSV3_CONN_CONNECTING)) { 575 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { 576 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 577 "incoming connect when connected: %p", 578 conn); 579 rdsv3_conn_drop(conn); 580 rdsv3_ib_stats_inc(s_ib_listen_closed_stale); 581 mutex_exit(&conn->c_cm_lock); 582 goto out; 583 } else if (rdsv3_conn_state(conn) == RDSV3_CONN_CONNECTING) { 584 /* Wait and see - our connect may still be succeeding */ 585 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 586 "peer-to-peer connection request: %p, " 587 "lguid: 0x%llx fguid: 0x%llx", 588 conn, lguid, fguid); 589 rdsv3_ib_stats_inc(s_ib_connect_raced); 590 } 591 mutex_exit(&conn->c_cm_lock); 592 goto out; 593 } 594 595 ic = conn->c_transport_data; 596 597 rdsv3_ib_set_protocol(conn, version); 598 rdsv3_ib_set_flow_control(conn, ntohl(dp->dp_credit)); 599 600 /* 601 * If the 
peer gave us the last packet it saw, process this as if 602 * we had received a regular ACK. 603 */ 604 if (dp->dp_ack_seq) 605 rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL); 606 607 ASSERT(!cm_id->context); 608 ASSERT(!ic->i_cm_id); 609 610 if (ic->i_cm_id != NULL) 611 RDSV3_PANIC(); 612 613 ic->i_cm_id = cm_id; 614 cm_id->context = conn; 615 616 /* 617 * We got halfway through setting up the ib_connection, if we 618 * fail now, we have to take the long route out of this mess. 619 */ 620 destroy = 0; 621 622 err = rdsv3_ib_setup_qp(conn); 623 if (err) { 624 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 625 "rdsv3_ib_setup_qp failed (%d)", err); 626 mutex_exit(&conn->c_cm_lock); 627 rdsv3_conn_drop(conn); 628 goto out; 629 } 630 631 rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); 632 633 /* rdma_accept() calls rdma_reject() internally if it fails */ 634 err = rdma_accept(cm_id, &conn_param); 635 mutex_exit(&conn->c_cm_lock); 636 if (err) { 637 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 638 "rdma_accept failed (%d)", err); 639 rdsv3_conn_drop(conn); 640 goto out; 641 } 642 643 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 644 "Return cm_id: %p event: %p", cm_id, event); 645 646 return (0); 647 648 out: 649 (void) rdma_reject(cm_id, NULL, 0); 650 return (destroy); 651 } 652 653 654 int 655 rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) 656 { 657 struct rdsv3_connection *conn = cm_id->context; 658 struct rdsv3_ib_connection *ic = conn->c_transport_data; 659 struct rdma_conn_param conn_param; 660 struct rdsv3_ib_connect_private dp; 661 int ret; 662 663 RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", "Enter: cm_id: %p", 664 cm_id); 665 666 /* 667 * If the peer doesn't do protocol negotiation, we must 668 * default to RDSv3.0 669 */ 670 rdsv3_ib_set_protocol(conn, RDS_PROTOCOL_3_0); 671 ic->i_flowctl = 672 rdsv3_ib_sysctl_flow_control; /* advertise flow control */ 673 674 ret = rdsv3_ib_setup_qp(conn); 675 if (ret) { 676 
RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", 677 "rdsv3_ib_setup_qp failed (%d)", ret); 678 rdsv3_conn_drop(conn); 679 goto out; 680 } 681 682 (void) rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp, 683 RDS_PROTOCOL_VERSION); 684 685 ret = rdma_connect(cm_id, &conn_param); 686 if (ret) { 687 RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", 688 "rdma_connect failed (%d)", ret); 689 rdsv3_conn_drop(conn); 690 } 691 692 RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", 693 "Return: cm_id: %p", cm_id); 694 695 out: 696 /* 697 * Beware - returning non-zero tells the rdma_cm to destroy 698 * the cm_id. We should certainly not do it as long as we still 699 * "own" the cm_id. 700 */ 701 if (ret) { 702 if (ic->i_cm_id == cm_id) 703 ret = 0; 704 } 705 return (ret); 706 } 707 708 int 709 rdsv3_ib_conn_connect(struct rdsv3_connection *conn) 710 { 711 struct rdsv3_ib_connection *ic = conn->c_transport_data; 712 struct sockaddr_in src, dest; 713 ipaddr_t laddr, faddr; 714 int ret; 715 716 RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Enter: conn: %p", conn); 717 718 /* 719 * XXX I wonder what affect the port space has 720 */ 721 /* delegate cm event handler to rdma_transport */ 722 ic->i_cm_id = rdma_create_id(rdsv3_rdma_cm_event_handler, conn, 723 RDMA_PS_TCP); 724 if (IS_ERR(ic->i_cm_id)) { 725 ret = PTR_ERR(ic->i_cm_id); 726 ic->i_cm_id = NULL; 727 RDSV3_DPRINTF2("rdsv3_ib_conn_connect", 728 "rdma_create_id() failed: %d", ret); 729 goto out; 730 } 731 732 RDSV3_DPRINTF3("rdsv3_ib_conn_connect", 733 "created cm id %p for conn %p", ic->i_cm_id, conn); 734 735 /* The ipaddr should be in the network order */ 736 laddr = conn->c_laddr; 737 faddr = conn->c_faddr; 738 ret = rdsv3_sc_path_lookup(&laddr, &faddr); 739 if (ret == 0) { 740 RDSV3_DPRINTF2(LABEL, "Path not found (0x%x 0x%x)", 741 ntohl(laddr), ntohl(faddr)); 742 } 743 744 src.sin_family = AF_INET; 745 src.sin_addr.s_addr = (uint32_t)laddr; 746 src.sin_port = (uint16_t)htons(0); 747 748 dest.sin_family = AF_INET; 749 
dest.sin_addr.s_addr = (uint32_t)faddr; 750 dest.sin_port = (uint16_t)htons(RDSV3_PORT); 751 752 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, 753 (struct sockaddr *)&dest, 754 RDSV3_RDMA_RESOLVE_TIMEOUT_MS); 755 if (ret) { 756 RDSV3_DPRINTF2("rdsv3_ib_conn_connect", 757 "addr resolve failed for cm id %p: %d", ic->i_cm_id, ret); 758 rdma_destroy_id(ic->i_cm_id); 759 ic->i_cm_id = NULL; 760 } 761 762 RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Return: conn: %p", conn); 763 764 out: 765 return (ret); 766 } 767 768 /* 769 * This is so careful about only cleaning up resources that were built up 770 * so that it can be called at any point during startup. In fact it 771 * can be called multiple times for a given connection. 772 */ 773 void 774 rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn) 775 { 776 struct rdsv3_ib_connection *ic = conn->c_transport_data; 777 int err = 0; 778 779 RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", 780 "cm %p pd %p cq %p %p qp %p", ic->i_cm_id, 781 ic->i_pd, ic->i_send_cq, ic->i_recv_cq, 782 ic->i_cm_id ? ic->i_cm_id->qp : NULL); 783 784 if (ic->i_cm_id) { 785 struct ib_device *dev = ic->i_cm_id->device; 786 787 RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", 788 "disconnecting cm %p", ic->i_cm_id); 789 err = rdma_disconnect(ic->i_cm_id); 790 if (err) { 791 /* 792 * Actually this may happen quite frequently, when 793 * an outgoing connect raced with an incoming connect. 
794 */ 795 RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", 796 "failed to disconnect, cm: %p err %d", 797 ic->i_cm_id, err); 798 } 799 800 if (ic->i_cm_id->qp) { 801 (void) ibt_flush_qp( 802 ib_get_ibt_channel_hdl(ic->i_cm_id)); 803 804 /* wait until all WRs are flushed */ 805 rdsv3_wait_event(&rdsv3_ib_ring_empty_wait, 806 rdsv3_ib_ring_empty(&ic->i_send_ring) && 807 rdsv3_ib_ring_empty(&ic->i_recv_ring)); 808 809 rdma_destroy_qp(ic->i_cm_id); 810 } 811 812 813 if (ic->i_mr) 814 rdsv3_ib_free_hdrs(dev, ic); 815 816 if (ic->i_sends) 817 rdsv3_ib_send_clear_ring(ic); 818 if (ic->i_recvs) 819 rdsv3_ib_recv_clear_ring(ic); 820 821 if (ic->i_send_cq) 822 (void) ib_destroy_cq(ic->i_send_cq); 823 if (ic->i_recv_cq) 824 (void) ib_destroy_cq(ic->i_recv_cq); 825 rdma_destroy_id(ic->i_cm_id); 826 827 /* 828 * Move connection back to the nodev list. 829 */ 830 if (ic->rds_ibdev) 831 rdsv3_ib_remove_conn(ic->rds_ibdev, conn); 832 833 ic->i_cm_id = NULL; 834 ic->i_pd = NULL; 835 ic->i_mr = NULL; 836 ic->i_send_cq = NULL; 837 ic->i_recv_cq = NULL; 838 ic->i_send_hdrs = NULL; 839 ic->i_recv_hdrs = NULL; 840 ic->i_ack = NULL; 841 } 842 ASSERT(!ic->rds_ibdev); 843 844 /* Clear pending transmit */ 845 if (ic->i_rm) { 846 rdsv3_message_put(ic->i_rm); 847 ic->i_rm = NULL; 848 } 849 850 /* Clear the ACK state */ 851 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); 852 ic->i_ack_next = 0; 853 ic->i_ack_recv = 0; 854 855 /* Clear flow control state */ 856 ic->i_flowctl = 0; 857 ic->i_credits = 0; 858 859 rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr); 860 rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr); 861 862 if (ic->i_ibinc) { 863 rdsv3_inc_put(&ic->i_ibinc->ii_inc); 864 ic->i_ibinc = NULL; 865 } 866 867 if (ic->i_sends) { 868 kmem_free(ic->i_sends, 869 ic->i_send_ring.w_nr * sizeof (struct rdsv3_ib_send_work)); 870 ic->i_sends = NULL; 871 } 872 if (ic->i_send_wrs) { 873 kmem_free(ic->i_send_wrs, RDSV3_IB_SEND_WRS * 874 (sizeof (ibt_send_wr_t) + 875 
RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t))); 876 ic->i_send_wrs = NULL; 877 } 878 if (ic->i_recvs) { 879 kmem_free(ic->i_recvs, 880 ic->i_recv_ring.w_nr * sizeof (struct rdsv3_ib_recv_work)); 881 ic->i_recvs = NULL; 882 } 883 884 RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", "Return conn: %p", conn); 885 } 886 887 /* 888 * the connection can be allocated from either rdsv3_conn_create_outgoing() 889 * or rdsv3_conn_create(), so ddi_taskq_create() can be called with the 890 * same string. This can print the kstat warning on the console. To prevent 891 * it, this counter value is used. 892 * Note that requests from rdsv3_conn_create_outgoing() refers to the cached 893 * value with the mutex lock before it allocates the connection, so that 894 * the warning cannot be produced in the case. (only between 895 * rdsv3_conn_create() and rdsv3_conn_create_outgoing(). 896 */ 897 static int conn_cnt; 898 899 /* ARGSUSED */ 900 int 901 rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp) 902 { 903 struct rdsv3_ib_connection *ic; 904 char tq_name[TASKQ_NAMELEN]; 905 906 RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn: %p", conn); 907 908 /* XXX too lazy? */ 909 ic = kmem_zalloc(sizeof (struct rdsv3_ib_connection), gfp); 910 if (ic == NULL) 911 return (-ENOMEM); 912 913 list_link_init(&ic->ib_node); 914 (void) snprintf(tq_name, TASKQ_NAMELEN, "RDSV3_CONN_to_%x:%u", 915 htonl(conn->c_faddr), conn_cnt++ % 100); 916 ic->i_recv_tasklet = 917 ddi_taskq_create(NULL, tq_name, 1, TASKQ_DEFAULTPRI, 0); 918 919 920 mutex_init(&ic->i_recv_mutex, NULL, MUTEX_DRIVER, NULL); 921 mutex_init(&ic->i_ack_lock, NULL, MUTEX_DRIVER, NULL); 922 923 /* 924 * rdsv3_ib_conn_shutdown() waits for these to be emptied so they 925 * must be initialized before it can be called. 
926 */ 927 rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr); 928 rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr); 929 930 ic->conn = conn; 931 conn->c_transport_data = ic; 932 933 mutex_enter(&ib_nodev_conns_lock); 934 list_insert_tail(&ib_nodev_conns, ic); 935 mutex_exit(&ib_nodev_conns_lock); 936 937 938 RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn %p conn ic %p", 939 conn, conn->c_transport_data); 940 return (0); 941 } 942 943 /* 944 * Free a connection. Connection must be shut down and not set for reconnect. 945 */ 946 void 947 rdsv3_ib_conn_free(void *arg) 948 { 949 struct rdsv3_ib_connection *ic = arg; 950 kmutex_t *lock_ptr; 951 952 RDSV3_DPRINTF2("rdsv3_ib_conn_free", "ic %p\n", ic); 953 954 #ifndef __lock_lint 955 /* 956 * Conn is either on a dev's list or on the nodev list. 957 * A race with shutdown() or connect() would cause problems 958 * (since rds_ibdev would change) but that should never happen. 959 */ 960 lock_ptr = ic->rds_ibdev ? 961 &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock; 962 963 mutex_enter(lock_ptr); 964 list_remove_node(&ic->ib_node); 965 mutex_exit(lock_ptr); 966 #endif 967 968 ddi_taskq_destroy(ic->i_recv_tasklet); 969 kmem_free(ic, sizeof (*ic)); 970 } 971 972 /* 973 * An error occurred on the connection 974 */ 975 void 976 __rdsv3_ib_conn_error(struct rdsv3_connection *conn) 977 { 978 rdsv3_conn_drop(conn); 979 } 980