1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 2006 Oracle. All rights reserved. 27 * 28 * This software is available to you under a choice of one of two 29 * licenses. You may choose to be licensed under the terms of the GNU 30 * General Public License (GPL) Version 2, available from the file 31 * COPYING in the main directory of this source tree, or the 32 * OpenIB.org BSD license below: 33 * 34 * Redistribution and use in source and binary forms, with or 35 * without modification, are permitted provided that the following 36 * conditions are met: 37 * 38 * - Redistributions of source code must retain the above 39 * copyright notice, this list of conditions and the following 40 * disclaimer. 41 * 42 * - Redistributions in binary form must reproduce the above 43 * copyright notice, this list of conditions and the following 44 * disclaimer in the documentation and/or other materials 45 * provided with the distribution. 
46 * 47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 54 * SOFTWARE. 55 * 56 */ 57 #include <sys/rds.h> 58 59 #include <sys/ib/clients/of/ofed_kernel.h> 60 #include <sys/ib/clients/of/rdma/ib_addr.h> 61 #include <sys/ib/clients/of/rdma/rdma_cm.h> 62 63 #include <sys/ib/clients/rdsv3/rdsv3.h> 64 #include <sys/ib/clients/rdsv3/ib.h> 65 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 66 67 extern ddi_taskq_t *rdsv3_taskq; 68 69 /* 70 * Set the selected protocol version 71 */ 72 static void 73 rdsv3_ib_set_protocol(struct rdsv3_connection *conn, unsigned int version) 74 { 75 RDSV3_DPRINTF4("rdsv3_ib_set_protocol", "conn: %p version: %d", 76 conn, version); 77 conn->c_version = version; 78 } 79 80 /* 81 * Set up flow control 82 */ 83 static void 84 rdsv3_ib_set_flow_control(struct rdsv3_connection *conn, uint32_t credits) 85 { 86 struct rdsv3_ib_connection *ic = conn->c_transport_data; 87 88 RDSV3_DPRINTF2("rdsv3_ib_set_flow_control", 89 "Enter: conn: %p credits: %d", conn, credits); 90 91 if (rdsv3_ib_sysctl_flow_control && credits != 0) { 92 /* We're doing flow control */ 93 ic->i_flowctl = 1; 94 rdsv3_ib_send_add_credits(conn, credits); 95 } else { 96 ic->i_flowctl = 0; 97 } 98 99 RDSV3_DPRINTF2("rdsv3_ib_set_flow_control", 100 "Return: conn: %p credits: %d", 101 conn, credits); 102 } 103 104 /* 105 * Tune RNR behavior. Without flow control, we use a rather 106 * low timeout, but not the absolute minimum - this should 107 * be tunable. 
 *
 * We already set the RNR retry count to 7 (which is the
 * smallest infinite number :-) above.
 * If flow control is off, we want to change this back to 0
 * so that we learn quickly when our credit accounting is
 * buggy.
 *
 * Caller passes in a qp_attr pointer - don't waste stack space
 * by allocating this twice.
 */
static void
rdsv3_ib_tune_rnr(struct rdsv3_ib_connection *ic, struct ib_qp_attr *attr)
{
	int ret;

	RDSV3_DPRINTF2("rdsv3_ib_tune_rnr", "Enter ic: %p attr: %p",
	    ic, attr);

	/* Only the min_rnr_timer attribute is modified here. */
	attr->min_rnr_timer = IB_RNR_TIMER_000_32;
	ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
	if (ret)
		/* failure is logged but not fatal to the connection */
		RDSV3_DPRINTF2("rdsv3_ib_tune_rnr",
		    "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d", -ret);
}

/*
 * Connection established.
 * We get here for both outgoing and incoming connection.
 *
 * Picks up the protocol version and credits the peer advertised in its
 * private data (if any), then finishes bringing the QP to RTS.
 */
void
rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn,
    struct rdma_cm_event *event)
{
	const struct rdsv3_ib_connect_private *dp = NULL;
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_device *rds_ibdev;
	struct ib_qp_attr qp_attr;
	int err;

	RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
	    "Enter conn: %p event: %p", conn, event);

	/* Peer's private data carries its protocol version and credits. */
	if (event->param.conn.private_data_len >= sizeof (*dp)) {
		dp = event->param.conn.private_data;

		/* make sure it isn't empty data */
		if (dp->dp_protocol_major) {
			rdsv3_ib_set_protocol(conn,
			    RDS_PROTOCOL(dp->dp_protocol_major,
			    dp->dp_protocol_minor));
			rdsv3_ib_set_flow_control(conn,
			    ntohl(dp->dp_credit));
		}
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
	    "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s",
	    NIPQUAD(conn->c_faddr),
	    RDS_PROTOCOL_MAJOR(conn->c_version),
	    RDS_PROTOCOL_MINOR(conn->c_version),
	    ic->i_flowctl ? ", flow control" : "");

	/*
	 * Init rings and fill recv.
this needs to wait until protocol
	 * negotiation
	 * is complete, since ring layout is different from 3.0 to 3.1.
	 */
	rdsv3_ib_send_init_ring(ic);
	rdsv3_ib_recv_init_ring(ic);
	/*
	 * Post receive buffers - as a side effect, this will update
	 * the posted credit count.
	 */
	(void) rdsv3_ib_recv_refill(conn, KM_NOSLEEP, 1);

	/* Tune RNR behavior */
	rdsv3_ib_tune_rnr(ic, &qp_attr);

	/* Transition the QP to Ready-To-Send; failure is only logged. */
	qp_attr.qp_state = IB_QPS_RTS;
	err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
	if (err)
		RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
		    "ib_modify_qp(IB_QP_STATE, RTS): err=%d", err);

	/* update ib_device with this local ipaddr & conn */
	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client);
	err = rdsv3_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
	if (err)
		RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
		    "rdsv3_ib_update_ipaddr failed (%d)", err);
	rdsv3_ib_add_conn(rds_ibdev, conn);

	/*
	 * If the peer gave us the last packet it saw, process this as if
	 * we had received a regular ACK.
	 */
	if (dp && dp->dp_ack_seq)
		rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL);

	rdsv3_connect_complete(conn);

	RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
	    "Return conn: %p event: %p",
	    conn, event);
}

/*
 * Build the rdma_conn_param for an outgoing connect or accept.
 * conn_param is zeroed and filled in; when dp is non-NULL it is also
 * zeroed and populated as the RDS private data blob (addresses,
 * protocol version, supported-minor mask, piggybacked ACK and,
 * when flow control is on, our current posted-credit count).
 */
static void
rdsv3_ib_cm_fill_conn_param(struct rdsv3_connection *conn,
    struct rdma_conn_param *conn_param,
    struct rdsv3_ib_connect_private *dp,
    uint32_t protocol_version)
{
	RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param",
	    "Enter conn: %p conn_param: %p private: %p version: %d",
	    conn, conn_param, dp, protocol_version);

	(void) memset(conn_param, 0, sizeof (struct rdma_conn_param));
	/* XXX tune these?
 */
	conn_param->responder_resources = 1;
	conn_param->initiator_depth = 1;
	/* retry_count is capped at 7, the largest value the field holds */
	conn_param->retry_count = min(rdsv3_ib_retry_count, 7);
	conn_param->rnr_retry_count = 7;

	if (dp) {
		struct rdsv3_ib_connection *ic = conn->c_transport_data;

		(void) memset(dp, 0, sizeof (*dp));
		dp->dp_saddr = conn->c_laddr;
		dp->dp_daddr = conn->c_faddr;
		dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
		dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
		dp->dp_protocol_minor_mask =
		    htons(RDSV3_IB_SUPPORTED_PROTOCOLS);
		dp->dp_ack_seq = rdsv3_ib_piggyb_ack(ic);

		/* Advertise flow control */
		if (ic->i_flowctl) {
			unsigned int credits;

			/*
			 * Take the currently posted credits out of
			 * i_credits and advertise them to the peer.
			 */
			credits = IB_GET_POST_CREDITS(
			    atomic_get(&ic->i_credits));
			dp->dp_credit = htonl(credits);
			atomic_add_32(&ic->i_credits,
			    -IB_SET_POST_CREDITS(credits));
		}

		conn_param->private_data = dp;
		conn_param->private_data_len = sizeof (*dp);
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param",
	    "Return conn: %p conn_param: %p private: %p version: %d",
	    conn, conn_param, dp, protocol_version);
}

/*
 * CQ async-event callback; events are only logged, never acted upon.
 */
static void
rdsv3_ib_cq_event_handler(struct ib_event *event, void *data)
{
	RDSV3_DPRINTF3("rdsv3_ib_cq_event_handler", "event %u data %p",
	    event->event, data);
}

/*
 * QP async-event callback.  COMM_EST is forwarded to the CM via
 * rdma_notify(); any other event is treated as fatal and drops the
 * connection so that the reconnect machinery takes over.
 */
static void
rdsv3_ib_qp_event_handler(struct ib_event *event, void *data)
{
	struct rdsv3_connection *conn = data;
	/*
	 * NOTE(review): conn is dereferenced here although the default
	 * branch below guards against conn == NULL - confirm the caller
	 * always supplies a non-NULL context.
	 */
	struct rdsv3_ib_connection *ic = conn->c_transport_data;

	RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "conn %p ic %p event %u",
	    conn, ic, event->event);

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		(void) rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
		break;
	default:
		if (conn) {
			RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler",
			    "RDS/IB: Fatal QP Event %u - "
			    "connection %u.%u.%u.%u ->%u.%u.%u.%u "
			    "...reconnecting",
			    event->event, NIPQUAD(conn->c_laddr),
			    NIPQUAD(conn->c_faddr));
			rdsv3_conn_drop(conn);
		} else {
			RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler",
			    "RDS/IB: Fatal QP Event %u - connection"
			    "...reconnecting", event->event);
		}
		break;
	}

	RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "Return conn: %p event: %p",
	    conn, event);
}

extern int rdsv3_ib_alloc_hdrs(ib_device_t *dev,
    struct rdsv3_ib_connection *ic);
extern void rdsv3_ib_free_hdrs(ib_device_t *dev,
    struct rdsv3_ib_connection *ic);

/*
 * This needs to be very careful to not leave IS_ERR pointers around for
 * cleanup to trip over.
 *
 * Allocates the CQs, QP, headers and work-request arrays for a
 * connection.  On error it returns a negative errno; resources already
 * created are left for rdsv3_ib_conn_shutdown() to reclaim.
 */
static int
rdsv3_ib_setup_qp(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct ib_device *dev = ic->i_cm_id->device;
	struct ib_qp_init_attr attr;
	struct rdsv3_ib_device *rds_ibdev;
	ibt_send_wr_t *wrp;
	ibt_wr_ds_t *sgl;
	int ret, i;

	RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "Enter conn: %p", conn);

	/*
	 * rdsv3_ib_add_one creates a rdsv3_ib_device object per IB device,
	 * and allocates a protection domain, memory range and FMR pool
	 * for each. If that fails for any reason, it will not register
	 * the rds_ibdev at all.
 */
	rds_ibdev = ib_get_client_data(dev, &rdsv3_ib_client);
	if (rds_ibdev == NULL) {
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "RDS/IB: No client_data for device %s", dev->name);
		return (-EOPNOTSUPP);
	}
	ic->rds_ibdev = rds_ibdev;

	/* Shrink the rings if the HCA supports fewer work requests. */
	if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
		rdsv3_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
	if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
		rdsv3_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);

	/* Protection domain and memory range */
	ic->i_pd = rds_ibdev->pd;

	/*
	 * IB_CQ_VECTOR_LEAST_ATTACHED and/or the corresponding feature is
	 * not implemented in Hermon yet, but we can pass it to ib_create_cq()
	 * anyway.
	 */
	ic->i_send_cq = ib_create_cq(dev, rdsv3_ib_send_cq_comp_handler,
	    rdsv3_ib_cq_event_handler, conn,
	    ic->i_send_ring.w_nr + 1,
	    IB_CQ_VECTOR_LEAST_ATTACHED);
	if (IS_ERR(ic->i_send_cq)) {
		ret = PTR_ERR(ic->i_send_cq);
		/* NULL the field so cleanup never sees an IS_ERR pointer */
		ic->i_send_cq = NULL;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "ib_create_cq send failed: %d", ret);
		goto out;
	}

	/*
	 * IB_CQ_VECTOR_LEAST_ATTACHED and/or the corresponding feature is
	 * not implemented in Hermon yet, but we can pass it to ib_create_cq()
	 * anyway.
	 */
	ic->i_recv_cq = ib_create_cq(dev, rdsv3_ib_recv_cq_comp_handler,
	    rdsv3_ib_cq_event_handler, conn,
	    ic->i_recv_ring.w_nr,
	    IB_CQ_VECTOR_LEAST_ATTACHED);
	if (IS_ERR(ic->i_recv_cq)) {
		ret = PTR_ERR(ic->i_recv_cq);
		ic->i_recv_cq = NULL;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "ib_create_cq recv failed: %d", ret);
		goto out;
	}

	/* Arm the send CQ for all completions ... */
	ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "ib_req_notify_cq send failed: %d", ret);
		goto out;
	}

	/* ... and the recv CQ for solicited completions only. */
	ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "ib_req_notify_cq recv failed: %d", ret);
		goto out;
	}

	/* XXX negotiate max send/recv with remote? */
	(void) memset(&attr, 0, sizeof (attr));
	attr.event_handler = rdsv3_ib_qp_event_handler;
	attr.qp_context = conn;
	/* + 1 to allow for the single ack message */
	attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
	attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
	attr.cap.max_send_sge = rds_ibdev->max_sge;
	attr.cap.max_recv_sge = RDSV3_IB_RECV_SGE;
	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	attr.qp_type = IB_QPT_RC;
	attr.send_cq = ic->i_send_cq;
	attr.recv_cq = ic->i_recv_cq;

	/*
	 * XXX this can fail if max_*_wr is too large? Are we supposed
	 * to back off until we get a value that the hardware can support?
	 */
	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "rdma_create_qp failed: %d", ret);
		goto out;
	}

	ret = rdsv3_ib_alloc_hdrs(dev, ic);
	if (ret != 0) {
		/*
		 * NOTE(review): the original error code from
		 * rdsv3_ib_alloc_hdrs is replaced with -ENOMEM here.
		 */
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "rdsv3_ib_alloc_hdrs failed: %d", ret);
		goto out;
	}

	ic->i_sends = kmem_alloc(ic->i_send_ring.w_nr *
	    sizeof (struct rdsv3_ib_send_work), KM_NOSLEEP);
	if (ic->i_sends == NULL) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "send allocation failed: %d", ret);
		goto out;
	}
	(void) memset(ic->i_sends, 0, ic->i_send_ring.w_nr *
	    sizeof (struct rdsv3_ib_send_work));

	/*
	 * One contiguous allocation holds the send WRs followed by all
	 * their SGL entries; each WR is pointed at its slice below.
	 */
	ic->i_send_wrs =
	    kmem_alloc(RDSV3_IB_SEND_WRS * (sizeof (ibt_send_wr_t) +
	    RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)), KM_NOSLEEP);
	if (ic->i_send_wrs == NULL) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "Send WR allocation failed: %d", ret);
		goto out;
	}
	sgl = (ibt_wr_ds_t *)((uint8_t *)ic->i_send_wrs +
	    (RDSV3_IB_SEND_WRS * sizeof (ibt_send_wr_t)));
	for (i = 0; i < RDSV3_IB_SEND_WRS; i++) {
		wrp = &ic->i_send_wrs[i];
		wrp->wr_sgl = &sgl[i * RDSV3_IB_MAX_SGE];
	}

	ic->i_recvs = kmem_alloc(ic->i_recv_ring.w_nr *
	    sizeof (struct rdsv3_ib_recv_work), KM_NOSLEEP);
	if (ic->i_recvs == NULL) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "recv allocation failed: %d", ret);
		goto out;
	}
	(void) memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr *
	    sizeof (struct rdsv3_ib_recv_work));

	ic->i_recv_wrs =
	    kmem_alloc(ic->i_recv_ring.w_nr * sizeof (ibt_recv_wr_t),
	    KM_NOSLEEP);
	if (ic->i_recv_wrs == NULL) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "Recv WR allocation failed: %d", ret);
		goto out;
	}

	rdsv3_ib_recv_init_ack(ic);

	RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "conn %p pd %p mr %p cq %p %p",
	    conn, ic->i_pd, ic->i_mr, ic->i_send_cq, ic->i_recv_cq);

out:
	/*
	 * Partially built resources are intentionally left in place;
	 * rdsv3_ib_conn_shutdown() only tears down what was created.
	 */
	return (ret);
}

/*
 * Inspect the private data of an incoming connection request and
 * return the protocol version to use, or 0 to reject the request.
 */
static uint32_t
rdsv3_ib_protocol_compatible(struct rdma_cm_event *event)
{
	const struct rdsv3_ib_connect_private *dp =
	    event->param.conn.private_data;
	uint16_t common;
	uint32_t version = 0;

	RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Enter event: %p",
	    event);

	/*
	 * rdma_cm private data is odd - when there is any private data in the
	 * request, we will be given a pretty large buffer without telling us
	 * the
	 * original size. The only way to tell the difference is by looking at
	 * the contents, which are initialized to zero.
	 * If the protocol version fields aren't set,
	 * this is a connection attempt
	 * from an older version. This could be 3.0 or 2.0 -
	 * we can't tell.
	 * We really should have changed this for OFED 1.3 :-(
	 */

	/* Be paranoid. RDS always has privdata */
	if (!event->param.conn.private_data_len) {
		RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible",
		    "RDS incoming connection has no private data, rejecting");
		return (0);
	}

	/* Even if len is crap *now* I still want to check it.
-ASG */
	if (event->param.conn.private_data_len < sizeof (*dp) ||
	    dp->dp_protocol_major == 0)
		return (RDS_PROTOCOL_3_0);

	/* Intersect the peer's minor-version mask with ours. */
	common = ntohs(dp->dp_protocol_minor_mask) &
	    RDSV3_IB_SUPPORTED_PROTOCOLS;
	if (dp->dp_protocol_major == 3 && common) {
		version = RDS_PROTOCOL_3_0;
		/* pick the highest minor version both sides support */
		while ((common >>= 1) != 0)
			version++;
	} else {
		RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible",
		    "RDS: Connection from %u.%u.%u.%u using "
		    "incompatible protocol version %u.%u\n",
		    NIPQUAD(dp->dp_saddr),
		    dp->dp_protocol_major,
		    dp->dp_protocol_minor);
	}

	RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Return event: %p",
	    event);

	return (version);
}

/*
 * Handle an incoming CM connection request.  Returns 0 when the
 * request was accepted and we keep the cm_id, non-zero (destroy)
 * when the caller should destroy the cm_id.
 */
int
rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
    struct rdma_cm_event *event)
{
	uint64_be_t lguid = cm_id->route.path_rec->sgid.global.interface_id;
	uint64_be_t fguid = cm_id->route.path_rec->dgid.global.interface_id;
	const struct rdsv3_ib_connect_private *dp =
	    event->param.conn.private_data;
	struct rdsv3_ib_connect_private dp_rep;
	struct rdsv3_connection *conn = NULL;
	struct rdsv3_ib_connection *ic = NULL;
	struct rdma_conn_param conn_param;
	uint32_t version;
	int err, destroy = 1;
	/* NOTE(review): conn_created appears unused in this function */
	boolean_t conn_created = B_FALSE;

	RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
	    "Enter cm_id: %p event: %p", cm_id, event);

	/* Check whether the remote protocol version matches ours. */
	version = rdsv3_ib_protocol_compatible(event);
	if (!version) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "version mismatch");
		goto out;
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
	    "saddr %u.%u.%u.%u daddr %u.%u.%u.%u RDSv%d.%d lguid 0x%llx fguid "
	    "0x%llx", NIPQUAD(dp->dp_saddr), NIPQUAD(dp->dp_daddr),
	    RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
	    (unsigned long long)ntohll(lguid),
	    (unsigned long long)ntohll(fguid));

	/* note the swap: the peer's source is our destination */
	conn = rdsv3_conn_create(dp->dp_daddr, dp->dp_saddr,
	    &rdsv3_ib_transport, KM_NOSLEEP);
	if (IS_ERR(conn)) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "rdsv3_conn_create failed (%ld)", PTR_ERR(conn));
		conn = NULL;
		goto out;
	}

	/*
	 * The connection request may occur while the
	 * previous connection exist, e.g. in case of failover.
	 * But as connections may be initiated simultaneously
	 * by both hosts, we have a random backoff mechanism -
	 * see the comment above rdsv3_queue_reconnect()
	 */
	mutex_enter(&conn->c_cm_lock);
	if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN,
	    RDSV3_CONN_CONNECTING)) {
		if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
			/* stale incoming connect: drop and restart */
			RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
			    "incoming connect when connected: %p",
			    conn);
			rdsv3_conn_drop(conn);
			rdsv3_ib_stats_inc(s_ib_listen_closed_stale);
			mutex_exit(&conn->c_cm_lock);
			goto out;
		} else if (rdsv3_conn_state(conn) == RDSV3_CONN_CONNECTING) {
			/* Wait and see - our connect may still be succeeding */
			RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
			    "peer-to-peer connection request: %p, "
			    "lguid: 0x%llx fguid: 0x%llx",
			    conn, lguid, fguid);
			rdsv3_ib_stats_inc(s_ib_connect_raced);
		}
		mutex_exit(&conn->c_cm_lock);
		goto out;
	}

	ic = conn->c_transport_data;

	rdsv3_ib_set_protocol(conn, version);
	rdsv3_ib_set_flow_control(conn, ntohl(dp->dp_credit));

	/*
	 * If the
peer gave us the last packet it saw, process this as if
	 * we had received a regular ACK.
	 */
	if (dp->dp_ack_seq)
		rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL);

	ASSERT(!cm_id->context);
	ASSERT(!ic->i_cm_id);

	/* belt-and-suspenders check even on non-DEBUG kernels */
	if (ic->i_cm_id != NULL)
		RDSV3_PANIC();

	ic->i_cm_id = cm_id;
	cm_id->context = conn;

	/*
	 * We got halfway through setting up the ib_connection, if we
	 * fail now, we have to take the long route out of this mess.
	 */
	destroy = 0;

	err = rdsv3_ib_setup_qp(conn);
	if (err) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "rdsv3_ib_setup_qp failed (%d)", err);
		mutex_exit(&conn->c_cm_lock);
		rdsv3_conn_drop(conn);
		goto out;
	}

	rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);

	/* rdma_accept() calls rdma_reject() internally if it fails */
	err = rdma_accept(cm_id, &conn_param);
	mutex_exit(&conn->c_cm_lock);
	if (err) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "rdma_accept failed (%d)", err);
		rdsv3_conn_drop(conn);
		goto out;
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
	    "Return cm_id: %p event: %p", cm_id, event);

	return (0);

out:
	(void) rdma_reject(cm_id, NULL, 0);
	return (destroy);
}


/*
 * Called on RDMA_CM_EVENT_ROUTE_RESOLVED for an outgoing connection:
 * set up the QP and issue the actual rdma_connect().  Returns non-zero
 * only when the caller (rdma_cm) should destroy the cm_id.
 */
int
rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
{
	struct rdsv3_connection *conn = cm_id->context;
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdma_conn_param conn_param;
	struct rdsv3_ib_connect_private dp;
	int ret;

	RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", "Enter: cm_id: %p",
	    cm_id);

	/*
	 * If the peer doesn't do protocol negotiation, we must
	 * default to RDSv3.0
	 */
	rdsv3_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
	ic->i_flowctl =
	    rdsv3_ib_sysctl_flow_control;	/* advertise flow control */

	ret = rdsv3_ib_setup_qp(conn);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
		    "rdsv3_ib_setup_qp failed (%d)", ret);
		rdsv3_conn_drop(conn);
		goto out;
	}

	(void) rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp,
	    RDS_PROTOCOL_VERSION);

	ret = rdma_connect(cm_id, &conn_param);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
		    "rdma_connect failed (%d)", ret);
		rdsv3_conn_drop(conn);
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
	    "Return: cm_id: %p", cm_id);

out:
	/*
	 * Beware - returning non-zero tells the rdma_cm to destroy
	 * the cm_id. We should certainly not do it as long as we still
	 * "own" the cm_id.
	 */
	if (ret) {
		/* still our cm_id: suppress destruction by rdma_cm */
		if (ic->i_cm_id == cm_id)
			ret = 0;
	}
	return (ret);
}

/*
 * Kick off an outgoing connection: create the cm_id and start
 * asynchronous address resolution; the rest of the handshake is driven
 * by rdsv3_rdma_cm_event_handler callbacks.
 */
int
rdsv3_ib_conn_connect(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct sockaddr_in src, dest;
	ipaddr_t laddr, faddr;
	int ret;

	RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Enter: conn: %p", conn);

	/*
	 * XXX I wonder what affect the port space has
	 */
	/* delegate cm event handler to rdma_transport */
	ic->i_cm_id = rdma_create_id(rdsv3_rdma_cm_event_handler, conn,
	    RDMA_PS_TCP);
	if (IS_ERR(ic->i_cm_id)) {
		ret = PTR_ERR(ic->i_cm_id);
		ic->i_cm_id = NULL;
		RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
		    "rdma_create_id() failed: %d", ret);
		goto out;
	}

	RDSV3_DPRINTF3("rdsv3_ib_conn_connect",
	    "created cm id %p for conn %p", ic->i_cm_id, conn);

	/* The ipaddr should be in the network order */
	laddr = conn->c_laddr;
	faddr = conn->c_faddr;
	/* lookup failure is logged but not fatal - original addrs remain */
	ret = rdsv3_sc_path_lookup(&laddr, &faddr);
	if (ret == 0) {
		RDSV3_DPRINTF2(LABEL, "Path not found (0x%x 0x%x)",
		    ntohl(laddr), ntohl(faddr));
	}

	/*
	 * NOTE(review): src/dest are not memset to zero before use;
	 * confirm rdma_resolve_addr() ignores the remaining fields.
	 */
	src.sin_family = AF_INET;
	src.sin_addr.s_addr = (uint32_t)laddr;
	src.sin_port = (uint16_t)htons(0);

	dest.sin_family = AF_INET;
	dest.sin_addr.s_addr = (uint32_t)faddr;
	dest.sin_port = (uint16_t)htons(RDSV3_PORT);

	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
	    (struct sockaddr *)&dest,
	    RDSV3_RDMA_RESOLVE_TIMEOUT_MS);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
		    "addr resolve failed for cm id %p: %d", ic->i_cm_id, ret);
		rdma_destroy_id(ic->i_cm_id);
		ic->i_cm_id = NULL;
	}

	RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Return: conn: %p", conn);

out:
	return (ret);
}

/*
 * This is so careful about only cleaning up resources that were built up
 * so that it can be called at any point during startup. In fact it
 * can be called multiple times for a given connection.
 */
void
rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	int err = 0;

	RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
	    "cm %p pd %p cq %p %p qp %p", ic->i_cm_id,
	    ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
	    ic->i_cm_id ? ic->i_cm_id->qp : NULL);

	if (ic->i_cm_id) {
		struct ib_device *dev = ic->i_cm_id->device;

		RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
		    "disconnecting cm %p", ic->i_cm_id);
		err = rdma_disconnect(ic->i_cm_id);
		if (err) {
			/*
			 * Actually this may happen quite frequently, when
			 * an outgoing connect raced with an incoming connect.
			 */
			RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
			    "failed to disconnect, cm: %p err %d",
			    ic->i_cm_id, err);
		}

		if (ic->i_cm_id->qp) {
			(void) ibt_flush_qp(
			    ib_get_ibt_channel_hdl(ic->i_cm_id));

			/* wait until all WRs are flushed */
			rdsv3_wait_event(&rdsv3_ib_ring_empty_wait,
			    rdsv3_ib_ring_empty(&ic->i_send_ring) &&
			    rdsv3_ib_ring_empty(&ic->i_recv_ring));

			rdma_destroy_qp(ic->i_cm_id);
		}


		/* free headers only if they were allocated (i_mr set) */
		if (ic->i_mr)
			rdsv3_ib_free_hdrs(dev, ic);

		if (ic->i_sends)
			rdsv3_ib_send_clear_ring(ic);
		if (ic->i_recvs)
			rdsv3_ib_recv_clear_ring(ic);

		if (ic->i_send_cq)
			(void) ib_destroy_cq(ic->i_send_cq);
		if (ic->i_recv_cq)
			(void) ib_destroy_cq(ic->i_recv_cq);
		rdma_destroy_id(ic->i_cm_id);

		/*
		 * Move connection back to the nodev list.
		 */
		if (ic->i_on_dev_list)
			rdsv3_ib_remove_conn(ic->rds_ibdev, conn);

		/* reset all per-connection resource pointers */
		ic->i_cm_id = NULL;
		ic->i_pd = NULL;
		ic->i_mr = NULL;
		ic->i_send_cq = NULL;
		ic->i_recv_cq = NULL;
		ic->i_send_hdrs = NULL;
		ic->i_recv_hdrs = NULL;
		ic->i_ack = NULL;
	}

	ASSERT(!ic->i_on_dev_list);

	/* Clear pending transmit */
	if (ic->i_rm) {
		rdsv3_message_put(ic->i_rm);
		ic->i_rm = NULL;
	}

	/* Clear the ACK state */
	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
	ic->i_ack_next = 0;
	ic->i_ack_recv = 0;

	/* Clear flow control state */
	ic->i_flowctl = 0;
	ic->i_credits = 0;

	/* reset the rings so a reconnect starts from a clean state */
	rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
	rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);

	if (ic->i_ibinc) {
		rdsv3_inc_put(&ic->i_ibinc->ii_inc);
		ic->i_ibinc = NULL;
	}

	if (ic->i_sends) {
		kmem_free(ic->i_sends,
		    ic->i_send_ring.w_nr * sizeof (struct rdsv3_ib_send_work));
		ic->i_sends = NULL;
	}
	if (ic->i_send_wrs) {
		kmem_free(ic->i_send_wrs, RDSV3_IB_SEND_WRS *
		    (sizeof (ibt_send_wr_t) +
		    RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)));
		ic->i_send_wrs = NULL;
	}
	if (ic->i_recvs) {
		kmem_free(ic->i_recvs,
		    ic->i_recv_ring.w_nr * sizeof (struct rdsv3_ib_recv_work));
		ic->i_recvs = NULL;
	}

	RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", "Return conn: %p", conn);
}

/*
 * the connection can be allocated from either rdsv3_conn_create_outgoing()
 * or rdsv3_conn_create(), so ddi_taskq_create() can be called with the
 * same string. This can print the kstat warning on the console. To prevent
 * it, this counter value is used.
 * Note that requests from rdsv3_conn_create_outgoing() refers to the cached
 * value with the mutex lock before it allocates the connection, so that
 * the warning cannot be produced in the case. (only between
 * rdsv3_conn_create() and rdsv3_conn_create_outgoing().
 */
static int conn_cnt;

/*
 * Allocate and initialize the IB transport data for a new connection.
 * Returns 0 on success, -ENOMEM when the zalloc fails.
 */
/* ARGSUSED */
int
rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp)
{
	struct rdsv3_ib_connection *ic;
	char tq_name[TASKQ_NAMELEN];

	RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn: %p", conn);

	/* XXX too lazy? */
	ic = kmem_zalloc(sizeof (struct rdsv3_ib_connection), gfp);
	if (ic == NULL)
		return (-ENOMEM);

	list_link_init(&ic->ib_node);
	/* conn_cnt keeps repeated taskq names unique (see comment above) */
	(void) snprintf(tq_name, TASKQ_NAMELEN, "RDSV3_CONN_to_%x:%u",
	    htonl(conn->c_faddr), conn_cnt++ % 100);
	ic->i_recv_tasklet =
	    ddi_taskq_create(NULL, tq_name, 1, TASKQ_DEFAULTPRI, 0);


	mutex_init(&ic->i_recv_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ic->i_ack_lock, NULL, MUTEX_DRIVER, NULL);

	/*
	 * rdsv3_ib_conn_shutdown() waits for these to be emptied so they
	 * must be initialized before it can be called.
	 */
	rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
	rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);

	/* link the transport data and the generic connection together */
	ic->conn = conn;
	conn->c_transport_data = ic;

	/* new connections start on the nodev list until a device binds */
	mutex_enter(&ib_nodev_conns_lock);
	list_insert_tail(&ib_nodev_conns, ic);
	mutex_exit(&ib_nodev_conns_lock);


	RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn %p conn ic %p",
	    conn, conn->c_transport_data);
	return (0);
}

/*
 * Free a connection. Connection must be shut down and not set for reconnect.
 */
void
rdsv3_ib_conn_free(void *arg)
{
	struct rdsv3_ib_connection *ic = arg;
	kmutex_t *lock_ptr;

	RDSV3_DPRINTF2("rdsv3_ib_conn_free", "ic %p\n", ic);

#ifndef __lock_lint
	/*
	 * Conn is either on a dev's list or on the nodev list.
	 * A race with shutdown() or connect() would cause problems
	 * (since rds_ibdev would change) but that should never happen.
	 */
	lock_ptr = ic->i_on_dev_list ?
	    &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;

	mutex_enter(lock_ptr);
	list_remove_node(&ic->ib_node);
	mutex_exit(lock_ptr);
#endif

	ddi_taskq_destroy(ic->i_recv_tasklet);
	kmem_free(ic, sizeof (*ic));
}

/*
 * An error occurred on the connection
 */
void
__rdsv3_ib_conn_error(struct rdsv3_connection *conn)
{
	rdsv3_conn_drop(conn);
}