1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 2006 Oracle. All rights reserved. 27 * 28 * This software is available to you under a choice of one of two 29 * licenses. You may choose to be licensed under the terms of the GNU 30 * General Public License (GPL) Version 2, available from the file 31 * COPYING in the main directory of this source tree, or the 32 * OpenIB.org BSD license below: 33 * 34 * Redistribution and use in source and binary forms, with or 35 * without modification, are permitted provided that the following 36 * conditions are met: 37 * 38 * - Redistributions of source code must retain the above 39 * copyright notice, this list of conditions and the following 40 * disclaimer. 41 * 42 * - Redistributions in binary form must reproduce the above 43 * copyright notice, this list of conditions and the following 44 * disclaimer in the documentation and/or other materials 45 * provided with the distribution. 46 * 47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 54 * SOFTWARE. 55 * 56 */ 57 #include <sys/types.h> 58 #include <sys/stat.h> 59 #include <sys/conf.h> 60 #include <sys/ddi.h> 61 #include <sys/sunddi.h> 62 #include <sys/modctl.h> 63 #include <sys/rds.h> 64 #include <sys/stropts.h> 65 #include <sys/socket.h> 66 #include <sys/socketvar.h> 67 #include <sys/sockio.h> 68 #include <sys/sysmacros.h> 69 70 #include <inet/ip.h> 71 #include <net/if_types.h> 72 73 #include <sys/ib/clients/rdsv3/rdsv3.h> 74 #include <sys/ib/clients/rdsv3/rdma.h> 75 #include <sys/ib/clients/rdsv3/rdma_transport.h> 76 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 77 78 extern void rdsv3_remove_bound(struct rdsv3_sock *rds); 79 extern int rdsv3_verify_bind_address(ipaddr_t addr); 80 81 extern ddi_taskq_t *rdsv3_taskq; 82 extern struct rdma_cm_id *rdsv3_rdma_listen_id; 83 84 /* this is just used for stats gathering :/ */ 85 kmutex_t rdsv3_sock_lock; 86 static unsigned long rdsv3_sock_count; 87 list_t rdsv3_sock_list; 88 89 /* 90 * This is called as the final descriptor referencing this socket is closed. 91 * We have to unbind the socket so that another socket can be bound to the 92 * address it was using. 93 * 94 * We have to be careful about racing with the incoming path. sock_orphan() 95 * sets SOCK_DEAD and we use that as an indicator to the rx path that new 96 * messages shouldn't be queued. 97 */ 98 /* ARGSUSED */ 99 static int 100 rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr) 101 { 102 struct rsock *sk = (struct rsock *)proto_handle; 103 struct rdsv3_sock *rs; 104 105 if (!sk) 106 goto out; 107 108 rs = rdsv3_sk_to_rs(sk); 109 RDSV3_DPRINTF4("rdsv3_release", "Enter(rs: %p, sk: %p)", rs, sk); 110 111 rdsv3_sk_sock_orphan(sk); 112 rdsv3_cong_remove_socket(rs); 113 rdsv3_remove_bound(rs); 114 115 /* 116 * Note - rdsv3_clear_recv_queue grabs rs_recv_lock, so 117 * that ensures the recv path has completed messing 118 * with the socket. 119 * 120 * Note2 - rdsv3_clear_recv_queue(rs) should be called first 121 * to prevent some race conditions, which is different from 122 * the Linux code. 123 */ 124 rdsv3_clear_recv_queue(rs); 125 rdsv3_send_drop_to(rs, NULL); 126 rdsv3_rdma_drop_keys(rs); 127 (void) rdsv3_notify_queue_get(rs, NULL); 128 129 mutex_enter(&rdsv3_sock_lock); 130 list_remove_node(&rs->rs_item); 131 rdsv3_sock_count--; 132 mutex_exit(&rdsv3_sock_lock); 133 134 while (sk->sk_refcount > 1) { 135 /* wait for 1 sec and try again */ 136 delay(drv_usectohz(1000000)); 137 } 138 139 /* this will free the rs and sk */ 140 rdsv3_sk_sock_put(sk); 141 142 RDSV3_DPRINTF4("rdsv3_release", "Return (rds: %p)", rs); 143 out: 144 return (0); 145 } 146 147 void 148 __rdsv3_wake_sk_sleep(struct rsock *sk) 149 { 150 /* wakup anyone waiting in recvmsg */ 151 if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD) && sk->sk_sleep) 152 rdsv3_wake_up(sk->sk_sleep); 153 } 154 155 /* 156 * Careful not to race with rdsv3_release -> sock_orphan which clears sk_sleep. 157 * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK 158 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but 159 * this seems more conservative. 160 * NB - normally, one would use sk_callback_lock for this, but we can 161 * get here from interrupts, whereas the network code grabs sk_callback_lock 162 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. 163 */ 164 void 165 rdsv3_wake_sk_sleep(struct rdsv3_sock *rs) 166 { 167 RDSV3_DPRINTF4("rdsv3_wake_sk_sleep", "Enter(rs: %p)", rs); 168 169 rw_enter(&rs->rs_recv_lock, RW_READER); 170 __rdsv3_wake_sk_sleep(rdsv3_rs_to_sk(rs)); 171 rw_exit(&rs->rs_recv_lock); 172 } 173 174 /*ARGSUSED*/ 175 static int 176 rdsv3_getname(sock_lower_handle_t proto_handle, struct sockaddr *addr, 177 socklen_t *addr_len, cred_t *cr) 178 { 179 struct rsock *sk = (struct rsock *)proto_handle; 180 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 181 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 182 183 RDSV3_DPRINTF4("rdsv3_getname", "Enter(rs: %p, port: %d)", rs, 184 rs->rs_bound_port); 185 186 sin->sin_port = rs->rs_bound_port; 187 sin->sin_addr.s_addr = rs->rs_bound_addr; 188 189 sin->sin_family = AF_INET_OFFLOAD; 190 191 *addr_len = sizeof (*sin); 192 return (0); 193 } 194 195 /* 196 * RDS' poll is without a doubt the least intuitive part of the interface, 197 * as POLLIN and POLLOUT do not behave entirely as you would expect from 198 * a network protocol. 199 * 200 * POLLIN is asserted if 201 * - there is data on the receive queue. 202 * - to signal that a previously congested destination may have become 203 * uncongested 204 * - A notification has been queued to the socket (this can be a congestion 205 * update, or a RDMA completion). 206 * 207 * POLLOUT is asserted if there is room on the send queue. This does not mean 208 * however, that the next sendmsg() call will succeed. If the application tries 209 * to send to a congested destination, the system call may still fail (and 210 * return ENOBUFS). 211 */ 212 /* ARGSUSED */ 213 static short 214 rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet, 215 cred_t *cr) 216 { 217 struct rsock *sk = (struct rsock *)proto_handle; 218 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 219 unsigned short mask = 0; 220 221 #if 0 222 RDSV3_DPRINTF4("rdsv3_poll", "enter(%p %x %d)", rs, events, anyyet); 223 #endif 224 225 /* 226 * If rs_seen_congestion is on, wait until it's off. 227 * This is implemented for the following OFED code. 228 * if (rs->rs_seen_congestion) 229 * poll_wait(file, &rds_poll_waitq, wait); 230 */ 231 mutex_enter(&rs->rs_congested_lock); 232 while (rs->rs_seen_congestion) { 233 cv_wait(&rs->rs_congested_cv, 234 &rs->rs_congested_lock); 235 } 236 mutex_exit(&rs->rs_congested_lock); 237 238 rw_enter(&rs->rs_recv_lock, RW_READER); 239 if (!rs->rs_cong_monitor) { 240 /* 241 * When a congestion map was updated, we signal POLLIN for 242 * "historical" reasons. Applications can also poll for 243 * WRBAND instead. 244 */ 245 if (rdsv3_cong_updated_since(&rs->rs_cong_track)) 246 mask |= (POLLIN | POLLRDNORM | POLLWRBAND); 247 } else { 248 mutex_enter(&rs->rs_lock); 249 if (rs->rs_cong_notify) 250 mask |= (POLLIN | POLLRDNORM); 251 mutex_exit(&rs->rs_lock); 252 } 253 if (!list_is_empty(&rs->rs_recv_queue) || 254 !list_is_empty(&rs->rs_notify_queue)) 255 mask |= (POLLIN | POLLRDNORM); 256 if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs)) 257 mask |= (POLLOUT | POLLWRNORM); 258 259 /* clear state any time we wake a seen-congested socket */ 260 if (mask) { 261 mutex_enter(&rs->rs_congested_lock); 262 rs->rs_seen_congestion = 0; 263 mutex_exit(&rs->rs_congested_lock); 264 } 265 266 rw_exit(&rs->rs_recv_lock); 267 268 #if 0 269 RDSV3_DPRINTF4("rdsv3_poll", "return(%p %x)", rs, mask); 270 #endif 271 272 return (mask); 273 } 274 275 /* ARGSUSED */ 276 static int 277 rdsv3_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 278 int mode, int32_t *rvalp, cred_t *cr) 279 { 280 ksocket_t so4; 281 struct lifconf lifc; 282 struct lifreq lifr, *lifrp; 283 struct ifconf ifc; 284 struct ifreq ifr; 285 int rval = 0, rc, len; 286 int numifs; 287 int bufsize; 288 void *buf; 289 290 RDSV3_DPRINTF4("rdsv3_ioctl", "enter: cmd: %d", cmd); 291 292 /* Only ipv4 for now */ 293 rval = ksocket_socket(&so4, PF_INET, SOCK_DGRAM, 0, KSOCKET_NOSLEEP, 294 CRED()); 295 if (rval != 0) { 296 RDSV3_DPRINTF2("rdsv3_ioctl", "ksocket_socket returned %d", 297 rval); 298 return (rval); 299 } 300 301 switch (cmd) { 302 case SIOCGLIFNUM : 303 case SIOCGIFNUM : 304 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs); 305 if (rval != 0) break; 306 if (cmd == SIOCGLIFNUM) { 307 struct lifnum lifn; 308 lifn.lifn_family = AF_INET_OFFLOAD; 309 lifn.lifn_flags = 0; 310 lifn.lifn_count = numifs; 311 (void) ddi_copyout(&lifn, (void *)arg, 312 sizeof (struct lifnum), 0); 313 } else { 314 len = 0; 315 for (lifrp = (struct lifreq *)buf, rc = 0; rc < numifs; 316 rc++, lifrp++) { 317 if (strlen(lifrp->lifr_name) <= IFNAMSIZ) { 318 len++; 319 } 320 } 321 (void) ddi_copyout(&len, (void *)arg, 322 sizeof (int), 0); 323 } 324 kmem_free(buf, bufsize); 325 break; 326 327 case SIOCGLIFCONF : 328 if (ddi_copyin((void *)arg, &lifc, sizeof (struct lifconf), 0) 329 != 0) { 330 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifc"); 331 rval = EFAULT; 332 break; 333 } 334 335 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs); 336 if (rval != 0) { 337 RDSV3_DPRINTF2("rdsv3_ioctl", 338 "rdsv3_do_ip_ioctl failed: %d", rval); 339 break; 340 } 341 342 if ((lifc.lifc_len > 0) && (numifs > 0)) { 343 if (ddi_copyout(buf, (void *)lifc.lifc_req, 344 (lifc.lifc_len < bufsize) ? lifc.lifc_len : 345 bufsize, 0) != 0) { 346 RDSV3_DPRINTF2("rdsv3_ioctl", 347 "copyout of records failed"); 348 rval = EFAULT; 349 } 350 351 } 352 353 lifc.lifc_len = bufsize; 354 if (ddi_copyout(&lifc, (void *)arg, sizeof (struct lifconf), 355 0) != 0) { 356 RDSV3_DPRINTF2("rdsv3_ioctl", 357 "copyout of lifconf failed"); 358 rval = EFAULT; 359 } 360 361 kmem_free(buf, bufsize); 362 break; 363 364 case SIOCGIFCONF : 365 case O_SIOCGIFCONF : 366 if (ddi_copyin((void *)arg, &ifc, sizeof (struct ifconf), 0) 367 != 0) { 368 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifc"); 369 rval = EFAULT; 370 break; 371 } 372 373 RDSV3_DPRINTF2("rdsv3_ioctl", 374 "O_SIOCGIFCONF: ifc_len: %d, req: %p", 375 ifc.ifc_len, ifc.ifc_req); 376 377 rval = rdsv3_do_ip_ioctl_old(so4, &buf, &bufsize, &numifs); 378 if (rval != 0) { 379 RDSV3_DPRINTF2("rdsv3_ioctl", 380 "rdsv3_do_ip_ioctl_old failed: %d", rval); 381 break; 382 } 383 384 if ((ifc.ifc_len > 0) && (numifs > 0)) { 385 if (ddi_copyout(buf, (void *)ifc.ifc_req, 386 (ifc.ifc_len < bufsize) ? ifc.ifc_len : 387 bufsize, 0) != 0) { 388 RDSV3_DPRINTF2("rdsv3_ioctl", 389 "copyout of records failed"); 390 rval = EFAULT; 391 } 392 393 } 394 395 ifc.ifc_len = bufsize; 396 if (ddi_copyout(&ifc, (void *)arg, sizeof (struct ifconf), 397 0) != 0) { 398 RDSV3_DPRINTF2("rdsv3_ioctl", 399 "copyout of ifconf failed"); 400 rval = EFAULT; 401 } 402 403 kmem_free(buf, bufsize); 404 break; 405 406 case SIOCGLIFFLAGS : 407 case SIOCSLIFFLAGS : 408 case SIOCGLIFMTU : 409 case SIOCGLIFNETMASK : 410 case SIOCGLIFINDEX : 411 if (ddi_copyin((void *)arg, &lifr, sizeof (struct lifreq), 0) 412 != 0) { 413 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifr"); 414 rval = EFAULT; 415 break; 416 } 417 418 rc = ksocket_ioctl(so4, cmd, (intptr_t)&lifr, &rval, CRED()); 419 if (rc != 0) { 420 RDSV3_DPRINTF2("rdsv3_ioctl", 421 "ksocket_ioctl failed: %d, name: %s cmd: 0x%x", 422 rc, lifr.lifr_name, cmd); 423 break; 424 } 425 426 (void) ddi_copyout(&lifr, (void *)arg, 427 sizeof (struct lifreq), 0); 428 break; 429 430 case SIOCGIFFLAGS : 431 case SIOCSIFFLAGS : 432 case SIOCGIFMTU : 433 case SIOCGIFNETMASK : 434 case SIOCGIFINDEX : 435 if (ddi_copyin((void *)arg, &ifr, sizeof (struct ifreq), 0) 436 != 0) { 437 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifr"); 438 rval = EFAULT; 439 break; 440 } 441 442 RDSV3_DPRINTF2("rdsv3_ioctl", "1. name: %s", ifr.ifr_name); 443 444 rc = ksocket_ioctl(so4, cmd, (intptr_t)&ifr, &rval, CRED()); 445 if (rc != 0) { 446 RDSV3_DPRINTF2("rdsv3_ioctl", 447 "ksocket_ioctl failed: %d, name: %s cmd: 0x%x", 448 rc, ifr.ifr_name, cmd); 449 450 break; 451 } 452 453 RDSV3_DPRINTF2("rdsv3_ioctl", "2. name: %s", ifr.ifr_name); 454 455 (void) ddi_copyout(&ifr, (void *)arg, 456 sizeof (struct ifreq), 0); 457 break; 458 459 default: 460 if ((cmd >= RDSV3_INFO_FIRST) && 461 (cmd <= RDSV3_INFO_LAST)) { 462 return (rdsv3_info_ioctl((struct rsock *)proto_handle, 463 cmd, (char *)arg, rvalp)); 464 } 465 RDSV3_DPRINTF2("rdsv3_ioctl", "Unknown ioctl cmd: %d", cmd); 466 cmn_err(CE_CONT, "unsupported IOCTL cmd: %d \n", cmd); 467 rval = EOPNOTSUPP; 468 } 469 470 (void) ksocket_close(so4, CRED()); 471 472 RDSV3_DPRINTF4("rdsv3_ioctl", "return: %d cmd: %d", rval, cmd); 473 474 *rvalp = rval; 475 return (rval); 476 } 477 478 static int 479 rdsv3_cancel_sent_to(struct rdsv3_sock *rs, char *optval, int len) 480 { 481 struct sockaddr_in sin; 482 483 /* racing with another thread binding seems ok here */ 484 if (rs->rs_bound_addr == 0) 485 return (-ENOTCONN); /* XXX not a great errno */ 486 487 if (len < sizeof (struct sockaddr_in)) 488 return (-EINVAL); 489 490 if (ddi_copyin((void *)optval, &sin, sizeof (struct sockaddr_in), 491 0) != 0) { 492 RDSV3_DPRINTF2("rdsv3_cancel_sent_to", "ddi_copyin failed sin"); 493 return (-EFAULT); 494 } 495 496 rdsv3_send_drop_to(rs, &sin); 497 498 return (0); 499 } 500 501 static int 502 rdsv3_set_bool_option(unsigned char *optvar, char *optval, int optlen) 503 { 504 int value = *optval; 505 506 if (optlen < sizeof (int)) 507 return (-EINVAL); 508 *optvar = !!value; 509 return (0); 510 } 511 512 static int 513 rdsv3_cong_monitor(struct rdsv3_sock *rs, char *optval, int optlen) 514 { 515 int ret; 516 517 ret = rdsv3_set_bool_option(&rs->rs_cong_monitor, optval, optlen); 518 if (ret == 0) { 519 if (rs->rs_cong_monitor) { 520 rdsv3_cong_add_socket(rs); 521 } else { 522 rdsv3_cong_remove_socket(rs); 523 rs->rs_cong_mask = 0; 524 rs->rs_cong_notify = 0; 525 } 526 } 527 return (ret); 528 } 529 530 /*ARGSUSED*/ 531 static int 532 rdsv3_setsockopt(sock_lower_handle_t proto_handle, int level, 533 int optname, const void *optval, socklen_t optlen, cred_t *cr) 534 { 535 struct rsock *sk = (struct rsock *)proto_handle; 536 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 537 int ret = 0; 538 539 RDSV3_DPRINTF4("rdsv3_setsockopt", "enter(%p %d %d)", 540 rs, level, optname); 541 542 switch (optname) { 543 case RDSV3_CANCEL_SENT_TO: 544 ret = rdsv3_cancel_sent_to(rs, (char *)optval, optlen); 545 break; 546 case RDSV3_GET_MR: 547 ret = rdsv3_get_mr(rs, optval, optlen); 548 break; 549 case RDSV3_GET_MR_FOR_DEST: 550 ret = rdsv3_get_mr_for_dest(rs, optval, optlen); 551 break; 552 case RDSV3_FREE_MR: 553 ret = rdsv3_free_mr(rs, optval, optlen); 554 break; 555 case RDSV3_RECVERR: 556 ret = rdsv3_set_bool_option(&rs->rs_recverr, 557 (char *)optval, optlen); 558 break; 559 case RDSV3_CONG_MONITOR: 560 ret = rdsv3_cong_monitor(rs, (char *)optval, optlen); 561 break; 562 case SO_SNDBUF: 563 sk->sk_sndbuf = *(uint_t *)optval; 564 return (ret); 565 case SO_RCVBUF: 566 sk->sk_rcvbuf = *(uint_t *)optval; 567 return (ret); 568 default: 569 #if 1 570 break; 571 #else 572 ret = -ENOPROTOOPT; 573 #endif 574 } 575 out: 576 return (ret); 577 } 578 579 /* XXX */ 580 /*ARGSUSED*/ 581 static int 582 rdsv3_getsockopt(sock_lower_handle_t proto_handle, int level, 583 int optname, void *optval, socklen_t *optlen, cred_t *cr) 584 { 585 struct rsock *sk = (struct rsock *)proto_handle; 586 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 587 int ret = 0; 588 589 RDSV3_DPRINTF4("rdsv3_getsockopt", "enter(%p %d %d)", 590 rs, optname, *optlen); 591 592 switch (optname) { 593 case SO_SNDBUF: 594 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_SNDBUF(%d)", 595 sk->sk_sndbuf); 596 if (*optlen != 0) { 597 *((int *)optval) = sk->sk_sndbuf; 598 *optlen = sizeof (uint_t); 599 } 600 return (ret); 601 case SO_RCVBUF: 602 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_RCVBUF(%d)", 603 sk->sk_rcvbuf); 604 if (*optlen != 0) { 605 *((int *)optval) = sk->sk_rcvbuf; 606 *optlen = sizeof (uint_t); 607 } 608 return (ret); 609 case RDSV3_RECVERR: 610 RDSV3_DPRINTF4("rdsv3_getsockopt", "RDSV3_RECVERR(%d)", 611 rs->rs_recverr); 612 if (*optlen < sizeof (int)) 613 return (-EINVAL); 614 else { 615 *(int *)optval = rs->rs_recverr; 616 *optlen = sizeof (int); 617 } 618 return (0); 619 default: 620 RDSV3_DPRINTF2("rdsv3_getsockopt", 621 "Unknown: level: %d optname: %d", level, optname); 622 ret = -ENOPROTOOPT; 623 } 624 625 RDSV3_DPRINTF4("rdsv3_getsockopt", "return(%p %d %d)", 626 rs, optname, ret); 627 return (ret); 628 } 629 630 /*ARGSUSED*/ 631 static int rdsv3_connect(sock_lower_handle_t proto_handle, 632 const struct sockaddr *addr, socklen_t addr_len, sock_connid_t *conn, 633 cred_t *cr) 634 { 635 struct rsock *sk = (struct rsock *)proto_handle; 636 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 637 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 638 int ret = 0; 639 640 RDSV3_DPRINTF4("rdsv3_connect", "Enter(rs: %p)", rs); 641 642 mutex_enter(&sk->sk_lock); 643 644 if (addr_len != sizeof (struct sockaddr_in)) { 645 ret = -EINVAL; 646 goto out; 647 } 648 649 if (sin->sin_family != AF_INET_OFFLOAD) { 650 ret = -EAFNOSUPPORT; 651 goto out; 652 } 653 654 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 655 ret = -EDESTADDRREQ; 656 goto out; 657 } 658 659 rs->rs_conn_addr = sin->sin_addr.s_addr; 660 rs->rs_conn_port = sin->sin_port; 661 662 sk->sk_upcalls->su_connected(sk->sk_upper_handle, 0, NULL, -1); 663 664 RDSV3_DPRINTF4("rdsv3_connect", "Return(rs: %p)", rs); 665 666 out: 667 mutex_exit(&sk->sk_lock); 668 return (ret); 669 } 670 671 /*ARGSUSED*/ 672 static int 673 rdsv3_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 674 { 675 struct rsock *sk = (struct rsock *)proto_handle; 676 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 677 678 RDSV3_DPRINTF4("rdsv3_shutdown", "Enter(rs: %p)", rs); 679 680 return (0); 681 } 682 683 /*ARGSUSED*/ 684 void 685 rdsv3_activate(sock_lower_handle_t proto_handle, 686 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, 687 int flags, cred_t *cr) 688 { 689 struct rsock *sk = (struct rsock *)proto_handle; 690 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 691 692 RDSV3_DPRINTF4("rdsv3_activate", "Enter(rs: %p)", rs); 693 694 sk->sk_upcalls = sock_upcalls; 695 sk->sk_upper_handle = sock_handle; 696 697 RDSV3_DPRINTF4("rdsv3_activate", "Return (rs: %p)", rs); 698 } 699 700 701 /* ARGSUSED */ 702 int 703 rdsv3_send_uio(sock_lower_handle_t proto_handle, uio_t *uio, 704 struct nmsghdr *msg, cred_t *cr) 705 { 706 struct rsock *sk = (struct rsock *)proto_handle; 707 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 708 int ret; 709 710 RDSV3_DPRINTF4("rdsv3_send_uio", "Enter(rs: %p)", rs); 711 ret = rdsv3_sendmsg(rs, uio, msg, uio->uio_resid); 712 713 RDSV3_DPRINTF4("rdsv3_send_uio", "Return(rs: %p ret %d)", rs, ret); 714 if (ret < 0) { 715 return (-ret); 716 } 717 718 return (0); 719 } 720 721 /* ARGSUSED */ 722 int 723 rdsv3_recv_uio(sock_lower_handle_t proto_handle, uio_t *uio, 724 struct nmsghdr *msg, cred_t *cr) 725 { 726 struct rsock *sk = (struct rsock *)proto_handle; 727 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 728 int ret; 729 730 RDSV3_DPRINTF4("rdsv3_recv_uio", "Enter (rs: %p)", rs); 731 ret = rdsv3_recvmsg(rs, uio, msg, uio->uio_resid, msg->msg_flags); 732 733 RDSV3_DPRINTF4("rdsv3_recv_uio", "Return(rs: %p ret %d)", rs, ret); 734 735 if (ret < 0) { 736 return (-ret); 737 } 738 739 return (0); 740 } 741 742 /*ARGSUSED*/ 743 int 744 rdsv3_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, 745 socklen_t *addr_len, cred_t *cr) 746 { 747 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 748 struct rsock *sk = (struct rsock *)proto_handle; 749 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 750 751 RDSV3_DPRINTF2("rdsv3_getpeername", "enter(rs: %p)", rs); 752 753 (void) memset(sin->sin_zero, 0, sizeof (sin->sin_zero)); 754 755 /* racey, don't care */ 756 if (!rs->rs_conn_addr) 757 return (-ENOTCONN); 758 759 sin->sin_port = rs->rs_conn_port; 760 sin->sin_addr.s_addr = rs->rs_conn_addr; 761 762 sin->sin_family = AF_INET_OFFLOAD; 763 764 *addr_len = sizeof (*sin); 765 return (0); 766 } 767 768 void 769 rdsv3_clrflowctrl(sock_lower_handle_t proto_handle) 770 { 771 struct rsock *sk = (struct rsock *)proto_handle; 772 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 773 774 RDSV3_DPRINTF2("rdsv3_clrflowctrl", "enter(rs: %p)", rs); 775 } 776 777 #ifndef __lock_lint 778 static struct sock_downcalls_s rdsv3_sock_downcalls = { 779 .sd_close = rdsv3_release, 780 .sd_bind = rdsv3_bind, 781 .sd_connect = rdsv3_connect, 782 .sd_accept = NULL, 783 .sd_getsockname = rdsv3_getname, 784 .sd_poll = rdsv3_poll, 785 .sd_ioctl = rdsv3_ioctl, 786 .sd_listen = NULL, 787 .sd_shutdown = rdsv3_shutdown, 788 .sd_setsockopt = rdsv3_setsockopt, 789 .sd_getsockopt = rdsv3_getsockopt, 790 .sd_send_uio = rdsv3_send_uio, 791 .sd_recv_uio = rdsv3_recv_uio, 792 .sd_activate = rdsv3_activate, 793 .sd_getpeername = rdsv3_getpeername, 794 .sd_send = NULL, 795 .sd_clr_flowctrl = NULL 796 }; 797 #else 798 static struct sock_downcalls_s rdsv3_sock_downcalls = { 799 rdsv3_activate, 800 NULL, 801 rdsv3_bind, 802 NULL, 803 rdsv3_connect, 804 rdsv3_getpeername, 805 rdsv3_getname, 806 rdsv3_getsockopt, 807 rdsv3_setsockopt, 808 NULL, 809 rdsv3_send_uio, 810 rdsv3_recv_uio, 811 rdsv3_poll, 812 rdsv3_shutdown, 813 NULL, 814 rdsv3_ioctl, 815 rdsv3_release 816 }; 817 #endif 818 819 sock_lower_handle_t 820 rdsv3_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 821 uint_t *smodep, int *errorp, int flags, cred_t *credp) 822 { 823 struct rdsv3_sock *rs; 824 struct rsock *sk; 825 826 RDSV3_DPRINTF4("rdsv3_create", "Enter (family: %d type: %d, proto: %d " 827 "flags: %d", family, type, proto, flags); 828 829 sk = rdsv3_sk_alloc(); 830 if (sk == NULL) 831 return (NULL); 832 rdsv3_sock_init_data(sk); 833 834 rs = rdsv3_sk_to_rs(sk); 835 rs->rs_sk = sk; 836 mutex_init(&rs->rs_lock, NULL, MUTEX_DRIVER, NULL); 837 rw_init(&rs->rs_recv_lock, NULL, RW_DRIVER, NULL); 838 list_create(&rs->rs_send_queue, sizeof (struct rdsv3_message), 839 offsetof(struct rdsv3_message, m_sock_item)); 840 list_create(&rs->rs_recv_queue, sizeof (struct rdsv3_incoming), 841 offsetof(struct rdsv3_incoming, i_item)); 842 list_create(&rs->rs_notify_queue, sizeof (struct rdsv3_notifier), 843 offsetof(struct rdsv3_notifier, n_list)); 844 mutex_init(&rs->rs_rdma_lock, NULL, MUTEX_DRIVER, NULL); 845 avl_create(&rs->rs_rdma_keys, rdsv3_mr_compare, 846 sizeof (struct rdsv3_mr), offsetof(struct rdsv3_mr, r_rb_node)); 847 mutex_init(&rs->rs_conn_lock, NULL, MUTEX_DRIVER, NULL); 848 mutex_init(&rs->rs_congested_lock, NULL, MUTEX_DRIVER, NULL); 849 cv_init(&rs->rs_congested_cv, NULL, CV_DRIVER, NULL); 850 rs->rs_cred = credp; 851 rs->rs_zoneid = getzoneid(); 852 crhold(credp); 853 854 mutex_enter(&rdsv3_sock_lock); 855 list_insert_tail(&rdsv3_sock_list, rs); 856 rdsv3_sock_count++; 857 /* Initialize RDMA/IB on the 1st socket if not done at attach */ 858 if (rdsv3_sock_count == 1) { 859 rdsv3_rdma_init(); 860 } 861 mutex_exit(&rdsv3_sock_lock); 862 863 *errorp = 0; 864 *smodep = SM_ATOMIC; 865 *sock_downcalls = &rdsv3_sock_downcalls; 866 867 RDSV3_DPRINTF4("rdsv3_create", "Return: %p", rs); 868 869 return ((sock_lower_handle_t)rdsv3_rs_to_sk(rs)); 870 } 871 872 void 873 rdsv3_sock_addref(struct rdsv3_sock *rs) 874 { 875 RDSV3_DPRINTF4("rdsv3_sock_addref", "Enter(rs: %p)", rs); 876 rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs)); 877 } 878 879 void 880 rdsv3_sock_put(struct rdsv3_sock *rs) 881 { 882 RDSV3_DPRINTF4("rdsv3_sock_put", "Enter(rs: %p)", rs); 883 rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs)); 884 } 885 886 static void 887 rdsv3_sock_inc_info(struct rsock *sock, unsigned int len, 888 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) 889 { 890 struct rdsv3_sock *rs; 891 struct rdsv3_incoming *inc; 892 unsigned int total = 0; 893 894 RDSV3_DPRINTF4("rdsv3_sock_inc_info", "Enter(rs: %p)", 895 rdsv3_sk_to_rs(sock)); 896 897 len /= sizeof (struct rdsv3_info_message); 898 899 mutex_enter(&rdsv3_sock_lock); 900 901 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) { 902 rw_enter(&rs->rs_recv_lock, RW_READER); 903 904 /* XXX too lazy to maintain counts.. */ 905 RDSV3_FOR_EACH_LIST_NODE(inc, &rs->rs_recv_queue, i_item) { 906 total++; 907 if (total <= len) 908 rdsv3_inc_info_copy(inc, iter, inc->i_saddr, 909 rs->rs_bound_addr, 1); 910 } 911 912 rw_exit(&rs->rs_recv_lock); 913 } 914 915 mutex_exit(&rdsv3_sock_lock); 916 917 lens->nr = total; 918 lens->each = sizeof (struct rdsv3_info_message); 919 920 RDSV3_DPRINTF4("rdsv3_sock_inc_info", "return(rs: %p)", 921 rdsv3_sk_to_rs(sock)); 922 } 923 924 static void 925 rdsv3_sock_info(struct rsock *sock, unsigned int len, 926 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) 927 { 928 struct rdsv3_info_socket sinfo; 929 struct rdsv3_sock *rs; 930 unsigned long bytes; 931 932 RDSV3_DPRINTF4("rdsv3_sock_info", "Enter(rs: %p)", 933 rdsv3_sk_to_rs(sock)); 934 935 len /= sizeof (struct rdsv3_info_socket); 936 937 mutex_enter(&rdsv3_sock_lock); 938 939 if ((len < rdsv3_sock_count) || (iter->addr == NULL)) 940 goto out; 941 942 bytes = sizeof (struct rdsv3_info_socket); 943 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) { 944 sinfo.sndbuf = rdsv3_sk_sndbuf(rs); 945 sinfo.rcvbuf = rdsv3_sk_rcvbuf(rs); 946 sinfo.bound_addr = rs->rs_bound_addr; 947 sinfo.connected_addr = rs->rs_conn_addr; 948 sinfo.bound_port = rs->rs_bound_port; 949 sinfo.connected_port = rs->rs_conn_port; 950 951 rdsv3_info_copy(iter, &sinfo, bytes); 952 } 953 954 RDSV3_DPRINTF4("rdsv3_sock_info", "Return(rs: %p)", 955 rdsv3_sk_to_rs(sock)); 956 957 out: 958 lens->nr = rdsv3_sock_count; 959 lens->each = sizeof (struct rdsv3_info_socket); 960 961 mutex_exit(&rdsv3_sock_lock); 962 } 963 964 rdsv3_delayed_work_t *rdsv3_rdma_dwp = NULL; 965 uint_t rdsv3_rdma_init_delay = 5; /* secs */ 966 extern void rdsv3_rdma_init_worker(struct rdsv3_work_s *work); 967 968 void 969 rdsv3_exit(void) 970 { 971 RDSV3_DPRINTF4("rdsv3_exit", "Enter"); 972 973 if (rdsv3_rdma_dwp) { 974 rdsv3_cancel_delayed_work(rdsv3_rdma_dwp); 975 } 976 977 (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_rdma_exit, 978 NULL, DDI_SLEEP); 979 while (rdsv3_rdma_listen_id != NULL) { 980 #ifndef __lock_lint 981 RDSV3_DPRINTF5("rdsv3", "%s-%d Waiting for rdsv3_rdma_exit", 982 __func__, __LINE__); 983 #endif 984 delay(drv_usectohz(1000)); 985 } 986 987 rdsv3_conn_exit(); 988 rdsv3_cong_exit(); 989 rdsv3_sysctl_exit(); 990 rdsv3_threads_exit(); 991 rdsv3_stats_exit(); 992 rdsv3_info_deregister_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info); 993 rdsv3_info_deregister_func(RDSV3_INFO_RECV_MESSAGES, 994 rdsv3_sock_inc_info); 995 996 if (rdsv3_rdma_dwp) { 997 kmem_free(rdsv3_rdma_dwp, sizeof (rdsv3_delayed_work_t)); 998 rdsv3_rdma_dwp = NULL; 999 } 1000 1001 RDSV3_DPRINTF4("rdsv3_exit", "Return"); 1002 } 1003 1004 /*ARGSUSED*/ 1005 int 1006 rdsv3_init() 1007 { 1008 int ret; 1009 1010 RDSV3_DPRINTF4("rdsv3_init", "Enter"); 1011 1012 rdsv3_cong_init(); 1013 1014 ret = rdsv3_conn_init(); 1015 if (ret) 1016 goto out; 1017 ret = rdsv3_threads_init(); 1018 if (ret) 1019 goto out_conn; 1020 ret = rdsv3_sysctl_init(); 1021 if (ret) 1022 goto out_threads; 1023 ret = rdsv3_stats_init(); 1024 if (ret) 1025 goto out_sysctl; 1026 1027 rdsv3_info_register_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info); 1028 rdsv3_info_register_func(RDSV3_INFO_RECV_MESSAGES, rdsv3_sock_inc_info); 1029 1030 /* rdsv3_rdma_init need to be called with a little delay */ 1031 rdsv3_rdma_dwp = kmem_zalloc(sizeof (rdsv3_delayed_work_t), KM_SLEEP); 1032 RDSV3_INIT_DELAYED_WORK(rdsv3_rdma_dwp, rdsv3_rdma_init_worker); 1033 rdsv3_queue_delayed_work(rdsv3_wq, rdsv3_rdma_dwp, 1034 rdsv3_rdma_init_delay); 1035 1036 RDSV3_DPRINTF4("rdsv3_init", "Return"); 1037 1038 goto out; 1039 1040 out_stats: 1041 rdsv3_stats_exit(); 1042 out_sysctl: 1043 rdsv3_sysctl_exit(); 1044 out_threads: 1045 rdsv3_threads_exit(); 1046 out_conn: 1047 rdsv3_conn_exit(); 1048 rdsv3_cong_exit(); 1049 out: 1050 return (ret); 1051 } 1052