1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright (c) 2006 Oracle.  All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *      - Redistributions of source code must retain the above
39  *        copyright notice, this list of conditions and the following
40  *        disclaimer.
41  *
42  *      - Redistributions in binary form must reproduce the above
43  *        copyright notice, this list of conditions and the following
44  *        disclaimer in the documentation and/or other materials
45  *        provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 #include <sys/types.h>
58 #include <sys/stat.h>
59 #include <sys/conf.h>
60 #include <sys/ddi.h>
61 #include <sys/sunddi.h>
62 #include <sys/modctl.h>
63 #include <sys/rds.h>
64 #include <sys/stropts.h>
65 #include <sys/socket.h>
66 #include <sys/socketvar.h>
67 #include <sys/sockio.h>
68 #include <sys/sysmacros.h>
69 
70 #include <inet/ip.h>
71 #include <net/if_types.h>
72 
73 #include <sys/ib/clients/rdsv3/rdsv3.h>
74 #include <sys/ib/clients/rdsv3/rdma.h>
75 #include <sys/ib/clients/rdsv3/rdma_transport.h>
76 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
77 
78 extern void rdsv3_remove_bound(struct rdsv3_sock *rds);
79 extern int rdsv3_verify_bind_address(ipaddr_t addr);
80 
81 extern ddi_taskq_t	*rdsv3_taskq;
82 extern struct rdma_cm_id *rdsv3_rdma_listen_id;
83 
84 /* this is just used for stats gathering :/ */
85 kmutex_t rdsv3_sock_lock;
86 static unsigned long rdsv3_sock_count;
87 list_t rdsv3_sock_list;
88 
89 /*
90  * This is called as the final descriptor referencing this socket is closed.
91  * We have to unbind the socket so that another socket can be bound to the
92  * address it was using.
93  *
94  * We have to be careful about racing with the incoming path.  sock_orphan()
95  * sets SOCK_DEAD and we use that as an indicator to the rx path that new
96  * messages shouldn't be queued.
97  */
98 /* ARGSUSED */
99 static int
100 rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr)
101 {
102 	struct rsock *sk = (struct rsock *)proto_handle;
103 	struct rdsv3_sock *rs;
104 
105 	if (!sk)
106 		goto out;
107 
108 	rs = rdsv3_sk_to_rs(sk);
109 	RDSV3_DPRINTF4("rdsv3_release", "Enter(rs: %p, sk: %p)", rs, sk);
110 
111 	rdsv3_sk_sock_orphan(sk);
112 	rdsv3_cong_remove_socket(rs);
113 	rdsv3_remove_bound(rs);
114 
115 	/*
116 	 * Note - rdsv3_clear_recv_queue grabs rs_recv_lock, so
117 	 * that ensures the recv path has completed messing
118 	 * with the socket.
119 	 *
120 	 * Note2 - rdsv3_clear_recv_queue(rs) should be called first
121 	 * to prevent some race conditions, which is different from
122 	 * the Linux code.
123 	 */
124 	rdsv3_clear_recv_queue(rs);
125 	rdsv3_send_drop_to(rs, NULL);
126 	rdsv3_rdma_drop_keys(rs);
127 	(void) rdsv3_notify_queue_get(rs, NULL);
128 
129 	mutex_enter(&rdsv3_sock_lock);
130 	list_remove_node(&rs->rs_item);
131 	rdsv3_sock_count--;
132 	mutex_exit(&rdsv3_sock_lock);
133 
134 	while (sk->sk_refcount > 1) {
135 		/* wait for 1 sec and try again */
136 		delay(drv_usectohz(1000000));
137 	}
138 
139 	/* this will free the rs and sk */
140 	rdsv3_sk_sock_put(sk);
141 
142 	RDSV3_DPRINTF4("rdsv3_release", "Return (rds: %p)", rs);
143 out:
144 	return (0);
145 }
146 
147 void
148 __rdsv3_wake_sk_sleep(struct rsock *sk)
149 {
150 	/* wakup anyone waiting in recvmsg */
151 	if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD) && sk->sk_sleep)
152 		rdsv3_wake_up(sk->sk_sleep);
153 }
154 
155 /*
156  * Careful not to race with rdsv3_release -> sock_orphan which clears sk_sleep.
157  * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
158  * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
159  * this seems more conservative.
160  * NB - normally, one would use sk_callback_lock for this, but we can
161  * get here from interrupts, whereas the network code grabs sk_callback_lock
162  * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
163  */
164 void
165 rdsv3_wake_sk_sleep(struct rdsv3_sock *rs)
166 {
167 	RDSV3_DPRINTF4("rdsv3_wake_sk_sleep", "Enter(rs: %p)", rs);
168 
169 	rw_enter(&rs->rs_recv_lock, RW_READER);
170 	__rdsv3_wake_sk_sleep(rdsv3_rs_to_sk(rs));
171 	rw_exit(&rs->rs_recv_lock);
172 }
173 
174 /*ARGSUSED*/
175 static int
176 rdsv3_getname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
177     socklen_t *addr_len, cred_t *cr)
178 {
179 	struct rsock *sk = (struct rsock *)proto_handle;
180 	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
181 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
182 
183 	RDSV3_DPRINTF4("rdsv3_getname", "Enter(rs: %p, port: %d)", rs,
184 	    rs->rs_bound_port);
185 
186 	sin->sin_port = rs->rs_bound_port;
187 	sin->sin_addr.s_addr = rs->rs_bound_addr;
188 
189 	sin->sin_family = AF_INET_OFFLOAD;
190 
191 	*addr_len = sizeof (*sin);
192 	return (0);
193 }
194 
195 /*
196  * RDS' poll is without a doubt the least intuitive part of the interface,
197  * as POLLIN and POLLOUT do not behave entirely as you would expect from
198  * a network protocol.
199  *
200  * POLLIN is asserted if
201  *  -	there is data on the receive queue.
202  *  -	to signal that a previously congested destination may have become
203  *	uncongested
204  *  -	A notification has been queued to the socket (this can be a congestion
205  *	update, or a RDMA completion).
206  *
207  * POLLOUT is asserted if there is room on the send queue. This does not mean
208  * however, that the next sendmsg() call will succeed. If the application tries
209  * to send to a congested destination, the system call may still fail (and
210  * return ENOBUFS).
211  */
212 /* ARGSUSED */
213 static short
214 rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet,
215     cred_t *cr)
216 {
217 	struct rsock	*sk = (struct rsock *)proto_handle;
218 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
219 	unsigned short mask = 0;
220 
221 #if 0
222 	RDSV3_DPRINTF4("rdsv3_poll", "enter(%p %x %d)", rs, events, anyyet);
223 #endif
224 
225 	/*
226 	 * If rs_seen_congestion is on, wait until it's off.
227 	 * This is implemented for the following OFED code.
228 	 * 	if (rs->rs_seen_congestion)
229 	 *		poll_wait(file, &rds_poll_waitq, wait);
230 	 */
231 	mutex_enter(&rs->rs_congested_lock);
232 	while (rs->rs_seen_congestion) {
233 		cv_wait(&rs->rs_congested_cv,
234 		    &rs->rs_congested_lock);
235 	}
236 	mutex_exit(&rs->rs_congested_lock);
237 
238 	rw_enter(&rs->rs_recv_lock, RW_READER);
239 	if (!rs->rs_cong_monitor) {
240 		/*
241 		 * When a congestion map was updated, we signal POLLIN for
242 		 * "historical" reasons. Applications can also poll for
243 		 * WRBAND instead.
244 		 */
245 		if (rdsv3_cong_updated_since(&rs->rs_cong_track))
246 			mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
247 	} else {
248 		mutex_enter(&rs->rs_lock);
249 		if (rs->rs_cong_notify)
250 			mask |= (POLLIN | POLLRDNORM);
251 		mutex_exit(&rs->rs_lock);
252 	}
253 	if (!list_is_empty(&rs->rs_recv_queue) ||
254 	    !list_is_empty(&rs->rs_notify_queue))
255 		mask |= (POLLIN | POLLRDNORM);
256 	if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs))
257 		mask |= (POLLOUT | POLLWRNORM);
258 
259 	/* clear state any time we wake a seen-congested socket */
260 	if (mask) {
261 		mutex_enter(&rs->rs_congested_lock);
262 		rs->rs_seen_congestion = 0;
263 		mutex_exit(&rs->rs_congested_lock);
264 	}
265 
266 	rw_exit(&rs->rs_recv_lock);
267 
268 #if 0
269 	RDSV3_DPRINTF4("rdsv3_poll", "return(%p %x)", rs, mask);
270 #endif
271 
272 	return (mask);
273 }
274 
275 /* ARGSUSED */
276 static int
277 rdsv3_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
278     int mode, int32_t *rvalp, cred_t *cr)
279 {
280 	ksocket_t	so4;
281 	struct lifconf	lifc;
282 	struct lifreq	lifr, *lifrp;
283 	struct ifconf	ifc;
284 	struct ifreq	ifr;
285 	int		rval = 0, rc, len;
286 	int		numifs;
287 	int		bufsize;
288 	void		*buf;
289 
290 	RDSV3_DPRINTF4("rdsv3_ioctl", "enter: cmd: %d", cmd);
291 
292 	/* Only ipv4 for now */
293 	rval = ksocket_socket(&so4, PF_INET, SOCK_DGRAM, 0, KSOCKET_NOSLEEP,
294 	    CRED());
295 	if (rval != 0) {
296 		RDSV3_DPRINTF2("rdsv3_ioctl", "ksocket_socket returned %d",
297 		    rval);
298 		return (rval);
299 	}
300 
301 	switch (cmd) {
302 	case SIOCGLIFNUM :
303 	case SIOCGIFNUM :
304 		rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs);
305 		if (rval != 0) break;
306 		if (cmd == SIOCGLIFNUM) {
307 			struct lifnum	lifn;
308 			lifn.lifn_family = AF_INET_OFFLOAD;
309 			lifn.lifn_flags = 0;
310 			lifn.lifn_count = numifs;
311 			(void) ddi_copyout(&lifn, (void *)arg,
312 			    sizeof (struct lifnum), 0);
313 		} else {
314 			len = 0;
315 			for (lifrp = (struct lifreq *)buf, rc = 0; rc < numifs;
316 			    rc++, lifrp++) {
317 				if (strlen(lifrp->lifr_name) <= IFNAMSIZ) {
318 					len++;
319 				}
320 			}
321 			(void) ddi_copyout(&len, (void *)arg,
322 			    sizeof (int), 0);
323 		}
324 		kmem_free(buf, bufsize);
325 		break;
326 
327 	case SIOCGLIFCONF :
328 		if (ddi_copyin((void *)arg, &lifc, sizeof (struct lifconf), 0)
329 		    != 0) {
330 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifc");
331 			rval = EFAULT;
332 			break;
333 		}
334 
335 		rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs);
336 		if (rval != 0) {
337 			RDSV3_DPRINTF2("rdsv3_ioctl",
338 			    "rdsv3_do_ip_ioctl failed: %d", rval);
339 			break;
340 		}
341 
342 		if ((lifc.lifc_len > 0) && (numifs > 0)) {
343 			if (ddi_copyout(buf, (void *)lifc.lifc_req,
344 			    (lifc.lifc_len < bufsize) ? lifc.lifc_len :
345 			    bufsize, 0) != 0) {
346 				RDSV3_DPRINTF2("rdsv3_ioctl",
347 				    "copyout of records failed");
348 				rval = EFAULT;
349 			}
350 
351 		}
352 
353 		lifc.lifc_len = bufsize;
354 		if (ddi_copyout(&lifc, (void *)arg, sizeof (struct lifconf),
355 		    0) != 0) {
356 			RDSV3_DPRINTF2("rdsv3_ioctl",
357 			    "copyout of lifconf failed");
358 			rval = EFAULT;
359 		}
360 
361 		kmem_free(buf, bufsize);
362 		break;
363 
364 	case SIOCGIFCONF :
365 	case O_SIOCGIFCONF :
366 		if (ddi_copyin((void *)arg, &ifc, sizeof (struct ifconf), 0)
367 		    != 0) {
368 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifc");
369 			rval = EFAULT;
370 			break;
371 		}
372 
373 		RDSV3_DPRINTF2("rdsv3_ioctl",
374 		    "O_SIOCGIFCONF: ifc_len: %d, req: %p",
375 		    ifc.ifc_len, ifc.ifc_req);
376 
377 		rval = rdsv3_do_ip_ioctl_old(so4, &buf, &bufsize, &numifs);
378 		if (rval != 0) {
379 			RDSV3_DPRINTF2("rdsv3_ioctl",
380 			    "rdsv3_do_ip_ioctl_old failed: %d", rval);
381 			break;
382 		}
383 
384 		if ((ifc.ifc_len > 0) && (numifs > 0)) {
385 			if (ddi_copyout(buf, (void *)ifc.ifc_req,
386 			    (ifc.ifc_len < bufsize) ? ifc.ifc_len :
387 			    bufsize, 0) != 0) {
388 				RDSV3_DPRINTF2("rdsv3_ioctl",
389 				    "copyout of records failed");
390 				rval = EFAULT;
391 			}
392 
393 		}
394 
395 		ifc.ifc_len = bufsize;
396 		if (ddi_copyout(&ifc, (void *)arg, sizeof (struct ifconf),
397 		    0) != 0) {
398 			RDSV3_DPRINTF2("rdsv3_ioctl",
399 			    "copyout of ifconf failed");
400 			rval = EFAULT;
401 		}
402 
403 		kmem_free(buf, bufsize);
404 		break;
405 
406 	case SIOCGLIFFLAGS :
407 	case SIOCSLIFFLAGS :
408 	case SIOCGLIFMTU :
409 	case SIOCGLIFNETMASK :
410 	case SIOCGLIFINDEX :
411 		if (ddi_copyin((void *)arg, &lifr, sizeof (struct lifreq), 0)
412 		    != 0) {
413 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifr");
414 			rval = EFAULT;
415 			break;
416 		}
417 
418 		rc = ksocket_ioctl(so4, cmd, (intptr_t)&lifr, &rval, CRED());
419 		if (rc != 0) {
420 			RDSV3_DPRINTF2("rdsv3_ioctl",
421 			    "ksocket_ioctl failed: %d, name: %s cmd: 0x%x",
422 			    rc, lifr.lifr_name, cmd);
423 			break;
424 		}
425 
426 		(void) ddi_copyout(&lifr, (void *)arg,
427 		    sizeof (struct lifreq), 0);
428 		break;
429 
430 	case SIOCGIFFLAGS :
431 	case SIOCSIFFLAGS :
432 	case SIOCGIFMTU :
433 	case SIOCGIFNETMASK :
434 	case SIOCGIFINDEX :
435 		if (ddi_copyin((void *)arg, &ifr, sizeof (struct ifreq), 0)
436 		    != 0) {
437 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifr");
438 			rval = EFAULT;
439 			break;
440 		}
441 
442 		RDSV3_DPRINTF2("rdsv3_ioctl", "1. name: %s", ifr.ifr_name);
443 
444 		rc = ksocket_ioctl(so4, cmd, (intptr_t)&ifr, &rval, CRED());
445 		if (rc != 0) {
446 			RDSV3_DPRINTF2("rdsv3_ioctl",
447 			    "ksocket_ioctl failed: %d, name: %s cmd: 0x%x",
448 			    rc, ifr.ifr_name, cmd);
449 
450 			break;
451 		}
452 
453 		RDSV3_DPRINTF2("rdsv3_ioctl", "2. name: %s", ifr.ifr_name);
454 
455 		(void) ddi_copyout(&ifr, (void *)arg,
456 		    sizeof (struct ifreq), 0);
457 		break;
458 
459 	default:
460 		if ((cmd >= RDSV3_INFO_FIRST) &&
461 		    (cmd <= RDSV3_INFO_LAST)) {
462 			return (rdsv3_info_ioctl((struct rsock *)proto_handle,
463 			    cmd, (char *)arg, rvalp));
464 		}
465 		RDSV3_DPRINTF2("rdsv3_ioctl", "Unknown ioctl cmd: %d",  cmd);
466 		cmn_err(CE_CONT, "unsupported IOCTL cmd: %d \n", cmd);
467 		rval = EOPNOTSUPP;
468 	}
469 
470 	(void) ksocket_close(so4, CRED());
471 
472 	RDSV3_DPRINTF4("rdsv3_ioctl", "return: %d cmd: %d", rval, cmd);
473 
474 	*rvalp = rval;
475 	return (rval);
476 }
477 
478 static int
479 rdsv3_cancel_sent_to(struct rdsv3_sock *rs, char *optval, int len)
480 {
481 	struct sockaddr_in sin;
482 
483 	/* racing with another thread binding seems ok here */
484 	if (rs->rs_bound_addr == 0)
485 		return (-ENOTCONN); /* XXX not a great errno */
486 
487 	if (len < sizeof (struct sockaddr_in))
488 		return (-EINVAL);
489 
490 	if (ddi_copyin((void *)optval, &sin, sizeof (struct sockaddr_in),
491 	    0) != 0) {
492 		RDSV3_DPRINTF2("rdsv3_cancel_sent_to", "ddi_copyin failed sin");
493 		return (-EFAULT);
494 	}
495 
496 	rdsv3_send_drop_to(rs, &sin);
497 
498 	return (0);
499 }
500 
/*
 * Store a boolean socket option.
 *
 * optval points at an int-sized option value of optlen bytes.  *optvar is
 * set to 1 if that int is non-zero and to 0 otherwise.
 *
 * Returns 0 on success, or -EINVAL (leaving *optvar untouched) when optlen
 * is smaller than an int.
 */
static int
rdsv3_set_bool_option(unsigned char *optvar, char *optval, int optlen)
{
	unsigned char nonzero = 0;
	size_t i;

	/*
	 * Validate the length before touching the buffer.  The cast keeps
	 * the comparison signed so that a negative optlen is rejected too.
	 */
	if (optlen < (int)sizeof (int))
		return (-EINVAL);

	/*
	 * An int is non-zero exactly when one of its bytes is non-zero.
	 * Looking at every byte, rather than only the first one, gives the
	 * same answer on big- and little-endian machines and needs no
	 * particular alignment of optval.
	 */
	for (i = 0; i < sizeof (int); i++)
		nonzero |= (unsigned char)optval[i];

	*optvar = (nonzero != 0);
	return (0);
}
511 
512 static int
513 rdsv3_cong_monitor(struct rdsv3_sock *rs, char *optval, int optlen)
514 {
515 	int ret;
516 
517 	ret = rdsv3_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
518 	if (ret == 0) {
519 		if (rs->rs_cong_monitor) {
520 			rdsv3_cong_add_socket(rs);
521 		} else {
522 			rdsv3_cong_remove_socket(rs);
523 			rs->rs_cong_mask = 0;
524 			rs->rs_cong_notify = 0;
525 		}
526 	}
527 	return (ret);
528 }
529 
530 /*ARGSUSED*/
531 static int
532 rdsv3_setsockopt(sock_lower_handle_t proto_handle, int level,
533     int optname, const void *optval, socklen_t optlen, cred_t *cr)
534 {
535 	struct rsock *sk = (struct rsock *)proto_handle;
536 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
537 	int	ret = 0;
538 
539 	RDSV3_DPRINTF4("rdsv3_setsockopt", "enter(%p %d %d)",
540 	    rs, level, optname);
541 
542 	switch (optname) {
543 	case RDSV3_CANCEL_SENT_TO:
544 		ret = rdsv3_cancel_sent_to(rs, (char *)optval, optlen);
545 		break;
546 	case RDSV3_GET_MR:
547 		ret = rdsv3_get_mr(rs, optval, optlen);
548 		break;
549 	case RDSV3_GET_MR_FOR_DEST:
550 		ret = rdsv3_get_mr_for_dest(rs, optval, optlen);
551 		break;
552 	case RDSV3_FREE_MR:
553 		ret = rdsv3_free_mr(rs, optval, optlen);
554 		break;
555 	case RDSV3_RECVERR:
556 		ret = rdsv3_set_bool_option(&rs->rs_recverr,
557 		    (char *)optval, optlen);
558 		break;
559 	case RDSV3_CONG_MONITOR:
560 		ret = rdsv3_cong_monitor(rs, (char *)optval, optlen);
561 		break;
562 	case SO_SNDBUF:
563 		sk->sk_sndbuf = *(uint_t *)optval;
564 		return (ret);
565 	case SO_RCVBUF:
566 		sk->sk_rcvbuf = *(uint_t *)optval;
567 		return (ret);
568 	default:
569 #if 1
570 		break;
571 #else
572 		ret = -ENOPROTOOPT;
573 #endif
574 	}
575 out:
576 	return (ret);
577 }
578 
579 /* XXX */
580 /*ARGSUSED*/
581 static int
582 rdsv3_getsockopt(sock_lower_handle_t proto_handle, int level,
583     int optname, void *optval, socklen_t *optlen, cred_t *cr)
584 {
585 	struct rsock *sk = (struct rsock *)proto_handle;
586 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
587 	int ret = 0;
588 
589 	RDSV3_DPRINTF4("rdsv3_getsockopt", "enter(%p %d %d)",
590 	    rs, optname, *optlen);
591 
592 	switch (optname) {
593 	case SO_SNDBUF:
594 		RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_SNDBUF(%d)",
595 		    sk->sk_sndbuf);
596 		if (*optlen != 0) {
597 			*((int *)optval) = sk->sk_sndbuf;
598 			*optlen = sizeof (uint_t);
599 		}
600 		return (ret);
601 	case SO_RCVBUF:
602 		RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_RCVBUF(%d)",
603 		    sk->sk_rcvbuf);
604 		if (*optlen != 0) {
605 			*((int *)optval) = sk->sk_rcvbuf;
606 			*optlen = sizeof (uint_t);
607 		}
608 		return (ret);
609 	case RDSV3_RECVERR:
610 		RDSV3_DPRINTF4("rdsv3_getsockopt", "RDSV3_RECVERR(%d)",
611 		    rs->rs_recverr);
612 		if (*optlen < sizeof (int))
613 			return (-EINVAL);
614 		else {
615 			*(int *)optval = rs->rs_recverr;
616 			*optlen = sizeof (int);
617 		}
618 		return (0);
619 	default:
620 		RDSV3_DPRINTF2("rdsv3_getsockopt",
621 		    "Unknown: level: %d optname: %d", level, optname);
622 		ret = -ENOPROTOOPT;
623 	}
624 
625 	RDSV3_DPRINTF4("rdsv3_getsockopt", "return(%p %d %d)",
626 	    rs, optname, ret);
627 	return (ret);
628 }
629 
630 /*ARGSUSED*/
631 static int rdsv3_connect(sock_lower_handle_t proto_handle,
632     const struct sockaddr *addr, socklen_t addr_len, sock_connid_t *conn,
633     cred_t *cr)
634 {
635 	struct rsock *sk = (struct rsock *)proto_handle;
636 	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
637 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
638 	int ret = 0;
639 
640 	RDSV3_DPRINTF4("rdsv3_connect", "Enter(rs: %p)", rs);
641 
642 	mutex_enter(&sk->sk_lock);
643 
644 	if (addr_len != sizeof (struct sockaddr_in)) {
645 		ret = -EINVAL;
646 		goto out;
647 	}
648 
649 	if (sin->sin_family != AF_INET_OFFLOAD) {
650 		ret = -EAFNOSUPPORT;
651 		goto out;
652 	}
653 
654 	if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
655 		ret = -EDESTADDRREQ;
656 		goto out;
657 	}
658 
659 	rs->rs_conn_addr = sin->sin_addr.s_addr;
660 	rs->rs_conn_port = sin->sin_port;
661 
662 	sk->sk_upcalls->su_connected(sk->sk_upper_handle, 0, NULL, -1);
663 
664 	RDSV3_DPRINTF4("rdsv3_connect", "Return(rs: %p)", rs);
665 
666 out:
667 	mutex_exit(&sk->sk_lock);
668 	return (ret);
669 }
670 
671 /*ARGSUSED*/
672 static int
673 rdsv3_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
674 {
675 	struct rsock *sk = (struct rsock *)proto_handle;
676 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
677 
678 	RDSV3_DPRINTF4("rdsv3_shutdown", "Enter(rs: %p)", rs);
679 
680 	return (0);
681 }
682 
683 /*ARGSUSED*/
684 void
685 rdsv3_activate(sock_lower_handle_t proto_handle,
686     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
687     int flags, cred_t *cr)
688 {
689 	struct rsock *sk = (struct rsock *)proto_handle;
690 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
691 
692 	RDSV3_DPRINTF4("rdsv3_activate", "Enter(rs: %p)", rs);
693 
694 	sk->sk_upcalls = sock_upcalls;
695 	sk->sk_upper_handle = sock_handle;
696 
697 	RDSV3_DPRINTF4("rdsv3_activate", "Return (rs: %p)", rs);
698 }
699 
700 
701 /* ARGSUSED */
702 int
703 rdsv3_send_uio(sock_lower_handle_t proto_handle, uio_t *uio,
704     struct nmsghdr *msg, cred_t *cr)
705 {
706 	struct rsock *sk = (struct rsock *)proto_handle;
707 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
708 	int ret;
709 
710 	RDSV3_DPRINTF4("rdsv3_send_uio", "Enter(rs: %p)", rs);
711 	ret = rdsv3_sendmsg(rs, uio, msg, uio->uio_resid);
712 
713 	RDSV3_DPRINTF4("rdsv3_send_uio", "Return(rs: %p ret %d)", rs, ret);
714 	if (ret < 0) {
715 		return (-ret);
716 	}
717 
718 	return (0);
719 }
720 
721 /* ARGSUSED */
722 int
723 rdsv3_recv_uio(sock_lower_handle_t proto_handle, uio_t *uio,
724     struct nmsghdr *msg, cred_t *cr)
725 {
726 	struct rsock *sk = (struct rsock *)proto_handle;
727 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
728 	int ret;
729 
730 	RDSV3_DPRINTF4("rdsv3_recv_uio", "Enter (rs: %p)", rs);
731 	ret = rdsv3_recvmsg(rs, uio, msg, uio->uio_resid, msg->msg_flags);
732 
733 	RDSV3_DPRINTF4("rdsv3_recv_uio", "Return(rs: %p ret %d)", rs, ret);
734 
735 	if (ret < 0) {
736 		return (-ret);
737 	}
738 
739 	return (0);
740 }
741 
742 /*ARGSUSED*/
743 int
744 rdsv3_getpeername(sock_lower_handle_t  proto_handle, struct sockaddr *addr,
745     socklen_t *addr_len, cred_t *cr)
746 {
747 	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
748 	struct rsock *sk = (struct rsock *)proto_handle;
749 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
750 
751 	RDSV3_DPRINTF2("rdsv3_getpeername", "enter(rs: %p)", rs);
752 
753 	(void) memset(sin->sin_zero, 0, sizeof (sin->sin_zero));
754 
755 	/* racey, don't care */
756 	if (!rs->rs_conn_addr)
757 		return (-ENOTCONN);
758 
759 	sin->sin_port = rs->rs_conn_port;
760 	sin->sin_addr.s_addr = rs->rs_conn_addr;
761 
762 	sin->sin_family = AF_INET_OFFLOAD;
763 
764 	*addr_len = sizeof (*sin);
765 	return (0);
766 }
767 
768 void
769 rdsv3_clrflowctrl(sock_lower_handle_t proto_handle)
770 {
771 	struct rsock *sk = (struct rsock *)proto_handle;
772 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
773 
774 	RDSV3_DPRINTF2("rdsv3_clrflowctrl", "enter(rs: %p)", rs);
775 }
776 
777 #ifndef __lock_lint
778 static struct sock_downcalls_s rdsv3_sock_downcalls = {
779 	.sd_close =		rdsv3_release,
780 	.sd_bind =		rdsv3_bind,
781 	.sd_connect =		rdsv3_connect,
782 	.sd_accept =		NULL,
783 	.sd_getsockname =	rdsv3_getname,
784 	.sd_poll =		rdsv3_poll,
785 	.sd_ioctl =		rdsv3_ioctl,
786 	.sd_listen =		NULL,
787 	.sd_shutdown =		rdsv3_shutdown,
788 	.sd_setsockopt =	rdsv3_setsockopt,
789 	.sd_getsockopt =	rdsv3_getsockopt,
790 	.sd_send_uio =		rdsv3_send_uio,
791 	.sd_recv_uio =		rdsv3_recv_uio,
792 	.sd_activate =		rdsv3_activate,
793 	.sd_getpeername =	rdsv3_getpeername,
794 	.sd_send =		NULL,
795 	.sd_clr_flowctrl =	NULL
796 };
797 #else
798 static struct sock_downcalls_s rdsv3_sock_downcalls = {
799 	rdsv3_activate,
800 	NULL,
801 	rdsv3_bind,
802 	NULL,
803 	rdsv3_connect,
804 	rdsv3_getpeername,
805 	rdsv3_getname,
806 	rdsv3_getsockopt,
807 	rdsv3_setsockopt,
808 	NULL,
809 	rdsv3_send_uio,
810 	rdsv3_recv_uio,
811 	rdsv3_poll,
812 	rdsv3_shutdown,
813 	NULL,
814 	rdsv3_ioctl,
815 	rdsv3_release
816 };
817 #endif
818 
819 sock_lower_handle_t
820 rdsv3_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
821     uint_t *smodep, int *errorp, int flags, cred_t *credp)
822 {
823 	struct rdsv3_sock	*rs;
824 	struct rsock		*sk;
825 
826 	RDSV3_DPRINTF4("rdsv3_create", "Enter (family: %d type: %d, proto: %d "
827 	    "flags: %d", family, type, proto, flags);
828 
829 	sk = rdsv3_sk_alloc();
830 	if (sk == NULL)
831 		return (NULL);
832 	rdsv3_sock_init_data(sk);
833 
834 	rs = rdsv3_sk_to_rs(sk);
835 	rs->rs_sk = sk;
836 	mutex_init(&rs->rs_lock, NULL, MUTEX_DRIVER, NULL);
837 	rw_init(&rs->rs_recv_lock, NULL, RW_DRIVER, NULL);
838 	list_create(&rs->rs_send_queue, sizeof (struct rdsv3_message),
839 	    offsetof(struct rdsv3_message, m_sock_item));
840 	list_create(&rs->rs_recv_queue, sizeof (struct rdsv3_incoming),
841 	    offsetof(struct rdsv3_incoming, i_item));
842 	list_create(&rs->rs_notify_queue, sizeof (struct rdsv3_notifier),
843 	    offsetof(struct rdsv3_notifier, n_list));
844 	mutex_init(&rs->rs_rdma_lock, NULL, MUTEX_DRIVER, NULL);
845 	avl_create(&rs->rs_rdma_keys, rdsv3_mr_compare,
846 	    sizeof (struct rdsv3_mr), offsetof(struct rdsv3_mr, r_rb_node));
847 	mutex_init(&rs->rs_conn_lock, NULL, MUTEX_DRIVER, NULL);
848 	mutex_init(&rs->rs_congested_lock, NULL, MUTEX_DRIVER, NULL);
849 	cv_init(&rs->rs_congested_cv, NULL, CV_DRIVER, NULL);
850 	rs->rs_cred = credp;
851 	rs->rs_zoneid = getzoneid();
852 	crhold(credp);
853 
854 	mutex_enter(&rdsv3_sock_lock);
855 	list_insert_tail(&rdsv3_sock_list, rs);
856 	rdsv3_sock_count++;
857 	/* Initialize RDMA/IB on the 1st socket if not done at attach */
858 	if (rdsv3_sock_count == 1) {
859 		rdsv3_rdma_init();
860 	}
861 	mutex_exit(&rdsv3_sock_lock);
862 
863 	*errorp = 0;
864 	*smodep = SM_ATOMIC;
865 	*sock_downcalls = &rdsv3_sock_downcalls;
866 
867 	RDSV3_DPRINTF4("rdsv3_create", "Return: %p", rs);
868 
869 	return ((sock_lower_handle_t)rdsv3_rs_to_sk(rs));
870 }
871 
/*
 * Take a hold on the socket underlying rs.
 */
void
rdsv3_sock_addref(struct rdsv3_sock *rs)
{
	struct rsock *sk;

	RDSV3_DPRINTF4("rdsv3_sock_addref", "Enter(rs: %p)", rs);

	sk = rdsv3_rs_to_sk(rs);
	rdsv3_sk_sock_hold(sk);
}
878 
/*
 * Drop a hold on the socket underlying rs.
 */
void
rdsv3_sock_put(struct rdsv3_sock *rs)
{
	struct rsock *sk;

	RDSV3_DPRINTF4("rdsv3_sock_put", "Enter(rs: %p)", rs);

	sk = rdsv3_rs_to_sk(rs);
	rdsv3_sk_sock_put(sk);
}
885 
886 static void
887 rdsv3_sock_inc_info(struct rsock *sock, unsigned int len,
888     struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
889 {
890 	struct rdsv3_sock *rs;
891 	struct rdsv3_incoming *inc;
892 	unsigned int total = 0;
893 
894 	RDSV3_DPRINTF4("rdsv3_sock_inc_info", "Enter(rs: %p)",
895 	    rdsv3_sk_to_rs(sock));
896 
897 	len /= sizeof (struct rdsv3_info_message);
898 
899 	mutex_enter(&rdsv3_sock_lock);
900 
901 	RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) {
902 		rw_enter(&rs->rs_recv_lock, RW_READER);
903 
904 		/* XXX too lazy to maintain counts.. */
905 		RDSV3_FOR_EACH_LIST_NODE(inc, &rs->rs_recv_queue, i_item) {
906 			total++;
907 			if (total <= len)
908 				rdsv3_inc_info_copy(inc, iter, inc->i_saddr,
909 				    rs->rs_bound_addr, 1);
910 		}
911 
912 		rw_exit(&rs->rs_recv_lock);
913 	}
914 
915 	mutex_exit(&rdsv3_sock_lock);
916 
917 	lens->nr = total;
918 	lens->each = sizeof (struct rdsv3_info_message);
919 
920 	RDSV3_DPRINTF4("rdsv3_sock_inc_info", "return(rs: %p)",
921 	    rdsv3_sk_to_rs(sock));
922 }
923 
924 static void
925 rdsv3_sock_info(struct rsock *sock, unsigned int len,
926     struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
927 {
928 	struct rdsv3_info_socket sinfo;
929 	struct rdsv3_sock *rs;
930 	unsigned long bytes;
931 
932 	RDSV3_DPRINTF4("rdsv3_sock_info", "Enter(rs: %p)",
933 	    rdsv3_sk_to_rs(sock));
934 
935 	len /= sizeof (struct rdsv3_info_socket);
936 
937 	mutex_enter(&rdsv3_sock_lock);
938 
939 	if ((len < rdsv3_sock_count) || (iter->addr == NULL))
940 		goto out;
941 
942 	bytes = sizeof (struct rdsv3_info_socket);
943 	RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) {
944 		sinfo.sndbuf = rdsv3_sk_sndbuf(rs);
945 		sinfo.rcvbuf = rdsv3_sk_rcvbuf(rs);
946 		sinfo.bound_addr = rs->rs_bound_addr;
947 		sinfo.connected_addr = rs->rs_conn_addr;
948 		sinfo.bound_port = rs->rs_bound_port;
949 		sinfo.connected_port = rs->rs_conn_port;
950 
951 		rdsv3_info_copy(iter, &sinfo, bytes);
952 	}
953 
954 	RDSV3_DPRINTF4("rdsv3_sock_info", "Return(rs: %p)",
955 	    rdsv3_sk_to_rs(sock));
956 
957 out:
958 	lens->nr = rdsv3_sock_count;
959 	lens->each = sizeof (struct rdsv3_info_socket);
960 
961 	mutex_exit(&rdsv3_sock_lock);
962 }
963 
964 rdsv3_delayed_work_t	*rdsv3_rdma_dwp = NULL;
965 uint_t			rdsv3_rdma_init_delay = 5; /* secs */
966 extern void rdsv3_rdma_init_worker(struct rdsv3_work_s *work);
967 
968 void
969 rdsv3_exit(void)
970 {
971 	RDSV3_DPRINTF4("rdsv3_exit", "Enter");
972 
973 	if (rdsv3_rdma_dwp) {
974 		rdsv3_cancel_delayed_work(rdsv3_rdma_dwp);
975 	}
976 
977 	(void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_rdma_exit,
978 	    NULL, DDI_SLEEP);
979 	while (rdsv3_rdma_listen_id != NULL) {
980 #ifndef __lock_lint
981 		RDSV3_DPRINTF5("rdsv3", "%s-%d Waiting for rdsv3_rdma_exit",
982 		    __func__, __LINE__);
983 #endif
984 		delay(drv_usectohz(1000));
985 	}
986 
987 	rdsv3_conn_exit();
988 	rdsv3_cong_exit();
989 	rdsv3_sysctl_exit();
990 	rdsv3_threads_exit();
991 	rdsv3_stats_exit();
992 	rdsv3_info_deregister_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info);
993 	rdsv3_info_deregister_func(RDSV3_INFO_RECV_MESSAGES,
994 	    rdsv3_sock_inc_info);
995 
996 	if (rdsv3_rdma_dwp) {
997 		kmem_free(rdsv3_rdma_dwp, sizeof (rdsv3_delayed_work_t));
998 		rdsv3_rdma_dwp = NULL;
999 	}
1000 
1001 	RDSV3_DPRINTF4("rdsv3_exit", "Return");
1002 }
1003 
1004 /*ARGSUSED*/
1005 int
1006 rdsv3_init()
1007 {
1008 	int ret;
1009 
1010 	RDSV3_DPRINTF4("rdsv3_init", "Enter");
1011 
1012 	rdsv3_cong_init();
1013 
1014 	ret = rdsv3_conn_init();
1015 	if (ret)
1016 		goto out;
1017 	ret = rdsv3_threads_init();
1018 	if (ret)
1019 		goto out_conn;
1020 	ret = rdsv3_sysctl_init();
1021 	if (ret)
1022 		goto out_threads;
1023 	ret = rdsv3_stats_init();
1024 	if (ret)
1025 		goto out_sysctl;
1026 
1027 	rdsv3_info_register_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info);
1028 	rdsv3_info_register_func(RDSV3_INFO_RECV_MESSAGES, rdsv3_sock_inc_info);
1029 
1030 	/* rdsv3_rdma_init need to be called with a little delay */
1031 	rdsv3_rdma_dwp = kmem_zalloc(sizeof (rdsv3_delayed_work_t), KM_SLEEP);
1032 	RDSV3_INIT_DELAYED_WORK(rdsv3_rdma_dwp, rdsv3_rdma_init_worker);
1033 	rdsv3_queue_delayed_work(rdsv3_wq, rdsv3_rdma_dwp,
1034 	    rdsv3_rdma_init_delay);
1035 
1036 	RDSV3_DPRINTF4("rdsv3_init", "Return");
1037 
1038 	goto out;
1039 
1040 out_stats:
1041 	rdsv3_stats_exit();
1042 out_sysctl:
1043 	rdsv3_sysctl_exit();
1044 out_threads:
1045 	rdsv3_threads_exit();
1046 out_conn:
1047 	rdsv3_conn_exit();
1048 	rdsv3_cong_exit();
1049 out:
1050 	return (ret);
1051 }
1052