1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/sysmacros.h>
31 #include <sys/debug.h>
32 #include <sys/cmn_err.h>
33 
34 #include <sys/stropts.h>
35 #include <sys/socket.h>
36 #include <sys/socketvar.h>
37 
38 #define	_SUN_TPI_VERSION	2
39 #include <sys/tihdr.h>
40 #include <sys/sockio.h>
41 #include <sys/kmem_impl.h>
42 
43 #include <sys/strsubr.h>
44 #include <sys/strsun.h>
45 #include <sys/ddi.h>
46 #include <netinet/in.h>
47 #include <inet/ip.h>
48 
49 #include <fs/sockfs/sockcommon.h>
50 
51 #include <sys/socket_proto.h>
52 
53 #include <fs/sockfs/socktpi_impl.h>
54 #include <fs/sockfs/sodirect.h>
55 #include <sys/tihdr.h>
56 #include <fs/sockfs/nl7c.h>
57 #include <inet/kssl/ksslapi.h>
58 
59 
60 extern int xnet_skip_checks;
61 extern int xnet_check_print;
62 
63 static void so_queue_oob(sock_upper_handle_t, mblk_t *, size_t);
64 
65 
/*
 * Placeholder accept operation.  Every argument is ignored and the call
 * always fails with EOPNOTSUPP.
 */
/*ARGSUSED*/
int
so_accept_notsupp(struct sonode *lso, int fflag, struct cred *cr,
    struct sonode **nsop)
{
	return (EOPNOTSUPP);
}
73 
/*
 * Placeholder listen operation.  Every argument is ignored and the call
 * always fails with EOPNOTSUPP.
 */
/*ARGSUSED*/
int
so_listen_notsupp(struct sonode *so, int backlog, struct cred *cr)
{
	return (EOPNOTSUPP);
}
80 
81 /*ARGSUSED*/
82 int
83 so_getsockname_notsupp(struct sonode *so, struct sockaddr *sa,
84     socklen_t *len, struct cred *cr)
85 {
86 	return (EOPNOTSUPP);
87 }
88 
89 /*ARGSUSED*/
90 int
91 so_getpeername_notsupp(struct sonode *so, struct sockaddr *addr,
92     socklen_t *addrlen, boolean_t accept, struct cred *cr)
93 {
94 	return (EOPNOTSUPP);
95 }
96 
/*
 * Placeholder shutdown operation.  Every argument is ignored and the call
 * always fails with EOPNOTSUPP.
 */
/*ARGSUSED*/
int
so_shutdown_notsupp(struct sonode *so, int how, struct cred *cr)
{
	return (EOPNOTSUPP);
}
103 
104 /*ARGSUSED*/
105 int
106 so_sendmblk_notsupp(struct sonode *so, struct msghdr *msg, int fflag,
107     struct cred *cr, mblk_t **mpp)
108 {
109 	return (EOPNOTSUPP);
110 }
111 
112 /*
113  * Generic Socket Ops
114  */
115 
/*
 * Generic init operation.  All of the work is done by
 * socket_init_common(); note that it takes the flags ahead of the
 * credentials, the reverse of this function's parameter order.
 *
 * Returns whatever socket_init_common() returns.
 */
/* ARGSUSED */
int
so_init(struct sonode *so, struct sonode *pso, struct cred *cr, int flags)
{
	int rval;

	rval = socket_init_common(so, pso, flags, cr);

	return (rval);
}
122 
123 int
124 so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
125     int flags, struct cred *cr)
126 {
127 	int error;
128 
129 	SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr));
130 
131 	ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD);
132 
133 	/* X/Open requires this check */
134 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
135 		if (xnet_check_print) {
136 			printf("sockfs: X/Open bind state check "
137 			    "caused EINVAL\n");
138 		}
139 		error = EINVAL;
140 		goto done;
141 	}
142 
143 	/*
144 	 * a bind to a NULL address is interpreted as unbind. So just
145 	 * do the downcall.
146 	 */
147 	if (name == NULL)
148 		goto dobind;
149 
150 	switch (so->so_family) {
151 	case AF_INET:
152 		if ((size_t)namelen != sizeof (sin_t)) {
153 			error = name->sa_family != so->so_family ?
154 			    EAFNOSUPPORT : EINVAL;
155 			eprintsoline(so, error);
156 			goto done;
157 		}
158 
159 		if ((flags & _SOBIND_XPG4_2) &&
160 		    (name->sa_family != so->so_family)) {
161 			/*
162 			 * This check has to be made for X/Open
163 			 * sockets however application failures have
164 			 * been observed when it is applied to
165 			 * all sockets.
166 			 */
167 			error = EAFNOSUPPORT;
168 			eprintsoline(so, error);
169 			goto done;
170 		}
171 		/*
172 		 * Force a zero sa_family to match so_family.
173 		 *
174 		 * Some programs like inetd(1M) don't set the
175 		 * family field. Other programs leave
176 		 * sin_family set to garbage - SunOS 4.X does
177 		 * not check the family field on a bind.
178 		 * We use the family field that
179 		 * was passed in to the socket() call.
180 		 */
181 		name->sa_family = so->so_family;
182 		break;
183 
184 	case AF_INET6: {
185 #ifdef DEBUG
186 		sin6_t *sin6 = (sin6_t *)name;
187 #endif
188 		if ((size_t)namelen != sizeof (sin6_t)) {
189 			error = name->sa_family != so->so_family ?
190 			    EAFNOSUPPORT : EINVAL;
191 			eprintsoline(so, error);
192 			goto done;
193 		}
194 
195 		if (name->sa_family != so->so_family) {
196 			/*
197 			 * With IPv6 we require the family to match
198 			 * unlike in IPv4.
199 			 */
200 			error = EAFNOSUPPORT;
201 			eprintsoline(so, error);
202 			goto done;
203 		}
204 #ifdef DEBUG
205 		/*
206 		 * Verify that apps don't forget to clear
207 		 * sin6_scope_id etc
208 		 */
209 		if (sin6->sin6_scope_id != 0 &&
210 		    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
211 			zcmn_err(getzoneid(), CE_WARN,
212 			    "bind with uninitialized sin6_scope_id "
213 			    "(%d) on socket. Pid = %d\n",
214 			    (int)sin6->sin6_scope_id,
215 			    (int)curproc->p_pid);
216 		}
217 		if (sin6->__sin6_src_id != 0) {
218 			zcmn_err(getzoneid(), CE_WARN,
219 			    "bind with uninitialized __sin6_src_id "
220 			    "(%d) on socket. Pid = %d\n",
221 			    (int)sin6->__sin6_src_id,
222 			    (int)curproc->p_pid);
223 		}
224 #endif /* DEBUG */
225 
226 		break;
227 	}
228 	default:
229 		/* Just pass the request to the protocol */
230 		goto dobind;
231 	}
232 
233 	/*
234 	 * First we check if either NCA or KSSL has been enabled for
235 	 * the requested address, and if so, we fall back to TPI.
236 	 * If neither of those two services are enabled, then we just
237 	 * pass the request to the protocol.
238 	 *
239 	 * Note that KSSL can only be enabled on a socket if NCA is NOT
240 	 * enabled for that socket, hence the else-statement below.
241 	 */
242 	if (nl7c_enabled && ((so->so_family == AF_INET ||
243 	    so->so_family == AF_INET6) &&
244 	    nl7c_lookup_addr(name, namelen) != NULL)) {
245 		/*
246 		 * NL7C is not supported in non-global zones,
247 		 * we enforce this restriction here.
248 		 */
249 		if (so->so_zoneid == GLOBAL_ZONEID) {
250 			/* NCA should be used, so fall back to TPI */
251 			error = so_tpi_fallback(so, cr);
252 			SO_UNBLOCK_FALLBACK(so);
253 			if (error)
254 				return (error);
255 			else
256 				return (SOP_BIND(so, name, namelen, flags, cr));
257 		}
258 	} else if (so->so_type == SOCK_STREAM) {
259 		/* Check if KSSL has been configured for this address */
260 		kssl_ent_t ent;
261 		kssl_endpt_type_t type;
262 		struct T_bind_req bind_req;
263 		mblk_t *mp;
264 
265 		/*
266 		 * TODO: Check with KSSL team if we could add a function call
267 		 * that only queries whether KSSL is enabled for the given
268 		 * address.
269 		 */
270 		bind_req.PRIM_type = T_BIND_REQ;
271 		bind_req.ADDR_length = namelen;
272 		bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
273 		mp = soallocproto2(&bind_req, sizeof (bind_req),
274 		    name, namelen, 0, _ALLOC_SLEEP, cr);
275 
276 		type = kssl_check_proxy(mp, so, &ent);
277 		freemsg(mp);
278 
279 		if (type != KSSL_NO_PROXY) {
280 			/*
281 			 * KSSL has been configured for this address, so
282 			 * we must fall back to TPI.
283 			 */
284 			kssl_release_ent(ent, so, type);
285 			error = so_tpi_fallback(so, cr);
286 			SO_UNBLOCK_FALLBACK(so);
287 			if (error)
288 				return (error);
289 			else
290 				return (SOP_BIND(so, name, namelen, flags, cr));
291 		}
292 	}
293 
294 dobind:
295 	error = (*so->so_downcalls->sd_bind)
296 	    (so->so_proto_handle, name, namelen, cr);
297 done:
298 	SO_UNBLOCK_FALLBACK(so);
299 
300 	return (error);
301 }
302 
303 int
304 so_listen(struct sonode *so, int backlog, struct cred *cr)
305 {
306 	int	error = 0;
307 
308 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
309 	SO_BLOCK_FALLBACK(so, SOP_LISTEN(so, backlog, cr));
310 
311 	error = (*so->so_downcalls->sd_listen)(so->so_proto_handle, backlog,
312 	    cr);
313 
314 	SO_UNBLOCK_FALLBACK(so);
315 
316 	return (error);
317 }
318 
319 
320 int
321 so_connect(struct sonode *so, const struct sockaddr *name,
322     socklen_t namelen, int fflag, int flags, struct cred *cr)
323 {
324 	int error = 0;
325 	sock_connid_t id;
326 
327 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
328 	SO_BLOCK_FALLBACK(so, SOP_CONNECT(so, name, namelen, fflag, flags, cr));
329 
330 	/*
331 	 * If there is a pending error, return error
332 	 * This can happen if a non blocking operation caused an error.
333 	 */
334 
335 	if (so->so_error != 0) {
336 		mutex_enter(&so->so_lock);
337 		error = sogeterr(so, B_TRUE);
338 		mutex_exit(&so->so_lock);
339 		if (error != 0)
340 			goto done;
341 	}
342 
343 	error = (*so->so_downcalls->sd_connect)(so->so_proto_handle,
344 	    name, namelen, &id, cr);
345 
346 	if (error == EINPROGRESS)
347 		error = so_wait_connected(so, fflag & (FNONBLOCK|FNDELAY), id);
348 
349 done:
350 	SO_UNBLOCK_FALLBACK(so);
351 	return (error);
352 }
353 
354 /*ARGSUSED*/
355 int
356 so_accept(struct sonode *so, int fflag, struct cred *cr, struct sonode **nsop)
357 {
358 	int error = 0;
359 	struct sonode *nso;
360 
361 	*nsop = NULL;
362 
363 	SO_BLOCK_FALLBACK(so, SOP_ACCEPT(so, fflag, cr, nsop));
364 	if ((so->so_state & SS_ACCEPTCONN) == 0) {
365 		SO_UNBLOCK_FALLBACK(so);
366 		return ((so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW) ?
367 		    EOPNOTSUPP : EINVAL);
368 	}
369 
370 	if ((error = so_acceptq_dequeue(so, (fflag & (FNONBLOCK|FNDELAY)),
371 	    &nso)) == 0) {
372 		ASSERT(nso != NULL);
373 
374 		/* finish the accept */
375 		error = (*so->so_downcalls->sd_accept)(so->so_proto_handle,
376 		    nso->so_proto_handle, (sock_upper_handle_t)nso, cr);
377 		if (error != 0) {
378 			(void) socket_close(nso, 0, cr);
379 			socket_destroy(nso);
380 		} else {
381 			*nsop = nso;
382 		}
383 	}
384 
385 	SO_UNBLOCK_FALLBACK(so);
386 	return (error);
387 }
388 
389 int
390 so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
391     struct cred *cr)
392 {
393 	int error, flags;
394 	boolean_t dontblock;
395 	ssize_t orig_resid;
396 	mblk_t  *mp;
397 
398 	SO_BLOCK_FALLBACK(so, SOP_SENDMSG(so, msg, uiop, cr));
399 
400 	flags = msg->msg_flags;
401 	error = 0;
402 	dontblock = (flags & MSG_DONTWAIT) ||
403 	    (uiop->uio_fmode & (FNONBLOCK|FNDELAY));
404 
405 	if (!(flags & MSG_XPG4_2) && msg->msg_controllen != 0) {
406 		/*
407 		 * Old way of passing fd's is not supported
408 		 */
409 		SO_UNBLOCK_FALLBACK(so);
410 		return (EOPNOTSUPP);
411 	}
412 
413 	if ((so->so_mode & SM_ATOMIC) &&
414 	    uiop->uio_resid > so->so_proto_props.sopp_maxpsz &&
415 	    so->so_proto_props.sopp_maxpsz != -1) {
416 		SO_UNBLOCK_FALLBACK(so);
417 		return (EMSGSIZE);
418 	}
419 
420 	/*
421 	 * For atomic sends we will only do one iteration.
422 	 */
423 	do {
424 		if (so->so_state & SS_CANTSENDMORE) {
425 			error = EPIPE;
426 			break;
427 		}
428 
429 		if (so->so_error != 0) {
430 			mutex_enter(&so->so_lock);
431 			error = sogeterr(so, B_TRUE);
432 			mutex_exit(&so->so_lock);
433 			if (error != 0)
434 				break;
435 		}
436 
437 		/*
438 		 * Send down OOB messages even if the send path is being
439 		 * flow controlled (assuming the protocol supports OOB data).
440 		 */
441 		if (flags & MSG_OOB) {
442 			if ((so->so_mode & SM_EXDATA) == 0) {
443 				error = EOPNOTSUPP;
444 				break;
445 			}
446 		} else if (so->so_snd_qfull) {
447 			/*
448 			 * Need to wait until the protocol is ready to receive
449 			 * more data for transmission.
450 			 */
451 			if ((error = so_snd_wait_qnotfull(so, dontblock)) != 0)
452 				break;
453 		}
454 
455 		/*
456 		 * Time to send data to the protocol. We either copy the
457 		 * data into mblks or pass the uio directly to the protocol.
458 		 * We decide what to do based on the available down calls.
459 		 */
460 		if (so->so_downcalls->sd_send_uio != NULL) {
461 			error = (*so->so_downcalls->sd_send_uio)
462 			    (so->so_proto_handle, uiop, msg, cr);
463 			if (error != 0)
464 				break;
465 		} else {
466 			/* save the resid in case of failure */
467 			orig_resid = uiop->uio_resid;
468 
469 			if ((mp = socopyinuio(uiop,
470 			    so->so_proto_props.sopp_maxpsz,
471 			    so->so_proto_props.sopp_wroff,
472 			    so->so_proto_props.sopp_maxblk,
473 			    so->so_proto_props.sopp_tail, &error,
474 			    cr)) == NULL) {
475 				break;
476 			}
477 			ASSERT(uiop->uio_resid >= 0);
478 
479 			error = (*so->so_downcalls->sd_send)
480 			    (so->so_proto_handle, mp, msg, cr);
481 			if (error != 0) {
482 				/*
483 				 * The send failed. We do not have to free the
484 				 * mblks, because that is the protocol's
485 				 * responsibility. However, uio_resid must
486 				 * remain accurate, so adjust that here.
487 				 */
488 				uiop->uio_resid = orig_resid;
489 					break;
490 			}
491 		}
492 	} while (uiop->uio_resid > 0);
493 
494 	SO_UNBLOCK_FALLBACK(so);
495 
496 	return (error);
497 }
498 
499 int
500 so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
501     struct cred *cr, mblk_t **mpp)
502 {
503 	int error;
504 	boolean_t dontblock;
505 	size_t size;
506 	mblk_t *mp = *mpp;
507 
508 	SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp));
509 
510 	error = 0;
511 	dontblock = (msg->msg_flags & MSG_DONTWAIT) ||
512 	    (fflag & (FNONBLOCK|FNDELAY));
513 	size = msgdsize(mp);
514 
515 	if ((so->so_mode & SM_SENDFILESUPP) == 0 ||
516 	    so->so_downcalls->sd_send == NULL) {
517 		SO_UNBLOCK_FALLBACK(so);
518 		return (EOPNOTSUPP);
519 	}
520 
521 	if ((so->so_mode & SM_ATOMIC) &&
522 	    size > so->so_proto_props.sopp_maxpsz &&
523 	    so->so_proto_props.sopp_maxpsz != -1) {
524 		SO_UNBLOCK_FALLBACK(so);
525 		return (EMSGSIZE);
526 	}
527 
528 	while (mp != NULL) {
529 		mblk_t *nmp, *last_mblk;
530 		size_t mlen;
531 
532 		if (so->so_state & SS_CANTSENDMORE) {
533 			error = EPIPE;
534 			break;
535 		}
536 		if (so->so_error != 0) {
537 			mutex_enter(&so->so_lock);
538 			error = sogeterr(so, B_TRUE);
539 			mutex_exit(&so->so_lock);
540 			if (error != 0)
541 				break;
542 		}
543 		if (so->so_snd_qfull) {
544 			/*
545 			 * Need to wait until the protocol is ready to receive
546 			 * more data for transmission.
547 			 */
548 			if ((error = so_snd_wait_qnotfull(so, dontblock)) != 0)
549 				break;
550 		}
551 
552 		/*
553 		 * We only allow so_maxpsz of data to be sent down to
554 		 * the protocol at time.
555 		 */
556 		mlen = MBLKL(mp);
557 		nmp = mp->b_cont;
558 		last_mblk = mp;
559 		while (nmp != NULL) {
560 			mlen += MBLKL(nmp);
561 			if (mlen > so->so_proto_props.sopp_maxpsz) {
562 				last_mblk->b_cont = NULL;
563 				break;
564 			}
565 			last_mblk = nmp;
566 			nmp = nmp->b_cont;
567 		}
568 
569 		error = (*so->so_downcalls->sd_send)
570 		    (so->so_proto_handle, mp, msg, cr);
571 		if (error != 0) {
572 			/*
573 			 * The send failed. The protocol will free the mblks
574 			 * that were sent down. Let the caller deal with the
575 			 * rest.
576 			 */
577 			*mpp = nmp;
578 			break;
579 		}
580 
581 		*mpp = mp = nmp;
582 	}
583 
584 	SO_UNBLOCK_FALLBACK(so);
585 
586 	return (error);
587 }
588 
589 int
590 so_shutdown(struct sonode *so, int how, struct cred *cr)
591 {
592 	int error;
593 
594 	SO_BLOCK_FALLBACK(so, SOP_SHUTDOWN(so, how, cr));
595 
596 	/*
597 	 * SunOS 4.X has no check for datagram sockets.
598 	 * 5.X checks that it is connected (ENOTCONN)
599 	 * X/Open requires that we check the connected state.
600 	 */
601 	if (!(so->so_state & SS_ISCONNECTED)) {
602 		if (!xnet_skip_checks) {
603 			error = ENOTCONN;
604 			if (xnet_check_print) {
605 				printf("sockfs: X/Open shutdown check "
606 				    "caused ENOTCONN\n");
607 			}
608 		}
609 		goto done;
610 	}
611 
612 	error = ((*so->so_downcalls->sd_shutdown)(so->so_proto_handle,
613 	    how, cr));
614 
615 	/*
616 	 * Protocol agreed to shutdown. We need to flush the
617 	 * receive buffer if the receive side is being shutdown.
618 	 */
619 	if (error == 0 && how != SHUT_WR) {
620 		mutex_enter(&so->so_lock);
621 		/* wait for active reader to finish */
622 		(void) so_lock_read(so, 0);
623 
624 		so_rcv_flush(so);
625 
626 		so_unlock_read(so);
627 		mutex_exit(&so->so_lock);
628 	}
629 
630 done:
631 	SO_UNBLOCK_FALLBACK(so);
632 	return (error);
633 }
634 
635 int
636 so_getsockname(struct sonode *so, struct sockaddr *addr,
637     socklen_t *addrlen, struct cred *cr)
638 {
639 	int error;
640 
641 	SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr));
642 
643 	error = (*so->so_downcalls->sd_getsockname)
644 	    (so->so_proto_handle, addr, addrlen, cr);
645 
646 	SO_UNBLOCK_FALLBACK(so);
647 	return (error);
648 }
649 
650 int
651 so_getpeername(struct sonode *so, struct sockaddr *addr,
652     socklen_t *addrlen, boolean_t accept, struct cred *cr)
653 {
654 	int error;
655 
656 	SO_BLOCK_FALLBACK(so, SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
657 
658 	if (accept) {
659 		error = (*so->so_downcalls->sd_getpeername)
660 		    (so->so_proto_handle, addr, addrlen, cr);
661 	} else if (!(so->so_state & SS_ISCONNECTED)) {
662 		error = ENOTCONN;
663 	} else if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
664 		/* Added this check for X/Open */
665 		error = EINVAL;
666 		if (xnet_check_print) {
667 			printf("sockfs: X/Open getpeername check => EINVAL\n");
668 		}
669 	} else {
670 		error = (*so->so_downcalls->sd_getpeername)
671 		    (so->so_proto_handle, addr, addrlen, cr);
672 	}
673 
674 	SO_UNBLOCK_FALLBACK(so);
675 	return (error);
676 }
677 
678 int
679 so_getsockopt(struct sonode *so, int level, int option_name,
680     void *optval, socklen_t *optlenp, int flags, struct cred *cr)
681 {
682 	int error = 0;
683 
684 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
685 	SO_BLOCK_FALLBACK(so,
686 	    SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr));
687 
688 	error = socket_getopt_common(so, level, option_name, optval, optlenp,
689 	    flags);
690 	if (error < 0) {
691 		error = (*so->so_downcalls->sd_getsockopt)
692 		    (so->so_proto_handle, level, option_name, optval, optlenp,
693 		    cr);
694 		if (error ==  ENOPROTOOPT) {
695 			if (level == SOL_SOCKET) {
696 				/*
697 				 * If a protocol does not support a particular
698 				 * socket option, set can fail (not allowed)
699 				 * but get can not fail. This is the previous
700 				 * sockfs bahvior.
701 				 */
702 				switch (option_name) {
703 				case SO_LINGER:
704 					if (*optlenp < (t_uscalar_t)
705 					    sizeof (struct linger)) {
706 						error = EINVAL;
707 						break;
708 					}
709 					error = 0;
710 					bzero(optval, sizeof (struct linger));
711 					*optlenp = sizeof (struct linger);
712 					break;
713 				case SO_RCVTIMEO:
714 				case SO_SNDTIMEO:
715 					if (*optlenp < (t_uscalar_t)
716 					    sizeof (struct timeval)) {
717 						error = EINVAL;
718 						break;
719 					}
720 					error = 0;
721 					bzero(optval, sizeof (struct timeval));
722 					*optlenp = sizeof (struct timeval);
723 					break;
724 				case SO_SND_BUFINFO:
725 					if (*optlenp < (t_uscalar_t)
726 					    sizeof (struct so_snd_bufinfo)) {
727 						error = EINVAL;
728 						break;
729 					}
730 					error = 0;
731 					bzero(optval,
732 					    sizeof (struct so_snd_bufinfo));
733 					*optlenp =
734 					    sizeof (struct so_snd_bufinfo);
735 					break;
736 				case SO_DEBUG:
737 				case SO_REUSEADDR:
738 				case SO_KEEPALIVE:
739 				case SO_DONTROUTE:
740 				case SO_BROADCAST:
741 				case SO_USELOOPBACK:
742 				case SO_OOBINLINE:
743 				case SO_DGRAM_ERRIND:
744 				case SO_SNDBUF:
745 				case SO_RCVBUF:
746 					error = 0;
747 					*((int32_t *)optval) = 0;
748 					*optlenp = sizeof (int32_t);
749 					break;
750 				default:
751 					break;
752 				}
753 			}
754 		}
755 	}
756 
757 	SO_UNBLOCK_FALLBACK(so);
758 	return (error);
759 }
760 
761 int
762 so_setsockopt(struct sonode *so, int level, int option_name,
763     const void *optval, socklen_t optlen, struct cred *cr)
764 {
765 	int error = 0;
766 	struct timeval tl;
767 	const void *opt = optval;
768 
769 	SO_BLOCK_FALLBACK(so,
770 	    SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
771 
772 	/* X/Open requires this check */
773 	if (so->so_state & SS_CANTSENDMORE && !xnet_skip_checks) {
774 		SO_UNBLOCK_FALLBACK(so);
775 		if (xnet_check_print)
776 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
777 		return (EINVAL);
778 	}
779 
780 	if (level == SOL_SOCKET) {
781 		switch (option_name) {
782 		case SO_RCVTIMEO:
783 		case SO_SNDTIMEO: {
784 			/*
785 			 * We pass down these two options to protocol in order
786 			 * to support some third part protocols which need to
787 			 * know them. For those protocols which don't care
788 			 * these two options, simply return 0.
789 			 */
790 			clock_t t_usec;
791 
792 			if (get_udatamodel() == DATAMODEL_NONE ||
793 			    get_udatamodel() == DATAMODEL_NATIVE) {
794 				if (optlen != sizeof (struct timeval)) {
795 					error = EINVAL;
796 					goto done;
797 				}
798 				bcopy((struct timeval *)optval, &tl,
799 				    sizeof (struct timeval));
800 			} else {
801 				if (optlen != sizeof (struct timeval32)) {
802 					error = EINVAL;
803 					goto done;
804 				}
805 				TIMEVAL32_TO_TIMEVAL(&tl,
806 				    (struct timeval32 *)optval);
807 			}
808 			opt = &tl;
809 			optlen = sizeof (tl);
810 			t_usec = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
811 			mutex_enter(&so->so_lock);
812 			if (option_name == SO_RCVTIMEO)
813 				so->so_rcvtimeo = drv_usectohz(t_usec);
814 			else
815 				so->so_sndtimeo = drv_usectohz(t_usec);
816 			mutex_exit(&so->so_lock);
817 			break;
818 		}
819 		case SO_RCVBUF:
820 			/*
821 			 * XXX XPG 4.2 applications retrieve SO_RCVBUF from
822 			 * sockfs since the transport might adjust the value
823 			 * and not return exactly what was set by the
824 			 * application.
825 			 */
826 			so->so_xpg_rcvbuf = *(int32_t *)optval;
827 			break;
828 		}
829 	}
830 	error = (*so->so_downcalls->sd_setsockopt)
831 	    (so->so_proto_handle, level, option_name, opt, optlen, cr);
832 done:
833 	SO_UNBLOCK_FALLBACK(so);
834 	return (error);
835 }
836 
837 int
838 so_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
839     struct cred *cr, int32_t *rvalp)
840 {
841 	int error = 0;
842 
843 	SO_BLOCK_FALLBACK(so, SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
844 
845 	/*
846 	 * If there is a pending error, return error
847 	 * This can happen if a non blocking operation caused an error.
848 	 */
849 	if (so->so_error != 0) {
850 		mutex_enter(&so->so_lock);
851 		error = sogeterr(so, B_TRUE);
852 		mutex_exit(&so->so_lock);
853 		if (error != 0)
854 			goto done;
855 	}
856 
857 	/*
858 	 * calling strioc can result in the socket falling back to TPI,
859 	 * if that is supported.
860 	 */
861 	if ((error = socket_ioctl_common(so, cmd, arg, mode, cr, rvalp)) < 0 &&
862 	    (error = socket_strioc_common(so, cmd, arg, mode, cr, rvalp)) < 0) {
863 		error = (*so->so_downcalls->sd_ioctl)(so->so_proto_handle,
864 		    cmd, arg, mode, rvalp, cr);
865 	}
866 
867 done:
868 	SO_UNBLOCK_FALLBACK(so);
869 
870 	return (error);
871 }
872 
873 int
874 so_poll(struct sonode *so, short events, int anyyet, short *reventsp,
875     struct pollhead **phpp)
876 {
877 	int state = so->so_state;
878 	*reventsp = 0;
879 
880 	/*
881 	 * In sockets the errors are represented as input/output events
882 	 */
883 	if (so->so_error != 0 &&
884 	    ((POLLIN|POLLRDNORM|POLLOUT) & events) != 0) {
885 		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & events;
886 		return (0);
887 	}
888 
889 	/*
890 	 * If the socket is in a state where it can send data
891 	 * turn on POLLWRBAND and POLLOUT events.
892 	 */
893 	if ((so->so_mode & SM_CONNREQUIRED) == 0 || (state & SS_ISCONNECTED)) {
894 		/*
895 		 * out of band data is allowed even if the connection
896 		 * is flow controlled
897 		 */
898 		*reventsp |= POLLWRBAND & events;
899 		if (!so->so_snd_qfull) {
900 			/*
901 			 * As long as there is buffer to send data
902 			 * turn on POLLOUT events
903 			 */
904 			*reventsp |= POLLOUT & events;
905 		}
906 	}
907 
908 	/*
909 	 * Turn on POLLIN whenever there is data on the receive queue,
910 	 * or the socket is in a state where no more data will be received.
911 	 * Also, if the socket is accepting connections, flip the bit if
912 	 * there is something on the queue.
913 	 *
914 	 * We do an initial check for events without holding locks. However,
915 	 * if there are no event available, then we redo the check for POLLIN
916 	 * events under the lock.
917 	 */
918 
919 	/* Pending connections */
920 	if (so->so_acceptq_len > 0)
921 		*reventsp |= (POLLIN|POLLRDNORM) & events;
922 
923 	/* Data */
924 	/* so_downcalls is null for sctp */
925 	if (so->so_downcalls != NULL && so->so_downcalls->sd_poll != NULL) {
926 		*reventsp |= (*so->so_downcalls->sd_poll)
927 		    (so->so_proto_handle, events & SO_PROTO_POLLEV, anyyet,
928 		    CRED()) & events;
929 		ASSERT((*reventsp & ~events) == 0);
930 		/* do not recheck events */
931 		events &= ~SO_PROTO_POLLEV;
932 	} else {
933 		if (SO_HAVE_DATA(so))
934 			*reventsp |= (POLLIN|POLLRDNORM) & events;
935 
936 		/* Urgent data */
937 		if ((state & SS_OOBPEND) != 0) {
938 			*reventsp |= (POLLRDBAND | POLLPRI) & events;
939 		}
940 	}
941 
942 	if (!*reventsp && !anyyet) {
943 		/* Check for read events again, but this time under lock */
944 		if (events & (POLLIN|POLLRDNORM)) {
945 			mutex_enter(&so->so_lock);
946 			if (SO_HAVE_DATA(so) || so->so_acceptq_len > 0) {
947 				mutex_exit(&so->so_lock);
948 				*reventsp |= (POLLIN|POLLRDNORM) & events;
949 				return (0);
950 			} else {
951 				so->so_pollev |= SO_POLLEV_IN;
952 				mutex_exit(&so->so_lock);
953 			}
954 		}
955 		*phpp = &so->so_poll_list;
956 	}
957 	return (0);
958 }
959 
960 /*
961  * Generic Upcalls
962  */
963 void
964 so_connected(sock_upper_handle_t sock_handle, sock_connid_t id,
965     cred_t *peer_cred, pid_t peer_cpid)
966 {
967 	struct sonode *so = (struct sonode *)sock_handle;
968 
969 	mutex_enter(&so->so_lock);
970 	ASSERT(so->so_proto_handle != NULL);
971 
972 	if (peer_cred != NULL) {
973 		if (so->so_peercred != NULL)
974 			crfree(so->so_peercred);
975 		crhold(peer_cred);
976 		so->so_peercred = peer_cred;
977 		so->so_cpid = peer_cpid;
978 	}
979 
980 	so->so_proto_connid = id;
981 	soisconnected(so);
982 	/*
983 	 * Wake ones who're waiting for conn to become established.
984 	 */
985 	so_notify_connected(so);
986 }
987 
988 int
989 so_disconnected(sock_upper_handle_t sock_handle, sock_connid_t id, int error)
990 {
991 	struct sonode *so = (struct sonode *)sock_handle;
992 
993 	mutex_enter(&so->so_lock);
994 
995 	so->so_proto_connid = id;
996 	soisdisconnected(so, error);
997 	so_notify_disconnected(so, error);
998 
999 	return (0);
1000 }
1001 
1002 void
1003 so_opctl(sock_upper_handle_t sock_handle, sock_opctl_action_t action,
1004     uintptr_t arg)
1005 {
1006 	struct sonode *so = (struct sonode *)sock_handle;
1007 
1008 	switch (action) {
1009 	case SOCK_OPCTL_SHUT_SEND:
1010 		mutex_enter(&so->so_lock);
1011 		socantsendmore(so);
1012 		so_notify_disconnecting(so);
1013 		break;
1014 	case SOCK_OPCTL_SHUT_RECV: {
1015 		mutex_enter(&so->so_lock);
1016 		socantrcvmore(so);
1017 		so_notify_eof(so);
1018 		break;
1019 	}
1020 	case SOCK_OPCTL_ENAB_ACCEPT:
1021 		mutex_enter(&so->so_lock);
1022 		so->so_state |= SS_ACCEPTCONN;
1023 		so->so_backlog = (unsigned int)arg;
1024 		mutex_exit(&so->so_lock);
1025 		break;
1026 	default:
1027 		ASSERT(0);
1028 		break;
1029 	}
1030 }
1031 
1032 void
1033 so_txq_full(sock_upper_handle_t sock_handle, boolean_t qfull)
1034 {
1035 	struct sonode *so = (struct sonode *)sock_handle;
1036 
1037 	if (qfull) {
1038 		so_snd_qfull(so);
1039 	} else {
1040 		so_snd_qnotfull(so);
1041 		mutex_enter(&so->so_lock);
1042 		so_notify_writable(so);
1043 	}
1044 }
1045 
1046 sock_upper_handle_t
1047 so_newconn(sock_upper_handle_t parenthandle,
1048     sock_lower_handle_t proto_handle, sock_downcalls_t *sock_downcalls,
1049     struct cred *peer_cred, pid_t peer_cpid, sock_upcalls_t **sock_upcallsp)
1050 {
1051 	struct sonode	*so = (struct sonode *)parenthandle;
1052 	struct sonode	*nso;
1053 	int error;
1054 
1055 	ASSERT(proto_handle != NULL);
1056 
1057 	if ((so->so_state & SS_ACCEPTCONN) == 0 ||
1058 	    so->so_acceptq_len >= so->so_backlog)
1059 		return (NULL);
1060 
1061 	nso = socket_newconn(so, proto_handle, sock_downcalls, SOCKET_NOSLEEP,
1062 	    &error);
1063 	if (nso == NULL)
1064 		return (NULL);
1065 
1066 	if (peer_cred != NULL) {
1067 		crhold(peer_cred);
1068 		nso->so_peercred = peer_cred;
1069 		nso->so_cpid = peer_cpid;
1070 	}
1071 
1072 	/*
1073 	 * The new socket (nso), proto_handle and sock_upcallsp are all
1074 	 * valid at this point. But as soon as nso is placed in the accept
1075 	 * queue that can no longer be assumed (since an accept() thread may
1076 	 * pull it off the queue and close the socket).
1077 	 */
1078 	*sock_upcallsp = &so_upcalls;
1079 
1080 	(void) so_acceptq_enqueue(so, nso);
1081 
1082 	mutex_enter(&so->so_lock);
1083 	so_notify_newconn(so);
1084 
1085 	return ((sock_upper_handle_t)nso);
1086 }
1087 
1088 void
1089 so_set_prop(sock_upper_handle_t sock_handle, struct sock_proto_props *soppp)
1090 {
1091 	struct sonode *so;
1092 
1093 	so = (struct sonode *)sock_handle;
1094 
1095 	mutex_enter(&so->so_lock);
1096 
1097 	if (soppp->sopp_flags & SOCKOPT_MAXBLK)
1098 		so->so_proto_props.sopp_maxblk = soppp->sopp_maxblk;
1099 	if (soppp->sopp_flags & SOCKOPT_WROFF)
1100 		so->so_proto_props.sopp_wroff = soppp->sopp_wroff;
1101 	if (soppp->sopp_flags & SOCKOPT_TAIL)
1102 		so->so_proto_props.sopp_tail = soppp->sopp_tail;
1103 	if (soppp->sopp_flags & SOCKOPT_RCVHIWAT)
1104 		so->so_proto_props.sopp_rxhiwat = soppp->sopp_rxhiwat;
1105 	if (soppp->sopp_flags & SOCKOPT_RCVLOWAT)
1106 		so->so_proto_props.sopp_rxlowat = soppp->sopp_rxlowat;
1107 	if (soppp->sopp_flags & SOCKOPT_MAXPSZ)
1108 		so->so_proto_props.sopp_maxpsz = soppp->sopp_maxpsz;
1109 	if (soppp->sopp_flags & SOCKOPT_MINPSZ)
1110 		so->so_proto_props.sopp_minpsz = soppp->sopp_minpsz;
1111 	if (soppp->sopp_flags & SOCKOPT_ZCOPY) {
1112 		if (soppp->sopp_zcopyflag & ZCVMSAFE) {
1113 			so->so_proto_props.sopp_zcopyflag |= STZCVMSAFE;
1114 			so->so_proto_props.sopp_zcopyflag &= ~STZCVMUNSAFE;
1115 		} else if (soppp->sopp_zcopyflag & ZCVMUNSAFE) {
1116 			so->so_proto_props.sopp_zcopyflag |= STZCVMUNSAFE;
1117 			so->so_proto_props.sopp_zcopyflag &= ~STZCVMSAFE;
1118 		}
1119 
1120 		if (soppp->sopp_zcopyflag & COPYCACHED) {
1121 			so->so_proto_props.sopp_zcopyflag |= STRCOPYCACHED;
1122 		}
1123 	}
1124 	if (soppp->sopp_flags & SOCKOPT_OOBINLINE)
1125 		so->so_proto_props.sopp_oobinline = soppp->sopp_oobinline;
1126 	if (soppp->sopp_flags & SOCKOPT_RCVTIMER)
1127 		so->so_proto_props.sopp_rcvtimer = soppp->sopp_rcvtimer;
1128 	if (soppp->sopp_flags & SOCKOPT_RCVTHRESH)
1129 		so->so_proto_props.sopp_rcvthresh = soppp->sopp_rcvthresh;
1130 	if (soppp->sopp_flags & SOCKOPT_MAXADDRLEN)
1131 		so->so_proto_props.sopp_maxaddrlen = soppp->sopp_maxaddrlen;
1132 	if (soppp->sopp_flags & SOCKOPT_LOOPBACK)
1133 		so->so_proto_props.sopp_loopback = soppp->sopp_loopback;
1134 
1135 	mutex_exit(&so->so_lock);
1136 
1137 #ifdef DEBUG
1138 	soppp->sopp_flags &= ~(SOCKOPT_MAXBLK | SOCKOPT_WROFF | SOCKOPT_TAIL |
1139 	    SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXPSZ |
1140 	    SOCKOPT_ZCOPY | SOCKOPT_OOBINLINE | SOCKOPT_RCVTIMER |
1141 	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ |
1142 	    SOCKOPT_LOOPBACK);
1143 	ASSERT(soppp->sopp_flags == 0);
1144 #endif
1145 }
1146 
1147 /* ARGSUSED */
1148 ssize_t
1149 so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp,
1150     size_t msg_size, int flags, int *errorp,  boolean_t *force_pushp)
1151 {
1152 	struct sonode *so = (struct sonode *)sock_handle;
1153 	boolean_t force_push = B_TRUE;
1154 	int space_left;
1155 	sodirect_t *sodp = so->so_direct;
1156 
1157 	ASSERT(errorp != NULL);
1158 	*errorp = 0;
1159 	if (mp == NULL) {
1160 		if (msg_size > 0) {
1161 			ASSERT(so->so_downcalls->sd_recv_uio != NULL);
1162 			mutex_enter(&so->so_lock);
1163 			/* the notify functions will drop the lock */
1164 			if (flags & MSG_OOB)
1165 				so_notify_oobdata(so, IS_SO_OOB_INLINE(so));
1166 			else
1167 				so_notify_data(so, msg_size);
1168 			return (0);
1169 		}
1170 		/*
1171 		 * recv space check
1172 		 */
1173 		mutex_enter(&so->so_lock);
1174 		space_left = so->so_rcvbuf - so->so_rcv_queued;
1175 		if (space_left <= 0) {
1176 			so->so_flowctrld = B_TRUE;
1177 			*errorp = ENOSPC;
1178 			space_left = -1;
1179 		}
1180 		goto done_unlock;
1181 	}
1182 
1183 	ASSERT(mp->b_next == NULL);
1184 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO);
1185 	ASSERT(msg_size == msgdsize(mp));
1186 
1187 	if (flags & MSG_OOB) {
1188 		so_queue_oob(sock_handle, mp, msg_size);
1189 		return (0);
1190 	}
1191 
1192 	if (force_pushp != NULL)
1193 		force_push = *force_pushp;
1194 
1195 	if (DB_TYPE(mp) == M_PROTO && !__TPI_PRIM_ISALIGNED(mp->b_rptr)) {
1196 		/* The read pointer is not aligned correctly for TPI */
1197 		zcmn_err(getzoneid(), CE_WARN,
1198 		    "sockfs: Unaligned TPI message received. rptr = %p\n",
1199 		    (void *)mp->b_rptr);
1200 		freemsg(mp);
1201 		mutex_enter(&so->so_lock);
1202 		if (sodp != NULL)
1203 			SOD_UIOAFINI(sodp);
1204 		mutex_exit(&so->so_lock);
1205 
1206 		return (so->so_rcvbuf - so->so_rcv_queued);
1207 	}
1208 
1209 	mutex_enter(&so->so_lock);
1210 	if (so->so_state & (SS_FALLBACK_DRAIN | SS_FALLBACK_COMP)) {
1211 		if (sodp != NULL)
1212 			SOD_DISABLE(sodp);
1213 		mutex_exit(&so->so_lock);
1214 		*errorp = EOPNOTSUPP;
1215 		return (-1);
1216 	}
1217 	if (so->so_state & SS_CANTRCVMORE) {
1218 		freemsg(mp);
1219 		if (sodp != NULL)
1220 			SOD_DISABLE(sodp);
1221 		mutex_exit(&so->so_lock);
1222 		return (0);
1223 	}
1224 
1225 	/* process the mblk via I/OAT if capable */
1226 	if (sodp != NULL && sodp->sod_enabled) {
1227 		if (DB_TYPE(mp) == M_DATA) {
1228 			sod_uioa_mblk_init(sodp, mp, msg_size);
1229 		} else {
1230 			SOD_UIOAFINI(sodp);
1231 		}
1232 	}
1233 
1234 	if (mp->b_next == NULL) {
1235 		so_enqueue_msg(so, mp, msg_size);
1236 	} else {
1237 		do {
1238 			mblk_t *nmp;
1239 
1240 			if ((nmp = mp->b_next) != NULL) {
1241 				mp->b_next = NULL;
1242 			}
1243 			so_enqueue_msg(so, mp, msgdsize(mp));
1244 			mp = nmp;
1245 		} while (mp != NULL);
1246 	}
1247 
1248 	space_left = so->so_rcvbuf - so->so_rcv_queued;
1249 	if (space_left <= 0) {
1250 		so->so_flowctrld = B_TRUE;
1251 		*errorp = ENOSPC;
1252 		space_left = -1;
1253 	}
1254 
1255 	if (force_push || so->so_rcv_queued >= so->so_rcv_thresh ||
1256 	    so->so_rcv_queued >= so->so_rcv_wanted) {
1257 		SOCKET_TIMER_CANCEL(so);
1258 		/*
1259 		 * so_notify_data will release the lock
1260 		 */
1261 		so_notify_data(so, so->so_rcv_queued);
1262 
1263 		if (force_pushp != NULL)
1264 			*force_pushp = B_TRUE;
1265 		goto done;
1266 	} else if (so->so_rcv_timer_tid == 0) {
1267 		/* Make sure the recv push timer is running */
1268 		SOCKET_TIMER_START(so);
1269 	}
1270 
1271 done_unlock:
1272 	mutex_exit(&so->so_lock);
1273 done:
1274 	return (space_left);
1275 }
1276 
1277 /*
1278  * Set the offset of where the oob data is relative to the bytes in
1279  * queued. Also generate SIGURG
1280  */
1281 void
1282 so_signal_oob(sock_upper_handle_t sock_handle, ssize_t offset)
1283 {
1284 	struct sonode *so;
1285 
1286 	ASSERT(offset >= 0);
1287 	so = (struct sonode *)sock_handle;
1288 	mutex_enter(&so->so_lock);
1289 	if (so->so_direct != NULL)
1290 		SOD_UIOAFINI(so->so_direct);
1291 
1292 	/*
1293 	 * New urgent data on the way so forget about any old
1294 	 * urgent data.
1295 	 */
1296 	so->so_state &= ~(SS_HAVEOOBDATA|SS_HADOOBDATA);
1297 
1298 	/*
1299 	 * Record that urgent data is pending.
1300 	 */
1301 	so->so_state |= SS_OOBPEND;
1302 
1303 	if (so->so_oobmsg != NULL) {
1304 		dprintso(so, 1, ("sock: discarding old oob\n"));
1305 		freemsg(so->so_oobmsg);
1306 		so->so_oobmsg = NULL;
1307 	}
1308 
1309 	/*
1310 	 * set the offset where the urgent byte is
1311 	 */
1312 	so->so_oobmark = so->so_rcv_queued + offset;
1313 	if (so->so_oobmark == 0)
1314 		so->so_state |= SS_RCVATMARK;
1315 	else
1316 		so->so_state &= ~SS_RCVATMARK;
1317 
1318 	so_notify_oobsig(so);
1319 }
1320 
1321 /*
1322  * Queue the OOB byte
1323  */
1324 static void
1325 so_queue_oob(sock_upper_handle_t sock_handle, mblk_t *mp, size_t len)
1326 {
1327 	struct sonode *so;
1328 
1329 	so = (struct sonode *)sock_handle;
1330 	mutex_enter(&so->so_lock);
1331 	if (so->so_direct != NULL)
1332 		SOD_UIOAFINI(so->so_direct);
1333 
1334 	ASSERT(mp != NULL);
1335 	if (!IS_SO_OOB_INLINE(so)) {
1336 		so->so_oobmsg = mp;
1337 		so->so_state |= SS_HAVEOOBDATA;
1338 	} else {
1339 		so_enqueue_msg(so, mp, len);
1340 	}
1341 
1342 	so_notify_oobdata(so, IS_SO_OOB_INLINE(so));
1343 }
1344 
1345 int
1346 so_close(struct sonode *so, int flag, struct cred *cr)
1347 {
1348 	int error;
1349 
1350 	error = (*so->so_downcalls->sd_close)(so->so_proto_handle, flag, cr);
1351 
1352 	/*
1353 	 * At this point there will be no more upcalls from the protocol
1354 	 */
1355 	mutex_enter(&so->so_lock);
1356 
1357 	ASSERT(so_verify_oobstate(so));
1358 
1359 	so_rcv_flush(so);
1360 	mutex_exit(&so->so_lock);
1361 
1362 	return (error);
1363 }
1364 
1365 void
1366 so_zcopy_notify(sock_upper_handle_t sock_handle)
1367 {
1368 	struct sonode *so = (struct sonode *)sock_handle;
1369 
1370 	mutex_enter(&so->so_lock);
1371 	so->so_copyflag |= STZCNOTIFY;
1372 	cv_broadcast(&so->so_copy_cv);
1373 	mutex_exit(&so->so_lock);
1374 }
1375 
1376 void
1377 so_set_error(sock_upper_handle_t sock_handle, int error)
1378 {
1379 	struct sonode *so = (struct sonode *)sock_handle;
1380 
1381 	mutex_enter(&so->so_lock);
1382 
1383 	soseterror(so, error);
1384 
1385 	so_notify_error(so);
1386 }
1387 
/*
 * so_recvmsg - read data from the socket
 *
 * There are two ways of obtaining data; either we ask the protocol to
 * copy directly into the supplied buffer, or we copy data from the
 * sonode's receive queue. The decision which one to use depends on
 * whether the protocol has a sd_recv_uio down call.
 *
 * Arguments:
 *	so	socket to read from
 *	msg	in: msg_flags, msg_namelen and msg_controllen describe the
 *		request; out: msg_flags (MSG_EOR/MSG_TRUNC/MSG_CTRUNC) and,
 *		when available, msg_name/msg_namelen and
 *		msg_control/msg_controllen
 *	uiop	destination of the data
 *	cr	credentials, only passed on to the protocol down call
 *
 * Returns 0 or an errno value (ENOTCONN, EOPNOTSUPP, EPROTO, or whatever
 * the helpers / protocol down call return).
 *
 * The msg_name and msg_control buffers returned on the receive queue path
 * are kmem allocated here; nothing in this function frees them on success,
 * so releasing them is presumably the caller's job (TODO confirm against
 * the callers).
 *
 * Every return path is bracketed by SO_BLOCK_FALLBACK / SO_UNBLOCK_FALLBACK.
 */
int
so_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
    struct cred *cr)
{
	rval_t 		rval;
	int 		flags = 0;
	t_uscalar_t	controllen, namelen;
	int 		error = 0;
	int ret;
	mblk_t		*mctlp = NULL;
	union T_primitives *tpr;
	void		*control;
	ssize_t		saved_resid;
	struct uio	*suiop;

	SO_BLOCK_FALLBACK(so, SOP_RECVMSG(so, msg, uiop, cr));

	/*
	 * A connection oriented socket that is neither connected nor
	 * shut down for receiving has nothing to read from.
	 */
	if ((so->so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
	    (so->so_mode & SM_CONNREQUIRED)) {
		SO_UNBLOCK_FALLBACK(so);
		return (ENOTCONN);
	}

	/* MSG_WAITALL is dropped whenever MSG_PEEK is requested */
	if (msg->msg_flags & MSG_PEEK)
		msg->msg_flags &= ~MSG_WAITALL;

	/* SM_ATOMIC sockets always carry MSG_TRUNC in the request flags */
	if (so->so_mode & SM_ATOMIC)
		msg->msg_flags |= MSG_TRUNC;

	/*
	 * Out-of-band reads are handled completely here: refused when the
	 * socket has no SM_EXDATA, otherwise passed to the protocol (if it
	 * has a recv down call) or served by sorecvoob().
	 */
	if (msg->msg_flags & MSG_OOB) {
		if ((so->so_mode & SM_EXDATA) == 0) {
			error = EOPNOTSUPP;
		} else if (so->so_downcalls->sd_recv_uio != NULL) {
			error = (*so->so_downcalls->sd_recv_uio)
			    (so->so_proto_handle, uiop, msg, cr);
		} else {
			error = sorecvoob(so, msg, uiop, msg->msg_flags,
			    IS_SO_OOB_INLINE(so));
		}
		SO_UNBLOCK_FALLBACK(so);
		return (error);
	}

	/*
	 * If the protocol has the recv down call, then pass the request
	 * down.
	 */
	if (so->so_downcalls->sd_recv_uio != NULL) {
		error = (*so->so_downcalls->sd_recv_uio)
		    (so->so_proto_handle, uiop, msg, cr);
		SO_UNBLOCK_FALLBACK(so);
		return (error);
	}

	/*
	 * Reading data from the socket buffer
	 */
	/*
	 * From here on `flags' holds the request flags, while
	 * msg->msg_flags is reused to accumulate the flags returned
	 * to the caller.
	 */
	flags = msg->msg_flags;
	msg->msg_flags = 0;

	/*
	 * Set msg_controllen and msg_namelen to zero here to make it
	 * simpler in the cases that no control or name is returned.
	 */
	controllen = msg->msg_controllen;
	namelen = msg->msg_namelen;
	msg->msg_controllen = 0;
	msg->msg_namelen = 0;

	mutex_enter(&so->so_lock);
	/* Set SOREADLOCKED */
	error = so_lock_read_intr(so,
	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
	mutex_exit(&so->so_lock);
	if (error) {
		SO_UNBLOCK_FALLBACK(so);
		return (error);
	}

	/*
	 * sod_rcv_init() is handed &uiop and may therefore replace uiop;
	 * its return value is passed back to sod_rcv_done() at the end.
	 */
	suiop = sod_rcv_init(so, flags, &uiop);
retry:
	/* remember the residual so we can tell whether this pass got data */
	saved_resid = uiop->uio_resid;
	error = so_dequeue_msg(so, &mctlp, uiop, &rval, flags);
	if (error != 0) {
		goto out;
	}
	/*
	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
	 * For non-datagrams MOREDATA is used to set MSG_EOR.
	 */
	ASSERT(!(rval.r_val1 & MORECTL));
	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
		msg->msg_flags |= MSG_TRUNC;
	/* No control part: so_dequeue_msg() delivered plain data only */
	if (mctlp == NULL) {
		dprintso(so, 1, ("so_recvmsg: got M_DATA\n"));

		mutex_enter(&so->so_lock);
		/* Set MSG_EOR based on MOREDATA */
		if (!(rval.r_val1 & MOREDATA)) {
			if (so->so_state & SS_SAVEDEOR) {
				msg->msg_flags |= MSG_EOR;
				so->so_state &= ~SS_SAVEDEOR;
			}
		}
		/*
		 * If some data was received (i.e. not EOF) and the
		 * read/recv* has not been satisfied wait for some more.
		 */
		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
			mutex_exit(&so->so_lock);
			flags |= MSG_NOMARK;
			goto retry;
		}

		/* so_lock is held, so skip the mutex_enter() at `out' */
		goto out_locked;
	}
	/* so_queue_msg has already verified length and alignment */
	tpr = (union T_primitives *)mctlp->b_rptr;
	dprintso(so, 1, ("so_recvmsg: type %d\n", tpr->type));
	switch (tpr->type) {
	case T_DATA_IND: {
		/*
		 * Set msg_flags to MSG_EOR based on
		 * MORE_flag and MOREDATA.
		 */
		mutex_enter(&so->so_lock);
		so->so_state &= ~SS_SAVEDEOR;
		if (!(tpr->data_ind.MORE_flag & 1)) {
			/*
			 * End of record: report it now if all of the data
			 * was consumed, otherwise remember it in
			 * SS_SAVEDEOR for a later read.
			 */
			if (!(rval.r_val1 & MOREDATA))
				msg->msg_flags |= MSG_EOR;
			else
				so->so_state |= SS_SAVEDEOR;
		}
		freemsg(mctlp);
		/*
		 * If some data was received (i.e. not EOF) and the
		 * read/recv* has not been satisfied wait for some more.
		 */
		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
			mutex_exit(&so->so_lock);
			flags |= MSG_NOMARK;
			goto retry;
		}
		goto out_locked;
	}
	case T_UNITDATA_IND: {
		void *addr;
		t_uscalar_t addrlen;
		void *abuf;
		t_uscalar_t optlen;
		void *opt;

		/*
		 * addr/addrlen are only consumed below when namelen != 0,
		 * and opt only when controllen != 0 (which implies
		 * optlen != 0), so each is assigned before it is read.
		 */
		if (namelen != 0) {
			/* Caller wants source address */
			addrlen = tpr->unitdata_ind.SRC_length;
			addr = sogetoff(mctlp, tpr->unitdata_ind.SRC_offset,
			    addrlen, 1);
			if (addr == NULL) {
				freemsg(mctlp);
				error = EPROTO;
				eprintsoline(so, error);
				goto out;
			}
			ASSERT(so->so_family != AF_UNIX);
		}
		optlen = tpr->unitdata_ind.OPT_length;
		if (optlen != 0) {
			t_uscalar_t ncontrollen;

			/*
			 * Extract any source address option.
			 * Determine how large cmsg buffer is needed.
			 */
			opt = sogetoff(mctlp, tpr->unitdata_ind.OPT_offset,
			    optlen, __TPI_ALIGN_SIZE);

			if (opt == NULL) {
				freemsg(mctlp);
				error = EPROTO;
				eprintsoline(so, error);
				goto out;
			}
			if (so->so_family == AF_UNIX)
				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
			ncontrollen = so_cmsglen(mctlp, opt, optlen,
			    !(flags & MSG_XPG4_2));
			/*
			 * If the caller supplied a control buffer, size
			 * ours to what is needed; if it did not but
			 * control data exists, flag the truncation.
			 */
			if (controllen != 0)
				controllen = ncontrollen;
			else if (ncontrollen != 0)
				msg->msg_flags |= MSG_CTRUNC;
		} else {
			controllen = 0;
		}

		if (namelen != 0) {
			/*
			 * Return address to caller.
			 * Caller handles truncation if length
			 * exceeds msg_namelen.
			 * NOTE: AF_UNIX NUL termination is ensured by
			 * the sender's copyin_name().
			 */
			abuf = kmem_alloc(addrlen, KM_SLEEP);

			bcopy(addr, abuf, addrlen);
			msg->msg_name = abuf;
			msg->msg_namelen = addrlen;
		}

		if (controllen != 0) {
			/*
			 * Return control msg to caller.
			 * Caller handles truncation if length
			 * exceeds msg_controllen.
			 */
			control = kmem_zalloc(controllen, KM_SLEEP);

			error = so_opt2cmsg(mctlp, opt, optlen,
			    !(flags & MSG_XPG4_2), control, controllen);
			if (error) {
				freemsg(mctlp);
				/*
				 * Undo the address allocation made above.
				 * NOTE(review): msg_name/msg_namelen keep
				 * their values after the free; callers must
				 * not use them when an error is returned.
				 */
				if (msg->msg_namelen != 0)
					kmem_free(msg->msg_name,
					    msg->msg_namelen);
				kmem_free(control, controllen);
				eprintsoline(so, error);
				goto out;
			}
			msg->msg_control = control;
			msg->msg_controllen = controllen;
		}

		freemsg(mctlp);
		/* no MSG_WAITALL retry on this path */
		goto out;
	}
	case T_OPTDATA_IND: {
		struct T_optdata_req *tdr;
		void *opt;
		t_uscalar_t optlen;

		tdr = (struct T_optdata_req *)mctlp->b_rptr;
		optlen = tdr->OPT_length;
		if (optlen != 0) {
			t_uscalar_t ncontrollen;
			/*
			 * Determine how large cmsg buffer is needed.
			 */
			opt = sogetoff(mctlp,
			    tpr->optdata_ind.OPT_offset, optlen,
			    __TPI_ALIGN_SIZE);

			if (opt == NULL) {
				freemsg(mctlp);
				error = EPROTO;
				eprintsoline(so, error);
				goto out;
			}

			ncontrollen = so_cmsglen(mctlp, opt, optlen,
			    !(flags & MSG_XPG4_2));
			if (controllen != 0)
				controllen = ncontrollen;
			else if (ncontrollen != 0)
				msg->msg_flags |= MSG_CTRUNC;
		} else {
			controllen = 0;
		}

		if (controllen != 0) {
			/*
			 * Return control msg to caller.
			 * Caller handles truncation if length
			 * exceeds msg_controllen.
			 */
			control = kmem_zalloc(controllen, KM_SLEEP);

			error = so_opt2cmsg(mctlp, opt, optlen,
			    !(flags & MSG_XPG4_2), control, controllen);
			if (error) {
				freemsg(mctlp);
				kmem_free(control, controllen);
				eprintsoline(so, error);
				goto out;
			}
			msg->msg_control = control;
			msg->msg_controllen = controllen;
		}

		/*
		 * Set msg_flags to MSG_EOR based on
		 * DATA_flag and MOREDATA.
		 */
		/*
		 * NOTE(review): the flag is read through the data_ind view
		 * of the union although this is an optdata indication;
		 * this relies on both layouts placing the flag word at the
		 * same offset - confirm against <sys/tihdr.h>.
		 */
		mutex_enter(&so->so_lock);
		so->so_state &= ~SS_SAVEDEOR;
		if (!(tpr->data_ind.MORE_flag & 1)) {
			if (!(rval.r_val1 & MOREDATA))
				msg->msg_flags |= MSG_EOR;
			else
				so->so_state |= SS_SAVEDEOR;
		}
		freemsg(mctlp);
		/*
		 * If some data was received (i.e. not EOF) and the
		 * read/recv* has not been satisfied wait for some more.
		 * Not possible to wait if control info was received.
		 */
		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
		    controllen == 0 &&
		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
			mutex_exit(&so->so_lock);
			flags |= MSG_NOMARK;
			goto retry;
		}
		goto out_locked;
	}
	default:
		/* unknown primitive: drop it and fall through to `out' */
		cmn_err(CE_CONT, "so_recvmsg bad type %x \n",
		    tpr->type);
		freemsg(mctlp);
		error = EPROTO;
		ASSERT(0);
	}
	/*
	 * Common exit. `out' is for paths that do not hold so_lock,
	 * `out_locked' for paths that already do.
	 */
out:
	mutex_enter(&so->so_lock);
out_locked:
	/* an error from sod_rcv_done() never masks an earlier one */
	ret = sod_rcv_done(so, suiop, uiop);
	if (ret != 0 && error == 0)
		error = ret;

	so_unlock_read(so);	/* Clear SOREADLOCKED */
	mutex_exit(&so->so_lock);

	SO_UNBLOCK_FALLBACK(so);

	return (error);
}
1734 
/*
 * Socket operations vector made up of the so_* implementations.
 *
 * The initializer is positional, so the entries must stay in the same
 * order as the members of sonodeops_t (declared outside this file); the
 * trailing comments name the member each entry is meant to fill.
 */
sonodeops_t so_sonodeops = {
	so_init,		/* sop_init	*/
	so_accept,		/* sop_accept   */
	so_bind,		/* sop_bind	*/
	so_listen,		/* sop_listen   */
	so_connect,		/* sop_connect  */
	so_recvmsg,		/* sop_recvmsg  */
	so_sendmsg,		/* sop_sendmsg  */
	so_sendmblk,		/* sop_sendmblk */
	so_getpeername,		/* sop_getpeername */
	so_getsockname,		/* sop_getsockname */
	so_shutdown,		/* sop_shutdown */
	so_getsockopt,		/* sop_getsockopt */
	so_setsockopt,		/* sop_setsockopt */
	so_ioctl,		/* sop_ioctl    */
	so_poll,		/* sop_poll	*/
	so_close,		/* sop_close */
};
1753 
/*
 * Upcall vector collecting the so_* routines that take a
 * sock_upper_handle_t (presumably the entry points a protocol uses to
 * call back into the socket layer - confirm against sock_upcalls_t).
 *
 * The initializer is positional: the order of the entries must match the
 * member order of sock_upcalls_t, which is declared outside this file.
 */
sock_upcalls_t so_upcalls = {
	so_newconn,
	so_connected,
	so_disconnected,
	so_opctl,
	so_queue_msg,
	so_set_prop,
	so_txq_full,
	so_signal_oob,
	so_zcopy_notify,
	so_set_error
};
1766