1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/sysmacros.h>
31 #include <sys/debug.h>
32 #include <sys/cmn_err.h>
33 
34 #include <sys/stropts.h>
35 #include <sys/socket.h>
36 #include <sys/socketvar.h>
37 
38 #define	_SUN_TPI_VERSION	2
39 #include <sys/tihdr.h>
40 #include <sys/sockio.h>
41 #include <sys/kmem_impl.h>
42 
43 #include <sys/strsubr.h>
44 #include <sys/strsun.h>
45 #include <sys/ddi.h>
46 #include <netinet/in.h>
47 #include <inet/ip.h>
48 
49 #include <fs/sockfs/sockcommon.h>
50 
51 #include <sys/socket_proto.h>
52 
53 #include <fs/sockfs/socktpi_impl.h>
54 #include <fs/sockfs/sodirect.h>
55 #include <sys/tihdr.h>
56 #include <fs/sockfs/nl7c.h>
57 #include <inet/kssl/ksslapi.h>
58 
59 
/*
 * Tunables defined outside this file.  Within this file a non-zero
 * xnet_skip_checks disables the X/Open state checks, and a non-zero
 * xnet_check_print makes those checks print a message when they reject
 * a request.
 */
extern int xnet_skip_checks;
extern int xnet_check_print;

/* Forward declaration; so_queue_msg() calls this for MSG_OOB data. */
static void so_queue_oob(sock_upper_handle_t, mblk_t *, size_t);
64 
65 
/*
 * Placeholder accept operation: ignores every argument and always
 * reports EOPNOTSUPP.
 */
/*ARGSUSED*/
int
so_accept_notsupp(struct sonode *lso, int fflag, struct cred *cr,
    struct sonode **nsop)
{
	return (EOPNOTSUPP);
}
73 
/*
 * Placeholder listen operation: ignores every argument and always
 * reports EOPNOTSUPP.
 */
/*ARGSUSED*/
int
so_listen_notsupp(struct sonode *so, int backlog, struct cred *cr)
{
	return (EOPNOTSUPP);
}
80 
/*
 * Placeholder getsockname operation: ignores every argument and always
 * reports EOPNOTSUPP.  Neither sa nor len is written.
 */
/*ARGSUSED*/
int
so_getsockname_notsupp(struct sonode *so, struct sockaddr *sa, socklen_t *len,
    struct cred *cr)
{
	return (EOPNOTSUPP);
}
88 
89 /*ARGSUSED*/
90 int
91 so_getpeername_notsupp(struct sonode *so, struct sockaddr *addr,
92     socklen_t *addrlen, boolean_t accept, struct cred *cr)
93 {
94 	return (EOPNOTSUPP);
95 }
96 
/*
 * Placeholder shutdown operation: ignores every argument and always
 * reports EOPNOTSUPP.
 */
/*ARGSUSED*/
int
so_shutdown_notsupp(struct sonode *so, int how, struct cred *cr)
{
	return (EOPNOTSUPP);
}
103 
104 /*ARGSUSED*/
105 int
106 so_sendmblk_notsupp(struct sonode *so, struct msghdr *msg, int fflag,
107     struct cred *cr, mblk_t **mpp)
108 {
109 	return (EOPNOTSUPP);
110 }
111 
112 /*
113  * Generic Socket Ops
114  */
115 
/*
 * Generic socket initialization.  All of the work is done by
 * socket_init_common(); its return value is handed back unchanged.
 */
/* ARGSUSED */
int
so_init(struct sonode *so, struct sonode *pso, struct cred *cr, int flags)
{
	int rval;

	rval = socket_init_common(so, pso, flags, cr);
	return (rval);
}
122 
/*
 * Generic bind entry point.
 *
 * For AF_INET and AF_INET6 sockets the address length and family are
 * validated here; addresses of any other family are passed to the protocol
 * unchecked.  After validation, AF_INET/AF_INET6 binds are checked against
 * NL7C (NCA) and, for SOCK_STREAM sockets, KSSL; if either is configured for
 * the address the socket falls back to TPI and the bind is reissued through
 * SOP_BIND().  A NULL name skips the address checks and goes straight to the
 * protocol's sd_bind downcall.
 *
 * Returns 0 or an errno value: EINVAL or EAFNOSUPPORT from the checks in
 * this function, otherwise whatever so_tpi_fallback(), SOP_BIND() or the
 * sd_bind downcall returns.
 */
int
so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    int flags, struct cred *cr)
{
	int error;

	SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr));

	ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD);

	/* X/Open requires this check */
	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
		if (xnet_check_print) {
			printf("sockfs: X/Open bind state check "
			    "caused EINVAL\n");
		}
		error = EINVAL;
		goto done;
	}

	/*
	 * a bind to a NULL address is interpreted as unbind. So just
	 * do the downcall.
	 */
	if (name == NULL)
		goto dobind;

	switch (so->so_family) {
	case AF_INET:
		/* A wrong length is reported according to the family given */
		if ((size_t)namelen != sizeof (sin_t)) {
			error = name->sa_family != so->so_family ?
			    EAFNOSUPPORT : EINVAL;
			eprintsoline(so, error);
			goto done;
		}

		if ((flags & _SOBIND_XPG4_2) &&
		    (name->sa_family != so->so_family)) {
			/*
			 * This check has to be made for X/Open
			 * sockets however application failures have
			 * been observed when it is applied to
			 * all sockets.
			 */
			error = EAFNOSUPPORT;
			eprintsoline(so, error);
			goto done;
		}
		/*
		 * Force a zero sa_family to match so_family.
		 *
		 * Some programs like inetd(1M) don't set the
		 * family field. Other programs leave
		 * sin_family set to garbage - SunOS 4.X does
		 * not check the family field on a bind.
		 * We use the family field that
		 * was passed in to the socket() call.
		 */
		name->sa_family = so->so_family;
		break;

	case AF_INET6: {
#ifdef DEBUG
		/* Only needed by the DEBUG-only sanity warnings below */
		sin6_t *sin6 = (sin6_t *)name;
#endif
		/* A wrong length is reported according to the family given */
		if ((size_t)namelen != sizeof (sin6_t)) {
			error = name->sa_family != so->so_family ?
			    EAFNOSUPPORT : EINVAL;
			eprintsoline(so, error);
			goto done;
		}

		if (name->sa_family != so->so_family) {
			/*
			 * With IPv6 we require the family to match
			 * unlike in IPv4.
			 */
			error = EAFNOSUPPORT;
			eprintsoline(so, error);
			goto done;
		}
#ifdef DEBUG
		/*
		 * Verify that apps don't forget to clear
		 * sin6_scope_id etc
		 */
		if (sin6->sin6_scope_id != 0 &&
		    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
			zcmn_err(getzoneid(), CE_WARN,
			    "bind with uninitialized sin6_scope_id "
			    "(%d) on socket. Pid = %d\n",
			    (int)sin6->sin6_scope_id,
			    (int)curproc->p_pid);
		}
		if (sin6->__sin6_src_id != 0) {
			zcmn_err(getzoneid(), CE_WARN,
			    "bind with uninitialized __sin6_src_id "
			    "(%d) on socket. Pid = %d\n",
			    (int)sin6->__sin6_src_id,
			    (int)curproc->p_pid);
		}
#endif /* DEBUG */

		break;
	}
	default:
		/* Just pass the request to the protocol */
		goto dobind;
	}

	/*
	 * First we check if either NCA or KSSL has been enabled for
	 * the requested address, and if so, we fall back to TPI.
	 * If neither of those two services are enabled, then we just
	 * pass the request to the protocol.
	 *
	 * Note that KSSL can only be enabled on a socket if NCA is NOT
	 * enabled for that socket, hence the else-statement below.
	 */
	if (nl7c_enabled && ((so->so_family == AF_INET ||
	    so->so_family == AF_INET6) &&
	    nl7c_lookup_addr(name, namelen) != NULL)) {
		/*
		 * NL7C is not supported in non-global zones,
		 * we enforce this restriction here.
		 */
		if (so->so_zoneid == GLOBAL_ZONEID) {
			/* NCA should be used, so fall back to TPI */
			error = so_tpi_fallback(so, cr);
			/*
			 * This path returns directly rather than through
			 * done:, so the fallback block is released here.
			 */
			SO_UNBLOCK_FALLBACK(so);
			if (error)
				return (error);
			else
				return (SOP_BIND(so, name, namelen, flags, cr));
		}
		/* In a non-global zone we fall through to dobind. */
	} else if (so->so_type == SOCK_STREAM) {
		/* Check if KSSL has been configured for this address */
		kssl_ent_t ent;
		kssl_endpt_type_t type;
		struct T_bind_req bind_req;
		mblk_t *mp;

		/*
		 * TODO: Check with KSSL team if we could add a function call
		 * that only queries whether KSSL is enabled for the given
		 * address.
		 */
		/*
		 * Build a temporary T_BIND_REQ message carrying the address;
		 * it is only used for the kssl_check_proxy() query and is
		 * freed right after.
		 */
		bind_req.PRIM_type = T_BIND_REQ;
		bind_req.ADDR_length = namelen;
		bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
		mp = soallocproto2(&bind_req, sizeof (bind_req),
		    name, namelen, 0, _ALLOC_SLEEP, cr);

		type = kssl_check_proxy(mp, so, &ent);
		freemsg(mp);

		if (type != KSSL_NO_PROXY) {
			/*
			 * KSSL has been configured for this address, so
			 * we must fall back to TPI.
			 */
			kssl_release_ent(ent, so, type);
			error = so_tpi_fallback(so, cr);
			/*
			 * This path returns directly rather than through
			 * done:, so the fallback block is released here.
			 */
			SO_UNBLOCK_FALLBACK(so);
			if (error)
				return (error);
			else
				return (SOP_BIND(so, name, namelen, flags, cr));
		}
	}

dobind:
	/* Hand the (possibly NULL) address to the protocol */
	error = (*so->so_downcalls->sd_bind)
	    (so->so_proto_handle, name, namelen, cr);
done:
	SO_UNBLOCK_FALLBACK(so);

	return (error);
}
302 
303 int
304 so_listen(struct sonode *so, int backlog, struct cred *cr)
305 {
306 	int	error = 0;
307 
308 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
309 	SO_BLOCK_FALLBACK(so, SOP_LISTEN(so, backlog, cr));
310 
311 	error = (*so->so_downcalls->sd_listen)(so->so_proto_handle, backlog,
312 	    cr);
313 
314 	SO_UNBLOCK_FALLBACK(so);
315 
316 	return (error);
317 }
318 
319 
320 int
321 so_connect(struct sonode *so, const struct sockaddr *name,
322     socklen_t namelen, int fflag, int flags, struct cred *cr)
323 {
324 	int error = 0;
325 	sock_connid_t id;
326 
327 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
328 	SO_BLOCK_FALLBACK(so, SOP_CONNECT(so, name, namelen, fflag, flags, cr));
329 
330 	/*
331 	 * If there is a pending error, return error
332 	 * This can happen if a non blocking operation caused an error.
333 	 */
334 
335 	if (so->so_error != 0) {
336 		mutex_enter(&so->so_lock);
337 		error = sogeterr(so, B_TRUE);
338 		mutex_exit(&so->so_lock);
339 		if (error != 0)
340 			goto done;
341 	}
342 
343 	error = (*so->so_downcalls->sd_connect)(so->so_proto_handle,
344 	    name, namelen, &id, cr);
345 
346 	if (error == EINPROGRESS)
347 		error = so_wait_connected(so, fflag & (FNONBLOCK|FNDELAY), id);
348 
349 done:
350 	SO_UNBLOCK_FALLBACK(so);
351 	return (error);
352 }
353 
354 /*ARGSUSED*/
355 int
356 so_accept(struct sonode *so, int fflag, struct cred *cr, struct sonode **nsop)
357 {
358 	int error = 0;
359 	struct sonode *nso;
360 
361 	*nsop = NULL;
362 
363 	SO_BLOCK_FALLBACK(so, SOP_ACCEPT(so, fflag, cr, nsop));
364 	if ((so->so_state & SS_ACCEPTCONN) == 0) {
365 		SO_UNBLOCK_FALLBACK(so);
366 		return ((so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW) ?
367 		    EOPNOTSUPP : EINVAL);
368 	}
369 
370 	if ((error = so_acceptq_dequeue(so, (fflag & (FNONBLOCK|FNDELAY)),
371 	    &nso)) == 0) {
372 		ASSERT(nso != NULL);
373 
374 		/* finish the accept */
375 		error = (*so->so_downcalls->sd_accept)(so->so_proto_handle,
376 		    nso->so_proto_handle, (sock_upper_handle_t)nso, cr);
377 		if (error != 0) {
378 			(void) socket_close(nso, 0, cr);
379 			socket_destroy(nso);
380 		} else {
381 			*nsop = nso;
382 		}
383 	}
384 
385 	SO_UNBLOCK_FALLBACK(so);
386 	return (error);
387 }
388 
389 int
390 so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
391     struct cred *cr)
392 {
393 	int error, flags;
394 	boolean_t dontblock;
395 	ssize_t orig_resid;
396 	mblk_t  *mp;
397 
398 	SO_BLOCK_FALLBACK(so, SOP_SENDMSG(so, msg, uiop, cr));
399 
400 	flags = msg->msg_flags;
401 	error = 0;
402 	dontblock = (flags & MSG_DONTWAIT) ||
403 	    (uiop->uio_fmode & (FNONBLOCK|FNDELAY));
404 
405 	if (!(flags & MSG_XPG4_2) && msg->msg_controllen != 0) {
406 		/*
407 		 * Old way of passing fd's is not supported
408 		 */
409 		SO_UNBLOCK_FALLBACK(so);
410 		return (EOPNOTSUPP);
411 	}
412 
413 	if ((so->so_mode & SM_ATOMIC) &&
414 	    uiop->uio_resid > so->so_proto_props.sopp_maxpsz &&
415 	    so->so_proto_props.sopp_maxpsz != -1) {
416 		SO_UNBLOCK_FALLBACK(so);
417 		return (EMSGSIZE);
418 	}
419 
420 	/*
421 	 * For atomic sends we will only do one iteration.
422 	 */
423 	do {
424 		if (so->so_state & SS_CANTSENDMORE) {
425 			error = EPIPE;
426 			break;
427 		}
428 
429 		if (so->so_error != 0) {
430 			mutex_enter(&so->so_lock);
431 			error = sogeterr(so, B_TRUE);
432 			mutex_exit(&so->so_lock);
433 			if (error != 0)
434 				break;
435 		}
436 
437 		/*
438 		 * Send down OOB messages even if the send path is being
439 		 * flow controlled (assuming the protocol supports OOB data).
440 		 */
441 		if (flags & MSG_OOB) {
442 			if ((so->so_mode & SM_EXDATA) == 0) {
443 				error = EOPNOTSUPP;
444 				break;
445 			}
446 		} else if (so->so_snd_qfull) {
447 			/*
448 			 * Need to wait until the protocol is ready to receive
449 			 * more data for transmission.
450 			 */
451 			if ((error = so_snd_wait_qnotfull(so, dontblock)) != 0)
452 				break;
453 		}
454 
455 		/*
456 		 * Time to send data to the protocol. We either copy the
457 		 * data into mblks or pass the uio directly to the protocol.
458 		 * We decide what to do based on the available down calls.
459 		 */
460 		if (so->so_downcalls->sd_send_uio != NULL) {
461 			error = (*so->so_downcalls->sd_send_uio)
462 			    (so->so_proto_handle, uiop, msg, cr);
463 			if (error != 0)
464 				break;
465 		} else {
466 			/* save the resid in case of failure */
467 			orig_resid = uiop->uio_resid;
468 
469 			if ((mp = socopyinuio(uiop,
470 			    so->so_proto_props.sopp_maxpsz,
471 			    so->so_proto_props.sopp_wroff,
472 			    so->so_proto_props.sopp_maxblk,
473 			    so->so_proto_props.sopp_tail, &error)) == NULL) {
474 				break;
475 			}
476 			ASSERT(uiop->uio_resid >= 0);
477 
478 			error = (*so->so_downcalls->sd_send)
479 			    (so->so_proto_handle, mp, msg, cr);
480 			if (error != 0) {
481 				/*
482 				 * The send failed. We do not have to free the
483 				 * mblks, because that is the protocol's
484 				 * responsibility. However, uio_resid must
485 				 * remain accurate, so adjust that here.
486 				 */
487 				uiop->uio_resid = orig_resid;
488 					break;
489 			}
490 		}
491 	} while (uiop->uio_resid > 0);
492 
493 	SO_UNBLOCK_FALLBACK(so);
494 
495 	return (error);
496 }
497 
498 int
499 so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
500     struct cred *cr, mblk_t **mpp)
501 {
502 	int error;
503 	boolean_t dontblock;
504 	size_t size;
505 	mblk_t *mp = *mpp;
506 
507 	SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp));
508 
509 	error = 0;
510 	dontblock = (msg->msg_flags & MSG_DONTWAIT) ||
511 	    (fflag & (FNONBLOCK|FNDELAY));
512 	size = msgdsize(mp);
513 
514 	if ((so->so_mode & SM_SENDFILESUPP) == 0 ||
515 	    so->so_downcalls->sd_send == NULL) {
516 		SO_UNBLOCK_FALLBACK(so);
517 		return (EOPNOTSUPP);
518 	}
519 
520 	if ((so->so_mode & SM_ATOMIC) &&
521 	    size > so->so_proto_props.sopp_maxpsz &&
522 	    so->so_proto_props.sopp_maxpsz != -1) {
523 		SO_UNBLOCK_FALLBACK(so);
524 		return (EMSGSIZE);
525 	}
526 
527 	while (mp != NULL) {
528 		mblk_t *nmp, *last_mblk;
529 		size_t mlen;
530 
531 		if (so->so_state & SS_CANTSENDMORE) {
532 			error = EPIPE;
533 			break;
534 		}
535 		if (so->so_error != 0) {
536 			mutex_enter(&so->so_lock);
537 			error = sogeterr(so, B_TRUE);
538 			mutex_exit(&so->so_lock);
539 			if (error != 0)
540 				break;
541 		}
542 		if (so->so_snd_qfull) {
543 			/*
544 			 * Need to wait until the protocol is ready to receive
545 			 * more data for transmission.
546 			 */
547 			if ((error = so_snd_wait_qnotfull(so, dontblock)) != 0)
548 				break;
549 		}
550 
551 		/*
552 		 * We only allow so_maxpsz of data to be sent down to
553 		 * the protocol at time.
554 		 */
555 		mlen = MBLKL(mp);
556 		nmp = mp->b_cont;
557 		last_mblk = mp;
558 		while (nmp != NULL) {
559 			mlen += MBLKL(nmp);
560 			if (mlen > so->so_proto_props.sopp_maxpsz) {
561 				last_mblk->b_cont = NULL;
562 				break;
563 			}
564 			last_mblk = nmp;
565 			nmp = nmp->b_cont;
566 		}
567 
568 		error = (*so->so_downcalls->sd_send)
569 		    (so->so_proto_handle, mp, msg, cr);
570 		if (error != 0) {
571 			/*
572 			 * The send failed. The protocol will free the mblks
573 			 * that were sent down. Let the caller deal with the
574 			 * rest.
575 			 */
576 			*mpp = nmp;
577 			break;
578 		}
579 
580 		*mpp = mp = nmp;
581 	}
582 
583 	SO_UNBLOCK_FALLBACK(so);
584 
585 	return (error);
586 }
587 
588 int
589 so_shutdown(struct sonode *so, int how, struct cred *cr)
590 {
591 	int error;
592 
593 	SO_BLOCK_FALLBACK(so, SOP_SHUTDOWN(so, how, cr));
594 
595 	/*
596 	 * SunOS 4.X has no check for datagram sockets.
597 	 * 5.X checks that it is connected (ENOTCONN)
598 	 * X/Open requires that we check the connected state.
599 	 */
600 	if (!(so->so_state & SS_ISCONNECTED)) {
601 		if (!xnet_skip_checks) {
602 			error = ENOTCONN;
603 			if (xnet_check_print) {
604 				printf("sockfs: X/Open shutdown check "
605 				    "caused ENOTCONN\n");
606 			}
607 		}
608 		goto done;
609 	}
610 
611 	error = ((*so->so_downcalls->sd_shutdown)(so->so_proto_handle,
612 	    how, cr));
613 
614 	/*
615 	 * Protocol agreed to shutdown. We need to flush the
616 	 * receive buffer if the receive side is being shutdown.
617 	 */
618 	if (error == 0 && how != SHUT_WR) {
619 		mutex_enter(&so->so_lock);
620 		/* wait for active reader to finish */
621 		(void) so_lock_read(so, 0);
622 
623 		so_rcv_flush(so);
624 
625 		so_unlock_read(so);
626 		mutex_exit(&so->so_lock);
627 	}
628 
629 done:
630 	SO_UNBLOCK_FALLBACK(so);
631 	return (error);
632 }
633 
634 int
635 so_getsockname(struct sonode *so, struct sockaddr *addr,
636     socklen_t *addrlen, struct cred *cr)
637 {
638 	int error;
639 
640 	SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr));
641 
642 	error = (*so->so_downcalls->sd_getsockname)
643 	    (so->so_proto_handle, addr, addrlen, cr);
644 
645 	SO_UNBLOCK_FALLBACK(so);
646 	return (error);
647 }
648 
649 int
650 so_getpeername(struct sonode *so, struct sockaddr *addr,
651     socklen_t *addrlen, boolean_t accept, struct cred *cr)
652 {
653 	int error;
654 
655 	SO_BLOCK_FALLBACK(so, SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
656 
657 	if (accept) {
658 		error = (*so->so_downcalls->sd_getpeername)
659 		    (so->so_proto_handle, addr, addrlen, cr);
660 	} else if (!(so->so_state & SS_ISCONNECTED)) {
661 		error = ENOTCONN;
662 	} else if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
663 		/* Added this check for X/Open */
664 		error = EINVAL;
665 		if (xnet_check_print) {
666 			printf("sockfs: X/Open getpeername check => EINVAL\n");
667 		}
668 	} else {
669 		error = (*so->so_downcalls->sd_getpeername)
670 		    (so->so_proto_handle, addr, addrlen, cr);
671 	}
672 
673 	SO_UNBLOCK_FALLBACK(so);
674 	return (error);
675 }
676 
677 int
678 so_getsockopt(struct sonode *so, int level, int option_name,
679     void *optval, socklen_t *optlenp, int flags, struct cred *cr)
680 {
681 	int error = 0;
682 
683 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
684 	SO_BLOCK_FALLBACK(so,
685 	    SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr));
686 
687 	error = socket_getopt_common(so, level, option_name, optval, optlenp,
688 	    flags);
689 	if (error < 0) {
690 		error = (*so->so_downcalls->sd_getsockopt)
691 		    (so->so_proto_handle, level, option_name, optval, optlenp,
692 		    cr);
693 		if (error ==  ENOPROTOOPT) {
694 			if (level == SOL_SOCKET) {
695 				/*
696 				 * If a protocol does not support a particular
697 				 * socket option, set can fail (not allowed)
698 				 * but get can not fail. This is the previous
699 				 * sockfs bahvior.
700 				 */
701 				switch (option_name) {
702 				case SO_LINGER:
703 					if (*optlenp < (t_uscalar_t)
704 					    sizeof (struct linger)) {
705 						error = EINVAL;
706 						break;
707 					}
708 					error = 0;
709 					bzero(optval, sizeof (struct linger));
710 					*optlenp = sizeof (struct linger);
711 					break;
712 				case SO_RCVTIMEO:
713 				case SO_SNDTIMEO:
714 					if (*optlenp < (t_uscalar_t)
715 					    sizeof (struct timeval)) {
716 						error = EINVAL;
717 						break;
718 					}
719 					error = 0;
720 					bzero(optval, sizeof (struct timeval));
721 					*optlenp = sizeof (struct timeval);
722 					break;
723 				case SO_SND_BUFINFO:
724 					if (*optlenp < (t_uscalar_t)
725 					    sizeof (struct so_snd_bufinfo)) {
726 						error = EINVAL;
727 						break;
728 					}
729 					error = 0;
730 					bzero(optval,
731 					    sizeof (struct so_snd_bufinfo));
732 					*optlenp =
733 					    sizeof (struct so_snd_bufinfo);
734 					break;
735 				case SO_DEBUG:
736 				case SO_REUSEADDR:
737 				case SO_KEEPALIVE:
738 				case SO_DONTROUTE:
739 				case SO_BROADCAST:
740 				case SO_USELOOPBACK:
741 				case SO_OOBINLINE:
742 				case SO_DGRAM_ERRIND:
743 				case SO_SNDBUF:
744 				case SO_RCVBUF:
745 					error = 0;
746 					*((int32_t *)optval) = 0;
747 					*optlenp = sizeof (int32_t);
748 					break;
749 				default:
750 					break;
751 				}
752 			}
753 		}
754 	}
755 
756 	SO_UNBLOCK_FALLBACK(so);
757 	return (error);
758 }
759 
760 int
761 so_setsockopt(struct sonode *so, int level, int option_name,
762     const void *optval, socklen_t optlen, struct cred *cr)
763 {
764 	int error = 0;
765 	struct timeval tl;
766 	const void *opt = optval;
767 
768 	SO_BLOCK_FALLBACK(so,
769 	    SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
770 
771 	/* X/Open requires this check */
772 	if (so->so_state & SS_CANTSENDMORE && !xnet_skip_checks) {
773 		SO_UNBLOCK_FALLBACK(so);
774 		if (xnet_check_print)
775 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
776 		return (EINVAL);
777 	}
778 
779 	if (level == SOL_SOCKET) {
780 		switch (option_name) {
781 		case SO_RCVTIMEO:
782 		case SO_SNDTIMEO: {
783 			/*
784 			 * We pass down these two options to protocol in order
785 			 * to support some third part protocols which need to
786 			 * know them. For those protocols which don't care
787 			 * these two options, simply return 0.
788 			 */
789 			clock_t t_usec;
790 
791 			if (get_udatamodel() == DATAMODEL_NONE ||
792 			    get_udatamodel() == DATAMODEL_NATIVE) {
793 				if (optlen != sizeof (struct timeval)) {
794 					error = EINVAL;
795 					goto done;
796 				}
797 				bcopy((struct timeval *)optval, &tl,
798 				    sizeof (struct timeval));
799 			} else {
800 				if (optlen != sizeof (struct timeval32)) {
801 					error = EINVAL;
802 					goto done;
803 				}
804 				TIMEVAL32_TO_TIMEVAL(&tl,
805 				    (struct timeval32 *)optval);
806 			}
807 			opt = &tl;
808 			optlen = sizeof (tl);
809 			t_usec = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
810 			mutex_enter(&so->so_lock);
811 			if (option_name == SO_RCVTIMEO)
812 				so->so_rcvtimeo = drv_usectohz(t_usec);
813 			else
814 				so->so_sndtimeo = drv_usectohz(t_usec);
815 			mutex_exit(&so->so_lock);
816 			break;
817 		}
818 		case SO_RCVBUF:
819 			/*
820 			 * XXX XPG 4.2 applications retrieve SO_RCVBUF from
821 			 * sockfs since the transport might adjust the value
822 			 * and not return exactly what was set by the
823 			 * application.
824 			 */
825 			so->so_xpg_rcvbuf = *(int32_t *)optval;
826 			break;
827 		}
828 	}
829 	error = (*so->so_downcalls->sd_setsockopt)
830 	    (so->so_proto_handle, level, option_name, opt, optlen, cr);
831 done:
832 	SO_UNBLOCK_FALLBACK(so);
833 	return (error);
834 }
835 
836 int
837 so_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
838     struct cred *cr, int32_t *rvalp)
839 {
840 	int error = 0;
841 
842 	SO_BLOCK_FALLBACK(so, SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
843 
844 	/*
845 	 * If there is a pending error, return error
846 	 * This can happen if a non blocking operation caused an error.
847 	 */
848 	if (so->so_error != 0) {
849 		mutex_enter(&so->so_lock);
850 		error = sogeterr(so, B_TRUE);
851 		mutex_exit(&so->so_lock);
852 		if (error != 0)
853 			goto done;
854 	}
855 
856 	/*
857 	 * calling strioc can result in the socket falling back to TPI,
858 	 * if that is supported.
859 	 */
860 	if ((error = socket_ioctl_common(so, cmd, arg, mode, cr, rvalp)) < 0 &&
861 	    (error = socket_strioc_common(so, cmd, arg, mode, cr, rvalp)) < 0) {
862 		error = (*so->so_downcalls->sd_ioctl)(so->so_proto_handle,
863 		    cmd, arg, mode, rvalp, cr);
864 	}
865 
866 done:
867 	SO_UNBLOCK_FALLBACK(so);
868 
869 	return (error);
870 }
871 
/*
 * Generic poll entry point.
 *
 * Works out which of the requested `events' are currently satisfied and
 * stores that subset in *reventsp.  If none are and `anyyet' is zero,
 * *phpp is set to the socket's pollhead (so_poll_list).
 *
 * Always returns 0.
 */
int
so_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	/* Snapshot of the state, taken without holding so_lock */
	int state = so->so_state;
	*reventsp = 0;

	/*
	 * In sockets the errors are represented as input/output events
	 */
	if (so->so_error != 0 &&
	    ((POLLIN|POLLRDNORM|POLLOUT) & events) != 0) {
		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & events;
		return (0);
	}

	/*
	 * If the socket is in a state where it can send data
	 * turn on POLLWRBAND and POLLOUT events.
	 */
	if ((so->so_mode & SM_CONNREQUIRED) == 0 || (state & SS_ISCONNECTED)) {
		/*
		 * out of band data is allowed even if the connection
		 * is flow controlled
		 */
		*reventsp |= POLLWRBAND & events;
		if (!so->so_snd_qfull) {
			/*
			 * As long as there is buffer to send data
			 * turn on POLLOUT events
			 */
			*reventsp |= POLLOUT & events;
		}
	}

	/*
	 * Turn on POLLIN whenever there is data on the receive queue,
	 * or the socket is in a state where no more data will be received.
	 * Also, if the socket is accepting connections, flip the bit if
	 * there is something on the queue.
	 *
	 * We do an initial check for events without holding locks. However,
	 * if there are no events available, then we redo the check for POLLIN
	 * events under the lock.
	 */

	/* Pending connections */
	if (so->so_acceptq_len > 0)
		*reventsp |= (POLLIN|POLLRDNORM) & events;

	/* Data */
	/* so_downcalls is null for sctp */
	if (so->so_downcalls != NULL && so->so_downcalls->sd_poll != NULL) {
		/* The protocol reports its own events; keep the requested ones */
		*reventsp |= (*so->so_downcalls->sd_poll)
		    (so->so_proto_handle, events & SO_PROTO_POLLEV, anyyet,
		    CRED()) & events;
		ASSERT((*reventsp & ~events) == 0);
		/* do not recheck events */
		events &= ~SO_PROTO_POLLEV;
	} else {
		if (SO_HAVE_DATA(so))
			*reventsp |= (POLLIN|POLLRDNORM) & events;

		/* Urgent data */
		if ((state & SS_OOBPEND) != 0) {
			*reventsp |= (POLLRDBAND | POLLPRI) & events;
		}
	}

	if (!*reventsp && !anyyet) {
		/* Check for read events again, but this time under lock */
		if (events & (POLLIN|POLLRDNORM)) {
			mutex_enter(&so->so_lock);
			if (SO_HAVE_DATA(so) || so->so_acceptq_len > 0) {
				mutex_exit(&so->so_lock);
				*reventsp |= (POLLIN|POLLRDNORM) & events;
				return (0);
			} else {
				/* Nothing to read; note the poller's interest */
				so->so_pollev |= SO_POLLEV_IN;
				mutex_exit(&so->so_lock);
			}
		}
		*phpp = &so->so_poll_list;
	}
	return (0);
}
958 
959 /*
960  * Generic Upcalls
961  */
962 void
963 so_connected(sock_upper_handle_t sock_handle, sock_connid_t id,
964     cred_t *peer_cred, pid_t peer_cpid)
965 {
966 	struct sonode *so = (struct sonode *)sock_handle;
967 
968 	mutex_enter(&so->so_lock);
969 	ASSERT(so->so_proto_handle != NULL);
970 
971 	if (peer_cred != NULL) {
972 		if (so->so_peercred != NULL)
973 			crfree(so->so_peercred);
974 		crhold(peer_cred);
975 		so->so_peercred = peer_cred;
976 		so->so_cpid = peer_cpid;
977 	}
978 
979 	so->so_proto_connid = id;
980 	soisconnected(so);
981 	/*
982 	 * Wake ones who're waiting for conn to become established.
983 	 */
984 	so_notify_connected(so);
985 }
986 
987 int
988 so_disconnected(sock_upper_handle_t sock_handle, sock_connid_t id, int error)
989 {
990 	struct sonode *so = (struct sonode *)sock_handle;
991 
992 	mutex_enter(&so->so_lock);
993 
994 	so->so_proto_connid = id;
995 	soisdisconnected(so, error);
996 	so_notify_disconnected(so, error);
997 
998 	return (0);
999 }
1000 
1001 void
1002 so_opctl(sock_upper_handle_t sock_handle, sock_opctl_action_t action,
1003     uintptr_t arg)
1004 {
1005 	struct sonode *so = (struct sonode *)sock_handle;
1006 
1007 	switch (action) {
1008 	case SOCK_OPCTL_SHUT_SEND:
1009 		mutex_enter(&so->so_lock);
1010 		socantsendmore(so);
1011 		so_notify_disconnecting(so);
1012 		break;
1013 	case SOCK_OPCTL_SHUT_RECV: {
1014 		mutex_enter(&so->so_lock);
1015 		socantrcvmore(so);
1016 		so_notify_eof(so);
1017 		break;
1018 	}
1019 	case SOCK_OPCTL_ENAB_ACCEPT:
1020 		mutex_enter(&so->so_lock);
1021 		so->so_state |= SS_ACCEPTCONN;
1022 		so->so_backlog = (unsigned int)arg;
1023 		mutex_exit(&so->so_lock);
1024 		break;
1025 	default:
1026 		ASSERT(0);
1027 		break;
1028 	}
1029 }
1030 
1031 void
1032 so_txq_full(sock_upper_handle_t sock_handle, boolean_t qfull)
1033 {
1034 	struct sonode *so = (struct sonode *)sock_handle;
1035 
1036 	if (qfull) {
1037 		so_snd_qfull(so);
1038 	} else {
1039 		so_snd_qnotfull(so);
1040 		mutex_enter(&so->so_lock);
1041 		so_notify_writable(so);
1042 	}
1043 }
1044 
1045 sock_upper_handle_t
1046 so_newconn(sock_upper_handle_t parenthandle,
1047     sock_lower_handle_t proto_handle, sock_downcalls_t *sock_downcalls,
1048     struct cred *peer_cred, pid_t peer_cpid, sock_upcalls_t **sock_upcallsp)
1049 {
1050 	struct sonode	*so = (struct sonode *)parenthandle;
1051 	struct sonode	*nso;
1052 	int error;
1053 
1054 	ASSERT(proto_handle != NULL);
1055 
1056 	if ((so->so_state & SS_ACCEPTCONN) == 0 ||
1057 	    so->so_acceptq_len >= so->so_backlog)
1058 		return (NULL);
1059 
1060 	nso = socket_newconn(so, proto_handle, sock_downcalls, SOCKET_NOSLEEP,
1061 	    &error);
1062 	if (nso == NULL)
1063 		return (NULL);
1064 
1065 	if (peer_cred != NULL) {
1066 		crhold(peer_cred);
1067 		nso->so_peercred = peer_cred;
1068 		nso->so_cpid = peer_cpid;
1069 	}
1070 
1071 	/*
1072 	 * The new socket (nso), proto_handle and sock_upcallsp are all
1073 	 * valid at this point. But as soon as nso is placed in the accept
1074 	 * queue that can no longer be assumed (since an accept() thread may
1075 	 * pull it off the queue and close the socket).
1076 	 */
1077 	*sock_upcallsp = &so_upcalls;
1078 
1079 	(void) so_acceptq_enqueue(so, nso);
1080 
1081 	mutex_enter(&so->so_lock);
1082 	so_notify_newconn(so);
1083 
1084 	return ((sock_upper_handle_t)nso);
1085 }
1086 
1087 void
1088 so_set_prop(sock_upper_handle_t sock_handle, struct sock_proto_props *soppp)
1089 {
1090 	struct sonode *so;
1091 
1092 	so = (struct sonode *)sock_handle;
1093 
1094 	mutex_enter(&so->so_lock);
1095 
1096 	if (soppp->sopp_flags & SOCKOPT_MAXBLK)
1097 		so->so_proto_props.sopp_maxblk = soppp->sopp_maxblk;
1098 	if (soppp->sopp_flags & SOCKOPT_WROFF)
1099 		so->so_proto_props.sopp_wroff = soppp->sopp_wroff;
1100 	if (soppp->sopp_flags & SOCKOPT_TAIL)
1101 		so->so_proto_props.sopp_tail = soppp->sopp_tail;
1102 	if (soppp->sopp_flags & SOCKOPT_RCVHIWAT)
1103 		so->so_proto_props.sopp_rxhiwat = soppp->sopp_rxhiwat;
1104 	if (soppp->sopp_flags & SOCKOPT_RCVLOWAT)
1105 		so->so_proto_props.sopp_rxlowat = soppp->sopp_rxlowat;
1106 	if (soppp->sopp_flags & SOCKOPT_MAXPSZ)
1107 		so->so_proto_props.sopp_maxpsz = soppp->sopp_maxpsz;
1108 	if (soppp->sopp_flags & SOCKOPT_MINPSZ)
1109 		so->so_proto_props.sopp_minpsz = soppp->sopp_minpsz;
1110 	if (soppp->sopp_flags & SOCKOPT_ZCOPY) {
1111 		if (soppp->sopp_zcopyflag & ZCVMSAFE) {
1112 			so->so_proto_props.sopp_zcopyflag |= STZCVMSAFE;
1113 			so->so_proto_props.sopp_zcopyflag &= ~STZCVMUNSAFE;
1114 		} else if (soppp->sopp_zcopyflag & ZCVMUNSAFE) {
1115 			so->so_proto_props.sopp_zcopyflag |= STZCVMUNSAFE;
1116 			so->so_proto_props.sopp_zcopyflag &= ~STZCVMSAFE;
1117 		}
1118 
1119 		if (soppp->sopp_zcopyflag & COPYCACHED) {
1120 			so->so_proto_props.sopp_zcopyflag |= STRCOPYCACHED;
1121 		}
1122 	}
1123 	if (soppp->sopp_flags & SOCKOPT_OOBINLINE)
1124 		so->so_proto_props.sopp_oobinline = soppp->sopp_oobinline;
1125 	if (soppp->sopp_flags & SOCKOPT_RCVTIMER)
1126 		so->so_proto_props.sopp_rcvtimer = soppp->sopp_rcvtimer;
1127 	if (soppp->sopp_flags & SOCKOPT_RCVTHRESH)
1128 		so->so_proto_props.sopp_rcvthresh = soppp->sopp_rcvthresh;
1129 	if (soppp->sopp_flags & SOCKOPT_MAXADDRLEN)
1130 		so->so_proto_props.sopp_maxaddrlen = soppp->sopp_maxaddrlen;
1131 	if (soppp->sopp_flags & SOCKOPT_LOOPBACK)
1132 		so->so_proto_props.sopp_loopback = soppp->sopp_loopback;
1133 
1134 	mutex_exit(&so->so_lock);
1135 
1136 #ifdef DEBUG
1137 	soppp->sopp_flags &= ~(SOCKOPT_MAXBLK | SOCKOPT_WROFF | SOCKOPT_TAIL |
1138 	    SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXPSZ |
1139 	    SOCKOPT_ZCOPY | SOCKOPT_OOBINLINE | SOCKOPT_RCVTIMER |
1140 	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ |
1141 	    SOCKOPT_LOOPBACK);
1142 	ASSERT(soppp->sopp_flags == 0);
1143 #endif
1144 }
1145 
1146 /* ARGSUSED */
1147 ssize_t
1148 so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp,
1149     size_t msg_size, int flags, int *errorp,  boolean_t *force_pushp)
1150 {
1151 	struct sonode *so = (struct sonode *)sock_handle;
1152 	boolean_t force_push = B_TRUE;
1153 	int space_left;
1154 	sodirect_t *sodp = so->so_direct;
1155 
1156 	ASSERT(errorp != NULL);
1157 	*errorp = 0;
1158 	if (mp == NULL) {
1159 		if (msg_size > 0) {
1160 			ASSERT(so->so_downcalls->sd_recv_uio != NULL);
1161 			mutex_enter(&so->so_lock);
1162 			/* the notify functions will drop the lock */
1163 			if (flags & MSG_OOB)
1164 				so_notify_oobdata(so, IS_SO_OOB_INLINE(so));
1165 			else
1166 				so_notify_data(so, msg_size);
1167 			return (0);
1168 		}
1169 		/*
1170 		 * recv space check
1171 		 */
1172 		mutex_enter(&so->so_lock);
1173 		space_left = so->so_rcvbuf - so->so_rcv_queued;
1174 		if (space_left <= 0) {
1175 			so->so_flowctrld = B_TRUE;
1176 			*errorp = ENOSPC;
1177 			space_left = -1;
1178 		}
1179 		goto done_unlock;
1180 	}
1181 
1182 	ASSERT(mp->b_next == NULL);
1183 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO);
1184 	ASSERT(msg_size == msgdsize(mp));
1185 
1186 	if (flags & MSG_OOB) {
1187 		so_queue_oob(sock_handle, mp, msg_size);
1188 		return (0);
1189 	}
1190 
1191 	if (force_pushp != NULL)
1192 		force_push = *force_pushp;
1193 
1194 	if (DB_TYPE(mp) == M_PROTO && !__TPI_PRIM_ISALIGNED(mp->b_rptr)) {
1195 		/* The read pointer is not aligned correctly for TPI */
1196 		zcmn_err(getzoneid(), CE_WARN,
1197 		    "sockfs: Unaligned TPI message received. rptr = %p\n",
1198 		    (void *)mp->b_rptr);
1199 		freemsg(mp);
1200 		mutex_enter(&so->so_lock);
1201 		if (sodp != NULL)
1202 			SOD_UIOAFINI(sodp);
1203 		mutex_exit(&so->so_lock);
1204 
1205 		return (so->so_rcvbuf - so->so_rcv_queued);
1206 	}
1207 
1208 	mutex_enter(&so->so_lock);
1209 	if (so->so_state & (SS_FALLBACK_DRAIN | SS_FALLBACK_COMP)) {
1210 		if (sodp != NULL)
1211 			SOD_DISABLE(sodp);
1212 		mutex_exit(&so->so_lock);
1213 		*errorp = EOPNOTSUPP;
1214 		return (-1);
1215 	}
1216 	if (so->so_state & SS_CANTRCVMORE) {
1217 		freemsg(mp);
1218 		if (sodp != NULL)
1219 			SOD_DISABLE(sodp);
1220 		mutex_exit(&so->so_lock);
1221 		return (0);
1222 	}
1223 
1224 	/* process the mblk via I/OAT if capable */
1225 	if (sodp != NULL && sodp->sod_enabled) {
1226 		if (DB_TYPE(mp) == M_DATA) {
1227 			sod_uioa_mblk_init(sodp, mp, msg_size);
1228 		} else {
1229 			SOD_UIOAFINI(sodp);
1230 		}
1231 	}
1232 
1233 	if (mp->b_next == NULL) {
1234 		so_enqueue_msg(so, mp, msg_size);
1235 	} else {
1236 		do {
1237 			mblk_t *nmp;
1238 
1239 			if ((nmp = mp->b_next) != NULL) {
1240 				mp->b_next = NULL;
1241 			}
1242 			so_enqueue_msg(so, mp, msgdsize(mp));
1243 			mp = nmp;
1244 		} while (mp != NULL);
1245 	}
1246 
1247 	space_left = so->so_rcvbuf - so->so_rcv_queued;
1248 	if (space_left <= 0) {
1249 		so->so_flowctrld = B_TRUE;
1250 		*errorp = ENOSPC;
1251 		space_left = -1;
1252 	}
1253 
1254 	if (force_push || so->so_rcv_queued >= so->so_rcv_thresh ||
1255 	    so->so_rcv_queued >= so->so_rcv_wanted) {
1256 		SOCKET_TIMER_CANCEL(so);
1257 		/*
1258 		 * so_notify_data will release the lock
1259 		 */
1260 		so_notify_data(so, so->so_rcv_queued);
1261 
1262 		if (force_pushp != NULL)
1263 			*force_pushp = B_TRUE;
1264 		goto done;
1265 	} else if (so->so_rcv_timer_tid == 0) {
1266 		/* Make sure the recv push timer is running */
1267 		SOCKET_TIMER_START(so);
1268 	}
1269 
1270 done_unlock:
1271 	mutex_exit(&so->so_lock);
1272 done:
1273 	return (space_left);
1274 }
1275 
1276 /*
1277  * Set the offset of where the oob data is relative to the bytes in
1278  * queued. Also generate SIGURG
1279  */
1280 void
1281 so_signal_oob(sock_upper_handle_t sock_handle, ssize_t offset)
1282 {
1283 	struct sonode *so;
1284 
1285 	ASSERT(offset >= 0);
1286 	so = (struct sonode *)sock_handle;
1287 	mutex_enter(&so->so_lock);
1288 	if (so->so_direct != NULL)
1289 		SOD_UIOAFINI(so->so_direct);
1290 
1291 	/*
1292 	 * New urgent data on the way so forget about any old
1293 	 * urgent data.
1294 	 */
1295 	so->so_state &= ~(SS_HAVEOOBDATA|SS_HADOOBDATA);
1296 
1297 	/*
1298 	 * Record that urgent data is pending.
1299 	 */
1300 	so->so_state |= SS_OOBPEND;
1301 
1302 	if (so->so_oobmsg != NULL) {
1303 		dprintso(so, 1, ("sock: discarding old oob\n"));
1304 		freemsg(so->so_oobmsg);
1305 		so->so_oobmsg = NULL;
1306 	}
1307 
1308 	/*
1309 	 * set the offset where the urgent byte is
1310 	 */
1311 	so->so_oobmark = so->so_rcv_queued + offset;
1312 	if (so->so_oobmark == 0)
1313 		so->so_state |= SS_RCVATMARK;
1314 	else
1315 		so->so_state &= ~SS_RCVATMARK;
1316 
1317 	so_notify_oobsig(so);
1318 }
1319 
1320 /*
1321  * Queue the OOB byte
1322  */
1323 static void
1324 so_queue_oob(sock_upper_handle_t sock_handle, mblk_t *mp, size_t len)
1325 {
1326 	struct sonode *so;
1327 
1328 	so = (struct sonode *)sock_handle;
1329 	mutex_enter(&so->so_lock);
1330 	if (so->so_direct != NULL)
1331 		SOD_UIOAFINI(so->so_direct);
1332 
1333 	ASSERT(mp != NULL);
1334 	if (!IS_SO_OOB_INLINE(so)) {
1335 		so->so_oobmsg = mp;
1336 		so->so_state |= SS_HAVEOOBDATA;
1337 	} else {
1338 		so_enqueue_msg(so, mp, len);
1339 	}
1340 
1341 	so_notify_oobdata(so, IS_SO_OOB_INLINE(so));
1342 }
1343 
1344 int
1345 so_close(struct sonode *so, int flag, struct cred *cr)
1346 {
1347 	int error;
1348 
1349 	error = (*so->so_downcalls->sd_close)(so->so_proto_handle, flag, cr);
1350 
1351 	/*
1352 	 * At this point there will be no more upcalls from the protocol
1353 	 */
1354 	mutex_enter(&so->so_lock);
1355 
1356 	ASSERT(so_verify_oobstate(so));
1357 
1358 	so_rcv_flush(so);
1359 	mutex_exit(&so->so_lock);
1360 
1361 	return (error);
1362 }
1363 
1364 void
1365 so_zcopy_notify(sock_upper_handle_t sock_handle)
1366 {
1367 	struct sonode *so = (struct sonode *)sock_handle;
1368 
1369 	mutex_enter(&so->so_lock);
1370 	so->so_copyflag |= STZCNOTIFY;
1371 	cv_broadcast(&so->so_copy_cv);
1372 	mutex_exit(&so->so_lock);
1373 }
1374 
1375 void
1376 so_set_error(sock_upper_handle_t sock_handle, int error)
1377 {
1378 	struct sonode *so = (struct sonode *)sock_handle;
1379 
1380 	mutex_enter(&so->so_lock);
1381 
1382 	soseterror(so, error);
1383 
1384 	so_notify_error(so);
1385 }
1386 
1387 /*
1388  * so_recvmsg - read data from the socket
1389  *
1390  * There are two ways of obtaining data; either we ask the protocol to
1391  * copy directly into the supplied buffer, or we copy data from the
1392  * sonode's receive queue. The decision which one to use depends on
1393  * whether the protocol has a sd_recv_uio down call.
1394  */
1395 int
1396 so_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
1397     struct cred *cr)
1398 {
1399 	rval_t 		rval;
1400 	int 		flags = 0;
1401 	t_uscalar_t	controllen, namelen;
1402 	int 		error = 0;
1403 	int ret;
1404 	mblk_t		*mctlp = NULL;
1405 	union T_primitives *tpr;
1406 	void		*control;
1407 	ssize_t		saved_resid;
1408 	struct uio	*suiop;
1409 
1410 	SO_BLOCK_FALLBACK(so, SOP_RECVMSG(so, msg, uiop, cr));
1411 
1412 	if ((so->so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
1413 	    (so->so_mode & SM_CONNREQUIRED)) {
1414 		SO_UNBLOCK_FALLBACK(so);
1415 		return (ENOTCONN);
1416 	}
1417 
1418 	if (msg->msg_flags & MSG_PEEK)
1419 		msg->msg_flags &= ~MSG_WAITALL;
1420 
1421 	if (so->so_mode & SM_ATOMIC)
1422 		msg->msg_flags |= MSG_TRUNC;
1423 
1424 	if (msg->msg_flags & MSG_OOB) {
1425 		if ((so->so_mode & SM_EXDATA) == 0) {
1426 			error = EOPNOTSUPP;
1427 		} else if (so->so_downcalls->sd_recv_uio != NULL) {
1428 			error = (*so->so_downcalls->sd_recv_uio)
1429 			    (so->so_proto_handle, uiop, msg, cr);
1430 		} else {
1431 			error = sorecvoob(so, msg, uiop, msg->msg_flags,
1432 			    IS_SO_OOB_INLINE(so));
1433 		}
1434 		SO_UNBLOCK_FALLBACK(so);
1435 		return (error);
1436 	}
1437 
1438 	/*
1439 	 * If the protocol has the recv down call, then pass the request
1440 	 * down.
1441 	 */
1442 	if (so->so_downcalls->sd_recv_uio != NULL) {
1443 		error = (*so->so_downcalls->sd_recv_uio)
1444 		    (so->so_proto_handle, uiop, msg, cr);
1445 		SO_UNBLOCK_FALLBACK(so);
1446 		return (error);
1447 	}
1448 
1449 	/*
1450 	 * Reading data from the socket buffer
1451 	 */
1452 	flags = msg->msg_flags;
1453 	msg->msg_flags = 0;
1454 
1455 	/*
1456 	 * Set msg_controllen and msg_namelen to zero here to make it
1457 	 * simpler in the cases that no control or name is returned.
1458 	 */
1459 	controllen = msg->msg_controllen;
1460 	namelen = msg->msg_namelen;
1461 	msg->msg_controllen = 0;
1462 	msg->msg_namelen = 0;
1463 
1464 	mutex_enter(&so->so_lock);
1465 	/* Set SOREADLOCKED */
1466 	error = so_lock_read_intr(so,
1467 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
1468 	mutex_exit(&so->so_lock);
1469 	if (error) {
1470 		SO_UNBLOCK_FALLBACK(so);
1471 		return (error);
1472 	}
1473 
1474 	suiop = sod_rcv_init(so, flags, &uiop);
1475 retry:
1476 	saved_resid = uiop->uio_resid;
1477 	error = so_dequeue_msg(so, &mctlp, uiop, &rval, flags);
1478 	if (error != 0) {
1479 		goto out;
1480 	}
1481 	/*
1482 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
1483 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
1484 	 */
1485 	ASSERT(!(rval.r_val1 & MORECTL));
1486 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
1487 		msg->msg_flags |= MSG_TRUNC;
1488 	if (mctlp == NULL) {
1489 		dprintso(so, 1, ("so_recvmsg: got M_DATA\n"));
1490 
1491 		mutex_enter(&so->so_lock);
1492 		/* Set MSG_EOR based on MOREDATA */
1493 		if (!(rval.r_val1 & MOREDATA)) {
1494 			if (so->so_state & SS_SAVEDEOR) {
1495 				msg->msg_flags |= MSG_EOR;
1496 				so->so_state &= ~SS_SAVEDEOR;
1497 			}
1498 		}
1499 		/*
1500 		 * If some data was received (i.e. not EOF) and the
1501 		 * read/recv* has not been satisfied wait for some more.
1502 		 */
1503 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
1504 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
1505 			mutex_exit(&so->so_lock);
1506 			flags |= MSG_NOMARK;
1507 			goto retry;
1508 		}
1509 
1510 		goto out_locked;
1511 	}
1512 	/* so_queue_msg has already verified length and alignment */
1513 	tpr = (union T_primitives *)mctlp->b_rptr;
1514 	dprintso(so, 1, ("so_recvmsg: type %d\n", tpr->type));
1515 	switch (tpr->type) {
1516 	case T_DATA_IND: {
1517 		/*
1518 		 * Set msg_flags to MSG_EOR based on
1519 		 * MORE_flag and MOREDATA.
1520 		 */
1521 		mutex_enter(&so->so_lock);
1522 		so->so_state &= ~SS_SAVEDEOR;
1523 		if (!(tpr->data_ind.MORE_flag & 1)) {
1524 			if (!(rval.r_val1 & MOREDATA))
1525 				msg->msg_flags |= MSG_EOR;
1526 			else
1527 				so->so_state |= SS_SAVEDEOR;
1528 		}
1529 		freemsg(mctlp);
1530 		/*
1531 		 * If some data was received (i.e. not EOF) and the
1532 		 * read/recv* has not been satisfied wait for some more.
1533 		 */
1534 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
1535 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
1536 			mutex_exit(&so->so_lock);
1537 			flags |= MSG_NOMARK;
1538 			goto retry;
1539 		}
1540 		goto out_locked;
1541 	}
1542 	case T_UNITDATA_IND: {
1543 		void *addr;
1544 		t_uscalar_t addrlen;
1545 		void *abuf;
1546 		t_uscalar_t optlen;
1547 		void *opt;
1548 
1549 		if (namelen != 0) {
1550 			/* Caller wants source address */
1551 			addrlen = tpr->unitdata_ind.SRC_length;
1552 			addr = sogetoff(mctlp, tpr->unitdata_ind.SRC_offset,
1553 			    addrlen, 1);
1554 			if (addr == NULL) {
1555 				freemsg(mctlp);
1556 				error = EPROTO;
1557 				eprintsoline(so, error);
1558 				goto out;
1559 			}
1560 			ASSERT(so->so_family != AF_UNIX);
1561 		}
1562 		optlen = tpr->unitdata_ind.OPT_length;
1563 		if (optlen != 0) {
1564 			t_uscalar_t ncontrollen;
1565 
1566 			/*
1567 			 * Extract any source address option.
1568 			 * Determine how large cmsg buffer is needed.
1569 			 */
1570 			opt = sogetoff(mctlp, tpr->unitdata_ind.OPT_offset,
1571 			    optlen, __TPI_ALIGN_SIZE);
1572 
1573 			if (opt == NULL) {
1574 				freemsg(mctlp);
1575 				error = EPROTO;
1576 				eprintsoline(so, error);
1577 				goto out;
1578 			}
1579 			if (so->so_family == AF_UNIX)
1580 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
1581 			ncontrollen = so_cmsglen(mctlp, opt, optlen,
1582 			    !(flags & MSG_XPG4_2));
1583 			if (controllen != 0)
1584 				controllen = ncontrollen;
1585 			else if (ncontrollen != 0)
1586 				msg->msg_flags |= MSG_CTRUNC;
1587 		} else {
1588 			controllen = 0;
1589 		}
1590 
1591 		if (namelen != 0) {
1592 			/*
1593 			 * Return address to caller.
1594 			 * Caller handles truncation if length
1595 			 * exceeds msg_namelen.
1596 			 * NOTE: AF_UNIX NUL termination is ensured by
1597 			 * the sender's copyin_name().
1598 			 */
1599 			abuf = kmem_alloc(addrlen, KM_SLEEP);
1600 
1601 			bcopy(addr, abuf, addrlen);
1602 			msg->msg_name = abuf;
1603 			msg->msg_namelen = addrlen;
1604 		}
1605 
1606 		if (controllen != 0) {
1607 			/*
1608 			 * Return control msg to caller.
1609 			 * Caller handles truncation if length
1610 			 * exceeds msg_controllen.
1611 			 */
1612 			control = kmem_zalloc(controllen, KM_SLEEP);
1613 
1614 			error = so_opt2cmsg(mctlp, opt, optlen,
1615 			    !(flags & MSG_XPG4_2), control, controllen);
1616 			if (error) {
1617 				freemsg(mctlp);
1618 				if (msg->msg_namelen != 0)
1619 					kmem_free(msg->msg_name,
1620 					    msg->msg_namelen);
1621 				kmem_free(control, controllen);
1622 				eprintsoline(so, error);
1623 				goto out;
1624 			}
1625 			msg->msg_control = control;
1626 			msg->msg_controllen = controllen;
1627 		}
1628 
1629 		freemsg(mctlp);
1630 		goto out;
1631 	}
1632 	case T_OPTDATA_IND: {
1633 		struct T_optdata_req *tdr;
1634 		void *opt;
1635 		t_uscalar_t optlen;
1636 
1637 		tdr = (struct T_optdata_req *)mctlp->b_rptr;
1638 		optlen = tdr->OPT_length;
1639 		if (optlen != 0) {
1640 			t_uscalar_t ncontrollen;
1641 			/*
1642 			 * Determine how large cmsg buffer is needed.
1643 			 */
1644 			opt = sogetoff(mctlp,
1645 			    tpr->optdata_ind.OPT_offset, optlen,
1646 			    __TPI_ALIGN_SIZE);
1647 
1648 			if (opt == NULL) {
1649 				freemsg(mctlp);
1650 				error = EPROTO;
1651 				eprintsoline(so, error);
1652 				goto out;
1653 			}
1654 
1655 			ncontrollen = so_cmsglen(mctlp, opt, optlen,
1656 			    !(flags & MSG_XPG4_2));
1657 			if (controllen != 0)
1658 				controllen = ncontrollen;
1659 			else if (ncontrollen != 0)
1660 				msg->msg_flags |= MSG_CTRUNC;
1661 		} else {
1662 			controllen = 0;
1663 		}
1664 
1665 		if (controllen != 0) {
1666 			/*
1667 			 * Return control msg to caller.
1668 			 * Caller handles truncation if length
1669 			 * exceeds msg_controllen.
1670 			 */
1671 			control = kmem_zalloc(controllen, KM_SLEEP);
1672 
1673 			error = so_opt2cmsg(mctlp, opt, optlen,
1674 			    !(flags & MSG_XPG4_2), control, controllen);
1675 			if (error) {
1676 				freemsg(mctlp);
1677 				kmem_free(control, controllen);
1678 				eprintsoline(so, error);
1679 				goto out;
1680 			}
1681 			msg->msg_control = control;
1682 			msg->msg_controllen = controllen;
1683 		}
1684 
1685 		/*
1686 		 * Set msg_flags to MSG_EOR based on
1687 		 * DATA_flag and MOREDATA.
1688 		 */
1689 		mutex_enter(&so->so_lock);
1690 		so->so_state &= ~SS_SAVEDEOR;
1691 		if (!(tpr->data_ind.MORE_flag & 1)) {
1692 			if (!(rval.r_val1 & MOREDATA))
1693 				msg->msg_flags |= MSG_EOR;
1694 			else
1695 				so->so_state |= SS_SAVEDEOR;
1696 		}
1697 		freemsg(mctlp);
1698 		/*
1699 		 * If some data was received (i.e. not EOF) and the
1700 		 * read/recv* has not been satisfied wait for some more.
1701 		 * Not possible to wait if control info was received.
1702 		 */
1703 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
1704 		    controllen == 0 &&
1705 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
1706 			mutex_exit(&so->so_lock);
1707 			flags |= MSG_NOMARK;
1708 			goto retry;
1709 		}
1710 		goto out_locked;
1711 	}
1712 	default:
1713 		cmn_err(CE_CONT, "so_recvmsg bad type %x \n",
1714 		    tpr->type);
1715 		freemsg(mctlp);
1716 		error = EPROTO;
1717 		ASSERT(0);
1718 	}
1719 out:
1720 	mutex_enter(&so->so_lock);
1721 out_locked:
1722 	ret = sod_rcv_done(so, suiop, uiop);
1723 	if (ret != 0 && error == 0)
1724 		error = ret;
1725 
1726 	so_unlock_read(so);	/* Clear SOREADLOCKED */
1727 	mutex_exit(&so->so_lock);
1728 
1729 	SO_UNBLOCK_FALLBACK(so);
1730 
1731 	return (error);
1732 }
1733 
/*
 * Socket operations vector built from the so_* functions.
 *
 * The initializer is positional, so the entries must stay in the same
 * order as the members of sonodeops_t; the comment next to each entry
 * names the slot it is meant to fill.
 */
sonodeops_t so_sonodeops = {
	so_init,		/* sop_init	*/
	so_accept,		/* sop_accept   */
	so_bind,		/* sop_bind	*/
	so_listen,		/* sop_listen   */
	so_connect,		/* sop_connect  */
	so_recvmsg,		/* sop_recvmsg  */
	so_sendmsg,		/* sop_sendmsg  */
	so_sendmblk,		/* sop_sendmblk */
	so_getpeername,		/* sop_getpeername */
	so_getsockname,		/* sop_getsockname */
	so_shutdown,		/* sop_shutdown */
	so_getsockopt,		/* sop_getsockopt */
	so_setsockopt,		/* sop_setsockopt */
	so_ioctl,		/* sop_ioctl    */
	so_poll,		/* sop_poll	*/
	so_close,		/* sop_close */
};
1752 
/*
 * Upcall vector built from the so_* upcall functions, which take the
 * socket as a sock_upper_handle_t (presumably this is the table that is
 * handed to the protocol so that it can call back into sockfs).
 *
 * The initializer is positional, so the entries must stay in the same
 * order as the members of sock_upcalls_t.
 */
sock_upcalls_t so_upcalls = {
	so_newconn,
	so_connected,
	so_disconnected,
	so_opctl,
	so_queue_msg,
	so_set_prop,
	so_txq_full,
	so_signal_oob,
	so_zcopy_notify,
	so_set_error
};
1765