/*	$OpenBSD: uipc_socket.c,v 1.326 2024/04/02 12:21:39 mvs Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>
#include <sys/refcnt.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
void	soreaper(void *);
void	soput(void *);
int	somove(struct socket *, int);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);

int	filt_sowmodify(struct kevent *kev, struct knote *kn);
int	filt_sowprocess(struct knote *kn, struct kevent *kev);

int	filt_sormodify(struct kevent *kev, struct knote *kn);
int	filt_sorprocess(struct knote *kn, struct kevent *kev);

const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
	.f_modify	= filt_sowmodify,
	.f_process	= filt_sowprocess,
};

const struct filterops soexcept_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soexcept,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif

void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}

struct socket *
soalloc(const struct protosw *prp, int wait)
{
	const struct domain *dp = prp->pr_domain;
	struct socket *so;

	so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
	    PR_ZERO);
	if (so == NULL)
		return (NULL);
	rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
	refcnt_init(&so->so_refcnt);
	mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR);
	mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR);
	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
	klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);

	switch (dp->dom_family) {
	case AF_INET:
	case AF_INET6:
		switch (prp->pr_type) {
		case SOCK_DGRAM:
			so->so_rcv.sb_flags |= SB_OWNLOCK;
			/* FALLTHROUGH */
		case SOCK_RAW:
			so->so_rcv.sb_flags |= SB_MTXLOCK;
			break;
		}
		break;
	case AF_UNIX:
		so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
		break;
	}

	return (so);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreqs == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(prp, M_WAIT);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	solock(so);
	error = pru_attach(so, proto, M_WAIT);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, 0);
		return (error);
	}
	sounlock(so);
	*aso = so;
	return (0);
}

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	soassertlocked(so);
	return pru_bind(so, nam, p);
}

int
solisten(struct socket *so, int backlog)
{
	int somaxconn_local = READ_ONCE(somaxconn);
	int sominconn_local = READ_ONCE(sominconn);
	int error;

	switch (so->so_type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		break;
	default:
		return (EOPNOTSUPP);
	}

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = pru_listen(so);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn_local)
		backlog = somaxconn_local;
	if (backlog < sominconn_local)
		backlog = sominconn_local;
	so->so_qlimit = backlog;
	return (0);
}
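
/*
 * Illustrative userland counterpart of the backlog clamping above: the
 * kernel forces the queue limit into [sominconn, somaxconn], and a
 * negative or oversized backlog falls back to somaxconn first.
 */
#if 0
#include <sys/socket.h>
#include <err.h>

void
start_listening(int s)
{
	/* Passing -1 is clamped by solisten() to somaxconn, so this
	 * simply requests the maximum allowed queue length. */
	if (listen(s, -1) == -1)
		err(1, "listen");
}
#endif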

#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
void
sofree(struct socket *so, int keep_lock)
{
	int persocket = solock_persocket(so);

	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		if (!keep_lock)
			sounlock(so);
		return;
	}
	if (so->so_head) {
		struct socket *head = so->so_head;

		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (so->so_onq == &head->so_q) {
			if (!keep_lock)
				sounlock(so);
			return;
		}

		if (persocket) {
			/*
			 * Concurrent close of `head' could
			 * abort `so' due to re-lock.
			 */
			soref(so);
			soref(head);
			sounlock(so);
			solock(head);
			solock(so);

			if (so->so_onq != &head->so_q0) {
				sounlock(head);
				sounlock(so);
				sorele(head);
				sorele(so);
				return;
			}

			sorele(head);
			sorele(so);
		}

		soqremque(so, 0);

		if (persocket)
			sounlock(head);
	}

	if (persocket) {
		sounlock(so);
		refcnt_finalize(&so->so_refcnt, "sofinal");
		solock(so);
	}

	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);
#ifdef SOCKET_SPLICE
	if (issplicedback(so)) {
		int freeing = SOSP_FREEING_WRITE;

		if (so->so_sp->ssp_soback == so)
			freeing |= SOSP_FREEING_READ;
		sounsplice(so->so_sp->ssp_soback, so, freeing);
	}
	if (isspliced(so)) {
		int freeing = SOSP_FREEING_READ;

		if (so == so->so_sp->ssp_socket)
			freeing |= SOSP_FREEING_WRITE;
		sounsplice(so, so->so_sp->ssp_socket, freeing);
	}
#endif /* SOCKET_SPLICE */
	sbrelease(so, &so->so_snd);
	sorflush(so);
	if (!keep_lock)
		sounlock(so);
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		/* Reuse splice idle, sounsplice() has been called before. */
		timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so);
		timeout_add(&so->so_sp->ssp_idleto, 0);
	} else
#endif /* SOCKET_SPLICE */
	{
		pool_put(&socket_pool, so);
	}
}

static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		error2 = pru_detach(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		int persocket = solock_persocket(so);

		if (persocket) {
			/* Wait for concurrent sonewconn() threads. */
			while (so->so_newconn > 0) {
				so->so_state |= SS_NEWCONN_WAIT;
				sosleep_nsec(so, &so->so_newconn, PSOCK,
				    "newcon", INFSLP);
			}
		}

		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 0);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 1);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;
	/* sofree() calls sounlock(). */
	sofree(so, 0);
	return (error);
}
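
/*
 * The SO_LINGER branch above is what makes close(2) sleep in "netcls"
 * for up to so_linger seconds while the disconnect drains.  A hedged
 * userland sketch (assumes s is a connected stream socket):
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>
#include <err.h>

void
close_lingering(int s)
{
	struct linger l = { .l_onoff = 1, .l_linger = 5 };

	if (setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) == -1)
		err(1, "setsockopt");
	/* close(2) now blocks in soclose() until the connection is torn
	 * down or the 5 second linger timeout fires. */
	close(s);
}
#endif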

void
soabort(struct socket *so)
{
	soassertlocked(so);
	pru_abort(so);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = pru_accept(so, nam);
	else
		error = ECONNABORTED;
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = pru_connect(so, nam);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int persocket, error;

	if ((persocket = solock_persocket(so1)))
		solock_pair(so1, so2);
	else
		solock(so1);

	error = pru_connect2(so1, so2);

	if (persocket)
		sounlock(so2);
	sounlock(so1);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = pru_disconnect(so);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

	solock_shared(so);
restart:
	if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	so->so_snd.sb_state |= SS_ISSENDING;
	do {
		if (so->so_snd.sb_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(so, &so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_snd.sb_state &= ~SS_ISSENDING;
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				sounlock_shared(so);
				error = m_getuio(&top, atomic, space, uio);
				solock_shared(so);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_snd.sb_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			if (flags & MSG_OOB)
				error = pru_sendoob(so, top, addr, control);
			else
				error = pru_send(so, top, addr, control);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_snd.sb_state &= ~SS_ISSENDING;
	sbunlock(so, &so->so_snd);
out:
	sounlock_shared(so);
	m_freem(top);
	m_freem(control);
	return (error);
}
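
/*
 * The buffer space check in sosend() is where a non-blocking sender gets
 * EWOULDBLOCK instead of sleeping in sbwait().  A minimal userland retry
 * loop over send(2), illustrative only:
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <poll.h>
#include <errno.h>
#include <err.h>

ssize_t
send_all(int s, const char *buf, size_t len)
{
	size_t off = 0;

	while (off < len) {
		ssize_t n = send(s, buf + off, len - off, MSG_DONTWAIT);

		if (n == -1) {
			if (errno == EWOULDBLOCK || errno == EINTR) {
				/* Wait until the send buffer drains. */
				struct pollfd pfd = { .fd = s,
				    .events = POLLOUT };
				if (poll(&pfd, 1, -1) == -1)
					err(1, "poll");
				continue;
			}
			return (-1);
		}
		off += n;
	}
	return (off);
}
#endif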

int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
			m->m_pkthdr.len = 0;
			m->m_pkthdr.ph_ifidx = 0;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbufs together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}
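
/*
 * sbsync() maintains the invariants that SBLASTRECORDCHK/SBLASTMBUFCHK
 * verify.  A hedged sketch of the record-level invariant (the helper
 * name sb_assert_lastrecord is hypothetical, not part of this file):
 */
#if 0
static void
sb_assert_lastrecord(struct sockbuf *sb)
{
	struct mbuf *m = sb->sb_mb;

	if (m == NULL) {
		/* Empty buffer: both tail pointers must be cleared. */
		KASSERT(sb->sb_mbtail == NULL);
		KASSERT(sb->sb_lastrecord == NULL);
		return;
	}
	/* Walk the record chain; sb_lastrecord must be its tail. */
	while (m->m_nextpkt != NULL)
		m = m->m_nextpkt;
	KASSERT(sb->sb_lastrecord == m);
}
#endif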

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking the network stack for the entire time here,
 * we release the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		solock(so);
		error = pru_rcvoob(so, m, flags & MSG_PEEK);
		sounlock(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

	solock_shared(so);
restart:
	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
		sounlock_shared(so);
		return (error);
	}
	sb_mtx_lock(&so->so_rcv);

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1: so %p, so_type %d, sb_cc %lu",
			    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		if (so->so_rcv.sb_flags & SB_OWNLOCK) {
			sbunlock_locked(so, &so->so_rcv);
			sounlock_shared(so);
			error = sbwait_locked(so, &so->so_rcv);
			sb_mtx_unlock(&so->so_rcv);
			if (error)
				return (error);
			solock_shared(so);
		} else {
			sb_mtx_unlock(&so->so_rcv);
			sbunlock(so, &so->so_rcv);
			error = sbwait(so, &so->so_rcv);
			if (error) {
				sounlock_shared(so);
				return (error);
			}
		}
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					sb_mtx_unlock(&so->so_rcv);
					sounlock_shared(so);
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
					solock_shared(so);
					sb_mtx_lock(&so->so_rcv);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose) {
					sb_mtx_unlock(&so->so_rcv);
					pr->pr_domain->dom_dispose(cm);
					sb_mtx_lock(&so->so_rcv);
				}
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip)
			controlp = &(*controlp)->m_next;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  The next one can be
			 * received or disposed of by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_rcv.sb_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * The sockbuf must be consistent here (sb_mb points to
		 * the current mbuf, nextrecord to the next record) when
		 * we drop priority; we must note any additions to the
		 * sockbuf when we block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sb_mtx_unlock(&so->so_rcv);
			sounlock_shared(so);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			solock_shared(so);
			sb_mtx_lock(&so->so_rcv);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for a non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * terminates the transfer.  If a signal/timeout occurs,
		 * return with a short count but without error.
		 * Keep the sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_rcv.sb_state & SS_CANTRCVMORE ||
			    so->so_error)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			sb_mtx_unlock(&so->so_rcv);
			error = sbwait(so, &so->so_rcv);
			if (error) {
				sbunlock(so, &so->so_rcv);
				sounlock_shared(so);
				return (0);
			}
			sb_mtx_lock(&so->so_rcv);
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD) {
			sb_mtx_unlock(&so->so_rcv);
			pru_rcvd(so);
			sb_mtx_lock(&so->so_rcv);
		}
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 &&
	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
		sb_mtx_unlock(&so->so_rcv);
		sbunlock(so, &so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sb_mtx_unlock(&so->so_rcv);
	sbunlock(so, &so->so_rcv);
	sounlock_shared(so);
	return (error);
}
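
/*
 * The record layout consumed above (optional address, then control
 * mbufs, then data) is what recvmsg(2) exposes.  An illustrative
 * userland sketch receiving data plus a passed file descriptor over an
 * AF_UNIX socket:
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>
#include <err.h>

int
recv_fd(int s, char *buf, size_t len)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	union {
		struct cmsghdr hdr;
		unsigned char buf[CMSG_SPACE(sizeof(int))];
	} cmsgbuf;
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = &cmsgbuf.buf;
	msg.msg_controllen = sizeof(cmsgbuf.buf);

	if (recvmsg(s, &msg, 0) == -1)
		err(1, "recvmsg");
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS) {
			int fd;

			memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
			return (fd);
		}
	}
	return (-1);
}
#endif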

int
soshutdown(struct socket *so, int how)
{
	int error = 0;

	solock(so);
	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		error = pru_shutdown(so);
		break;
	default:
		error = EINVAL;
		break;
	}
	sounlock(so);

	return (error);
}
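
/*
 * soshutdown() maps directly onto shutdown(2): SHUT_RD only flushes the
 * receive side locally, while SHUT_WR and SHUT_RDWR also issue the
 * protocol-level shutdown via pru_shutdown().  Illustrative use:
 */
#if 0
#include <sys/socket.h>
#include <err.h>

void
half_close(int s)
{
	/* Signal EOF to the peer but keep reading replies. */
	if (shutdown(s, SHUT_WR) == -1)
		err(1, "shutdown");
}
#endif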

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
	/* with SBL_WAIT and SBL_NOINTR sblock() must not fail */
	KASSERT(error == 0);
	socantrcvmore(so);
	mtx_enter(&sb->sb_mtx);
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	     (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	mtx_leave(&sb->sb_mtx);
	sbunlock(so, sb);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}

#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task

int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file	*fp;
	struct socket	*sosp;
	struct sosplice	*sp;
	struct taskq	*tq;
	int		 error = 0;

	soassertlocked(so);

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			if (tq == NULL) {
				rw_exit_write(&sosplice_lock);
				return (ENOMEM);
			}
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	} else {
		/* Ensure the taskq is fully visible on this CPU. */
		membar_consumer();
	}

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (ENOTCONN);
	if (so->so_sp == NULL) {
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (so->so_sp == NULL)
			so->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		/* Lock receive buffer. */
		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
			return (error);
		}
		if (so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
		sbunlock(so, &so->so_rcv);
		return (0);
	}

	if (max && max < 0)
		return (EINVAL);

	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;
	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto frele;
	}
	if (sosp->so_sp == NULL) {
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (sosp->so_sp == NULL)
			sosp->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* Lock both receive and send buffer. */
	if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
		goto frele;
	}
	if ((error = sblock(so, &sosp->so_snd, SBL_WAIT)) != 0) {
		sbunlock(so, &so->so_rcv);
		goto frele;
	}

	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}
	if (sosp->so_options & SO_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}

	/* Splice so and sosp together. */
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_proc(&so->so_idleto, soidle, so);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent softnet interrupt from calling somove() while
	 * we sleep, the socket buffers are not marked as spliced yet.
	 */
	if (somove(so, M_WAIT)) {
		mtx_enter(&so->so_rcv.sb_mtx);
		so->so_rcv.sb_flags |= SB_SPLICE;
		mtx_leave(&so->so_rcv.sb_mtx);
		sosp->so_snd.sb_flags |= SB_SPLICE;
	}

 release:
	sbunlock(sosp, &sosp->so_snd);
	sbunlock(so, &so->so_rcv);
 frele:
	/*
	 * FRELE() must not be called with the socket lock held. It is safe to
	 * release the lock here as long as no other operation happens on the
	 * socket when sosplice() returns. The dance could be avoided by
	 * grabbing the socket lock inside this function.
	 */
	sounlock(so);
	FRELE(fp, curproc);
	solock(so);
	return (error);
}
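
/*
 * sosplice() backs the OpenBSD-specific SO_SPLICE socket option handled
 * in sosetopt() below.  An illustrative userland sketch splicing two
 * connected TCP sockets with a 1 MB maximum and a 30 s idle timeout:
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <err.h>

void
splice_pair(int from, int to)
{
	struct splice sp = {
		.sp_fd = to,
		.sp_max = 1024 * 1024,
		.sp_idle = { .tv_sec = 30, .tv_usec = 0 },
	};

	if (setsockopt(from, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp)) == -1)
		err(1, "setsockopt SO_SPLICE");
	/* Unsplicing works by setting SO_SPLICE with fd -1, which takes
	 * the fd < 0 path in sosplice() above. */
}
#endif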

void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	mtx_leave(&so->so_rcv.sb_mtx);
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

void
soidle(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so);
}

void
sotask(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * We may not sleep here as sofree() and unsplice() may be
		 * called from softnet interrupt context.  This would remove
		 * the socket during somove().
		 */
		somove(so, M_DONTWAIT);
	}
	sounlock(so);

	/* Avoid userland starvation. */
	yield();
}

/*
 * The socket splicing task or idle timeout may sleep while grabbing the net
 * lock.  As sofree() can be called anytime, sotask() or soidle() could access
 * the socket memory of a freed socket after wakeup.  So delay the pool_put()
 * until all pending socket splicing tasks or timeouts have finished.  Do this
 * by scheduling it on the same threads.
 */
void
soreaper(void *arg)
{
	struct socket *so = arg;

	/* Reuse splice task, sounsplice() has been called before. */
	task_set(&so->so_sp->ssp_task, soput, so);
	task_add(sosplice_taskq, &so->so_sp->ssp_task);
}

void
soput(void *arg)
{
	struct socket *so = arg;

	pool_put(&sosplice_pool, so->so_sp);
	pool_put(&socket_pool, so);
}
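
/*
 * A hedged, generic sketch of the deferred-free pattern used by
 * soreaper()/soput(): queue the final pool_put() behind any work still
 * pending on the same serializing taskq.  The names obj_*, obj_pool and
 * obj_taskq are hypothetical:
 */
#if 0
extern struct pool obj_pool;
extern struct taskq *obj_taskq;

struct obj {
	struct task	o_task;
	/* ... */
};

void
obj_put(void *arg)
{
	struct obj *o = arg;

	/* Runs only after all previously queued work has finished. */
	pool_put(&obj_pool, o);
}

void
obj_reaper(struct obj *o)
{
	/* Re-arm the object's task so the free executes on the same
	 * taskq thread as any still-pending work. */
	task_set(&o->o_task, obj_put, o);
	task_add(obj_taskq, &o->o_task);
}
#endif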

/*
 * Move data from the receive buffer of the spliced source socket to the
 * send buffer of the drain socket.  Try to move as much as possible in
 * one big chunk.  This is a TCP-only implementation.
 * A return value of 0 means splicing has finished; 1 means it should
 * continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket	*sosp = so->so_sp->ssp_socket;
	struct mbuf	*m, **mp, *nextrecord;
	u_long		 len, off, oobmark;
	long		 space;
	int		 error = 0, maxreached = 0;
	unsigned int	 rcvstate;

	soassertlocked(so);

 nextpkt:
	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
	    sosp->so_error != EFBIG && sosp->so_error != ELOOP) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace(sosp, &sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_snd.sb_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		sbdroprecord(so, &so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			pru_rcvd(so);
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve splicing with an error if a loop is detected by the
	 * counter.
	 *
	 * If we deal with a looped broadcast/multicast packet we bail out
	 * with no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && so->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(so, &so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD)
		pru_rcvd(so);

	/* Receive buffer did shrink by len bytes, adjust oob. */
	mtx_enter(&so->so_rcv.sb_mtx);
	rcvstate = so->so_rcv.sb_state;
	so->so_rcv.sb_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_rcv.sb_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}
	mtx_leave(&so->so_rcv.sb_mtx);

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((rcvstate & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (rcvstate & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			rcvstate &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				error = pru_send(sosp, m, NULL, NULL);
				if (error) {
					if (sosp->so_snd.sb_state &
					    SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);
			error = pru_sendoob(sosp, o, NULL, NULL);
			if (error) {
				if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					rcvstate |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_snd.sb_state &= ~SS_ISSENDING;
	error = pru_send(sosp, m, NULL, NULL);
	if (error) {
		if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

 release:
	sosp->so_snd.sb_state &= ~SS_ISSENDING;
	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		so->so_error = error;
	if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
	    so->so_rcv.sb_cc == 0) ||
	    (sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
	    maxreached || error) {
		sounsplice(so, sosp, 0);
		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}
1790 
1791 #endif /* SOCKET_SPLICE */
1792 
1793 void
1794 sorwakeup(struct socket *so)
1795 {
1796 	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
1797 		soassertlocked_readonly(so);
1798 
1799 #ifdef SOCKET_SPLICE
1800 	if (so->so_rcv.sb_flags & SB_SPLICE) {
1801 		/*
1802 		 * TCP has a sendbuffer that can handle multiple packets
1803 		 * at once.  So queue the stream a bit to accumulate data.
1804 		 * The sosplice thread will call somove() later and send
1805 		 * the packets calling tcp_output() only once.
1806 		 * In the UDP case, send out the packets immediately.
1807 		 * Using a thread would make things slower.
1808 		 */
1809 		if (so->so_proto->pr_flags & PR_WANTRCVD)
1810 			task_add(sosplice_taskq, &so->so_splicetask);
1811 		else
1812 			somove(so, M_DONTWAIT);
1813 	}
1814 	if (isspliced(so))
1815 		return;
1816 #endif
1817 	sowakeup(so, &so->so_rcv);
1818 	if (so->so_upcall)
1819 		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
1820 }
1821 
1822 void
1823 sowwakeup(struct socket *so)
1824 {
1825 	soassertlocked_readonly(so);
1826 
1827 #ifdef SOCKET_SPLICE
1828 	if (so->so_snd.sb_flags & SB_SPLICE)
1829 		task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
1830 	if (issplicedback(so))
1831 		return;
1832 #endif
1833 	sowakeup(so, &so->so_snd);
1834 }
1835 
1836 int
1837 sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
1838 {
1839 	int error = 0;
1840 
1841 	if (level != SOL_SOCKET) {
1842 		if (so->so_proto->pr_ctloutput) {
1843 			solock(so);
1844 			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
1845 			    level, optname, m);
1846 			sounlock(so);
1847 			return (error);
1848 		}
1849 		error = ENOPROTOOPT;
1850 	} else {
1851 		switch (optname) {
1852 
1853 		case SO_LINGER:
1854 			if (m == NULL || m->m_len != sizeof (struct linger) ||
1855 			    mtod(m, struct linger *)->l_linger < 0 ||
1856 			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
1857 				return (EINVAL);
1858 
1859 			solock(so);
1860 			so->so_linger = mtod(m, struct linger *)->l_linger;
1861 			if (*mtod(m, int *))
1862 				so->so_options |= optname;
1863 			else
1864 				so->so_options &= ~optname;
1865 			sounlock(so);
1866 
1867 			break;
1868 		case SO_BINDANY:
1869 			if ((error = suser(curproc)) != 0)	/* XXX */
1870 				return (error);
1871 			/* FALLTHROUGH */
1872 
1873 		case SO_DEBUG:
1874 		case SO_KEEPALIVE:
1875 		case SO_USELOOPBACK:
1876 		case SO_BROADCAST:
1877 		case SO_REUSEADDR:
1878 		case SO_REUSEPORT:
1879 		case SO_OOBINLINE:
1880 		case SO_TIMESTAMP:
1881 		case SO_ZEROIZE:
1882 			if (m == NULL || m->m_len < sizeof (int))
1883 				return (EINVAL);
1884 
1885 			solock(so);
1886 			if (*mtod(m, int *))
1887 				so->so_options |= optname;
1888 			else
1889 				so->so_options &= ~optname;
1890 			sounlock(so);
1891 
1892 			break;
1893 		case SO_DONTROUTE:
1894 			if (m == NULL || m->m_len < sizeof (int))
1895 				return (EINVAL);
1896 			if (*mtod(m, int *))
1897 				error = EOPNOTSUPP;
1898 			break;
1899 
1900 		case SO_SNDBUF:
1901 		case SO_RCVBUF:
1902 		case SO_SNDLOWAT:
1903 		case SO_RCVLOWAT:
1904 		    {
1905 			struct sockbuf *sb = (optname == SO_SNDBUF ||
1906 			    optname == SO_SNDLOWAT ?
1907 			    &so->so_snd : &so->so_rcv);
1908 			u_long cnt;
1909 
1910 			if (m == NULL || m->m_len < sizeof (int))
1911 				return (EINVAL);
1912 			cnt = *mtod(m, int *);
1913 			if ((long)cnt <= 0)
1914 				cnt = 1;
1915 
1916 			solock(so);
1917 			mtx_enter(&sb->sb_mtx);
1918 
1919 			switch (optname) {
1920 			case SO_SNDBUF:
1921 			case SO_RCVBUF:
1922 				if (sb->sb_state &
1923 				    (SS_CANTSENDMORE | SS_CANTRCVMORE)) {
1924 					error = EINVAL;
1925 					break;
1926 				}
1927 				if (sbcheckreserve(cnt, sb->sb_wat) ||
1928 				    sbreserve(so, sb, cnt)) {
1929 					error = ENOBUFS;
1930 					break;
1931 				}
1932 				sb->sb_wat = cnt;
1933 				break;
1934 			case SO_SNDLOWAT:
1935 			case SO_RCVLOWAT:
1936 				sb->sb_lowat = (cnt > sb->sb_hiwat) ?
1937 				    sb->sb_hiwat : cnt;
1938 				break;
1939 			}
1940 
1941 			mtx_leave(&sb->sb_mtx);
1942 			sounlock(so);
1943 
1944 			break;
1945 		    }
1946 
1947 		case SO_SNDTIMEO:
1948 		case SO_RCVTIMEO:
1949 		    {
1950 			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
1951 			    &so->so_snd : &so->so_rcv);
1952 			struct timeval tv;
1953 			uint64_t nsecs;
1954 
1955 			if (m == NULL || m->m_len < sizeof (tv))
1956 				return (EINVAL);
1957 			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
1958 			if (!timerisvalid(&tv))
1959 				return (EINVAL);
1960 			nsecs = TIMEVAL_TO_NSEC(&tv);
1961 			if (nsecs == UINT64_MAX)
1962 				return (EDOM);
1963 			if (nsecs == 0)
1964 				nsecs = INFSLP;
1965 
1966 			mtx_enter(&sb->sb_mtx);
1967 			sb->sb_timeo_nsecs = nsecs;
1968 			mtx_leave(&sb->sb_mtx);
1969 			break;
1970 		    }
1971 
1972 		case SO_RTABLE:
1973 			if (so->so_proto->pr_domain &&
1974 			    so->so_proto->pr_domain->dom_protosw &&
1975 			    so->so_proto->pr_ctloutput) {
1976 				const struct domain *dom =
1977 				    so->so_proto->pr_domain;
1978 
1979 				level = dom->dom_protosw->pr_protocol;
1980 				solock(so);
1981 				error = (*so->so_proto->pr_ctloutput)
1982 				    (PRCO_SETOPT, so, level, optname, m);
1983 				sounlock(so);
1984 			} else
1985 				error = ENOPROTOOPT;
1986 			break;
1987 #ifdef SOCKET_SPLICE
1988 		case SO_SPLICE:
1989 			solock(so);
1990 			if (m == NULL) {
1991 				error = sosplice(so, -1, 0, NULL);
1992 			} else if (m->m_len < sizeof(int)) {
1993 				error = EINVAL;
1994 			} else if (m->m_len < sizeof(struct splice)) {
1995 				error = sosplice(so, *mtod(m, int *), 0, NULL);
1996 			} else {
1997 				error = sosplice(so,
1998 				    mtod(m, struct splice *)->sp_fd,
1999 				    mtod(m, struct splice *)->sp_max,
2000 				   &mtod(m, struct splice *)->sp_idle);
2001 			}
2002 			sounlock(so);
2003 			break;
2004 #endif /* SOCKET_SPLICE */
2005 
2006 		default:
2007 			error = ENOPROTOOPT;
2008 			break;
2009 		}
2010 	}
2011 
2012 	return (error);
2013 }
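/*
 * Illustrative userland sketch (not kernel code) of driving the
 * SOL_SOCKET cases above.  The descriptors s, from and to are
 * hypothetical; a zero sp_max and an unset sp_idle are assumed to
 * mean "no byte limit" and "no idle timeout" for the splice, and a
 * {0,0} timeout means wait forever (nsecs == 0 maps to INFSLP above).
 *
 *	#include <sys/types.h>
 *	#include <sys/time.h>
 *	#include <sys/socket.h>
 *
 *	int
 *	tune_and_splice(int s, int from, int to)
 *	{
 *		struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *		struct timeval tv = { .tv_sec = 2 };
 *		struct splice sp = { .sp_fd = to };
 *
 *		// Linger up to 5 seconds on close(2).
 *		if (setsockopt(s, SOL_SOCKET, SO_LINGER, &l,
 *		    sizeof(l)) == -1)
 *			return (-1);
 *		// 2 second receive timeout.
 *		if (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv,
 *		    sizeof(tv)) == -1)
 *			return (-1);
 *		// Splice "from" into "to" inside the kernel.
 *		return (setsockopt(from, SOL_SOCKET, SO_SPLICE, &sp,
 *		    sizeof(sp)));
 *	}
 */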
2014 
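/*
 * Get a socket option.  As with sosetopt(), non-SOL_SOCKET levels
 * are delegated to the protocol's ctloutput handler; SOL_SOCKET
 * options are copied out of the socket into the caller's mbuf.
 */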
2015 int
2016 sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
2017 {
2018 	int error = 0;
2019 
2020 	if (level != SOL_SOCKET) {
2021 		if (so->so_proto->pr_ctloutput) {
2022 			m->m_len = 0;
2023 
2024 			solock(so);
2025 			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
2026 			    level, optname, m);
2027 			sounlock(so);
2028 			return (error);
2029 		} else
2030 			return (ENOPROTOOPT);
2031 	} else {
2032 		m->m_len = sizeof (int);
2033 
2034 		switch (optname) {
2035 
2036 		case SO_LINGER:
2037 			m->m_len = sizeof (struct linger);
2038 			solock_shared(so);
2039 			mtod(m, struct linger *)->l_onoff =
2040 				so->so_options & SO_LINGER;
2041 			mtod(m, struct linger *)->l_linger = so->so_linger;
2042 			sounlock_shared(so);
2043 			break;
2044 
2045 		case SO_BINDANY:
2046 		case SO_USELOOPBACK:
2047 		case SO_DEBUG:
2048 		case SO_KEEPALIVE:
2049 		case SO_REUSEADDR:
2050 		case SO_REUSEPORT:
2051 		case SO_BROADCAST:
2052 		case SO_OOBINLINE:
2053 		case SO_TIMESTAMP:
2054 		case SO_ZEROIZE:
2055 			*mtod(m, int *) = so->so_options & optname;
2056 			break;
2057 
2058 		case SO_DONTROUTE:
2059 			*mtod(m, int *) = 0;
2060 			break;
2061 
2062 		case SO_TYPE:
2063 			*mtod(m, int *) = so->so_type;
2064 			break;
2065 
2066 		case SO_ERROR:
2067 			solock(so);
2068 			*mtod(m, int *) = so->so_error;
2069 			so->so_error = 0;
2070 			sounlock(so);
2071 
2072 			break;
2073 
2074 		case SO_DOMAIN:
2075 			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
2076 			break;
2077 
2078 		case SO_PROTOCOL:
2079 			*mtod(m, int *) = so->so_proto->pr_protocol;
2080 			break;
2081 
2082 		case SO_SNDBUF:
2083 			*mtod(m, int *) = so->so_snd.sb_hiwat;
2084 			break;
2085 
2086 		case SO_RCVBUF:
2087 			*mtod(m, int *) = so->so_rcv.sb_hiwat;
2088 			break;
2089 
2090 		case SO_SNDLOWAT:
2091 			*mtod(m, int *) = so->so_snd.sb_lowat;
2092 			break;
2093 
2094 		case SO_RCVLOWAT:
2095 			*mtod(m, int *) = so->so_rcv.sb_lowat;
2096 			break;
2097 
2098 		case SO_SNDTIMEO:
2099 		case SO_RCVTIMEO:
2100 		    {
2101 			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
2102 			    &so->so_snd : &so->so_rcv);
2103 			struct timeval tv;
2104 			uint64_t nsecs;
2105 
2106 			mtx_enter(&sb->sb_mtx);
2107 			nsecs = sb->sb_timeo_nsecs;
2108 			mtx_leave(&sb->sb_mtx);
2109 
2110 			m->m_len = sizeof(struct timeval);
2111 			memset(&tv, 0, sizeof(tv));
2112 			if (nsecs != INFSLP)
2113 				NSEC_TO_TIMEVAL(nsecs, &tv);
2114 			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
2115 			break;
2116 		    }
2117 
2118 		case SO_RTABLE:
2119 			if (so->so_proto->pr_domain &&
2120 			    so->so_proto->pr_domain->dom_protosw &&
2121 			    so->so_proto->pr_ctloutput) {
2122 				const struct domain *dom =
2123 				    so->so_proto->pr_domain;
2124 
2125 				level = dom->dom_protosw->pr_protocol;
2126 				solock(so);
2127 				error = (*so->so_proto->pr_ctloutput)
2128 				    (PRCO_GETOPT, so, level, optname, m);
2129 				sounlock(so);
2130 				if (error)
2131 					return (error);
2132 				break;
2133 			}
2134 			return (ENOPROTOOPT);
2135 
2136 #ifdef SOCKET_SPLICE
2137 		case SO_SPLICE:
2138 		    {
2139 			off_t len;
2140 
2141 			m->m_len = sizeof(off_t);
2142 			solock_shared(so);
2143 			len = so->so_sp ? so->so_sp->ssp_len : 0;
2144 			sounlock_shared(so);
2145 			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
2146 			break;
2147 		    }
2148 #endif /* SOCKET_SPLICE */
2149 
2150 		case SO_PEERCRED:
2151 			if (so->so_proto->pr_protocol == AF_UNIX) {
2152 				struct unpcb *unp = sotounpcb(so);
2153 
2154 				solock(so);
2155 				if (unp->unp_flags & UNP_FEIDS) {
2156 					m->m_len = sizeof(unp->unp_connid);
2157 					memcpy(mtod(m, caddr_t),
2158 					    &(unp->unp_connid), m->m_len);
2159 					sounlock(so);
2160 					break;
2161 				}
2162 				sounlock(so);
2163 
2164 				return (ENOTCONN);
2165 			}
2166 			return (EOPNOTSUPP);
2167 
2168 		default:
2169 			return (ENOPROTOOPT);
2170 		}
2171 		return (0);
2172 	}
2173 }
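/*
 * Illustrative userland sketch (not kernel code): reading two of the
 * options above.  SO_ERROR is destructive, the pending error is
 * cleared once read, and SO_SPLICE reports the bytes moved so far as
 * an off_t.  The descriptors s and from are hypothetical.
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <err.h>
 *	#include <stdio.h>
 *
 *	void
 *	query(int s, int from)
 *	{
 *		int error;
 *		off_t moved;
 *		socklen_t len = sizeof(error);
 *
 *		if (getsockopt(s, SOL_SOCKET, SO_ERROR, &error,
 *		    &len) == 0 && error != 0)
 *			warnx("pending socket error: %d", error);
 *		len = sizeof(moved);
 *		if (getsockopt(from, SOL_SOCKET, SO_SPLICE, &moved,
 *		    &len) == 0)
 *			printf("spliced %lld bytes\n", (long long)moved);
 *	}
 */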
2174 
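/*
 * Note the arrival of out-of-band data: post SIGURG to the owning
 * process or process group and wake read-side kqueue listeners.
 */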
2175 void
2176 sohasoutofband(struct socket *so)
2177 {
2178 	pgsigio(&so->so_sigio, SIGURG, 0);
2179 	knote(&so->so_rcv.sb_klist, 0);
2180 }
2181 
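/*
 * Lock a socket for kqueue filter handling: inet sockets are
 * protected by the shared net lock, all other domains by the
 * per-socket rwlock.  The sockbuf mutex is taken in addition;
 * sofilt_unlock() releases in the reverse order.
 */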
2182 void
2183 sofilt_lock(struct socket *so, struct sockbuf *sb)
2184 {
2185 	switch (so->so_proto->pr_domain->dom_family) {
2186 	case PF_INET:
2187 	case PF_INET6:
2188 		NET_LOCK_SHARED();
2189 		break;
2190 	default:
2191 		rw_enter_write(&so->so_lock);
2192 		break;
2193 	}
2194 
2195 	mtx_enter(&sb->sb_mtx);
2196 }
2197 
2198 void
2199 sofilt_unlock(struct socket *so, struct sockbuf *sb)
2200 {
2201 	mtx_leave(&sb->sb_mtx);
2202 
2203 	switch (so->so_proto->pr_domain->dom_family) {
2204 	case PF_INET:
2205 	case PF_INET6:
2206 		NET_UNLOCK_SHARED();
2207 		break;
2208 	default:
2209 		rw_exit_write(&so->so_lock);
2210 		break;
2211 	}
2212 }
2213 
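/*
 * Attach a knote to a socket: pick the filter ops and the matching
 * socket buffer (receive for read/except, send for write) and hook
 * the knote into that buffer's klist.
 */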
2214 int
2215 soo_kqfilter(struct file *fp, struct knote *kn)
2216 {
2217 	struct socket *so = kn->kn_fp->f_data;
2218 	struct sockbuf *sb;
2219 
2220 	switch (kn->kn_filter) {
2221 	case EVFILT_READ:
2222 		kn->kn_fop = &soread_filtops;
2223 		sb = &so->so_rcv;
2224 		break;
2225 	case EVFILT_WRITE:
2226 		kn->kn_fop = &sowrite_filtops;
2227 		sb = &so->so_snd;
2228 		break;
2229 	case EVFILT_EXCEPT:
2230 		kn->kn_fop = &soexcept_filtops;
2231 		sb = &so->so_rcv;
2232 		break;
2233 	default:
2234 		return (EINVAL);
2235 	}
2236 
2237 	klist_insert(&sb->sb_klist, kn);
2238 
2239 	return (0);
2240 }
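/*
 * Illustrative userland sketch (not kernel code): registering the
 * read filter attached above, using NOTE_LOWAT to override the
 * receive low-water mark that filt_soread() checks via kn_sdata.
 * The descriptor s is hypothetical.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *
 *	int
 *	watch_readable(int s)
 *	{
 *		struct kevent kev;
 *		int kq;
 *
 *		if ((kq = kqueue()) == -1)
 *			return (-1);
 *		// Fire once at least 128 bytes are buffered.
 *		EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128,
 *		    NULL);
 *		return (kevent(kq, &kev, 1, NULL, 0, NULL));
 *	}
 */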
2241 
2242 void
2243 filt_sordetach(struct knote *kn)
2244 {
2245 	struct socket *so = kn->kn_fp->f_data;
2246 
2247 	klist_remove(&so->so_rcv.sb_klist, kn);
2248 }
2249 
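/*
 * Read filter event check.  Listening sockets report the length of
 * the completed connection queue; other sockets report the bytes
 * buffered in so_rcv and fire on EOF, a pending error, or data
 * reaching the (possibly NOTE_LOWAT-overridden) low-water mark.
 * Spliced sockets never report readable, their data is consumed in
 * the kernel.
 */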
2250 int
2251 filt_soread(struct knote *kn, long hint)
2252 {
2253 	struct socket *so = kn->kn_fp->f_data;
2254 	int rv = 0;
2255 
2256 	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
2257 	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
2258 		soassertlocked_readonly(so);
2259 
2260 	if (so->so_options & SO_ACCEPTCONN) {
2261 		if (so->so_rcv.sb_flags & SB_MTXLOCK)
2262 			soassertlocked_readonly(so);
2263 
2264 		kn->kn_data = so->so_qlen;
2265 		rv = (kn->kn_data != 0);
2266 
2267 		if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
2268 			if (so->so_state & SS_ISDISCONNECTED) {
2269 				kn->kn_flags |= __EV_HUP;
2270 				rv = 1;
2271 			} else {
2272 				rv = soreadable(so);
2273 			}
2274 		}
2275 
2276 		return rv;
2277 	}
2278 
2279 	kn->kn_data = so->so_rcv.sb_cc;
2280 #ifdef SOCKET_SPLICE
2281 	if (isspliced(so)) {
2282 		rv = 0;
2283 	} else
2284 #endif /* SOCKET_SPLICE */
2285 	if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
2286 		kn->kn_flags |= EV_EOF;
2287 		if (kn->kn_flags & __EV_POLL) {
2288 			if (so->so_state & SS_ISDISCONNECTED)
2289 				kn->kn_flags |= __EV_HUP;
2290 		}
2291 		kn->kn_fflags = so->so_error;
2292 		rv = 1;
2293 	} else if (so->so_error) {
2294 		rv = 1;
2295 	} else if (kn->kn_sfflags & NOTE_LOWAT) {
2296 		rv = (kn->kn_data >= kn->kn_sdata);
2297 	} else {
2298 		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
2299 	}
2300 
2301 	return rv;
2302 }
2303 
2304 void
2305 filt_sowdetach(struct knote *kn)
2306 {
2307 	struct socket *so = kn->kn_fp->f_data;
2308 
2309 	klist_remove(&so->so_snd.sb_klist, kn);
2310 }
2311 
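/*
 * Write filter event check: report the space left in so_snd and fire
 * on shutdown of the send side, a pending error, or space reaching
 * the low-water mark, but not before a connection-oriented socket is
 * connected.
 */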
2312 int
2313 filt_sowrite(struct knote *kn, long hint)
2314 {
2315 	struct socket *so = kn->kn_fp->f_data;
2316 	int rv;
2317 
2318 	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
2319 	soassertlocked_readonly(so);
2320 
2321 	kn->kn_data = sbspace(so, &so->so_snd);
2322 	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
2323 		kn->kn_flags |= EV_EOF;
2324 		if (kn->kn_flags & __EV_POLL) {
2325 			if (so->so_state & SS_ISDISCONNECTED)
2326 				kn->kn_flags |= __EV_HUP;
2327 		}
2328 		kn->kn_fflags = so->so_error;
2329 		rv = 1;
2330 	} else if (so->so_error) {
2331 		rv = 1;
2332 	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2333 	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2334 		rv = 0;
2335 	} else if (kn->kn_sfflags & NOTE_LOWAT) {
2336 		rv = (kn->kn_data >= kn->kn_sdata);
2337 	} else {
2338 		rv = (kn->kn_data >= so->so_snd.sb_lowat);
2339 	}
2340 
2341 	return (rv);
2342 }
2343 
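/*
 * Exceptional-condition filter: report pending out-of-band data via
 * NOTE_OOB and, for poll(2)/select(2) emulation, a disconnect via
 * __EV_HUP.
 */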
2344 int
2345 filt_soexcept(struct knote *kn, long hint)
2346 {
2347 	struct socket *so = kn->kn_fp->f_data;
2348 	int rv = 0;
2349 
2350 	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
2351 	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
2352 		soassertlocked_readonly(so);
2353 
2354 #ifdef SOCKET_SPLICE
2355 	if (isspliced(so)) {
2356 		rv = 0;
2357 	} else
2358 #endif /* SOCKET_SPLICE */
2359 	if (kn->kn_sfflags & NOTE_OOB) {
2360 		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
2361 			kn->kn_fflags |= NOTE_OOB;
2362 			kn->kn_data -= so->so_oobmark;
2363 			rv = 1;
2364 		}
2365 	}
2366 
2367 	if (kn->kn_flags & __EV_POLL) {
2368 		if (so->so_state & SS_ISDISCONNECTED) {
2369 			kn->kn_flags |= __EV_HUP;
2370 			rv = 1;
2371 		}
2372 	}
2373 
2374 	return rv;
2375 }
2376 
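/*
 * f_modify and f_process wrappers: run the generic knote operation
 * with the socket and the relevant sockbuf locked, so the filter
 * callbacks above can rely on a stable socket state.
 */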
2377 int
2378 filt_sowmodify(struct kevent *kev, struct knote *kn)
2379 {
2380 	struct socket *so = kn->kn_fp->f_data;
2381 	int rv;
2382 
2383 	sofilt_lock(so, &so->so_snd);
2384 	rv = knote_modify(kev, kn);
2385 	sofilt_unlock(so, &so->so_snd);
2386 
2387 	return (rv);
2388 }
2389 
2390 int
2391 filt_sowprocess(struct knote *kn, struct kevent *kev)
2392 {
2393 	struct socket *so = kn->kn_fp->f_data;
2394 	int rv;
2395 
2396 	sofilt_lock(so, &so->so_snd);
2397 	rv = knote_process(kn, kev);
2398 	sofilt_unlock(so, &so->so_snd);
2399 
2400 	return (rv);
2401 }
2402 
2403 int
2404 filt_sormodify(struct kevent *kev, struct knote *kn)
2405 {
2406 	struct socket *so = kn->kn_fp->f_data;
2407 	int rv;
2408 
2409 	sofilt_lock(so, &so->so_rcv);
2410 	rv = knote_modify(kev, kn);
2411 	sofilt_unlock(so, &so->so_rcv);
2412 
2413 	return (rv);
2414 }
2415 
2416 int
2417 filt_sorprocess(struct knote *kn, struct kevent *kev)
2418 {
2419 	struct socket *so = kn->kn_fp->f_data;
2420 	int rv;
2421 
2422 	sofilt_lock(so, &so->so_rcv);
2423 	rv = knote_process(kn, kev);
2424 	sofilt_unlock(so, &so->so_rcv);
2425 
2426 	return (rv);
2427 }
2428 
2429 #ifdef DDB
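/* Pretty-printers used from ddb(4) to dump a socket and its buffers. */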
2430 void
2431 sobuf_print(struct sockbuf *,
2432     int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));
2433 
2434 void
2435 sobuf_print(struct sockbuf *sb,
2436     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2437 {
2438 	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
2439 	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
2440 	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
2441 	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
2442 	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
2443 	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
2444 	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
2445 	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
2446 	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
2447 	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
2448 	(*pr)("\tsb_sel: ...\n");
2449 	(*pr)("\tsb_flags: %04x\n", sb->sb_flags);
2450 	(*pr)("\tsb_state: %04x\n", sb->sb_state);
2451 	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
2452 }
2453 
2454 void
2455 so_print(void *v,
2456     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2457 {
2458 	struct socket *so = v;
2459 
2460 	(*pr)("socket %p\n", so);
2461 	(*pr)("so_type: %i\n", so->so_type);
2462 	(*pr)("so_options: 0x%04x\n", so->so_options);	/* XXX should print with %b */
2463 	(*pr)("so_linger: %i\n", so->so_linger);
2464 	(*pr)("so_state: 0x%04x\n", so->so_state);
2465 	(*pr)("so_pcb: %p\n", so->so_pcb);
2466 	(*pr)("so_proto: %p\n", so->so_proto);
2467 	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);
2468 
2469 	(*pr)("so_head: %p\n", so->so_head);
2470 	(*pr)("so_onq: %p\n", so->so_onq);
2471 	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
2472 	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
2473 	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
2474 	(*pr)("so_q0len: %i\n", so->so_q0len);
2475 	(*pr)("so_qlen: %i\n", so->so_qlen);
2476 	(*pr)("so_qlimit: %i\n", so->so_qlimit);
2477 	(*pr)("so_timeo: %i\n", so->so_timeo);
2478 	(*pr)("so_oobmark: %lu\n", so->so_oobmark);
2479 
2480 	(*pr)("so_sp: %p\n", so->so_sp);
2481 	if (so->so_sp != NULL) {
2482 		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
2483 		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
2484 		(*pr)("\tssp_len: %lld\n",
2485 		    (unsigned long long)so->so_sp->ssp_len);
2486 		(*pr)("\tssp_max: %lld\n",
2487 		    (unsigned long long)so->so_sp->ssp_max);
2488 		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
2489 		    so->so_sp->ssp_idletv.tv_usec);
2490 		(*pr)("\tssp_idleto: %spending (@%i)\n",
2491 		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
2492 		    so->so_sp->ssp_idleto.to_time);
2493 	}
2494 
2495 	(*pr)("so_rcv:\n");
2496 	sobuf_print(&so->so_rcv, pr);
2497 	(*pr)("so_snd:\n");
2498 	sobuf_print(&so->so_snd, pr);
2499 
2500 	(*pr)("so_upcall: %p so_upcallarg: %p\n",
2501 	    so->so_upcall, so->so_upcallarg);
2502 
2503 	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
2504 	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
2505 	(*pr)("so_cpid: %d\n", so->so_cpid);
2506 }
2507 #endif
2508