1 /*	$OpenBSD: uipc_socket.c,v 1.364 2025/01/23 10:44:13 bluhm Exp $	*/
2 /*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1990, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 #include <sys/file.h>
39 #include <sys/filedesc.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/domain.h>
43 #include <sys/event.h>
44 #include <sys/protosw.h>
45 #include <sys/socket.h>
46 #include <sys/unpcb.h>
47 #include <sys/socketvar.h>
48 #include <sys/signalvar.h>
49 #include <sys/pool.h>
50 #include <sys/atomic.h>
51 #include <sys/rwlock.h>
52 #include <sys/time.h>
53 #include <sys/refcnt.h>
54 
55 #ifdef DDB
56 #include <machine/db_machdep.h>
57 #endif
58 
59 void	sbsync(struct sockbuf *, struct mbuf *);
60 
61 int	sosplice(struct socket *, int, off_t, struct timeval *);
62 void	sounsplice(struct socket *, struct socket *, int);
63 void	soidle(void *);
64 void	sotask(void *);
65 int	somove(struct socket *, int);
66 void	sorflush(struct socket *);
67 
68 void	filt_sordetach(struct knote *kn);
69 int	filt_soread(struct knote *kn, long hint);
70 void	filt_sowdetach(struct knote *kn);
71 int	filt_sowrite(struct knote *kn, long hint);
72 int	filt_soexcept(struct knote *kn, long hint);
73 
74 int	filt_sowmodify(struct kevent *kev, struct knote *kn);
75 int	filt_sowprocess(struct knote *kn, struct kevent *kev);
76 
77 int	filt_sormodify(struct kevent *kev, struct knote *kn);
78 int	filt_sorprocess(struct knote *kn, struct kevent *kev);
79 
80 const struct filterops soread_filtops = {
81 	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
82 	.f_attach	= NULL,
83 	.f_detach	= filt_sordetach,
84 	.f_event	= filt_soread,
85 	.f_modify	= filt_sormodify,
86 	.f_process	= filt_sorprocess,
87 };
88 
89 const struct filterops sowrite_filtops = {
90 	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
91 	.f_attach	= NULL,
92 	.f_detach	= filt_sowdetach,
93 	.f_event	= filt_sowrite,
94 	.f_modify	= filt_sowmodify,
95 	.f_process	= filt_sowprocess,
96 };
97 
98 const struct filterops soexcept_filtops = {
99 	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
100 	.f_attach	= NULL,
101 	.f_detach	= filt_sordetach,
102 	.f_event	= filt_soexcept,
103 	.f_modify	= filt_sormodify,
104 	.f_process	= filt_sorprocess,
105 };
106 
107 #ifndef SOMINCONN
108 #define SOMINCONN 80
109 #endif /* SOMINCONN */
110 
111 int	somaxconn = SOMAXCONN;
112 int	sominconn = SOMINCONN;
113 
114 struct pool socket_pool;
115 #ifdef SOCKET_SPLICE
116 struct pool sosplice_pool;
117 struct taskq *sosplice_taskq;
118 struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
119 #endif
120 
121 void
122 soinit(void)
123 {
124 	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
125 	    "sockpl", NULL);
126 #ifdef SOCKET_SPLICE
127 	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
128 	    "sosppl", NULL);
129 #endif
130 }
131 
132 struct socket *
133 soalloc(const struct protosw *prp, int wait)
134 {
135 	const struct domain *dp = prp->pr_domain;
136 	const char *dom_name = dp->dom_name;
137 	struct socket *so;
138 
139 	so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
140 	    PR_ZERO);
141 	if (so == NULL)
142 		return (NULL);
143 
144 #ifdef WITNESS
145 	/*
146 	 * XXX: Make WITNESS happy. AF_INET and AF_INET6 sockets could be
147 	 * spliced together.
148 	 */
149 	switch (dp->dom_family) {
150 	case AF_INET:
151 	case AF_INET6:
152 		dom_name = "inet46";
153 		break;
154 	}
155 #endif
156 
157 	refcnt_init_trace(&so->so_refcnt, DT_REFCNT_IDX_SOCKET);
158 	rw_init_flags(&so->so_lock, dom_name, RWL_DUPOK);
159 	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
160 	rw_init(&so->so_snd.sb_lock, "sbufsnd");
161 	mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0);
162 	mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", 0);
163 	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
164 	klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
165 	sigio_init(&so->so_sigio);
166 	TAILQ_INIT(&so->so_q0);
167 	TAILQ_INIT(&so->so_q);
168 
169 	return (so);
170 }
171 
172 /*
173  * Socket operation routines.
174  * These routines are called by the routines in
175  * sys_socket.c or from a system process, and
176  * implement the semantics of socket operations by
177  * switching out to the protocol specific routines.
178  */
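/*
 * Editor's note: a minimal sketch, not part of the original file, of how
 * an in-kernel consumer drives these routines.  It assumes `nam' holds a
 * struct sockaddr in an mbuf and omits the wait for the connection to
 * complete.
 */
#if 0
int
example_kernel_connect(struct mbuf *nam, struct socket **ret)
{
	struct socket *so;
	int error;

	/* Create a TCP socket; protocol 0 selects the default for the type. */
	error = socreate(AF_INET, &so, SOCK_STREAM, 0);
	if (error)
		return (error);
	/* soconnect() asserts that the caller holds the socket lock. */
	solock(so);
	error = soconnect(so, nam);
	sounlock(so);
	if (error) {
		soclose(so, MSG_DONTWAIT);
		return (error);
	}
	*ret = so;
	return (0);
}
#endif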
179 int
180 socreate(int dom, struct socket **aso, int type, int proto)
181 {
182 	struct proc *p = curproc;		/* XXX */
183 	const struct protosw *prp;
184 	struct socket *so;
185 	int error;
186 
187 	if (proto)
188 		prp = pffindproto(dom, proto, type);
189 	else
190 		prp = pffindtype(dom, type);
191 	if (prp == NULL || prp->pr_usrreqs == NULL)
192 		return (EPROTONOSUPPORT);
193 	if (prp->pr_type != type)
194 		return (EPROTOTYPE);
195 	so = soalloc(prp, M_WAIT);
196 	so->so_type = type;
197 	if (suser(p) == 0)
198 		so->so_state = SS_PRIV;
199 	so->so_ruid = p->p_ucred->cr_ruid;
200 	so->so_euid = p->p_ucred->cr_uid;
201 	so->so_rgid = p->p_ucred->cr_rgid;
202 	so->so_egid = p->p_ucred->cr_gid;
203 	so->so_cpid = p->p_p->ps_pid;
204 	so->so_proto = prp;
205 	so->so_snd.sb_timeo_nsecs = INFSLP;
206 	so->so_rcv.sb_timeo_nsecs = INFSLP;
207 
208 	solock(so);
209 	error = pru_attach(so, proto, M_WAIT);
210 	if (error) {
211 		so->so_state |= SS_NOFDREF;
212 		/* sofree() calls sounlock(). */
213 		sofree(so, 0);
214 		return (error);
215 	}
216 	sounlock(so);
217 	*aso = so;
218 	return (0);
219 }
220 
221 int
222 sobind(struct socket *so, struct mbuf *nam, struct proc *p)
223 {
224 	soassertlocked(so);
225 	return pru_bind(so, nam, p);
226 }
227 
228 int
229 solisten(struct socket *so, int backlog)
230 {
231 	int somaxconn_local = atomic_load_int(&somaxconn);
232 	int sominconn_local = atomic_load_int(&sominconn);
233 	int error;
234 
235 	switch (so->so_type) {
236 	case SOCK_STREAM:
237 	case SOCK_SEQPACKET:
238 		break;
239 	default:
240 		return (EOPNOTSUPP);
241 	}
242 
243 	soassertlocked(so);
244 
245 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
246 		return (EINVAL);
247 #ifdef SOCKET_SPLICE
248 	if (isspliced(so) || issplicedback(so))
249 		return (EOPNOTSUPP);
250 #endif /* SOCKET_SPLICE */
251 	error = pru_listen(so);
252 	if (error)
253 		return (error);
254 	if (TAILQ_FIRST(&so->so_q) == NULL)
255 		so->so_options |= SO_ACCEPTCONN;
256 	if (backlog < 0 || backlog > somaxconn_local)
257 		backlog = somaxconn_local;
258 	if (backlog < sominconn_local)
259 		backlog = sominconn_local;
260 	so->so_qlimit = backlog;
261 	return (0);
262 }
263 
264 void
265 sorele(struct socket *so)
266 {
267 	if (refcnt_rele(&so->so_refcnt) == 0)
268 		return;
269 
270 	sigio_free(&so->so_sigio);
271 	klist_free(&so->so_rcv.sb_klist);
272 	klist_free(&so->so_snd.sb_klist);
273 
274 	mtx_enter(&so->so_snd.sb_mtx);
275 	sbrelease(so, &so->so_snd);
276 	mtx_leave(&so->so_snd.sb_mtx);
277 
278 	if (so->so_proto->pr_flags & PR_RIGHTS &&
279 	    so->so_proto->pr_domain->dom_dispose)
280 		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
281 	m_purge(so->so_rcv.sb_mb);
282 
283 #ifdef SOCKET_SPLICE
284 	if (so->so_sp)
285 		pool_put(&sosplice_pool, so->so_sp);
286 #endif
287 	pool_put(&socket_pool, so);
288 }
289 
290 #define SOSP_FREEING_READ	1
291 #define SOSP_FREEING_WRITE	2
292 void
293 sofree(struct socket *so, int keep_lock)
294 {
295 	int persocket = solock_persocket(so);
296 
297 	soassertlocked(so);
298 
299 	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
300 		if (!keep_lock)
301 			sounlock(so);
302 		return;
303 	}
304 	if (so->so_head) {
305 		struct socket *head = so->so_head;
306 
307 		/*
308 		 * We must not decommission a socket that's on the accept(2)
309 		 * queue.  If we do, then accept(2) may hang after select(2)
310 		 * indicated that the listening socket was ready.
311 		 */
312 		if (so->so_onq == &head->so_q) {
313 			if (!keep_lock)
314 				sounlock(so);
315 			return;
316 		}
317 
318 		if (persocket) {
319 			soref(head);
320 			sounlock(so);
321 			solock(head);
322 			solock(so);
323 
324 			if (so->so_onq != &head->so_q0) {
325 				sounlock(so);
326 				sounlock(head);
327 				sorele(head);
328 				return;
329 			}
330 		}
331 
332 		soqremque(so, 0);
333 
334 		if (persocket) {
335 			sounlock(head);
336 			sorele(head);
337 		}
338 	}
339 
340 	if (!keep_lock)
341 		sounlock(so);
342 	sorele(so);
343 }
344 
345 static inline uint64_t
346 solinger_nsec(struct socket *so)
347 {
348 	if (so->so_linger == 0)
349 		return INFSLP;
350 
351 	return SEC_TO_NSEC(so->so_linger);
352 }
353 
354 /*
355  * Close a socket on last file table reference removal.
356  * Initiate disconnect if connected.
357  * Free socket when disconnect complete.
358  */
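/*
 * Editor's note: a userland sketch, not part of the original file,
 * illustrating the SO_LINGER handling below: with l_onoff set and a
 * nonzero l_linger, close(2) blocks until the disconnect completes or
 * the linger timeout expires (the "netcls" sleep).
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

void
example_linger_close(int s)
{
	struct linger l = { .l_onoff = 1, .l_linger = 5 };

	if (setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) == 0)
		close(s);	/* may block for up to 5 seconds */
}
#endif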
359 int
360 soclose(struct socket *so, int flags)
361 {
362 	struct socket *so2;
363 	int error = 0;
364 
365 	solock(so);
366 	/* Revoke async IO early. There is a final revocation in sofree(). */
367 	sigio_free(&so->so_sigio);
368 	if (so->so_state & SS_ISCONNECTED) {
369 		if (so->so_pcb == NULL)
370 			goto discard;
371 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
372 			error = sodisconnect(so);
373 			if (error)
374 				goto drop;
375 		}
376 		if (so->so_options & SO_LINGER) {
377 			if ((so->so_state & SS_ISDISCONNECTING) &&
378 			    (flags & MSG_DONTWAIT))
379 				goto drop;
380 			while (so->so_state & SS_ISCONNECTED) {
381 				error = sosleep_nsec(so, &so->so_timeo,
382 				    PSOCK | PCATCH, "netcls",
383 				    solinger_nsec(so));
384 				if (error)
385 					break;
386 			}
387 		}
388 	}
389 drop:
390 	if (so->so_pcb) {
391 		int error2;
392 		error2 = pru_detach(so);
393 		if (error == 0)
394 			error = error2;
395 	}
396 	if (so->so_options & SO_ACCEPTCONN) {
397 		int persocket = solock_persocket(so);
398 
399 		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
400 			soref(so2);
401 			solock(so2);
402 			(void) soqremque(so2, 0);
403 			sounlock(so);
404 			soabort(so2);
405 			sounlock(so2);
406 			sorele(so2);
407 			solock(so);
408 		}
409 		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
410 			soref(so2);
411 			solock_nonet(so2);
412 			(void) soqremque(so2, 1);
413 			if (persocket)
414 				sounlock(so);
415 			soabort(so2);
416 			sounlock_nonet(so2);
417 			sorele(so2);
418 			if (persocket)
419 				solock(so);
420 		}
421 	}
422 discard:
423 #ifdef SOCKET_SPLICE
424 	if (so->so_sp) {
425 		struct socket *soback;
426 
427 		sounlock(so);
428 		mtx_enter(&so->so_snd.sb_mtx);
429 		/*
430 		 * Concurrent sounsplice() locks `sb_mtx' mutexes on
431 		 * both `so_snd' and `so_rcv' before unsplicing the sockets.
432 		 */
433 		if ((soback = so->so_sp->ssp_soback) == NULL) {
434 			mtx_leave(&so->so_snd.sb_mtx);
435 			goto notsplicedback;
436 		}
437 		soref(soback);
438 		mtx_leave(&so->so_snd.sb_mtx);
439 
440 		/*
441 		 * `so' can only be unspliced, and never spliced again.
442 		 * Thus if the issplicedback(so) check is positive, the
443 		 * socket is still spliced and `ssp_soback' points to the
444 		 * same socket as `soback'.
445 		 */
446 		sblock(&soback->so_rcv, SBL_WAIT | SBL_NOINTR);
447 		if (issplicedback(so)) {
448 			int freeing = SOSP_FREEING_WRITE;
449 
450 			if (so->so_sp->ssp_soback == so)
451 				freeing |= SOSP_FREEING_READ;
452 			sounsplice(so->so_sp->ssp_soback, so, freeing);
453 		}
454 		sbunlock(&soback->so_rcv);
455 		sorele(soback);
456 
457 notsplicedback:
458 		sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
459 		if (isspliced(so)) {
460 			struct socket *sosp;
461 			int freeing = SOSP_FREEING_READ;
462 
463 			if (so == so->so_sp->ssp_socket)
464 				freeing |= SOSP_FREEING_WRITE;
465 			sosp = soref(so->so_sp->ssp_socket);
466 			sounsplice(so, so->so_sp->ssp_socket, freeing);
467 			sorele(sosp);
468 		}
469 		sbunlock(&so->so_rcv);
470 
471 		timeout_del_barrier(&so->so_sp->ssp_idleto);
472 		task_del(sosplice_taskq, &so->so_sp->ssp_task);
473 		taskq_barrier(sosplice_taskq);
474 
475 		solock(so);
476 	}
477 #endif /* SOCKET_SPLICE */
478 
479 	if (so->so_state & SS_NOFDREF)
480 		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
481 	so->so_state |= SS_NOFDREF;
482 
483 	/* sofree() calls sounlock(). */
484 	sofree(so, 0);
485 	return (error);
486 }
487 
488 void
489 soabort(struct socket *so)
490 {
491 	soassertlocked(so);
492 	pru_abort(so);
493 }
494 
495 int
496 soaccept(struct socket *so, struct mbuf *nam)
497 {
498 	int error = 0;
499 
500 	soassertlocked(so);
501 
502 	if ((so->so_state & SS_NOFDREF) == 0)
503 		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
504 	so->so_state &= ~SS_NOFDREF;
505 	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
506 	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
507 		error = pru_accept(so, nam);
508 	else
509 		error = ECONNABORTED;
510 	return (error);
511 }
512 
513 int
514 soconnect(struct socket *so, struct mbuf *nam)
515 {
516 	int error;
517 
518 	soassertlocked(so);
519 
520 	if (so->so_options & SO_ACCEPTCONN)
521 		return (EOPNOTSUPP);
522 	/*
523 	 * If protocol is connection-based, can only connect once.
524 	 * Otherwise, if connected, try to disconnect first.
525 	 * This allows user to disconnect by connecting to, e.g.,
526 	 * a null address.
527 	 */
528 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
529 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
530 	    (error = sodisconnect(so))))
531 		error = EISCONN;
532 	else
533 		error = pru_connect(so, nam);
534 	return (error);
535 }
536 
537 int
538 soconnect2(struct socket *so1, struct socket *so2)
539 {
540 	int persocket, error;
541 
542 	if ((persocket = solock_persocket(so1)))
543 		solock_pair(so1, so2);
544 	else
545 		solock(so1);
546 
547 	error = pru_connect2(so1, so2);
548 
549 	if (persocket)
550 		sounlock(so2);
551 	sounlock(so1);
552 	return (error);
553 }
554 
555 int
556 sodisconnect(struct socket *so)
557 {
558 	int error;
559 
560 	soassertlocked(so);
561 
562 	if ((so->so_state & SS_ISCONNECTED) == 0)
563 		return (ENOTCONN);
564 	if (so->so_state & SS_ISDISCONNECTING)
565 		return (EALREADY);
566 	error = pru_disconnect(so);
567 	return (error);
568 }
569 
570 int m_getuio(struct mbuf **, int, long, struct uio *);
571 
572 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
573 /*
574  * Send on a socket.
575  * If send must go all at once and message is larger than
576  * send buffering, then hard error.
577  * Lock against other senders.
578  * If must go all at once and not enough room now, then
579  * inform user that this would block and do nothing.
580  * Otherwise, if nonblocking, send as much as possible.
581  * The data to be sent is described by "uio" if nonzero,
582  * otherwise by the mbuf chain "top" (which must be null
583  * if uio is not).  Data provided in mbuf chain must be small
584  * enough to send all at once.
585  *
586  * Returns nonzero on error, timeout or signal; callers
587  * must check for short counts if EINTR/ERESTART are returned.
588  * Data and control buffers are freed on return.
589  */
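/*
 * Editor's note: a minimal sketch, not part of the original file, of an
 * in-kernel caller describing one buffer with a uio and handing it to
 * sosend() as a plain stream write (no address, control or flags).
 */
#if 0
int
example_sosend(struct socket *so, void *buf, size_t len)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct uio uio = {
		.uio_iov = &iov,
		.uio_iovcnt = 1,
		.uio_offset = 0,
		.uio_resid = len,
		.uio_segflg = UIO_SYSSPACE,
		.uio_rw = UIO_WRITE,
		.uio_procp = NULL,
	};

	return (sosend(so, NULL, &uio, NULL, NULL, 0));
}
#endif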
590 int
591 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
592     struct mbuf *control, int flags)
593 {
594 	long space, clen = 0;
595 	size_t resid;
596 	int error;
597 	int atomic = sosendallatonce(so) || top;
598 
599 	if (uio)
600 		resid = uio->uio_resid;
601 	else
602 		resid = top->m_pkthdr.len;
603 	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
604 	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
605 		m_freem(top);
606 		m_freem(control);
607 		return (EINVAL);
608 	}
609 	if (uio && uio->uio_procp)
610 		uio->uio_procp->p_ru.ru_msgsnd++;
611 	if (control) {
612 		/*
613 		 * In theory clen should be unsigned (since control->m_len is).
614 		 * However, space must be signed, as it might be less than 0
615 		 * if we over-committed, and we must use a signed comparison
616 		 * of space and clen.
617 		 */
618 		clen = control->m_len;
619 		/* reserve extra space for AF_UNIX's internalize */
620 		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
621 		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
622 		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
623 			clen = CMSG_SPACE(
624 			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
625 			    (sizeof(struct fdpass) / sizeof(int)));
626 	}
627 
628 #define	snderr(errno)	{ error = errno; goto release; }
629 
630 restart:
631 	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
632 		goto out;
633 	mtx_enter(&so->so_snd.sb_mtx);
634 	so->so_snd.sb_state |= SS_ISSENDING;
635 	do {
636 		if (so->so_snd.sb_state & SS_CANTSENDMORE)
637 			snderr(EPIPE);
638 		if ((error = READ_ONCE(so->so_error))) {
639 			so->so_error = 0;
640 			snderr(error);
641 		}
642 		if ((so->so_state & SS_ISCONNECTED) == 0) {
643 			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
644 				if (!(resid == 0 && clen != 0))
645 					snderr(ENOTCONN);
646 			} else if (addr == NULL)
647 				snderr(EDESTADDRREQ);
648 		}
649 		space = sbspace_locked(so, &so->so_snd);
650 		if (flags & MSG_OOB)
651 			space += 1024;
652 		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
653 			if (atomic && resid > so->so_snd.sb_hiwat)
654 				snderr(EMSGSIZE);
655 		} else {
656 			if (clen > so->so_snd.sb_hiwat ||
657 			    (atomic && resid > so->so_snd.sb_hiwat - clen))
658 				snderr(EMSGSIZE);
659 		}
660 		if (space < clen ||
661 		    (space - clen < resid &&
662 		    (atomic || space < so->so_snd.sb_lowat))) {
663 			if (flags & MSG_DONTWAIT)
664 				snderr(EWOULDBLOCK);
665 			sbunlock(&so->so_snd);
666 			error = sbwait(&so->so_snd);
667 			so->so_snd.sb_state &= ~SS_ISSENDING;
668 			mtx_leave(&so->so_snd.sb_mtx);
669 			if (error)
670 				goto out;
671 			goto restart;
672 		}
673 		space -= clen;
674 		do {
675 			if (uio == NULL) {
676 				/*
677 				 * Data is prepackaged in "top".
678 				 */
679 				resid = 0;
680 				if (flags & MSG_EOR)
681 					top->m_flags |= M_EOR;
682 			} else {
683 				mtx_leave(&so->so_snd.sb_mtx);
684 				error = m_getuio(&top, atomic, space, uio);
685 				mtx_enter(&so->so_snd.sb_mtx);
686 				if (error)
687 					goto release;
688 				space -= top->m_pkthdr.len;
689 				resid = uio->uio_resid;
690 				if (flags & MSG_EOR)
691 					top->m_flags |= M_EOR;
692 			}
693 			if (resid == 0)
694 				so->so_snd.sb_state &= ~SS_ISSENDING;
695 			if (top && so->so_options & SO_ZEROIZE)
696 				top->m_flags |= M_ZEROIZE;
697 			mtx_leave(&so->so_snd.sb_mtx);
698 			solock_shared(so);
699 			if (flags & MSG_OOB)
700 				error = pru_sendoob(so, top, addr, control);
701 			else
702 				error = pru_send(so, top, addr, control);
703 			sounlock_shared(so);
704 			mtx_enter(&so->so_snd.sb_mtx);
705 			clen = 0;
706 			control = NULL;
707 			top = NULL;
708 			if (error)
709 				goto release;
710 		} while (resid && space > 0);
711 	} while (resid);
712 
713 release:
714 	so->so_snd.sb_state &= ~SS_ISSENDING;
715 	mtx_leave(&so->so_snd.sb_mtx);
716 	sbunlock(&so->so_snd);
717 out:
718 	m_freem(top);
719 	m_freem(control);
720 	return (error);
721 }
722 
723 int
724 m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
725 {
726 	struct mbuf *m, *top = NULL;
727 	struct mbuf **nextp = &top;
728 	u_long len, mlen;
729 	size_t resid = uio->uio_resid;
730 	int error;
731 
732 	do {
733 		if (top == NULL) {
734 			MGETHDR(m, M_WAIT, MT_DATA);
735 			mlen = MHLEN;
736 		} else {
737 			MGET(m, M_WAIT, MT_DATA);
738 			mlen = MLEN;
739 		}
740 		/* chain mbufs together */
741 		*nextp = m;
742 		nextp = &m->m_next;
743 
744 		resid = ulmin(resid, space);
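		/*
		 * Editor's note: try the largest useful cluster first, fall
		 * back to a regular cluster, and finally to the mbuf's
		 * internal storage (nopages) if no cluster is available.
		 */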
745 		if (resid >= MINCLSIZE) {
746 			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
747 			if ((m->m_flags & M_EXT) == 0)
748 				MCLGETL(m, M_NOWAIT, MCLBYTES);
749 			if ((m->m_flags & M_EXT) == 0)
750 				goto nopages;
751 			mlen = m->m_ext.ext_size;
752 			len = ulmin(mlen, resid);
753 			/*
754 			 * For datagram protocols, leave room
755 			 * for protocol headers in first mbuf.
756 			 */
757 			if (atomic && m == top && len < mlen - max_hdr)
758 				m->m_data += max_hdr;
759 		} else {
760 nopages:
761 			len = ulmin(mlen, resid);
762 			/*
763 			 * For datagram protocols, leave room
764 			 * for protocol headers in first mbuf.
765 			 */
766 			if (atomic && m == top && len < mlen - max_hdr)
767 				m_align(m, len);
768 		}
769 
770 		error = uiomove(mtod(m, caddr_t), len, uio);
771 		if (error) {
772 			m_freem(top);
773 			return (error);
774 		}
775 
776 		/* adjust counters */
777 		resid = uio->uio_resid;
778 		space -= len;
779 		m->m_len = len;
780 		top->m_pkthdr.len += len;
781 
782 		/* Is there more space and more data? */
783 	} while (space > 0 && resid > 0);
784 
785 	*mp = top;
786 	return 0;
787 }
788 
789 /*
790  * Following replacement or removal of the first mbuf on the first
791  * mbuf chain of a socket buffer, push necessary state changes back
792  * into the socket buffer so that other consumers see the values
793  * consistently.  'nextrecord' is the caller's locally stored value of
794  * the original value of sb->sb_mb->m_nextpkt which must be restored
795  * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
796  */
797 void
798 sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
799 {
800 
801 	/*
802 	 * First, update for the new value of nextrecord.  If necessary,
803 	 * make it the first record.
804 	 */
805 	if (sb->sb_mb != NULL)
806 		sb->sb_mb->m_nextpkt = nextrecord;
807 	else
808 		sb->sb_mb = nextrecord;
809 
810 	/*
811 	 * Now update any dependent socket buffer fields to reflect
812 	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
813 	 * the addition of a second clause that takes care of the
814 	 * case where sb_mb has been updated, but remains the last
815 	 * record.
816 	 */
817 	if (sb->sb_mb == NULL) {
818 		sb->sb_mbtail = NULL;
819 		sb->sb_lastrecord = NULL;
820 	} else if (sb->sb_mb->m_nextpkt == NULL)
821 		sb->sb_lastrecord = sb->sb_mb;
822 }
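/*
 * Editor's note, an illustration not in the original file: with records
 * r1 -> r2 chained through m_nextpkt and the caller caching
 * nextrecord = r2, freeing or replacing the lead mbuf of r1 leaves
 * sb_mb pointing at r1's remainder (or at NULL).  sbsync() then restores
 * sb_mb->m_nextpkt = r2, or promotes r2 to sb_mb when r1 is exhausted,
 * and fixes up sb_mbtail/sb_lastrecord.
 */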
823 
824 /*
825  * Implement receive operations on a socket.
826  * We depend on the way that records are added to the sockbuf
827  * by sbappend*.  In particular, each record (mbufs linked through m_next)
828  * must begin with an address if the protocol so specifies,
829  * followed by an optional mbuf or mbufs containing ancillary data,
830  * and then zero or more mbufs of data.
831  * In order to avoid blocking the network stack for the entire time here,
832  * we release the solock() while doing the actual copy to user space.
833  * Although the sockbuf is locked, new data may still be appended,
834  * and thus we must maintain consistency of the sockbuf during that time.
835  *
836  * The caller may receive the data as a single mbuf chain by supplying
837  * an mbuf **mp0 for use in returning the chain.  The uio is then used
838  * only for the count in uio_resid.
839  */
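/*
 * Editor's note, an illustration not in the original file: a record in
 * the receive buffer of a PR_ADDR protocol such as UDP is laid out as
 *
 *	MT_SONAME -> [MT_CONTROL ...] -> MT_DATA -> MT_DATA -> ...
 *
 * linked through m_next, while records are chained through m_nextpkt.
 */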
840 int
841 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
842     struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
843     socklen_t controllen)
844 {
845 	struct mbuf *m, **mp;
846 	struct mbuf *cm;
847 	u_long len, offset, moff;
848 	int flags, error, error2, type, uio_error = 0;
849 	const struct protosw *pr = so->so_proto;
850 	struct mbuf *nextrecord;
851 	size_t resid, orig_resid = uio->uio_resid;
852 
853 	mp = mp0;
854 	if (paddr)
855 		*paddr = NULL;
856 	if (controlp)
857 		*controlp = NULL;
858 	if (flagsp)
859 		flags = *flagsp &~ MSG_EOR;
860 	else
861 		flags = 0;
862 	if (flags & MSG_OOB) {
863 		m = m_get(M_WAIT, MT_DATA);
864 		solock_shared(so);
865 		error = pru_rcvoob(so, m, flags & MSG_PEEK);
866 		sounlock_shared(so);
867 		if (error)
868 			goto bad;
869 		do {
870 			error = uiomove(mtod(m, caddr_t),
871 			    ulmin(uio->uio_resid, m->m_len), uio);
872 			m = m_free(m);
873 		} while (uio->uio_resid && error == 0 && m);
874 bad:
875 		m_freem(m);
876 		return (error);
877 	}
878 	if (mp)
879 		*mp = NULL;
880 
881 restart:
882 	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
883 		return (error);
884 	mtx_enter(&so->so_rcv.sb_mtx);
885 
886 	m = so->so_rcv.sb_mb;
887 #ifdef SOCKET_SPLICE
888 	if (isspliced(so))
889 		m = NULL;
890 #endif /* SOCKET_SPLICE */
891 	/*
892 	 * If we have less data than requested, block awaiting more
893 	 * (subject to any timeout) if:
894 	 *   1. the current count is less than the low water mark,
895 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
896 	 *	receive operation at once if we block (resid <= hiwat), or
897 	 *   3. MSG_DONTWAIT is not set.
898 	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
899 	 * we have to do the receive in sections, and thus risk returning
900 	 * a short count if a timeout or signal occurs after we start.
901 	 */
902 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
903 	    so->so_rcv.sb_cc < uio->uio_resid) &&
904 	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
905 	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
906 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
907 #ifdef DIAGNOSTIC
908 		if (m == NULL && so->so_rcv.sb_cc)
909 #ifdef SOCKET_SPLICE
910 		    if (!isspliced(so))
911 #endif /* SOCKET_SPLICE */
912 			panic("receive 1: so %p, so_type %d, sb_cc %lu",
913 			    so, so->so_type, so->so_rcv.sb_cc);
914 #endif
915 		if ((error2 = READ_ONCE(so->so_error))) {
916 			if (m)
917 				goto dontblock;
918 			error = error2;
919 			if ((flags & MSG_PEEK) == 0)
920 				so->so_error = 0;
921 			goto release;
922 		}
923 		if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
924 			if (m)
925 				goto dontblock;
926 			else if (so->so_rcv.sb_cc == 0)
927 				goto release;
928 		}
929 		for (; m; m = m->m_next)
930 			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
931 				m = so->so_rcv.sb_mb;
932 				goto dontblock;
933 			}
934 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
935 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
936 			error = ENOTCONN;
937 			goto release;
938 		}
939 		if (uio->uio_resid == 0 && controlp == NULL)
940 			goto release;
941 		if (flags & MSG_DONTWAIT) {
942 			error = EWOULDBLOCK;
943 			goto release;
944 		}
945 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
946 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
947 
948 		sbunlock(&so->so_rcv);
949 		error = sbwait(&so->so_rcv);
950 		mtx_leave(&so->so_rcv.sb_mtx);
951 		if (error)
952 			return (error);
953 		goto restart;
954 	}
955 dontblock:
956 	/*
957 	 * On entry here, m points to the first record of the socket buffer.
958 	 * From this point onward, we maintain 'nextrecord' as a cache of the
959 	 * pointer to the next record in the socket buffer.  We must keep the
960 	 * various socket buffer pointers and local stack versions of the
961 	 * pointers in sync, pushing out modifications before operations that
962 	 * may sleep, and re-reading them afterwards.
963 	 *
964 	 * Otherwise, we will race with the network stack appending new data
965 	 * or records onto the socket buffer by using inconsistent/stale
966 	 * versions of the field, possibly resulting in socket buffer
967 	 * corruption.
968 	 */
969 	if (uio->uio_procp)
970 		uio->uio_procp->p_ru.ru_msgrcv++;
971 	KASSERT(m == so->so_rcv.sb_mb);
972 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
973 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
974 	nextrecord = m->m_nextpkt;
975 	if (pr->pr_flags & PR_ADDR) {
976 #ifdef DIAGNOSTIC
977 		if (m->m_type != MT_SONAME)
978 			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
979 			    so, so->so_type, m, m->m_type);
980 #endif
981 		orig_resid = 0;
982 		if (flags & MSG_PEEK) {
983 			if (paddr)
984 				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
985 			m = m->m_next;
986 		} else {
987 			sbfree(so, &so->so_rcv, m);
988 			if (paddr) {
989 				*paddr = m;
990 				so->so_rcv.sb_mb = m->m_next;
991 				m->m_next = NULL;
992 				m = so->so_rcv.sb_mb;
993 			} else {
994 				so->so_rcv.sb_mb = m_free(m);
995 				m = so->so_rcv.sb_mb;
996 			}
997 			sbsync(&so->so_rcv, nextrecord);
998 		}
999 	}
1000 	while (m && m->m_type == MT_CONTROL && error == 0) {
1001 		int skip = 0;
1002 		if (flags & MSG_PEEK) {
1003 			if (mtod(m, struct cmsghdr *)->cmsg_type ==
1004 			    SCM_RIGHTS) {
1005 				/* don't leak internalized SCM_RIGHTS msgs */
1006 				skip = 1;
1007 			} else if (controlp)
1008 				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
1009 			m = m->m_next;
1010 		} else {
1011 			sbfree(so, &so->so_rcv, m);
1012 			so->so_rcv.sb_mb = m->m_next;
1013 			m->m_nextpkt = m->m_next = NULL;
1014 			cm = m;
1015 			m = so->so_rcv.sb_mb;
1016 			sbsync(&so->so_rcv, nextrecord);
1017 			if (controlp) {
1018 				if (pr->pr_domain->dom_externalize) {
1019 					mtx_leave(&so->so_rcv.sb_mtx);
1020 					error =
1021 					    (*pr->pr_domain->dom_externalize)
1022 					    (cm, controllen, flags);
1023 					mtx_enter(&so->so_rcv.sb_mtx);
1024 				}
1025 				*controlp = cm;
1026 			} else {
1027 				/*
1028 				 * Dispose of any SCM_RIGHTS message that went
1029 				 * through the read path rather than recv.
1030 				 */
1031 				if (pr->pr_domain->dom_dispose) {
1032 					mtx_leave(&so->so_rcv.sb_mtx);
1033 					pr->pr_domain->dom_dispose(cm);
1034 					mtx_enter(&so->so_rcv.sb_mtx);
1035 				}
1036 				m_free(cm);
1037 			}
1038 		}
1039 		if (m != NULL)
1040 			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1041 		else
1042 			nextrecord = so->so_rcv.sb_mb;
1043 		if (controlp && !skip)
1044 			controlp = &(*controlp)->m_next;
1045 		orig_resid = 0;
1046 	}
1047 
1048 	/* If m is non-NULL, we have some data to read. */
1049 	if (m) {
1050 		type = m->m_type;
1051 		if (type == MT_OOBDATA)
1052 			flags |= MSG_OOB;
1053 		if (m->m_flags & M_BCAST)
1054 			flags |= MSG_BCAST;
1055 		if (m->m_flags & M_MCAST)
1056 			flags |= MSG_MCAST;
1057 	}
1058 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
1059 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
1060 
1061 	moff = 0;
1062 	offset = 0;
1063 	while (m && uio->uio_resid > 0 && error == 0) {
1064 		if (m->m_type == MT_OOBDATA) {
1065 			if (type != MT_OOBDATA)
1066 				break;
1067 		} else if (type == MT_OOBDATA) {
1068 			break;
1069 		} else if (m->m_type == MT_CONTROL) {
1070 			/*
1071 			 * If there is more than one control message in the
1072 			 * stream, we do a short read.  The next one can be
1073 			 * received or disposed of by another system call.
1074 			 */
1075 			break;
1076 #ifdef DIAGNOSTIC
1077 		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
1078 			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
1079 			    so, so->so_type, m, m->m_type);
1080 #endif
1081 		}
1082 		so->so_rcv.sb_state &= ~SS_RCVATMARK;
1083 		len = uio->uio_resid;
1084 		if (so->so_oobmark && len > so->so_oobmark - offset)
1085 			len = so->so_oobmark - offset;
1086 		if (len > m->m_len - moff)
1087 			len = m->m_len - moff;
1088 		/*
1089 		 * If mp is set, just pass back the mbufs.
1090 		 * Otherwise copy them out via the uio, then free.
1091 		 * The sockbuf must be consistent here (sb_mb points to the
1092 		 * current mbuf, its m_nextpkt to the next record) when we
1093 		 * drop the mutex; we must note any additions to the sockbuf
1094 		 * when we reacquire it.
1095 		 */
1096 		if (mp == NULL && uio_error == 0) {
1097 			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
1098 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
1099 			resid = uio->uio_resid;
1100 			mtx_leave(&so->so_rcv.sb_mtx);
1101 			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
1102 			mtx_enter(&so->so_rcv.sb_mtx);
1103 			if (uio_error)
1104 				uio->uio_resid = resid - len;
1105 		} else
1106 			uio->uio_resid -= len;
1107 		if (len == m->m_len - moff) {
1108 			if (m->m_flags & M_EOR)
1109 				flags |= MSG_EOR;
1110 			if (flags & MSG_PEEK) {
1111 				m = m->m_next;
1112 				moff = 0;
1113 				orig_resid = 0;
1114 			} else {
1115 				nextrecord = m->m_nextpkt;
1116 				sbfree(so, &so->so_rcv, m);
1117 				if (mp) {
1118 					*mp = m;
1119 					mp = &m->m_next;
1120 					so->so_rcv.sb_mb = m = m->m_next;
1121 					*mp = NULL;
1122 				} else {
1123 					so->so_rcv.sb_mb = m_free(m);
1124 					m = so->so_rcv.sb_mb;
1125 				}
1126 				/*
1127 				 * If m != NULL, we also know that
1128 				 * so->so_rcv.sb_mb != NULL.
1129 				 */
1130 				KASSERT(so->so_rcv.sb_mb == m);
1131 				if (m) {
1132 					m->m_nextpkt = nextrecord;
1133 					if (nextrecord == NULL)
1134 						so->so_rcv.sb_lastrecord = m;
1135 				} else {
1136 					so->so_rcv.sb_mb = nextrecord;
1137 					SB_EMPTY_FIXUP(&so->so_rcv);
1138 				}
1139 				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
1140 				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
1141 			}
1142 		} else {
1143 			if (flags & MSG_PEEK) {
1144 				moff += len;
1145 				orig_resid = 0;
1146 			} else {
1147 				if (mp)
1148 					*mp = m_copym(m, 0, len, M_WAIT);
1149 				m->m_data += len;
1150 				m->m_len -= len;
1151 				so->so_rcv.sb_cc -= len;
1152 				so->so_rcv.sb_datacc -= len;
1153 			}
1154 		}
1155 		if (so->so_oobmark) {
1156 			if ((flags & MSG_PEEK) == 0) {
1157 				so->so_oobmark -= len;
1158 				if (so->so_oobmark == 0) {
1159 					so->so_rcv.sb_state |= SS_RCVATMARK;
1160 					break;
1161 				}
1162 			} else {
1163 				offset += len;
1164 				if (offset == so->so_oobmark)
1165 					break;
1166 			}
1167 		}
1168 		if (flags & MSG_EOR)
1169 			break;
1170 		/*
1171 		 * If the MSG_WAITALL flag is set (for non-atomic socket),
1172 		 * we must not quit until "uio->uio_resid == 0" or an error
1173 		 * termination.  If a signal/timeout occurs, return
1174 		 * with a short count but without error.
1175 		 * Keep sockbuf locked against other readers.
1176 		 */
1177 		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1178 		    !sosendallatonce(so) && !nextrecord) {
1179 			if (so->so_rcv.sb_state & SS_CANTRCVMORE ||
1180 			    so->so_error)
1181 				break;
1182 			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
1183 			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
1184 			if (sbwait(&so->so_rcv)) {
1185 				mtx_leave(&so->so_rcv.sb_mtx);
1186 				sbunlock(&so->so_rcv);
1187 				return (0);
1188 			}
1189 			if ((m = so->so_rcv.sb_mb) != NULL)
1190 				nextrecord = m->m_nextpkt;
1191 		}
1192 	}
1193 
1194 	if (m && pr->pr_flags & PR_ATOMIC) {
1195 		flags |= MSG_TRUNC;
1196 		if ((flags & MSG_PEEK) == 0)
1197 			(void) sbdroprecord(so, &so->so_rcv);
1198 	}
1199 	if ((flags & MSG_PEEK) == 0) {
1200 		if (m == NULL) {
1201 			/*
1202 			 * First part is an inline SB_EMPTY_FIXUP().  Second
1203 			 * part makes sure sb_lastrecord is up-to-date if
1204 			 * there is still data in the socket buffer.
1205 			 */
1206 			so->so_rcv.sb_mb = nextrecord;
1207 			if (so->so_rcv.sb_mb == NULL) {
1208 				so->so_rcv.sb_mbtail = NULL;
1209 				so->so_rcv.sb_lastrecord = NULL;
1210 			} else if (nextrecord->m_nextpkt == NULL)
1211 				so->so_rcv.sb_lastrecord = nextrecord;
1212 		}
1213 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
1214 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
1215 		if (pr->pr_flags & PR_WANTRCVD) {
1216 			mtx_leave(&so->so_rcv.sb_mtx);
1217 			solock_shared(so);
1218 			pru_rcvd(so);
1219 			sounlock_shared(so);
1220 			mtx_enter(&so->so_rcv.sb_mtx);
1221 		}
1222 	}
1223 	if (orig_resid == uio->uio_resid && orig_resid &&
1224 	    (flags & MSG_EOR) == 0 &&
1225 	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
1226 		mtx_leave(&so->so_rcv.sb_mtx);
1227 		sbunlock(&so->so_rcv);
1228 		goto restart;
1229 	}
1230 
1231 	if (uio_error)
1232 		error = uio_error;
1233 
1234 	if (flagsp)
1235 		*flagsp |= flags;
1236 release:
1237 	mtx_leave(&so->so_rcv.sb_mtx);
1238 	sbunlock(&so->so_rcv);
1239 	return (error);
1240 }
1241 
1242 int
1243 soshutdown(struct socket *so, int how)
1244 {
1245 	int error = 0;
1246 
1247 	switch (how) {
1248 	case SHUT_RD:
1249 		sorflush(so);
1250 		break;
1251 	case SHUT_RDWR:
1252 		sorflush(so);
1253 		/* FALLTHROUGH */
1254 	case SHUT_WR:
1255 		solock(so);
1256 		error = pru_shutdown(so);
1257 		sounlock(so);
1258 		break;
1259 	default:
1260 		error = EINVAL;
1261 		break;
1262 	}
1263 
1264 	return (error);
1265 }
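/*
 * Editor's note, a summary not in the original file: shutdown(2) maps
 * SHUT_RD to sorflush() (discard pending data, mark SS_CANTRCVMORE),
 * SHUT_WR to pru_shutdown() (e.g. send a FIN for TCP), and SHUT_RDWR
 * to both.
 */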
1266 
1267 void
1268 sorflush(struct socket *so)
1269 {
1270 	struct sockbuf *sb = &so->so_rcv;
1271 	struct mbuf *m;
1272 	const struct protosw *pr = so->so_proto;
1273 	int error;
1274 
1275 	error = sblock(sb, SBL_WAIT | SBL_NOINTR);
1276 	/* with SBL_WAIT and SBL_NOINTR sblock() must not fail */
1277 	KASSERT(error == 0);
1278 
1279 	solock_shared(so);
1280 	socantrcvmore(so);
1281 	mtx_enter(&sb->sb_mtx);
1282 	m = sb->sb_mb;
1283 	memset(&sb->sb_startzero, 0,
1284 	     (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
1285 	sb->sb_timeo_nsecs = INFSLP;
1286 	mtx_leave(&sb->sb_mtx);
1287 	sounlock_shared(so);
1288 	sbunlock(sb);
1289 
1290 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
1291 		(*pr->pr_domain->dom_dispose)(m);
1292 	m_purge(m);
1293 }
1294 
1295 #ifdef SOCKET_SPLICE
1296 
1297 #define so_splicelen	so_sp->ssp_len
1298 #define so_splicemax	so_sp->ssp_max
1299 #define so_idletv	so_sp->ssp_idletv
1300 #define so_idleto	so_sp->ssp_idleto
1301 #define so_splicetask	so_sp->ssp_task
1302 
1303 void
1304 sosplice_solock_pair(struct socket *so1, struct socket *so2)
1305 {
1306 	NET_LOCK_SHARED();
1307 
1308 	if (so1 == so2)
1309 		rw_enter_write(&so1->so_lock);
1310 	else if (so1 < so2) {
1311 		rw_enter_write(&so1->so_lock);
1312 		rw_enter_write(&so2->so_lock);
1313 	} else {
1314 		rw_enter_write(&so2->so_lock);
1315 		rw_enter_write(&so1->so_lock);
1316 	}
1317 }
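/*
 * Editor's note: locking the pair in ascending address order here, and
 * unlocking in the reverse order below, is the usual deadlock-avoidance
 * idiom for taking two locks of the same class.
 */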
1318 
1319 void
1320 sosplice_sounlock_pair(struct socket *so1, struct socket *so2)
1321 {
1322 	if (so1 == so2)
1323 		rw_exit_write(&so1->so_lock);
1324 	else if (so1 < so2) {
1325 		rw_exit_write(&so2->so_lock);
1326 		rw_exit_write(&so1->so_lock);
1327 	} else {
1328 		rw_exit_write(&so1->so_lock);
1329 		rw_exit_write(&so2->so_lock);
1330 	}
1331 
1332 	NET_UNLOCK_SHARED();
1333 }
1334 
1335 int
1336 sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
1337 {
1338 	struct file	*fp;
1339 	struct socket	*sosp;
1340 	struct taskq	*tq;
1341 	int		 error = 0;
1342 
1343 	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
1344 		return (EPROTONOSUPPORT);
1345 	if (max && max < 0)
1346 		return (EINVAL);
1347 	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
1348 		return (EINVAL);
1349 
1350 	/* If no fd is given, unsplice by removing existing link. */
1351 	if (fd < 0) {
1352 		if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
1353 			return (error);
1354 		if (so->so_sp && so->so_sp->ssp_socket) {
1355 			sosp = soref(so->so_sp->ssp_socket);
1356 			sounsplice(so, so->so_sp->ssp_socket, 0);
1357 			sorele(sosp);
1358 		} else
1359 			error = EPROTO;
1360 		sbunlock(&so->so_rcv);
1361 		return (error);
1362 	}
1363 
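	/*
	 * Editor's note: this is double-checked locking; the paired
	 * membar_producer()/membar_consumer() ensure that a taskq pointer
	 * read without holding `sosplice_lock' is dereferenced only after
	 * its initialization is globally visible.
	 */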
1364 	if (sosplice_taskq == NULL) {
1365 		rw_enter_write(&sosplice_lock);
1366 		if (sosplice_taskq == NULL) {
1367 			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
1368 			    TASKQ_MPSAFE);
1369 			if (tq == NULL) {
1370 				rw_exit_write(&sosplice_lock);
1371 				return (ENOMEM);
1372 			}
1373 			/* Ensure the taskq is fully visible to other CPUs. */
1374 			membar_producer();
1375 			sosplice_taskq = tq;
1376 		}
1377 		rw_exit_write(&sosplice_lock);
1378 	} else {
1379 		/* Ensure the taskq is fully visible on this CPU. */
1380 		membar_consumer();
1381 	}
1382 
1383 	/* Find sosp, the drain socket into which data will be spliced. */
1384 	if ((error = getsock(curproc, fd, &fp)) != 0)
1385 		return (error);
1386 	sosp = fp->f_data;
1387 
1388 	if (sosp->so_proto->pr_usrreqs->pru_send !=
1389 	    so->so_proto->pr_usrreqs->pru_send) {
1390 		error = EPROTONOSUPPORT;
1391 		goto frele;
1392 	}
1393 
1394 	if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
1395 		goto frele;
1396 	if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) {
1397 		sbunlock(&so->so_rcv);
1398 		goto frele;
1399 	}
1400 	sosplice_solock_pair(so, sosp);
1401 
1402 	if ((so->so_options & SO_ACCEPTCONN) ||
1403 	    (sosp->so_options & SO_ACCEPTCONN)) {
1404 		error = EOPNOTSUPP;
1405 		goto release;
1406 	}
1407 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1408 	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1409 		error = ENOTCONN;
1410 		goto release;
1411 	}
1412 	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
1413 		error = ENOTCONN;
1414 		goto release;
1415 	}
1416 	if (so->so_sp == NULL) {
1417 		struct sosplice *so_sp;
1418 
1419 		so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
1420 		timeout_set_flags(&so_sp->ssp_idleto, soidle, so,
1421 		    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
1422 		task_set(&so_sp->ssp_task, sotask, so);
1423 
1424 		so->so_sp = so_sp;
1425 	}
1426 	if (sosp->so_sp == NULL) {
1427 		struct sosplice *so_sp;
1428 
1429 		so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
1430 		timeout_set_flags(&so_sp->ssp_idleto, soidle, sosp,
1431 		    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
1432 		task_set(&so_sp->ssp_task, sotask, sosp);
1433 
1434 		sosp->so_sp = so_sp;
1435 	}
1436 	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
1437 		error = EBUSY;
1438 		goto release;
1439 	}
1440 
1441 	so->so_splicelen = 0;
1442 	so->so_splicemax = max;
1443 	if (tv)
1444 		so->so_idletv = *tv;
1445 	else
1446 		timerclear(&so->so_idletv);
1447 
1448 	/*
1449 	 * To prevent sorwakeup() from calling somove() before this somove()
1450 	 * has finished, the socket buffers are not marked as spliced yet.
1451 	 */
1452 
1453 	/* Splice so and sosp together. */
1454 	mtx_enter(&so->so_rcv.sb_mtx);
1455 	mtx_enter(&sosp->so_snd.sb_mtx);
1456 	so->so_sp->ssp_socket = sosp;
1457 	sosp->so_sp->ssp_soback = so;
1458 	mtx_leave(&sosp->so_snd.sb_mtx);
1459 	mtx_leave(&so->so_rcv.sb_mtx);
1460 
1461 	sosplice_sounlock_pair(so, sosp);
1462 	sbunlock(&sosp->so_snd);
1463 
1464 	if (somove(so, M_WAIT)) {
1465 		mtx_enter(&so->so_rcv.sb_mtx);
1466 		mtx_enter(&sosp->so_snd.sb_mtx);
1467 		so->so_rcv.sb_flags |= SB_SPLICE;
1468 		sosp->so_snd.sb_flags |= SB_SPLICE;
1469 		mtx_leave(&sosp->so_snd.sb_mtx);
1470 		mtx_leave(&so->so_rcv.sb_mtx);
1471 	}
1472 
1473 	sbunlock(&so->so_rcv);
1474 	FRELE(fp, curproc);
1475 	return (0);
1476 
1477  release:
1478 	sosplice_sounlock_pair(so, sosp);
1479 	sbunlock(&sosp->so_snd);
1480 	sbunlock(&so->so_rcv);
1481  frele:
1482 	FRELE(fp, curproc);
1483 	return (error);
1484 }
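/*
 * Editor's note: a userland sketch, not part of the original file, of
 * driving sosplice() above through setsockopt(2).  Passing a plain int
 * splices with no byte limit and no idle timeout; -1 unsplices.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>

int
example_splice(int from, int to)
{
	/* The kernel now moves data from `from' to `to' without copyout. */
	return (setsockopt(from, SOL_SOCKET, SO_SPLICE, &to, sizeof(to)));
}
#endif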
1485 
1486 void
1487 sounsplice(struct socket *so, struct socket *sosp, int freeing)
1488 {
1489 	sbassertlocked(&so->so_rcv);
1490 
1491 	mtx_enter(&so->so_rcv.sb_mtx);
1492 	mtx_enter(&sosp->so_snd.sb_mtx);
1493 	so->so_rcv.sb_flags &= ~SB_SPLICE;
1494 	sosp->so_snd.sb_flags &= ~SB_SPLICE;
1495 	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
1496 	mtx_leave(&sosp->so_snd.sb_mtx);
1497 	mtx_leave(&so->so_rcv.sb_mtx);
1498 
1499 	task_del(sosplice_taskq, &so->so_splicetask);
1500 	timeout_del(&so->so_idleto);
1501 
1502 	/* Do not wake up a socket that is about to be freed. */
1503 	if ((freeing & SOSP_FREEING_READ) == 0) {
1504 		int readable;
1505 
1506 		solock_shared(so);
1507 		mtx_enter(&so->so_rcv.sb_mtx);
1508 		readable = soreadable(so);
1509 		mtx_leave(&so->so_rcv.sb_mtx);
1510 		if (readable)
1511 			sorwakeup(so);
1512 		sounlock_shared(so);
1513 	}
1514 	if ((freeing & SOSP_FREEING_WRITE) == 0) {
1515 		solock_shared(sosp);
1516 		if (sowriteable(sosp))
1517 			sowwakeup(sosp);
1518 		sounlock_shared(sosp);
1519 	}
1520 }
1521 
1522 void
1523 soidle(void *arg)
1524 {
1525 	struct socket *so = arg;
1526 
1527 	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
1528 	if (so->so_rcv.sb_flags & SB_SPLICE) {
1529 		struct socket *sosp;
1530 
1531 		WRITE_ONCE(so->so_error, ETIMEDOUT);
1532 		sosp = soref(so->so_sp->ssp_socket);
1533 		sounsplice(so, so->so_sp->ssp_socket, 0);
1534 		sorele(sosp);
1535 	}
1536 	sbunlock(&so->so_rcv);
1537 }
1538 
1539 void
1540 sotask(void *arg)
1541 {
1542 	struct socket *so = arg;
1543 	int doyield = 0;
1544 
1545 	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
1546 	if (so->so_rcv.sb_flags & SB_SPLICE) {
1547 		if (so->so_proto->pr_flags & PR_WANTRCVD)
1548 			doyield = 1;
1549 		somove(so, M_DONTWAIT);
1550 	}
1551 	sbunlock(&so->so_rcv);
1552 
1553 	if (doyield) {
1554 		/* Avoid userland starvation. */
1555 		yield();
1556 	}
1557 }
1558 
1559 /*
1560  * Move data from receive buffer of spliced source socket to send
1561  * buffer of drain socket.  Try to move as much as possible in one
1562  * big chunk.  It is a TCP-only implementation.
1563  * A return value of 0 means splicing has finished; 1 means continue.
1564  */
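/*
 * Editor's note, a summary not in the original file: each pass below
 * (1) checks both sockets for errors and shutdown, (2) computes how many
 * bytes fit into the drain buffer, (3) strips name and control mbufs,
 * (4) unlinks up to len bytes of data mbufs from the source, (5) handles
 * the urgent-data mark, and (6) hands the chain to the drain socket's
 * pru_send().
 */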
1565 int
1566 somove(struct socket *so, int wait)
1567 {
1568 	struct socket	*sosp = so->so_sp->ssp_socket;
1569 	struct mbuf	*m, **mp, *nextrecord;
1570 	u_long		 len, off, oobmark;
1571 	long		 space;
1572 	int		 error = 0, maxreached = 0, unsplice = 0;
1573 	unsigned int	 rcvstate;
1574 
1575 	sbassertlocked(&so->so_rcv);
1576 
1577 	if (so->so_proto->pr_flags & PR_WANTRCVD)
1578 		sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
1579 
1580 	mtx_enter(&so->so_rcv.sb_mtx);
1581 	mtx_enter(&sosp->so_snd.sb_mtx);
1582 
1583  nextpkt:
1584 	if ((error = READ_ONCE(so->so_error)))
1585 		goto release;
1586 	if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
1587 		error = EPIPE;
1588 		goto release;
1589 	}
1590 
1591 	error = READ_ONCE(sosp->so_error);
1592 	if (error) {
1593 		if (error != ETIMEDOUT && error != EFBIG && error != ELOOP)
1594 			goto release;
1595 		error = 0;
1596 	}
1597 	if ((sosp->so_state & SS_ISCONNECTED) == 0)
1598 		goto release;
1599 
1600 	/* Calculate how many bytes can be copied now. */
1601 	len = so->so_rcv.sb_datacc;
1602 	if (so->so_splicemax) {
1603 		KASSERT(so->so_splicelen < so->so_splicemax);
1604 		if (so->so_splicemax <= so->so_splicelen + len) {
1605 			len = so->so_splicemax - so->so_splicelen;
1606 			maxreached = 1;
1607 		}
1608 	}
1609 	space = sbspace_locked(sosp, &sosp->so_snd);
1610 	if (so->so_oobmark && so->so_oobmark < len &&
1611 	    so->so_oobmark < space + 1024)
1612 		space += 1024;
1613 	if (space <= 0) {
1614 		maxreached = 0;
1615 		goto release;
1616 	}
1617 	if (space < len) {
1618 		maxreached = 0;
1619 		if (space < sosp->so_snd.sb_lowat)
1620 			goto release;
1621 		len = space;
1622 	}
1623 	sosp->so_snd.sb_state |= SS_ISSENDING;
1624 
1625 	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
1626 	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
1627 	m = so->so_rcv.sb_mb;
1628 	if (m == NULL)
1629 		goto release;
1630 	nextrecord = m->m_nextpkt;
1631 
1632 	/* Drop address and control information not used with splicing. */
1633 	if (so->so_proto->pr_flags & PR_ADDR) {
1634 #ifdef DIAGNOSTIC
1635 		if (m->m_type != MT_SONAME)
1636 			panic("somove soname: so %p, so_type %d, m %p, "
1637 			    "m_type %d", so, so->so_type, m, m->m_type);
1638 #endif
1639 		m = m->m_next;
1640 	}
1641 	while (m && m->m_type == MT_CONTROL)
1642 		m = m->m_next;
1643 	if (m == NULL) {
1644 		sbdroprecord(so, &so->so_rcv);
1645 		if (so->so_proto->pr_flags & PR_WANTRCVD) {
1646 			mtx_leave(&sosp->so_snd.sb_mtx);
1647 			mtx_leave(&so->so_rcv.sb_mtx);
1648 			solock_shared(so);
1649 			pru_rcvd(so);
1650 			sounlock_shared(so);
1651 			mtx_enter(&so->so_rcv.sb_mtx);
1652 			mtx_enter(&sosp->so_snd.sb_mtx);
1653 		}
1654 		goto nextpkt;
1655 	}
1656 
1657 	/*
1658 	 * By splicing sockets connected to localhost, userland might create a
1659 	 * loop.  Dissolve the splicing with an error if a loop is detected
1660 	 * by the counter.
1661 	 *
1662 	 * If we are dealing with a looped broadcast/multicast packet, we
1663 	 * bail out with no error to suppress splice termination.
1663 	 */
1664 	if ((m->m_flags & M_PKTHDR) &&
1665 	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
1666 	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
1667 		error = ELOOP;
1668 		goto release;
1669 	}
1670 
1671 	if (so->so_proto->pr_flags & PR_ATOMIC) {
1672 		if ((m->m_flags & M_PKTHDR) == 0)
1673 			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
1674 			    "m_type %d", so, so->so_type, m, m->m_type);
1675 		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
1676 			error = EMSGSIZE;
1677 			goto release;
1678 		}
1679 		if (len < m->m_pkthdr.len)
1680 			goto release;
1681 		if (m->m_pkthdr.len < len) {
1682 			maxreached = 0;
1683 			len = m->m_pkthdr.len;
1684 		}
1685 		/*
1686 		 * Throw away the name mbuf once it is assured that
1687 		 * the whole first record can be processed.
1688 		 */
1689 		m = so->so_rcv.sb_mb;
1690 		sbfree(so, &so->so_rcv, m);
1691 		so->so_rcv.sb_mb = m_free(m);
1692 		sbsync(&so->so_rcv, nextrecord);
1693 	}
1694 	/*
1695 	 * Throw away the control mbufs once it is assured that
1696 	 * the whole first record can be processed.
1697 	 */
1698 	m = so->so_rcv.sb_mb;
1699 	while (m && m->m_type == MT_CONTROL) {
1700 		sbfree(so, &so->so_rcv, m);
1701 		so->so_rcv.sb_mb = m_free(m);
1702 		m = so->so_rcv.sb_mb;
1703 		sbsync(&so->so_rcv, nextrecord);
1704 	}
1705 
1706 	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
1707 	SBLASTMBUFCHK(&so->so_rcv, "somove 2");
1708 
1709 	/* Take mbufs holding at most len bytes out of the receive buffer. */
1710 	for (off = 0, mp = &m; off <= len && *mp;
1711 	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
1712 		u_long size = len - off;
1713 
1714 #ifdef DIAGNOSTIC
1715 		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
1716 			panic("somove type: so %p, so_type %d, m %p, "
1717 			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
1718 #endif
1719 		if ((*mp)->m_len > size) {
1720 			/*
1721 			 * Move only a partial mbuf at maximum splice length or
1722 			 * if the drain buffer is too small for this large mbuf.
1723 			 */
1724 			if (!maxreached && sosp->so_snd.sb_datacc > 0) {
1725 				len -= size;
1726 				break;
1727 			}
1728 			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
1729 			if (*mp == NULL) {
1730 				len -= size;
1731 				break;
1732 			}
1733 			so->so_rcv.sb_mb->m_data += size;
1734 			so->so_rcv.sb_mb->m_len -= size;
1735 			so->so_rcv.sb_cc -= size;
1736 			so->so_rcv.sb_datacc -= size;
1737 		} else {
1738 			*mp = so->so_rcv.sb_mb;
1739 			sbfree(so, &so->so_rcv, *mp);
1740 			so->so_rcv.sb_mb = (*mp)->m_next;
1741 			sbsync(&so->so_rcv, nextrecord);
1742 		}
1743 	}
1744 	*mp = NULL;
1745 
1746 	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
1747 	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
1748 	SBCHECK(so, &so->so_rcv);
1749 	if (m == NULL)
1750 		goto release;
1751 	m->m_nextpkt = NULL;
1752 	if (m->m_flags & M_PKTHDR) {
1753 		m_resethdr(m);
1754 		m->m_pkthdr.len = len;
1755 	}
1756 
1757 	/* The receive buffer shrank by len bytes, adjust the oob mark. */
1758 	rcvstate = so->so_rcv.sb_state;
1759 	so->so_rcv.sb_state &= ~SS_RCVATMARK;
1760 	oobmark = so->so_oobmark;
1761 	so->so_oobmark = oobmark > len ? oobmark - len : 0;
1762 	if (oobmark) {
1763 		if (oobmark == len)
1764 			so->so_rcv.sb_state |= SS_RCVATMARK;
1765 		if (oobmark >= len)
1766 			oobmark = 0;
1767 	}
1768 
1769 	/* Send window update to source peer as receive buffer has changed. */
1770 	if (so->so_proto->pr_flags & PR_WANTRCVD) {
1771 		mtx_leave(&sosp->so_snd.sb_mtx);
1772 		mtx_leave(&so->so_rcv.sb_mtx);
1773 		solock_shared(so);
1774 		pru_rcvd(so);
1775 		sounlock_shared(so);
1776 		mtx_enter(&so->so_rcv.sb_mtx);
1777 		mtx_enter(&sosp->so_snd.sb_mtx);
1778 	}
1779 
1780 	/*
1781 	 * Handle oob data.  If any malloc fails, ignore error.
1782 	 * TCP urgent data is not very reliable anyway.
1783 	 */
1784 	while (((rcvstate & SS_RCVATMARK) || oobmark) &&
1785 	    (so->so_options & SO_OOBINLINE)) {
1786 		struct mbuf *o = NULL;
1787 
1788 		if (rcvstate & SS_RCVATMARK) {
1789 			o = m_get(wait, MT_DATA);
1790 			rcvstate &= ~SS_RCVATMARK;
1791 		} else if (oobmark) {
1792 			o = m_split(m, oobmark, wait);
1793 			if (o) {
1794 				mtx_leave(&sosp->so_snd.sb_mtx);
1795 				mtx_leave(&so->so_rcv.sb_mtx);
1796 				solock_shared(sosp);
1797 				error = pru_send(sosp, m, NULL, NULL);
1798 				sounlock_shared(sosp);
1799 				mtx_enter(&so->so_rcv.sb_mtx);
1800 				mtx_enter(&sosp->so_snd.sb_mtx);
1801 
1802 				if (error) {
1803 					if (sosp->so_snd.sb_state &
1804 					    SS_CANTSENDMORE)
1805 						error = EPIPE;
1806 					m_freem(o);
1807 					goto release;
1808 				}
1809 				len -= oobmark;
1810 				so->so_splicelen += oobmark;
1811 				m = o;
1812 				o = m_get(wait, MT_DATA);
1813 			}
1814 			oobmark = 0;
1815 		}
1816 		if (o) {
1817 			o->m_len = 1;
1818 			*mtod(o, caddr_t) = *mtod(m, caddr_t);
1819 
1820 			mtx_leave(&sosp->so_snd.sb_mtx);
1821 			mtx_leave(&so->so_rcv.sb_mtx);
1822 			solock_shared(sosp);
1823 			error = pru_sendoob(sosp, o, NULL, NULL);
1824 			sounlock_shared(sosp);
1825 			mtx_enter(&so->so_rcv.sb_mtx);
1826 			mtx_enter(&sosp->so_snd.sb_mtx);
1827 
1828 			if (error) {
1829 				if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
1830 					error = EPIPE;
1831 				m_freem(m);
1832 				goto release;
1833 			}
1834 			len -= 1;
1835 			so->so_splicelen += 1;
1836 			if (oobmark) {
1837 				oobmark -= 1;
1838 				if (oobmark == 0)
1839 					rcvstate |= SS_RCVATMARK;
1840 			}
1841 			m_adj(m, 1);
1842 		}
1843 	}
1844 
1845 	/* Append all remaining data to drain socket. */
1846 	if (so->so_rcv.sb_cc == 0 || maxreached)
1847 		sosp->so_snd.sb_state &= ~SS_ISSENDING;
1848 
1849 	mtx_leave(&sosp->so_snd.sb_mtx);
1850 	mtx_leave(&so->so_rcv.sb_mtx);
1851 	solock_shared(sosp);
1852 	error = pru_send(sosp, m, NULL, NULL);
1853 	sounlock_shared(sosp);
1854 	mtx_enter(&so->so_rcv.sb_mtx);
1855 	mtx_enter(&sosp->so_snd.sb_mtx);
1856 
1857 	if (error) {
1858 		if (sosp->so_snd.sb_state & SS_CANTSENDMORE ||
1859 		    sosp->so_pcb == NULL)
1860 			error = EPIPE;
1861 		goto release;
1862 	}
1863 	so->so_splicelen += len;
1864 
1865 	/* Move several packets if possible. */
1866 	if (!maxreached && nextrecord)
1867 		goto nextpkt;
1868 
1869  release:
1870 	sosp->so_snd.sb_state &= ~SS_ISSENDING;
1871 
1872 	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
1873 		error = EFBIG;
1874 	if (error)
1875 		WRITE_ONCE(so->so_error, error);
1876 
1877 	if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
1878 	    so->so_rcv.sb_cc == 0) ||
1879 	    (sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
1880 	    maxreached || error)
1881 		unsplice = 1;
1882 
1883 	mtx_leave(&sosp->so_snd.sb_mtx);
1884 	mtx_leave(&so->so_rcv.sb_mtx);
1885 
1886 	if (so->so_proto->pr_flags & PR_WANTRCVD)
1887 		sbunlock(&so->so_snd);
1888 
1889 	if (unsplice) {
1890 		soref(sosp);
1891 		sounsplice(so, sosp, 0);
1892 		sorele(sosp);
1893 
1894 		return (0);
1895 	}
1896 	if (timerisset(&so->so_idletv))
1897 		timeout_add_tv(&so->so_idleto, &so->so_idletv);
1898 	return (1);
1899 }
1900 #endif /* SOCKET_SPLICE */
1901 
1902 void
1903 sorwakeup(struct socket *so)
1904 {
1905 #ifdef SOCKET_SPLICE
1906 	if (so->so_proto->pr_flags & PR_SPLICE) {
1907 		mtx_enter(&so->so_rcv.sb_mtx);
1908 		if (so->so_rcv.sb_flags & SB_SPLICE)
1909 			task_add(sosplice_taskq, &so->so_splicetask);
1910 		if (isspliced(so)) {
1911 			mtx_leave(&so->so_rcv.sb_mtx);
1912 			return;
1913 		}
1914 		mtx_leave(&so->so_rcv.sb_mtx);
1915 	}
1916 #endif
1917 	sowakeup(so, &so->so_rcv);
1918 	if (so->so_upcall)
1919 		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
1920 }
1921 
1922 void
1923 sowwakeup(struct socket *so)
1924 {
1925 #ifdef SOCKET_SPLICE
1926 	if (so->so_proto->pr_flags & PR_SPLICE) {
1927 		mtx_enter(&so->so_snd.sb_mtx);
1928 		if (so->so_snd.sb_flags & SB_SPLICE)
1929 			task_add(sosplice_taskq,
1930 			    &so->so_sp->ssp_soback->so_splicetask);
1931 		if (issplicedback(so)) {
1932 			mtx_leave(&so->so_snd.sb_mtx);
1933 			return;
1934 		}
1935 		mtx_leave(&so->so_snd.sb_mtx);
1936 	}
1937 #endif
1938 	sowakeup(so, &so->so_snd);
1939 }
1940 
1941 int
1942 sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
1943 {
1944 	int error = 0;
1945 
1946 	if (level != SOL_SOCKET) {
1947 		if (so->so_proto->pr_ctloutput) {
1948 			solock(so);
1949 			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
1950 			    level, optname, m);
1951 			sounlock(so);
1952 			return (error);
1953 		}
1954 		error = ENOPROTOOPT;
1955 	} else {
1956 		switch (optname) {
1957 
1958 		case SO_LINGER:
1959 			if (m == NULL || m->m_len != sizeof (struct linger) ||
1960 			    mtod(m, struct linger *)->l_linger < 0 ||
1961 			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
1962 				return (EINVAL);
1963 
1964 			solock(so);
1965 			so->so_linger = mtod(m, struct linger *)->l_linger;
1966 			if (*mtod(m, int *))
1967 				so->so_options |= optname;
1968 			else
1969 				so->so_options &= ~optname;
1970 			sounlock(so);
1971 
1972 			break;
1973 		case SO_BINDANY:
1974 			if ((error = suser(curproc)) != 0)	/* XXX */
1975 				return (error);
1976 			/* FALLTHROUGH */
1977 
1978 		case SO_DEBUG:
1979 		case SO_KEEPALIVE:
1980 		case SO_USELOOPBACK:
1981 		case SO_BROADCAST:
1982 		case SO_REUSEADDR:
1983 		case SO_REUSEPORT:
1984 		case SO_OOBINLINE:
1985 		case SO_TIMESTAMP:
1986 		case SO_ZEROIZE:
1987 			if (m == NULL || m->m_len < sizeof (int))
1988 				return (EINVAL);
1989 
1990 			solock(so);
1991 			if (*mtod(m, int *))
1992 				so->so_options |= optname;
1993 			else
1994 				so->so_options &= ~optname;
1995 			sounlock(so);
1996 
1997 			break;
1998 		case SO_DONTROUTE:
1999 			if (m == NULL || m->m_len < sizeof (int))
2000 				return (EINVAL);
2001 			if (*mtod(m, int *))
2002 				error = EOPNOTSUPP;
2003 			break;
2004 
2005 		case SO_SNDBUF:
2006 		case SO_RCVBUF:
2007 		case SO_SNDLOWAT:
2008 		case SO_RCVLOWAT:
2009 		    {
2010 			struct sockbuf *sb = (optname == SO_SNDBUF ||
2011 			    optname == SO_SNDLOWAT ?
2012 			    &so->so_snd : &so->so_rcv);
2013 			u_long cnt;
2014 
2015 			if (m == NULL || m->m_len < sizeof (int))
2016 				return (EINVAL);
2017 			cnt = *mtod(m, int *);
2018 			if ((long)cnt <= 0)
2019 				cnt = 1;
2020 
2021 			mtx_enter(&sb->sb_mtx);
2022 			switch (optname) {
2023 			case SO_SNDBUF:
2024 			case SO_RCVBUF:
2025 				if (sb->sb_state &
2026 				    (SS_CANTSENDMORE | SS_CANTRCVMORE)) {
2027 					error = EINVAL;
2028 					break;
2029 				}
2030 				if (sbcheckreserve(cnt, sb->sb_wat) ||
2031 				    sbreserve(so, sb, cnt)) {
2032 					error = ENOBUFS;
2033 					break;
2034 				}
2035 				sb->sb_wat = cnt;
2036 				break;
2037 			case SO_SNDLOWAT:
2038 			case SO_RCVLOWAT:
2039 				sb->sb_lowat = (cnt > sb->sb_hiwat) ?
2040 				    sb->sb_hiwat : cnt;
2041 				break;
2042 			}
2043 			mtx_leave(&sb->sb_mtx);
2044 
2045 			break;
2046 		    }
2047 
2048 		case SO_SNDTIMEO:
2049 		case SO_RCVTIMEO:
2050 		    {
2051 			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
2052 			    &so->so_snd : &so->so_rcv);
2053 			struct timeval tv;
2054 			uint64_t nsecs;
2055 
2056 			if (m == NULL || m->m_len < sizeof (tv))
2057 				return (EINVAL);
2058 			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
2059 			if (!timerisvalid(&tv))
2060 				return (EINVAL);
2061 			nsecs = TIMEVAL_TO_NSEC(&tv);
2062 			if (nsecs == UINT64_MAX)
2063 				return (EDOM);
2064 			if (nsecs == 0)
2065 				nsecs = INFSLP;
2066 
2067 			mtx_enter(&sb->sb_mtx);
2068 			sb->sb_timeo_nsecs = nsecs;
2069 			mtx_leave(&sb->sb_mtx);
2070 			break;
2071 		    }
2072 
2073 		case SO_RTABLE:
2074 			if (so->so_proto->pr_domain &&
2075 			    so->so_proto->pr_domain->dom_protosw &&
2076 			    so->so_proto->pr_ctloutput) {
2077 				const struct domain *dom =
2078 				    so->so_proto->pr_domain;
2079 
2080 				level = dom->dom_protosw->pr_protocol;
2081 				solock(so);
2082 				error = (*so->so_proto->pr_ctloutput)
2083 				    (PRCO_SETOPT, so, level, optname, m);
2084 				sounlock(so);
2085 			} else
2086 				error = ENOPROTOOPT;
2087 			break;
2088 #ifdef SOCKET_SPLICE
2089 		case SO_SPLICE:
2090 			if (m == NULL) {
2091 				error = sosplice(so, -1, 0, NULL);
2092 			} else if (m->m_len < sizeof(int)) {
2093 				error = EINVAL;
2094 			} else if (m->m_len < sizeof(struct splice)) {
2095 				error = sosplice(so, *mtod(m, int *), 0, NULL);
2096 			} else {
2097 				error = sosplice(so,
2098 				    mtod(m, struct splice *)->sp_fd,
2099 				    mtod(m, struct splice *)->sp_max,
2100 				   &mtod(m, struct splice *)->sp_idle);
2101 			}
2102 			break;
2103 #endif /* SOCKET_SPLICE */
2104 
2105 		default:
2106 			error = ENOPROTOOPT;
2107 			break;
2108 		}
2109 	}
2110 
2111 	return (error);
2112 }
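/*
 * Example (not part of this file): a hedged userland sketch of the
 * option formats sosetopt() validates above -- an int for the on/off
 * options and buffer sizes, a struct timeval for the timeouts.  The
 * values are illustrative.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>

int
tune_socket(int s)
{
	int bufsz = 64 * 1024;
	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };

	/* Rejected with EINVAL unless the buffer holds a full int;
	   an unreservable size fails with ENOBUFS. */
	if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &bufsz, sizeof(bufsz)) == -1)
		return (-1);
	/* An invalid timeval fails with EINVAL, an overflowing one
	   with EDOM. */
	if (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1)
		return (-1);
	return (0);
}
#endif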
2113 
2114 int
2115 sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
2116 {
2117 	int error = 0;
2118 
2119 	if (level != SOL_SOCKET) {
2120 		if (so->so_proto->pr_ctloutput) {
2121 			m->m_len = 0;
2122 
2123 			solock(so);
2124 			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
2125 			    level, optname, m);
2126 			sounlock(so);
2127 			return (error);
2128 		} else
2129 			return (ENOPROTOOPT);
2130 	} else {
2131 		m->m_len = sizeof (int);
2132 
2133 		switch (optname) {
2134 
2135 		case SO_LINGER:
2136 			m->m_len = sizeof (struct linger);
2137 			solock_shared(so);
2138 			mtod(m, struct linger *)->l_onoff =
2139 				so->so_options & SO_LINGER;
2140 			mtod(m, struct linger *)->l_linger = so->so_linger;
2141 			sounlock_shared(so);
2142 			break;
2143 
2144 		case SO_BINDANY:
2145 		case SO_USELOOPBACK:
2146 		case SO_DEBUG:
2147 		case SO_KEEPALIVE:
2148 		case SO_REUSEADDR:
2149 		case SO_REUSEPORT:
2150 		case SO_BROADCAST:
2151 		case SO_OOBINLINE:
2152 		case SO_ACCEPTCONN:
2153 		case SO_TIMESTAMP:
2154 		case SO_ZEROIZE:
2155 			*mtod(m, int *) = so->so_options & optname;
2156 			break;
2157 
2158 		case SO_DONTROUTE:
2159 			*mtod(m, int *) = 0;
2160 			break;
2161 
2162 		case SO_TYPE:
2163 			*mtod(m, int *) = so->so_type;
2164 			break;
2165 
2166 		case SO_ERROR:
2167 			solock(so);
2168 			*mtod(m, int *) = so->so_error;
2169 			so->so_error = 0;
2170 			sounlock(so);
2171 
2172 			break;
2173 
2174 		case SO_DOMAIN:
2175 			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
2176 			break;
2177 
2178 		case SO_PROTOCOL:
2179 			*mtod(m, int *) = so->so_proto->pr_protocol;
2180 			break;
2181 
2182 		case SO_SNDBUF:
2183 			*mtod(m, int *) = so->so_snd.sb_hiwat;
2184 			break;
2185 
2186 		case SO_RCVBUF:
2187 			*mtod(m, int *) = so->so_rcv.sb_hiwat;
2188 			break;
2189 
2190 		case SO_SNDLOWAT:
2191 			*mtod(m, int *) = so->so_snd.sb_lowat;
2192 			break;
2193 
2194 		case SO_RCVLOWAT:
2195 			*mtod(m, int *) = so->so_rcv.sb_lowat;
2196 			break;
2197 
2198 		case SO_SNDTIMEO:
2199 		case SO_RCVTIMEO:
2200 		    {
2201 			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
2202 			    &so->so_snd : &so->so_rcv);
2203 			struct timeval tv;
2204 			uint64_t nsecs;
2205 
2206 			mtx_enter(&sb->sb_mtx);
2207 			nsecs = sb->sb_timeo_nsecs;
2208 			mtx_leave(&sb->sb_mtx);
2209 
2210 			m->m_len = sizeof(struct timeval);
2211 			memset(&tv, 0, sizeof(tv));
2212 			if (nsecs != INFSLP)
2213 				NSEC_TO_TIMEVAL(nsecs, &tv);
2214 			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
2215 			break;
2216 		    }
2217 
2218 		case SO_RTABLE:
2219 			if (so->so_proto->pr_domain &&
2220 			    so->so_proto->pr_domain->dom_protosw &&
2221 			    so->so_proto->pr_ctloutput) {
2222 				const struct domain *dom =
2223 				    so->so_proto->pr_domain;
2224 
2225 				level = dom->dom_protosw->pr_protocol;
2226 				solock(so);
2227 				error = (*so->so_proto->pr_ctloutput)
2228 				    (PRCO_GETOPT, so, level, optname, m);
2229 				sounlock(so);
2230 				if (error)
2231 					return (error);
2232 				break;
2233 			}
2234 			return (ENOPROTOOPT);
2235 
2236 #ifdef SOCKET_SPLICE
2237 		case SO_SPLICE:
2238 		    {
2239 			off_t len;
2240 
2241 			m->m_len = sizeof(off_t);
2242 			solock_shared(so);
2243 			len = so->so_sp ? so->so_sp->ssp_len : 0;
2244 			sounlock_shared(so);
2245 			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
2246 			break;
2247 		    }
2248 #endif /* SOCKET_SPLICE */
2249 
2250 		case SO_PEERCRED:
2251 			if (so->so_proto->pr_protocol == AF_UNIX) {
2252 				struct unpcb *unp = sotounpcb(so);
2253 
2254 				solock(so);
2255 				if (unp->unp_flags & UNP_FEIDS) {
2256 					m->m_len = sizeof(unp->unp_connid);
2257 					memcpy(mtod(m, caddr_t),
2258 					    &(unp->unp_connid), m->m_len);
2259 					sounlock(so);
2260 					break;
2261 				}
2262 				sounlock(so);
2263 
2264 				return (ENOTCONN);
2265 			}
2266 			return (EOPNOTSUPP);
2267 
2268 		default:
2269 			return (ENOPROTOOPT);
2270 		}
2271 		return (0);
2272 	}
2273 }
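/*
 * Example (not part of this file): a hedged userland sketch of two
 * options sogetopt() answers above.  Note that reading SO_ERROR
 * clears so_error, and that SO_PEERCRED only succeeds on an AF_UNIX
 * socket whose peer credentials were recorded at connect time.
 */
#if 0
#include <sys/socket.h>

int
query_socket(int s)
{
	int error;
	struct sockpeercred cred;
	socklen_t len;

	len = sizeof(error);
	if (getsockopt(s, SOL_SOCKET, SO_ERROR, &error, &len) == -1)
		return (-1);

	len = sizeof(cred);	/* uid, gid and pid of the peer */
	if (getsockopt(s, SOL_SOCKET, SO_PEERCRED, &cred, &len) == -1)
		return (-1);	/* EOPNOTSUPP unless AF_UNIX */
	return (error);
}
#endif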
2274 
2275 void
2276 sohasoutofband(struct socket *so)
2277 {
2278 	pgsigio(&so->so_sigio, SIGURG, 0);
2279 	knote(&so->so_rcv.sb_klist, 0);
2280 }
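/*
 * Example (not part of this file): a hedged userland sketch of what
 * triggers sohasoutofband() above -- a peer sending urgent data.
 */
#if 0
#include <sys/socket.h>

ssize_t
send_urgent(int s)
{
	/* The receiver is signalled with SIGURG and its knotes fire. */
	return (send(s, "!", 1, MSG_OOB));
}
#endif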
2281 
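/*
 * The socket event filters below run without the kernel lock.
 * sofilt_lock() takes the lock that protects the socket for its
 * domain -- the shared net lock for inet sockets, the per-socket
 * rwlock for everything else -- and then the sockbuf mutex;
 * sofilt_unlock() releases them in reverse order.
 */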
2282 void
2283 sofilt_lock(struct socket *so, struct sockbuf *sb)
2284 {
2285 	switch (so->so_proto->pr_domain->dom_family) {
2286 	case PF_INET:
2287 	case PF_INET6:
2288 		NET_LOCK_SHARED();
2289 		break;
2290 	default:
2291 		rw_enter_write(&so->so_lock);
2292 		break;
2293 	}
2294 
2295 	mtx_enter(&sb->sb_mtx);
2296 }
2297 
2298 void
2299 sofilt_unlock(struct socket *so, struct sockbuf *sb)
2300 {
2301 	mtx_leave(&sb->sb_mtx);
2302 
2303 	switch (so->so_proto->pr_domain->dom_family) {
2304 	case PF_INET:
2305 	case PF_INET6:
2306 		NET_UNLOCK_SHARED();
2307 		break;
2308 	default:
2309 		rw_exit_write(&so->so_lock);
2310 		break;
2311 	}
2312 }
2313 
2314 int
2315 soo_kqfilter(struct file *fp, struct knote *kn)
2316 {
2317 	struct socket *so = kn->kn_fp->f_data;
2318 	struct sockbuf *sb;
2319 
2320 	switch (kn->kn_filter) {
2321 	case EVFILT_READ:
2322 		kn->kn_fop = &soread_filtops;
2323 		sb = &so->so_rcv;
2324 		break;
2325 	case EVFILT_WRITE:
2326 		kn->kn_fop = &sowrite_filtops;
2327 		sb = &so->so_snd;
2328 		break;
2329 	case EVFILT_EXCEPT:
2330 		kn->kn_fop = &soexcept_filtops;
2331 		sb = &so->so_rcv;
2332 		break;
2333 	default:
2334 		return (EINVAL);
2335 	}
2336 
2337 	klist_insert(&sb->sb_klist, kn);
2338 
2339 	return (0);
2340 }
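/*
 * Example (not part of this file): a hedged userland sketch of
 * attaching the read filter above through kqueue(2).  NOTE_LOWAT in
 * fflags with a byte count in data matches the kn_sfflags/kn_sdata
 * check in filt_soread() below.
 */
#if 0
#include <sys/event.h>

int
wait_for_data(int kq, int s)
{
	struct kevent kev;

	/* Fire only once at least 512 bytes are buffered. */
	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 512, NULL);
	return (kevent(kq, &kev, 1, NULL, 0, NULL));
}
#endif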
2341 
2342 void
2343 filt_sordetach(struct knote *kn)
2344 {
2345 	struct socket *so = kn->kn_fp->f_data;
2346 
2347 	klist_remove(&so->so_rcv.sb_klist, kn);
2348 }
2349 
2350 int
2351 filt_soread(struct knote *kn, long hint)
2352 {
2353 	struct socket *so = kn->kn_fp->f_data;
2354 	u_int state = READ_ONCE(so->so_state);
2355 	u_int error = READ_ONCE(so->so_error);
2356 	int rv = 0;
2357 
2358 	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
2359 
2360 	if (so->so_options & SO_ACCEPTCONN) {
2361 		short qlen = READ_ONCE(so->so_qlen);
2362 
2363 		soassertlocked_readonly(so);
2364 
2365 		kn->kn_data = qlen;
2366 		rv = (kn->kn_data != 0);
2367 
2368 		if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
2369 			if (state & SS_ISDISCONNECTED) {
2370 				kn->kn_flags |= __EV_HUP;
2371 				rv = 1;
2372 			} else {
2373 				rv = qlen || soreadable(so);
2374 			}
2375 		}
2376 
2377 		return rv;
2378 	}
2379 
2380 	kn->kn_data = so->so_rcv.sb_cc;
2381 #ifdef SOCKET_SPLICE
2382 	if (isspliced(so)) {
2383 		rv = 0;
2384 	} else
2385 #endif /* SOCKET_SPLICE */
2386 	if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
2387 		kn->kn_flags |= EV_EOF;
2388 		if (kn->kn_flags & __EV_POLL) {
2389 			if (state & SS_ISDISCONNECTED)
2390 				kn->kn_flags |= __EV_HUP;
2391 		}
2392 		kn->kn_fflags = error;
2393 		rv = 1;
2394 	} else if (error) {
2395 		rv = 1;
2396 	} else if (kn->kn_sfflags & NOTE_LOWAT) {
2397 		rv = (kn->kn_data >= kn->kn_sdata);
2398 	} else {
2399 		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
2400 	}
2401 
2402 	return rv;
2403 }
2404 
2405 void
2406 filt_sowdetach(struct knote *kn)
2407 {
2408 	struct socket *so = kn->kn_fp->f_data;
2409 
2410 	klist_remove(&so->so_snd.sb_klist, kn);
2411 }
2412 
2413 int
2414 filt_sowrite(struct knote *kn, long hint)
2415 {
2416 	struct socket *so = kn->kn_fp->f_data;
2417 	u_int state = READ_ONCE(so->so_state);
2418 	u_int error = READ_ONCE(so->so_error);
2419 	int rv;
2420 
2421 	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
2422 
2423 	kn->kn_data = sbspace_locked(so, &so->so_snd);
2424 	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
2425 		kn->kn_flags |= EV_EOF;
2426 		if (kn->kn_flags & __EV_POLL) {
2427 			if (state & SS_ISDISCONNECTED)
2428 				kn->kn_flags |= __EV_HUP;
2429 		}
2430 		kn->kn_fflags = error;
2431 		rv = 1;
2432 	} else if (error) {
2433 		rv = 1;
2434 	} else if (((state & SS_ISCONNECTED) == 0) &&
2435 	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2436 		rv = 0;
2437 	} else if (kn->kn_sfflags & NOTE_LOWAT) {
2438 		rv = (kn->kn_data >= kn->kn_sdata);
2439 	} else {
2440 		rv = (kn->kn_data >= so->so_snd.sb_lowat);
2441 	}
2442 
2443 	return (rv);
2444 }
2445 
2446 int
2447 filt_soexcept(struct knote *kn, long hint)
2448 {
2449 	struct socket *so = kn->kn_fp->f_data;
2450 	int rv = 0;
2451 
2452 	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
2453 
2454 #ifdef SOCKET_SPLICE
2455 	if (isspliced(so)) {
2456 		rv = 0;
2457 	} else
2458 #endif /* SOCKET_SPLICE */
2459 	if (kn->kn_sfflags & NOTE_OOB) {
2460 		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
2461 			kn->kn_fflags |= NOTE_OOB;
2462 			kn->kn_data -= so->so_oobmark;
2463 			rv = 1;
2464 		}
2465 	}
2466 
2467 	if (kn->kn_flags & __EV_POLL) {
2468 		u_int state = READ_ONCE(so->so_state);
2469 
2470 		if (state & SS_ISDISCONNECTED) {
2471 			kn->kn_flags |= __EV_HUP;
2472 			rv = 1;
2473 		}
2474 	}
2475 
2476 	return rv;
2477 }
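/*
 * Example (not part of this file): a hedged userland sketch of
 * watching for out-of-band data with the except filter above.
 * NOTE_OOB is reported while urgent data is pending or the receive
 * buffer sits at the out-of-band mark.
 */
#if 0
#include <sys/event.h>

int
watch_oob(int kq, int s)
{
	struct kevent kev;

	EV_SET(&kev, s, EVFILT_EXCEPT, EV_ADD, NOTE_OOB, 0, NULL);
	return (kevent(kq, &kev, 1, NULL, 0, NULL));
}
#endif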
2478 
2479 int
2480 filt_sowmodify(struct kevent *kev, struct knote *kn)
2481 {
2482 	struct socket *so = kn->kn_fp->f_data;
2483 	int rv;
2484 
2485 	sofilt_lock(so, &so->so_snd);
2486 	rv = knote_modify(kev, kn);
2487 	sofilt_unlock(so, &so->so_snd);
2488 
2489 	return (rv);
2490 }
2491 
2492 int
2493 filt_sowprocess(struct knote *kn, struct kevent *kev)
2494 {
2495 	struct socket *so = kn->kn_fp->f_data;
2496 	int rv;
2497 
2498 	sofilt_lock(so, &so->so_snd);
2499 	rv = knote_process(kn, kev);
2500 	sofilt_unlock(so, &so->so_snd);
2501 
2502 	return (rv);
2503 }
2504 
2505 int
2506 filt_sormodify(struct kevent *kev, struct knote *kn)
2507 {
2508 	struct socket *so = kn->kn_fp->f_data;
2509 	int rv;
2510 
2511 	sofilt_lock(so, &so->so_rcv);
2512 	rv = knote_modify(kev, kn);
2513 	sofilt_unlock(so, &so->so_rcv);
2514 
2515 	return (rv);
2516 }
2517 
2518 int
2519 filt_sorprocess(struct knote *kn, struct kevent *kev)
2520 {
2521 	struct socket *so = kn->kn_fp->f_data;
2522 	int rv;
2523 
2524 	sofilt_lock(so, &so->so_rcv);
2525 	rv = knote_process(kn, kev);
2526 	sofilt_unlock(so, &so->so_rcv);
2527 
2528 	return (rv);
2529 }
2530 
2531 #ifdef DDB
2532 void
2533 sobuf_print(struct sockbuf *,
2534     int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));
2535 
2536 void
2537 sobuf_print(struct sockbuf *sb,
2538     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2539 {
2540 	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
2541 	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
2542 	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
2543 	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
2544 	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
2545 	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
2546 	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
2547 	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
2548 	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
2549 	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
2550 	(*pr)("\tsb_flags: %04x\n", sb->sb_flags);
2551 	(*pr)("\tsb_state: %04x\n", sb->sb_state);
2552 	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
2553 }
2554 
2555 void
2556 so_print(void *v,
2557     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2558 {
2559 	struct socket *so = v;
2560 
2561 	(*pr)("socket %p\n", so);
2562 	(*pr)("so_type: %i\n", so->so_type);
2563 	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
2564 	(*pr)("so_linger: %i\n", so->so_linger);
2565 	(*pr)("so_state: 0x%04x\n", so->so_state);
2566 	(*pr)("so_pcb: %p\n", so->so_pcb);
2567 	(*pr)("so_proto: %p\n", so->so_proto);
2568 	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);
2569 
2570 	(*pr)("so_head: %p\n", so->so_head);
2571 	(*pr)("so_onq: %p\n", so->so_onq);
2572 	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
2573 	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
2574 	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
2575 	(*pr)("so_q0len: %i\n", so->so_q0len);
2576 	(*pr)("so_qlen: %i\n", so->so_qlen);
2577 	(*pr)("so_qlimit: %i\n", so->so_qlimit);
2578 	(*pr)("so_timeo: %i\n", so->so_timeo);
2579 	(*pr)("so_oobmark: %lu\n", so->so_oobmark);
2580 
2581 	(*pr)("so_sp: %p\n", so->so_sp);
2582 	if (so->so_sp != NULL) {
2583 		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
2584 		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
2585 		(*pr)("\tssp_len: %lld\n",
2586 		    (unsigned long long)so->so_sp->ssp_len);
2587 		(*pr)("\tssp_max: %lld\n",
2588 		    (unsigned long long)so->so_sp->ssp_max);
2589 		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
2590 		    so->so_sp->ssp_idletv.tv_usec);
2591 		(*pr)("\tssp_idleto: %spending (@%i)\n",
2592 		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
2593 		    so->so_sp->ssp_idleto.to_time);
2594 	}
2595 
2596 	(*pr)("so_rcv:\n");
2597 	sobuf_print(&so->so_rcv, pr);
2598 	(*pr)("so_snd:\n");
2599 	sobuf_print(&so->so_snd, pr);
2600 
2601 	(*pr)("so_upcall: %p so_upcallarg: %p\n",
2602 	    so->so_upcall, so->so_upcallarg);
2603 
2604 	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
2605 	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
2606 	(*pr)("so_cpid: %d\n", so->so_cpid);
2607 }
2608 #endif
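/*
 * sobuf_print() and so_print() are only reachable from the in-kernel
 * debugger; ddb(4)'s "show socket" command is the usual entry point.
 */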
2609