/*	$OpenBSD: uipc_socket.c,v 1.89 2011/04/04 11:10:26 claudio Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <net/route.h>
#include <sys/pool.h>

int	sosplice(struct socket *, int, off_t);
int	somove(struct socket *, int);
void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_solisten(struct knote *kn, long hint);

struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };


#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;

void
soinit(void)
{

	pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
/*ARGSUSED*/
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	struct protosw *prp;
	struct socket *so;
	int error, s;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreq == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	s = splsoftnet();
	so = pool_get(&socket_pool, PR_WAITOK | PR_ZERO);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	if (suser(p, 0) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_cred->p_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_cred->p_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_pid;
	so->so_proto = prp;
	error = (*prp->pr_usrreq)(so, PRU_ATTACH, NULL,
	    (struct mbuf *)(long)proto, NULL, p);
	if (error) {
		so->so_state |= SS_NOFDREF;
		sofree(so);
		splx(s);
		return (error);
	}
	splx(s);
	*aso = so;
	return (0);
}
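
/*
 * Example (sketch, not part of the original file): socreate() is the
 * kernel side of socket(2).  A userland call such as
 *
 *	int s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
 *
 * arrives here as socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP).
 * With a nonzero proto the protocol switch entry comes from
 * pffindproto(), otherwise from pffindtype().
 */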

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	int s = splsoftnet();
	int error;

	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, p);
	splx(s);
	return (error);
}

int
solisten(struct socket *so, int backlog)
{
	int s, error;

#ifdef SOCKET_SPLICE
	if (so->so_splice || so->so_spliceback)
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL,
	    curproc);
	if (error) {
		splx(s);
		return (error);
	}
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	if (backlog < sominconn)
		backlog = sominconn;
	so->so_qlimit = backlog;
	splx(s);
	return (0);
}
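
/*
 * Worked example of the backlog clamping above, assuming the usual
 * SOMAXCONN value of 128 and the SOMINCONN default of 80:
 *
 *	listen(s, -1)	-> so_qlimit = 128	(negative: use maximum)
 *	listen(s, 5)	-> so_qlimit = 80	(raised to sominconn)
 *	listen(s, 1000)	-> so_qlimit = 128	(capped at somaxconn)
 */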

/*
 * Must be called at splsoftnet().
 */
void
sofree(struct socket *so)
{
	splsoftassert(IPL_SOFTNET);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0))
			return;
	}
#ifdef SOCKET_SPLICE
	if (so->so_spliceback) {
		so->so_snd.sb_flags &= ~SB_SPLICE;
		so->so_spliceback->so_rcv.sb_flags &= ~SB_SPLICE;
		so->so_spliceback->so_splice = NULL;
		if (soreadable(so->so_spliceback))
			sorwakeup(so->so_spliceback);
	}
	if (so->so_splice) {
		so->so_splice->so_snd.sb_flags &= ~SB_SPLICE;
		so->so_rcv.sb_flags &= ~SB_SPLICE;
		so->so_splice->so_spliceback = NULL;
	}
	so->so_spliceback = so->so_splice = NULL;
#endif /* SOCKET_SPLICE */
	sbrelease(&so->so_snd);
	sorflush(so);
	pool_put(&socket_pool, so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so)
{
	struct socket *so2;
	int s = splsoftnet();		/* conservative */
	int error = 0;

	if (so->so_options & SO_ACCEPTCONN) {
		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			(void) soqremque(so2, 0);
			(void) soabort(so2);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			(void) soqremque(so2, 1);
			(void) soabort(so2);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, NULL,
		    NULL, NULL, curproc);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
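
/*
 * Example (sketch) of how the SO_LINGER branch above is reached from
 * userland:
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *
 *	setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *	close(s);	(sleeps in "netcls" for up to 5 seconds
 *			 while the disconnect completes)
 *
 * With SS_NBIO set, a lingering close drops straight through to the
 * protocol detach instead of sleeping.
 */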

/*
 * Must be called at splsoftnet.
 */
int
soabort(struct socket *so)
{
	splsoftassert(IPL_SOFTNET);

	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, NULL, NULL,
	   curproc);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int s = splsoftnet();
	int error = 0;

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, NULL,
		    nam, NULL, curproc);
	else
		error = ECONNABORTED;
	splx(s);
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int s;
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splsoftnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    NULL, nam, NULL, curproc);
	splx(s);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int s = splsoftnet();
	int error;

	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL,
	    (struct mbuf *)so2, NULL, curproc);
	splx(s);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int s = splsoftnet();
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, NULL, NULL,
	    NULL, curproc);
bad:
	splx(s);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	struct mbuf **mp;
	struct mbuf *m;
	long space, len, mlen, clen = 0;
	quad_t resid;
	int error, s, dontroute;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned (since uio->uio_resid is).
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 * MSG_EOR on a SOCK_STREAM socket is also invalid.
	 */
	if (resid < 0 ||
	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (uio && uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	so->so_state |= SS_ISSENDING;
	do {
		s = splsoftnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			so->so_state &= ~SS_ISSENDING;
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (top == 0) {
					MGETHDR(m, M_WAIT, MT_DATA);
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					MGET(m, M_WAIT, MT_DATA);
					mlen = MLEN;
				}
				if (resid >= MINCLSIZE && space >= MCLBYTES) {
					MCLGET(m, M_NOWAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					mlen = MCLBYTES;
					if (atomic && top == 0) {
						len = lmin(MCLBYTES - max_hdr, resid);
						m->m_data += max_hdr;
					} else
						len = lmin(MCLBYTES, resid);
					space -= len;
				} else {
nopages:
					len = lmin(lmin(mlen, resid), space);
					space -= len;
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				error = uiomove(mtod(m, caddr_t), (int)len,
				    uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			s = splsoftnet();		/* XXX */
			if (resid <= 0)
				so->so_state &= ~SS_ISSENDING;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, curproc);
			splx(s);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			clen = 0;
			control = 0;
			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_state &= ~SS_ISSENDING;
	sbunlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
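
/*
 * Worked example of the space checks in sosend() above for an atomic
 * (PR_ATOMIC) socket with sb_hiwat = 16384: a 20000-byte datagram
 * fails with EMSGSIZE at once, since it could never fit, while a
 * 10000-byte datagram finding only 4000 bytes of space either fails
 * with EWOULDBLOCK (SS_NBIO) or sleeps in sbwait() until the protocol
 * drains the buffer, because an atomic send is never split.
 */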

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	size_t orig_resid = uio->uio_resid;
	int uio_error = 0;
	int resid;

	mp = mp0;
	if (paddr)
		*paddr = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (so->so_state & SS_NBIO)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), NULL, curproc);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL, curproc);

restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	s = splsoftnet();

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (so->so_splice)
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (so->so_splice == NULL)
#endif /* SOCKET_SPLICE */
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * While we process the initial mbufs containing address and control
	 * info, we save a copy of m->m_nextpkt into nextrecord.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
				   error = (*pr->pr_domain->dom_externalize)(m,
				       controllen);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose &&
				    mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
					pr->pr_domain->dom_dispose(m);
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}

	/*
	 * If m is non-NULL, we have some data to read.  From now on,
	 * make sure to keep sb_lastrecord consistent when working on
	 * the last packet on the chain (nextrecord == NULL) and we
	 * change m->m_nextpkt.
	 */
	if (m) {
		if ((flags & MSG_PEEK) == 0) {
			m->m_nextpkt = nextrecord;
			/*
			 * If nextrecord == NULL (this is a single chain),
			 * then sb_lastrecord may not be valid here if m
			 * was changed earlier.
			 */
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m);
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == m);
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * The sockbuf must stay consistent (pointing at the
		 * current mbuf and the next record) while we drop
		 * priority; we must note any additions to the sockbuf
		 * when we block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			splx(s);
			uio_error =
				uiomove(mtod(m, caddr_t) + moff, (int)len,
					uio);
			s = splsoftnet();
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, NULL,
			    (struct mbuf *)(long)flags, NULL, curproc);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
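
/*
 * Sketch of two receive patterns handled above (illustrative userland
 * code, not part of this file):
 *
 *	char buf[512];
 *
 *	recv(s, buf, sizeof(buf), MSG_PEEK);	(copy data out but
 *						 leave it queued)
 *	recv(s, buf, sizeof(buf), MSG_WAITALL);	(block until the full
 *						 amount, an error or EOF)
 *
 * As the comment above notes, MSG_WAITALL can still return a short
 * count when the request exceeds sb_hiwat.
 */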

int
soshutdown(struct socket *so, int how)
{
	struct protosw *pr = so->so_proto;

	switch (how) {
	case SHUT_RD:
	case SHUT_RDWR:
		sorflush(so);
		if (how == SHUT_RD)
			return (0);
		/* FALLTHROUGH */
	case SHUT_WR:
		return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, NULL, NULL,
		    curproc);
	default:
		return (EINVAL);
	}
}
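
/*
 * Mapping of the shutdown(2) "how" values handled above:
 *
 *	shutdown(s, SHUT_RD);	flush the receive side only
 *	shutdown(s, SHUT_WR);	PRU_SHUTDOWN: no further sends
 *	shutdown(s, SHUT_RDWR);	both of the above
 */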

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	int s;
	struct sockbuf asb;

	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splnet();
	socantrcvmore(so);
	sbunlock(sb);
	asb = *sb;
	bzero(sb, sizeof (*sb));
	/* XXX - the bzero stomps all over so_rcv */
	if (asb.sb_flags & SB_KNOTE) {
		sb->sb_sel.si_note = asb.sb_sel.si_note;
		sb->sb_flags = SB_KNOTE;
	}
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}

#ifdef SOCKET_SPLICE
int
sosplice(struct socket *so, int fd, off_t max)
{
	struct file	*fp;
	struct socket	*sosp;
	int		 s, error = 0;

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0)
		return (ENOTCONN);

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		s = splsoftnet();
		if (so->so_splice) {
			so->so_splice->so_snd.sb_flags &= ~SB_SPLICE;
			so->so_rcv.sb_flags &= ~SB_SPLICE;
			so->so_splice->so_spliceback = NULL;
			so->so_splice = NULL;
			if (soreadable(so))
				sorwakeup(so);
		}
		splx(s);
		return (0);
	}

	if (max && max < 0)
		return (EINVAL);

	/* Find sosp, the drain socket into which data will be spliced. */
	if ((error = getsock(curproc->p_fd, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;

	/* Lock both receive and send buffer. */
	if ((error = sblock(&so->so_rcv,
	    (so->so_state & SS_NBIO) ? M_NOWAIT : M_WAITOK)) != 0) {
		FRELE(fp);
		return (error);
	}
	if ((error = sblock(&sosp->so_snd, M_WAITOK)) != 0) {
		sbunlock(&so->so_rcv);
		FRELE(fp);
		return (error);
	}
	s = splsoftnet();

	if (so->so_splice || sosp->so_spliceback) {
		error = EBUSY;
		goto release;
	}
	if (sosp->so_proto->pr_usrreq != so->so_proto->pr_usrreq) {
		error = EPROTONOSUPPORT;
		goto release;
	}
	if (sosp->so_options & SO_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}

	/* Splice so and sosp together. */
	so->so_splice = sosp;
	sosp->so_spliceback = so;
	so->so_splicelen = 0;
	so->so_splicemax = max;

	/*
	 * To prevent softnet interrupt from calling somove() while
	 * we sleep, the socket buffers are not marked as spliced yet.
	 */
	if (somove(so, M_WAIT)) {
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
	}

 release:
	splx(s);
	sbunlock(&sosp->so_snd);
	sbunlock(&so->so_rcv);
	FRELE(fp);
	return (error);
}
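
/*
 * Example (sketch) of driving sosplice() via setsockopt(2) on a
 * kernel built with SOCKET_SPLICE; see the SO_SPLICE cases in
 * sosetopt() below:
 *
 *	struct splice sp = { .sp_fd = to, .sp_max = 0 };	(0: no limit)
 *
 *	setsockopt(from, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp));
 *	...
 *	setsockopt(from, SOL_SOCKET, SO_SPLICE, NULL, 0);	(unsplice)
 *
 * Both sockets must be connected and share the same pr_usrreq (in
 * practice, TCP), and a socket cannot be spliced twice in the same
 * direction; that fails with EBUSY.
 */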

/*
 * Move data from the receive buffer of the spliced source socket to
 * the send buffer of the drain socket.  Try to move as much as
 * possible in one big chunk.  It is a TCP only implementation.
 * Returns 0 if splicing has finished, 1 if it should continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket	*sosp = so->so_splice;
	struct mbuf	*m = NULL, **mp;
	u_long		 len, off, oobmark;
	long		 space;
	int		 error = 0, maxreached = 0;
	short		 state;

	splsoftassert(IPL_SOFTNET);

	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	if (sosp->so_error) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_cc;
	if (len == 0)
		goto release;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace(&sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_state |= SS_ISSENDING;

	/* Take at most len mbufs out of receive buffer. */
	m = so->so_rcv.sb_mb;
	for (off = 0, mp = &m; off < len;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

		if ((*mp)->m_len > size) {
			if (!maxreached || (*mp = m_copym(
			    so->so_rcv.sb_mb, 0, size, wait)) == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(&so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
		}
	}
	*mp = NULL;
	SB_EMPTY_FIXUP(&so->so_rcv);
	so->so_rcv.sb_lastrecord = so->so_rcv.sb_mb;

	SBLASTRECORDCHK(&so->so_rcv, "somove");
	SBLASTMBUFCHK(&so->so_rcv, "somove");
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(so->so_rcv.sb_mb == so->so_rcv.sb_lastrecord);
#ifdef SOCKBUF_DEBUG
	sbcheck(&so->so_rcv);
#endif

	/* Send window update to source peer if receive buffer has changed. */
	if (m)
		(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
		    (struct mbuf *)0L, NULL, NULL);

	/* Receive buffer did shrink by len bytes, adjust oob. */
	state = so->so_state;
	so->so_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (m && ((state & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (state & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			state &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				error = (*sosp->so_proto->pr_usrreq)(sosp,
				    PRU_SEND, m, NULL, NULL, NULL);
				m = NULL;
				if (error) {
					m_freem(o);
					if (sosp->so_state & SS_CANTSENDMORE)
						error = EPIPE;
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);
			error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SENDOOB,
			    o, NULL, NULL, NULL);
			if (error) {
				if (sosp->so_state & SS_CANTSENDMORE)
					error = EPIPE;
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					state |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (m) {
		if (so->so_rcv.sb_cc == 0 || maxreached)
			sosp->so_state &= ~SS_ISSENDING;
		error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SEND, m, NULL,
		    NULL, NULL);
		m = NULL;
		if (error) {
			if (sosp->so_state & SS_CANTSENDMORE)
				error = EPIPE;
			goto release;
		}
		so->so_splicelen += len;
	}

 release:
	if (m)
		m_freem(m);
	sosp->so_state &= ~SS_ISSENDING;
	if (error)
		so->so_error = error;
	if (((so->so_state & SS_CANTRCVMORE) && so->so_rcv.sb_cc == 0) ||
	    (sosp->so_state & SS_CANTSENDMORE) || maxreached || error) {
		sosp->so_snd.sb_flags &= ~SB_SPLICE;
		so->so_rcv.sb_flags &= ~SB_SPLICE;
		so->so_splice = sosp->so_spliceback = NULL;
		if (soreadable(so))
			sorwakeup(so);
		return (0);
	}
	return (1);
}
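
/*
 * Worked example of the oobmark bookkeeping in somove() above: moving
 * len = 100 bytes with so_oobmark = 40 leaves the urgent mark inside
 * the chunk, so so_oobmark becomes 0 and the local oobmark of 40
 * drives the OOB handling loop; with so_oobmark = 150 the mark is
 * still ahead, so_oobmark drops to 50 and the local oobmark is
 * cleared (no OOB handling this pass); exactly oobmark == len re-arms
 * SS_RCVATMARK for the next pass.
 */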

void
sorwakeup(struct socket *so)
{
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		(void) somove(so, M_DONTWAIT);
		return;
	}
	_sorwakeup(so);
}

void
sowwakeup(struct socket *so)
{
	if (so->so_snd.sb_flags & SB_SPLICE)
		(void) somove(so->so_spliceback, M_DONTWAIT);
	_sowwakeup(so);
}
#endif /* SOCKET_SPLICE */

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m0)
{
	int error = 0;
	struct mbuf *m = m0;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {
		case SO_BINDANY:
			if ((error = suser(curproc, 0)) != 0)	/* XXX */
				goto bad;
			break;
		}

		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* FALLTHROUGH */

		case SO_BINDANY:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_JUMBO:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;
			switch (optname) {

			case SO_SNDBUF:
				if (so->so_state & SS_CANTSENDMORE) {
					error = EINVAL;
					goto bad;
				}
				if (sbcheckreserve(cnt, so->so_snd.sb_wat) ||
				    sbreserve(&so->so_snd, cnt)) {
					error = ENOBUFS;
					goto bad;
				}
				so->so_snd.sb_wat = cnt;
				break;

			case SO_RCVBUF:
				if (so->so_state & SS_CANTRCVMORE) {
					error = EINVAL;
					goto bad;
				}
				if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
				    sbreserve(&so->so_rcv, cnt)) {
					error = ENOBUFS;
					goto bad;
				}
				so->so_rcv.sb_wat = cnt;
				break;

			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (cnt > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : cnt;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (cnt > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : cnt;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			u_short val;

			if (m == NULL || m->m_len < sizeof (*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			if (tv->tv_sec > (USHRT_MAX - tv->tv_usec / tick) / hz) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;
			if (val == 0 && tv->tv_usec != 0)
				val = 1;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0);
			} else if (m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}
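
/*
 * Worked example of the SO_SNDTIMEO/SO_RCVTIMEO conversion above,
 * assuming hz = 100 (so tick = 10000 microseconds): a timeout of
 * { 2, 500000 } becomes 2 * 100 + 500000 / 10000 = 250 ticks, and a
 * nonzero timeout that would round down to 0 ticks is forced up to 1
 * so it cannot be mistaken for "no timeout".  Usage sketch:
 *
 *	struct timeval tv = { 2, 500000 };
 *
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */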

int
sogetopt(struct socket *so, int level, int optname, struct mbuf **mp)
{
	struct mbuf *m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_GETOPT, so, level, optname, mp));
		} else
			return (ENOPROTOOPT);
	} else {
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_JUMBO:
		case SO_TIMESTAMP:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			int val = (optname == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			int s = splsoftnet();

			m->m_len = sizeof(off_t);
			*mtod(m, off_t *) = so->so_splicelen;
			splx(s);
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				if (unp->unp_flags & UNP_FEIDS) {
					*mp = m = m_get(M_WAIT, MT_SOOPTS);
					m->m_len = sizeof(unp->unp_connid);
					bcopy((caddr_t)(&(unp->unp_connid)),
					    mtod(m, caddr_t),
					    (unsigned)m->m_len);
				} else
					return (ENOTCONN);
			} else
				return (EOPNOTSUPP);
			break;

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}
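
/*
 * Example (sketch): the SO_ERROR case above reads and clears the
 * pending socket error, which is how the result of a nonblocking
 * connect(2) is usually collected:
 *
 *	int err;
 *	socklen_t len = sizeof(err);
 *
 *	getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &len);
 */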

void
sohasoutofband(struct socket *so)
{
	csignal(so->so_pgid, SIGURG, so->so_siguid, so->so_sigeuid);
	selwakeup(&so->so_rcv.sb_sel);
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;
	struct sockbuf *sb;
	int s;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (1);
	}

	s = splnet();
	SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
	sb->sb_flags |= SB_KNOTE;
	splx(s);
	return (0);
}
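
/*
 * Example (sketch) of registering the filters attached above:
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * On a listening socket EVFILT_READ selects solisten_filtops and
 * kn_data reports the accept queue length; otherwise it reports the
 * bytes pending in the receive buffer.
 */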

void
filt_sordetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;
	int s = splnet();

	SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	splx(s);
}

/*ARGSUSED*/
int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (so->so_splice)
		return (0);
#endif /* SOCKET_SPLICE */
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_rcv.sb_lowat);
}
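
/*
 * NOTE_LOWAT above lets a caller override the socket's low water
 * mark, e.g. to be woken only once at least 512 bytes are queued:
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 512, NULL);
 */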

void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;
	int s = splnet();

	SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	splx(s);
}

/*ARGSUSED*/
int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.sb_lowat);
}

/*ARGSUSED*/
int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (so->so_qlen != 0);
}