1 /* $OpenBSD: uipc_socket.c,v 1.335 2024/05/17 19:11:14 mvs Exp $ */
2 /* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */
3
4 /*
5 * Copyright (c) 1982, 1986, 1988, 1990, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
33 */
34
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 #include <sys/file.h>
39 #include <sys/filedesc.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/domain.h>
43 #include <sys/event.h>
44 #include <sys/protosw.h>
45 #include <sys/socket.h>
46 #include <sys/unpcb.h>
47 #include <sys/socketvar.h>
48 #include <sys/signalvar.h>
49 #include <sys/pool.h>
50 #include <sys/atomic.h>
51 #include <sys/rwlock.h>
52 #include <sys/time.h>
53 #include <sys/refcnt.h>
54
55 #ifdef DDB
56 #include <machine/db_machdep.h>
57 #endif
58
/* Socket-buffer consistency helper (see sbsync() below). */
void	sbsync(struct sockbuf *, struct mbuf *);

/* Socket splicing (SO_SPLICE) internals, defined later in this file. */
int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
void	soreaper(void *);
void	soput(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);

/* kqueue(2) filter entry points for the socket filterops below. */
void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);

int	filt_sowmodify(struct kevent *kev, struct knote *kn);
int	filt_sowprocess(struct knote *kn, struct kevent *kev);

int	filt_sormodify(struct kevent *kev, struct knote *kn);
int	filt_sorprocess(struct knote *kn, struct kevent *kev);
81
/*
 * kqueue(2) filter for socket read events.  MP-safe and fd-based;
 * knotes live on the receive buffer's klist (sb_klist), which is
 * protected by the receive buffer mutex (see soalloc()).
 */
const struct filterops soread_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_modify = filt_sormodify,
	.f_process = filt_sorprocess,
};
90
/*
 * kqueue(2) filter for socket write events.  MP-safe and fd-based;
 * knotes live on the send buffer's klist (sb_klist).
 */
const struct filterops sowrite_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_modify = filt_sowmodify,
	.f_process = filt_sowprocess,
};
99
/*
 * kqueue(2) filter for exceptional socket conditions.  Shares the
 * detach/modify/process hooks with the read filter since both operate
 * on the receive side of the socket.
 */
const struct filterops soexcept_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_sordetach,
	.f_event = filt_soexcept,
	.f_modify = filt_sormodify,
	.f_process = filt_sorprocess,
};
108
/* Build-time override for the minimum listen(2) backlog. */
#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;	/* upper clamp for listen(2) backlog */
int	sominconn = SOMINCONN;	/* lower clamp for listen(2) backlog */

struct pool socket_pool;	/* allocator for struct socket */
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;	/* allocator for struct sosplice */
struct taskq *sosplice_taskq;	/* taskq used by socket splicing */
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif
122
/*
 * Set up the allocation pools backing struct socket and, when socket
 * splicing is compiled in, struct sosplice.  Both pools operate at
 * IPL_SOFTNET.
 */
void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}
133
/*
 * Allocate a zeroed socket and initialize its locks, reference count,
 * klists, sigio state and accept queues.  Returns NULL only when
 * 'wait' is not M_WAIT and the pool is exhausted.  The protocol is
 * not attached here; see socreate().
 */
struct socket *
soalloc(const struct protosw *prp, int wait)
{
	const struct domain *dp = prp->pr_domain;
	struct socket *so;

	so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
	    PR_ZERO);
	if (so == NULL)
		return (NULL);
	/*
	 * The socket lock is named after the domain; RWL_DUPOK since two
	 * sockets of the same domain may be locked at the same time.
	 */
	rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
	refcnt_init(&so->so_refcnt);
	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
	rw_init(&so->so_snd.sb_lock, "sbufsnd");
	mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0);
	mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", 0);
	/* kqueue knotes on each buffer are serialized by that buffer's mtx. */
	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
	klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);

	/*
	 * Select which buffers are protected by their own sb_mtx rather
	 * than the big socket lock (SB_MTXLOCK), depending on the domain
	 * and socket type; sosend()/soreceive() key off this flag.
	 */
	switch (dp->dom_family) {
	case AF_INET:
	case AF_INET6:
		switch (prp->pr_type) {
		case SOCK_RAW:
			so->so_snd.sb_flags |= SB_MTXLOCK;
			/* FALLTHROUGH: raw sockets also get the rcv flag */
		case SOCK_DGRAM:
			so->so_rcv.sb_flags |= SB_MTXLOCK;
			break;
		}
		break;
	case AF_KEY:
	case AF_UNIX:
		so->so_snd.sb_flags |= SB_MTXLOCK;
		so->so_rcv.sb_flags |= SB_MTXLOCK;
		break;
	}

	return (so);
}
177
178 /*
179 * Socket operation routines.
180 * These routines are called by the routines in
181 * sys_socket.c or from a system process, and
182 * implement the semantics of socket operations by
183 * switching out to the protocol specific routines.
184 */
/*
 * Create a socket of the given domain/type/protocol and attach it to
 * its protocol.  On success *aso holds the new, unlocked socket.
 * Returns EPROTONOSUPPORT when no matching protocol entry exists,
 * EPROTOTYPE on a type mismatch, or the protocol attach error.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreqs == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	/* M_WAIT allocation cannot fail, hence no NULL check. */
	so = soalloc(prp, M_WAIT);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	/* Stash the creating process' credentials and pid for later checks. */
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	/* No send/receive timeouts by default. */
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	solock(so);
	error = pru_attach(so, proto, M_WAIT);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, 0);
		return (error);
	}
	sounlock(so);
	*aso = so;
	return (0);
}
226
/*
 * Bind a socket to the address in 'nam'.  The caller must hold the
 * socket lock; the work is delegated to the protocol layer.
 */
int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	int error;

	soassertlocked(so);
	error = pru_bind(so, nam, p);
	return (error);
}
233
/*
 * Mark a socket as accepting connections (listen(2)).  Only stream
 * and seqpacket sockets are supported; the socket must be locked and
 * must not be connected, connecting, disconnecting, or spliced.  The
 * requested backlog is clamped to [sominconn, somaxconn]; both
 * tunables are snapshotted once with READ_ONCE() so a concurrent
 * update cannot make the two clamps inconsistent.
 */
int
solisten(struct socket *so, int backlog)
{
	int somaxconn_local = READ_ONCE(somaxconn);
	int sominconn_local = READ_ONCE(sominconn);
	int error;

	switch (so->so_type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		break;
	default:
		return (EOPNOTSUPP);
	}

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = pru_listen(so);
	if (error)
		return (error);
	/* Only flag accepting if the completed-connection queue is empty. */
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn_local)
		backlog = somaxconn_local;
	if (backlog < sominconn_local)
		backlog = sominconn_local;
	so->so_qlimit = backlog;
	return (0);
}
269
/* Which directions of a splice are being torn down by sounsplice(). */
#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2

/*
 * Release a socket that has neither a file-descriptor reference nor a
 * protocol control block.  Called with the socket locked; the lock is
 * dropped on return unless 'keep_lock' is set.  Backs out without
 * freeing if the socket is still in use or sits on a listener's
 * completed-connection queue.
 */
void
sofree(struct socket *so, int keep_lock)
{
	int persocket = solock_persocket(so);

	soassertlocked(so);

	/* Still attached to a pcb, or still has a file reference: bail. */
	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		if (!keep_lock)
			sounlock(so);
		return;
	}
	if (so->so_head) {
		struct socket *head = so->so_head;

		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (so->so_onq == &head->so_q) {
			if (!keep_lock)
				sounlock(so);
			return;
		}

		if (persocket) {
			/*
			 * Concurrent close of `head' could
			 * abort `so' due to re-lock.  Take references on
			 * both sockets across the unlock/relock and
			 * re-check that `so' is still on the incomplete
			 * queue afterwards.
			 */
			soref(so);
			soref(head);
			sounlock(so);
			solock(head);
			solock(so);

			if (so->so_onq != &head->so_q0) {
				/* Raced: someone else took care of it. */
				sounlock(head);
				sounlock(so);
				sorele(head);
				sorele(so);
				return;
			}

			sorele(head);
			sorele(so);
		}

		soqremque(so, 0);

		if (persocket)
			sounlock(head);
	}

	if (persocket) {
		/* Wait for all outstanding references to drain. */
		sounlock(so);
		refcnt_finalize(&so->so_refcnt, "sofinal");
		solock(so);
	}

	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);
#ifdef SOCKET_SPLICE
	/* Undo any splice this socket participates in, either direction. */
	if (issplicedback(so)) {
		int freeing = SOSP_FREEING_WRITE;

		if (so->so_sp->ssp_soback == so)
			freeing |= SOSP_FREEING_READ;
		sounsplice(so->so_sp->ssp_soback, so, freeing);
	}
	if (isspliced(so)) {
		int freeing = SOSP_FREEING_READ;

		if (so == so->so_sp->ssp_socket)
			freeing |= SOSP_FREEING_WRITE;
		sounsplice(so, so->so_sp->ssp_socket, freeing);
	}
#endif /* SOCKET_SPLICE */

	mtx_enter(&so->so_snd.sb_mtx);
	sbrelease(so, &so->so_snd);
	mtx_leave(&so->so_snd.sb_mtx);

	/*
	 * Unlocked dispose and cleanup is safe.  Socket is unlinked
	 * from everywhere.  Even concurrent sotask() thread will not
	 * call somove().
	 */
	if (so->so_proto->pr_flags & PR_RIGHTS &&
	    so->so_proto->pr_domain->dom_dispose)
		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	m_purge(so->so_rcv.sb_mb);

	if (!keep_lock)
		sounlock(so);

#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		/* Reuse splice idle, sounsplice() has been called before. */
		timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so);
		timeout_add(&so->so_sp->ssp_idleto, 0);
	} else
#endif /* SOCKET_SPLICE */
	{
		pool_put(&socket_pool, so);
	}
}
381
382 static inline uint64_t
solinger_nsec(struct socket * so)383 solinger_nsec(struct socket *so)
384 {
385 if (so->so_linger == 0)
386 return INFSLP;
387
388 return SEC_TO_NSEC(so->so_linger);
389 }
390
/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/*
			 * Linger: wait (interruptibly, PCATCH) until the
			 * disconnect completes or the linger timeout
			 * expires, unless the caller asked not to block.
			 */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		error2 = pru_detach(so);
		/* Keep the first error encountered. */
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		/*
		 * Abort every connection still queued on this listener:
		 * first the incomplete queue (so_q0), then the completed
		 * queue (so_q).  With per-socket locks the listener lock
		 * is dropped around soabort() and re-taken afterwards.
		 */
		int persocket = solock_persocket(so);

		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 0);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 1);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;
	/* sofree() calls sounlock(). */
	sofree(so, 0);
	return (error);
}
465
/*
 * Abort a connection via the protocol's abort routine.  The caller
 * must hold the socket lock.
 */
void
soabort(struct socket *so)
{
	soassertlocked(so);
	pru_abort(so);
}
472
/*
 * Complete accept(2) on a socket taken off a listener's queue,
 * passing 'nam' to the protocol's accept routine.  The socket must
 * still carry SS_NOFDREF; the flag is cleared here since the socket
 * is about to gain a file descriptor reference.  Returns ECONNABORTED
 * when the connection was already torn down and the protocol aborts
 * accepts on disconnect (PR_ABRTACPTDIS).
 */
int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = pru_accept(so, nam);
	else
		error = ECONNABORTED;
	return (error);
}
490
/*
 * Initiate a connection to the address in 'nam'.  Fails with
 * EOPNOTSUPP on listening sockets, and with EISCONN when a
 * connection-oriented socket is already connected/connecting or a
 * connectionless one could not be disconnected first.
 */
int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.  Note the short-circuit: sodisconnect() is
	 * only attempted for connectionless protocols.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = pru_connect(so, nam);
	return (error);
}
514
/*
 * Connect two sockets to each other.  With per-socket locks both are
 * locked pairwise via solock_pair(); otherwise a single shared lock
 * covers them and only so1 needs locking.
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int persocket, error;

	if ((persocket = solock_persocket(so1)))
		solock_pair(so1, so2);
	else
		solock(so1);

	error = pru_connect2(so1, so2);

	if (persocket)
		sounlock(so2);
	sounlock(so1);
	return (error);
}
532
533 int
sodisconnect(struct socket * so)534 sodisconnect(struct socket *so)
535 {
536 int error;
537
538 soassertlocked(so);
539
540 if ((so->so_state & SS_ISCONNECTED) == 0)
541 return (ENOTCONN);
542 if (so->so_state & SS_ISDISCONNECTING)
543 return (EALREADY);
544 error = pru_disconnect(so);
545 return (error);
546 }
547
/* Helper for sosend(): fill an mbuf chain from user data (see below). */
int m_getuio(struct mbuf **, int, long, struct uio *);

/* sblock() flags: wait for the buffer lock unless MSG_DONTWAIT was given. */
#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
551 /*
552 * Send on a socket.
553 * If send must go all at once and message is larger than
554 * send buffering, then hard error.
555 * Lock against other senders.
556 * If must go all at once and not enough room now, then
557 * inform user that this would block and do nothing.
558 * Otherwise, if nonblocking, send as much as possible.
559 * The data to be sent is described by "uio" if nonzero,
560 * otherwise by the mbuf chain "top" (which must be null
561 * if uio is not). Data provided in mbuf chain must be small
562 * enough to send all at once.
563 *
564 * Returns nonzero on error, timeout or signal; callers
565 * must check for short counts if EINTR/ERESTART are returned.
566 * Data and control buffers are freed on return.
567 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error;
	int atomic = sosendallatonce(so) || top;
	/* Take the shared socket lock only when the send buffer is not
	 * protected by its own mutex (SB_MTXLOCK, set in soalloc()). */
	int dosolock = ((so->so_snd.sb_flags & SB_MTXLOCK) == 0);

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

/* Record the error and bail to the release path below. */
#define snderr(errno)	{ error = errno; goto release; }

restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	if (dosolock)
		solock_shared(so);
	sb_mtx_lock(&so->so_snd);
	so->so_snd.sb_state |= SS_ISSENDING;
	do {
		if (so->so_snd.sb_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if ((error = READ_ONCE(so->so_error))) {
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				/* Control-only sends are allowed even when
				 * not connected. */
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		/* Not enough room: wait for space, then start over. */
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_snd.sb_state &= ~SS_ISSENDING;
			sb_mtx_unlock(&so->so_snd);
			if (dosolock)
				sounlock_shared(so);
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/* Drop the locks while copying user data. */
				sb_mtx_unlock(&so->so_snd);
				if (dosolock)
					sounlock_shared(so);
				error = m_getuio(&top, atomic, space, uio);
				if (dosolock)
					solock_shared(so);
				sb_mtx_lock(&so->so_snd);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_snd.sb_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			/*
			 * The protocol send routines run under the shared
			 * socket lock but without the buffer mutex; take
			 * solock here only if we don't hold it already.
			 */
			sb_mtx_unlock(&so->so_snd);
			if (!dosolock)
				solock_shared(so);
			if (flags & MSG_OOB)
				error = pru_sendoob(so, top, addr, control);
			else
				error = pru_send(so, top, addr, control);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_snd);
			/* Ownership of top/control passed to the protocol. */
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_snd.sb_state &= ~SS_ISSENDING;
	sb_mtx_unlock(&so->so_snd);
	if (dosolock)
		sounlock_shared(so);
	sbunlock(&so->so_snd);
out:
	m_freem(top);
	m_freem(control);
	return (error);
}
713
/*
 * Build an mbuf chain holding up to 'space' bytes copied in from
 * 'uio'.  The first mbuf carries a packet header; clusters are used
 * when at least MINCLSIZE bytes remain.  For atomic (all-at-once)
 * sends, room for protocol headers is reserved in the first mbuf.
 * On success the chain is returned via *mp with m_pkthdr.len set; on
 * uiomove() failure the whole chain is freed and the error returned.
 */
int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			/* First mbuf: packet header. */
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
			m->m_pkthdr.len = 0;
			m->m_pkthdr.ph_ifidx = 0;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbuf together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			/* Try a large cluster first, then a regular one. */
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}
781
782 /*
783 * Following replacement or removal of the first mbuf on the first
784 * mbuf chain of a socket buffer, push necessary state changes back
785 * into the socket buffer so that other consumers see the values
786 * consistently. 'nextrecord' is the callers locally stored value of
787 * the original value of sb->sb_mb->m_nextpkt which must be restored
788 * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL.
789 */
790 void
sbsync(struct sockbuf * sb,struct mbuf * nextrecord)791 sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
792 {
793
794 /*
795 * First, update for the new value of nextrecord. If necessary,
796 * make it the first record.
797 */
798 if (sb->sb_mb != NULL)
799 sb->sb_mb->m_nextpkt = nextrecord;
800 else
801 sb->sb_mb = nextrecord;
802
803 /*
804 * Now update any dependent socket buffer fields to reflect
805 * the new state. This is an inline of SB_EMPTY_FIXUP, with
806 * the addition of a second clause that takes care of the
807 * case where sb_mb has been updated, but remains the last
808 * record.
809 */
810 if (sb->sb_mb == NULL) {
811 sb->sb_mbtail = NULL;
812 sb->sb_lastrecord = NULL;
813 } else if (sb->sb_mb->m_nextpkt == NULL)
814 sb->sb_lastrecord = sb->sb_mb;
815 }
816
817 /*
818 * Implement receive operations on a socket.
819 * We depend on the way that records are added to the sockbuf
820 * by sbappend*. In particular, each record (mbufs linked through m_next)
821 * must begin with an address if the protocol so specifies,
822 * followed by an optional mbuf or mbufs containing ancillary data,
823 * and then zero or more mbufs of data.
824 * In order to avoid blocking network for the entire time here, we release
825 * the solock() while doing the actual copy to user space.
826 * Although the sockbuf is locked, new data may still be appended,
827 * and thus we must maintain consistency of the sockbuf during that time.
828 *
829 * The caller may receive the data as a single mbuf chain by supplying
830 * an mbuf **mp0 for use in returning the chain. The uio is then used
831 * only for the count in uio_resid.
832 */
833 int
soreceive(struct socket * so,struct mbuf ** paddr,struct uio * uio,struct mbuf ** mp0,struct mbuf ** controlp,int * flagsp,socklen_t controllen)834 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
835 struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
836 socklen_t controllen)
837 {
838 struct mbuf *m, **mp;
839 struct mbuf *cm;
840 u_long len, offset, moff;
841 int flags, error, error2, type, uio_error = 0;
842 const struct protosw *pr = so->so_proto;
843 struct mbuf *nextrecord;
844 size_t resid, orig_resid = uio->uio_resid;
845 int dosolock = ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0);
846
847 mp = mp0;
848 if (paddr)
849 *paddr = NULL;
850 if (controlp)
851 *controlp = NULL;
852 if (flagsp)
853 flags = *flagsp &~ MSG_EOR;
854 else
855 flags = 0;
856 if (flags & MSG_OOB) {
857 m = m_get(M_WAIT, MT_DATA);
858 solock(so);
859 error = pru_rcvoob(so, m, flags & MSG_PEEK);
860 sounlock(so);
861 if (error)
862 goto bad;
863 do {
864 error = uiomove(mtod(m, caddr_t),
865 ulmin(uio->uio_resid, m->m_len), uio);
866 m = m_free(m);
867 } while (uio->uio_resid && error == 0 && m);
868 bad:
869 m_freem(m);
870 return (error);
871 }
872 if (mp)
873 *mp = NULL;
874
875 restart:
876 if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
877 return (error);
878 if (dosolock)
879 solock_shared(so);
880 sb_mtx_lock(&so->so_rcv);
881
882 m = so->so_rcv.sb_mb;
883 #ifdef SOCKET_SPLICE
884 if (isspliced(so))
885 m = NULL;
886 #endif /* SOCKET_SPLICE */
887 /*
888 * If we have less data than requested, block awaiting more
889 * (subject to any timeout) if:
890 * 1. the current count is less than the low water mark,
891 * 2. MSG_WAITALL is set, and it is possible to do the entire
892 * receive operation at once if we block (resid <= hiwat), or
893 * 3. MSG_DONTWAIT is not set.
894 * If MSG_WAITALL is set but resid is larger than the receive buffer,
895 * we have to do the receive in sections, and thus risk returning
896 * a short count if a timeout or signal occurs after we start.
897 */
898 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
899 so->so_rcv.sb_cc < uio->uio_resid) &&
900 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
901 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
902 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
903 #ifdef DIAGNOSTIC
904 if (m == NULL && so->so_rcv.sb_cc)
905 #ifdef SOCKET_SPLICE
906 if (!isspliced(so))
907 #endif /* SOCKET_SPLICE */
908 panic("receive 1: so %p, so_type %d, sb_cc %lu",
909 so, so->so_type, so->so_rcv.sb_cc);
910 #endif
911 if ((error2 = READ_ONCE(so->so_error))) {
912 if (m)
913 goto dontblock;
914 error = error2;
915 if ((flags & MSG_PEEK) == 0)
916 so->so_error = 0;
917 goto release;
918 }
919 if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
920 if (m)
921 goto dontblock;
922 else if (so->so_rcv.sb_cc == 0)
923 goto release;
924 }
925 for (; m; m = m->m_next)
926 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
927 m = so->so_rcv.sb_mb;
928 goto dontblock;
929 }
930 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
931 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
932 error = ENOTCONN;
933 goto release;
934 }
935 if (uio->uio_resid == 0 && controlp == NULL)
936 goto release;
937 if (flags & MSG_DONTWAIT) {
938 error = EWOULDBLOCK;
939 goto release;
940 }
941 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
942 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
943
944 sbunlock(&so->so_rcv);
945 error = sbwait(so, &so->so_rcv);
946 sb_mtx_unlock(&so->so_rcv);
947 if (dosolock)
948 sounlock_shared(so);
949 if (error)
950 return (error);
951 goto restart;
952 }
953 dontblock:
954 /*
955 * On entry here, m points to the first record of the socket buffer.
956 * From this point onward, we maintain 'nextrecord' as a cache of the
957 * pointer to the next record in the socket buffer. We must keep the
958 * various socket buffer pointers and local stack versions of the
959 * pointers in sync, pushing out modifications before operations that
960 * may sleep, and re-reading them afterwards.
961 *
962 * Otherwise, we will race with the network stack appending new data
963 * or records onto the socket buffer by using inconsistent/stale
964 * versions of the field, possibly resulting in socket buffer
965 * corruption.
966 */
967 if (uio->uio_procp)
968 uio->uio_procp->p_ru.ru_msgrcv++;
969 KASSERT(m == so->so_rcv.sb_mb);
970 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
971 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
972 nextrecord = m->m_nextpkt;
973 if (pr->pr_flags & PR_ADDR) {
974 #ifdef DIAGNOSTIC
975 if (m->m_type != MT_SONAME)
976 panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
977 so, so->so_type, m, m->m_type);
978 #endif
979 orig_resid = 0;
980 if (flags & MSG_PEEK) {
981 if (paddr)
982 *paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
983 m = m->m_next;
984 } else {
985 sbfree(so, &so->so_rcv, m);
986 if (paddr) {
987 *paddr = m;
988 so->so_rcv.sb_mb = m->m_next;
989 m->m_next = NULL;
990 m = so->so_rcv.sb_mb;
991 } else {
992 so->so_rcv.sb_mb = m_free(m);
993 m = so->so_rcv.sb_mb;
994 }
995 sbsync(&so->so_rcv, nextrecord);
996 }
997 }
998 while (m && m->m_type == MT_CONTROL && error == 0) {
999 int skip = 0;
1000 if (flags & MSG_PEEK) {
1001 if (mtod(m, struct cmsghdr *)->cmsg_type ==
1002 SCM_RIGHTS) {
1003 /* don't leak internalized SCM_RIGHTS msgs */
1004 skip = 1;
1005 } else if (controlp)
1006 *controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
1007 m = m->m_next;
1008 } else {
1009 sbfree(so, &so->so_rcv, m);
1010 so->so_rcv.sb_mb = m->m_next;
1011 m->m_nextpkt = m->m_next = NULL;
1012 cm = m;
1013 m = so->so_rcv.sb_mb;
1014 sbsync(&so->so_rcv, nextrecord);
1015 if (controlp) {
1016 if (pr->pr_domain->dom_externalize) {
1017 sb_mtx_unlock(&so->so_rcv);
1018 if (dosolock)
1019 sounlock_shared(so);
1020 error =
1021 (*pr->pr_domain->dom_externalize)
1022 (cm, controllen, flags);
1023 if (dosolock)
1024 solock_shared(so);
1025 sb_mtx_lock(&so->so_rcv);
1026 }
1027 *controlp = cm;
1028 } else {
1029 /*
1030 * Dispose of any SCM_RIGHTS message that went
1031 * through the read path rather than recv.
1032 */
1033 if (pr->pr_domain->dom_dispose) {
1034 sb_mtx_unlock(&so->so_rcv);
1035 pr->pr_domain->dom_dispose(cm);
1036 sb_mtx_lock(&so->so_rcv);
1037 }
1038 m_free(cm);
1039 }
1040 }
1041 if (m != NULL)
1042 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1043 else
1044 nextrecord = so->so_rcv.sb_mb;
1045 if (controlp && !skip)
1046 controlp = &(*controlp)->m_next;
1047 orig_resid = 0;
1048 }
1049
1050 /* If m is non-NULL, we have some data to read. */
1051 if (m) {
1052 type = m->m_type;
1053 if (type == MT_OOBDATA)
1054 flags |= MSG_OOB;
1055 if (m->m_flags & M_BCAST)
1056 flags |= MSG_BCAST;
1057 if (m->m_flags & M_MCAST)
1058 flags |= MSG_MCAST;
1059 }
1060 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
1061 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
1062
1063 moff = 0;
1064 offset = 0;
1065 while (m && uio->uio_resid > 0 && error == 0) {
1066 if (m->m_type == MT_OOBDATA) {
1067 if (type != MT_OOBDATA)
1068 break;
1069 } else if (type == MT_OOBDATA) {
1070 break;
1071 } else if (m->m_type == MT_CONTROL) {
1072 /*
1073 * If there is more than one control message in the
1074 * stream, we do a short read. Next can be received
1075 * or disposed by another system call.
1076 */
1077 break;
1078 #ifdef DIAGNOSTIC
1079 } else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
1080 panic("receive 3: so %p, so_type %d, m %p, m_type %d",
1081 so, so->so_type, m, m->m_type);
1082 #endif
1083 }
1084 so->so_rcv.sb_state &= ~SS_RCVATMARK;
1085 len = uio->uio_resid;
1086 if (so->so_oobmark && len > so->so_oobmark - offset)
1087 len = so->so_oobmark - offset;
1088 if (len > m->m_len - moff)
1089 len = m->m_len - moff;
1090 /*
1091 * If mp is set, just pass back the mbufs.
1092 * Otherwise copy them out via the uio, then free.
1093 * Sockbuf must be consistent here (points to current mbuf,
1094 * it points to next record) when we drop priority;
1095 * we must note any additions to the sockbuf when we
1096 * block interrupts again.
1097 */
1098 if (mp == NULL && uio_error == 0) {
1099 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
1100 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
1101 resid = uio->uio_resid;
1102 sb_mtx_unlock(&so->so_rcv);
1103 if (dosolock)
1104 sounlock_shared(so);
1105 uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
1106 if (dosolock)
1107 solock_shared(so);
1108 sb_mtx_lock(&so->so_rcv);
1109 if (uio_error)
1110 uio->uio_resid = resid - len;
1111 } else
1112 uio->uio_resid -= len;
1113 if (len == m->m_len - moff) {
1114 if (m->m_flags & M_EOR)
1115 flags |= MSG_EOR;
1116 if (flags & MSG_PEEK) {
1117 m = m->m_next;
1118 moff = 0;
1119 orig_resid = 0;
1120 } else {
1121 nextrecord = m->m_nextpkt;
1122 sbfree(so, &so->so_rcv, m);
1123 if (mp) {
1124 *mp = m;
1125 mp = &m->m_next;
1126 so->so_rcv.sb_mb = m = m->m_next;
1127 *mp = NULL;
1128 } else {
1129 so->so_rcv.sb_mb = m_free(m);
1130 m = so->so_rcv.sb_mb;
1131 }
1132 /*
1133 * If m != NULL, we also know that
1134 * so->so_rcv.sb_mb != NULL.
1135 */
1136 KASSERT(so->so_rcv.sb_mb == m);
1137 if (m) {
1138 m->m_nextpkt = nextrecord;
1139 if (nextrecord == NULL)
1140 so->so_rcv.sb_lastrecord = m;
1141 } else {
1142 so->so_rcv.sb_mb = nextrecord;
1143 SB_EMPTY_FIXUP(&so->so_rcv);
1144 }
1145 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
1146 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
1147 }
1148 } else {
1149 if (flags & MSG_PEEK) {
1150 moff += len;
1151 orig_resid = 0;
1152 } else {
1153 if (mp)
1154 *mp = m_copym(m, 0, len, M_WAIT);
1155 m->m_data += len;
1156 m->m_len -= len;
1157 so->so_rcv.sb_cc -= len;
1158 so->so_rcv.sb_datacc -= len;
1159 }
1160 }
1161 if (so->so_oobmark) {
1162 if ((flags & MSG_PEEK) == 0) {
1163 so->so_oobmark -= len;
1164 if (so->so_oobmark == 0) {
1165 so->so_rcv.sb_state |= SS_RCVATMARK;
1166 break;
1167 }
1168 } else {
1169 offset += len;
1170 if (offset == so->so_oobmark)
1171 break;
1172 }
1173 }
1174 if (flags & MSG_EOR)
1175 break;
1176 /*
1177 * If the MSG_WAITALL flag is set (for non-atomic socket),
1178 * we must not quit until "uio->uio_resid == 0" or an error
1179 * termination. If a signal/timeout occurs, return
1180 * with a short count but without error.
1181 * Keep sockbuf locked against other readers.
1182 */
1183 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1184 !sosendallatonce(so) && !nextrecord) {
1185 if (so->so_rcv.sb_state & SS_CANTRCVMORE ||
1186 so->so_error)
1187 break;
1188 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
1189 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
1190 if (sbwait(so, &so->so_rcv)) {
1191 sb_mtx_unlock(&so->so_rcv);
1192 if (dosolock)
1193 sounlock_shared(so);
1194 sbunlock(&so->so_rcv);
1195 return (0);
1196 }
1197 if ((m = so->so_rcv.sb_mb) != NULL)
1198 nextrecord = m->m_nextpkt;
1199 }
1200 }
1201
1202 if (m && pr->pr_flags & PR_ATOMIC) {
1203 flags |= MSG_TRUNC;
1204 if ((flags & MSG_PEEK) == 0)
1205 (void) sbdroprecord(so, &so->so_rcv);
1206 }
1207 if ((flags & MSG_PEEK) == 0) {
1208 if (m == NULL) {
1209 /*
1210 * First part is an inline SB_EMPTY_FIXUP(). Second
1211 * part makes sure sb_lastrecord is up-to-date if
1212 * there is still data in the socket buffer.
1213 */
1214 so->so_rcv.sb_mb = nextrecord;
1215 if (so->so_rcv.sb_mb == NULL) {
1216 so->so_rcv.sb_mbtail = NULL;
1217 so->so_rcv.sb_lastrecord = NULL;
1218 } else if (nextrecord->m_nextpkt == NULL)
1219 so->so_rcv.sb_lastrecord = nextrecord;
1220 }
1221 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
1222 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
1223 if (pr->pr_flags & PR_WANTRCVD) {
1224 sb_mtx_unlock(&so->so_rcv);
1225 if (!dosolock)
1226 solock_shared(so);
1227 pru_rcvd(so);
1228 if (!dosolock)
1229 sounlock_shared(so);
1230 sb_mtx_lock(&so->so_rcv);
1231 }
1232 }
1233 if (orig_resid == uio->uio_resid && orig_resid &&
1234 (flags & MSG_EOR) == 0 &&
1235 (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
1236 sb_mtx_unlock(&so->so_rcv);
1237 sbunlock(&so->so_rcv);
1238 goto restart;
1239 }
1240
1241 if (uio_error)
1242 error = uio_error;
1243
1244 if (flagsp)
1245 *flagsp |= flags;
1246 release:
1247 sb_mtx_unlock(&so->so_rcv);
1248 if (dosolock)
1249 sounlock_shared(so);
1250 sbunlock(&so->so_rcv);
1251 return (error);
1252 }
1253
1254 int
soshutdown(struct socket * so,int how)1255 soshutdown(struct socket *so, int how)
1256 {
1257 int error = 0;
1258
1259 switch (how) {
1260 case SHUT_RD:
1261 sorflush(so);
1262 break;
1263 case SHUT_RDWR:
1264 sorflush(so);
1265 /* FALLTHROUGH */
1266 case SHUT_WR:
1267 solock(so);
1268 error = pru_shutdown(so);
1269 sounlock(so);
1270 break;
1271 default:
1272 error = EINVAL;
1273 break;
1274 }
1275
1276 return (error);
1277 }
1278
/*
 * Flush the receive side of a socket: mark it unable to receive more
 * data, reset the receive buffer bookkeeping, and dispose of any mbufs
 * (including passed file descriptors) that were still queued.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	error = sblock(sb, SBL_WAIT | SBL_NOINTR);
	/* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
	KASSERT(error == 0);

	solock_shared(so);
	socantrcvmore(so);
	mtx_enter(&sb->sb_mtx);
	/* Detach the mbuf chain before zeroing the buffer counters. */
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	mtx_leave(&sb->sb_mtx);
	sounlock_shared(so);
	sbunlock(sb);

	/*
	 * Dispose of queued data outside of all locks.  For protocols
	 * carrying access rights (e.g. AF_UNIX), let the domain close
	 * any in-flight file descriptors before freeing the mbufs.
	 */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}
1306
1307 #ifdef SOCKET_SPLICE
1308
1309 #define so_splicelen so_sp->ssp_len
1310 #define so_splicemax so_sp->ssp_max
1311 #define so_idletv so_sp->ssp_idletv
1312 #define so_idleto so_sp->ssp_idleto
1313 #define so_splicetask so_sp->ssp_task
1314
/*
 * Splice socket "so" onto the drain socket identified by "fd", so that
 * data arriving on so's receive buffer is moved in-kernel into the
 * drain's send buffer.  A negative fd dissolves an existing splice.
 * "max" optionally bounds the number of spliced bytes; "tv" optionally
 * sets an idle timeout.  Returns 0 or an errno value.
 */
int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp;
	struct socket *sosp;
	struct taskq *tq;
	int error = 0;

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (max && max < 0)
		return (EINVAL);
	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
			return (error);
		solock(so);
		if (so->so_options & SO_ACCEPTCONN) {
			error = EOPNOTSUPP;
			goto out;
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto out;
		}

		if (so->so_sp && so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
 out:
		sounlock(so);
		sbunlock(&so->so_rcv);
		return (error);
	}

	/*
	 * Lazily create the shared splice task queue.  Double-checked
	 * locking: the paired producer/consumer barriers make the fully
	 * constructed taskq visible before/after the unlocked read.
	 */
	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			if (tq == NULL) {
				rw_exit_write(&sosplice_lock);
				return (ENOMEM);
			}
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	} else {
		/* Ensure the taskq is fully visible on this CPU. */
		membar_consumer();
	}

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;

	/* Both ends must use the same send routine (same protocol). */
	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto frele;
	}

	/* Lock source receive and drain send buffer, in that order. */
	if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
		goto frele;
	if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) {
		sbunlock(&so->so_rcv);
		goto frele;
	}
	solock(so);

	if ((so->so_options & SO_ACCEPTCONN) ||
	    (sosp->so_options & SO_ACCEPTCONN)) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		error = ENOTCONN;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}
	/* Allocate splice state on demand; PR_WAITOK may sleep here. */
	if (so->so_sp == NULL)
		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
	if (sosp->so_sp == NULL)
		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
	/* Refuse if either side is already part of a splice. */
	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}

	/* Splice so and sosp together. */
	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	mtx_leave(&so->so_rcv.sb_mtx);
	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_proc(&so->so_idleto, soidle, so);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent softnet interrupt from calling somove() while
	 * we sleep, the socket buffers are not marked as spliced yet.
	 */
	if (somove(so, M_WAIT)) {
		mtx_enter(&so->so_rcv.sb_mtx);
		so->so_rcv.sb_flags |= SB_SPLICE;
		mtx_leave(&so->so_rcv.sb_mtx);
		sosp->so_snd.sb_flags |= SB_SPLICE;
	}

 release:
	sounlock(so);
	sbunlock(&sosp->so_snd);
	sbunlock(&so->so_rcv);
 frele:
	FRELE(fp, curproc);

	return (error);
}
1448
/*
 * Dissolve the splice between source socket "so" and drain "sosp".
 * "freeing" carries SOSP_FREEING_READ/WRITE bits naming sockets that
 * are being torn down and therefore must not be woken up.
 * Caller holds the socket lock of "so".
 */
void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	soassertlocked(so);

	/* Stop deferred moves and the idle timeout before unlinking. */
	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);
	sosp->so_snd.sb_flags &= ~SB_SPLICE;

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	mtx_leave(&so->so_rcv.sb_mtx);

	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}
1469
1470 void
soidle(void * arg)1471 soidle(void *arg)
1472 {
1473 struct socket *so = arg;
1474
1475 solock(so);
1476 if (so->so_rcv.sb_flags & SB_SPLICE) {
1477 so->so_error = ETIMEDOUT;
1478 sounsplice(so, so->so_sp->ssp_socket, 0);
1479 }
1480 sounlock(so);
1481 }
1482
1483 void
sotask(void * arg)1484 sotask(void *arg)
1485 {
1486 struct socket *so = arg;
1487
1488 solock(so);
1489 if (so->so_rcv.sb_flags & SB_SPLICE) {
1490 /*
1491 * We may not sleep here as sofree() and unsplice() may be
1492 * called from softnet interrupt context. This would remove
1493 * the socket during somove().
1494 */
1495 somove(so, M_DONTWAIT);
1496 }
1497 sounlock(so);
1498
1499 /* Avoid user land starvation. */
1500 yield();
1501 }
1502
1503 /*
1504 * The socket splicing task or idle timeout may sleep while grabbing the net
1505 * lock. As sofree() can be called anytime, sotask() or soidle() could access
1506 * the socket memory of a freed socket after wakeup. So delay the pool_put()
1507 * after all pending socket splicing tasks or timeouts have finished. Do this
1508 * by scheduling it on the same threads.
1509 */
1510 void
soreaper(void * arg)1511 soreaper(void *arg)
1512 {
1513 struct socket *so = arg;
1514
1515 /* Reuse splice task, sounsplice() has been called before. */
1516 task_set(&so->so_sp->ssp_task, soput, so);
1517 task_add(sosplice_taskq, &so->so_sp->ssp_task);
1518 }
1519
1520 void
soput(void * arg)1521 soput(void *arg)
1522 {
1523 struct socket *so = arg;
1524
1525 pool_put(&sosplice_pool, so->so_sp);
1526 pool_put(&socket_pool, so);
1527 }
1528
1529 /*
1530 * Move data from receive buffer of spliced source socket to send
1531 * buffer of drain socket. Try to move as much as possible in one
1532 * big chunk. It is a TCP only implementation.
1533 * Return value 0 means splicing has been finished, 1 continue.
1534 */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0;
	unsigned int rcvstate;

	soassertlocked(so);

 nextpkt:
	/* Bail out if either end of the splice has gone bad. */
	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	/*
	 * Errors already produced by splicing itself (timeout, max
	 * reached, loop) are not propagated back from the drain.
	 */
	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
	    sosp->so_error != EFBIG && sosp->so_error != ELOOP) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace(sosp, &sosp->so_snd);
	/* Grant some slack so urgent data just before the mark fits. */
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_snd.sb_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		/* Record contained no data at all; drop it and retry. */
		sbdroprecord(so, &so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			pru_rcvd(so);
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve splicing with error if loop is detected by counter.
	 *
	 * If we deal with looped broadcast/multicast packet we bail out with
	 * no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		/* A record larger than the drain can ever hold is fatal. */
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		/* Whole records only: wait until the full record fits. */
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && so->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			/* Whole mbuf moves over; detach it from so_rcv. */
			*mp = so->so_rcv.sb_mb;
			sbfree(so, &so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD)
		pru_rcvd(so);

	/* Receive buffer did shrink by len bytes, adjust oob. */
	mtx_enter(&so->so_rcv.sb_mtx);
	rcvstate = so->so_rcv.sb_state;
	so->so_rcv.sb_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_rcv.sb_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}
	mtx_leave(&so->so_rcv.sb_mtx);

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((rcvstate & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (rcvstate & SS_RCVATMARK) {
			/* First byte of m is the urgent byte. */
			o = m_get(wait, MT_DATA);
			rcvstate &= ~SS_RCVATMARK;
		} else if (oobmark) {
			/* Flush data up to the mark, then handle the byte. */
			o = m_split(m, oobmark, wait);
			if (o) {
				error = pru_send(sosp, m, NULL, NULL);
				if (error) {
					if (sosp->so_snd.sb_state &
					    SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			/* Send the single urgent byte as oob data. */
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);
			error = pru_sendoob(sosp, o, NULL, NULL);
			if (error) {
				if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					rcvstate |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_snd.sb_state &= ~SS_ISSENDING;
	error = pru_send(sosp, m, NULL, NULL);
	if (error) {
		if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

 release:
	sosp->so_snd.sb_state &= ~SS_ISSENDING;
	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		so->so_error = error;
	/* Unsplice on EOF, closed drain, max reached, or any error. */
	if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
	    so->so_rcv.sb_cc == 0) ||
	    (sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
	    maxreached || error) {
		sounsplice(so, sosp, 0);
		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}
1814
1815 #endif /* SOCKET_SPLICE */
1816
/*
 * Wake up consumers of the receive buffer: spliced data movers first,
 * then sleepers/kqueue via sowakeup() and any registered upcall.
 * A spliced socket never wakes userland readers directly.
 */
void
sorwakeup(struct socket *so)
{
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * TCP has a sendbuffer that can handle multiple packets
		 * at once.  So queue the stream a bit to accumulate data.
		 * The sosplice thread will call somove() later and send
		 * the packets calling tcp_output() only once.
		 * In the UDP case, send out the packets immediately.
		 * Using a thread would make things slower.
		 */
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			task_add(sosplice_taskq, &so->so_splicetask);
		else
			somove(so, M_DONTWAIT);
	}
	/* Spliced data is consumed in-kernel; do not wake userland. */
	if (isspliced(so))
		return;
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}
1845
/*
 * Wake up producers for the send buffer.  If this socket is the drain
 * of a splice, kick the source's splice task instead of waking
 * userland writers.
 */
void
sowwakeup(struct socket *so)
{
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	/* New send space: let the splice source move more data over. */
	if (so->so_snd.sb_flags & SB_SPLICE)
		task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
	if (issplicedback(so))
		return;
#endif
	sowakeup(so, &so->so_snd);
}
1860
/*
 * Set a socket option.  Non-SOL_SOCKET levels are passed through to
 * the protocol's ctloutput handler.  "m" carries the option value and
 * is owned by the caller.  Returns 0 or an errno value.
 */
int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);

			solock(so);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* l_onoff is the first int of struct linger. */
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			/* FALLTHROUGH */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			/* Simple boolean options stored in so_options. */
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);

			solock(so);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			/* Only clearing SO_DONTROUTE is supported. */
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			struct sockbuf *sb = (optname == SO_SNDBUF ||
			    optname == SO_SNDLOWAT ?
			    &so->so_snd : &so->so_rcv);
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			/* Clamp non-positive requests to a minimum of 1. */
			if ((long)cnt <= 0)
				cnt = 1;

			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				solock(so);
			mtx_enter(&sb->sb_mtx);

			switch (optname) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				/* Cannot resize a half-closed buffer. */
				if (sb->sb_state &
				    (SS_CANTSENDMORE | SS_CANTRCVMORE)) {
					error = EINVAL;
					break;
				}
				if (sbcheckreserve(cnt, sb->sb_wat) ||
				    sbreserve(so, sb, cnt)) {
					error = ENOBUFS;
					break;
				}
				sb->sb_wat = cnt;
				break;
			case SO_SNDLOWAT:
			case SO_RCVLOWAT:
				/* Low watermark may not exceed hiwat. */
				sb->sb_lowat = (cnt > sb->sb_hiwat) ?
				    sb->sb_hiwat : cnt;
				break;
			}

			mtx_leave(&sb->sb_mtx);
			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				sounlock(so);

			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			/* A zero timeout means "sleep forever". */
			if (nsecs == 0)
				nsecs = INFSLP;

			mtx_enter(&sb->sb_mtx);
			sb->sb_timeo_nsecs = nsecs;
			mtx_leave(&sb->sb_mtx);
			break;
		    }

		case SO_RTABLE:
			/* Routing table selection is protocol-specific. */
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				sounlock(so);
			} else
				error = ENOPROTOOPT;
			break;
#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			/*
			 * Three forms: NULL unsplices, a bare int is the
			 * drain fd, a full struct splice adds max/idle.
			 */
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				error = EINVAL;
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
	}

	return (error);
}
2039
/*
 * Get a socket option.  Non-SOL_SOCKET levels are passed through to
 * the protocol's ctloutput handler.  The result is written into the
 * caller-supplied mbuf "m" and m->m_len is set to the value's size.
 * Returns 0 or an errno value.
 */
int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		/* Most options below return a single int. */
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			solock_shared(so);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			sounlock_shared(so);
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			/* Boolean options read straight from so_options. */
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the error clears it, per convention. */
			solock(so);
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			sounlock(so);

			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			mtx_enter(&sb->sb_mtx);
			nsecs = sb->sb_timeo_nsecs;
			mtx_leave(&sb->sb_mtx);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			/* INFSLP ("no timeout") is reported as zero. */
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				sounlock(so);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			/* Report the number of bytes spliced so far. */
			m->m_len = sizeof(off_t);
			solock_shared(so);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			sounlock_shared(so);
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				solock(so);
				/* Credentials exist only after connect. */
				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					sounlock(so);
					break;
				}
				sounlock(so);

				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}
2200
/*
 * Out-of-band data has arrived: signal the owning process/group with
 * SIGURG and notify kqueue/poll waiters on the receive buffer.
 */
void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	knote(&so->so_rcv.sb_klist, 0);
}
2207
2208 void
sofilt_lock(struct socket * so,struct sockbuf * sb)2209 sofilt_lock(struct socket *so, struct sockbuf *sb)
2210 {
2211 switch (so->so_proto->pr_domain->dom_family) {
2212 case PF_INET:
2213 case PF_INET6:
2214 NET_LOCK_SHARED();
2215 break;
2216 default:
2217 rw_enter_write(&so->so_lock);
2218 break;
2219 }
2220
2221 mtx_enter(&sb->sb_mtx);
2222 }
2223
2224 void
sofilt_unlock(struct socket * so,struct sockbuf * sb)2225 sofilt_unlock(struct socket *so, struct sockbuf *sb)
2226 {
2227 mtx_leave(&sb->sb_mtx);
2228
2229 switch (so->so_proto->pr_domain->dom_family) {
2230 case PF_INET:
2231 case PF_INET6:
2232 NET_UNLOCK_SHARED();
2233 break;
2234 default:
2235 rw_exit_write(&so->so_lock);
2236 break;
2237 }
2238 }
2239
2240 int
soo_kqfilter(struct file * fp,struct knote * kn)2241 soo_kqfilter(struct file *fp, struct knote *kn)
2242 {
2243 struct socket *so = kn->kn_fp->f_data;
2244 struct sockbuf *sb;
2245
2246 switch (kn->kn_filter) {
2247 case EVFILT_READ:
2248 kn->kn_fop = &soread_filtops;
2249 sb = &so->so_rcv;
2250 break;
2251 case EVFILT_WRITE:
2252 kn->kn_fop = &sowrite_filtops;
2253 sb = &so->so_snd;
2254 break;
2255 case EVFILT_EXCEPT:
2256 kn->kn_fop = &soexcept_filtops;
2257 sb = &so->so_rcv;
2258 break;
2259 default:
2260 return (EINVAL);
2261 }
2262
2263 klist_insert(&sb->sb_klist, kn);
2264
2265 return (0);
2266 }
2267
2268 void
filt_sordetach(struct knote * kn)2269 filt_sordetach(struct knote *kn)
2270 {
2271 struct socket *so = kn->kn_fp->f_data;
2272
2273 klist_remove(&so->so_rcv.sb_klist, kn);
2274 }
2275
/*
 * kqueue read filter: decide whether the socket is readable.  For
 * listening sockets this reports pending connections; otherwise it
 * reports buffered data, EOF, or a pending error.  Sets kn_data to
 * the readable amount and returns non-zero when the event fires.
 */
int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	if (so->so_options & SO_ACCEPTCONN) {
		if (so->so_rcv.sb_flags & SB_MTXLOCK)
			soassertlocked_readonly(so);

		/* Listening socket: "readable" means accept won't block. */
		kn->kn_data = so->so_qlen;
		rv = (kn->kn_data != 0);

		if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
			if (so->so_state & SS_ISDISCONNECTED) {
				kn->kn_flags |= __EV_HUP;
				rv = 1;
			} else {
				rv = soreadable(so);
			}
		}

		return rv;
	}

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	/* Spliced data is consumed in-kernel, never by userland. */
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		/* User-supplied low watermark overrides sb_lowat. */
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}
2329
2330 void
filt_sowdetach(struct knote * kn)2331 filt_sowdetach(struct knote *kn)
2332 {
2333 struct socket *so = kn->kn_fp->f_data;
2334
2335 klist_remove(&so->so_snd.sb_klist, kn);
2336 }
2337
2338 int
filt_sowrite(struct knote * kn,long hint)2339 filt_sowrite(struct knote *kn, long hint)
2340 {
2341 struct socket *so = kn->kn_fp->f_data;
2342 int rv;
2343
2344 MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
2345 if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
2346 soassertlocked_readonly(so);
2347
2348 kn->kn_data = sbspace(so, &so->so_snd);
2349 if (so->so_snd.sb_state & SS_CANTSENDMORE) {
2350 kn->kn_flags |= EV_EOF;
2351 if (kn->kn_flags & __EV_POLL) {
2352 if (so->so_state & SS_ISDISCONNECTED)
2353 kn->kn_flags |= __EV_HUP;
2354 }
2355 kn->kn_fflags = so->so_error;
2356 rv = 1;
2357 } else if (so->so_error) {
2358 rv = 1;
2359 } else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2360 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2361 rv = 0;
2362 } else if (kn->kn_sfflags & NOTE_LOWAT) {
2363 rv = (kn->kn_data >= kn->kn_sdata);
2364 } else {
2365 rv = (kn->kn_data >= so->so_snd.sb_lowat);
2366 }
2367
2368 return (rv);
2369 }
2370
/*
 * Event filter for EVFILT_EXCEPT knotes: report out-of-band data
 * (NOTE_OOB) and, for poll(2) emulation, disconnection (__EV_HUP).
 * Called with the receive buffer mutex held; `hint' is unused.
 */
int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	/* Without per-buffer locking the socket lock must be held too. */
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	/* OOB conditions are not reported while the socket is spliced. */
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			/* kn_data becomes the offset relative to the mark. */
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	/* Disconnection is reported even for spliced sockets. */
	if (kn->kn_flags & __EV_POLL) {
		if (so->so_state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}
2403
2404 int
filt_sowmodify(struct kevent * kev,struct knote * kn)2405 filt_sowmodify(struct kevent *kev, struct knote *kn)
2406 {
2407 struct socket *so = kn->kn_fp->f_data;
2408 int rv;
2409
2410 sofilt_lock(so, &so->so_snd);
2411 rv = knote_modify(kev, kn);
2412 sofilt_unlock(so, &so->so_snd);
2413
2414 return (rv);
2415 }
2416
2417 int
filt_sowprocess(struct knote * kn,struct kevent * kev)2418 filt_sowprocess(struct knote *kn, struct kevent *kev)
2419 {
2420 struct socket *so = kn->kn_fp->f_data;
2421 int rv;
2422
2423 sofilt_lock(so, &so->so_snd);
2424 rv = knote_process(kn, kev);
2425 sofilt_unlock(so, &so->so_snd);
2426
2427 return (rv);
2428 }
2429
2430 int
filt_sormodify(struct kevent * kev,struct knote * kn)2431 filt_sormodify(struct kevent *kev, struct knote *kn)
2432 {
2433 struct socket *so = kn->kn_fp->f_data;
2434 int rv;
2435
2436 sofilt_lock(so, &so->so_rcv);
2437 rv = knote_modify(kev, kn);
2438 sofilt_unlock(so, &so->so_rcv);
2439
2440 return (rv);
2441 }
2442
2443 int
filt_sorprocess(struct knote * kn,struct kevent * kev)2444 filt_sorprocess(struct knote *kn, struct kevent *kev)
2445 {
2446 struct socket *so = kn->kn_fp->f_data;
2447 int rv;
2448
2449 sofilt_lock(so, &so->so_rcv);
2450 rv = knote_process(kn, kev);
2451 sofilt_unlock(so, &so->so_rcv);
2452
2453 return (rv);
2454 }
2455
2456 #ifdef DDB
/* Forward declaration for the DDB socket-buffer dump helper below. */
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));
2460
2461 void
sobuf_print(struct sockbuf * sb,int (* pr)(const char *,...))2462 sobuf_print(struct sockbuf *sb,
2463 int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2464 {
2465 (*pr)("\tsb_cc: %lu\n", sb->sb_cc);
2466 (*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
2467 (*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
2468 (*pr)("\tsb_wat: %lu\n", sb->sb_wat);
2469 (*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
2470 (*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
2471 (*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
2472 (*pr)("\tsb_mb: %p\n", sb->sb_mb);
2473 (*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
2474 (*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
2475 (*pr)("\tsb_sel: ...\n");
2476 (*pr)("\tsb_flags: %04x\n", sb->sb_flags);
2477 (*pr)("\tsb_state: %04x\n", sb->sb_state);
2478 (*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
2479 }
2480
2481 void
so_print(void * v,int (* pr)(const char *,...))2482 so_print(void *v,
2483 int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2484 {
2485 struct socket *so = v;
2486
2487 (*pr)("socket %p\n", so);
2488 (*pr)("so_type: %i\n", so->so_type);
2489 (*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
2490 (*pr)("so_linger: %i\n", so->so_linger);
2491 (*pr)("so_state: 0x%04x\n", so->so_state);
2492 (*pr)("so_pcb: %p\n", so->so_pcb);
2493 (*pr)("so_proto: %p\n", so->so_proto);
2494 (*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);
2495
2496 (*pr)("so_head: %p\n", so->so_head);
2497 (*pr)("so_onq: %p\n", so->so_onq);
2498 (*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
2499 (*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
2500 (*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
2501 (*pr)("so_q0len: %i\n", so->so_q0len);
2502 (*pr)("so_qlen: %i\n", so->so_qlen);
2503 (*pr)("so_qlimit: %i\n", so->so_qlimit);
2504 (*pr)("so_timeo: %i\n", so->so_timeo);
2505 (*pr)("so_obmark: %lu\n", so->so_oobmark);
2506
2507 (*pr)("so_sp: %p\n", so->so_sp);
2508 if (so->so_sp != NULL) {
2509 (*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
2510 (*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
2511 (*pr)("\tssp_len: %lld\n",
2512 (unsigned long long)so->so_sp->ssp_len);
2513 (*pr)("\tssp_max: %lld\n",
2514 (unsigned long long)so->so_sp->ssp_max);
2515 (*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
2516 so->so_sp->ssp_idletv.tv_usec);
2517 (*pr)("\tssp_idleto: %spending (@%i)\n",
2518 timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
2519 so->so_sp->ssp_idleto.to_time);
2520 }
2521
2522 (*pr)("so_rcv:\n");
2523 sobuf_print(&so->so_rcv, pr);
2524 (*pr)("so_snd:\n");
2525 sobuf_print(&so->so_snd, pr);
2526
2527 (*pr)("so_upcall: %p so_upcallarg: %p\n",
2528 so->so_upcall, so->so_upcallarg);
2529
2530 (*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
2531 (*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
2532 (*pr)("so_cpid: %d\n", so->so_cpid);
2533 }
2534 #endif
2535