/*	$OpenBSD: uipc_socket2.c,v 1.171 2025/01/27 14:57:13 mvs Exp $	*/
/*	$NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>

/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* [I] patchable */

extern struct pool mclpools[];
extern struct pool mbpool;
/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of a connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of a disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_q0 for connections in progress
 * and so_q for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().  When the connection
 * is established, soisconnected() is called, which transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_q0 or so_q, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
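
/*
 * Illustrative sketch (not compiled): how a hypothetical
 * connection-oriented protocol would drive the transitions described
 * above.  The example_* handlers are invented; the so*() calls and
 * the requirement that the caller hold the socket lock are real.
 */
#if 0
void
example_connect_start(struct socket *so)
{
	soassertlocked(so);
	soisconnecting(so);	/* connect() now in progress */
}

void
example_handshake_complete(struct socket *so)
{
	soisconnected(so);	/* wakes connect()/accept() sleepers */
}

void
example_connection_lost(struct socket *so)
{
	soisdisconnected(so);	/* marks both buffers "can't send/recv" */
}
#endif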

void
soisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

void
soisconnected(struct socket *so)
{
	struct socket *head = so->so_head;

	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTED;

	if (head != NULL && so->so_onq == &head->so_q0) {
		soref(head);
		sounlock(so);
		solock(head);
		solock(so);

		if (so->so_onq != &head->so_q0) {
			sounlock(head);
			sorele(head);
			return;
		}

		soqremque(so, 0);
		soqinsque(head, so, 1);
		sorwakeup(head);
		wakeup_one(&head->so_timeo);

		sounlock(head);
		sorele(head);
	} else {
		wakeup(&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}

void
soisdisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= SS_ISDISCONNECTING;

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_state |= SS_CANTRCVMORE;
	mtx_leave(&so->so_rcv.sb_mtx);

	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_CANTSENDMORE;
	mtx_leave(&so->so_snd.sb_mtx);

	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

void
soisdisconnected(struct socket *so)
{
	soassertlocked(so);

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_state |= SS_CANTRCVMORE;
	mtx_leave(&so->so_rcv.sb_mtx);

	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_CANTSENDMORE;
	mtx_leave(&so->so_snd.sb_mtx);

	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISDISCONNECTED;

	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0 or SS_ISCONNECTED.
 */
struct socket *
sonewconn(struct socket *head, int connstatus, int wait)
{
	struct socket *so;
	int soqueue = connstatus ? 1 : 0;

	soassertlocked(head);

	if (m_pool_used() > 95)
		return (NULL);
	if (head->so_qlen + head->so_q0len > head->so_qlimit * 3)
		return (NULL);
	so = soalloc(head->so_proto, wait);
	if (so == NULL)
		return (NULL);
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_euid = head->so_euid;
	so->so_ruid = head->so_ruid;
	so->so_egid = head->so_egid;
	so->so_rgid = head->so_rgid;
	so->so_cpid = head->so_cpid;

	/*
	 * Lock order will be `head' -> `so' while these sockets are linked.
	 */
	solock_nonet(so);

	/*
	 * Inherit watermarks but those may get clamped in low mem situations.
	 */
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat))
		goto fail;

	mtx_enter(&head->so_snd.sb_mtx);
	so->so_snd.sb_wat = head->so_snd.sb_wat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
	mtx_leave(&head->so_snd.sb_mtx);

	mtx_enter(&head->so_rcv.sb_mtx);
	so->so_rcv.sb_wat = head->so_rcv.sb_wat;
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
	mtx_leave(&head->so_rcv.sb_mtx);

	sigio_copy(&so->so_sigio, &head->so_sigio);

	soqinsque(head, so, soqueue);
	if (pru_attach(so, 0, wait) != 0) {
		soqremque(so, soqueue);
		goto fail;
	}
	if (connstatus) {
		so->so_state |= connstatus;
		sorwakeup(head);
		wakeup(&head->so_timeo);
	}

	return (so);

fail:
	sounlock_nonet(so);
	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);
	pool_put(&socket_pool, so);

	return (NULL);
}
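
/*
 * Illustrative sketch (not compiled): the typical sonewconn() call
 * from a protocol's input path when a connection request arrives on
 * a listening socket.  example_listen_input() is invented; the
 * sonewconn() contract shown is the one implemented above.
 */
#if 0
void
example_listen_input(struct socket *head)
{
	struct socket *so;

	/*
	 * connstatus == 0 queues the embryonic socket on so_q0;
	 * soisconnected() later moves it to so_q for accept().
	 * NULL means a resource limit was hit and the request
	 * should simply be dropped.
	 */
	so = sonewconn(head, 0, M_DONTWAIT);
	if (so == NULL)
		return;
	/* `so' is returned locked and linked onto head's so_q0. */
}
#endif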

void
soqinsque(struct socket *head, struct socket *so, int q)
{
	soassertlocked(head);
	soassertlocked(so);

	KASSERT(so->so_onq == NULL);

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

int
soqremque(struct socket *so, int q)
{
	struct socket *head = so->so_head;

	soassertlocked(so);
	soassertlocked(head);

	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return (0);
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return (0);
		head->so_qlen--;
	}
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return (1);
}
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it is normally applied to a socket by the protocol code
 * when the user informs the system that no more data is to be sent
 * (the PRU_SHUTDOWN case).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */
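
/*
 * Illustrative sketch (not compiled): a hypothetical pru_shutdown
 * handler and the receive-side reaction to a peer's end-of-data
 * marker.  The example_* names are invented; the two calls are real.
 */
#if 0
int
example_pru_shutdown(struct socket *so)
{
	socantsendmore(so);	/* local side: no further writes */
	return (0);
}

void
example_peer_eof(struct socket *so)
{
	socantrcvmore(so);	/* peer done; queued data still readable */
}
#endif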

void
socantsendmore(struct socket *so)
{
	soassertlocked(so);
	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_CANTSENDMORE;
	mtx_leave(&so->so_snd.sb_mtx);
	sowwakeup(so);
}

void
socantrcvmore(struct socket *so)
{
	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_state |= SS_CANTRCVMORE;
	mtx_leave(&so->so_rcv.sb_mtx);
	sorwakeup(so);
}

void
solock(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}
}

void
solock_shared(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK_SHARED();
		break;
	}
	rw_enter_write(&so->so_lock);
}

void
solock_nonet(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_ASSERT_LOCKED();
		break;
	}
	rw_enter_write(&so->so_lock);
}

int
solock_persocket(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		return 0;
	default:
		return 1;
	}
}

void
solock_pair(struct socket *so1, struct socket *so2)
{
	KASSERT(so1 != so2);
	KASSERT(so1->so_type == so2->so_type);
	KASSERT(solock_persocket(so1));

	if (so1 < so2) {
		solock(so1);
		solock(so2);
	} else {
		solock(so2);
		solock(so1);
	}
}

void
sounlock(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

void
sounlock_shared(struct socket *so)
{
	rw_exit_write(&so->so_lock);
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK_SHARED();
		break;
	}
}

void
sounlock_nonet(struct socket *so)
{
	rw_exit_write(&so->so_lock);
}

void
soassertlocked_readonly(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_ASSERT_LOCKED();
		break;
	default:
		rw_assert_wrlock(&so->so_lock);
		break;
	}
}

void
soassertlocked(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		if (rw_status(&netlock) == RW_READ) {
			NET_ASSERT_LOCKED();

			if (splassert_ctl > 0 &&
			    rw_status(&so->so_lock) != RW_WRITE)
				splassert_fail(0, RW_WRITE, __func__);
		} else
			NET_ASSERT_LOCKED_EXCLUSIVE();
		break;
	default:
		rw_assert_wrlock(&so->so_lock);
		break;
	}
}

int
sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
    uint64_t nsecs)
{
	int ret;

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		if (rw_status(&netlock) == RW_READ)
			rw_exit_write(&so->so_lock);
		ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
		if (rw_status(&netlock) == RW_READ)
			rw_enter_write(&so->so_lock);
		break;
	default:
		ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
		break;
	}

	return ret;
}

void
sbmtxassertlocked(struct sockbuf *sb)
{
	if (splassert_ctl > 0 && mtx_owned(&sb->sb_mtx) == 0)
		splassert_fail(0, RW_WRITE, __func__);
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct sockbuf *sb)
{
	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;

	MUTEX_ASSERT_LOCKED(&sb->sb_mtx);

	sb->sb_flags |= SB_WAIT;
	return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait",
	    sb->sb_timeo_nsecs);
}

int
sblock(struct sockbuf *sb, int flags)
{
	int rwflags = RW_WRITE, error;

	if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
		rwflags |= RW_INTR;
	if (!(flags & SBL_WAIT))
		rwflags |= RW_NOSLEEP;

	error = rw_enter(&sb->sb_lock, rwflags);
	if (error == EBUSY)
		error = EWOULDBLOCK;

	return error;
}

void
sbunlock(struct sockbuf *sb)
{
	rw_exit(&sb->sb_lock);
}

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb)
{
	int dowakeup = 0, dopgsigio = 0;

	mtx_enter(&sb->sb_mtx);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		dowakeup = 1;
	}
	if (sb->sb_flags & SB_ASYNC)
		dopgsigio = 1;

	knote_locked(&sb->sb_klist, 0);
	mtx_leave(&sb->sb_mtx);

	if (dowakeup)
		wakeup(&sb->sb_cc);

	if (dopgsigio)
		pgsigio(&so->so_sigio, SIGIO, 0);
}
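
/*
 * Illustrative sketch (not compiled, userland code): how a process
 * arranges to receive the SIGIO delivered above.  Setting O_ASYNC is
 * what causes SB_ASYNC to be set on the socket's buffers; F_SETOWN
 * picks the recipient.  example_handler() is invented.
 */
#if 0
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

void example_handler(int);

void
example_enable_sigio(int s)
{
	signal(SIGIO, example_handler);			/* made-up handler */
	fcntl(s, F_SETOWN, getpid());			/* deliver to this pid */
	fcntl(s, F_SETFL, fcntl(s, F_GETFL) | O_ASYNC);	/* sets SB_ASYNC */
}
#endif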

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbufs must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbufs must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space for the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
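
/*
 * Illustrative sketch of the record conventions above, for a datagram
 * received with a sender name and ancillary data.  Vertical links are
 * m_next, horizontal links are m_nextpkt:
 *
 *	sb_mb -> [MT_SONAME] ---- m_nextpkt ----> [next record] ...
 *	             |
 *	          m_next
 *	             |
 *	        [MT_CONTROL]
 *	             |
 *	          m_next
 *	             |
 *	         [MT_DATA] -- m_next --> [MT_DATA] ...
 *
 * soreceive() peels the name and control mbufs off the front of the
 * record before handing the data mbufs to the user.
 */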

int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	soassertlocked(so);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&so->so_snd.sb_mtx);
	if (sbreserve(so, &so->so_snd, sndcc))
		goto bad;
	so->so_snd.sb_wat = sndcc;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	if (sbreserve(so, &so->so_rcv, rcvcc))
		goto bad2;
	so->so_rcv.sb_wat = rcvcc;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	mtx_leave(&so->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	return (0);
bad2:
	sbrelease(so, &so->so_snd);
bad:
	mtx_leave(&so->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);
	return (ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
{
	sbmtxassertlocked(sb);

	if (cc == 0 || cc > sb_max)
		return (1);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = max(3 * MAXMCLBYTES, cc * 8);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (0);
}
/*
 * In a low memory situation, do not accept any request larger than
 * the normal (default) one.
 */
int
sbcheckreserve(u_long cnt, u_long defcnt)
{
	if (cnt > defcnt && sbchecklowmem())
		return (ENOBUFS);
	return (0);
}

int
sbchecklowmem(void)
{
	static int sblowmem;
	unsigned int used;

	/*
	 * m_pool_used() is thread safe.  Global variable sblowmem is updated
	 * by multiple CPUs, but most times with the same value.  And even
	 * if the value is not correct for a short time, it does not matter.
	 */
	used = m_pool_used();
	if (used < 60)
		atomic_store_int(&sblowmem, 0);
	else if (used > 80)
		atomic_store_int(&sblowmem, 1);

	return (atomic_load_int(&sblowmem));
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease(struct socket *so, struct sockbuf *sb)
{
	sbflush(so, sb);
	sb->sb_hiwat = sb->sb_mbmax = 0;
}

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, by comparing the value returned by sbspace()
 * with the amount of data to be added.  sbappendrecord() differs from
 * sbappend() in that data supplied is treated as the beginning of a
 * new record.  To place a sender's address, optional access rights,
 * and data in a socket receive buffer, sbappendaddr() should be used.
 * To place access rights and data in a socket receive buffer,
 * sbappendcontrol() should be used.  In either case, the new data
 * begins a new record.  Note that unlike sbappend() and
 * sbappendrecord(), these routines check for the caller that there
 * will be enough space to store the data.  Each fails if there is not
 * enough space, or if it cannot find mbufs to store additional
 * information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copym for output to a peer, and
 * then removed from the socket buffer with sbdrop() or sbdroprecord()
 * when the data is acknowledged by the peer.
 */
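
/*
 * Illustrative sketch (not compiled): the send-buffer pattern used by
 * a reliable protocol as described above.  Names prefixed example_
 * are invented; m_copym(), sbdrop() and the sb_mtx discipline are real.
 */
#if 0
void example_output(struct socket *, struct mbuf *);

void
example_transmit(struct socket *so, int off, int len)
{
	struct mbuf *m;

	/* Copy without consuming; the original stays for retransmit. */
	mtx_enter(&so->so_snd.sb_mtx);
	m = m_copym(so->so_snd.sb_mb, off, len, M_DONTWAIT);
	mtx_leave(&so->so_snd.sb_mtx);
	if (m != NULL)
		example_output(so, m);	/* hand to the wire, made up */
}

void
example_acked(struct socket *so, int acked)
{
	/* Peer acknowledged `acked' bytes; release them. */
	mtx_enter(&so->so_snd.sb_mtx);
	sbdrop(so, &so->so_snd, acked);
	mtx_leave(&so->so_snd.sb_mtx);
	sowwakeup(so);	/* writers may have room now */
}
#endif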

#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

#define	SBLINKRECORD(sb, m0)						\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with the
 * mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	if (m == NULL)
		return;

	sbmtxassertlocked(sb);
	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord(so, sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(so, sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	sbmtxassertlocked(sb);
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

	sbcompress(so, sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct socket *so, struct sockbuf *sb)
{
	struct mbuf *m, *n;
	u_long len = 0, mbcnt = 0;

	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (n = m; n; n = n->m_next) {
			len += n->m_len;
			mbcnt += MSIZE;
			if (n->m_flags & M_EXT)
				mbcnt += n->m_ext.ext_size;
			if (m != n && n->m_nextpkt)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct socket *so, struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	sbmtxassertlocked(sb);

	if (m0 == NULL)
		return;

	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(so, sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct socket *so, struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space = asa->sa_len;

	sbmtxassertlocked(sb);

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");
	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == NULL)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace_locked(so, sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	m->m_len = asa->sa_len;
	memcpy(mtod(m, caddr_t), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");

	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}

int
sbappendcontrol(struct socket *so, struct sockbuf *sb, struct mbuf *m0,
    struct mbuf *control)
{
	struct mbuf *m, *mlast, *n;
	int eor = 0, space = 0;

	sbmtxassertlocked(sb);

	if (control == NULL)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == NULL)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next) {
		space += m->m_len;
		eor |= m->m_flags & M_EOR;
		if (eor) {
			if (m->m_next == NULL)
				m->m_flags |= M_EOR;
			else
				m->m_flags &= ~M_EOR;
		}
	}
	if (space > sbspace_locked(so, sb))
		return (0);
	n->m_next = m0;		/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");

	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct socket *so, struct sockbuf *sb, struct mbuf *m,
    struct mbuf *n)
{
	int eor = 0;
	struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		    (((o = m->m_next) || (o = n)) &&
		    o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* m_trailingspace() checks buffer writeability */
		    m->m_len <= ((n->m_flags & M_EXT)? n->m_ext.ext_size :
		    MCLBYTES) / 4 && /* XXX Don't copy too much */
		    m->m_len <= m_trailingspace(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
			    m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = NULL;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress");
	}
	SBLASTMBUFCHK(sb, __func__);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct socket *so, struct sockbuf *sb)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	rw_assert_unlocked(&sb->sb_lock);

	while (sb->sb_mbcnt)
		sbdrop(so, sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_datacc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct socket *so, struct sockbuf *sb, int len)
{
	struct mbuf *m, *mn;
	struct mbuf *next;

	sbmtxassertlocked(sb);

	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
	while (len > 0) {
		if (m == NULL) {
			if (next == NULL)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		mn = m_free(m);
		m = mn;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		mn = m_free(m);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct socket *so, struct sockbuf *sb)
{
	struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			mn = m_free(m);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(const void *p, size_t size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE(size) > MCLBYTES) {
		printf("sbcreatecontrol: message too large %zu\n", size);
		return (NULL);
	}

	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return (NULL);
	if (CMSG_SPACE(size) > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	memset(cp, 0, CMSG_SPACE(size));
	memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
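
/*
 * Illustrative sketch (not compiled): building an ancillary-data mbuf
 * with sbcreatecontrol() and queueing it with a datagram, as a
 * hypothetical protocol's input path might.  example_deliver() and
 * the choice of a receive-timestamp payload are invented.
 */
#if 0
int
example_deliver(struct socket *so, const struct sockaddr *from,
    struct mbuf *data)
{
	struct timeval tv;
	struct mbuf *control;
	int ok;

	microtime(&tv);
	/* Wraps `tv' in a cmsghdr: level SOL_SOCKET, type SCM_TIMESTAMP. */
	control = sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP,
	    SOL_SOCKET);

	mtx_enter(&so->so_rcv.sb_mtx);
	ok = sbappendaddr(so, &so->so_rcv, from, data, control);
	mtx_leave(&so->so_rcv.sb_mtx);
	if (ok)
		sorwakeup(so);	/* notify readers */
	return (ok);
}
#endif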