/*	$OpenBSD: uipc_socket2.c,v 1.166 2025/01/18 10:44:52 bluhm Exp $	*/
/*	$NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>

/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* [I] patchable */

extern struct pool mclpools[];
extern struct pool mbpool;

/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_q0 for connections in progress
 * and so_q for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_q0 or so_q, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
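/*
 * A hypothetical sketch of the passive-open sequence described above,
 * as a protocol might drive it.  example_passive_open() is not a real
 * kernel function; sonewconn() and soisconnected() are the routines
 * defined in this file.  Disabled with #if 0 on purpose.
 */
#if 0
static struct socket *
example_passive_open(struct socket *head, int wait)
{
	struct socket *so;

	/* With `head' locked, queue an embryonic connection on so_q0. */
	so = sonewconn(head, 0, wait);
	if (so == NULL)
		return (NULL);

	/* ... the protocol handshake completes asynchronously ... */

	/*
	 * Later, with `so' locked, this moves the socket from so_q0
	 * to so_q and wakes up sleepers in accept().
	 */
	soisconnected(so);

	return (so);
}
#endif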
void
soisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

void
soisconnected(struct socket *so)
{
	struct socket *head = so->so_head;

	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTED;

	if (head != NULL && so->so_onq == &head->so_q0) {
		KASSERT(solock_persocket(so));

		soref(head);
		sounlock(so);
		solock(head);
		solock(so);

		if (so->so_onq != &head->so_q0) {
			sounlock(head);
			sorele(head);
			return;
		}

		soqremque(so, 0);
		soqinsque(head, so, 1);
		sorwakeup(head);
		wakeup_one(&head->so_timeo);

		sounlock(head);
		sorele(head);
	} else {
		wakeup(&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}

void
soisdisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= SS_ISDISCONNECTING;

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_state |= SS_CANTRCVMORE;
	mtx_leave(&so->so_rcv.sb_mtx);

	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_CANTSENDMORE;
	mtx_leave(&so->so_snd.sb_mtx);

	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

void
soisdisconnected(struct socket *so)
{
	soassertlocked(so);

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_state |= SS_CANTRCVMORE;
	mtx_leave(&so->so_rcv.sb_mtx);

	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_CANTSENDMORE;
	mtx_leave(&so->so_snd.sb_mtx);

	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISDISCONNECTED;

	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
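/*
 * A hypothetical sketch of the active teardown path described in the
 * comment at the top of this section, assuming `so' is locked; the
 * surrounding protocol code is elided.
 */
#if 0
	/* Local close: refuse new I/O but keep draining queued data. */
	soisdisconnecting(so);

	/* ... the peer acknowledges the teardown ... */

	/* Connection fully severed; wake up any remaining sleepers. */
	soisdisconnected(so);
#endif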
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0 or SS_ISCONNECTED.
 */
struct socket *
sonewconn(struct socket *head, int connstatus, int wait)
{
	struct socket *so;
	int soqueue = connstatus ? 1 : 0;

	soassertlocked(head);

	if (m_pool_used() > 95)
		return (NULL);
	if (head->so_qlen + head->so_q0len > head->so_qlimit * 3)
		return (NULL);
	so = soalloc(head->so_proto, wait);
	if (so == NULL)
		return (NULL);
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_euid = head->so_euid;
	so->so_ruid = head->so_ruid;
	so->so_egid = head->so_egid;
	so->so_rgid = head->so_rgid;
	so->so_cpid = head->so_cpid;

	/*
	 * Lock order will be `head' -> `so' while these sockets are linked.
	 */
	solock_nonet(so);

	/*
	 * Inherit watermarks but those may get clamped in low mem situations.
	 */
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat))
		goto fail;

	mtx_enter(&head->so_snd.sb_mtx);
	so->so_snd.sb_wat = head->so_snd.sb_wat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
	mtx_leave(&head->so_snd.sb_mtx);

	mtx_enter(&head->so_rcv.sb_mtx);
	so->so_rcv.sb_wat = head->so_rcv.sb_wat;
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
	mtx_leave(&head->so_rcv.sb_mtx);

	sigio_copy(&so->so_sigio, &head->so_sigio);

	soqinsque(head, so, soqueue);
	if (pru_attach(so, 0, wait) != 0) {
		soqremque(so, soqueue);
		goto fail;
	}
	if (connstatus) {
		so->so_state |= connstatus;
		sorwakeup(head);
		wakeup(&head->so_timeo);
	}

	sounlock_nonet(so);

	return (so);

fail:
	sounlock_nonet(so);
	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);
	pool_put(&socket_pool, so);

	return (NULL);
}

void
soqinsque(struct socket *head, struct socket *so, int q)
{
	soassertlocked(head);
	soassertlocked(so);

	KASSERT(so->so_onq == NULL);

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

int
soqremque(struct socket *so, int q)
{
	struct socket *head = so->so_head;

	soassertlocked(so);
	soassertlocked(head);

	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return (0);
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return (0);
		head->so_qlen--;
	}
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return (1);
}

/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in the case of PRU_SHUTDOWN).  Socantrcvmore indicates that no
 * more data will be received, and will normally be applied to the socket
 * by a protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */
void
socantsendmore(struct socket *so)
{
	soassertlocked(so);
	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_CANTSENDMORE;
	mtx_leave(&so->so_snd.sb_mtx);
	sowwakeup(so);
}

void
socantrcvmore(struct socket *so)
{
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked(so);

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_state |= SS_CANTRCVMORE;
	mtx_leave(&so->so_rcv.sb_mtx);
	sorwakeup(so);
}

void
solock(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}
}

void
solock_shared(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK_SHARED();
		break;
	}
	rw_enter_write(&so->so_lock);
}

void
solock_nonet(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_ASSERT_LOCKED();
		break;
	}
	rw_enter_write(&so->so_lock);
}

int
solock_persocket(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		return 0;
	default:
		return 1;
	}
}

void
solock_pair(struct socket *so1, struct socket *so2)
{
	KASSERT(so1 != so2);
	KASSERT(so1->so_type == so2->so_type);
	KASSERT(solock_persocket(so1));

	if (so1 < so2) {
		solock(so1);
		solock(so2);
	} else {
		solock(so2);
		solock(so1);
	}
}

void
sounlock(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

void
sounlock_shared(struct socket *so)
{
	rw_exit_write(&so->so_lock);
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK_SHARED();
		break;
	}
}

void
sounlock_nonet(struct socket *so)
{
	rw_exit_write(&so->so_lock);
}

void
soassertlocked_readonly(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_ASSERT_LOCKED();
		break;
	default:
		rw_assert_wrlock(&so->so_lock);
		break;
	}
}

void
soassertlocked(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		if (rw_status(&netlock) == RW_READ) {
			NET_ASSERT_LOCKED();

			if (splassert_ctl > 0 &&
			    rw_status(&so->so_lock) != RW_WRITE)
				splassert_fail(0, RW_WRITE, __func__);
		} else
			NET_ASSERT_LOCKED_EXCLUSIVE();
		break;
	default:
		rw_assert_wrlock(&so->so_lock);
		break;
	}
}

int
sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
    uint64_t nsecs)
{
	int ret;

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		if (rw_status(&netlock) == RW_READ)
			rw_exit_write(&so->so_lock);
		ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
		if (rw_status(&netlock) == RW_READ)
			rw_enter_write(&so->so_lock);
		break;
	default:
		ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
		break;
	}

	return ret;
}
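/*
 * A hypothetical sketch of how callers combine the state flags with
 * sosleep_nsec(): wait until a pending connect completes, assuming
 * `so' is locked and `error' is a local initialized to 0.  The wait
 * message "examplecon" is illustrative; INFSLP means no timeout.
 * sosleep_nsec() drops and reacquires the lock appropriate to the
 * socket's domain while sleeping.
 */
#if 0
	while ((so->so_state & SS_ISCONNECTING) && error == 0)
		error = sosleep_nsec(so, &so->so_timeo, PSOCK | PCATCH,
		    "examplecon", INFSLP);
#endif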
void
sbmtxassertlocked(struct socket *so, struct sockbuf *sb)
{
	if (sb->sb_flags & SB_MTXLOCK) {
		if (splassert_ctl > 0 && mtx_owned(&sb->sb_mtx) == 0)
			splassert_fail(0, RW_WRITE, __func__);
	} else
		soassertlocked(so);
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct socket *so, struct sockbuf *sb)
{
	uint64_t timeo_nsecs;
	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;

	if (sb->sb_flags & SB_MTXLOCK) {
		MUTEX_ASSERT_LOCKED(&sb->sb_mtx);

		sb->sb_flags |= SB_WAIT;
		return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait",
		    sb->sb_timeo_nsecs);
	}

	soassertlocked(so);

	mtx_enter(&sb->sb_mtx);
	timeo_nsecs = sb->sb_timeo_nsecs;
	sb->sb_flags |= SB_WAIT;
	mtx_leave(&sb->sb_mtx);

	return sosleep_nsec(so, &sb->sb_cc, prio, "netio", timeo_nsecs);
}

int
sblock(struct sockbuf *sb, int flags)
{
	int rwflags = RW_WRITE, error;

	if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
		rwflags |= RW_INTR;
	if (!(flags & SBL_WAIT))
		rwflags |= RW_NOSLEEP;

	error = rw_enter(&sb->sb_lock, rwflags);
	if (error == EBUSY)
		error = EWOULDBLOCK;

	return error;
}

void
sbunlock(struct sockbuf *sb)
{
	rw_exit(&sb->sb_lock);
}

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb)
{
	int dowakeup = 0, dopgsigio = 0;

	mtx_enter(&sb->sb_mtx);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		dowakeup = 1;
	}
	if (sb->sb_flags & SB_ASYNC)
		dopgsigio = 1;

	knote_locked(&sb->sb_klist, 0);
	mtx_leave(&sb->sb_mtx);

	if (dowakeup)
		wakeup(&sb->sb_cc);

	if (dopgsigio)
		pgsigio(&so->so_sigio, SIGIO, 0);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
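/*
 * The record layout implied by the conventions above, for one received
 * datagram (an explanatory diagram, not from the original sources):
 *
 *	sb_mb -> [MT_SONAME] -m_next-> [MT_CONTROL] -m_next-> [data ...]
 *	    |
 *	m_nextpkt
 *	    |
 *	    v
 *	next record ...
 *
 * A hypothetical input path builds such a record with sbappendaddr(),
 * assuming `srcsa', `m', `control' and the integer `appended' are
 * locals prepared by the protocol:
 */
#if 0
	mtx_enter(&so->so_rcv.sb_mtx);
	appended = sbappendaddr(so, &so->so_rcv, srcsa, m, control);
	mtx_leave(&so->so_rcv.sb_mtx);
	if (appended == 0) {
		/* No space or no mbufs: drop the datagram. */
		m_freem(m);
		m_freem(control);
	} else
		sorwakeup(so);
#endif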
int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	soassertlocked(so);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&so->so_snd.sb_mtx);
	if (sbreserve(so, &so->so_snd, sndcc))
		goto bad;
	so->so_snd.sb_wat = sndcc;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	if (sbreserve(so, &so->so_rcv, rcvcc))
		goto bad2;
	so->so_rcv.sb_wat = rcvcc;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	mtx_leave(&so->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	return (0);
bad2:
	sbrelease(so, &so->so_snd);
bad:
	mtx_leave(&so->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);
	return (ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
{
	sbmtxassertlocked(so, sb);

	if (cc == 0 || cc > sb_max)
		return (1);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = max(3 * MAXMCLBYTES, cc * 8);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (0);
}

/*
 * In a low memory situation, do not accept any request greater than normal.
 */
int
sbcheckreserve(u_long cnt, u_long defcnt)
{
	if (cnt > defcnt && sbchecklowmem())
		return (ENOBUFS);
	return (0);
}

int
sbchecklowmem(void)
{
	static int sblowmem;
	unsigned int used;

	/*
	 * m_pool_used() is thread safe.  Global variable sblowmem is updated
	 * by multiple CPUs, but most times with the same value.  And even
	 * if the value is not correct for a short time, it does not matter.
	 */
	used = m_pool_used();
	if (used < 60)
		atomic_store_int(&sblowmem, 0);
	else if (used > 80)
		atomic_store_int(&sblowmem, 1);

	return (atomic_load_int(&sblowmem));
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease(struct socket *so, struct sockbuf *sb)
{

	sbflush(so, sb);
	sb->sb_hiwat = sb->sb_mbmax = 0;
}
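/*
 * A hypothetical sketch of the usual soreserve() caller: a protocol
 * attach routine reserving its default buffer space with the socket
 * locked.  The constants are placeholders, not real kernel values.
 */
#if 0
#define	EXAMPLE_SENDSPACE	(8 * 1024)
#define	EXAMPLE_RECVSPACE	(8 * 1024)

	error = soreserve(so, EXAMPLE_SENDSPACE, EXAMPLE_RECVSPACE);
	if (error)
		return (error);
#endif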
/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendcontrol()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copym for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */

#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

#define	SBLINKRECORD(sb, m0)						\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	if (m == NULL)
		return;

	sbmtxassertlocked(so, sb);
	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord(so, sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(so, sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}
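/*
 * A hypothetical record-based receive path following the pattern the
 * comment at the top of this section describes: compare the available
 * space with the data size before appending.  Locking follows the
 * SB_MTXLOCK convention; `m' is a packet prepared by the protocol.
 */
#if 0
	mtx_enter(&so->so_rcv.sb_mtx);
	if (m->m_pkthdr.len <= sbspace_locked(so, &so->so_rcv)) {
		sbappendrecord(so, &so->so_rcv, m);
		m = NULL;
	}
	mtx_leave(&so->so_rcv.sb_mtx);
	sorwakeup(so);
	m_freem(m);		/* frees the chain only if it did not fit */
#endif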
/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	sbmtxassertlocked(so, sb);
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

	sbcompress(so, sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct socket *so, struct sockbuf *sb)
{
	struct mbuf *m, *n;
	u_long len = 0, mbcnt = 0;

	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (n = m; n; n = n->m_next) {
			len += n->m_len;
			mbcnt += MSIZE;
			if (n->m_flags & M_EXT)
				mbcnt += n->m_ext.ext_size;
			if (m != n && n->m_nextpkt)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct socket *so, struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	sbmtxassertlocked(so, sb);

	if (m0 == NULL)
		return;

	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(so, sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(so, sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}
/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct socket *so, struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space = asa->sa_len;

	sbmtxassertlocked(so, sb);

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");
	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == NULL)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace_locked(so, sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	m->m_len = asa->sa_len;
	memcpy(mtod(m, caddr_t), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(so, sb, n);
	sballoc(so, sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");

	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}

int
sbappendcontrol(struct socket *so, struct sockbuf *sb, struct mbuf *m0,
    struct mbuf *control)
{
	struct mbuf *m, *mlast, *n;
	int eor = 0, space = 0;

	sbmtxassertlocked(so, sb);

	if (control == NULL)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == NULL)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next) {
		space += m->m_len;
		eor |= m->m_flags & M_EOR;
		if (eor) {
			if (m->m_next == NULL)
				m->m_flags |= M_EOR;
			else
				m->m_flags &= ~M_EOR;
		}
	}
	if (space > sbspace_locked(so, sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(so, sb, m);
	sballoc(so, sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");

	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}
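/*
 * Note on the compaction in sbcompress() below: a newly appended mbuf
 * is copied into the trailing space of the previous mbuf `n' rather
 * than linked, but only when it is small (at most a quarter of n's
 * buffer), fits in m_trailingspace(n), has the same type, and n does
 * not end a record (M_EOR).  This keeps sockets that receive many
 * small writes from accumulating long chains of mostly-empty mbufs.
 */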
/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct socket *so, struct sockbuf *sb, struct mbuf *m,
    struct mbuf *n)
{
	int eor = 0;
	struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		    (((o = m->m_next) || (o = n)) &&
		    o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* m_trailingspace() checks buffer writeability */
		    m->m_len <= ((n->m_flags & M_EXT)? n->m_ext.ext_size :
		    MCLBYTES) / 4 && /* XXX Don't copy too much */
		    m->m_len <= m_trailingspace(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
			    m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(so, sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = NULL;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress");
	}
	SBLASTMBUFCHK(sb, __func__);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct socket *so, struct sockbuf *sb)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	rw_assert_unlocked(&sb->sb_lock);

	while (sb->sb_mbcnt)
		sbdrop(so, sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_datacc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct socket *so, struct sockbuf *sb, int len)
{
	struct mbuf *m, *mn;
	struct mbuf *next;

	sbmtxassertlocked(so, sb);

	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
	while (len > 0) {
		if (m == NULL) {
			if (next == NULL)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(so, sb, m);
		mn = m_free(m);
		m = mn;
	}
	while (m && m->m_len == 0) {
		sbfree(so, sb, m);
		mn = m_free(m);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct socket *so, struct sockbuf *sb)
{
	struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(so, sb, m);
			mn = m_free(m);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}
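/*
 * A hypothetical sketch of the sbdrop() usage mentioned earlier: a
 * reliable protocol releasing `acked' bytes (a count supplied by the
 * protocol) once the peer has acknowledged them, then waking writers
 * blocked on the send buffer.
 */
#if 0
	mtx_enter(&so->so_snd.sb_mtx);
	sbdrop(so, &so->so_snd, acked);
	mtx_leave(&so->so_snd.sb_mtx);
	sowwakeup(so);
#endif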
/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(const void *p, size_t size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE(size) > MCLBYTES) {
		printf("sbcreatecontrol: message too large %zu\n", size);
		return (NULL);
	}

	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return (NULL);
	if (CMSG_SPACE(size) > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	memset(cp, 0, CMSG_SPACE(size));
	memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
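/*
 * A hypothetical sketch of sbcreatecontrol() usage: wrap a received
 * TTL in a control mbuf so it can be queued ahead of the data record,
 * in the spirit of what the inet input path does for IP_RECVTTL.
 * `ip' stands in for a pointer to the received IP header.
 */
#if 0
	struct mbuf *cm;
	uint8_t ttl = ip->ip_ttl;

	cm = sbcreatecontrol(&ttl, sizeof(ttl), IP_RECVTTL, IPPROTO_IP);
	if (cm != NULL) {
		/* hand `cm' to sbappendaddr() as its `control' argument */
	}
#endif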