1 /* $OpenBSD: uipc_socket2.c,v 1.155 2024/05/17 19:11:14 mvs Exp $ */ 2 /* $NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 33 */ 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/malloc.h> 38 #include <sys/mbuf.h> 39 #include <sys/protosw.h> 40 #include <sys/domain.h> 41 #include <sys/socket.h> 42 #include <sys/socketvar.h> 43 #include <sys/signalvar.h> 44 #include <sys/pool.h> 45 46 /* 47 * Primitive routines for operating on sockets and socket buffers 48 */ 49 50 u_long sb_max = SB_MAX; /* patchable */ 51 52 extern struct pool mclpools[]; 53 extern struct pool mbpool; 54 55 /* 56 * Procedures to manipulate state flags of socket 57 * and do appropriate wakeups. Normal sequence from the 58 * active (originating) side is that soisconnecting() is 59 * called during processing of connect() call, 60 * resulting in an eventual call to soisconnected() if/when the 61 * connection is established. When the connection is torn down 62 * soisdisconnecting() is called during processing of disconnect() call, 63 * and soisdisconnected() is called when the connection to the peer 64 * is totally severed. The semantics of these routines are such that 65 * connectionless protocols can call soisconnected() and soisdisconnected() 66 * only, bypassing the in-progress calls when setting up a ``connection'' 67 * takes no time. 68 * 69 * From the passive side, a socket is created with 70 * two queues of sockets: so_q0 for connections in progress 71 * and so_q for connections already made and awaiting user acceptance. 72 * As a protocol is preparing incoming connections, it creates a socket 73 * structure queued on so_q0 by calling sonewconn(). When the connection 74 * is established, soisconnected() is called, and transfers the 75 * socket structure to so_q, making it available to accept(). 76 * 77 * If a socket is closed with sockets on either 78 * so_q0 or so_q, these sockets are dropped. 79 * 80 * If higher level protocols are implemented in 81 * the kernel, the wakeups done here will sometimes 82 * cause software-interrupt process scheduling. 83 */ 84 85 void 86 soisconnecting(struct socket *so) 87 { 88 soassertlocked(so); 89 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 90 so->so_state |= SS_ISCONNECTING; 91 } 92 93 void 94 soisconnected(struct socket *so) 95 { 96 struct socket *head = so->so_head; 97 98 soassertlocked(so); 99 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING); 100 so->so_state |= SS_ISCONNECTED; 101 102 if (head != NULL && so->so_onq == &head->so_q0) { 103 int persocket = solock_persocket(so); 104 105 if (persocket) { 106 soref(so); 107 soref(head); 108 109 sounlock(so); 110 solock(head); 111 solock(so); 112 113 if (so->so_onq != &head->so_q0) { 114 sounlock(head); 115 sorele(head); 116 sorele(so); 117 118 return; 119 } 120 121 sorele(head); 122 sorele(so); 123 } 124 125 soqremque(so, 0); 126 soqinsque(head, so, 1); 127 sorwakeup(head); 128 wakeup_one(&head->so_timeo); 129 130 if (persocket) 131 sounlock(head); 132 } else { 133 wakeup(&so->so_timeo); 134 sorwakeup(so); 135 sowwakeup(so); 136 } 137 } 138 139 void 140 soisdisconnecting(struct socket *so) 141 { 142 soassertlocked(so); 143 so->so_state &= ~SS_ISCONNECTING; 144 so->so_state |= SS_ISDISCONNECTING; 145 146 mtx_enter(&so->so_rcv.sb_mtx); 147 so->so_rcv.sb_state |= SS_CANTRCVMORE; 148 mtx_leave(&so->so_rcv.sb_mtx); 149 150 mtx_enter(&so->so_snd.sb_mtx); 151 so->so_snd.sb_state |= SS_CANTSENDMORE; 152 mtx_leave(&so->so_snd.sb_mtx); 153 154 wakeup(&so->so_timeo); 155 sowwakeup(so); 156 sorwakeup(so); 157 } 158 159 void 160 soisdisconnected(struct socket *so) 161 { 162 soassertlocked(so); 163 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 164 so->so_state |= SS_ISDISCONNECTED; 165 166 mtx_enter(&so->so_rcv.sb_mtx); 167 so->so_rcv.sb_state |= SS_CANTRCVMORE; 168 mtx_leave(&so->so_rcv.sb_mtx); 169 170 mtx_enter(&so->so_snd.sb_mtx); 171 so->so_snd.sb_state |= SS_CANTSENDMORE; 172 mtx_leave(&so->so_snd.sb_mtx); 173 174 wakeup(&so->so_timeo); 175 sowwakeup(so); 176 sorwakeup(so); 177 } 178 179 /* 180 * When an attempt at a new connection is noted on a socket 181 * which accepts connections, sonewconn is called. If the 182 * connection is possible (subject to space constraints, etc.) 183 * then we allocate a new structure, properly linked into the 184 * data structure of the original socket, and return this. 185 * Connstatus may be 0 or SS_ISCONNECTED. 186 */ 187 struct socket * 188 sonewconn(struct socket *head, int connstatus, int wait) 189 { 190 struct socket *so; 191 int persocket = solock_persocket(head); 192 int soqueue = connstatus ? 1 : 0; 193 194 /* 195 * XXXSMP as long as `so' and `head' share the same lock, we 196 * can call soreserve() and pr_attach() below w/o explicitly 197 * locking `so'. 198 */ 199 soassertlocked(head); 200 201 if (m_pool_used() > 95) 202 return (NULL); 203 if (head->so_qlen + head->so_q0len > head->so_qlimit * 3) 204 return (NULL); 205 so = soalloc(head->so_proto, wait); 206 if (so == NULL) 207 return (NULL); 208 so->so_type = head->so_type; 209 so->so_options = head->so_options &~ SO_ACCEPTCONN; 210 so->so_linger = head->so_linger; 211 so->so_state = head->so_state | SS_NOFDREF; 212 so->so_proto = head->so_proto; 213 so->so_timeo = head->so_timeo; 214 so->so_euid = head->so_euid; 215 so->so_ruid = head->so_ruid; 216 so->so_egid = head->so_egid; 217 so->so_rgid = head->so_rgid; 218 so->so_cpid = head->so_cpid; 219 220 /* 221 * Lock order will be `head' -> `so' while these sockets are linked. 222 */ 223 if (persocket) 224 solock(so); 225 226 /* 227 * Inherit watermarks but those may get clamped in low mem situations. 228 */ 229 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) 230 goto fail; 231 232 mtx_enter(&head->so_snd.sb_mtx); 233 so->so_snd.sb_wat = head->so_snd.sb_wat; 234 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 235 so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs; 236 mtx_leave(&head->so_snd.sb_mtx); 237 238 mtx_enter(&head->so_rcv.sb_mtx); 239 so->so_rcv.sb_wat = head->so_rcv.sb_wat; 240 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 241 so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs; 242 mtx_leave(&head->so_rcv.sb_mtx); 243 244 sigio_copy(&so->so_sigio, &head->so_sigio); 245 246 soqinsque(head, so, soqueue); 247 if (pru_attach(so, 0, wait) != 0) { 248 soqremque(so, soqueue); 249 goto fail; 250 } 251 if (connstatus) { 252 so->so_state |= connstatus; 253 sorwakeup(head); 254 wakeup(&head->so_timeo); 255 } 256 257 if (persocket) 258 sounlock(so); 259 260 return (so); 261 262 fail: 263 if (persocket) 264 sounlock(so); 265 sigio_free(&so->so_sigio); 266 klist_free(&so->so_rcv.sb_klist); 267 klist_free(&so->so_snd.sb_klist); 268 pool_put(&socket_pool, so); 269 270 return (NULL); 271 } 272 273 void 274 soqinsque(struct socket *head, struct socket *so, int q) 275 { 276 soassertlocked(head); 277 soassertlocked(so); 278 279 KASSERT(so->so_onq == NULL); 280 281 so->so_head = head; 282 if (q == 0) { 283 head->so_q0len++; 284 so->so_onq = &head->so_q0; 285 } else { 286 head->so_qlen++; 287 so->so_onq = &head->so_q; 288 } 289 TAILQ_INSERT_TAIL(so->so_onq, so, so_qe); 290 } 291 292 int 293 soqremque(struct socket *so, int q) 294 { 295 struct socket *head = so->so_head; 296 297 soassertlocked(so); 298 soassertlocked(head); 299 300 if (q == 0) { 301 if (so->so_onq != &head->so_q0) 302 return (0); 303 head->so_q0len--; 304 } else { 305 if (so->so_onq != &head->so_q) 306 return (0); 307 head->so_qlen--; 308 } 309 TAILQ_REMOVE(so->so_onq, so, so_qe); 310 so->so_onq = NULL; 311 so->so_head = NULL; 312 return (1); 313 } 314 315 /* 316 * Socantsendmore indicates that no more data will be sent on the 317 * socket; it would normally be applied to a socket when the user 318 * informs the system that no more data is to be sent, by the protocol 319 * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data 320 * will be received, and will normally be applied to the socket by a 321 * protocol when it detects that the peer will send no more data. 322 * Data queued for reading in the socket may yet be read. 323 */ 324 325 void 326 socantsendmore(struct socket *so) 327 { 328 soassertlocked(so); 329 mtx_enter(&so->so_snd.sb_mtx); 330 so->so_snd.sb_state |= SS_CANTSENDMORE; 331 mtx_leave(&so->so_snd.sb_mtx); 332 sowwakeup(so); 333 } 334 335 void 336 socantrcvmore(struct socket *so) 337 { 338 if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) 339 soassertlocked(so); 340 341 mtx_enter(&so->so_rcv.sb_mtx); 342 so->so_rcv.sb_state |= SS_CANTRCVMORE; 343 mtx_leave(&so->so_rcv.sb_mtx); 344 sorwakeup(so); 345 } 346 347 void 348 solock(struct socket *so) 349 { 350 switch (so->so_proto->pr_domain->dom_family) { 351 case PF_INET: 352 case PF_INET6: 353 NET_LOCK(); 354 break; 355 default: 356 rw_enter_write(&so->so_lock); 357 break; 358 } 359 } 360 361 void 362 solock_shared(struct socket *so) 363 { 364 switch (so->so_proto->pr_domain->dom_family) { 365 case PF_INET: 366 case PF_INET6: 367 if (so->so_proto->pr_usrreqs->pru_lock != NULL) { 368 NET_LOCK_SHARED(); 369 rw_enter_write(&so->so_lock); 370 } else 371 NET_LOCK(); 372 break; 373 default: 374 rw_enter_write(&so->so_lock); 375 break; 376 } 377 } 378 379 int 380 solock_persocket(struct socket *so) 381 { 382 switch (so->so_proto->pr_domain->dom_family) { 383 case PF_INET: 384 case PF_INET6: 385 return 0; 386 default: 387 return 1; 388 } 389 } 390 391 void 392 solock_pair(struct socket *so1, struct socket *so2) 393 { 394 KASSERT(so1 != so2); 395 KASSERT(so1->so_type == so2->so_type); 396 KASSERT(solock_persocket(so1)); 397 398 if (so1 < so2) { 399 solock(so1); 400 solock(so2); 401 } else { 402 solock(so2); 403 solock(so1); 404 } 405 } 406 407 void 408 sounlock(struct socket *so) 409 { 410 switch (so->so_proto->pr_domain->dom_family) { 411 case PF_INET: 412 case PF_INET6: 413 NET_UNLOCK(); 414 break; 415 default: 416 rw_exit_write(&so->so_lock); 417 break; 418 } 419 } 420 421 void 422 sounlock_shared(struct socket *so) 423 { 424 switch (so->so_proto->pr_domain->dom_family) { 425 case PF_INET: 426 case PF_INET6: 427 if (so->so_proto->pr_usrreqs->pru_unlock != NULL) { 428 rw_exit_write(&so->so_lock); 429 NET_UNLOCK_SHARED(); 430 } else 431 NET_UNLOCK(); 432 break; 433 default: 434 rw_exit_write(&so->so_lock); 435 break; 436 } 437 } 438 439 void 440 soassertlocked_readonly(struct socket *so) 441 { 442 switch (so->so_proto->pr_domain->dom_family) { 443 case PF_INET: 444 case PF_INET6: 445 NET_ASSERT_LOCKED(); 446 break; 447 default: 448 rw_assert_wrlock(&so->so_lock); 449 break; 450 } 451 } 452 453 void 454 soassertlocked(struct socket *so) 455 { 456 switch (so->so_proto->pr_domain->dom_family) { 457 case PF_INET: 458 case PF_INET6: 459 if (rw_status(&netlock) == RW_READ) { 460 NET_ASSERT_LOCKED(); 461 462 if (splassert_ctl > 0 && pru_locked(so) == 0 && 463 rw_status(&so->so_lock) != RW_WRITE) 464 splassert_fail(0, RW_WRITE, __func__); 465 } else 466 NET_ASSERT_LOCKED_EXCLUSIVE(); 467 break; 468 default: 469 rw_assert_wrlock(&so->so_lock); 470 break; 471 } 472 } 473 474 int 475 sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg, 476 uint64_t nsecs) 477 { 478 int ret; 479 480 switch (so->so_proto->pr_domain->dom_family) { 481 case PF_INET: 482 case PF_INET6: 483 if (so->so_proto->pr_usrreqs->pru_unlock != NULL && 484 rw_status(&netlock) == RW_READ) { 485 rw_exit_write(&so->so_lock); 486 } 487 ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs); 488 if (so->so_proto->pr_usrreqs->pru_lock != NULL && 489 rw_status(&netlock) == RW_READ) { 490 rw_enter_write(&so->so_lock); 491 } 492 break; 493 default: 494 ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs); 495 break; 496 } 497 498 return ret; 499 } 500 501 void 502 sbmtxassertlocked(struct socket *so, struct sockbuf *sb) 503 { 504 if (sb->sb_flags & SB_MTXLOCK) { 505 if (splassert_ctl > 0 && mtx_owned(&sb->sb_mtx) == 0) 506 splassert_fail(0, RW_WRITE, __func__); 507 } else 508 soassertlocked(so); 509 } 510 511 /* 512 * Wait for data to arrive at/drain from a socket buffer. 513 */ 514 int 515 sbwait(struct socket *so, struct sockbuf *sb) 516 { 517 uint64_t timeo_nsecs; 518 int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH; 519 520 if (sb->sb_flags & SB_MTXLOCK) { 521 MUTEX_ASSERT_LOCKED(&sb->sb_mtx); 522 523 sb->sb_flags |= SB_WAIT; 524 return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait", 525 sb->sb_timeo_nsecs); 526 } 527 528 soassertlocked(so); 529 530 mtx_enter(&sb->sb_mtx); 531 timeo_nsecs = sb->sb_timeo_nsecs; 532 sb->sb_flags |= SB_WAIT; 533 mtx_leave(&sb->sb_mtx); 534 535 return sosleep_nsec(so, &sb->sb_cc, prio, "netio", timeo_nsecs); 536 } 537 538 int 539 sblock(struct sockbuf *sb, int flags) 540 { 541 int rwflags = RW_WRITE, error; 542 543 if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR)) 544 rwflags |= RW_INTR; 545 if (!(flags & SBL_WAIT)) 546 rwflags |= RW_NOSLEEP; 547 548 error = rw_enter(&sb->sb_lock, rwflags); 549 if (error == EBUSY) 550 error = EWOULDBLOCK; 551 552 return error; 553 } 554 555 void 556 sbunlock(struct sockbuf *sb) 557 { 558 rw_exit(&sb->sb_lock); 559 } 560 561 /* 562 * Wakeup processes waiting on a socket buffer. 563 * Do asynchronous notification via SIGIO 564 * if the socket buffer has the SB_ASYNC flag set. 565 */ 566 void 567 sowakeup(struct socket *so, struct sockbuf *sb) 568 { 569 int dowakeup = 0, dopgsigio = 0; 570 571 mtx_enter(&sb->sb_mtx); 572 if (sb->sb_flags & SB_WAIT) { 573 sb->sb_flags &= ~SB_WAIT; 574 dowakeup = 1; 575 } 576 if (sb->sb_flags & SB_ASYNC) 577 dopgsigio = 1; 578 579 knote_locked(&sb->sb_klist, 0); 580 mtx_leave(&sb->sb_mtx); 581 582 if (dowakeup) 583 wakeup(&sb->sb_cc); 584 585 if (dopgsigio) 586 pgsigio(&so->so_sigio, SIGIO, 0); 587 } 588 589 /* 590 * Socket buffer (struct sockbuf) utility routines. 591 * 592 * Each socket contains two socket buffers: one for sending data and 593 * one for receiving data. Each buffer contains a queue of mbufs, 594 * information about the number of mbufs and amount of data in the 595 * queue, and other fields allowing select() statements and notification 596 * on data availability to be implemented. 597 * 598 * Data stored in a socket buffer is maintained as a list of records. 599 * Each record is a list of mbufs chained together with the m_next 600 * field. Records are chained together with the m_nextpkt field. The upper 601 * level routine soreceive() expects the following conventions to be 602 * observed when placing information in the receive buffer: 603 * 604 * 1. If the protocol requires each message be preceded by the sender's 605 * name, then a record containing that name must be present before 606 * any associated data (mbuf's must be of type MT_SONAME). 607 * 2. If the protocol supports the exchange of ``access rights'' (really 608 * just additional data associated with the message), and there are 609 * ``rights'' to be received, then a record containing this data 610 * should be present (mbuf's must be of type MT_CONTROL). 611 * 3. If a name or rights record exists, then it must be followed by 612 * a data record, perhaps of zero length. 613 * 614 * Before using a new socket structure it is first necessary to reserve 615 * buffer space to the socket, by calling sbreserve(). This should commit 616 * some of the available buffer space in the system buffer pool for the 617 * socket (currently, it does nothing but enforce limits). The space 618 * should be released by calling sbrelease() when the socket is destroyed. 619 */ 620 621 int 622 soreserve(struct socket *so, u_long sndcc, u_long rcvcc) 623 { 624 soassertlocked(so); 625 626 mtx_enter(&so->so_rcv.sb_mtx); 627 mtx_enter(&so->so_snd.sb_mtx); 628 if (sbreserve(so, &so->so_snd, sndcc)) 629 goto bad; 630 so->so_snd.sb_wat = sndcc; 631 if (so->so_snd.sb_lowat == 0) 632 so->so_snd.sb_lowat = MCLBYTES; 633 if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) 634 so->so_snd.sb_lowat = so->so_snd.sb_hiwat; 635 if (sbreserve(so, &so->so_rcv, rcvcc)) 636 goto bad2; 637 so->so_rcv.sb_wat = rcvcc; 638 if (so->so_rcv.sb_lowat == 0) 639 so->so_rcv.sb_lowat = 1; 640 mtx_leave(&so->so_snd.sb_mtx); 641 mtx_leave(&so->so_rcv.sb_mtx); 642 643 return (0); 644 bad2: 645 sbrelease(so, &so->so_snd); 646 bad: 647 mtx_leave(&so->so_snd.sb_mtx); 648 mtx_leave(&so->so_rcv.sb_mtx); 649 return (ENOBUFS); 650 } 651 652 /* 653 * Allot mbufs to a sockbuf. 654 * Attempt to scale mbmax so that mbcnt doesn't become limiting 655 * if buffering efficiency is near the normal case. 656 */ 657 int 658 sbreserve(struct socket *so, struct sockbuf *sb, u_long cc) 659 { 660 sbmtxassertlocked(so, sb); 661 662 if (cc == 0 || cc > sb_max) 663 return (1); 664 sb->sb_hiwat = cc; 665 sb->sb_mbmax = max(3 * MAXMCLBYTES, cc * 8); 666 if (sb->sb_lowat > sb->sb_hiwat) 667 sb->sb_lowat = sb->sb_hiwat; 668 return (0); 669 } 670 671 /* 672 * In low memory situation, do not accept any greater than normal request. 673 */ 674 int 675 sbcheckreserve(u_long cnt, u_long defcnt) 676 { 677 if (cnt > defcnt && sbchecklowmem()) 678 return (ENOBUFS); 679 return (0); 680 } 681 682 int 683 sbchecklowmem(void) 684 { 685 static int sblowmem; 686 unsigned int used = m_pool_used(); 687 688 if (used < 60) 689 sblowmem = 0; 690 else if (used > 80) 691 sblowmem = 1; 692 693 return (sblowmem); 694 } 695 696 /* 697 * Free mbufs held by a socket, and reserved mbuf space. 698 */ 699 void 700 sbrelease(struct socket *so, struct sockbuf *sb) 701 { 702 703 sbflush(so, sb); 704 sb->sb_hiwat = sb->sb_mbmax = 0; 705 } 706 707 /* 708 * Routines to add and remove 709 * data from an mbuf queue. 710 * 711 * The routines sbappend() or sbappendrecord() are normally called to 712 * append new mbufs to a socket buffer, after checking that adequate 713 * space is available, comparing the function sbspace() with the amount 714 * of data to be added. sbappendrecord() differs from sbappend() in 715 * that data supplied is treated as the beginning of a new record. 716 * To place a sender's address, optional access rights, and data in a 717 * socket receive buffer, sbappendaddr() should be used. To place 718 * access rights and data in a socket receive buffer, sbappendrights() 719 * should be used. In either case, the new data begins a new record. 720 * Note that unlike sbappend() and sbappendrecord(), these routines check 721 * for the caller that there will be enough space to store the data. 722 * Each fails if there is not enough space, or if it cannot find mbufs 723 * to store additional information in. 724 * 725 * Reliable protocols may use the socket send buffer to hold data 726 * awaiting acknowledgement. Data is normally copied from a socket 727 * send buffer in a protocol with m_copym for output to a peer, 728 * and then removing the data from the socket buffer with sbdrop() 729 * or sbdroprecord() when the data is acknowledged by the peer. 730 */ 731 732 #ifdef SOCKBUF_DEBUG 733 void 734 sblastrecordchk(struct sockbuf *sb, const char *where) 735 { 736 struct mbuf *m = sb->sb_mb; 737 738 while (m && m->m_nextpkt) 739 m = m->m_nextpkt; 740 741 if (m != sb->sb_lastrecord) { 742 printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n", 743 sb->sb_mb, sb->sb_lastrecord, m); 744 printf("packet chain:\n"); 745 for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) 746 printf("\t%p\n", m); 747 panic("sblastrecordchk from %s", where); 748 } 749 } 750 751 void 752 sblastmbufchk(struct sockbuf *sb, const char *where) 753 { 754 struct mbuf *m = sb->sb_mb; 755 struct mbuf *n; 756 757 while (m && m->m_nextpkt) 758 m = m->m_nextpkt; 759 760 while (m && m->m_next) 761 m = m->m_next; 762 763 if (m != sb->sb_mbtail) { 764 printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n", 765 sb->sb_mb, sb->sb_mbtail, m); 766 printf("packet tree:\n"); 767 for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) { 768 printf("\t"); 769 for (n = m; n != NULL; n = n->m_next) 770 printf("%p ", n); 771 printf("\n"); 772 } 773 panic("sblastmbufchk from %s", where); 774 } 775 } 776 #endif /* SOCKBUF_DEBUG */ 777 778 #define SBLINKRECORD(sb, m0) \ 779 do { \ 780 if ((sb)->sb_lastrecord != NULL) \ 781 (sb)->sb_lastrecord->m_nextpkt = (m0); \ 782 else \ 783 (sb)->sb_mb = (m0); \ 784 (sb)->sb_lastrecord = (m0); \ 785 } while (/*CONSTCOND*/0) 786 787 /* 788 * Append mbuf chain m to the last record in the 789 * socket buffer sb. The additional space associated 790 * the mbuf chain is recorded in sb. Empty mbufs are 791 * discarded and mbufs are compacted where possible. 792 */ 793 void 794 sbappend(struct socket *so, struct sockbuf *sb, struct mbuf *m) 795 { 796 struct mbuf *n; 797 798 if (m == NULL) 799 return; 800 801 sbmtxassertlocked(so, sb); 802 SBLASTRECORDCHK(sb, "sbappend 1"); 803 804 if ((n = sb->sb_lastrecord) != NULL) { 805 /* 806 * XXX Would like to simply use sb_mbtail here, but 807 * XXX I need to verify that I won't miss an EOR that 808 * XXX way. 809 */ 810 do { 811 if (n->m_flags & M_EOR) { 812 sbappendrecord(so, sb, m); /* XXXXXX!!!! */ 813 return; 814 } 815 } while (n->m_next && (n = n->m_next)); 816 } else { 817 /* 818 * If this is the first record in the socket buffer, it's 819 * also the last record. 820 */ 821 sb->sb_lastrecord = m; 822 } 823 sbcompress(so, sb, m, n); 824 SBLASTRECORDCHK(sb, "sbappend 2"); 825 } 826 827 /* 828 * This version of sbappend() should only be used when the caller 829 * absolutely knows that there will never be more than one record 830 * in the socket buffer, that is, a stream protocol (such as TCP). 831 */ 832 void 833 sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m) 834 { 835 KASSERT(sb == &so->so_rcv || sb == &so->so_snd); 836 soassertlocked(so); 837 KDASSERT(m->m_nextpkt == NULL); 838 KASSERT(sb->sb_mb == sb->sb_lastrecord); 839 840 SBLASTMBUFCHK(sb, __func__); 841 842 sbcompress(so, sb, m, sb->sb_mbtail); 843 844 sb->sb_lastrecord = sb->sb_mb; 845 SBLASTRECORDCHK(sb, __func__); 846 } 847 848 #ifdef SOCKBUF_DEBUG 849 void 850 sbcheck(struct socket *so, struct sockbuf *sb) 851 { 852 struct mbuf *m, *n; 853 u_long len = 0, mbcnt = 0; 854 855 for (m = sb->sb_mb; m; m = m->m_nextpkt) { 856 for (n = m; n; n = n->m_next) { 857 len += n->m_len; 858 mbcnt += MSIZE; 859 if (n->m_flags & M_EXT) 860 mbcnt += n->m_ext.ext_size; 861 if (m != n && n->m_nextpkt) 862 panic("sbcheck nextpkt"); 863 } 864 } 865 if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { 866 printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc, 867 mbcnt, sb->sb_mbcnt); 868 panic("sbcheck"); 869 } 870 } 871 #endif 872 873 /* 874 * As above, except the mbuf chain 875 * begins a new record. 876 */ 877 void 878 sbappendrecord(struct socket *so, struct sockbuf *sb, struct mbuf *m0) 879 { 880 struct mbuf *m; 881 882 sbmtxassertlocked(so, sb); 883 884 if (m0 == NULL) 885 return; 886 887 /* 888 * Put the first mbuf on the queue. 889 * Note this permits zero length records. 890 */ 891 sballoc(so, sb, m0); 892 SBLASTRECORDCHK(sb, "sbappendrecord 1"); 893 SBLINKRECORD(sb, m0); 894 m = m0->m_next; 895 m0->m_next = NULL; 896 if (m && (m0->m_flags & M_EOR)) { 897 m0->m_flags &= ~M_EOR; 898 m->m_flags |= M_EOR; 899 } 900 sbcompress(so, sb, m, m0); 901 SBLASTRECORDCHK(sb, "sbappendrecord 2"); 902 } 903 904 /* 905 * Append address and data, and optionally, control (ancillary) data 906 * to the receive queue of a socket. If present, 907 * m0 must include a packet header with total length. 908 * Returns 0 if no space in sockbuf or insufficient mbufs. 909 */ 910 int 911 sbappendaddr(struct socket *so, struct sockbuf *sb, const struct sockaddr *asa, 912 struct mbuf *m0, struct mbuf *control) 913 { 914 struct mbuf *m, *n, *nlast; 915 int space = asa->sa_len; 916 917 sbmtxassertlocked(so, sb); 918 919 if (m0 && (m0->m_flags & M_PKTHDR) == 0) 920 panic("sbappendaddr"); 921 if (m0) 922 space += m0->m_pkthdr.len; 923 for (n = control; n; n = n->m_next) { 924 space += n->m_len; 925 if (n->m_next == NULL) /* keep pointer to last control buf */ 926 break; 927 } 928 if (space > sbspace(so, sb)) 929 return (0); 930 if (asa->sa_len > MLEN) 931 return (0); 932 MGET(m, M_DONTWAIT, MT_SONAME); 933 if (m == NULL) 934 return (0); 935 m->m_len = asa->sa_len; 936 memcpy(mtod(m, caddr_t), asa, asa->sa_len); 937 if (n) 938 n->m_next = m0; /* concatenate data to control */ 939 else 940 control = m0; 941 m->m_next = control; 942 943 SBLASTRECORDCHK(sb, "sbappendaddr 1"); 944 945 for (n = m; n->m_next != NULL; n = n->m_next) 946 sballoc(so, sb, n); 947 sballoc(so, sb, n); 948 nlast = n; 949 SBLINKRECORD(sb, m); 950 951 sb->sb_mbtail = nlast; 952 SBLASTMBUFCHK(sb, "sbappendaddr"); 953 954 SBLASTRECORDCHK(sb, "sbappendaddr 2"); 955 956 return (1); 957 } 958 959 int 960 sbappendcontrol(struct socket *so, struct sockbuf *sb, struct mbuf *m0, 961 struct mbuf *control) 962 { 963 struct mbuf *m, *mlast, *n; 964 int eor = 0, space = 0; 965 966 sbmtxassertlocked(so, sb); 967 968 if (control == NULL) 969 panic("sbappendcontrol"); 970 for (m = control; ; m = m->m_next) { 971 space += m->m_len; 972 if (m->m_next == NULL) 973 break; 974 } 975 n = m; /* save pointer to last control buffer */ 976 for (m = m0; m; m = m->m_next) { 977 space += m->m_len; 978 eor |= m->m_flags & M_EOR; 979 if (eor) { 980 if (m->m_next == NULL) 981 m->m_flags |= M_EOR; 982 else 983 m->m_flags &= ~M_EOR; 984 } 985 } 986 if (space > sbspace(so, sb)) 987 return (0); 988 n->m_next = m0; /* concatenate data to control */ 989 990 SBLASTRECORDCHK(sb, "sbappendcontrol 1"); 991 992 for (m = control; m->m_next != NULL; m = m->m_next) 993 sballoc(so, sb, m); 994 sballoc(so, sb, m); 995 mlast = m; 996 SBLINKRECORD(sb, control); 997 998 sb->sb_mbtail = mlast; 999 SBLASTMBUFCHK(sb, "sbappendcontrol"); 1000 1001 SBLASTRECORDCHK(sb, "sbappendcontrol 2"); 1002 1003 return (1); 1004 } 1005 1006 /* 1007 * Compress mbuf chain m into the socket 1008 * buffer sb following mbuf n. If n 1009 * is null, the buffer is presumed empty. 1010 */ 1011 void 1012 sbcompress(struct socket *so, struct sockbuf *sb, struct mbuf *m, 1013 struct mbuf *n) 1014 { 1015 int eor = 0; 1016 struct mbuf *o; 1017 1018 while (m) { 1019 eor |= m->m_flags & M_EOR; 1020 if (m->m_len == 0 && 1021 (eor == 0 || 1022 (((o = m->m_next) || (o = n)) && 1023 o->m_type == m->m_type))) { 1024 if (sb->sb_lastrecord == m) 1025 sb->sb_lastrecord = m->m_next; 1026 m = m_free(m); 1027 continue; 1028 } 1029 if (n && (n->m_flags & M_EOR) == 0 && 1030 /* m_trailingspace() checks buffer writeability */ 1031 m->m_len <= ((n->m_flags & M_EXT)? n->m_ext.ext_size : 1032 MCLBYTES) / 4 && /* XXX Don't copy too much */ 1033 m->m_len <= m_trailingspace(n) && 1034 n->m_type == m->m_type) { 1035 memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t), 1036 m->m_len); 1037 n->m_len += m->m_len; 1038 sb->sb_cc += m->m_len; 1039 if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME) 1040 sb->sb_datacc += m->m_len; 1041 m = m_free(m); 1042 continue; 1043 } 1044 if (n) 1045 n->m_next = m; 1046 else 1047 sb->sb_mb = m; 1048 sb->sb_mbtail = m; 1049 sballoc(so, sb, m); 1050 n = m; 1051 m->m_flags &= ~M_EOR; 1052 m = m->m_next; 1053 n->m_next = NULL; 1054 } 1055 if (eor) { 1056 if (n) 1057 n->m_flags |= eor; 1058 else 1059 printf("semi-panic: sbcompress"); 1060 } 1061 SBLASTMBUFCHK(sb, __func__); 1062 } 1063 1064 /* 1065 * Free all mbufs in a sockbuf. 1066 * Check that all resources are reclaimed. 1067 */ 1068 void 1069 sbflush(struct socket *so, struct sockbuf *sb) 1070 { 1071 KASSERT(sb == &so->so_rcv || sb == &so->so_snd); 1072 rw_assert_unlocked(&sb->sb_lock); 1073 1074 while (sb->sb_mbcnt) 1075 sbdrop(so, sb, (int)sb->sb_cc); 1076 1077 KASSERT(sb->sb_cc == 0); 1078 KASSERT(sb->sb_datacc == 0); 1079 KASSERT(sb->sb_mb == NULL); 1080 KASSERT(sb->sb_mbtail == NULL); 1081 KASSERT(sb->sb_lastrecord == NULL); 1082 } 1083 1084 /* 1085 * Drop data from (the front of) a sockbuf. 1086 */ 1087 void 1088 sbdrop(struct socket *so, struct sockbuf *sb, int len) 1089 { 1090 struct mbuf *m, *mn; 1091 struct mbuf *next; 1092 1093 sbmtxassertlocked(so, sb); 1094 1095 next = (m = sb->sb_mb) ? m->m_nextpkt : NULL; 1096 while (len > 0) { 1097 if (m == NULL) { 1098 if (next == NULL) 1099 panic("sbdrop"); 1100 m = next; 1101 next = m->m_nextpkt; 1102 continue; 1103 } 1104 if (m->m_len > len) { 1105 m->m_len -= len; 1106 m->m_data += len; 1107 sb->sb_cc -= len; 1108 if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME) 1109 sb->sb_datacc -= len; 1110 break; 1111 } 1112 len -= m->m_len; 1113 sbfree(so, sb, m); 1114 mn = m_free(m); 1115 m = mn; 1116 } 1117 while (m && m->m_len == 0) { 1118 sbfree(so, sb, m); 1119 mn = m_free(m); 1120 m = mn; 1121 } 1122 if (m) { 1123 sb->sb_mb = m; 1124 m->m_nextpkt = next; 1125 } else 1126 sb->sb_mb = next; 1127 /* 1128 * First part is an inline SB_EMPTY_FIXUP(). Second part 1129 * makes sure sb_lastrecord is up-to-date if we dropped 1130 * part of the last record. 1131 */ 1132 m = sb->sb_mb; 1133 if (m == NULL) { 1134 sb->sb_mbtail = NULL; 1135 sb->sb_lastrecord = NULL; 1136 } else if (m->m_nextpkt == NULL) 1137 sb->sb_lastrecord = m; 1138 } 1139 1140 /* 1141 * Drop a record off the front of a sockbuf 1142 * and move the next record to the front. 1143 */ 1144 void 1145 sbdroprecord(struct socket *so, struct sockbuf *sb) 1146 { 1147 struct mbuf *m, *mn; 1148 1149 m = sb->sb_mb; 1150 if (m) { 1151 sb->sb_mb = m->m_nextpkt; 1152 do { 1153 sbfree(so, sb, m); 1154 mn = m_free(m); 1155 } while ((m = mn) != NULL); 1156 } 1157 SB_EMPTY_FIXUP(sb); 1158 } 1159 1160 /* 1161 * Create a "control" mbuf containing the specified data 1162 * with the specified type for presentation on a socket buffer. 1163 */ 1164 struct mbuf * 1165 sbcreatecontrol(const void *p, size_t size, int type, int level) 1166 { 1167 struct cmsghdr *cp; 1168 struct mbuf *m; 1169 1170 if (CMSG_SPACE(size) > MCLBYTES) { 1171 printf("sbcreatecontrol: message too large %zu\n", size); 1172 return (NULL); 1173 } 1174 1175 if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) 1176 return (NULL); 1177 if (CMSG_SPACE(size) > MLEN) { 1178 MCLGET(m, M_DONTWAIT); 1179 if ((m->m_flags & M_EXT) == 0) { 1180 m_free(m); 1181 return NULL; 1182 } 1183 } 1184 cp = mtod(m, struct cmsghdr *); 1185 memset(cp, 0, CMSG_SPACE(size)); 1186 memcpy(CMSG_DATA(cp), p, size); 1187 m->m_len = CMSG_SPACE(size); 1188 cp->cmsg_len = CMSG_LEN(size); 1189 cp->cmsg_level = level; 1190 cp->cmsg_type = type; 1191 return (m); 1192 } 1193