/*	$OpenBSD: uipc_socket2.c,v 1.140 2024/01/11 14:15:11 bluhm Exp $	*/
/*	$NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>

/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* patchable */

extern struct pool mclpools[];
extern struct pool mbpool;

/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_q0 for connections in progress
 * and so_q for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_q0 or so_q, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
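
/*
 * Illustration only, not part of the original file: a hedged sketch of
 * how a connection-oriented protocol might drive the transitions above.
 * The surrounding control flow is hypothetical; solock(), soisconnecting()
 * and soisconnected() are the primitives defined below.
 *
 *	// active side, during connect(2) processing
 *	solock(so);
 *	soisconnecting(so);	// sets SS_ISCONNECTING
 *	...			// protocol sends its handshake
 *	sounlock(so);
 *
 *	// later, in the input path, when the handshake completes
 *	solock(so);
 *	soisconnected(so);	// wakes processes sleeping in connect(2)
 *	sounlock(so);
 *
 * A connectionless protocol would call soisconnected() directly and skip
 * the SS_ISCONNECTING stage, as noted above.
 */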

void
soisconnecting(struct socket *so)
{
        soassertlocked(so);
        so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
        so->so_state |= SS_ISCONNECTING;
}

void
soisconnected(struct socket *so)
{
        struct socket *head = so->so_head;

        soassertlocked(so);
        so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
        so->so_state |= SS_ISCONNECTED;

        if (head != NULL && so->so_onq == &head->so_q0) {
                int persocket = solock_persocket(so);

                if (persocket) {
                        soref(so);
                        soref(head);

                        sounlock(so);
                        solock(head);
                        solock(so);

                        if (so->so_onq != &head->so_q0) {
                                sounlock(head);
                                sorele(head);
                                sorele(so);

                                return;
                        }

                        sorele(head);
                        sorele(so);
                }

                soqremque(so, 0);
                soqinsque(head, so, 1);
                sorwakeup(head);
                wakeup_one(&head->so_timeo);

                if (persocket)
                        sounlock(head);
        } else {
                wakeup(&so->so_timeo);
                sorwakeup(so);
                sowwakeup(so);
        }
}

void
soisdisconnecting(struct socket *so)
{
        soassertlocked(so);
        so->so_state &= ~SS_ISCONNECTING;
        so->so_state |= SS_ISDISCONNECTING;
        so->so_rcv.sb_state |= SS_CANTRCVMORE;
        so->so_snd.sb_state |= SS_CANTSENDMORE;
        wakeup(&so->so_timeo);
        sowwakeup(so);
        sorwakeup(so);
}

void
soisdisconnected(struct socket *so)
{
        soassertlocked(so);
        so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
        so->so_state |= SS_ISDISCONNECTED;
        so->so_rcv.sb_state |= SS_CANTRCVMORE;
        so->so_snd.sb_state |= SS_CANTSENDMORE;
        wakeup(&so->so_timeo);
        sowwakeup(so);
        sorwakeup(so);
}
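
/*
 * Illustration only: a hedged sketch of the teardown counterpart.  The
 * ordering is the point; the waiting step is hypothetical protocol logic.
 *
 *	solock(so);
 *	soisdisconnecting(so);	// half-way: both buffers marked
 *				// SS_CANTSENDMORE/SS_CANTRCVMORE
 *	...			// wait for the peer to complete the close
 *	soisdisconnected(so);	// connection totally severed
 *	sounlock(so);
 */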

/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0 or SS_ISCONNECTED.
 */
struct socket *
sonewconn(struct socket *head, int connstatus, int wait)
{
        struct socket *so;
        int persocket = solock_persocket(head);
        int error;

        /*
         * XXXSMP as long as `so' and `head' share the same lock, we
         * can call soreserve() and pr_attach() below w/o explicitly
         * locking `so'.
         */
        soassertlocked(head);

        if (m_pool_used() > 95)
                return (NULL);
        if (head->so_qlen + head->so_q0len > head->so_qlimit * 3)
                return (NULL);
        so = soalloc(head->so_proto->pr_domain, wait);
        if (so == NULL)
                return (NULL);
        so->so_type = head->so_type;
        so->so_options = head->so_options &~ SO_ACCEPTCONN;
        so->so_linger = head->so_linger;
        so->so_state = head->so_state | SS_NOFDREF;
        so->so_proto = head->so_proto;
        so->so_timeo = head->so_timeo;
        so->so_euid = head->so_euid;
        so->so_ruid = head->so_ruid;
        so->so_egid = head->so_egid;
        so->so_rgid = head->so_rgid;
        so->so_cpid = head->so_cpid;

        /*
         * Lock order will be `head' -> `so' while these sockets are linked.
         */
        if (persocket)
                solock(so);

        /*
         * Inherit watermarks but those may get clamped in low mem situations.
         */
        if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat))
                goto fail;
        so->so_snd.sb_wat = head->so_snd.sb_wat;
        so->so_snd.sb_lowat = head->so_snd.sb_lowat;
        so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
        so->so_rcv.sb_wat = head->so_rcv.sb_wat;
        so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
        so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;

        sigio_copy(&so->so_sigio, &head->so_sigio);

        soqinsque(head, so, 0);

        /*
         * We need to unlock `head' because PCB layer could release
         * solock() to enforce desired lock order.
         */
        if (persocket) {
                head->so_newconn++;
                sounlock(head);
        }

        error = pru_attach(so, 0, wait);

        if (persocket) {
                sounlock(so);
                solock(head);
                solock(so);

                /*
                 * Pre-decrement: wake the waiter when the count of
                 * in-flight attaches drops back to zero.
                 */
                if ((--head->so_newconn) == 0) {
                        if ((head->so_state & SS_NEWCONN_WAIT) != 0) {
                                head->so_state &= ~SS_NEWCONN_WAIT;
                                wakeup(&head->so_newconn);
                        }
                }
        }

        if (error) {
                soqremque(so, 0);
                goto fail;
        }

        if (connstatus) {
                so->so_state |= connstatus;
                soqremque(so, 0);
                soqinsque(head, so, 1);
                sorwakeup(head);
                wakeup(&head->so_timeo);
        }

        if (persocket)
                sounlock(so);

        return (so);

fail:
        if (persocket)
                sounlock(so);
        sigio_free(&so->so_sigio);
        klist_free(&so->so_rcv.sb_klist);
        klist_free(&so->so_snd.sb_klist);
        pool_put(&socket_pool, so);

        return (NULL);
}
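
/*
 * Illustration only: a hedged sketch of how a protocol's input path might
 * use sonewconn() when a connection request arrives on a listening socket
 * `head'.  The handshake step is hypothetical; compare the syn cache code
 * in tcp_input.c for the real thing.
 *
 *	// `head' is locked and has SO_ACCEPTCONN set
 *	so = sonewconn(head, 0, M_DONTWAIT);
 *	if (so == NULL)
 *		return;			// queue limit reached or no memory
 *	...				// protocol handshake completes
 *	soisconnected(so);		// moves `so' from so_q0 to so_q,
 *					// accept(2) may now return it
 */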

void
soqinsque(struct socket *head, struct socket *so, int q)
{
        soassertlocked(head);
        soassertlocked(so);

        KASSERT(so->so_onq == NULL);

        so->so_head = head;
        if (q == 0) {
                head->so_q0len++;
                so->so_onq = &head->so_q0;
        } else {
                head->so_qlen++;
                so->so_onq = &head->so_q;
        }
        TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

int
soqremque(struct socket *so, int q)
{
        struct socket *head = so->so_head;

        soassertlocked(so);
        soassertlocked(head);

        if (q == 0) {
                if (so->so_onq != &head->so_q0)
                        return (0);
                head->so_q0len--;
        } else {
                if (so->so_onq != &head->so_q)
                        return (0);
                head->so_qlen--;
        }
        TAILQ_REMOVE(so->so_onq, so, so_qe);
        so->so_onq = NULL;
        so->so_head = NULL;
        return (1);
}

/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */

void
socantsendmore(struct socket *so)
{
        soassertlocked(so);
        so->so_snd.sb_state |= SS_CANTSENDMORE;
        sowwakeup(so);
}

void
socantrcvmore(struct socket *so)
{
        soassertlocked(so);
        so->so_rcv.sb_state |= SS_CANTRCVMORE;
        sorwakeup(so);
}

void
solock(struct socket *so)
{
        switch (so->so_proto->pr_domain->dom_family) {
        case PF_INET:
        case PF_INET6:
                NET_LOCK();
                break;
        default:
                rw_enter_write(&so->so_lock);
                break;
        }
}

void
solock_shared(struct socket *so)
{
        switch (so->so_proto->pr_domain->dom_family) {
        case PF_INET:
        case PF_INET6:
                if (so->so_proto->pr_usrreqs->pru_lock != NULL) {
                        NET_LOCK_SHARED();
                        rw_enter_write(&so->so_lock);
                } else
                        NET_LOCK();
                break;
        default:
                rw_enter_write(&so->so_lock);
                break;
        }
}

int
solock_persocket(struct socket *so)
{
        switch (so->so_proto->pr_domain->dom_family) {
        case PF_INET:
        case PF_INET6:
                return 0;
        default:
                return 1;
        }
}

void
solock_pair(struct socket *so1, struct socket *so2)
{
        KASSERT(so1 != so2);
        KASSERT(so1->so_type == so2->so_type);
        KASSERT(solock_persocket(so1));

        if (so1 < so2) {
                solock(so1);
                solock(so2);
        } else {
                solock(so2);
                solock(so1);
        }
}

void
sounlock(struct socket *so)
{
        switch (so->so_proto->pr_domain->dom_family) {
        case PF_INET:
        case PF_INET6:
                NET_UNLOCK();
                break;
        default:
                rw_exit_write(&so->so_lock);
                break;
        }
}

void
sounlock_shared(struct socket *so)
{
        switch (so->so_proto->pr_domain->dom_family) {
        case PF_INET:
        case PF_INET6:
                if (so->so_proto->pr_usrreqs->pru_unlock != NULL) {
                        rw_exit_write(&so->so_lock);
                        NET_UNLOCK_SHARED();
                } else
                        NET_UNLOCK();
                break;
        default:
                rw_exit_write(&so->so_lock);
                break;
        }
}

void
soassertlocked(struct socket *so)
{
        switch (so->so_proto->pr_domain->dom_family) {
        case PF_INET:
        case PF_INET6:
                NET_ASSERT_LOCKED();
                break;
        default:
                rw_assert_wrlock(&so->so_lock);
                break;
        }
}

int
sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
    uint64_t nsecs)
{
        int ret;

        switch (so->so_proto->pr_domain->dom_family) {
        case PF_INET:
        case PF_INET6:
                if (so->so_proto->pr_usrreqs->pru_unlock != NULL &&
                    rw_status(&netlock) == RW_READ) {
                        rw_exit_write(&so->so_lock);
                }
                ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
                if (so->so_proto->pr_usrreqs->pru_lock != NULL &&
                    rw_status(&netlock) == RW_READ) {
                        rw_enter_write(&so->so_lock);
                }
                break;
        default:
                ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
                break;
        }

        return ret;
}
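
/*
 * Illustration only: a hedged sketch of locking two per-socket-locked
 * sockets (e.g. a unix(4) pair) without deadlock.  solock_pair() above
 * encapsulates the address-ordering rule; `a' and `b' are hypothetical.
 *
 *	solock_pair(a, b);	// always locks the lower address first
 *	...			// operate on both sockets
 *	sounlock(b);
 *	sounlock(a);
 */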

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct socket *so, struct sockbuf *sb)
{
        int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;

        soassertlocked(so);

        sb->sb_flags |= SB_WAIT;
        return sosleep_nsec(so, &sb->sb_cc, prio, "netio", sb->sb_timeo_nsecs);
}

int
sblock(struct socket *so, struct sockbuf *sb, int flags)
{
        int error, prio = PSOCK;

        soassertlocked(so);

        if ((sb->sb_flags & SB_LOCK) == 0) {
                sb->sb_flags |= SB_LOCK;
                return (0);
        }
        if ((flags & SBL_WAIT) == 0)
                return (EWOULDBLOCK);
        if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
                prio |= PCATCH;

        while (sb->sb_flags & SB_LOCK) {
                sb->sb_flags |= SB_WANT;
                error = sosleep_nsec(so, &sb->sb_flags, prio, "netlck", INFSLP);
                if (error)
                        return (error);
        }
        sb->sb_flags |= SB_LOCK;
        return (0);
}

void
sbunlock(struct socket *so, struct sockbuf *sb)
{
        soassertlocked(so);

        sb->sb_flags &= ~SB_LOCK;
        if (sb->sb_flags & SB_WANT) {
                sb->sb_flags &= ~SB_WANT;
                wakeup(&sb->sb_flags);
        }
}

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb)
{
        soassertlocked(so);

        if (sb->sb_flags & SB_WAIT) {
                sb->sb_flags &= ~SB_WAIT;
                wakeup(&sb->sb_cc);
        }
        if (sb->sb_flags & SB_ASYNC)
                pgsigio(&so->so_sigio, SIGIO, 0);
        knote_locked(&sb->sb_klist, 0);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
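
/*
 * Illustration only: a hedged sketch of the sblock()/sbwait()/sbunlock()
 * pattern used by code that must keep a socket buffer consistent across a
 * sleep, in the spirit of soreceive().  Error handling is abbreviated.
 *
 *	if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
 *		return (error);
 *	while (so->so_rcv.sb_cc == 0) {
 *		if ((error = sbwait(so, &so->so_rcv)) != 0)
 *			break;
 *	}
 *	...				// consume data from sb_mb
 *	sbunlock(so, &so->so_rcv);
 */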

int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
        soassertlocked(so);

        if (sbreserve(so, &so->so_snd, sndcc))
                goto bad;
        if (sbreserve(so, &so->so_rcv, rcvcc))
                goto bad2;
        so->so_snd.sb_wat = sndcc;
        so->so_rcv.sb_wat = rcvcc;
        if (so->so_rcv.sb_lowat == 0)
                so->so_rcv.sb_lowat = 1;
        if (so->so_snd.sb_lowat == 0)
                so->so_snd.sb_lowat = MCLBYTES;
        if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
                so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
        return (0);
bad2:
        sbrelease(so, &so->so_snd);
bad:
        return (ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
{
        KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
        soassertlocked(so);

        if (cc == 0 || cc > sb_max)
                return (1);
        sb->sb_hiwat = cc;
        sb->sb_mbmax = max(3 * MAXMCLBYTES, cc * 8);
        if (sb->sb_lowat > sb->sb_hiwat)
                sb->sb_lowat = sb->sb_hiwat;
        return (0);
}

/*
 * In a low memory situation, do not accept any request greater than normal.
 */
int
sbcheckreserve(u_long cnt, u_long defcnt)
{
        if (cnt > defcnt && sbchecklowmem())
                return (ENOBUFS);
        return (0);
}

int
sbchecklowmem(void)
{
        static int sblowmem;
        unsigned int used = m_pool_used();

        if (used < 60)
                sblowmem = 0;
        else if (used > 80)
                sblowmem = 1;

        return (sblowmem);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease(struct socket *so, struct sockbuf *sb)
{

        sbflush(so, sb);
        sb->sb_hiwat = sb->sb_mbmax = 0;
}

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copym for output to a peer, and
 * then removed from the socket buffer with sbdrop() or sbdroprecord()
 * when the data is acknowledged by the peer.
 */
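
/*
 * Illustration only: a hedged sketch of the send-buffer usage described
 * above for a reliable protocol.  `off', `len' and `acked' are
 * hypothetical; sbspace(), sbappend() and sbdrop() are defined in this
 * file.
 *
 *	// queue new data only if the buffer has room for it
 *	if (m->m_pkthdr.len <= sbspace(so, &so->so_snd))
 *		sbappend(so, &so->so_snd, m);
 *
 *	// transmit a copy, keeping the original for retransmission
 *	n = m_copym(so->so_snd.sb_mb, off, len, M_DONTWAIT);
 *
 *	// drop `acked' bytes once the peer acknowledges them
 *	sbdrop(so, &so->so_snd, acked);
 */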

#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
        struct mbuf *m = sb->sb_mb;

        while (m && m->m_nextpkt)
                m = m->m_nextpkt;

        if (m != sb->sb_lastrecord) {
                printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
                    sb->sb_mb, sb->sb_lastrecord, m);
                printf("packet chain:\n");
                for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
                        printf("\t%p\n", m);
                panic("sblastrecordchk from %s", where);
        }
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
        struct mbuf *m = sb->sb_mb;
        struct mbuf *n;

        while (m && m->m_nextpkt)
                m = m->m_nextpkt;

        while (m && m->m_next)
                m = m->m_next;

        if (m != sb->sb_mbtail) {
                printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
                    sb->sb_mb, sb->sb_mbtail, m);
                printf("packet tree:\n");
                for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
                        printf("\t");
                        for (n = m; n != NULL; n = n->m_next)
                                printf("%p ", n);
                        printf("\n");
                }
                panic("sblastmbufchk from %s", where);
        }
}
#endif /* SOCKBUF_DEBUG */

#define SBLINKRECORD(sb, m0)                                            \
do {                                                                    \
        if ((sb)->sb_lastrecord != NULL)                                \
                (sb)->sb_lastrecord->m_nextpkt = (m0);                  \
        else                                                            \
                (sb)->sb_mb = (m0);                                     \
        (sb)->sb_lastrecord = (m0);                                     \
} while (/*CONSTCOND*/0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * with the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
        struct mbuf *n;

        if (m == NULL)
                return;

        soassertlocked(so);
        SBLASTRECORDCHK(sb, "sbappend 1");

        if ((n = sb->sb_lastrecord) != NULL) {
                /*
                 * XXX Would like to simply use sb_mbtail here, but
                 * XXX I need to verify that I won't miss an EOR that
                 * XXX way.
                 */
                do {
                        if (n->m_flags & M_EOR) {
                                sbappendrecord(so, sb, m); /* XXXXXX!!!! */
                                return;
                        }
                } while (n->m_next && (n = n->m_next));
        } else {
                /*
                 * If this is the first record in the socket buffer, it's
                 * also the last record.
                 */
                sb->sb_lastrecord = m;
        }
        sbcompress(so, sb, m, n);
        SBLASTRECORDCHK(sb, "sbappend 2");
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
        KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
        soassertlocked(so);
        KDASSERT(m->m_nextpkt == NULL);
        KASSERT(sb->sb_mb == sb->sb_lastrecord);

        SBLASTMBUFCHK(sb, __func__);

        sbcompress(so, sb, m, sb->sb_mbtail);

        sb->sb_lastrecord = sb->sb_mb;
        SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct socket *so, struct sockbuf *sb)
{
        struct mbuf *m, *n;
        u_long len = 0, mbcnt = 0;

        for (m = sb->sb_mb; m; m = m->m_nextpkt) {
                for (n = m; n; n = n->m_next) {
                        len += n->m_len;
                        mbcnt += MSIZE;
                        if (n->m_flags & M_EXT)
                                mbcnt += n->m_ext.ext_size;
                        if (m != n && n->m_nextpkt)
                                panic("sbcheck nextpkt");
                }
        }
        if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
                printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
                    mbcnt, sb->sb_mbcnt);
                panic("sbcheck");
        }
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct socket *so, struct sockbuf *sb, struct mbuf *m0)
{
        struct mbuf *m;

        KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
        soassertlocked(so);

        if (m0 == NULL)
                return;

        /*
         * Put the first mbuf on the queue.
         * Note this permits zero length records.
         */
        sballoc(so, sb, m0);
        SBLASTRECORDCHK(sb, "sbappendrecord 1");
        SBLINKRECORD(sb, m0);
        m = m0->m_next;
        m0->m_next = NULL;
        if (m && (m0->m_flags & M_EOR)) {
                m0->m_flags &= ~M_EOR;
                m->m_flags |= M_EOR;
        }
        sbcompress(so, sb, m, m0);
        SBLASTRECORDCHK(sb, "sbappendrecord 2");
}

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct socket *so, struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
        struct mbuf *m, *n, *nlast;
        int space = asa->sa_len;

        soassertlocked(so);

        if (m0 && (m0->m_flags & M_PKTHDR) == 0)
                panic("sbappendaddr");
        if (m0)
                space += m0->m_pkthdr.len;
        for (n = control; n; n = n->m_next) {
                space += n->m_len;
                if (n->m_next == NULL)  /* keep pointer to last control buf */
                        break;
        }
        if (space > sbspace(so, sb))
                return (0);
        if (asa->sa_len > MLEN)
                return (0);
        MGET(m, M_DONTWAIT, MT_SONAME);
        if (m == NULL)
                return (0);
        m->m_len = asa->sa_len;
        memcpy(mtod(m, caddr_t), asa, asa->sa_len);
        if (n)
                n->m_next = m0;         /* concatenate data to control */
        else
                control = m0;
        m->m_next = control;

        SBLASTRECORDCHK(sb, "sbappendaddr 1");

        for (n = m; n->m_next != NULL; n = n->m_next)
                sballoc(so, sb, n);
        sballoc(so, sb, n);
        nlast = n;
        SBLINKRECORD(sb, m);

        sb->sb_mbtail = nlast;
        SBLASTMBUFCHK(sb, "sbappendaddr");

        SBLASTRECORDCHK(sb, "sbappendaddr 2");

        return (1);
}

int
sbappendcontrol(struct socket *so, struct sockbuf *sb, struct mbuf *m0,
    struct mbuf *control)
{
        struct mbuf *m, *mlast, *n;
        int eor = 0, space = 0;

        if (control == NULL)
                panic("sbappendcontrol");
        for (m = control; ; m = m->m_next) {
                space += m->m_len;
                if (m->m_next == NULL)
                        break;
        }
        n = m;                  /* save pointer to last control buffer */
        for (m = m0; m; m = m->m_next) {
                space += m->m_len;
                eor |= m->m_flags & M_EOR;
                if (eor) {
                        if (m->m_next == NULL)
                                m->m_flags |= M_EOR;
                        else
                                m->m_flags &= ~M_EOR;
                }
        }
        if (space > sbspace(so, sb))
                return (0);
        n->m_next = m0;         /* concatenate data to control */

        SBLASTRECORDCHK(sb, "sbappendcontrol 1");

        for (m = control; m->m_next != NULL; m = m->m_next)
                sballoc(so, sb, m);
        sballoc(so, sb, m);
        mlast = m;
        SBLINKRECORD(sb, control);

        sb->sb_mbtail = mlast;
        SBLASTMBUFCHK(sb, "sbappendcontrol");

        SBLASTRECORDCHK(sb, "sbappendcontrol 2");

        return (1);
}
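
/*
 * Illustration only: a hedged sketch of a datagram protocol's input path
 * delivering a packet with sbappendaddr(), in the spirit of udp_input().
 * `srcsa' (the sender's address) and the drop path are hypothetical.
 *
 *	if (sbappendaddr(so, &so->so_rcv, srcsa, m, NULL) == 0) {
 *		m_freem(m);		// no space in the receive buffer
 *		return;
 *	}
 *	sorwakeup(so);			// wake readers, deliver SIGIO/kevent
 */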

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct socket *so, struct sockbuf *sb, struct mbuf *m,
    struct mbuf *n)
{
        int eor = 0;
        struct mbuf *o;

        while (m) {
                eor |= m->m_flags & M_EOR;
                if (m->m_len == 0 &&
                    (eor == 0 ||
                    (((o = m->m_next) || (o = n)) &&
                    o->m_type == m->m_type))) {
                        if (sb->sb_lastrecord == m)
                                sb->sb_lastrecord = m->m_next;
                        m = m_free(m);
                        continue;
                }
                if (n && (n->m_flags & M_EOR) == 0 &&
                    /* m_trailingspace() checks buffer writeability */
                    m->m_len <= ((n->m_flags & M_EXT)? n->m_ext.ext_size :
                    MCLBYTES) / 4 && /* XXX Don't copy too much */
                    m->m_len <= m_trailingspace(n) &&
                    n->m_type == m->m_type) {
                        memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
                            m->m_len);
                        n->m_len += m->m_len;
                        sb->sb_cc += m->m_len;
                        if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
                                sb->sb_datacc += m->m_len;
                        m = m_free(m);
                        continue;
                }
                if (n)
                        n->m_next = m;
                else
                        sb->sb_mb = m;
                sb->sb_mbtail = m;
                sballoc(so, sb, m);
                n = m;
                m->m_flags &= ~M_EOR;
                m = m->m_next;
                n->m_next = NULL;
        }
        if (eor) {
                if (n)
                        n->m_flags |= eor;
                else
                        printf("semi-panic: sbcompress\n");
        }
        SBLASTMBUFCHK(sb, __func__);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct socket *so, struct sockbuf *sb)
{
        KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
        KASSERT((sb->sb_flags & SB_LOCK) == 0);

        while (sb->sb_mbcnt)
                sbdrop(so, sb, (int)sb->sb_cc);

        KASSERT(sb->sb_cc == 0);
        KASSERT(sb->sb_datacc == 0);
        KASSERT(sb->sb_mb == NULL);
        KASSERT(sb->sb_mbtail == NULL);
        KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct socket *so, struct sockbuf *sb, int len)
{
        struct mbuf *m, *mn;
        struct mbuf *next;

        KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
        soassertlocked(so);

        next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
        while (len > 0) {
                if (m == NULL) {
                        if (next == NULL)
                                panic("sbdrop");
                        m = next;
                        next = m->m_nextpkt;
                        continue;
                }
                if (m->m_len > len) {
                        m->m_len -= len;
                        m->m_data += len;
                        sb->sb_cc -= len;
                        if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
                                sb->sb_datacc -= len;
                        break;
                }
                len -= m->m_len;
                sbfree(so, sb, m);
                mn = m_free(m);
                m = mn;
        }
        while (m && m->m_len == 0) {
                sbfree(so, sb, m);
                mn = m_free(m);
                m = mn;
        }
        if (m) {
                sb->sb_mb = m;
                m->m_nextpkt = next;
        } else
                sb->sb_mb = next;
        /*
         * First part is an inline SB_EMPTY_FIXUP().  Second part
         * makes sure sb_lastrecord is up-to-date if we dropped
         * part of the last record.
         */
        m = sb->sb_mb;
        if (m == NULL) {
                sb->sb_mbtail = NULL;
                sb->sb_lastrecord = NULL;
        } else if (m->m_nextpkt == NULL)
                sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct socket *so, struct sockbuf *sb)
{
        struct mbuf *m, *mn;

        m = sb->sb_mb;
        if (m) {
                sb->sb_mb = m->m_nextpkt;
                do {
                        sbfree(so, sb, m);
                        mn = m_free(m);
                } while ((m = mn) != NULL);
        }
        SB_EMPTY_FIXUP(sb);
}

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(const void *p, size_t size, int type, int level)
{
        struct cmsghdr *cp;
        struct mbuf *m;

        if (CMSG_SPACE(size) > MCLBYTES) {
                printf("sbcreatecontrol: message too large %zu\n", size);
                return (NULL);
        }

        if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
                return (NULL);
        if (CMSG_SPACE(size) > MLEN) {
                MCLGET(m, M_DONTWAIT);
                if ((m->m_flags & M_EXT) == 0) {
                        m_free(m);
                        return NULL;
                }
        }
        cp = mtod(m, struct cmsghdr *);
        memset(cp, 0, CMSG_SPACE(size));
        memcpy(CMSG_DATA(cp), p, size);
        m->m_len = CMSG_SPACE(size);
        cp->cmsg_len = CMSG_LEN(size);
        cp->cmsg_level = level;
        cp->cmsg_type = type;
        return (m);
}
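
/*
 * Illustration only: a hedged sketch of building ancillary data with
 * sbcreatecontrol(), e.g. the receive timestamp the inet code attaches
 * when SO_TIMESTAMP is set.  The surrounding context is hypothetical.
 *
 *	struct timeval tv;
 *	struct mbuf *control;
 *
 *	microtime(&tv);
 *	control = sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP,
 *	    SOL_SOCKET);
 *	if (control == NULL)
 *		...			// drop, or deliver without it
 *	// `control' may then be passed to sbappendaddr()
 */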