1 /* 2 * Copyright (c) 2005 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 1982, 1986, 1988, 1990, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. All advertising materials mentioning features or use of this software 15 * must display the following acknowledgement: 16 * This product includes software developed by the University of 17 * California, Berkeley and its contributors. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 35 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $ 36 * $DragonFly: src/sys/kern/uipc_socket2.c,v 1.22 2005/07/23 07:28:34 dillon Exp $ 37 */ 38 39 #include "opt_param.h" 40 #include <sys/param.h> 41 #include <sys/systm.h> 42 #include <sys/domain.h> 43 #include <sys/file.h> /* for maxfiles */ 44 #include <sys/kernel.h> 45 #include <sys/proc.h> 46 #include <sys/malloc.h> 47 #include <sys/mbuf.h> 48 #include <sys/protosw.h> 49 #include <sys/resourcevar.h> 50 #include <sys/stat.h> 51 #include <sys/socket.h> 52 #include <sys/socketvar.h> 53 #include <sys/signalvar.h> 54 #include <sys/sysctl.h> 55 #include <sys/aio.h> /* for aio_swake proto */ 56 #include <sys/event.h> 57 58 #include <sys/thread2.h> 59 #include <sys/msgport2.h> 60 61 int maxsockets; 62 63 /* 64 * Primitive routines for operating on sockets and socket buffers 65 */ 66 67 u_long sb_max = SB_MAX; 68 u_long sb_max_adj = 69 SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */ 70 71 static u_long sb_efficiency = 8; /* parameter for sbreserve() */ 72 73 /* 74 * Procedures to manipulate state flags of socket 75 * and do appropriate wakeups. Normal sequence from the 76 * active (originating) side is that soisconnecting() is 77 * called during processing of connect() call, 78 * resulting in an eventual call to soisconnected() if/when the 79 * connection is established. When the connection is torn down 80 * soisdisconnecting() is called during processing of disconnect() call, 81 * and soisdisconnected() is called when the connection to the peer 82 * is totally severed. The semantics of these routines are such that 83 * connectionless protocols can call soisconnected() and soisdisconnected() 84 * only, bypassing the in-progress calls when setting up a ``connection'' 85 * takes no time. 86 * 87 * From the passive side, a socket is created with 88 * two queues of sockets: so_incomp for connections in progress 89 * and so_comp for connections already made and awaiting user acceptance. 90 * As a protocol is preparing incoming connections, it creates a socket 91 * structure queued on so_incomp by calling sonewconn(). When the connection 92 * is established, soisconnected() is called, and transfers the 93 * socket structure to so_comp, making it available to accept(). 94 * 95 * If a socket is closed with sockets on either 96 * so_incomp or so_comp, these sockets are dropped. 97 * 98 * If higher level protocols are implemented in 99 * the kernel, the wakeups done here will sometimes 100 * cause software-interrupt process scheduling. 101 */ 102 103 void 104 soisconnecting(so) 105 struct socket *so; 106 { 107 108 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 109 so->so_state |= SS_ISCONNECTING; 110 } 111 112 void 113 soisconnected(so) 114 struct socket *so; 115 { 116 struct socket *head = so->so_head; 117 118 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); 119 so->so_state |= SS_ISCONNECTED; 120 if (head && (so->so_state & SS_INCOMP)) { 121 if ((so->so_options & SO_ACCEPTFILTER) != 0) { 122 so->so_upcall = head->so_accf->so_accept_filter->accf_callback; 123 so->so_upcallarg = head->so_accf->so_accept_filter_arg; 124 so->so_rcv.sb_flags |= SB_UPCALL; 125 so->so_options &= ~SO_ACCEPTFILTER; 126 so->so_upcall(so, so->so_upcallarg, 0); 127 return; 128 } 129 TAILQ_REMOVE(&head->so_incomp, so, so_list); 130 head->so_incqlen--; 131 so->so_state &= ~SS_INCOMP; 132 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); 133 head->so_qlen++; 134 so->so_state |= SS_COMP; 135 sorwakeup(head); 136 wakeup_one(&head->so_timeo); 137 } else { 138 wakeup(&so->so_timeo); 139 sorwakeup(so); 140 sowwakeup(so); 141 } 142 } 143 144 void 145 soisdisconnecting(so) 146 struct socket *so; 147 { 148 149 so->so_state &= ~SS_ISCONNECTING; 150 so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); 151 wakeup((caddr_t)&so->so_timeo); 152 sowwakeup(so); 153 sorwakeup(so); 154 } 155 156 void 157 soisdisconnected(so) 158 struct socket *so; 159 { 160 161 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 162 so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); 163 wakeup((caddr_t)&so->so_timeo); 164 sbdrop(&so->so_snd, so->so_snd.sb_cc); 165 sowwakeup(so); 166 sorwakeup(so); 167 } 168 169 /* 170 * When an attempt at a new connection is noted on a socket 171 * which accepts connections, sonewconn is called. If the 172 * connection is possible (subject to space constraints, etc.) 173 * then we allocate a new structure, propoerly linked into the 174 * data structure of the original socket, and return this. 175 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. 176 */ 177 struct socket * 178 sonewconn(struct socket *head, int connstatus) 179 { 180 struct socket *so; 181 struct pru_attach_info ai; 182 183 if (head->so_qlen > 3 * head->so_qlimit / 2) 184 return ((struct socket *)0); 185 so = soalloc(0); 186 if (so == NULL) 187 return ((struct socket *)0); 188 if ((head->so_options & SO_ACCEPTFILTER) != 0) 189 connstatus = 0; 190 so->so_head = head; 191 so->so_type = head->so_type; 192 so->so_options = head->so_options &~ SO_ACCEPTCONN; 193 so->so_linger = head->so_linger; 194 so->so_state = head->so_state | SS_NOFDREF; 195 so->so_proto = head->so_proto; 196 so->so_timeo = head->so_timeo; 197 so->so_cred = crhold(head->so_cred); 198 ai.sb_rlimit = NULL; 199 ai.p_ucred = NULL; 200 ai.fd_rdir = NULL; /* jail code cruft XXX JH */ 201 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat, NULL) || 202 /* Directly call function since we're already at protocol level. */ 203 (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, &ai)) { 204 sodealloc(so); 205 return ((struct socket *)0); 206 } 207 208 if (connstatus) { 209 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); 210 so->so_state |= SS_COMP; 211 head->so_qlen++; 212 } else { 213 if (head->so_incqlen > head->so_qlimit) { 214 struct socket *sp; 215 sp = TAILQ_FIRST(&head->so_incomp); 216 (void) soabort(sp); 217 } 218 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); 219 so->so_state |= SS_INCOMP; 220 head->so_incqlen++; 221 } 222 if (connstatus) { 223 sorwakeup(head); 224 wakeup((caddr_t)&head->so_timeo); 225 so->so_state |= connstatus; 226 } 227 return (so); 228 } 229 230 /* 231 * Socantsendmore indicates that no more data will be sent on the 232 * socket; it would normally be applied to a socket when the user 233 * informs the system that no more data is to be sent, by the protocol 234 * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data 235 * will be received, and will normally be applied to the socket by a 236 * protocol when it detects that the peer will send no more data. 237 * Data queued for reading in the socket may yet be read. 238 */ 239 240 void 241 socantsendmore(so) 242 struct socket *so; 243 { 244 245 so->so_state |= SS_CANTSENDMORE; 246 sowwakeup(so); 247 } 248 249 void 250 socantrcvmore(so) 251 struct socket *so; 252 { 253 254 so->so_state |= SS_CANTRCVMORE; 255 sorwakeup(so); 256 } 257 258 /* 259 * Wait for data to arrive at/drain from a socket buffer. 260 */ 261 int 262 sbwait(sb) 263 struct sockbuf *sb; 264 { 265 266 sb->sb_flags |= SB_WAIT; 267 return (tsleep((caddr_t)&sb->sb_cc, 268 ((sb->sb_flags & SB_NOINTR) ? 0 : PCATCH), 269 "sbwait", 270 sb->sb_timeo)); 271 } 272 273 /* 274 * Lock a sockbuf already known to be locked; 275 * return any error returned from sleep (EINTR). 276 */ 277 int 278 sb_lock(sb) 279 struct sockbuf *sb; 280 { 281 int error; 282 283 while (sb->sb_flags & SB_LOCK) { 284 sb->sb_flags |= SB_WANT; 285 error = tsleep((caddr_t)&sb->sb_flags, 286 ((sb->sb_flags & SB_NOINTR) ? 0 : PCATCH), 287 "sblock", 0); 288 if (error) 289 return (error); 290 } 291 sb->sb_flags |= SB_LOCK; 292 return (0); 293 } 294 295 /* 296 * Wakeup processes waiting on a socket buffer. Do asynchronous notification 297 * via SIGIO if the socket has the SS_ASYNC flag set. 298 */ 299 void 300 sowakeup(so, sb) 301 struct socket *so; 302 struct sockbuf *sb; 303 { 304 struct selinfo *selinfo = &sb->sb_sel; 305 306 selwakeup(selinfo); 307 sb->sb_flags &= ~SB_SEL; 308 if (sb->sb_flags & SB_WAIT) { 309 sb->sb_flags &= ~SB_WAIT; 310 wakeup((caddr_t)&sb->sb_cc); 311 } 312 if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) 313 pgsigio(so->so_sigio, SIGIO, 0); 314 if (sb->sb_flags & SB_UPCALL) 315 (*so->so_upcall)(so, so->so_upcallarg, MB_DONTWAIT); 316 if (sb->sb_flags & SB_AIO) 317 aio_swake(so, sb); 318 KNOTE(&selinfo->si_note, 0); 319 if (sb->sb_flags & SB_MEVENT) { 320 struct netmsg_so_notify *msg, *nmsg; 321 322 TAILQ_FOREACH_MUTABLE(msg, &selinfo->si_mlist, nm_list, nmsg) { 323 if (msg->nm_predicate((struct netmsg *)msg)) { 324 TAILQ_REMOVE(&selinfo->si_mlist, msg, nm_list); 325 lwkt_replymsg(&msg->nm_lmsg, 326 msg->nm_lmsg.ms_error); 327 } 328 } 329 if (TAILQ_EMPTY(&sb->sb_sel.si_mlist)) 330 sb->sb_flags &= ~SB_MEVENT; 331 } 332 } 333 334 /* 335 * Socket buffer (struct sockbuf) utility routines. 336 * 337 * Each socket contains two socket buffers: one for sending data and 338 * one for receiving data. Each buffer contains a queue of mbufs, 339 * information about the number of mbufs and amount of data in the 340 * queue, and other fields allowing select() statements and notification 341 * on data availability to be implemented. 342 * 343 * Data stored in a socket buffer is maintained as a list of records. 344 * Each record is a list of mbufs chained together with the m_next 345 * field. Records are chained together with the m_nextpkt field. The upper 346 * level routine soreceive() expects the following conventions to be 347 * observed when placing information in the receive buffer: 348 * 349 * 1. If the protocol requires each message be preceded by the sender's 350 * name, then a record containing that name must be present before 351 * any associated data (mbuf's must be of type MT_SONAME). 352 * 2. If the protocol supports the exchange of ``access rights'' (really 353 * just additional data associated with the message), and there are 354 * ``rights'' to be received, then a record containing this data 355 * should be present (mbuf's must be of type MT_RIGHTS). 356 * 3. If a name or rights record exists, then it must be followed by 357 * a data record, perhaps of zero length. 358 * 359 * Before using a new socket structure it is first necessary to reserve 360 * buffer space to the socket, by calling sbreserve(). This should commit 361 * some of the available buffer space in the system buffer pool for the 362 * socket (currently, it does nothing but enforce limits). The space 363 * should be released by calling sbrelease() when the socket is destroyed. 364 */ 365 366 int 367 soreserve(struct socket *so, u_long sndcc, u_long rcvcc, struct rlimit *rl) 368 { 369 if (sbreserve(&so->so_snd, sndcc, so, rl) == 0) 370 goto bad; 371 if (sbreserve(&so->so_rcv, rcvcc, so, rl) == 0) 372 goto bad2; 373 if (so->so_rcv.sb_lowat == 0) 374 so->so_rcv.sb_lowat = 1; 375 if (so->so_snd.sb_lowat == 0) 376 so->so_snd.sb_lowat = MCLBYTES; 377 if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) 378 so->so_snd.sb_lowat = so->so_snd.sb_hiwat; 379 return (0); 380 bad2: 381 sbrelease(&so->so_snd, so); 382 bad: 383 return (ENOBUFS); 384 } 385 386 static int 387 sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS) 388 { 389 int error = 0; 390 u_long old_sb_max = sb_max; 391 392 error = SYSCTL_OUT(req, arg1, sizeof(int)); 393 if (error || !req->newptr) 394 return (error); 395 error = SYSCTL_IN(req, arg1, sizeof(int)); 396 if (error) 397 return (error); 398 if (sb_max < MSIZE + MCLBYTES) { 399 sb_max = old_sb_max; 400 return (EINVAL); 401 } 402 sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES); 403 return (0); 404 } 405 406 /* 407 * Allot mbufs to a sockbuf. 408 * Attempt to scale mbmax so that mbcnt doesn't become limiting 409 * if buffering efficiency is near the normal case. 410 */ 411 int 412 sbreserve(struct sockbuf *sb, u_long cc, struct socket *so, struct rlimit *rl) 413 { 414 415 /* 416 * rl will only be NULL when we're in an interrupt (eg, in tcp_input) 417 * or when called from netgraph (ie, ngd_attach) 418 */ 419 if (cc > sb_max_adj) 420 return (0); 421 if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc, 422 rl ? rl->rlim_cur : RLIM_INFINITY)) { 423 return (0); 424 } 425 sb->sb_mbmax = min(cc * sb_efficiency, sb_max); 426 if (sb->sb_lowat > sb->sb_hiwat) 427 sb->sb_lowat = sb->sb_hiwat; 428 return (1); 429 } 430 431 /* 432 * Free mbufs held by a socket, and reserved mbuf space. 433 */ 434 void 435 sbrelease(sb, so) 436 struct sockbuf *sb; 437 struct socket *so; 438 { 439 440 sbflush(sb); 441 (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0, 442 RLIM_INFINITY); 443 sb->sb_mbmax = 0; 444 } 445 446 /* 447 * Routines to add and remove 448 * data from an mbuf queue. 449 * 450 * The routines sbappend() or sbappendrecord() are normally called to 451 * append new mbufs to a socket buffer, after checking that adequate 452 * space is available, comparing the function sbspace() with the amount 453 * of data to be added. sbappendrecord() differs from sbappend() in 454 * that data supplied is treated as the beginning of a new record. 455 * To place a sender's address, optional access rights, and data in a 456 * socket receive buffer, sbappendaddr() should be used. To place 457 * access rights and data in a socket receive buffer, sbappendrights() 458 * should be used. In either case, the new data begins a new record. 459 * Note that unlike sbappend() and sbappendrecord(), these routines check 460 * for the caller that there will be enough space to store the data. 461 * Each fails if there is not enough space, or if it cannot find mbufs 462 * to store additional information in. 463 * 464 * Reliable protocols may use the socket send buffer to hold data 465 * awaiting acknowledgement. Data is normally copied from a socket 466 * send buffer in a protocol with m_copy for output to a peer, 467 * and then removing the data from the socket buffer with sbdrop() 468 * or sbdroprecord() when the data is acknowledged by the peer. 469 */ 470 471 /* 472 * Append mbuf chain m to the last record in the 473 * socket buffer sb. The additional space associated 474 * the mbuf chain is recorded in sb. Empty mbufs are 475 * discarded and mbufs are compacted where possible. 476 */ 477 void 478 sbappend(struct sockbuf *sb, struct mbuf *m) 479 { 480 struct mbuf *n; 481 482 if (m) { 483 n = sb->sb_mb; 484 if (n) { 485 while (n->m_nextpkt) 486 n = n->m_nextpkt; 487 do { 488 if (n->m_flags & M_EOR) { 489 /* XXXXXX!!!! */ 490 sbappendrecord(sb, m); 491 return; 492 } 493 } while (n->m_next && (n = n->m_next)); 494 } 495 sbcompress(sb, m, n); 496 } 497 } 498 499 /* 500 * sbappendstream() is an optimized form of sbappend() for protocols 501 * such as TCP that only have one record in the socket buffer, are 502 * not PR_ATOMIC, nor allow MT_CONTROL data. A protocol that uses 503 * sbappendstream() must use sbappendstream() exclusively. 504 */ 505 void 506 sbappendstream(struct sockbuf *sb, struct mbuf *m) 507 { 508 KKASSERT(m->m_nextpkt == NULL); 509 sbcompress(sb, m, sb->sb_lastmbuf); 510 } 511 512 #ifdef SOCKBUF_DEBUG 513 514 void 515 _sbcheck(struct sockbuf *sb) 516 { 517 struct mbuf *m; 518 struct mbuf *n = NULL; 519 u_long len = 0, mbcnt = 0; 520 521 for (m = sb->sb_mb; m; m = n) { 522 n = m->m_nextpkt; 523 if (n == NULL && sb->sb_lastrecord != m) { 524 printf("sockbuf %p mismatched lastrecord %p vs %p\n", sb, sb->sb_lastrecord, m); 525 panic("sbcheck1"); 526 527 } 528 for (; m; m = m->m_next) { 529 len += m->m_len; 530 mbcnt += MSIZE; 531 if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ 532 mbcnt += m->m_ext.ext_size; 533 if (n == NULL && m->m_next == NULL) { 534 if (sb->sb_lastmbuf != m) { 535 printf("sockbuf %p mismatched lastmbuf %p vs %p\n", sb, sb->sb_lastmbuf, m); 536 panic("sbcheck2"); 537 } 538 } 539 } 540 } 541 if (sb->sb_mb == NULL) { 542 if (sb->sb_lastrecord != NULL) { 543 printf("sockbuf %p is empty, lastrecord not NULL: %p\n", 544 sb, sb->sb_lastrecord); 545 panic("sbcheck3"); 546 } 547 if (sb->sb_lastmbuf != NULL) { 548 printf("sockbuf %p is empty, lastmbuf not NULL: %p\n", 549 sb, sb->sb_lastmbuf); 550 panic("sbcheck4"); 551 } 552 } 553 if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { 554 printf("sockbuf %p cc %ld != %ld || mbcnt %ld != %ld\n", 555 sb, len, sb->sb_cc, mbcnt, sb->sb_mbcnt); 556 panic("sbcheck5"); 557 } 558 } 559 560 #endif 561 562 /* 563 * Same as sbappend(), except the mbuf chain begins a new record. 564 */ 565 void 566 sbappendrecord(struct sockbuf *sb, struct mbuf *m0) 567 { 568 struct mbuf *firstmbuf; 569 struct mbuf *secondmbuf; 570 571 if (m0 == NULL) 572 return; 573 574 sbcheck(sb); 575 576 /* 577 * Break the first mbuf off from the rest of the mbuf chain. 578 */ 579 firstmbuf = m0; 580 secondmbuf = m0->m_next; 581 m0->m_next = NULL; 582 583 /* 584 * Insert the first mbuf of the m0 mbuf chain as the last record of 585 * the sockbuf. Note this permits zero length records! Keep the 586 * sockbuf state consistent. 587 */ 588 if (sb->sb_mb == NULL) 589 sb->sb_mb = firstmbuf; 590 else 591 sb->sb_lastrecord->m_nextpkt = firstmbuf; 592 sb->sb_lastrecord = firstmbuf; /* update hint for new last record */ 593 sb->sb_lastmbuf = firstmbuf; /* update hint for new last mbuf */ 594 595 if ((firstmbuf->m_flags & M_EOR) && (secondmbuf != NULL)) { 596 /* propagate the EOR flag */ 597 firstmbuf->m_flags &= ~M_EOR; 598 secondmbuf->m_flags |= M_EOR; 599 } 600 601 /* 602 * The succeeding call to sbcompress() omits accounting for 603 * the first mbuf, so do it here. 604 */ 605 sballoc(sb, firstmbuf); 606 607 /* Compact the rest of the mbuf chain in after the first mbuf. */ 608 sbcompress(sb, secondmbuf, firstmbuf); 609 } 610 611 #if 0 612 /* 613 * As above except that OOB data is inserted at the beginning of the sockbuf, 614 * but after any other OOB data. 615 */ 616 void 617 sbinsertoob(struct sockbuf *sb, struct mbuf *m0) 618 { 619 struct mbuf *m; 620 struct mbuf **mp; 621 622 if (m0 == NULL) 623 return; 624 for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { 625 m = *mp; 626 again: 627 switch (m->m_type) { 628 629 case MT_OOBDATA: 630 continue; /* WANT next train */ 631 632 case MT_CONTROL: 633 m = m->m_next; 634 if (m) 635 goto again; /* inspect THIS train further */ 636 } 637 break; 638 } 639 /* 640 * Put the first mbuf on the queue. 641 * Note this permits zero length records. 642 */ 643 sballoc(sb, m0); 644 m0->m_nextpkt = *mp; 645 *mp = m0; 646 if (m0->m_nextpkt == NULL) 647 sb->sb_lastrecord = m0; 648 649 m = m0->m_next; 650 m0->m_next = NULL; 651 if (m && (m0->m_flags & M_EOR)) { 652 m0->m_flags &= ~M_EOR; 653 m->m_flags |= M_EOR; 654 } 655 sbcompress(sb, m, m0); 656 } 657 #endif 658 659 /* 660 * Append address and data, and optionally, control (ancillary) data 661 * to the receive queue of a socket. If present, 662 * m0 must include a packet header with total length. 663 * Returns 0 if no space in sockbuf or insufficient mbufs. 664 */ 665 int 666 sbappendaddr(sb, asa, m0, control) 667 struct sockbuf *sb; 668 const struct sockaddr *asa; 669 struct mbuf *m0, *control; 670 { 671 struct mbuf *m, *n; 672 int space = asa->sa_len; 673 674 if (m0 && (m0->m_flags & M_PKTHDR) == 0) 675 panic("sbappendaddr"); 676 sbcheck(sb); 677 678 if (m0) 679 space += m0->m_pkthdr.len; 680 for (n = control; n; n = n->m_next) { 681 space += n->m_len; 682 if (n->m_next == 0) /* keep pointer to last control buf */ 683 break; 684 } 685 if (space > sbspace(sb)) 686 return (0); 687 if (asa->sa_len > MLEN) 688 return (0); 689 MGET(m, MB_DONTWAIT, MT_SONAME); 690 if (m == NULL) 691 return (0); 692 KKASSERT(m->m_nextpkt == NULL); 693 m->m_len = asa->sa_len; 694 bcopy(asa, mtod(m, caddr_t), asa->sa_len); 695 if (n) 696 n->m_next = m0; /* concatenate data to control */ 697 else 698 control = m0; 699 m->m_next = control; 700 for (n = m; n; n = n->m_next) 701 sballoc(sb, n); 702 703 if (sb->sb_mb == NULL) 704 sb->sb_mb = m; 705 else 706 sb->sb_lastrecord->m_nextpkt = m; 707 sb->sb_lastrecord = m; 708 while (m->m_next) 709 m = m->m_next; 710 sb->sb_lastmbuf = m; 711 712 return (1); 713 } 714 715 /* 716 * Append control information followed by data. 717 * control must be non-null. 718 */ 719 int 720 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) 721 { 722 struct mbuf *n; 723 u_int length, cmbcnt, m0mbcnt; 724 725 KASSERT(control != NULL, ("sbappendcontrol")); 726 KKASSERT(control->m_nextpkt == NULL); 727 sbcheck(sb); 728 729 length = m_countm(control, &n, &cmbcnt) + m_countm(m0, NULL, &m0mbcnt); 730 if (length > sbspace(sb)) 731 return (0); 732 733 n->m_next = m0; /* concatenate data to control */ 734 735 if (sb->sb_mb == NULL) 736 sb->sb_mb = control; 737 else 738 sb->sb_lastrecord->m_nextpkt = control; 739 sb->sb_lastrecord = control; 740 sb->sb_lastmbuf = m0; 741 742 sb->sb_cc += length; 743 sb->sb_mbcnt += cmbcnt + m0mbcnt; 744 745 return (1); 746 } 747 748 /* 749 * Compress mbuf chain m into the socket buffer sb following mbuf tailm. 750 * If tailm is null, the buffer is presumed empty. Also, as a side-effect, 751 * increment the sockbuf counts for each mbuf in the chain. 752 */ 753 void 754 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *tailm) 755 { 756 int eor = 0; 757 struct mbuf *free_chain = NULL; 758 759 sbcheck(sb); 760 while (m) { 761 struct mbuf *o; 762 763 eor |= m->m_flags & M_EOR; 764 /* 765 * Disregard empty mbufs as long as we don't encounter 766 * an end-of-record or there is a trailing mbuf of 767 * the same type to propagate the EOR flag to. 768 * 769 * Defer the m_free() call because it can block and break 770 * the atomicy of the sockbuf. 771 */ 772 if (m->m_len == 0 && 773 (eor == 0 || 774 (((o = m->m_next) || (o = tailm)) && 775 o->m_type == m->m_type))) { 776 o = m->m_next; 777 m->m_next = free_chain; 778 free_chain = m; 779 m = o; 780 continue; 781 } 782 783 /* See if we can coalesce with preceding mbuf. */ 784 if (tailm && !(tailm->m_flags & M_EOR) && M_WRITABLE(tailm) && 785 m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ 786 m->m_len <= M_TRAILINGSPACE(tailm) && 787 tailm->m_type == m->m_type) { 788 bcopy(mtod(m, caddr_t), 789 mtod(tailm, caddr_t) + tailm->m_len, 790 (unsigned)m->m_len); 791 tailm->m_len += m->m_len; 792 sb->sb_cc += m->m_len; /* update sb counter */ 793 o = m->m_next; 794 m->m_next = free_chain; 795 free_chain = m; 796 m = o; 797 continue; 798 } 799 800 /* Insert whole mbuf. */ 801 if (tailm == NULL) { 802 KASSERT(sb->sb_mb == NULL, 803 ("sbcompress: sb_mb not NULL")); 804 sb->sb_mb = m; /* only mbuf in sockbuf */ 805 sb->sb_lastrecord = m; /* new last record */ 806 } else { 807 tailm->m_next = m; /* tack m on following tailm */ 808 } 809 sb->sb_lastmbuf = m; /* update last mbuf hint */ 810 811 tailm = m; /* just inserted mbuf becomes the new tail */ 812 m = m->m_next; /* advance to next mbuf */ 813 tailm->m_next = NULL; /* split inserted mbuf off from chain */ 814 815 /* update sb counters for just added mbuf */ 816 sballoc(sb, tailm); 817 818 /* clear EOR on intermediate mbufs */ 819 tailm->m_flags &= ~M_EOR; 820 } 821 822 /* 823 * Propogate EOR to the last mbuf 824 */ 825 if (eor) { 826 if (tailm) 827 tailm->m_flags |= eor; 828 else 829 printf("semi-panic: sbcompress"); 830 } 831 832 /* 833 * Clean up any defered frees. 834 */ 835 while (free_chain) 836 free_chain = m_free(free_chain); 837 838 sbcheck(sb); 839 } 840 841 /* 842 * Free all mbufs in a sockbuf. 843 * Check that all resources are reclaimed. 844 */ 845 void 846 sbflush(sb) 847 struct sockbuf *sb; 848 { 849 850 if (sb->sb_flags & SB_LOCK) 851 panic("sbflush: locked"); 852 while (sb->sb_mbcnt) { 853 /* 854 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty: 855 * we would loop forever. Panic instead. 856 */ 857 if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len)) 858 break; 859 sbdrop(sb, (int)sb->sb_cc); 860 } 861 KASSERT(!(sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_lastmbuf), 862 ("sbflush: cc %ld || mb %p || mbcnt %ld || lastmbuf %p", 863 sb->sb_cc, sb->sb_mb, sb->sb_mbcnt, sb->sb_lastmbuf)); 864 } 865 866 /* 867 * Drop data from (the front of) a sockbuf. 868 */ 869 void 870 sbdrop(sb, len) 871 struct sockbuf *sb; 872 int len; 873 { 874 struct mbuf *m; 875 struct mbuf *free_chain = NULL; 876 877 sbcheck(sb); 878 crit_enter(); 879 880 /* 881 * Remove mbufs from multiple records until the count is exhausted. 882 */ 883 m = sb->sb_mb; 884 while (m && len > 0) { 885 if (m->m_len > len) { 886 m->m_len -= len; 887 m->m_data += len; 888 sb->sb_cc -= len; 889 break; 890 } 891 len -= m->m_len; 892 m = sbunlinkmbuf(sb, m, &free_chain); 893 if (m == NULL && len) 894 m = sb->sb_mb; 895 } 896 897 /* 898 * Remove any trailing 0-length mbufs in the current record. If 899 * the last record for which data was removed is now empty, m will be 900 * NULL. 901 */ 902 while (m && m->m_len == 0) { 903 m = sbunlinkmbuf(sb, m, &free_chain); 904 } 905 crit_exit(); 906 if (free_chain) 907 m_freem(free_chain); 908 sbcheck(sb); 909 } 910 911 /* 912 * Drop a record off the front of a sockbuf and move the next record 913 * to the front. 914 * 915 * Must be called while holding a critical section. 916 */ 917 void 918 sbdroprecord(sb) 919 struct sockbuf *sb; 920 { 921 struct mbuf *m; 922 struct mbuf *n; 923 924 sbcheck(sb); 925 m = sb->sb_mb; 926 if (m) { 927 if ((sb->sb_mb = m->m_nextpkt) == NULL) { 928 sb->sb_lastrecord = NULL; 929 sb->sb_lastmbuf = NULL; 930 } 931 m->m_nextpkt = NULL; 932 for (n = m; n; n = n->m_next) 933 sbfree(sb, n); 934 m_freem(m); 935 sbcheck(sb); 936 } 937 } 938 939 /* 940 * Drop the first mbuf off the sockbuf and move the next mbuf to the front. 941 * Currently only the head mbuf of the sockbuf may be dropped this way. 942 * 943 * The next mbuf in the same record as the mbuf being removed is returned 944 * or NULL if the record is exhausted. Note that other records may remain 945 * in the sockbuf when NULL is returned. 946 * 947 * Must be called while holding a critical section. 948 */ 949 struct mbuf * 950 sbunlinkmbuf(struct sockbuf *sb, struct mbuf *m, struct mbuf **free_chain) 951 { 952 struct mbuf *n; 953 954 KKASSERT(sb->sb_mb == m); 955 sbfree(sb, m); 956 n = m->m_next; 957 if (n) { 958 sb->sb_mb = n; 959 if (sb->sb_lastrecord == m) 960 sb->sb_lastrecord = n; 961 KKASSERT(sb->sb_lastmbuf != m); 962 n->m_nextpkt = m->m_nextpkt; 963 } else { 964 sb->sb_mb = m->m_nextpkt; 965 if (sb->sb_lastrecord == m) { 966 KKASSERT(sb->sb_mb == NULL); 967 sb->sb_lastrecord = NULL; 968 } 969 if (sb->sb_mb == NULL) 970 sb->sb_lastmbuf = NULL; 971 } 972 m->m_nextpkt = NULL; 973 if (free_chain) { 974 m->m_next = *free_chain; 975 *free_chain = m; 976 } else { 977 m->m_next = NULL; 978 } 979 return(n); 980 } 981 982 /* 983 * Create a "control" mbuf containing the specified data 984 * with the specified type for presentation on a socket buffer. 985 */ 986 struct mbuf * 987 sbcreatecontrol(p, size, type, level) 988 caddr_t p; 989 int size; 990 int type, level; 991 { 992 struct cmsghdr *cp; 993 struct mbuf *m; 994 995 if (CMSG_SPACE((u_int)size) > MCLBYTES) 996 return (NULL); 997 m = m_getl(CMSG_SPACE((u_int)size), MB_DONTWAIT, MT_CONTROL, 0, NULL); 998 if (m == NULL) 999 return (NULL); 1000 m->m_len = CMSG_SPACE(size); 1001 cp = mtod(m, struct cmsghdr *); 1002 if (p != NULL) 1003 memcpy(CMSG_DATA(cp), p, size); 1004 cp->cmsg_len = CMSG_LEN(size); 1005 cp->cmsg_level = level; 1006 cp->cmsg_type = type; 1007 return (m); 1008 } 1009 1010 /* 1011 * Some routines that return EOPNOTSUPP for entry points that are not 1012 * supported by a protocol. Fill in as needed. 1013 */ 1014 int 1015 pru_accept_notsupp(struct socket *so, struct sockaddr **nam) 1016 { 1017 return EOPNOTSUPP; 1018 } 1019 1020 int 1021 pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) 1022 { 1023 return EOPNOTSUPP; 1024 } 1025 1026 int 1027 pru_connect2_notsupp(struct socket *so1, struct socket *so2) 1028 { 1029 return EOPNOTSUPP; 1030 } 1031 1032 int 1033 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, 1034 struct ifnet *ifp, struct thread *td) 1035 { 1036 return EOPNOTSUPP; 1037 } 1038 1039 int 1040 pru_listen_notsupp(struct socket *so, struct thread *td) 1041 { 1042 return EOPNOTSUPP; 1043 } 1044 1045 int 1046 pru_rcvd_notsupp(struct socket *so, int flags) 1047 { 1048 return EOPNOTSUPP; 1049 } 1050 1051 int 1052 pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) 1053 { 1054 return EOPNOTSUPP; 1055 } 1056 1057 /* 1058 * This isn't really a ``null'' operation, but it's the default one 1059 * and doesn't do anything destructive. 1060 */ 1061 int 1062 pru_sense_null(struct socket *so, struct stat *sb) 1063 { 1064 sb->st_blksize = so->so_snd.sb_hiwat; 1065 return 0; 1066 } 1067 1068 /* 1069 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. Callers 1070 * of this routine assume that it always succeeds, so we have to use a 1071 * blockable allocation even though we might be called from a critical thread. 1072 */ 1073 struct sockaddr * 1074 dup_sockaddr(const struct sockaddr *sa) 1075 { 1076 struct sockaddr *sa2; 1077 1078 sa2 = malloc(sa->sa_len, M_SONAME, M_INTWAIT); 1079 bcopy(sa, sa2, sa->sa_len); 1080 return (sa2); 1081 } 1082 1083 /* 1084 * Create an external-format (``xsocket'') structure using the information 1085 * in the kernel-format socket structure pointed to by so. This is done 1086 * to reduce the spew of irrelevant information over this interface, 1087 * to isolate user code from changes in the kernel structure, and 1088 * potentially to provide information-hiding if we decide that 1089 * some of this information should be hidden from users. 1090 */ 1091 void 1092 sotoxsocket(struct socket *so, struct xsocket *xso) 1093 { 1094 xso->xso_len = sizeof *xso; 1095 xso->xso_so = so; 1096 xso->so_type = so->so_type; 1097 xso->so_options = so->so_options; 1098 xso->so_linger = so->so_linger; 1099 xso->so_state = so->so_state; 1100 xso->so_pcb = so->so_pcb; 1101 xso->xso_protocol = so->so_proto->pr_protocol; 1102 xso->xso_family = so->so_proto->pr_domain->dom_family; 1103 xso->so_qlen = so->so_qlen; 1104 xso->so_incqlen = so->so_incqlen; 1105 xso->so_qlimit = so->so_qlimit; 1106 xso->so_timeo = so->so_timeo; 1107 xso->so_error = so->so_error; 1108 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; 1109 xso->so_oobmark = so->so_oobmark; 1110 sbtoxsockbuf(&so->so_snd, &xso->so_snd); 1111 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); 1112 xso->so_uid = so->so_cred->cr_uid; 1113 } 1114 1115 /* 1116 * This does the same for sockbufs. Note that the xsockbuf structure, 1117 * since it is always embedded in a socket, does not include a self 1118 * pointer nor a length. We make this entry point public in case 1119 * some other mechanism needs it. 1120 */ 1121 void 1122 sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) 1123 { 1124 xsb->sb_cc = sb->sb_cc; 1125 xsb->sb_hiwat = sb->sb_hiwat; 1126 xsb->sb_mbcnt = sb->sb_mbcnt; 1127 xsb->sb_mbmax = sb->sb_mbmax; 1128 xsb->sb_lowat = sb->sb_lowat; 1129 xsb->sb_flags = sb->sb_flags; 1130 xsb->sb_timeo = sb->sb_timeo; 1131 } 1132 1133 /* 1134 * Here is the definition of some of the basic objects in the kern.ipc 1135 * branch of the MIB. 1136 */ 1137 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); 1138 1139 /* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ 1140 static int dummy; 1141 SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); 1142 SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_INT|CTLFLAG_RW, 1143 &sb_max, 0, sysctl_handle_sb_max, "I", "Maximum socket buffer size"); 1144 SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD, 1145 &maxsockets, 0, "Maximum number of sockets avaliable"); 1146 SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, 1147 &sb_efficiency, 0, ""); 1148 1149 /* 1150 * Initialise maxsockets 1151 */ 1152 static void init_maxsockets(void *ignored) 1153 { 1154 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); 1155 maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters)); 1156 } 1157 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); 1158