1 /* 2 * Copyright (c) 2005 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 1982, 1986, 1988, 1990, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. Neither the name of the University nor the names of its contributors 15 * may be used to endorse or promote products derived from this software 16 * without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 * 30 * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 31 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $ 32 */ 33 34 #include "opt_param.h" 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/domain.h> 38 #include <sys/file.h> /* for maxfiles */ 39 #include <sys/kernel.h> 40 #include <sys/proc.h> 41 #include <sys/malloc.h> 42 #include <sys/mbuf.h> 43 #include <sys/protosw.h> 44 #include <sys/resourcevar.h> 45 #include <sys/stat.h> 46 #include <sys/socket.h> 47 #include <sys/socketvar.h> 48 49 #include <sys/thread2.h> 50 #include <sys/msgport2.h> 51 52 /* 53 * Routines to add and remove data from an mbuf queue. 54 * 55 * The routines sbappend() or sbappendrecord() are normally called to 56 * append new mbufs to a socket buffer. sbappendrecord() differs from 57 * sbappend() in that data supplied is treated as the beginning of a new 58 * record. sbappend() only begins a new record if the last mbuf in the 59 * sockbuf is marked M_EOR. 60 * 61 * To place a sender's address, optional access rights, and data in a 62 * socket receive buffer, sbappendaddr() or sbappendcontrol() should be 63 * used. These functions also begin a new record. 64 * 65 * Reliable protocols may use the socket send buffer to hold data 66 * awaiting acknowledgement. Data is normally copied from a socket 67 * send buffer in a protocol with m_copy for output to a peer, 68 * and then removing the data from the socket buffer with sbdrop() 69 * or sbdroprecord() when the data is acknowledged by the peer. 70 */ 71 72 /* 73 * Append mbuf chain m to the last record in the socket buffer sb. 74 * The additional space associated the mbuf chain is recorded in sb. 75 * Empty mbufs are discarded and mbufs are compacted where possible. 76 * 77 * If M_EOR is set in the first or last mbuf of the last record, the 78 * mbuf chain is appended as a new record. M_EOR is usually just set 79 * in the last mbuf of the last record's mbuf chain (see sbcompress()), 80 * but this may be changed in the future since there is no real need 81 * to propogate the flag any more. 82 */ 83 void 84 sbappend(struct sockbuf *sb, struct mbuf *m) 85 { 86 struct mbuf *n; 87 88 mbuftrackid(m, 16); 89 90 if (m) { 91 n = sb->sb_lastrecord; 92 if (n) { 93 if (n->m_flags & M_EOR) { 94 sbappendrecord(sb, m); 95 return; 96 } 97 } 98 n = sb->sb_lastmbuf; 99 if (n) { 100 if (n->m_flags & M_EOR) { 101 sbappendrecord(sb, m); 102 return; 103 } 104 } 105 sbcompress(sb, m, n); 106 } 107 } 108 109 /* 110 * sbappendstream() is an optimized form of sbappend() for protocols 111 * such as TCP that only have one record in the socket buffer, are 112 * not PR_ATOMIC, nor allow MT_CONTROL data. A protocol that uses 113 * sbappendstream() must use sbappendstream() exclusively. 114 */ 115 void 116 sbappendstream(struct sockbuf *sb, struct mbuf *m) 117 { 118 mbuftrackid(m, 17); 119 KKASSERT(m->m_nextpkt == NULL); 120 sbcompress(sb, m, sb->sb_lastmbuf); 121 } 122 123 #ifdef SOCKBUF_DEBUG 124 125 void 126 _sbcheck(struct sockbuf *sb) 127 { 128 struct mbuf *m; 129 struct mbuf *n = NULL; 130 u_long len = 0, mbcnt = 0; 131 132 for (m = sb->sb_mb; m; m = n) { 133 n = m->m_nextpkt; 134 if (n == NULL && sb->sb_lastrecord != m) { 135 kprintf("sockbuf %p mismatched lastrecord %p vs %p\n", sb, sb->sb_lastrecord, m); 136 panic("sbcheck1"); 137 138 } 139 for (; m; m = m->m_next) { 140 len += m->m_len; 141 mbcnt += MSIZE; 142 if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ 143 mbcnt += m->m_ext.ext_size; 144 if (n == NULL && m->m_next == NULL) { 145 if (sb->sb_lastmbuf != m) { 146 kprintf("sockbuf %p mismatched lastmbuf %p vs %p\n", sb, sb->sb_lastmbuf, m); 147 panic("sbcheck2"); 148 } 149 } 150 } 151 } 152 if (sb->sb_mb == NULL) { 153 if (sb->sb_lastrecord != NULL) { 154 kprintf("sockbuf %p is empty, lastrecord not NULL: %p\n", 155 sb, sb->sb_lastrecord); 156 panic("sbcheck3"); 157 } 158 if (sb->sb_lastmbuf != NULL) { 159 kprintf("sockbuf %p is empty, lastmbuf not NULL: %p\n", 160 sb, sb->sb_lastmbuf); 161 panic("sbcheck4"); 162 } 163 } 164 if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { 165 kprintf("sockbuf %p cc %ld != %ld || mbcnt %ld != %ld\n", 166 sb, len, sb->sb_cc, mbcnt, sb->sb_mbcnt); 167 panic("sbcheck5"); 168 } 169 } 170 171 #endif 172 173 /* 174 * Same as sbappend(), except the mbuf chain begins a new record. 175 */ 176 void 177 sbappendrecord(struct sockbuf *sb, struct mbuf *m0) 178 { 179 struct mbuf *firstmbuf; 180 struct mbuf *secondmbuf; 181 182 if (m0 == NULL) 183 return; 184 mbuftrackid(m0, 18); 185 186 sbcheck(sb); 187 188 /* 189 * Break the first mbuf off from the rest of the mbuf chain. 190 */ 191 firstmbuf = m0; 192 secondmbuf = m0->m_next; 193 m0->m_next = NULL; 194 195 /* 196 * Insert the first mbuf of the m0 mbuf chain as the last record of 197 * the sockbuf. Note this permits zero length records! Keep the 198 * sockbuf state consistent. 199 */ 200 if (sb->sb_mb == NULL) 201 sb->sb_mb = firstmbuf; 202 else 203 sb->sb_lastrecord->m_nextpkt = firstmbuf; 204 sb->sb_lastrecord = firstmbuf; /* update hint for new last record */ 205 sb->sb_lastmbuf = firstmbuf; /* update hint for new last mbuf */ 206 207 /* 208 * propagate the EOR flag so sbcompress() can pick it up 209 */ 210 if ((firstmbuf->m_flags & M_EOR) && (secondmbuf != NULL)) { 211 firstmbuf->m_flags &= ~M_EOR; 212 secondmbuf->m_flags |= M_EOR; 213 } 214 215 /* 216 * The succeeding call to sbcompress() omits accounting for 217 * the first mbuf, so do it here. 218 */ 219 sballoc(sb, firstmbuf); 220 221 /* Compact the rest of the mbuf chain in after the first mbuf. */ 222 sbcompress(sb, secondmbuf, firstmbuf); 223 } 224 225 /* 226 * Append address and data, and optionally, control (ancillary) data 227 * to the receive queue of a socket. If present, 228 * m0 must include a packet header with total length. 229 * Returns 0 if insufficient mbufs. 230 */ 231 int 232 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, 233 struct mbuf *control) 234 { 235 struct mbuf *m, *n; 236 int eor; 237 238 mbuftrackid(m0, 19); 239 mbuftrackid(control, 20); 240 if (m0 && (m0->m_flags & M_PKTHDR) == 0) 241 panic("sbappendaddr"); 242 sbcheck(sb); 243 244 for (n = control; n; n = n->m_next) { 245 if (n->m_next == NULL) /* keep pointer to last control buf */ 246 break; 247 } 248 if (asa->sa_len > MLEN) 249 return (0); 250 MGET(m, M_NOWAIT, MT_SONAME); 251 if (m == NULL) 252 return (0); 253 KKASSERT(m->m_nextpkt == NULL); 254 m->m_len = asa->sa_len; 255 bcopy(asa, mtod(m, caddr_t), asa->sa_len); 256 if (n) 257 n->m_next = m0; /* concatenate data to control */ 258 else 259 control = m0; 260 m->m_next = control; 261 for (n = m; n; n = n->m_next) 262 sballoc(sb, n); 263 264 if (sb->sb_mb == NULL) 265 sb->sb_mb = m; 266 else 267 sb->sb_lastrecord->m_nextpkt = m; 268 sb->sb_lastrecord = m; 269 270 /* 271 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf 272 * so sbappend() can find it. 273 */ 274 eor = m->m_flags; 275 while (m->m_next) { 276 m->m_flags &= ~M_EOR; 277 m = m->m_next; 278 eor |= m->m_flags; 279 } 280 m->m_flags |= eor & M_EOR; 281 sb->sb_lastmbuf = m; 282 283 return (1); 284 } 285 286 /* 287 * Append control information followed by data. Both the control and data 288 * must be non-null. 289 */ 290 int 291 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) 292 { 293 struct mbuf *n; 294 u_int length, cmbcnt, m0mbcnt; 295 int eor; 296 297 KASSERT(control != NULL, ("sbappendcontrol")); 298 KKASSERT(control->m_nextpkt == NULL); 299 sbcheck(sb); 300 301 mbuftrackid(m0, 21); 302 mbuftrackid(control, 22); 303 304 length = m_countm(control, &n, &cmbcnt) + m_countm(m0, NULL, &m0mbcnt); 305 306 KKASSERT(m0 != NULL); 307 308 n->m_next = m0; /* concatenate data to control */ 309 310 if (sb->sb_mb == NULL) 311 sb->sb_mb = control; 312 else 313 sb->sb_lastrecord->m_nextpkt = control; 314 sb->sb_lastrecord = control; 315 316 /* 317 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf 318 * so sbappend() can find it. 319 */ 320 eor = m0->m_flags; 321 while (m0->m_next) { 322 m0->m_flags &= ~M_EOR; 323 m0 = m0->m_next; 324 eor |= m0->m_flags; 325 } 326 m0->m_flags |= eor & M_EOR; 327 sb->sb_lastmbuf = m0; 328 329 sb->sb_cc += length; 330 sb->sb_mbcnt += cmbcnt + m0mbcnt; 331 332 return (1); 333 } 334 335 /* 336 * Compress mbuf chain m into the socket buffer sb following mbuf tailm. 337 * If tailm is null, the buffer is presumed empty. Also, as a side-effect, 338 * increment the sockbuf counts for each mbuf in the chain. 339 */ 340 void 341 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *tailm) 342 { 343 int eor = 0; 344 struct mbuf *free_chain = NULL; 345 346 mbuftrackid(m, 23); 347 348 sbcheck(sb); 349 while (m) { 350 struct mbuf *o; 351 352 eor |= m->m_flags & M_EOR; 353 /* 354 * Disregard empty mbufs as long as we don't encounter 355 * an end-of-record or there is a trailing mbuf of 356 * the same type to propagate the EOR flag to. 357 * 358 * Defer the m_free() call because it can block and break 359 * the atomicy of the sockbuf. 360 */ 361 if (m->m_len == 0 && 362 (eor == 0 || 363 (((o = m->m_next) || (o = tailm)) && 364 o->m_type == m->m_type))) { 365 o = m->m_next; 366 m->m_next = free_chain; 367 free_chain = m; 368 m = o; 369 continue; 370 } 371 372 /* 373 * See if we can coalesce with preceding mbuf. Never try 374 * to coalesce a mbuf representing an end-of-record or 375 * a mbuf locked by userland for reading. 376 */ 377 if (tailm && !(tailm->m_flags & (M_EOR | M_SOLOCKED)) && 378 M_WRITABLE(tailm) && 379 m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ 380 m->m_len <= M_TRAILINGSPACE(tailm) && 381 tailm->m_type == m->m_type) { 382 u_long mbcnt_sz; 383 384 bcopy(mtod(m, caddr_t), 385 mtod(tailm, caddr_t) + tailm->m_len, 386 (unsigned)m->m_len); 387 tailm->m_len += m->m_len; 388 389 sb->sb_cc += m->m_len; /* update sb counter */ 390 391 /* 392 * Fix the wrongly updated mbcnt_prealloc 393 */ 394 mbcnt_sz = MSIZE; 395 if (m->m_flags & M_EXT) 396 mbcnt_sz += m->m_ext.ext_size; 397 atomic_subtract_long(&sb->sb_mbcnt_prealloc, mbcnt_sz); 398 399 o = m->m_next; 400 m->m_next = free_chain; 401 free_chain = m; 402 m = o; 403 continue; 404 } 405 406 /* Insert whole mbuf. */ 407 if (tailm == NULL) { 408 KASSERT(sb->sb_mb == NULL, 409 ("sbcompress: sb_mb not NULL")); 410 sb->sb_mb = m; /* only mbuf in sockbuf */ 411 sb->sb_lastrecord = m; /* new last record */ 412 } else { 413 tailm->m_next = m; /* tack m on following tailm */ 414 } 415 sb->sb_lastmbuf = m; /* update last mbuf hint */ 416 417 tailm = m; /* just inserted mbuf becomes the new tail */ 418 m = m->m_next; /* advance to next mbuf */ 419 tailm->m_next = NULL; /* split inserted mbuf off from chain */ 420 421 /* update sb counters for just added mbuf */ 422 sballoc(sb, tailm); 423 424 /* clear EOR on intermediate mbufs */ 425 tailm->m_flags &= ~M_EOR; 426 } 427 428 /* 429 * Propogate EOR to the last mbuf 430 */ 431 if (eor) { 432 if (tailm) 433 tailm->m_flags |= eor; 434 else 435 kprintf("semi-panic: sbcompress"); 436 } 437 438 /* 439 * Clean up any defered frees. 440 */ 441 while (free_chain) 442 free_chain = m_free(free_chain); 443 444 sbcheck(sb); 445 } 446 447 /* 448 * Free all mbufs in a sockbuf. 449 * Check that all resources are reclaimed. 450 */ 451 void 452 sbflush(struct sockbuf *sb) 453 { 454 while (sb->sb_mbcnt) { 455 /* 456 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty: 457 * we would loop forever. Panic instead. 458 */ 459 if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len)) 460 break; 461 sbdrop(sb, (int)sb->sb_cc); 462 } 463 KASSERT(!(sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_lastmbuf), 464 ("sbflush: cc %ld || mb %p || mbcnt %ld || lastmbuf %p", 465 sb->sb_cc, sb->sb_mb, sb->sb_mbcnt, sb->sb_lastmbuf)); 466 } 467 468 /* 469 * Drop data from (the front of) a sockbuf. If the current record is 470 * exhausted this routine will move onto the next one and continue dropping 471 * data. 472 */ 473 void 474 sbdrop(struct sockbuf *sb, int len) 475 { 476 struct mbuf *m; 477 struct mbuf *free_chain = NULL; 478 479 sbcheck(sb); 480 crit_enter(); 481 482 m = sb->sb_mb; 483 while (m && len > 0) { 484 if (m->m_len > len) { 485 m->m_len -= len; 486 m->m_data += len; 487 sb->sb_cc -= len; 488 atomic_subtract_long(&sb->sb_cc_prealloc, len); 489 break; 490 } 491 len -= m->m_len; 492 m = sbunlinkmbuf(sb, m, &free_chain); 493 if (m == NULL && len) 494 m = sb->sb_mb; 495 } 496 497 /* 498 * Remove any trailing 0-length mbufs in the current record. If 499 * the last record for which data was removed is now empty, m will be 500 * NULL. 501 */ 502 while (m && m->m_len == 0) { 503 m = sbunlinkmbuf(sb, m, &free_chain); 504 } 505 crit_exit(); 506 if (free_chain) 507 m_freem(free_chain); 508 sbcheck(sb); 509 } 510 511 /* 512 * Drop a record off the front of a sockbuf and move the next record 513 * to the front. 514 * 515 * Must be called while holding a critical section. 516 */ 517 void 518 sbdroprecord(struct sockbuf *sb) 519 { 520 struct mbuf *m; 521 struct mbuf *n; 522 523 sbcheck(sb); 524 m = sb->sb_mb; 525 if (m) { 526 if ((sb->sb_mb = m->m_nextpkt) == NULL) { 527 sb->sb_lastrecord = NULL; 528 sb->sb_lastmbuf = NULL; 529 } 530 m->m_nextpkt = NULL; 531 for (n = m; n; n = n->m_next) 532 sbfree(sb, n); 533 m_freem(m); 534 sbcheck(sb); 535 } 536 } 537 538 /* 539 * Drop the first mbuf off the sockbuf and move the next mbuf to the front. 540 * Currently only the head mbuf of the sockbuf may be dropped this way. 541 * 542 * The next mbuf in the same record as the mbuf being removed is returned 543 * or NULL if the record is exhausted. Note that other records may remain 544 * in the sockbuf when NULL is returned. 545 * 546 * Must be called while holding a critical section. 547 */ 548 struct mbuf * 549 sbunlinkmbuf(struct sockbuf *sb, struct mbuf *m, struct mbuf **free_chain) 550 { 551 struct mbuf *n; 552 553 KKASSERT(sb->sb_mb == m); 554 sbfree(sb, m); 555 n = m->m_next; 556 if (n) { 557 sb->sb_mb = n; 558 if (sb->sb_lastrecord == m) 559 sb->sb_lastrecord = n; 560 KKASSERT(sb->sb_lastmbuf != m); 561 n->m_nextpkt = m->m_nextpkt; 562 } else { 563 sb->sb_mb = m->m_nextpkt; 564 if (sb->sb_lastrecord == m) { 565 KKASSERT(sb->sb_mb == NULL); 566 sb->sb_lastrecord = NULL; 567 } 568 if (sb->sb_mb == NULL) 569 sb->sb_lastmbuf = NULL; 570 } 571 m->m_nextpkt = NULL; 572 if (free_chain) { 573 m->m_next = *free_chain; 574 *free_chain = m; 575 } else { 576 m->m_next = NULL; 577 } 578 return(n); 579 } 580 581 /* 582 * Create a "control" mbuf containing the specified data 583 * with the specified type for presentation on a socket buffer. 584 */ 585 struct mbuf * 586 sbcreatecontrol(caddr_t p, int size, int type, int level) 587 { 588 struct cmsghdr *cp; 589 struct mbuf *m; 590 591 if (CMSG_SPACE((u_int)size) > MCLBYTES) 592 return (NULL); 593 m = m_getl(CMSG_SPACE((u_int)size), M_NOWAIT, MT_CONTROL, 0, NULL); 594 if (m == NULL) 595 return (NULL); 596 m->m_len = CMSG_SPACE(size); 597 cp = mtod(m, struct cmsghdr *); 598 if (p != NULL) 599 memcpy(CMSG_DATA(cp), p, size); 600 cp->cmsg_len = CMSG_LEN(size); 601 cp->cmsg_level = level; 602 cp->cmsg_type = type; 603 mbuftrackid(m, 24); 604 return (m); 605 } 606 607