1 /* 2 * Copyright (c) 2005 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 1982, 1986, 1988, 1990, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 4. Neither the name of the University nor the names of its contributors 15 * may be used to endorse or promote products derived from this software 16 * without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 * 30 * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 31 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $ 32 * $DragonFly: src/sys/kern/uipc_sockbuf.c,v 1.3 2007/08/09 01:10:04 dillon Exp $ 33 */ 34 35 #include "opt_param.h" 36 #include <sys/param.h> 37 #include <sys/systm.h> 38 #include <sys/domain.h> 39 #include <sys/file.h> /* for maxfiles */ 40 #include <sys/kernel.h> 41 #include <sys/proc.h> 42 #include <sys/malloc.h> 43 #include <sys/mbuf.h> 44 #include <sys/protosw.h> 45 #include <sys/resourcevar.h> 46 #include <sys/stat.h> 47 #include <sys/socket.h> 48 #include <sys/socketvar.h> 49 50 #include <sys/thread2.h> 51 #include <sys/msgport2.h> 52 53 /* 54 * Routines to add and remove data from an mbuf queue. 55 * 56 * The routines sbappend() or sbappendrecord() are normally called to 57 * append new mbufs to a socket buffer. sbappendrecord() differs from 58 * sbappend() in that data supplied is treated as the beginning of a new 59 * record. sbappend() only begins a new record if the last mbuf in the 60 * sockbuf is marked M_EOR. 61 * 62 * To place a sender's address, optional access rights, and data in a 63 * socket receive buffer, sbappendaddr() or sbappendcontrol() should be 64 * used. These functions also begin a new record. 65 * 66 * Reliable protocols may use the socket send buffer to hold data 67 * awaiting acknowledgement. Data is normally copied from a socket 68 * send buffer in a protocol with m_copy for output to a peer, 69 * and then removing the data from the socket buffer with sbdrop() 70 * or sbdroprecord() when the data is acknowledged by the peer. 71 */ 72 73 /* 74 * Append mbuf chain m to the last record in the socket buffer sb. 75 * The additional space associated the mbuf chain is recorded in sb. 76 * Empty mbufs are discarded and mbufs are compacted where possible. 77 * 78 * If M_EOR is set in the first or last mbuf of the last record, the 79 * mbuf chain is appended as a new record. M_EOR is usually just set 80 * in the last mbuf of the last record's mbuf chain (see sbcompress()), 81 * but this may be changed in the future since there is no real need 82 * to propogate the flag any more. 83 */ 84 void 85 sbappend(struct sockbuf *sb, struct mbuf *m) 86 { 87 struct mbuf *n; 88 89 mbuftrackid(m, 16); 90 91 if (m) { 92 n = sb->sb_lastrecord; 93 if (n) { 94 if (n->m_flags & M_EOR) { 95 sbappendrecord(sb, m); 96 return; 97 } 98 } 99 n = sb->sb_lastmbuf; 100 if (n) { 101 if (n->m_flags & M_EOR) { 102 sbappendrecord(sb, m); 103 return; 104 } 105 } 106 sbcompress(sb, m, n); 107 } 108 } 109 110 /* 111 * sbappendstream() is an optimized form of sbappend() for protocols 112 * such as TCP that only have one record in the socket buffer, are 113 * not PR_ATOMIC, nor allow MT_CONTROL data. A protocol that uses 114 * sbappendstream() must use sbappendstream() exclusively. 115 */ 116 void 117 sbappendstream(struct sockbuf *sb, struct mbuf *m) 118 { 119 mbuftrackid(m, 17); 120 KKASSERT(m->m_nextpkt == NULL); 121 sbcompress(sb, m, sb->sb_lastmbuf); 122 } 123 124 #ifdef SOCKBUF_DEBUG 125 126 void 127 _sbcheck(struct sockbuf *sb) 128 { 129 struct mbuf *m; 130 struct mbuf *n = NULL; 131 u_long len = 0, mbcnt = 0; 132 133 for (m = sb->sb_mb; m; m = n) { 134 n = m->m_nextpkt; 135 if (n == NULL && sb->sb_lastrecord != m) { 136 kprintf("sockbuf %p mismatched lastrecord %p vs %p\n", sb, sb->sb_lastrecord, m); 137 panic("sbcheck1"); 138 139 } 140 for (; m; m = m->m_next) { 141 len += m->m_len; 142 mbcnt += MSIZE; 143 if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ 144 mbcnt += m->m_ext.ext_size; 145 if (n == NULL && m->m_next == NULL) { 146 if (sb->sb_lastmbuf != m) { 147 kprintf("sockbuf %p mismatched lastmbuf %p vs %p\n", sb, sb->sb_lastmbuf, m); 148 panic("sbcheck2"); 149 } 150 } 151 } 152 } 153 if (sb->sb_mb == NULL) { 154 if (sb->sb_lastrecord != NULL) { 155 kprintf("sockbuf %p is empty, lastrecord not NULL: %p\n", 156 sb, sb->sb_lastrecord); 157 panic("sbcheck3"); 158 } 159 if (sb->sb_lastmbuf != NULL) { 160 kprintf("sockbuf %p is empty, lastmbuf not NULL: %p\n", 161 sb, sb->sb_lastmbuf); 162 panic("sbcheck4"); 163 } 164 } 165 if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { 166 kprintf("sockbuf %p cc %ld != %ld || mbcnt %ld != %ld\n", 167 sb, len, sb->sb_cc, mbcnt, sb->sb_mbcnt); 168 panic("sbcheck5"); 169 } 170 } 171 172 #endif 173 174 /* 175 * Same as sbappend(), except the mbuf chain begins a new record. 176 */ 177 void 178 sbappendrecord(struct sockbuf *sb, struct mbuf *m0) 179 { 180 struct mbuf *firstmbuf; 181 struct mbuf *secondmbuf; 182 183 if (m0 == NULL) 184 return; 185 mbuftrackid(m0, 18); 186 187 sbcheck(sb); 188 189 /* 190 * Break the first mbuf off from the rest of the mbuf chain. 191 */ 192 firstmbuf = m0; 193 secondmbuf = m0->m_next; 194 m0->m_next = NULL; 195 196 /* 197 * Insert the first mbuf of the m0 mbuf chain as the last record of 198 * the sockbuf. Note this permits zero length records! Keep the 199 * sockbuf state consistent. 200 */ 201 if (sb->sb_mb == NULL) 202 sb->sb_mb = firstmbuf; 203 else 204 sb->sb_lastrecord->m_nextpkt = firstmbuf; 205 sb->sb_lastrecord = firstmbuf; /* update hint for new last record */ 206 sb->sb_lastmbuf = firstmbuf; /* update hint for new last mbuf */ 207 208 /* 209 * propagate the EOR flag so sbcompress() can pick it up 210 */ 211 if ((firstmbuf->m_flags & M_EOR) && (secondmbuf != NULL)) { 212 firstmbuf->m_flags &= ~M_EOR; 213 secondmbuf->m_flags |= M_EOR; 214 } 215 216 /* 217 * The succeeding call to sbcompress() omits accounting for 218 * the first mbuf, so do it here. 219 */ 220 sballoc(sb, firstmbuf); 221 222 /* Compact the rest of the mbuf chain in after the first mbuf. */ 223 sbcompress(sb, secondmbuf, firstmbuf); 224 } 225 226 /* 227 * Append address and data, and optionally, control (ancillary) data 228 * to the receive queue of a socket. If present, 229 * m0 must include a packet header with total length. 230 * Returns 0 if insufficient mbufs. 231 */ 232 int 233 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, 234 struct mbuf *control) 235 { 236 struct mbuf *m, *n; 237 int eor; 238 239 mbuftrackid(m0, 19); 240 mbuftrackid(control, 20); 241 if (m0 && (m0->m_flags & M_PKTHDR) == 0) 242 panic("sbappendaddr"); 243 sbcheck(sb); 244 245 for (n = control; n; n = n->m_next) { 246 if (n->m_next == NULL) /* keep pointer to last control buf */ 247 break; 248 } 249 if (asa->sa_len > MLEN) 250 return (0); 251 MGET(m, MB_DONTWAIT, MT_SONAME); 252 if (m == NULL) 253 return (0); 254 KKASSERT(m->m_nextpkt == NULL); 255 m->m_len = asa->sa_len; 256 bcopy(asa, mtod(m, caddr_t), asa->sa_len); 257 if (n) 258 n->m_next = m0; /* concatenate data to control */ 259 else 260 control = m0; 261 m->m_next = control; 262 for (n = m; n; n = n->m_next) 263 sballoc(sb, n); 264 265 if (sb->sb_mb == NULL) 266 sb->sb_mb = m; 267 else 268 sb->sb_lastrecord->m_nextpkt = m; 269 sb->sb_lastrecord = m; 270 271 /* 272 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf 273 * so sbappend() can find it. 274 */ 275 eor = m->m_flags; 276 while (m->m_next) { 277 m->m_flags &= ~M_EOR; 278 m = m->m_next; 279 eor |= m->m_flags; 280 } 281 m->m_flags |= eor & M_EOR; 282 sb->sb_lastmbuf = m; 283 284 return (1); 285 } 286 287 /* 288 * Append control information followed by data. Both the control and data 289 * must be non-null. 290 */ 291 int 292 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) 293 { 294 struct mbuf *n; 295 u_int length, cmbcnt, m0mbcnt; 296 int eor; 297 298 KASSERT(control != NULL, ("sbappendcontrol")); 299 KKASSERT(control->m_nextpkt == NULL); 300 sbcheck(sb); 301 302 mbuftrackid(m0, 21); 303 mbuftrackid(control, 22); 304 305 length = m_countm(control, &n, &cmbcnt) + m_countm(m0, NULL, &m0mbcnt); 306 307 KKASSERT(m0 != NULL); 308 309 n->m_next = m0; /* concatenate data to control */ 310 311 if (sb->sb_mb == NULL) 312 sb->sb_mb = control; 313 else 314 sb->sb_lastrecord->m_nextpkt = control; 315 sb->sb_lastrecord = control; 316 317 /* 318 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf 319 * so sbappend() can find it. 320 */ 321 eor = m0->m_flags; 322 while (m0->m_next) { 323 m0->m_flags &= ~M_EOR; 324 m0 = m0->m_next; 325 eor |= m0->m_flags; 326 } 327 m0->m_flags |= eor & M_EOR; 328 sb->sb_lastmbuf = m0; 329 330 sb->sb_cc += length; 331 sb->sb_mbcnt += cmbcnt + m0mbcnt; 332 333 return (1); 334 } 335 336 /* 337 * Compress mbuf chain m into the socket buffer sb following mbuf tailm. 338 * If tailm is null, the buffer is presumed empty. Also, as a side-effect, 339 * increment the sockbuf counts for each mbuf in the chain. 340 */ 341 void 342 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *tailm) 343 { 344 int eor = 0; 345 struct mbuf *free_chain = NULL; 346 347 mbuftrackid(m, 23); 348 349 sbcheck(sb); 350 while (m) { 351 struct mbuf *o; 352 353 eor |= m->m_flags & M_EOR; 354 /* 355 * Disregard empty mbufs as long as we don't encounter 356 * an end-of-record or there is a trailing mbuf of 357 * the same type to propagate the EOR flag to. 358 * 359 * Defer the m_free() call because it can block and break 360 * the atomicy of the sockbuf. 361 */ 362 if (m->m_len == 0 && 363 (eor == 0 || 364 (((o = m->m_next) || (o = tailm)) && 365 o->m_type == m->m_type))) { 366 o = m->m_next; 367 m->m_next = free_chain; 368 free_chain = m; 369 m = o; 370 continue; 371 } 372 373 /* See if we can coalesce with preceding mbuf. */ 374 if (tailm && !(tailm->m_flags & M_EOR) && M_WRITABLE(tailm) && 375 m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ 376 m->m_len <= M_TRAILINGSPACE(tailm) && 377 tailm->m_type == m->m_type) { 378 u_long mbcnt_sz; 379 380 bcopy(mtod(m, caddr_t), 381 mtod(tailm, caddr_t) + tailm->m_len, 382 (unsigned)m->m_len); 383 tailm->m_len += m->m_len; 384 385 sb->sb_cc += m->m_len; /* update sb counter */ 386 387 /* 388 * Fix the wrongly updated mbcnt_prealloc 389 */ 390 mbcnt_sz = MSIZE; 391 if (m->m_flags & M_EXT) 392 mbcnt_sz += m->m_ext.ext_size; 393 atomic_subtract_long(&sb->sb_mbcnt_prealloc, mbcnt_sz); 394 395 o = m->m_next; 396 m->m_next = free_chain; 397 free_chain = m; 398 m = o; 399 continue; 400 } 401 402 /* Insert whole mbuf. */ 403 if (tailm == NULL) { 404 KASSERT(sb->sb_mb == NULL, 405 ("sbcompress: sb_mb not NULL")); 406 sb->sb_mb = m; /* only mbuf in sockbuf */ 407 sb->sb_lastrecord = m; /* new last record */ 408 } else { 409 tailm->m_next = m; /* tack m on following tailm */ 410 } 411 sb->sb_lastmbuf = m; /* update last mbuf hint */ 412 413 tailm = m; /* just inserted mbuf becomes the new tail */ 414 m = m->m_next; /* advance to next mbuf */ 415 tailm->m_next = NULL; /* split inserted mbuf off from chain */ 416 417 /* update sb counters for just added mbuf */ 418 sballoc(sb, tailm); 419 420 /* clear EOR on intermediate mbufs */ 421 tailm->m_flags &= ~M_EOR; 422 } 423 424 /* 425 * Propogate EOR to the last mbuf 426 */ 427 if (eor) { 428 if (tailm) 429 tailm->m_flags |= eor; 430 else 431 kprintf("semi-panic: sbcompress"); 432 } 433 434 /* 435 * Clean up any defered frees. 436 */ 437 while (free_chain) 438 free_chain = m_free(free_chain); 439 440 sbcheck(sb); 441 } 442 443 /* 444 * Free all mbufs in a sockbuf. 445 * Check that all resources are reclaimed. 446 */ 447 void 448 sbflush(struct sockbuf *sb) 449 { 450 while (sb->sb_mbcnt) { 451 /* 452 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty: 453 * we would loop forever. Panic instead. 454 */ 455 if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len)) 456 break; 457 sbdrop(sb, (int)sb->sb_cc); 458 } 459 KASSERT(!(sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_lastmbuf), 460 ("sbflush: cc %ld || mb %p || mbcnt %ld || lastmbuf %p", 461 sb->sb_cc, sb->sb_mb, sb->sb_mbcnt, sb->sb_lastmbuf)); 462 } 463 464 /* 465 * Drop data from (the front of) a sockbuf. If the current record is 466 * exhausted this routine will move onto the next one and continue dropping 467 * data. 468 */ 469 void 470 sbdrop(struct sockbuf *sb, int len) 471 { 472 struct mbuf *m; 473 struct mbuf *free_chain = NULL; 474 475 sbcheck(sb); 476 crit_enter(); 477 478 m = sb->sb_mb; 479 while (m && len > 0) { 480 if (m->m_len > len) { 481 m->m_len -= len; 482 m->m_data += len; 483 sb->sb_cc -= len; 484 atomic_subtract_long(&sb->sb_cc_prealloc, len); 485 break; 486 } 487 len -= m->m_len; 488 m = sbunlinkmbuf(sb, m, &free_chain); 489 if (m == NULL && len) 490 m = sb->sb_mb; 491 } 492 493 /* 494 * Remove any trailing 0-length mbufs in the current record. If 495 * the last record for which data was removed is now empty, m will be 496 * NULL. 497 */ 498 while (m && m->m_len == 0) { 499 m = sbunlinkmbuf(sb, m, &free_chain); 500 } 501 crit_exit(); 502 if (free_chain) 503 m_freem(free_chain); 504 sbcheck(sb); 505 } 506 507 /* 508 * Drop a record off the front of a sockbuf and move the next record 509 * to the front. 510 * 511 * Must be called while holding a critical section. 512 */ 513 void 514 sbdroprecord(struct sockbuf *sb) 515 { 516 struct mbuf *m; 517 struct mbuf *n; 518 519 sbcheck(sb); 520 m = sb->sb_mb; 521 if (m) { 522 if ((sb->sb_mb = m->m_nextpkt) == NULL) { 523 sb->sb_lastrecord = NULL; 524 sb->sb_lastmbuf = NULL; 525 } 526 m->m_nextpkt = NULL; 527 for (n = m; n; n = n->m_next) 528 sbfree(sb, n); 529 m_freem(m); 530 sbcheck(sb); 531 } 532 } 533 534 /* 535 * Drop the first mbuf off the sockbuf and move the next mbuf to the front. 536 * Currently only the head mbuf of the sockbuf may be dropped this way. 537 * 538 * The next mbuf in the same record as the mbuf being removed is returned 539 * or NULL if the record is exhausted. Note that other records may remain 540 * in the sockbuf when NULL is returned. 541 * 542 * Must be called while holding a critical section. 543 */ 544 struct mbuf * 545 sbunlinkmbuf(struct sockbuf *sb, struct mbuf *m, struct mbuf **free_chain) 546 { 547 struct mbuf *n; 548 549 KKASSERT(sb->sb_mb == m); 550 sbfree(sb, m); 551 n = m->m_next; 552 if (n) { 553 sb->sb_mb = n; 554 if (sb->sb_lastrecord == m) 555 sb->sb_lastrecord = n; 556 KKASSERT(sb->sb_lastmbuf != m); 557 n->m_nextpkt = m->m_nextpkt; 558 } else { 559 sb->sb_mb = m->m_nextpkt; 560 if (sb->sb_lastrecord == m) { 561 KKASSERT(sb->sb_mb == NULL); 562 sb->sb_lastrecord = NULL; 563 } 564 if (sb->sb_mb == NULL) 565 sb->sb_lastmbuf = NULL; 566 } 567 m->m_nextpkt = NULL; 568 if (free_chain) { 569 m->m_next = *free_chain; 570 *free_chain = m; 571 } else { 572 m->m_next = NULL; 573 } 574 return(n); 575 } 576 577 /* 578 * Create a "control" mbuf containing the specified data 579 * with the specified type for presentation on a socket buffer. 580 */ 581 struct mbuf * 582 sbcreatecontrol(caddr_t p, int size, int type, int level) 583 { 584 struct cmsghdr *cp; 585 struct mbuf *m; 586 587 if (CMSG_SPACE((u_int)size) > MCLBYTES) 588 return (NULL); 589 m = m_getl(CMSG_SPACE((u_int)size), MB_DONTWAIT, MT_CONTROL, 0, NULL); 590 if (m == NULL) 591 return (NULL); 592 m->m_len = CMSG_SPACE(size); 593 cp = mtod(m, struct cmsghdr *); 594 if (p != NULL) 595 memcpy(CMSG_DATA(cp), p, size); 596 cp->cmsg_len = CMSG_LEN(size); 597 cp->cmsg_level = level; 598 cp->cmsg_type = type; 599 mbuftrackid(m, 24); 600 return (m); 601 } 602 603