1 /* 2 * Copyright (c) 2005 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 1982, 1986, 1988, 1990, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. All advertising materials mentioning features or use of this software 15 * must display the following acknowledgement: 16 * This product includes software developed by the University of 17 * California, Berkeley and its contributors. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 35 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $ 36 * $DragonFly: src/sys/kern/uipc_sockbuf.c,v 1.3 2007/08/09 01:10:04 dillon Exp $ 37 */ 38 39 #include "opt_param.h" 40 #include <sys/param.h> 41 #include <sys/systm.h> 42 #include <sys/domain.h> 43 #include <sys/file.h> /* for maxfiles */ 44 #include <sys/kernel.h> 45 #include <sys/proc.h> 46 #include <sys/malloc.h> 47 #include <sys/mbuf.h> 48 #include <sys/protosw.h> 49 #include <sys/resourcevar.h> 50 #include <sys/stat.h> 51 #include <sys/socket.h> 52 #include <sys/socketvar.h> 53 54 #include <sys/thread2.h> 55 #include <sys/msgport2.h> 56 57 /* 58 * Routines to add and remove data from an mbuf queue. 59 * 60 * The routines sbappend() or sbappendrecord() are normally called to 61 * append new mbufs to a socket buffer. sbappendrecord() differs from 62 * sbappend() in that data supplied is treated as the beginning of a new 63 * record. sbappend() only begins a new record if the last mbuf in the 64 * sockbuf is marked M_EOR. 65 * 66 * To place a sender's address, optional access rights, and data in a 67 * socket receive buffer, sbappendaddr() or sbappendcontrol() should be 68 * used. These functions also begin a new record. 69 * 70 * Reliable protocols may use the socket send buffer to hold data 71 * awaiting acknowledgement. Data is normally copied from a socket 72 * send buffer in a protocol with m_copy for output to a peer, 73 * and then removing the data from the socket buffer with sbdrop() 74 * or sbdroprecord() when the data is acknowledged by the peer. 75 */ 76 77 /* 78 * Append mbuf chain m to the last record in the socket buffer sb. 79 * The additional space associated the mbuf chain is recorded in sb. 80 * Empty mbufs are discarded and mbufs are compacted where possible. 81 * 82 * If M_EOR is set in the first or last mbuf of the last record, the 83 * mbuf chain is appended as a new record. M_EOR is usually just set 84 * in the last mbuf of the last record's mbuf chain (see sbcompress()), 85 * but this may be changed in the future since there is no real need 86 * to propogate the flag any more. 87 */ 88 void 89 sbappend(struct sockbuf *sb, struct mbuf *m) 90 { 91 struct mbuf *n; 92 93 mbuftrackid(m, 16); 94 95 if (m) { 96 n = sb->sb_lastrecord; 97 if (n) { 98 if (n->m_flags & M_EOR) { 99 sbappendrecord(sb, m); 100 return; 101 } 102 } 103 n = sb->sb_lastmbuf; 104 if (n) { 105 if (n->m_flags & M_EOR) { 106 sbappendrecord(sb, m); 107 return; 108 } 109 } 110 sbcompress(sb, m, n); 111 } 112 } 113 114 /* 115 * sbappendstream() is an optimized form of sbappend() for protocols 116 * such as TCP that only have one record in the socket buffer, are 117 * not PR_ATOMIC, nor allow MT_CONTROL data. A protocol that uses 118 * sbappendstream() must use sbappendstream() exclusively. 119 */ 120 void 121 sbappendstream(struct sockbuf *sb, struct mbuf *m) 122 { 123 mbuftrackid(m, 17); 124 KKASSERT(m->m_nextpkt == NULL); 125 sbcompress(sb, m, sb->sb_lastmbuf); 126 } 127 128 #ifdef SOCKBUF_DEBUG 129 130 void 131 _sbcheck(struct sockbuf *sb) 132 { 133 struct mbuf *m; 134 struct mbuf *n = NULL; 135 u_long len = 0, mbcnt = 0; 136 137 for (m = sb->sb_mb; m; m = n) { 138 n = m->m_nextpkt; 139 if (n == NULL && sb->sb_lastrecord != m) { 140 kprintf("sockbuf %p mismatched lastrecord %p vs %p\n", sb, sb->sb_lastrecord, m); 141 panic("sbcheck1"); 142 143 } 144 for (; m; m = m->m_next) { 145 len += m->m_len; 146 mbcnt += MSIZE; 147 if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ 148 mbcnt += m->m_ext.ext_size; 149 if (n == NULL && m->m_next == NULL) { 150 if (sb->sb_lastmbuf != m) { 151 kprintf("sockbuf %p mismatched lastmbuf %p vs %p\n", sb, sb->sb_lastmbuf, m); 152 panic("sbcheck2"); 153 } 154 } 155 } 156 } 157 if (sb->sb_mb == NULL) { 158 if (sb->sb_lastrecord != NULL) { 159 kprintf("sockbuf %p is empty, lastrecord not NULL: %p\n", 160 sb, sb->sb_lastrecord); 161 panic("sbcheck3"); 162 } 163 if (sb->sb_lastmbuf != NULL) { 164 kprintf("sockbuf %p is empty, lastmbuf not NULL: %p\n", 165 sb, sb->sb_lastmbuf); 166 panic("sbcheck4"); 167 } 168 } 169 if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { 170 kprintf("sockbuf %p cc %ld != %ld || mbcnt %ld != %ld\n", 171 sb, len, sb->sb_cc, mbcnt, sb->sb_mbcnt); 172 panic("sbcheck5"); 173 } 174 } 175 176 #endif 177 178 /* 179 * Same as sbappend(), except the mbuf chain begins a new record. 180 */ 181 void 182 sbappendrecord(struct sockbuf *sb, struct mbuf *m0) 183 { 184 struct mbuf *firstmbuf; 185 struct mbuf *secondmbuf; 186 187 if (m0 == NULL) 188 return; 189 mbuftrackid(m0, 18); 190 191 sbcheck(sb); 192 193 /* 194 * Break the first mbuf off from the rest of the mbuf chain. 195 */ 196 firstmbuf = m0; 197 secondmbuf = m0->m_next; 198 m0->m_next = NULL; 199 200 /* 201 * Insert the first mbuf of the m0 mbuf chain as the last record of 202 * the sockbuf. Note this permits zero length records! Keep the 203 * sockbuf state consistent. 204 */ 205 if (sb->sb_mb == NULL) 206 sb->sb_mb = firstmbuf; 207 else 208 sb->sb_lastrecord->m_nextpkt = firstmbuf; 209 sb->sb_lastrecord = firstmbuf; /* update hint for new last record */ 210 sb->sb_lastmbuf = firstmbuf; /* update hint for new last mbuf */ 211 212 /* 213 * propagate the EOR flag so sbcompress() can pick it up 214 */ 215 if ((firstmbuf->m_flags & M_EOR) && (secondmbuf != NULL)) { 216 firstmbuf->m_flags &= ~M_EOR; 217 secondmbuf->m_flags |= M_EOR; 218 } 219 220 /* 221 * The succeeding call to sbcompress() omits accounting for 222 * the first mbuf, so do it here. 223 */ 224 sballoc(sb, firstmbuf); 225 226 /* Compact the rest of the mbuf chain in after the first mbuf. */ 227 sbcompress(sb, secondmbuf, firstmbuf); 228 } 229 230 /* 231 * Append address and data, and optionally, control (ancillary) data 232 * to the receive queue of a socket. If present, 233 * m0 must include a packet header with total length. 234 * Returns 0 if insufficient mbufs. 235 */ 236 int 237 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, 238 struct mbuf *control) 239 { 240 struct mbuf *m, *n; 241 int eor; 242 243 mbuftrackid(m0, 19); 244 mbuftrackid(control, 20); 245 if (m0 && (m0->m_flags & M_PKTHDR) == 0) 246 panic("sbappendaddr"); 247 sbcheck(sb); 248 249 for (n = control; n; n = n->m_next) { 250 if (n->m_next == NULL) /* keep pointer to last control buf */ 251 break; 252 } 253 if (asa->sa_len > MLEN) 254 return (0); 255 MGET(m, MB_DONTWAIT, MT_SONAME); 256 if (m == NULL) 257 return (0); 258 KKASSERT(m->m_nextpkt == NULL); 259 m->m_len = asa->sa_len; 260 bcopy(asa, mtod(m, caddr_t), asa->sa_len); 261 if (n) 262 n->m_next = m0; /* concatenate data to control */ 263 else 264 control = m0; 265 m->m_next = control; 266 for (n = m; n; n = n->m_next) 267 sballoc(sb, n); 268 269 if (sb->sb_mb == NULL) 270 sb->sb_mb = m; 271 else 272 sb->sb_lastrecord->m_nextpkt = m; 273 sb->sb_lastrecord = m; 274 275 /* 276 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf 277 * so sbappend() can find it. 278 */ 279 eor = m->m_flags; 280 while (m->m_next) { 281 m->m_flags &= ~M_EOR; 282 m = m->m_next; 283 eor |= m->m_flags; 284 } 285 m->m_flags |= eor & M_EOR; 286 sb->sb_lastmbuf = m; 287 288 return (1); 289 } 290 291 /* 292 * Append control information followed by data. Both the control and data 293 * must be non-null. 294 */ 295 int 296 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) 297 { 298 struct mbuf *n; 299 u_int length, cmbcnt, m0mbcnt; 300 int eor; 301 302 KASSERT(control != NULL, ("sbappendcontrol")); 303 KKASSERT(control->m_nextpkt == NULL); 304 sbcheck(sb); 305 306 mbuftrackid(m0, 21); 307 mbuftrackid(control, 22); 308 309 length = m_countm(control, &n, &cmbcnt) + m_countm(m0, NULL, &m0mbcnt); 310 311 KKASSERT(m0 != NULL); 312 313 n->m_next = m0; /* concatenate data to control */ 314 315 if (sb->sb_mb == NULL) 316 sb->sb_mb = control; 317 else 318 sb->sb_lastrecord->m_nextpkt = control; 319 sb->sb_lastrecord = control; 320 321 /* 322 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf 323 * so sbappend() can find it. 324 */ 325 eor = m0->m_flags; 326 while (m0->m_next) { 327 m0->m_flags &= ~M_EOR; 328 m0 = m0->m_next; 329 eor |= m0->m_flags; 330 } 331 m0->m_flags |= eor & M_EOR; 332 sb->sb_lastmbuf = m0; 333 334 sb->sb_cc += length; 335 sb->sb_mbcnt += cmbcnt + m0mbcnt; 336 337 return (1); 338 } 339 340 /* 341 * Compress mbuf chain m into the socket buffer sb following mbuf tailm. 342 * If tailm is null, the buffer is presumed empty. Also, as a side-effect, 343 * increment the sockbuf counts for each mbuf in the chain. 344 */ 345 void 346 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *tailm) 347 { 348 int eor = 0; 349 struct mbuf *free_chain = NULL; 350 351 mbuftrackid(m, 23); 352 353 sbcheck(sb); 354 while (m) { 355 struct mbuf *o; 356 357 eor |= m->m_flags & M_EOR; 358 /* 359 * Disregard empty mbufs as long as we don't encounter 360 * an end-of-record or there is a trailing mbuf of 361 * the same type to propagate the EOR flag to. 362 * 363 * Defer the m_free() call because it can block and break 364 * the atomicy of the sockbuf. 365 */ 366 if (m->m_len == 0 && 367 (eor == 0 || 368 (((o = m->m_next) || (o = tailm)) && 369 o->m_type == m->m_type))) { 370 o = m->m_next; 371 m->m_next = free_chain; 372 free_chain = m; 373 m = o; 374 continue; 375 } 376 377 /* See if we can coalesce with preceding mbuf. */ 378 if (tailm && !(tailm->m_flags & M_EOR) && M_WRITABLE(tailm) && 379 m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ 380 m->m_len <= M_TRAILINGSPACE(tailm) && 381 tailm->m_type == m->m_type) { 382 bcopy(mtod(m, caddr_t), 383 mtod(tailm, caddr_t) + tailm->m_len, 384 (unsigned)m->m_len); 385 tailm->m_len += m->m_len; 386 sb->sb_cc += m->m_len; /* update sb counter */ 387 o = m->m_next; 388 m->m_next = free_chain; 389 free_chain = m; 390 m = o; 391 continue; 392 } 393 394 /* Insert whole mbuf. */ 395 if (tailm == NULL) { 396 KASSERT(sb->sb_mb == NULL, 397 ("sbcompress: sb_mb not NULL")); 398 sb->sb_mb = m; /* only mbuf in sockbuf */ 399 sb->sb_lastrecord = m; /* new last record */ 400 } else { 401 tailm->m_next = m; /* tack m on following tailm */ 402 } 403 sb->sb_lastmbuf = m; /* update last mbuf hint */ 404 405 tailm = m; /* just inserted mbuf becomes the new tail */ 406 m = m->m_next; /* advance to next mbuf */ 407 tailm->m_next = NULL; /* split inserted mbuf off from chain */ 408 409 /* update sb counters for just added mbuf */ 410 sballoc(sb, tailm); 411 412 /* clear EOR on intermediate mbufs */ 413 tailm->m_flags &= ~M_EOR; 414 } 415 416 /* 417 * Propogate EOR to the last mbuf 418 */ 419 if (eor) { 420 if (tailm) 421 tailm->m_flags |= eor; 422 else 423 kprintf("semi-panic: sbcompress"); 424 } 425 426 /* 427 * Clean up any defered frees. 428 */ 429 while (free_chain) 430 free_chain = m_free(free_chain); 431 432 sbcheck(sb); 433 } 434 435 /* 436 * Free all mbufs in a sockbuf. 437 * Check that all resources are reclaimed. 438 */ 439 void 440 sbflush(struct sockbuf *sb) 441 { 442 while (sb->sb_mbcnt) { 443 /* 444 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty: 445 * we would loop forever. Panic instead. 446 */ 447 if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len)) 448 break; 449 sbdrop(sb, (int)sb->sb_cc); 450 } 451 KASSERT(!(sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_lastmbuf), 452 ("sbflush: cc %ld || mb %p || mbcnt %ld || lastmbuf %p", 453 sb->sb_cc, sb->sb_mb, sb->sb_mbcnt, sb->sb_lastmbuf)); 454 } 455 456 /* 457 * Drop data from (the front of) a sockbuf. If the current record is 458 * exhausted this routine will move onto the next one and continue dropping 459 * data. 460 */ 461 void 462 sbdrop(struct sockbuf *sb, int len) 463 { 464 struct mbuf *m; 465 struct mbuf *free_chain = NULL; 466 467 sbcheck(sb); 468 crit_enter(); 469 470 m = sb->sb_mb; 471 while (m && len > 0) { 472 if (m->m_len > len) { 473 m->m_len -= len; 474 m->m_data += len; 475 sb->sb_cc -= len; 476 break; 477 } 478 len -= m->m_len; 479 m = sbunlinkmbuf(sb, m, &free_chain); 480 if (m == NULL && len) 481 m = sb->sb_mb; 482 } 483 484 /* 485 * Remove any trailing 0-length mbufs in the current record. If 486 * the last record for which data was removed is now empty, m will be 487 * NULL. 488 */ 489 while (m && m->m_len == 0) { 490 m = sbunlinkmbuf(sb, m, &free_chain); 491 } 492 crit_exit(); 493 if (free_chain) 494 m_freem(free_chain); 495 sbcheck(sb); 496 } 497 498 /* 499 * Drop a record off the front of a sockbuf and move the next record 500 * to the front. 501 * 502 * Must be called while holding a critical section. 503 */ 504 void 505 sbdroprecord(struct sockbuf *sb) 506 { 507 struct mbuf *m; 508 struct mbuf *n; 509 510 sbcheck(sb); 511 m = sb->sb_mb; 512 if (m) { 513 if ((sb->sb_mb = m->m_nextpkt) == NULL) { 514 sb->sb_lastrecord = NULL; 515 sb->sb_lastmbuf = NULL; 516 } 517 m->m_nextpkt = NULL; 518 for (n = m; n; n = n->m_next) 519 sbfree(sb, n); 520 m_freem(m); 521 sbcheck(sb); 522 } 523 } 524 525 /* 526 * Drop the first mbuf off the sockbuf and move the next mbuf to the front. 527 * Currently only the head mbuf of the sockbuf may be dropped this way. 528 * 529 * The next mbuf in the same record as the mbuf being removed is returned 530 * or NULL if the record is exhausted. Note that other records may remain 531 * in the sockbuf when NULL is returned. 532 * 533 * Must be called while holding a critical section. 534 */ 535 struct mbuf * 536 sbunlinkmbuf(struct sockbuf *sb, struct mbuf *m, struct mbuf **free_chain) 537 { 538 struct mbuf *n; 539 540 KKASSERT(sb->sb_mb == m); 541 sbfree(sb, m); 542 n = m->m_next; 543 if (n) { 544 sb->sb_mb = n; 545 if (sb->sb_lastrecord == m) 546 sb->sb_lastrecord = n; 547 KKASSERT(sb->sb_lastmbuf != m); 548 n->m_nextpkt = m->m_nextpkt; 549 } else { 550 sb->sb_mb = m->m_nextpkt; 551 if (sb->sb_lastrecord == m) { 552 KKASSERT(sb->sb_mb == NULL); 553 sb->sb_lastrecord = NULL; 554 } 555 if (sb->sb_mb == NULL) 556 sb->sb_lastmbuf = NULL; 557 } 558 m->m_nextpkt = NULL; 559 if (free_chain) { 560 m->m_next = *free_chain; 561 *free_chain = m; 562 } else { 563 m->m_next = NULL; 564 } 565 return(n); 566 } 567 568 /* 569 * Create a "control" mbuf containing the specified data 570 * with the specified type for presentation on a socket buffer. 571 */ 572 struct mbuf * 573 sbcreatecontrol(caddr_t p, int size, int type, int level) 574 { 575 struct cmsghdr *cp; 576 struct mbuf *m; 577 578 if (CMSG_SPACE((u_int)size) > MCLBYTES) 579 return (NULL); 580 m = m_getl(CMSG_SPACE((u_int)size), MB_DONTWAIT, MT_CONTROL, 0, NULL); 581 if (m == NULL) 582 return (NULL); 583 m->m_len = CMSG_SPACE(size); 584 cp = mtod(m, struct cmsghdr *); 585 if (p != NULL) 586 memcpy(CMSG_DATA(cp), p, size); 587 cp->cmsg_len = CMSG_LEN(size); 588 cp->cmsg_level = level; 589 cp->cmsg_type = type; 590 mbuftrackid(m, 24); 591 return (m); 592 } 593 594